/*************************************************************************/
/* gutcheck - check for assorted weirdnesses in a PG candidate text file */
/*                                                                       */
/* Version 0.991                                                         */
/* Copyright 2000-2005 Jim Tinsley                                       */
/*                                                                       */
/* This program is free software; you can redistribute it and/or modify  */
/* it under the terms of the GNU General Public License as published by  */
/* the Free Software Foundation; either version 2 of the License, or     */
/* (at your option) any later version.                                   */
/*                                                                       */
/* This program is distributed in the hope that it will be useful,       */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
/* GNU General Public License for more details.                          */
/*                                                                       */
/* You should have received a copy of the GNU General Public License     */
/* along with this program; if not, write to the                         */
/* Free Software Foundation, Inc.,                                       */
/* 59 Temple Place,                                                      */
/* Suite 330,                                                            */
/* Boston, MA 02111-1307 USA                                             */
/*                                                                       */
/*                                                                       */
/*                                                                       */
/* Overview comments:                                                    */
/*                                                                       */
/* If you're reading this, you're either interested in how to detect     */
/* formatting errors, or very very bored.                                */
/*                                                                       */
/* Gutcheck is a homebrew formatting checker specifically for            */
/* spotting common formatting problems in a PG e-text. I typically       */
/* run it once or twice on a file I'm about to submit; it usually        */
/* finds a few formatting problems. It also usually finds lots of        */
/* queries that aren't problems at all; it _really_ doesn't like         */
/* the standard PG header, for example. It's optimized for straight      */
/* prose; poetry and non-fiction involving tables tend to trigger        */
/* false alarms.                                                         */
/*                                                                       */
/* The code of gutcheck is not very interesting, but the experience      */
/* of what constitutes a possible error may be, and the best way to      */
/* illustrate that is by example.                                        */
/*                                                                       */
/*                                                                       */
/* Here are some common typos found in PG texts that gutcheck            */
/* will flag as errors:                                                  */
/*                                                                       */
/* "Look!John , over there!"                                             */
/*                                                                       */
/* &so is this;                                                          */
/* Margaret said: " Now you should start for school."                    */
/* Margaret said: "Now you should start for school. (if end of para)     */
/* The horse is said to he worth a lot.                                  */
/* 0K - this'11 make you look close1y.                                   */
/* "If you do. you'll regret it!"                                        */
/*                                                                       */
/* There are some complications . The extra space left around that       */
/* period was an error . . . but that ellipsis wasn't.                   */
/*                                                                       */
/* The last line of a paragraph                                          */
/* is usually short.                                                     */
/*                                                                       */
/* This period is an error.But the periods in a.m. aren't.               */
/*                                                                       */
/* Checks that are do-able but not (well) implemented are:               */
/* Single-quote checking.                                                */
/* Despite 3 attempts at it, singlequote checking is still               */
/* crap in gutcheck. It may not be possible without analysis             */
/* of the whole paragraph.                                               */
/*                                                                       */
/*************************************************************************/


#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>

#define MAXWORDLEN 80 /* max length of one word */
#define LINEBUFSIZE 2048 /* buffer size for an input line */

#define MAX_USER_TYPOS 1000
#define USERTYPO_FILE "gutcheck.typ"

#ifndef MAX_PATH
#define MAX_PATH 16384
#endif

char aline[LINEBUFSIZE];
char prevline[LINEBUFSIZE];

/* Common typos.
*/ ali@0: char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad", ali@0: "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om", ali@0: "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", ali@0: "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign", ali@0: "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere", ali@0: "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev", ali@0: "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk", ali@0: "owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem", ali@0: "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry", ali@0: "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", ali@0: "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae", ali@0: "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", ali@0: "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats", ali@0: "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera", ali@0: "yeras", "yersa", "yoiu", "youve", "ytou", "yuor", ali@0: /* added h/b words for version 12 - removed a few with "tbe" v.25 */ ali@0: "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind", ali@0: "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates", ali@0: "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing", ali@0: "helieve", "henefit", "hetter", "hetween", 
"heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh", ali@0: "meanwbile", "memher", "memhers", "numher", "numhers", ali@0: "perbaps", "prohlem", "puhlic", "witbout", ali@0: /* and a few more for .18 */ ali@0: "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo", ali@0: "heside", "chapteb", "chaptee", "se", ali@0: ""}; ali@0: ali@0: char *usertypo[MAX_USER_TYPOS]; ali@0: ali@0: /* Common abbreviations and other OK words not to query as typos. */ ali@0: /* 0.99 last-minute - removed "ms" */ ali@0: char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br", ali@0: "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian", ali@0: "hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten", ali@0: ""}; ali@0: ali@0: /* Common abbreviations that cause otherwise unexplained periods. */ ali@0: char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit", ali@0: "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ali@0: ""}; ali@0: /* Two-Letter combinations that rarely if ever start words, */ ali@0: /* but are common scannos or otherwise common letter */ ali@0: /* combinations. 
*/ ali@0: char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl", ali@0: "tn", "rn", "lt", "tj", ali@0: "" }; ali@0: ali@0: /* Two-Letter combinations that rarely if ever end words */ ali@0: /* but are common scannos or otherwise common letter */ ali@0: /* combinations */ ali@0: char *noend[] = { "cb", "gb", "pb", "sb", "tb", ali@0: "wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl", ali@0: "iy", ali@0: ""}; ali@0: ali@0: char *markup[] = { "a", "b", "big", "blockquote", "body", "br", "center", ali@0: "col", "div", "em", "font", "h1", "h2", "h3", "h4", ali@0: "h5", "h6", "head", "hr", "html", "i", "img", "li", ali@0: "meta", "ol", "p", "pre", "small", "span", "strong", ali@0: "sub", "sup", "table", "td", "tfoot", "thead", "title", ali@0: "tr", "tt", "u", "ul", ali@0: ""}; ali@0: ali@0: char *DPmarkup[] = { "", "", "/*", "*/", "/#", "#/", "/$", "$/", "", ali@0: ""}; /* added .991 */ ali@0: ali@0: char *nocomma[] = { "the", "it's", "their", "an", "mrs", "a", "our", "that's", ali@0: "its", "whose", "every", "i'll", "your", "my", ali@0: "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd", ali@0: "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", ali@0: "i'm", "during", "let", "toward", "among", ali@0: ""}; ali@0: ali@0: ali@0: char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or", ali@0: "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether", ali@0: "i'll", "whose", "who", "because", "when", "let", "till", "very", ali@0: "an", "among", "those", "into", "whom", "having", "thence", ali@0: ""}; ali@0: ali@0: ali@0: char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* Carlo's old suggestion, updated .991 */ ali@0: ali@0: struct { ali@0: char *htmlent; ali@0: char *htmlnum; ali@0: char *textent; ali@0: } entities[] = { "&", "&", "&", ali@0: "<", "<", "<", ali@0: ">", ">", ">", ali@0: "°", "°", " degrees", ali@0: "£", "£", "L", ali@0: """, """, "\"", /* -- quotation mark = APL quote, */ ali@0: "Œ", "Œ", 
"OE", /* -- latin capital ligature OE, */ ali@0: "œ", "œ", "oe", /* -- latin small ligature oe, U+0153 ISOlat2 --> */ ali@0: "Š", "Š", "S", /* -- latin capital letter S with caron, */ ali@0: "š", "š", "s", /* -- latin small letter s with caron, */ ali@0: "Ÿ", "Ÿ", "Y", /* -- latin capital letter Y with diaeresis, */ ali@0: "ˆ", "ˆ", "", /* -- modifier letter circumflex accent, */ ali@0: "˜", "˜", "~", /* -- small tilde, U+02DC ISOdia --> */ ali@0: " ", " ", " ", /* -- en space, U+2002 ISOpub --> */ ali@0: " ", " ", " ", /* -- em space, U+2003 ISOpub --> */ ali@0: " ", " ", " ", /* -- thin space, U+2009 ISOpub --> */ ali@0: "–", "–", "-", /* -- en dash, U+2013 ISOpub --> */ ali@0: "—", "—", "--", /* -- em dash, U+2014 ISOpub --> */ ali@0: "‘", "‘", "'", /* -- left single quotation mark, */ ali@0: "’", "’", "'", /* -- right single quotation mark, */ ali@0: "‚", "‚", "'", /* -- single low-9 quotation mark, U+201A NEW --> */ ali@0: "“", "“", "\"", /* -- left double quotation mark, */ ali@0: "”", "”", "\"", /* -- right double quotation mark, */ ali@0: "„", "„", "\"", /* -- double low-9 quotation mark, U+201E NEW --> */ ali@0: "‹", "‹", "\"", /* -- single left-pointing angle quotation mark, */ ali@0: "›", "›", "\"", /* -- single right-pointing angle quotation mark, */ ali@0: " ", " ", " ", /* -- no-break space = non-breaking space, */ ali@0: "¡", "¡", "!", /* -- inverted exclamation mark, U+00A1 ISOnum --> */ ali@0: "¢", "¢", "c", /* -- cent sign, U+00A2 ISOnum --> */ ali@0: "£", "£", "L", /* -- pound sign, U+00A3 ISOnum --> */ ali@0: "¤", "¤", "$", /* -- currency sign, U+00A4 ISOnum --> */ ali@0: "¥", "¥", "Y", /* -- yen sign = yuan sign, U+00A5 ISOnum --> */ ali@0: "§", "§", "--", /* -- section sign, U+00A7 ISOnum --> */ ali@0: "¨", "¨", " ", /* -- diaeresis = spacing diaeresis, */ ali@0: "©", "©", "(C) ", /* -- copyright sign, U+00A9 ISOnum --> */ ali@0: "ª", "ª", " ", /* -- feminine ordinal indicator, U+00AA ISOnum --> */ ali@0: "«", "«", "\"", /* -- left-pointing 
double angle quotation mark */ ali@0: "­", "­", "-", /* -- soft hyphen = discretionary hyphen, */ ali@0: "®", "®", "(R) ", /* -- registered sign = registered trade mark sign, */ ali@0: "¯", "¯", " ", /* -- macron = spacing macron = overline */ ali@0: "°", "°", " degrees", /* -- degree sign, U+00B0 ISOnum --> */ ali@0: "±", "±", "+-", /* -- plus-minus sign = plus-or-minus sign, */ ali@0: "²", "²", "2", /* -- superscript two = superscript digit two */ ali@0: "³", "³", "3", /* -- superscript three = superscript digit three */ ali@0: "´", "´", " ", /* -- acute accent = spacing acute, */ ali@0: "µ", "µ", "m", /* -- micro sign, U+00B5 ISOnum --> */ ali@0: "¶", "¶", "--", /* -- pilcrow sign = paragraph sign, */ ali@0: "¸", "¸", " ", /* -- cedilla = spacing cedilla, U+00B8 ISOdia --> */ ali@0: "¹", "¹", "1", /* -- superscript one = superscript digit one, */ ali@0: "º", "º", " ", /* -- masculine ordinal indicator, */ ali@0: "»", "»", "\"", /* -- right-pointing double angle quotation mark */ ali@0: "¼", "¼", "1/4", /* -- vulgar fraction one quarter */ ali@0: "½", "½", "1/2", /* -- vulgar fraction one half */ ali@0: "¾", "¾", "3/4", /* -- vulgar fraction three quarters */ ali@0: "¿", "¿", "?", /* -- inverted question mark */ ali@0: "À", "À", "A", /* -- latin capital letter A with grave */ ali@0: "Á", "Á", "A", /* -- latin capital letter A with acute, */ ali@0: "Â", "Â", "A", /* -- latin capital letter A with circumflex, */ ali@0: "Ã", "Ã", "A", /* -- latin capital letter A with tilde, */ ali@0: "Ä", "Ä", "A", /* -- latin capital letter A with diaeresis, */ ali@0: "Å", "Å", "A", /* -- latin capital letter A with ring above */ ali@0: "Æ", "Æ", "AE", /* -- latin capital letter AE */ ali@0: "Ç", "Ç", "C", /* -- latin capital letter C with cedilla, */ ali@0: "È", "È", "E", /* -- latin capital letter E with grave, */ ali@0: "É", "É", "E", /* -- latin capital letter E with acute, */ ali@0: "Ê", "Ê", "E", /* -- latin capital letter E with circumflex, */ ali@0: "Ë", "Ë", "E", /* -- 
latin capital letter E with diaeresis, */ ali@0: "Ì", "Ì", "I", /* -- latin capital letter I with grave, */ ali@0: "Í", "Í", "I", /* -- latin capital letter I with acute, */ ali@0: "Î", "Î", "I", /* -- latin capital letter I with circumflex, */ ali@0: "Ï", "Ï", "I", /* -- latin capital letter I with diaeresis, */ ali@0: "Ð", "Ð", "E", /* -- latin capital letter ETH, U+00D0 ISOlat1 --> */ ali@0: "Ñ", "Ñ", "N", /* -- latin capital letter N with tilde, */ ali@0: "Ò", "Ò", "O", /* -- latin capital letter O with grave, */ ali@0: "Ó", "Ó", "O", /* -- latin capital letter O with acute, */ ali@0: "Ô", "Ô", "O", /* -- latin capital letter O with circumflex, */ ali@0: "Õ", "Õ", "O", /* -- latin capital letter O with tilde, */ ali@0: "Ö", "Ö", "O", /* -- latin capital letter O with diaeresis, */ ali@0: "×", "×", "*", /* -- multiplication sign, U+00D7 ISOnum --> */ ali@0: "Ø", "Ø", "O", /* -- latin capital letter O with stroke */ ali@0: "Ù", "Ù", "U", /* -- latin capital letter U with grave, */ ali@0: "Ú", "Ú", "U", /* -- latin capital letter U with acute, */ ali@0: "Û", "Û", "U", /* -- latin capital letter U with circumflex, */ ali@0: "Ü", "Ü", "U", /* -- latin capital letter U with diaeresis, */ ali@0: "Ý", "Ý", "Y", /* -- latin capital letter Y with acute, */ ali@0: "Þ", "Þ", "TH", /* -- latin capital letter THORN, */ ali@0: "ß", "ß", "sz", /* -- latin small letter sharp s = ess-zed, */ ali@0: "à", "à", "a", /* -- latin small letter a with grave */ ali@0: "á", "á", "a", /* -- latin small letter a with acute, */ ali@0: "â", "â", "a", /* -- latin small letter a with circumflex, */ ali@0: "ã", "ã", "a", /* -- latin small letter a with tilde, */ ali@0: "ä", "ä", "a", /* -- latin small letter a with diaeresis, */ ali@0: "å", "å", "a", /* -- latin small letter a with ring above */ ali@0: "æ", "æ", "ae", /* -- latin small letter ae */ ali@0: "ç", "ç", "c", /* -- latin small letter c with cedilla, */ ali@0: "è", "è", "e", /* -- latin small letter e with grave, */ ali@0: "é", "é", 
"e", /* -- latin small letter e with acute, */ ali@0: "ê", "ê", "e", /* -- latin small letter e with circumflex, */ ali@0: "ë", "ë", "e", /* -- latin small letter e with diaeresis, */ ali@0: "ì", "ì", "i", /* -- latin small letter i with grave, */ ali@0: "í", "í", "i", /* -- latin small letter i with acute, */ ali@0: "î", "î", "i", /* -- latin small letter i with circumflex, */ ali@0: "ï", "ï", "i", /* -- latin small letter i with diaeresis, */ ali@0: "ð", "ð", "eth", /* -- latin small letter eth, U+00F0 ISOlat1 --> */ ali@0: "ñ", "ñ", "n", /* -- latin small letter n with tilde, */ ali@0: "ò", "ò", "o", /* -- latin small letter o with grave, */ ali@0: "ó", "ó", "o", /* -- latin small letter o with acute, */ ali@0: "ô", "ô", "o", /* -- latin small letter o with circumflex, */ ali@0: "õ", "õ", "o", /* -- latin small letter o with tilde, */ ali@0: "ö", "ö", "o", /* -- latin small letter o with diaeresis, */ ali@0: "÷", "÷", "/", /* -- division sign, U+00F7 ISOnum --> */ ali@0: "ø", "ø", "o", /* -- latin small letter o with stroke, */ ali@0: "ù", "ù", "u", /* -- latin small letter u with grave, */ ali@0: "ú", "ú", "u", /* -- latin small letter u with acute, */ ali@0: "û", "û", "u", /* -- latin small letter u with circumflex, */ ali@0: "ü", "ü", "u", /* -- latin small letter u with diaeresis, */ ali@0: "ý", "ý", "y", /* -- latin small letter y with acute, */ ali@0: "þ", "þ", "th", /* -- latin small letter thorn, */ ali@0: "ÿ", "ÿ", "y", /* -- latin small letter y with diaeresis, */ ali@0: "", "" }; ali@0: ali@0: /* ---- list of special characters ---- */ ali@0: #define CHAR_SPACE 32 ali@0: #define CHAR_TAB 9 ali@0: #define CHAR_LF 10 ali@0: #define CHAR_CR 13 ali@0: #define CHAR_DQUOTE 34 ali@0: #define CHAR_SQUOTE 39 ali@0: #define CHAR_OPEN_SQUOTE 96 ali@0: #define CHAR_TILDE 126 ali@0: #define CHAR_ASTERISK 42 ali@0: #define CHAR_FORESLASH 47 ali@0: #define CHAR_CARAT 94 ali@0: ali@0: #define CHAR_UNDERSCORE '_' ali@0: #define CHAR_OPEN_CBRACK '{' ali@0: #define 
CHAR_CLOSE_CBRACK '}' ali@0: #define CHAR_OPEN_RBRACK '(' ali@0: #define CHAR_CLOSE_RBRACK ')' ali@0: #define CHAR_OPEN_SBRACK '[' ali@0: #define CHAR_CLOSE_SBRACK ']' ali@0: ali@0: ali@0: ali@0: ali@0: ali@0: /* ---- longest and shortest normal PG line lengths ----*/ ali@0: #define LONGEST_PG_LINE 75 ali@0: #define WAY_TOO_LONG 80 ali@0: #define SHORTEST_PG_LINE 55 ali@0: ali@0: #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */ ali@0: /* D - ignore DP-specific markup */ ali@0: /* E - echo queried line */ ali@0: /* S - check single quotes */ ali@0: /* T - check common typos */ ali@0: /* P - require closure of quotes on */ ali@0: /* every paragraph */ ali@0: /* X - "Trust no one" :-) Paranoid! */ ali@0: /* Queries everything */ ali@0: /* L - line end checking defaults on */ ali@0: /* -L turns it off */ ali@0: /* O - overview. Just shows counts. */ ali@0: /* Y - puts errors to stdout */ ali@0: /* instead of stderr */ ali@0: /* H - Echoes header fields */ ali@0: /* M - Ignore markup in < > */ ali@0: /* U - Use file of User-defined Typos*/ ali@0: /* W - Defaults for use on Web upload*/ ali@0: /* V - Verbose - list EVERYTHING! 
*/ ali@0: #define SWITNO 14 /* max number of switch parms */ ali@0: /* - used for defining array-size */ ali@0: #define MINARGS 1 /* minimum no of args excl switches */ ali@0: #define MAXARGS 1 /* maximum no of args excl switches */ ali@0: ali@0: int pswit[SWITNO]; /* program switches set by SWITCHES */ ali@0: ali@0: #define ECHO_SWITCH 0 ali@0: #define SQUOTE_SWITCH 1 ali@0: #define TYPO_SWITCH 2 ali@0: #define QPARA_SWITCH 3 ali@0: #define PARANOID_SWITCH 4 ali@0: #define LINE_END_SWITCH 5 ali@0: #define OVERVIEW_SWITCH 6 ali@0: #define STDOUT_SWITCH 7 ali@0: #define HEADER_SWITCH 8 ali@0: #define WEB_SWITCH 9 ali@0: #define VERBOSE_SWITCH 10 ali@0: #define MARKUP_SWITCH 11 ali@0: #define USERTYPO_SWITCH 12 ali@0: #define DP_SWITCH 13 ali@0: ali@0: ali@0: ali@0: long cnt_dquot; /* for overview mode, count of doublequote queries */ ali@0: long cnt_squot; /* for overview mode, count of singlequote queries */ ali@0: long cnt_brack; /* for overview mode, count of brackets queries */ ali@0: long cnt_bin; /* for overview mode, count of non-ASCII queries */ ali@0: long cnt_odd; /* for overview mode, count of odd character queries */ ali@0: long cnt_long; /* for overview mode, count of long line errors */ ali@0: long cnt_short; /* for overview mode, count of short line queries */ ali@0: long cnt_punct; /* for overview mode, count of punctuation and spacing queries */ ali@0: long cnt_dash; /* for overview mode, count of dash-related queries */ ali@0: long cnt_word; /* for overview mode, count of word queries */ ali@0: long cnt_html; /* for overview mode, count of html queries */ ali@0: long cnt_lineend; /* for overview mode, count of line-end queries */ ali@0: long cnt_spacend; /* count of lines with space at end V .21 */ ali@0: long linecnt; /* count of total lines in the file */ ali@0: long checked_linecnt; /* count of lines actually gutchecked V .26 */ ali@0: ali@0: void proghelp(void); ali@0: void procfile(char *); ali@0: ali@0: #define LOW_THRESHOLD 0 ali@0: #define 
HIGH_THRESHOLD 1 ali@0: ali@0: #define START 0 ali@0: #define END 1 ali@0: #define PREV 0 ali@0: #define NEXT 1 ali@0: #define FIRST_OF_PAIR 0 ali@0: #define SECOND_OF_PAIR 1 ali@0: ali@0: #define MAX_WORDPAIR 1000 ali@0: ali@0: char running_from[MAX_PATH]; ali@0: ali@0: int mixdigit(char *); ali@0: char *getaword(char *, char *); ali@0: int matchword(char *, char *); ali@0: char *flgets(char *, int, FILE *, long); ali@0: void lowerit(char *); ali@0: int gcisalpha(unsigned char); ali@0: int gcisdigit(unsigned char); ali@0: int gcisletter(unsigned char); ali@0: char *gcstrchr(char *s, char c); ali@0: void postprocess_for_HTML(char *); ali@0: char *linehasmarkup(char *); ali@0: char *losemarkup(char *); ali@0: int tagcomp(char *, char *); ali@0: char *loseentities(char *); ali@0: int isroman(char *); ali@0: int usertypo_count; ali@0: void postprocess_for_DP(char *); ali@0: ali@0: char wrk[LINEBUFSIZE]; ali@0: ali@0: /* This is disgustingly lazy, predefining max words & lengths, */ ali@0: /* but now I'm out of 16-bit restrictions, what's a couple of K? */ ali@0: #define MAX_QWORD 50 ali@0: #define MAX_QWORD_LENGTH 40 ali@0: char qword[MAX_QWORD][MAX_QWORD_LENGTH]; ali@0: char qperiod[MAX_QWORD][MAX_QWORD_LENGTH]; ali@0: signed int dupcnt[MAX_QWORD]; ali@0: ali@0: ali@0: ali@0: ali@0: int main(int argc, char **argv) ali@0: { ali@0: char *argsw, *s; ali@0: int i, switno, invarg; ali@0: char usertypo_file[MAX_PATH]; ali@0: FILE *usertypofile; ali@0: ali@0: ali@0: if (strlen(argv[0]) < sizeof(running_from)) ali@0: strcpy(running_from, argv[0]); /* save the path to the executable gutcheck */ ali@0: ali@0: /* find out what directory we're running from */ ali@0: for (s = running_from + strlen(running_from); *s != '/' && *s != '\\' && s >= running_from; s--) ali@0: *s = 0; ali@0: ali@0: ali@0: switno = strlen(SWITCHES); ali@0: for (i = switno ; --i >0 ; ) ali@0: pswit[i] = 0; /* initialise switches */ ali@0: ali@0: /* Standard loop to extract switches. 
*/ ali@0: /* When we come out of this loop, the arguments will be */ ali@0: /* in argv[0] upwards and the switches used will be */ ali@0: /* represented by their equivalent elements in pswit[] */ ali@0: while ( --argc > 0 && **++argv == '-') ali@0: for (argsw = argv[0]+1; *argsw !='\0'; argsw++) ali@0: for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; ) ali@0: if ((toupper(*argsw)) == SWITCHES[i] ) { ali@0: invarg = 0; ali@0: pswit[i] = 1; ali@0: } ali@0: ali@0: pswit[PARANOID_SWITCH] ^= 1; /* Paranoid checking is turned OFF, not on, by its switch */ ali@0: ali@0: if (pswit[PARANOID_SWITCH]) { /* if running in paranoid mode */ ali@0: pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1; /* force typo checks as well */ ali@0: } /* v.20 removed s and p switches from paranoid mode */ ali@0: ali@0: pswit[LINE_END_SWITCH] ^= 1; /* Line-end checking is turned OFF, not on, by its switch */ ali@0: pswit[ECHO_SWITCH] ^= 1; /* V.21 Echoing is turned OFF, not on, by its switch */ ali@0: ali@0: if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */ ali@0: pswit[ECHO_SWITCH] = 0; ali@0: ali@0: /* Web uploads - for the moment, this is really just a placeholder */ ali@0: /* until we decide what processing we really want to do on web uploads */ ali@0: if (pswit[WEB_SWITCH]) { /* specific override for web uploads */ ali@0: pswit[ECHO_SWITCH] = 1; ali@0: pswit[SQUOTE_SWITCH] = 0; ali@0: pswit[TYPO_SWITCH] = 1; ali@0: pswit[QPARA_SWITCH] = 0; ali@0: pswit[PARANOID_SWITCH] = 1; ali@0: pswit[LINE_END_SWITCH] = 0; ali@0: pswit[OVERVIEW_SWITCH] = 0; ali@0: pswit[STDOUT_SWITCH] = 0; ali@0: pswit[HEADER_SWITCH] = 1; ali@0: pswit[VERBOSE_SWITCH] = 0; ali@0: pswit[MARKUP_SWITCH] = 0; ali@0: pswit[USERTYPO_SWITCH] = 0; ali@0: pswit[DP_SWITCH] = 0; ali@0: } ali@0: ali@0: ali@0: if (argc < MINARGS || argc > MAXARGS) { /* check number of args */ ali@0: proghelp(); ali@0: return(1); /* exit */ ali@0: } ali@0: ali@0: ali@0: /* read in the user-defined stealth scanno list */ ali@0: ali@0: 
if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */ ali@0: if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) { /* not in cwd. try gutcheck directory. */ ali@0: strcpy(usertypo_file, running_from); ali@0: strcat(usertypo_file, USERTYPO_FILE); ali@0: if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) { /* we ain't got no user typo file! */ ali@0: printf(" --> I couldn't find gutcheck.typ -- proceeding without user typos.\n"); ali@0: } ali@0: } ali@0: ali@0: usertypo_count = 0; ali@0: if (usertypofile) { /* we managed to open a User Typo File! */ ali@0: if (pswit[USERTYPO_SWITCH]) { ali@0: while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) { ali@0: if (strlen(aline) > 1) { ali@0: if ((int)*aline > 33) { ali@0: s = malloc(strlen(aline)+1); ali@0: if (!s) { ali@0: fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n"); ali@0: exit(1); ali@0: } ali@0: strcpy(s, aline); ali@0: usertypo[usertypo_count] = s; ali@0: usertypo_count++; ali@0: if (usertypo_count >= MAX_USER_TYPOS) { ali@0: printf(" --> Only %d user-defined typos allowed: ignoring the rest\n"); ali@0: break; ali@0: } ali@0: } ali@0: } ali@0: } ali@0: } ali@0: fclose(usertypofile); ali@0: } ali@0: } ali@0: ali@0: ali@0: ali@0: ali@0: fprintf(stderr, "gutcheck: Check and report on an e-text\n"); ali@0: ali@0: cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long = ali@0: cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend = ali@0: cnt_spacend = 0; ali@0: ali@0: procfile(argv[0]); ali@0: ali@0: if (pswit[OVERVIEW_SWITCH]) { ali@0: printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n", ali@0: checked_linecnt, linecnt, linecnt - checked_linecnt); ali@0: printf(" --------------- Queries found --------------\n"); ali@0: if (cnt_long) printf(" Long lines: %5ld\n",cnt_long); ali@0: if (cnt_short) printf(" Short lines: %5ld\n",cnt_short); ali@0: if (cnt_lineend) printf(" Line-end problems: %5ld\n",cnt_lineend); ali@0: if 
(cnt_word) printf(" Common typos: %5ld\n",cnt_word); ali@0: if (cnt_dquot) printf(" Unmatched quotes: %5ld\n",cnt_dquot); ali@0: if (cnt_squot) printf(" Unmatched SingleQuotes: %5ld\n",cnt_squot); ali@0: if (cnt_brack) printf(" Unmatched brackets: %5ld\n",cnt_brack); ali@0: if (cnt_bin) printf(" Non-ASCII characters: %5ld\n",cnt_bin); ali@0: if (cnt_odd) printf(" Proofing characters: %5ld\n",cnt_odd); ali@0: if (cnt_punct) printf(" Punctuation & spacing queries: %5ld\n",cnt_punct); ali@0: if (cnt_dash) printf(" Non-standard dashes: %5ld\n",cnt_dash); ali@0: if (cnt_html) printf(" Possible HTML tags: %5ld\n",cnt_html); ali@0: printf("\n"); ali@0: printf(" TOTAL QUERIES %5ld\n", ali@0: cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long + ali@0: cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend); ali@0: } ali@0: ali@0: return(0); ali@0: } ali@0: ali@0: ali@0: ali@0: /* procfile - process one file */ ali@0: ali@0: void procfile(char *filename) ali@0: { ali@0: ali@0: char *s, *t, *s1, laststart, *wordstart; ali@0: char inword[MAXWORDLEN], testword[MAXWORDLEN]; ali@0: char parastart[81]; /* first line of current para */ ali@0: FILE *infile; ali@0: long quot, squot, firstline, alphalen, totlen, binlen, ali@0: shortline, longline, verylongline, spacedash, emdash, ali@0: space_emdash, non_PG_space_emdash, PG_space_emdash, ali@0: footerline, dotcomma, start_para_line, astline, fslashline, ali@0: standalone_digit, hyphens, htmcount, endquote_count; ali@0: long spline, nspline; ali@0: signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower, ali@0: eNon_A, eTab, eTilde, eAst, eFSlash, eCarat; ali@0: signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma, ali@0: warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote; ali@0: unsigned int lastlen, lastblen; ali@0: signed int s_brack, c_brack, r_brack, c_unders; ali@0: signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar; ali@0: 
signed int isnewpara, vowel, consonant; ali@0: char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80], ali@0: unders_err[80]; ali@0: signed int qword_index, qperiod_index, isdup; ali@0: signed int enddash; ali@0: signed int Dutchcount, isDutch, Frenchcount, isFrench; ali@0: ali@0: ali@0: ali@0: ali@0: ali@0: laststart = CHAR_SPACE; ali@0: lastlen = lastblen = 0; ali@0: *dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err = ali@0: *unders_err = *prevline = 0; ali@0: linecnt = firstline = alphalen = totlen = binlen = ali@0: shortline = longline = spacedash = emdash = checked_linecnt = ali@0: space_emdash = non_PG_space_emdash = PG_space_emdash = ali@0: footerline = dotcomma = start_para_line = astline = fslashline = ali@0: standalone_digit = hyphens = htmcount = endquote_count = 0; ali@0: quot = squot = s_brack = c_brack = r_brack = c_unders = 0; ali@0: i = llen = isemptyline = isacro = isellipsis = istypo = 0; ali@0: warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma = ali@0: warn_ast = warn_fslash = warn_digit = warn_endquote = 0; ali@0: isnewpara = vowel = consonant = enddash = 0; ali@0: spline = nspline = 0; ali@0: qword_index = qperiod_index = isdup = 0; ali@0: *inword = *testword = 0; ali@0: open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0; ali@0: Dutchcount = isDutch = Frenchcount = isFrench = 0; ali@0: ali@0: ali@0: for (j = 0; j < MAX_QWORD; j++) { ali@0: dupcnt[j] = 0; ali@0: for (i = 0; i < MAX_QWORD_LENGTH; i++) ali@0: qword[i][j] = 0; ali@0: qperiod[i][j] = 0; ali@0: } ali@0: ali@0: ali@0: if ((infile = fopen(filename, "rb")) == NULL) { ali@0: if (pswit[STDOUT_SWITCH]) ali@0: fprintf(stdout, "gutcheck: cannot open %s\n", filename); ali@0: else ali@0: fprintf(stderr, "gutcheck: cannot open %s\n", filename); ali@0: exit(1); ali@0: } ali@0: ali@0: fprintf(stdout, "\n\nFile: %s\n\n", filename); ali@0: firstline = shortline = longline = verylongline = 0; ali@0: ali@0: ali@0: 
/*****************************************************/ ali@0: /* */ ali@0: /* Run a first pass - verify that it's a valid PG */ ali@0: /* file, decide whether to report some things that */ ali@0: /* occur many times in the text like long or short */ ali@0: /* lines, non-standard dashes, and other good stuff */ ali@0: /* I'll doubtless think of later. */ ali@0: /* */ ali@0: /*****************************************************/ ali@0: ali@0: /*****************************************************/ ali@0: /* V.24 Sigh. Yet Another Header Change */ ali@0: /*****************************************************/ ali@0: ali@0: while (fgets(aline, LINEBUFSIZE-1, infile)) { ali@0: while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0; ali@0: linecnt++; ali@0: if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) { ali@0: if (spline) ali@0: printf(" --> Duplicate header?\n"); ali@0: spline = linecnt + 1; /* first line of non-header text, that is */ ali@0: } ali@0: if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) { ali@0: if (nspline) ali@0: printf(" --> Duplicate header?\n"); ali@0: nspline = linecnt + 1; /* first line of non-header text, that is */ ali@0: } ali@0: if (spline || nspline) { ali@0: lowerit(aline); ali@0: if (strstr(aline, "end") && strstr(aline, "project gutenberg")) { ali@0: if (strstr(aline, "end") < strstr(aline, "project gutenberg")) { ali@0: if (footerline) { ali@0: if (!nspline) /* it's an old-form header - we can detect duplicates */ ali@0: printf(" --> Duplicate footer?\n"); ali@0: else ali@0: ; ali@0: } ali@0: else { ali@0: footerline = linecnt; ali@0: } ali@0: } ali@0: } ali@0: } ali@0: if (spline) firstline = spline; ali@0: if (nspline) firstline = nspline; /* override with new */ ali@0: ali@0: if (footerline) continue; /* 0.99+ don't count the boilerplate in the footer */ ali@0: ali@0: llen = strlen(aline); 
ali@0: totlen += llen; ali@0: for (i = 0; i < llen; i++) { ali@0: if ((unsigned char)aline[i] > 127) binlen++; ali@0: if (gcisalpha(aline[i])) alphalen++; ali@0: if (i > 0) ali@0: if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1])) ali@0: endquote_count++; ali@0: } ali@0: if (strlen(aline) > 2 ali@0: && lastlen > 2 && lastlen < SHORTEST_PG_LINE ali@0: && lastblen > 2 && lastblen > SHORTEST_PG_LINE ali@0: && laststart != CHAR_SPACE) ali@0: shortline++; ali@0: ali@0: if (*aline) /* fixed line below for 0.96 */ ali@0: if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++; ali@0: ali@0: if (strstr(aline, ".,")) dotcomma++; ali@0: /* 0.98 only count ast lines for ignoring purposes where there is */ ali@0: /* locase text on the line */ ali@0: if (strstr(aline, "*")) { ali@0: for (s = aline; *s; s++) ali@0: if (*s >='a' && *s <= 'z') ali@0: break; ali@0: if (*s) astline++; ali@0: } ali@0: if (strstr(aline, "/")) ali@0: fslashline++; ali@0: for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--); ali@0: if (aline[i] == '-' && aline[i-1] != '-') hyphens++; ali@0: ali@0: if (llen > LONGEST_PG_LINE) longline++; ali@0: if (llen > WAY_TOO_LONG) verylongline++; ali@0: ali@0: if (strstr(aline, "<") && strstr(aline, ">")) { ali@0: i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1); ali@0: if (i > 0) ali@0: htmcount++; ali@0: if (strstr(aline, "")) htmcount +=4; /* bonus marks! 
*/ ali@0: } ali@0: ali@0: /* Check for spaced em-dashes */ ali@0: if (strstr(aline,"--")) { ali@0: emdash++; ali@0: if (*(strstr(aline, "--")-1) == CHAR_SPACE || ali@0: (*(strstr(aline, "--")+2) == CHAR_SPACE)) ali@0: space_emdash++; ali@0: if (*(strstr(aline, "--")-1) == CHAR_SPACE && ali@0: (*(strstr(aline, "--")+2) == CHAR_SPACE)) ali@0: non_PG_space_emdash++; /* count of em-dashes with spaces both sides */ ali@0: if (*(strstr(aline, "--")-1) != CHAR_SPACE && ali@0: (*(strstr(aline, "--")+2) != CHAR_SPACE)) ali@0: PG_space_emdash++; /* count of PG-type em-dashes with no spaces */ ali@0: } ali@0: ali@0: for (s = aline; *s;) { ali@0: s = getaword(s, inword); ali@0: if (!strcmp(inword, "hij") || !strcmp(inword, "niet")) ali@0: Dutchcount++; ali@0: if (!strcmp(inword, "dans") || !strcmp(inword, "avec")) ali@0: Frenchcount++; ali@0: if (!strcmp(inword, "0") || !strcmp(inword, "1")) ali@0: standalone_digit++; ali@0: } ali@0: ali@0: /* Check for spaced dashes */ ali@0: if (strstr(aline," -")) ali@0: if (*(strstr(aline, " -")+2) != '-') ali@0: spacedash++; ali@0: lastblen = lastlen; ali@0: lastlen = strlen(aline); ali@0: laststart = aline[0]; ali@0: ali@0: } ali@0: fclose(infile); ali@0: ali@0: ali@0: /* now, based on this quick view, make some snap decisions */ ali@0: if (cnt_spacend > 0) { ali@0: printf(" --> %ld lines in this file have white space at end\n", cnt_spacend); ali@0: } ali@0: ali@0: warn_dotcomma = 1; ali@0: if (dotcomma > 5) { ali@0: warn_dotcomma = 0; ali@0: printf(" --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma); ali@0: } ali@0: ali@0: /* if more than 50 lines, or one-tenth, are short, don't bother reporting them */ ali@0: warn_short = 1; ali@0: if (shortline > 50 || shortline * 10 > linecnt) { ali@0: warn_short = 0; ali@0: printf(" --> %ld lines in this file are short. 
Not reporting short lines.\n", shortline); ali@0: } ali@0: ali@0: /* if more than 50 lines, or one-tenth, are long, don't bother reporting them */ ali@0: warn_long = 1; ali@0: if (longline > 50 || longline * 10 > linecnt) { ali@0: warn_long = 0; ali@0: printf(" --> %ld lines in this file are long. Not reporting long lines.\n", longline); ali@0: } ali@0: ali@0: /* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */ ali@0: warn_ast = 1; ali@0: if (astline > 10 ) { ali@0: warn_ast = 0; ali@0: printf(" --> %ld lines in this file contain asterisks. Not reporting them.\n", astline); ali@0: } ali@0: ali@0: /* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */ ali@0: warn_fslash = 1; ali@0: if (fslashline > 10 ) { ali@0: warn_fslash = 0; ali@0: printf(" --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline); ali@0: } ali@0: ali@0: /* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */ ali@0: warn_endquote = 1; ali@0: if (endquote_count > 20 ) { ali@0: warn_endquote = 0; ali@0: printf(" --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count); ali@0: } ali@0: ali@0: /* if more than 15 lines contain standalone digits, don't bother reporting them V.0.97 */ ali@0: warn_digit = 1; ali@0: if (standalone_digit > 10 ) { ali@0: warn_digit = 0; ali@0: printf(" --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit); ali@0: } ali@0: ali@0: /* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */ ali@0: warn_hyphen = 1; ali@0: if (hyphens > 20 ) { ali@0: warn_hyphen = 0; ali@0: printf(" --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens); ali@0: } ali@0: ali@0: if (htmcount > 20 && !pswit[MARKUP_SWITCH]) { ali@0: printf(" --> Looks like this is HTML. 
Switching HTML mode ON.\n"); ali@0: pswit[MARKUP_SWITCH] = 1; ali@0: } ali@0: ali@0: if (verylongline > 0) { ali@0: printf(" --> %ld lines in this file are VERY long!\n", verylongline); ali@0: } ali@0: ali@0: /* If there are more non-PG spaced dashes than PG em-dashes, */ ali@0: /* assume it's deliberate */ ali@0: /* Current PG guidelines say don't use them, but older texts do,*/ ali@0: /* and some people insist on them whatever the guidelines say. */ ali@0: /* V.20 removed requirement that PG_space_emdash be greater than*/ ali@0: /* ten before turning off warnings about spaced dashes. */ ali@0: warn_dash = 1; ali@0: if (spacedash + non_PG_space_emdash > PG_space_emdash) { ali@0: warn_dash = 0; ali@0: printf(" --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash); ali@0: } ali@0: ali@0: /* if more than a quarter of characters are hi-bit, bug out */ ali@0: warn_bin = 1; ali@0: if (binlen * 4 > totlen) { ali@0: printf(" --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n"); ali@0: exit(1); ali@0: } ali@0: if (alphalen * 4 < totlen) { ali@0: printf(" --> This file does not appear to be text. Terminating. Best of luck with it!\n"); ali@0: exit(1); ali@0: } ali@0: if ((binlen * 100 > totlen) || (binlen > 100)) { ali@0: printf(" --> There are a lot of foreign letters here. 
Not reporting them.\n"); ali@0: warn_bin = 0; ali@0: } ali@0: ali@0: /* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */ ali@0: isDutch = 0; ali@0: if (Dutchcount > 50) { ali@0: isDutch = 1; ali@0: printf(" --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n"); ali@0: } ali@0: ali@0: isFrench = 0; ali@0: if (Frenchcount > 50) { ali@0: isFrench = 1; ali@0: printf(" --> This looks like French - switching off some doublepunct.\n"); ali@0: } ali@0: ali@0: if (firstline && footerline) ali@0: printf(" The PG header and footer appear to be already on.\n"); ali@0: else { ali@0: if (firstline) ali@0: printf(" The PG header is on - no footer.\n"); ali@0: if (footerline) ali@0: printf(" The PG footer is on - no header.\n"); ali@0: } ali@0: printf("\n"); ali@0: ali@0: /* V.22 George Davis asked for an override switch to force it to list everything */ ali@0: if (pswit[VERBOSE_SWITCH]) { ali@0: warn_bin = 1; ali@0: warn_short = 1; ali@0: warn_dotcomma = 1; ali@0: warn_long = 1; ali@0: warn_dash = 1; ali@0: warn_digit = 1; ali@0: warn_ast = 1; ali@0: warn_fslash = 1; ali@0: warn_hyphen = 1; ali@0: warn_endquote = 1; ali@0: printf(" *** Verbose output is ON -- you asked for it! ***\n"); ali@0: } ali@0: ali@0: if (isDutch) ali@0: warn_dash = 0; /* Frank suggested turning it REALLY off for Dutch */ ali@0: ali@0: if ((infile = fopen(filename, "rb")) == NULL) { ali@0: if (pswit[STDOUT_SWITCH]) ali@0: fprintf(stdout, "gutcheck: cannot open %s\n", filename); ali@0: else ali@0: fprintf(stderr, "gutcheck: cannot open %s\n", filename); ali@0: exit(1); ali@0: } ali@0: ali@0: if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */ ali@0: printf(" --> I don't really know where this text starts. 
\n"); ali@0: printf(" There are no reference points.\n"); ali@0: printf(" I'm going to have to report the header and footer as well.\n"); ali@0: firstline=0; ali@0: } ali@0: ali@0: ali@0: ali@0: /*****************************************************/ ali@0: /* */ ali@0: /* Here we go with the main pass. Hold onto yer hat! */ ali@0: /* */ ali@0: /*****************************************************/ ali@0: ali@0: /* Re-init some variables we've dirtied */ ali@0: quot = squot = linecnt = 0; ali@0: laststart = CHAR_SPACE; ali@0: lastlen = lastblen = 0; ali@0: ali@0: while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) { ali@0: linecnt++; ali@0: if (linecnt == 1) isnewpara = 1; ali@0: if (pswit[DP_SWITCH]) ali@0: if (!strncmp(aline, "-----File: ", 11)) ali@0: continue; // skip DP page separators completely ali@0: if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) { ali@0: if (pswit[HEADER_SWITCH]) { ali@0: if (!strncmp(aline, "Title:", 6)) ali@0: printf(" %s\n", aline); ali@0: if (!strncmp (aline, "Author:", 7)) ali@0: printf(" %s\n", aline); ali@0: if (!strncmp(aline, "Release Date:", 13)) ali@0: printf(" %s\n", aline); ali@0: if (!strncmp(aline, "Edition:", 8)) ali@0: printf(" %s\n\n", aline); ali@0: } ali@0: continue; /* skip through the header */ ali@0: } ali@0: checked_linecnt++; ali@0: s = aline; ali@0: isemptyline = 1; /* assume the line is empty until proven otherwise */ ali@0: ali@0: /* If we are in a state of unbalanced quotes, and this line */ ali@0: /* doesn't begin with a quote, output the stored error message */ ali@0: /* If the -P switch was used, print the warning even if the */ ali@0: /* new para starts with quotes */ ali@0: /* Version .20 - if the new paragraph does start with a quote, */ ali@0: /* but is indented, I was giving a spurious error. Need to */ ali@0: /* check the first _non-space_ character on the line rather */ ali@0: /* than the first character when deciding whether the para */ ali@0: /* starts with a quote. 
Using *t for this. */ ali@0: t = s; ali@0: while (*t == ' ') t++; ali@0: if (*dquote_err) ali@0: if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) { ali@0: if (!pswit[OVERVIEW_SWITCH]) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); ali@0: printf(dquote_err); ali@0: } ali@0: else ali@0: cnt_dquot++; ali@0: } ali@0: if (*squote_err) { ali@0: if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) { ali@0: if (!pswit[OVERVIEW_SWITCH]) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); ali@0: printf(squote_err); ali@0: } ali@0: else ali@0: cnt_squot++; ali@0: } ali@0: squot = 0; ali@0: } ali@0: if (*rbrack_err) { ali@0: if (!pswit[OVERVIEW_SWITCH]) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); ali@0: printf(rbrack_err); ali@0: } ali@0: else ali@0: cnt_brack++; ali@0: } ali@0: if (*sbrack_err) { ali@0: if (!pswit[OVERVIEW_SWITCH]) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); ali@0: printf(sbrack_err); ali@0: } ali@0: else ali@0: cnt_brack++; ali@0: } ali@0: if (*cbrack_err) { ali@0: if (!pswit[OVERVIEW_SWITCH]) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); ali@0: printf(cbrack_err); ali@0: } ali@0: else ali@0: cnt_brack++; ali@0: } ali@0: if (*unders_err) { ali@0: if (!pswit[OVERVIEW_SWITCH]) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); ali@0: printf(unders_err); ali@0: } ali@0: else ali@0: cnt_brack++; ali@0: } ali@0: ali@0: *dquote_err = *squote_err = *rbrack_err = *cbrack_err = ali@0: *sbrack_err = *unders_err = 0; ali@0: ali@0: ali@0: /* look along the line, accumulate the count of quotes, and see */ ali@0: /* if this is an empty line - i.e. a line with nothing on it */ ali@0: /* but spaces. */ ali@0: /* V .12 also if line has just spaces, * and/or - on it, don't */ ali@0: /* count it, since empty lines with asterisks or dashes to */ ali@0: /* separate sections are common. 
*/ ali@0: /* V .15 new single-quote checking - has to be better than the */ ali@0: /* previous version, but how much better? fingers crossed! */ ali@0: /* V .20 add period to * and - as characters on a separator line*/ ali@0: s = aline; ali@0: while (*s) { ali@0: if (*s == CHAR_DQUOTE) quot++; ali@0: if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE) ali@0: if (s == aline) { /* at start of line, it can only be an openquote */ ali@0: if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */ ali@0: open_single_quote++; ali@0: } ali@0: else ali@0: if (gcisalpha(*(s-1)) && gcisalpha(*(s+1))) ali@0: ; /* do nothing! - it's definitely an apostrophe, not a quote */ ali@0: else /* it's outside a word - let's check it out */ ali@0: if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */ ali@0: if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */ ali@0: open_single_quote++; ali@0: } ali@0: else { /* now - is it a closequote? */ ali@0: guessquote = 0; /* accumulate clues */ ali@0: if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */ ali@0: guessquote += 1; ali@0: if (*(s-1) == 's') { /* looks like a plural apostrophe */ ali@0: guessquote -= 3; ali@0: if (*(s+1) == CHAR_SPACE) /* bonus marks! */ ali@0: guessquote -= 2; ali@0: } ali@0: } ali@0: else /* it doesn't have a letter either side */ ali@0: if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1)))) ali@0: guessquote += 8; /* looks like a closequote */ ali@0: else ali@0: guessquote += 1; ali@0: if (open_single_quote > close_single_quote) ali@0: guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */ ali@0: else ali@0: guessquote -= 1; ali@0: if (guessquote >= 0) ali@0: close_single_quote++; ali@0: } ali@0: ali@0: if (*s != CHAR_SPACE ali@0: && *s != '-' ali@0: && *s != '.' 
ali@0: && *s != CHAR_ASTERISK ali@0: && *s != 13 ali@0: && *s != 10) isemptyline = 0; /* ignore lines like * * * as spacers */ ali@0: if (*s == CHAR_UNDERSCORE) c_unders++; ali@0: if (*s == CHAR_OPEN_CBRACK) c_brack++; ali@0: if (*s == CHAR_CLOSE_CBRACK) c_brack--; ali@0: if (*s == CHAR_OPEN_RBRACK) r_brack++; ali@0: if (*s == CHAR_CLOSE_RBRACK) r_brack--; ali@0: if (*s == CHAR_OPEN_SBRACK) s_brack++; ali@0: if (*s == CHAR_CLOSE_SBRACK) s_brack--; ali@0: s++; ali@0: } ali@0: ali@0: if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph */ ali@0: start_para_line = linecnt; ali@0: strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */ ali@0: parastart[79] = 0; ali@0: dquotepar = squotepar = 0; /* restart the quote count 0.98 */ ali@0: s = aline; ali@0: while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++; /* V.97 fixed bug - overran line and gave false warning - rare */ ali@0: if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */ ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: isnewpara = 0; /* Signal the end of new para processing */ ali@0: } ali@0: ali@0: /* Check for an em-dash broken at line end */ ali@0: if (enddash && *aline == '-') { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column 1 - Broken em-dash?\n", linecnt); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: enddash = 0; ali@0: for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--); ali@0: if (s >= aline && *s == '-') ali@0: enddash = 1; ali@0: ali@0: ali@0: /* Check for invalid or questionable characters in the line */ ali@0: /* Anything above 127 is invalid for plain ASCII, and */ ali@0: /* non-printable control characters should also be flagged. 
*/ ali@0: /* Tabs should generally not be there. */ ali@0: /* Jan 06, in 0.99: Hm. For some strange reason, I either */ ali@0: /* never created or deleted the check for unprintable */ ali@0: /* control characters. They should be reported even if */ ali@0: /* warn_bin is on, I think, and in full. */ ali@0: ali@0: for (s = aline; *s; s++) { ali@0: i = (unsigned char) *s; ali@0: if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i); ali@0: else ali@0: cnt_bin++; ali@0: } ali@0: } ali@0: ali@0: if (warn_bin) { ali@0: eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0; /* don't repeat multiple warnings on one line */ ali@0: for (s = aline; *s; s++) { ali@0: if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) { ali@0: i = *s; /* annoying kludge for signed chars */ ali@0: if (i < 0) i += 256; ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: if (i > 127 && i < 160) ali@0: printf(" Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i); ali@0: else ali@0: printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i); ali@0: else ali@0: cnt_bin++; ali@0: eNon_A = 1; ali@0: } ali@0: if (!eTab && *s == CHAR_TAB) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1); ali@0: else ali@0: cnt_odd++; ali@0: eTab = 1; ali@0: } ali@0: if (!eTilde && *s == CHAR_TILDE) { /* often used by OCR software to indicate an unrecognizable character */ ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Tilde character?\n", linecnt, (int) (s - 
aline) + 1); ali@0: else ali@0: cnt_odd++; ali@0: eTilde = 1; ali@0: } ali@0: if (!eCarat && *s == CHAR_CARAT) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1); ali@0: else ali@0: cnt_odd++; ali@0: eCarat = 1; ali@0: } ali@0: if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1); ali@0: else ali@0: cnt_odd++; ali@0: eFSlash = 1; ali@0: } ali@0: /* report asterisks only in paranoid mode, since they're often deliberate */ ali@0: if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1); ali@0: else ali@0: cnt_odd++; ali@0: eAst = 1; ali@0: } ali@0: } ali@0: } ali@0: ali@0: /* Check for line too long */ ali@0: if (warn_long) { ali@0: if (strlen(aline) > LONGEST_PG_LINE) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline)); ali@0: else ali@0: cnt_long++; ali@0: } ali@0: } ali@0: ali@0: /* Check for line too short. */ ali@0: /* This one is a bit trickier to implement: we don't want to */ ali@0: /* flag the last line of a paragraph for being short, so we */ ali@0: /* have to wait until we know that our current line is a */ ali@0: /* "normal" line, then report the _previous_ line if it was too */ ali@0: /* short. We also don't want to report indented lines like */ ali@0: /* chapter heads or formatted quotations. 
We therefore keep */ ali@0: /* lastlen as the length of the last line examined, and */ ali@0: /* lastblen as the length of the last but one, and try to */ ali@0: /* suppress unnecessary warnings by checking that both were of */ ali@0: /* "normal" length. We keep the first character of the last */ ali@0: /* line in laststart, and if it was a space, we assume that the */ ali@0: /* formatting is deliberate. I can't figure out a way to */ ali@0: /* distinguish something like a quoted verse left-aligned or */ ali@0: /* the header or footer of a letter from a paragraph of short */ ali@0: /* lines - maybe if I examined the whole paragraph, and if the */ ali@0: /* para has less than, say, 8 lines and if all lines are short, */ ali@0: /* then just assume it's OK? Need to look at some texts to see */ ali@0: /* how often a formula like this would get the right result. */ ali@0: /* V0.99 changed the tolerance for length to ignore from 2 to 1 */ ali@0: if (warn_short) { ali@0: if (strlen(aline) > 1 ali@0: && lastlen > 1 && lastlen < SHORTEST_PG_LINE ali@0: && lastblen > 1 && lastblen > SHORTEST_PG_LINE ali@0: && laststart != CHAR_SPACE) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline)); ali@0: else ali@0: cnt_short++; ali@0: } ali@0: } ali@0: lastblen = lastlen; ali@0: lastlen = strlen(aline); ali@0: laststart = aline[0]; ali@0: ali@0: /* look for punctuation at start of line */ ali@0: if (*aline && strchr(".?!,;:", aline[0])) { /* if it's punctuation */ ali@0: if (strncmp(". . 
.", aline, 5)) { /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */ ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: ali@0: /* Check for spaced em-dashes */ ali@0: /* V.20 must check _all_ occurrences of "--" on the line */ ali@0: /* hence the loop - even if the first double-dash is OK */ ali@0: /* there may be another that's wrong later on. */ ali@0: if (warn_dash) { ali@0: s = aline; ali@0: while (strstr(s,"--")) { ali@0: if (*(strstr(s, "--")-1) == CHAR_SPACE || ali@0: (*(strstr(s, "--")+2) == CHAR_SPACE)) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1); ali@0: else ali@0: cnt_dash++; ali@0: } ali@0: s = strstr(s,"--") + 2; ali@0: } ali@0: } ali@0: ali@0: /* Check for spaced dashes */ ali@0: if (warn_dash) ali@0: if (strstr(aline," -")) { ali@0: if (*(strstr(aline, " -")+2) != '-') { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1); ali@0: else ali@0: cnt_dash++; ali@0: } ali@0: } ali@0: else ali@0: if (strstr(aline,"- ")) { ali@0: if (*(strstr(aline, "- ")-1) != '-') { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1); ali@0: else ali@0: cnt_dash++; ali@0: } ali@0: } ali@0: ali@0: /* v 0.99 */ ali@0: /* Check for unmarked paragraphs indicated by separate speakers */ ali@0: /* May well be false positive: */ ali@0: /* "Bravo!" "Wonderful!" called the crowd. */ ali@0: /* but useful all the same. 
*/ ali@0: s = wrk; ali@0: *s = 0; ali@0: if (strstr(aline, "\" \"")) s = strstr(aline, "\" \""); ali@0: if (strstr(aline, "\" \"")) s = strstr(aline, "\" \""); ali@0: if (*s) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: ali@0: ali@0: ali@0: /* Check for "to he" and other easy he/be errors */ ali@0: /* This is a very inadequate effort on the he/be problem, */ ali@0: /* but the phrase "to he" is always an error, whereas "to */ ali@0: /* be" is quite common. I chuckle when it does catch one! */ ali@0: /* Similarly, '"Quiet!", be said.' is a non-be error */ ali@0: /* V .18 - "to he" is _not_ always an error!: */ ali@0: /* "Where they went to he couldn't say." */ ali@0: /* but I'm leaving it in anyway. */ ali@0: /* V .20 Another false positive: */ ali@0: /* What would "Cinderella" be without the . . . */ ali@0: /* and another "If he wants to he can see for himself." */ ali@0: /* V .21 Added " is be " and " be is " and " be was " */ ali@0: /* V .99 Added jeebies code -- removed again. */ ali@0: /* Is jeebies code worth adding? Rare to see he/be */ ali@0: /* errors with modern OCR. Separate program? Yes! */ ali@0: /* jeebies does the job without cluttering up this. */ ali@0: /* We do get a few more queryable pairs from the */ ali@0: /* project though -- they're cheap to implement. */ ali@0: /* Also added a column number for guiguts. 
*/ ali@0: ali@0: s = wrk; ali@0: *s = 0; ali@0: if (strstr(aline," to he ")) s = strstr(aline," to he "); ali@0: if (strstr(aline,"\" be ")) s = strstr(aline,"\" be "); ali@0: if (strstr(aline,"\", be ")) s = strstr(aline,"\", be "); ali@0: if (strstr(aline," is be ")) s = strstr(aline," is be "); ali@0: if (strstr(aline," be is ")) s = strstr(aline," be is "); ali@0: if (strstr(aline," was be ")) s = strstr(aline," was be "); ali@0: if (strstr(aline," be would ")) s = strstr(aline," be would "); ali@0: if (strstr(aline," be could ")) s = strstr(aline," be could "); ali@0: if (*s) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1); ali@0: else ali@0: cnt_word++; ali@0: } ali@0: ali@0: s = wrk; ali@0: *s = 0; ali@0: if (strstr(aline," i bad ")) s = strstr(aline," i bad "); ali@0: if (strstr(aline," you bad ")) s = strstr(aline," you bad "); ali@0: if (strstr(aline," he bad ")) s = strstr(aline," he bad "); ali@0: if (strstr(aline," she bad ")) s = strstr(aline," she bad "); ali@0: if (strstr(aline," they bad ")) s = strstr(aline," they bad "); ali@0: if (strstr(aline," a had ")) s = strstr(aline," a had "); ali@0: if (strstr(aline," the had ")) s = strstr(aline," the had "); ali@0: if (*s) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1); ali@0: else ali@0: cnt_word++; ali@0: } ali@0: ali@0: ali@0: /* V .97 Added ", hut " Not too common, hut pretty certain */ ali@0: /* V.99 changed to add a column number for guiguts */ ali@0: s = wrk; ali@0: *s = 0; ali@0: if (strstr(aline,", hut ")) s = strstr(aline,", hut "); ali@0: if (strstr(aline,"; hut ")) s = strstr(aline,"; hut "); ali@0: if (*s) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line 
%ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1); ali@0: else ali@0: cnt_word++; ali@0: } ali@0: ali@0: /* Special case - angled bracket in front of "From" placed there by an MTA */ ali@0: /* when sending an e-mail. V .21 */ ali@0: if (strstr(aline, ">From")) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: ali@0: /* V 0.98 Check for a single character line - often an overflow from bad wrapping. */ ali@0: if (*aline && !*(aline+1)) { ali@0: if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline)) ali@0: ; /* nothing - ignore numerals alone on a line. */ ali@0: else { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column 1 - Query single character line\n", linecnt); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: ali@0: /* V 0.98 Check for I" - often should be ! */ ali@0: if (strstr(aline, " I\"")) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: ali@0: /* V 0.98 Check for period without a capital letter. Cut-down from gutspell */ ali@0: /* Only works when it happens on a single line. */ ali@0: ali@0: if (pswit[PARANOID_SWITCH]) ali@0: for (t = s = aline; strstr(t,". ");) { ali@0: t = strstr(t, ". 
"); ali@0: if (t == s) { ali@0: t++; ali@0: continue; /* start of line punctuation is handled elsewhere */ ali@0: } ali@0: if (!gcisalpha(*(t-1))) { ali@0: t++; ali@0: continue; ali@0: } ali@0: if (isDutch) { /* For Frank & Jeroen -- 's Middags case */ ali@0: if (*(t+2) == CHAR_SQUOTE && ali@0: *(t+3)>='a' && *(t+3)<='z' && ali@0: *(t+4) == CHAR_SPACE && ali@0: *(t+5)>='A' && *(t+5)<='Z') { ali@0: t++; ali@0: continue; ali@0: } ali@0: } ali@0: s1 = t+2; ali@0: while (*s1 && !gcisalpha(*s1) && !isdigit(*s1)) ali@0: s1++; ali@0: if (*s1 >= 'a' && *s1 <= 'z') { /* we have something to investigate */ ali@0: istypo = 1; ali@0: for (s1 = t - 1; s1 >= s && ali@0: (gcisalpha(*s1) || gcisdigit(*s1) || ali@0: (*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */ ali@0: s1++; ali@0: for (i = 0; *s1 && *s1 != '.'; s1++, i++) ali@0: testword[i] = *s1; ali@0: testword[i] = 0; ali@0: for (i = 0; *abbrev[i]; i++) ali@0: if (!strcmp(testword, abbrev[i])) ali@0: istypo = 0; ali@0: // if (*testword >= 'A' && *testword <= 'Z') ali@0: // istypo = 0; ali@0: if (gcisdigit(*testword)) istypo = 0; ali@0: if (!*(testword+1)) istypo = 0; ali@0: if (isroman(testword)) istypo = 0; ali@0: if (istypo) { ali@0: istypo = 0; ali@0: for (i = 0; testword[i]; i++) ali@0: if (strchr(vowels, testword[i])) ali@0: istypo = 1; ali@0: } ali@0: if (istypo) { ali@0: isdup = 0; ali@0: if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH]) ali@0: for (i = 0; i < qperiod_index; i++) ali@0: if (!strcmp(testword, qperiod[i])) { ali@0: isdup = 1; ali@0: } ali@0: if (!isdup) { ali@0: if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) { ali@0: strcpy(qperiod[qperiod_index], testword); ali@0: qperiod_index++; ali@0: } ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1); ali@0: else ali@0: cnt_punct++; ali@0: } 
ali@0: } ali@0: } ali@0: t++; ali@0: } ali@0: ali@0: ali@0: if (pswit[TYPO_SWITCH]) { /* Should have put this condition in at the start of 0.99. Duh! */ ali@0: /* Check for words usually not followed by punctuation 0.99 */ ali@0: for (s = aline; *s;) { ali@0: wordstart = s; ali@0: s = getaword(s, inword); ali@0: if (!*inword) continue; ali@0: lowerit(inword); ali@0: for (i = 0; *nocomma[i]; i++) ali@0: if (!strcmp(inword, nocomma[i])) { ali@0: if (*s == ',' || *s == ';' || *s == ':') { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: for (i = 0; *noperiod[i]; i++) ali@0: if (!strcmp(inword, noperiod[i])) { ali@0: if (*s == '.' || *s == '!') { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: } ali@0: } ali@0: ali@0: ali@0: ali@0: /* Check for commonly mistyped words, and digits like 0 for O in a word */ ali@0: for (s = aline; *s;) { ali@0: wordstart = s; ali@0: s = getaword(s, inword); ali@0: if (!*inword) continue; /* don't bother with empty lines */ ali@0: if (mixdigit(inword)) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword); ali@0: else ali@0: cnt_word++; ali@0: } ali@0: ali@0: /* put the word through a series of tests for likely typos and OCR errors */ ali@0: /* V.21 I had allowed lots of typo-checking even with the typo switch */ ali@0: /* turned off, but I really should disallow reporting of them when */ ali@0: /* the switch is off. Hence the "if" below. 
*/ ali@0: if (pswit[TYPO_SWITCH]) { ali@0: istypo = 0; ali@0: strcpy(testword, inword); ali@0: alower = 0; ali@0: for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */ ali@0: if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1; ali@0: if (alower && testword[i] >= 'A' && testword[i] <= 'Z') { ali@0: /* we have an uppercase mid-word. However, there are common cases: */ ali@0: /* Mac and Mc like McGill */ ali@0: /* French contractions like l'Abbe */ ali@0: if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') || ali@0: (i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') || ali@0: (i > 0 && testword[i-1] == CHAR_SQUOTE)) ali@0: ; /* do nothing! */ ali@0: ali@0: else { /* V.97 - remove separate case of uppercase within word so that */ ali@0: /* names like VanAllen fall into qword_index and get reported only once */ ali@0: istypo = 1; ali@0: } ali@0: } ali@0: testword[i] = (char)tolower(testword[i]); ali@0: } ali@0: ali@0: /* check for certain unlikely two-letter combinations at word start and end */ ali@0: /* V.0.97 - this replaces individual hardcoded checks in previous versions */ ali@0: if (strlen(testword) > 1) { ali@0: for (i = 0; *nostart[i]; i++) ali@0: if (!strncmp(testword, nostart[i], 2)) ali@0: istypo = 1; ali@0: for (i = 0; *noend[i]; i++) ali@0: if (!strncmp(testword + strlen(testword) -2, noend[i], 2)) ali@0: istypo = 1; ali@0: } ali@0: ali@0: ali@0: /* ght is common, gbt never. Like that. */ ali@0: if (strstr(testword, "cb")) istypo = 1; ali@0: if (strstr(testword, "gbt")) istypo = 1; ali@0: if (strstr(testword, "pbt")) istypo = 1; ali@0: if (strstr(testword, "tbs")) istypo = 1; ali@0: if (strstr(testword, "mrn")) istypo = 1; ali@0: if (strstr(testword, "ahle")) istypo = 1; ali@0: if (strstr(testword, "ihle")) istypo = 1; ali@0: ali@0: /* "TBE" does happen - like HEARTBEAT - but uncommon. */ ali@0: /* Also "TBI" - frostbite, outbid - but uncommon. 
*/ ali@0: /* Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals, */ ali@0: /* but these are covered in V.20. "ii" is a common scanno. */ ali@0: if (strstr(testword, "tbi")) istypo = 1; ali@0: if (strstr(testword, "tbe")) istypo = 1; ali@0: if (strstr(testword, "ii")) istypo = 1; ali@0: ali@0: /* check for no vowels or no consonants. */ ali@0: /* If none, flag a typo */ ali@0: if (!istypo && strlen(testword)>1) { ali@0: vowel = consonant = 0; ali@0: for (i = 0; testword[i]; i++) ali@0: if (testword[i] == 'y' || gcisdigit(testword[i])) { /* Yah, this is loose. */ ali@0: vowel++; ali@0: consonant++; ali@0: } ali@0: else ali@0: if (strchr(vowels, testword[i])) vowel++; ali@0: else consonant++; ali@0: if (!vowel || !consonant) { ali@0: istypo = 1; ali@0: } ali@0: } ali@0: ali@0: /* now exclude the word from being reported if it's in */ ali@0: /* the okword list */ ali@0: for (i = 0; *okword[i]; i++) ali@0: if (!strcmp(testword, okword[i])) ali@0: istypo = 0; ali@0: ali@0: /* what looks like a typo may be a Roman numeral. 
Exclude these */ ali@0: if (istypo) ali@0: if (isroman(testword)) ali@0: istypo = 0; ali@0: ali@0: /* check the manual list of typos */ ali@0: if (!istypo) ali@0: for (i = 0; *typo[i]; i++) ali@0: if (!strcmp(testword, typo[i])) ali@0: istypo = 1; ali@0: ali@0: ali@0: /* V.21 - check lowercase s and l - special cases */ ali@0: /* V.98 - added "i" and "m" */ ali@0: /* V.99 - added "j" often a semi-colon gone wrong */ ali@0: /* - and "d" for a missing apostrophe - he d */ ali@0: /* - and "n" for "in" */ ali@0: if (!istypo && strlen(testword) == 1) ali@0: if (strchr("slmijdn", *inword)) ali@0: istypo = 1; ali@0: ali@0: ali@0: if (istypo) { ali@0: isdup = 0; ali@0: if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH]) ali@0: for (i = 0; i < qword_index; i++) ali@0: if (!strcmp(testword, qword[i])) { ali@0: isdup = 1; ali@0: ++dupcnt[i]; ali@0: } ali@0: if (!isdup) { ali@0: if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) { ali@0: strcpy(qword[qword_index], testword); ali@0: qword_index++; ali@0: } ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) { ali@0: printf(" Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword); ali@0: if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH]) ali@0: printf(" - not reporting duplicates"); ali@0: printf("\n"); ali@0: } ali@0: else ali@0: cnt_word++; ali@0: } ali@0: } ali@0: } /* end of typo-checking */ ali@0: ali@0: /* check the user's list of typos */ ali@0: if (!istypo) ali@0: if (usertypo_count) ali@0: for (i = 0; i < usertypo_count; i++) ali@0: if (!strcmp(testword, usertypo[i])) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword); ali@0: } ali@0: ali@0: ali@0: ali@0: if (pswit[PARANOID_SWITCH] && warn_digit) { /* in paranoid mode, query all 0 and 1 standing alone 
- added warn_digit V.97*/ ali@0: if (!strcmp(inword, "0") || !strcmp(inword, "1")) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword); ali@0: else ali@0: cnt_word++; ali@0: } ali@0: } ali@0: } ali@0: ali@0: /* look for added or missing spaces around punctuation and quotes */ ali@0: /* If there is a punctuation character like ! with no space on */ ali@0: /* either side, suspect a missing!space. If there are spaces on */ ali@0: /* both sides , assume a typo. If we see a double quote with no */ ali@0: /* space or punctuation on either side of it, assume unspaced */ ali@0: /* quotes "like"this. */ ali@0: llen = strlen(aline); ali@0: for (i = 1; i < llen; i++) { /* for each character in the line after the first */ ali@0: if (strchr(".?!,;:_", aline[i])) { /* if it's punctuation */ ali@0: isacro = 0; /* we need to suppress warnings for acronyms like M.D. */ ali@0: isellipsis = 0; /* we need to suppress warnings for ellipsis . . . */ ali@0: if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) || /* if there are letters on both sides of it or ... 
*/ ali@0: (gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */ ali@0: if (aline[i] == '.') { ali@0: if (i > 2) ali@0: if (aline[i-2] == '.') isacro = 1; ali@0: if (i + 2 < llen) ali@0: if (aline[i+2] == '.') isacro = 1; ali@0: } ali@0: if (!isacro) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Missing space?\n", linecnt, i+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */ ali@0: if (aline[i] == '.') { ali@0: if (i > 2) ali@0: if (aline[i-2] == '.') isellipsis = 1; ali@0: if (i + 2 < llen) ali@0: if (aline[i+2] == '.') isellipsis = 1; ali@0: } ali@0: if (!isemptyline && !isellipsis) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: } ali@0: } ali@0: ali@0: /* 0.98 -- split out the characters that CANNOT be preceded by space */ ali@0: llen = strlen(aline); ali@0: for (i = 1; i < llen; i++) { /* for each character in the line after the first */ ali@0: if (strchr("?!,;:", aline[i])) { /* if it's punctuation that _cannot_ have a space before it */ ali@0: if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */ ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: } ali@0: ali@0: ali@0: /* 0.99 -- special case " .X" where X is any alpha. */ ali@0: /* This plugs a hole in the acronym code above. Inelegant, but maintainable. 
*/ ali@0: llen = strlen(aline); ali@0: for (i = 1; i < llen; i++) { /* for each character in the line after the first */ ali@0: if (aline[i] == '.') { /* if it's a period */ ali@0: if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */ ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: } ali@0: ali@0: ali@0: ali@0: ali@0: /* v.21 breaking out the search for unspaced doublequotes */ ali@0: /* This is not as efficient, but it's more maintainable */ ali@0: /* V.97 added underscore to the list of characters not to query, */ ali@0: /* since underscores are commonly used as italics indicators. */ ali@0: /* V.98 Added slash as well, same reason. */ ali@0: for (i = 1; i < llen; i++) { /* for each character in the line after the first */ ali@0: if (aline[i] == CHAR_DQUOTE) { ali@0: if ((!strchr(" _-.'`,;:!/([{?}])", aline[i-1]) && ali@0: !strchr(" _-.'`,;:!/([{?}])", aline[i+1]) && ali@0: aline[i+1] != 0 ali@0: || (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Unspaced quotes?\n", linecnt, i+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: } ali@0: ali@0: ali@0: /* v.98 check parity of quotes */ ali@0: /* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". 
*/ ali@0: for (s = aline; *s; s++) { ali@0: if (*s == CHAR_DQUOTE) { ali@0: if (!(dquotepar = !dquotepar)) { /* parity even */ ali@0: if (!strchr("_-.'`/,;:!?)]} ", *(s+1))) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: else { /* parity odd */ ali@0: if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$", *(s+1)) || !*(s+1)) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: } ali@0: } ali@0: ali@0: if (*aline == CHAR_DQUOTE) { ali@0: if (strchr(",;:!?)]} ", aline[1])) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: ali@0: if (pswit[SQUOTE_SWITCH]) ali@0: for (s = aline; *s; s++) { ali@0: if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE) ali@0: && ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) { ali@0: if (!(squotepar = !squotepar)) { /* parity even */ ali@0: if (!strchr("_-.'`/\",;:!?)]} ", *(s+1))) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: else { /* parity odd */ ali@0: if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`", *(s+1)) || !*(s+1)) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1); ali@0: else 
ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: } ali@0: } ali@0: ali@0: ali@0: /* v.20 also look for double punctuation like ,. or ,, */ ali@0: /* Thanks to DW for the suggestion! */ ali@0: /* I'm putting this in a separate loop for clarity */ ali@0: /* In books with references, ".," and ".;" are common */ ali@0: /* e.g. "etc., etc.," and vol. 1.; vol 3.; */ ali@0: /* OTOH, from my initial tests, there are also fairly */ ali@0: /* common errors. What to do? Make these cases paranoid? */ ali@0: /* V.21 ".," is the most common, so invented warn_dotcomma */ ali@0: /* to suppress detailed reporting if it occurs often */ ali@0: llen = strlen(aline); ali@0: for (i = 0; i < llen; i++) /* for each character in the line */ ali@0: if (strchr(".?!,;:", aline[i]) /* if it's punctuation */ ali@0: && (strchr(".?!,;:", aline[i+1])) ali@0: && aline[i] && aline[i+1]) /* followed by punctuation, it's a query, unless . . . */ ali@0: if ( ali@0: (aline[i] == aline[i+1] ali@0: && (aline[i] == '.' || aline[i] == '?' || aline[i] == '!')) ali@0: || (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',') ali@0: || (isFrench && !strncmp(aline+i, ",...", 4)) ali@0: || (isFrench && !strncmp(aline+i, "...,", 4)) ali@0: || (isFrench && !strncmp(aline+i, ";...", 4)) ali@0: || (isFrench && !strncmp(aline+i, "...;", 4)) ali@0: || (isFrench && !strncmp(aline+i, ":...", 4)) ali@0: || (isFrench && !strncmp(aline+i, "...:", 4)) ali@0: || (isFrench && !strncmp(aline+i, "!...", 4)) ali@0: || (isFrench && !strncmp(aline+i, "...!", 4)) ali@0: || (isFrench && !strncmp(aline+i, "?...", 4)) ali@0: || (isFrench && !strncmp(aline+i, "...?", 4)) ali@0: ) { ali@0: if ((isFrench && !strncmp(aline+i, ",...", 4)) /* could this BE any more awkward? 
*/ ali@0: || (isFrench && !strncmp(aline+i, "...,", 4)) ali@0: || (isFrench && !strncmp(aline+i, ";...", 4)) ali@0: || (isFrench && !strncmp(aline+i, "...;", 4)) ali@0: || (isFrench && !strncmp(aline+i, ":...", 4)) ali@0: || (isFrench && !strncmp(aline+i, "...:", 4)) ali@0: || (isFrench && !strncmp(aline+i, "!...", 4)) ali@0: || (isFrench && !strncmp(aline+i, "...!", 4)) ali@0: || (isFrench && !strncmp(aline+i, "?...", 4)) ali@0: || (isFrench && !strncmp(aline+i, "...?", 4))) ali@0: i +=4; ali@0: ; /* do nothing for .. !! and ?? which can be legit */ ali@0: } ali@0: else { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Double punctuation?\n", linecnt, i+1); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: ali@0: /* v.21 breaking out the search for spaced doublequotes */ ali@0: /* This is not as efficient, but it's more maintainable */ ali@0: s = aline; ali@0: while (strstr(s," \" ")) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1)); ali@0: else ali@0: cnt_punct++; ali@0: s = strstr(s," \" ") + 2; ali@0: } ali@0: ali@0: /* v.20 also look for spaced singlequotes ' and ` */ ali@0: s = aline; ali@0: while (strstr(s," ' ")) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1)); ali@0: else ali@0: cnt_punct++; ali@0: s = strstr(s," ' ") + 2; ali@0: } ali@0: ali@0: s = aline; ali@0: while (strstr(s," ` ")) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1)); ali@0: else ali@0: cnt_punct++; ali@0: s = strstr(s," ` ") + 2; ali@0: } ali@0: ali@0: /* v.99 check special 
case of 'S instead of 's at end of word */ ali@0: s = aline + 1; ali@0: while (*s) { ali@0: if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z') { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2)); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: s++; ali@0: } ali@0: ali@0: ali@0: /* v.21 Now check special cases - start and end of line - */ ali@0: /* for single and double quotes. Start is sometimes [sic] */ ali@0: /* but better to query it anyway. */ ali@0: /* While I'm here, check for dash at end of line */ ali@0: llen = strlen(aline); ali@0: if (llen > 1) { ali@0: if (aline[llen-1] == CHAR_DQUOTE || ali@0: aline[llen-1] == CHAR_SQUOTE || ali@0: aline[llen-1] == CHAR_OPEN_SQUOTE) ali@0: if (aline[llen-2] == CHAR_SPACE) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Spaced quote?\n", linecnt, llen); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: ali@0: /* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */ ali@0: /* Wrongspaced quotes test also catches it for " */ ali@0: if (aline[0] == CHAR_SQUOTE || ali@0: aline[0] == CHAR_OPEN_SQUOTE) ali@0: if (aline[1] == CHAR_SPACE) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column 1 - Spaced quote?\n", linecnt); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: /* dash at end of line may well be legit - paranoid mode only */ ali@0: /* and don't report em-dash at line-end */ ali@0: if (pswit[PARANOID_SWITCH] && warn_hyphen) { ali@0: for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--); ali@0: if (aline[i] == '-' && aline[i-1] != '-') { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Hyphen at end of line?\n", linecnt, 
i); ali@0: } ali@0: } ali@0: } ali@0: ali@0: /* v.21 also look for brackets surrounded by alpha */ ali@0: /* Brackets are often unspaced, but shouldn't be surrounded by alpha. */ ali@0: /* If so, suspect a scanno like "a]most" */ ali@0: llen = strlen(aline); ali@0: for (i = 1; i < llen-1; i++) { /* for each character in the line except 1st & last*/ ali@0: if (strchr("{[()]}", aline[i]) /* if it's a bracket */ ali@0: && gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - Unspaced bracket?\n", linecnt, i); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: /* The "Cinderella" case, back in again! :-S Give it another shot */ ali@0: if (warn_endquote) { ali@0: llen = strlen(aline); ali@0: for (i = 1; i < llen; i++) { /* for each character in the line except 1st */ ali@0: if (aline[i] == CHAR_DQUOTE) ali@0: if (isalpha(aline[i-1])) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - endquote missing punctuation?\n", linecnt, i); ali@0: else ali@0: cnt_punct++; ali@0: } ali@0: } ali@0: } ali@0: ali@0: llen = strlen(aline); ali@0: ali@0: /* Check for */ ali@0: /* If there is a < in the line, followed at some point */ ali@0: /* by a > then we suspect HTML */ ali@0: if (strstr(aline, "<") && strstr(aline, ">")) { ali@0: i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1); ali@0: if (i > 0) { ali@0: strncpy(wrk, strstr(aline, "<"), i); ali@0: wrk[i] = 0; ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - HTML Tag? 
%s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk); ali@0: else ali@0: cnt_html++; ali@0: } ali@0: } ali@0: ali@0: /* Check for &symbol; HTML */ ali@0: /* If there is a & in the line, followed at */ ali@0: /* some point by a ; then we suspect HTML */ ali@0: if (strstr(aline, "&") && strstr(aline, ";")) { ali@0: i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1); ali@0: for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++) ali@0: if (*s == CHAR_SPACE) i = 0; /* 0.99 don't report "Jones & Son;" */ ali@0: if (i > 0) { ali@0: strncpy(wrk, strstr(aline,"&"), i); ali@0: wrk[i] = 0; ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk); ali@0: else ali@0: cnt_html++; ali@0: } ali@0: } ali@0: ali@0: /* At end of paragraph, check for mismatched quotes. */ ali@0: /* We don't want to report an error immediately, since it is a */ ali@0: /* common convention to omit the quotes at end of paragraph if */ ali@0: /* the next paragraph is a continuation of the same speaker. */ ali@0: /* Where this is the case, the next para should begin with a */ ali@0: /* quote, so we store the warning message and only display it */ ali@0: /* at the top of the next iteration if the new para doesn't */ ali@0: /* start with a quote. */ ali@0: /* The -p switch overrides this default, and warns of unclosed */ ali@0: /* quotes on _every_ paragraph, whether the next begins with a */ ali@0: /* quote or not. */ ali@0: /* Version .16 - only report mismatched single quotes if */ ali@0: /* an open_single_quotes was found. 
*/ ali@0: ali@0: if (isemptyline) { /* end of para - add up the totals */ ali@0: if (quot % 2) ali@0: sprintf(dquote_err, " Line %ld - Mismatched quotes\n", linecnt); ali@0: if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) ) ali@0: sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt); ali@0: if (pswit[SQUOTE_SWITCH] && open_single_quote ali@0: && (open_single_quote != close_single_quote) ali@0: && (open_single_quote != close_single_quote +1) ) ali@0: squot = 1; /* flag it to be noted regardless of the first char of the next para */ ali@0: if (r_brack) ali@0: sprintf(rbrack_err, " Line %ld - Mismatched round brackets?\n", linecnt); ali@0: if (s_brack) ali@0: sprintf(sbrack_err, " Line %ld - Mismatched square brackets?\n", linecnt); ali@0: if (c_brack) ali@0: sprintf(cbrack_err, " Line %ld - Mismatched curly brackets?\n", linecnt); ali@0: if (c_unders % 2) ali@0: sprintf(unders_err, " Line %ld - Mismatched underscores?\n", linecnt); ali@0: quot = s_brack = c_brack = r_brack = c_unders = ali@0: open_single_quote = close_single_quote = 0; ali@0: isnewpara = 1; /* let the next iteration know that it's starting a new para */ ali@0: } ali@0: ali@0: /* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */ ali@0: /* by working back through prevline. DW. */ ali@0: /* Hmmm. Need to check this only for "normal" paras. */ ali@0: /* So what is a "normal" para? ouch! */ ali@0: /* Not normal if one-liner (chapter headings, etc.) */ ali@0: /* Not normal if doesn't contain at least one locase letter */ ali@0: /* Not normal if starts with space */ ali@0: ali@0: /* 0.99 tighten up on para end checks. Disallow comma and */ ali@0: /* semi-colon. Check for legit para end before quotes. */ ali@0: if (isemptyline) { /* end of para */ ali@0: for (s = prevline, i = 0; *s && !i; s++) ali@0: if (gcisletter(*s)) ali@0: i = 1; /* use i to indicate the presence of a letter on the line */ ali@0: /* This next "if" is a problem. 
*/ ali@0: /* If I say "start_para_line <= linecnt - 1", that includes one-line */ ali@0: /* "paragraphs" like chapter heads. Lotsa false positives. */ ali@0: /* If I say "start_para_line < linecnt - 1" it doesn't, but then it */ ali@0: /* misses genuine one-line paragraphs. */ ali@0: /* So what do I do? */ ali@0: if (i ali@0: && lastblen > 2 ali@0: && start_para_line < linecnt - 1 ali@0: && *prevline > CHAR_SPACE ali@0: ) { ali@0: for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--); ali@0: for ( ; i > 0; i--) { ali@0: if (gcisalpha(prevline[i])) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline)); ali@0: else ali@0: cnt_punct++; ali@0: break; ali@0: } ali@0: if (strchr("-.:!([{?}])", prevline[i])) ali@0: break; ali@0: } ali@0: } ali@0: } ali@0: strcpy(prevline, aline); ali@0: } ali@0: fclose (infile); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: for (i = 0; i < MAX_QWORD; i++) ali@0: if (dupcnt[i]) ali@0: printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s"); ali@0: } ali@0: ali@0: ali@0: ali@0: /* flgets - get one line from the input stream, checking for */ ali@0: /* the existence of exactly one CR/LF line-end per line. */ ali@0: /* Returns a pointer to the line. 
*/ ali@0: ali@0: char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt) ali@0: { ali@0: char c; ali@0: int len, isCR, cint; ali@0: ali@0: *theline = 0; ali@0: len = isCR = 0; ali@0: c = cint = fgetc(thefile); ali@0: do { ali@0: if (cint == EOF) ali@0: return (NULL); ali@0: if (c == 10) /* either way, it's end of line */ ali@0: if (isCR) ali@0: break; ali@0: else { /* Error - a LF without a preceding CR */ ali@0: if (pswit[LINE_END_SWITCH]) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld - No CR?\n", lcnt); ali@0: else ali@0: cnt_lineend++; ali@0: } ali@0: break; ali@0: } ali@0: if (c == 13) { ali@0: if (isCR) { /* Error - two successive CRs */ ali@0: if (pswit[LINE_END_SWITCH]) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld - Two successive CRs?\n", lcnt); ali@0: else ali@0: cnt_lineend++; ali@0: } ali@0: } ali@0: isCR = 1; ali@0: } ali@0: else { ali@0: if (pswit[LINE_END_SWITCH] && isCR) { ali@0: if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@0: printf(" Line %ld column %d - CR without LF?\n", lcnt, len+1); ali@0: else ali@0: cnt_lineend++; ali@0: } ali@0: theline[len] = c; ali@0: len++; ali@0: theline[len] = 0; ali@0: isCR = 0; ali@0: } ali@0: c = cint = fgetc(thefile); ali@0: } while(len < maxlen); ali@0: if (pswit[MARKUP_SWITCH]) ali@0: postprocess_for_HTML(theline); ali@0: if (pswit[DP_SWITCH]) ali@0: postprocess_for_DP(theline); ali@0: return(theline); ali@0: } ali@0: ali@0: ali@0: ali@0: ali@0: /* mixdigit - takes a "word" as a parameter, and checks whether it */ ali@0: /* contains a mixture of alpha and digits. Generally, this is an */ ali@0: /* error, but may not be for cases like 4th or L5 12s. 3d. */ ali@0: /* Returns 0 if no error found, 1 if error. 
/* mixdigit() is defined just below, after the declarations it relies on. */

/* MAXWORDLEN is normally defined near the top of the file (80, the max  */
/* length of one word); this fallback only keeps the section compilable  */
/* on its own.                                                           */
#ifndef MAXWORDLEN
#define MAXWORDLEN 80
#endif

/* Helpers that are used before their definitions further down. */
int matchword(char *checkfor, char *thisword);
int gcisalpha(unsigned char c);
int gcisdigit(unsigned char c);


/* mixdigit - check a "word" for a suspicious mixture of letters and     */
/* digits, such as the 1 in "close1y".                                   */
/* Returns 0 if the word is all letters, all digits, or one of the       */
/* accepted mixed forms; returns 1 if it should be queried.              */
/* Accepted mixed forms:                                                 */
/*   - leading digits followed by st, rd, nd, th, or those with a        */
/*     trailing "s" or "ly", in either case (21st, 3rds, 1stly);         */
/*   - leading digits followed by a single l, L, s or d (12l. 3s. 11d.); */
/*   - a capital L followed by something that is itself not queried      */
/*     (L500 for British pounds).                                        */

int mixdigit(char *checkword)
{
    static char *ordinals[] = {
        "st",   "rd",   "nd",   "th",
        "sts",  "rds",  "nds",  "ths",
        "stly", "rdly", "ndly", "thly"
    };
    int wehaveadigit = 0, wehavealetter = 0;
    int firstdigits;                    /* count of leading digits */
    unsigned int i;
    char *s, *tail;

    for (s = checkword; *s; s++) {
        if (gcisalpha((unsigned char)*s))
            wehavealetter = 1;
        else if (gcisdigit((unsigned char)*s))
            wehaveadigit = 1;
    }
    if (!wehaveadigit || !wehavealetter)
        return (0);

    /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
    for (firstdigits = 0; gcisdigit((unsigned char)checkword[firstdigits]); firstdigits++)
        ;
    if (firstdigits > 0) {
        /* Everything after the leading digits must be exactly one of   */
        /* the accepted endings; matchword() only matches strings of    */
        /* equal length, so it enforces the "exactly".                  */
        tail = checkword + firstdigits;
        for (i = 0; i < sizeof ordinals / sizeof ordinals[0]; i++)
            if (matchword(tail, ordinals[i]))
                return (0);

        /* digits, ending in l, L, s or d */
        if (tail[1] == 0 &&
            (tail[0] == 'l' || tail[0] == 'L' || tail[0] == 's' || tail[0] == 'd'))
            return (0);
    }

    /* L at the start of a number, representing British pounds, like L500. */
    /* We know the current word is mixed.  If what follows the L would not */
    /* be queried on its own (plain digits, or an accepted form), accept   */
    /* the whole word.                                                     */
    if (checkword[0] == 'L' && !mixdigit(checkword + 1))
        return (0);

    return (1);
}




/* getaword - extracts the first/next "word" from the line, and puts     */
/* it into "thisword". A word is defined as one English word unit        */
/* -- or at least that's what I'm trying for.                            */
/* Leading characters that are neither letters nor digits are skipped.   */
/* At most MAXWORDLEN - 1 characters are copied, so thisword needs room  */
/* for MAXWORDLEN bytes including the terminating NUL; a longer run is   */
/* split and the rest comes back as the next word.                       */
/* Returns a pointer to the position in the line where we will start     */
/* looking for the next word.                                            */

char *getaword(char *fromline, char *thisword)
{
    int i, wordlen;
    char *s;

    while (*fromline && !gcisdigit((unsigned char)*fromline)
                     && !gcisalpha((unsigned char)*fromline))
        fromline++;

    /* Look-ahead to handle numbers like 1,000 and 1.35 (and L1,000):   */
    /* collect the whole run of letters, digits, commas and periods,    */
    /* then see whether it contains a comma or period with a digit on   */
    /* BOTH sides.  If so, the whole run is returned as one word;       */
    /* otherwise the result is discarded and the normal rule applies.   */
    wordlen = 0;
    for (s = fromline;
         wordlen < MAXWORDLEN - 1 &&
         (gcisdigit((unsigned char)*s) || gcisalpha((unsigned char)*s) ||
          *s == ',' || *s == '.');
         s++) {
        thisword[wordlen] = *s;
        wordlen++;
    }
    thisword[wordlen] = 0;
    for (i = 1; i < wordlen - 1; i++) {
        if ((thisword[i] == '.' || thisword[i] == ',') &&
            gcisdigit((unsigned char)thisword[i-1]) &&
            gcisdigit((unsigned char)thisword[i+1]))
            return (s);         /* we have one of the damned things */
    }

    /* we didn't find a punctuated number - do the regular getword thing: */
    /* letters, digits and apostrophes.                                   */
    wordlen = 0;
    for ( ;
         wordlen < MAXWORDLEN - 1 &&
         (gcisdigit((unsigned char)*fromline) || gcisalpha((unsigned char)*fromline) ||
          *fromline == '\'');
         fromline++) {
        thisword[wordlen] = *fromline;
        wordlen++;
    }
    thisword[wordlen] = 0;
    return (fromline);
}




/* matchword - just a case-insensitive string matcher.                   */
/* Returns 1 when the two strings have the same length and are equal     */
/* ignoring case, 0 otherwise.  Note that this is the opposite sense to  */
/* strcmp().                                                             */
/* NOTE(review): the comparison loop was damaged in the listing this     */
/* was restored from; confirm it against a pristine copy.                */

int matchword(char *checkfor, char *thisword)
{
    size_t i, len;

    len = strlen(checkfor);
    if (len != strlen(thisword)) return (0);

    for (i = 0; i < len; i++)
        if (tolower((unsigned char)checkfor[i]) != tolower((unsigned char)thisword[i]))
            return (0);
    return (1);
}


/* lowerit - lowercase the ASCII capitals A-Z of the line in place;      */
/* every other byte, including hi-bit characters, is left alone.         */
/* NOTE(review): the header of this function was damaged in the listing  */
/* this was restored from; the name comes from the lowerit(argv[0])      */
/* mention in the revision history.  Confirm name and signature against  */
/* a pristine copy.                                                      */

void lowerit(char *theline)
{
    for ( ; *theline; theline++)
        if (*theline >= 'A' && *theline <= 'Z')
            *theline += 32;     /* 'a' - 'A' in ASCII */
}


/* isroman - does this word LOOK like a (lowercase) Roman numeral?       */
/* It is not a validator: it will pass mxxxxxxxxxx.  In order, it        */
/* accepts any number of m, an optional d, an optional cm, an optional   */
/* cd, any number of c, an optional xl, an optional xc, an optional l,   */
/* any number of x, an optional ix, an optional iv, an optional v and    */
/* any number of i.  Only lowercase letters are recognized.              */
/* Returns 1 if the whole word is consumed that way, 0 otherwise,        */
/* including for a NULL or empty word.                                   */

int isroman(char *t)
{
    if (!t || !*t) return (0);

    while (*t == 'm') t++;
    if (*t == 'd') t++;
    if (t[0] == 'c' && t[1] == 'm') t += 2;
    if (t[0] == 'c' && t[1] == 'd') t += 2;
    while (*t == 'c') t++;
    if (t[0] == 'x' && t[1] == 'l') t += 2;
    if (t[0] == 'x' && t[1] == 'c') t += 2;
    if (*t == 'l') t++;
    while (*t == 'x') t++;
    if (t[0] == 'i' && t[1] == 'x') t += 2;
    if (t[0] == 'i' && t[1] == 'v') t += 2;
    if (*t == 'v') t++;
    while (*t == 'i') t++;

    return (*t == 0);
}




/* gcisalpha is a special version that is somewhat lenient on 8-bit texts.     */
/* If we use the standard isalpha() function, 8-bit accented characters break  */
/* words, so that tete with accented characters appears to be two words, "t"   */
/* and "t", with 8-bit characters between them.  This causes over-reporting of */
/* errors.  gcisalpha() recognizes accented letters from the CP1252 (Windows)  */
/* and ISO-8859-1 character sets, which are the most common PG 8-bit types.    */
/* Returns 1 for A-Z and a-z, for 192-255 except 208, 215, 222, 240, 247 and   */
/* 254, and for 140, 142, 156, 158 and 159; returns 0 for everything else.     */

int gcisalpha(unsigned char c)
{
    if (c >= 'a' && c <= 'z') return (1);
    if (c >= 'A' && c <= 'Z') return (1);
    if (c < 140) return (0);
    if (c >= 192 && c != 208 && c != 215 && c != 222 && c != 240 && c != 247 && c != 254) return (1);
    if (c == 140 || c == 142 || c == 156 || c == 158 || c == 159) return (1);
    return (0);
}

/* gcisdigit is a special version that doesn't get confused in 8-bit texts. */
/* Returns 1 for '0'-'9' only, 0 otherwise.                                 */
int gcisdigit(unsigned char c)
{
    if (c >= '0' && c <= '9') return (1);
    return (0);
}

/* gcisletter is a special version that doesn't get confused in 8-bit texts. */
/* Yeah, we're ISO-8859-1-specific. So sue me.                               */
*/ ali@0: int gcisletter(unsigned char c) ali@0: { ali@0: if ((c >= 'A' && c <='Z') || (c >= 'a' && c <='z') || c >= 192) return(1); ali@0: return(0); ali@0: } ali@0: ali@0: ali@0: ali@0: ali@0: /* gcstrchr wraps strchr to return NULL if the character being searched for is zero */ ali@0: ali@0: char *gcstrchr(char *s, char c) ali@0: { ali@0: if (c == 0) return(NULL); ali@0: return(strchr(s,c)); ali@0: } ali@0: ali@0: /* postprocess_for_DP is derived from postprocess_for_HTML */ ali@0: /* It is invoked with the -d switch from flgets(). */ ali@0: /* It simply "removes" from the line a hard-coded set of common */ ali@0: /* DP-specific tags, so that the line passed to the main routine has*/ ali@0: /* been pre-cleaned of DP markup. */ ali@0: ali@0: void postprocess_for_DP(char *theline) ali@0: { ali@0: ali@0: char *s, *t; ali@0: int i; ali@0: ali@0: if (!*theline) ali@0: return; ali@0: ali@0: for (i = 0; *DPmarkup[i]; i++) { ali@0: s = strstr(theline, DPmarkup[i]); ali@0: while (s) { ali@0: t = s + strlen(DPmarkup[i]); ali@0: while (*t) { ali@0: *s = *t; ali@0: t++; s++; ali@0: } ali@0: *s = 0; ali@0: s = strstr(theline, DPmarkup[i]); ali@0: } ali@0: } ali@0: ali@0: } ali@0: ali@0: ali@0: /* postprocess_for_HTML is, at the moment (0.97), a very nasty */ ali@0: /* short-term fix for Charlz. Nasty, nasty, nasty. */ ali@0: /* It is invoked with the -m switch from flgets(). */ ali@0: /* It simply "removes" from the line a hard-coded set of common */ ali@0: /* HTML tags and "replaces" a hard-coded set of common HTML */ ali@0: /* entities, so that the line passed to the main routine has */ ali@0: /* been pre-cleaned of HTML. This is _so_ not the right way to */ ali@0: /* deal with HTML, but what Charlz needs now is not HTML handling */ ali@0: /* proper: just ignoring tags and some others. */ ali@0: /* To be revisited in future releases! 
*/ ali@0: ali@0: void postprocess_for_HTML(char *theline) ali@0: { ali@0: ali@0: if (strstr(theline, "<") && strstr(theline, ">")) ali@0: while (losemarkup(theline)) ali@0: ; ali@0: while (loseentities(theline)) ali@0: ; ali@0: } ali@0: ali@0: char *losemarkup(char *theline) ali@0: { ali@0: char *s, *t; ali@0: int i; ali@0: ali@0: if (!*theline) ali@0: return(NULL); ali@0: ali@0: s = strstr(theline, "<"); ali@0: t = strstr(theline, ">"); ali@0: if (!s || !t) return(NULL); ali@0: for (i = 0; *markup[i]; i++) ali@0: if (!tagcomp(s+1, markup[i])) { ali@0: if (!*(t+1)) { ali@0: *s = 0; ali@0: return(s); ali@0: } ali@0: else ali@0: if (t > s) { ali@0: strcpy(s, t+1); ali@0: return(s); ali@0: } ali@0: } ali@0: /* it's an unrecognized */ ali@0: return(NULL); ali@0: } ali@0: ali@0: char *loseentities(char *theline) ali@0: { ali@0: int i; ali@0: char *s, *t; ali@0: ali@0: if (!*theline) ali@0: return(NULL); ali@0: ali@0: for (i = 0; *entities[i].htmlent; i++) { ali@0: s = strstr(theline, entities[i].htmlent); ali@0: if (s) { ali@0: t = malloc((size_t)strlen(s)); ali@0: if (!t) return(NULL); ali@0: strcpy(t, s + strlen(entities[i].htmlent)); ali@0: strcpy(s, entities[i].textent); ali@0: strcat(s, t); ali@0: free(t); ali@0: return(theline); ali@0: } ali@0: } ali@0: ali@0: /* V0.97 Duh. 
Forgot to check the htmlnum member */ ali@0: for (i = 0; *entities[i].htmlnum; i++) { ali@0: s = strstr(theline, entities[i].htmlnum); ali@0: if (s) { ali@0: t = malloc((size_t)strlen(s)); ali@0: if (!t) return(NULL); ali@0: strcpy(t, s + strlen(entities[i].htmlnum)); ali@0: strcpy(s, entities[i].textent); ali@0: strcat(s, t); ali@0: free(t); ali@0: return(theline); ali@0: } ali@0: } ali@0: return(NULL); ali@0: } ali@0: ali@0: ali@0: int tagcomp(char *strin, char *basetag) ali@0: { ali@0: char *s, *t; ali@0: ali@0: s = basetag; ali@0: t = strin; ali@0: if (*t == '/') t++; /* ignore a slash */ ali@0: while (*s && *t) { ali@0: if (tolower(*s) != tolower(*t)) return(1); ali@0: s++; t++; ali@0: } ali@0: /* OK, we have < followed by a valid tag start */ ali@0: /* should I do something about length? */ ali@0: /* this is messy. The length of an tag is */ ali@0: /* limited, but a could go on for miles */ ali@0: /* so I'd have to parse the tags . . . ugh. */ ali@0: /* It isn't what Charlz needs now, so mark it */ ali@0: /* as 'pending'. */ ali@0: return(0); ali@0: } ali@0: ali@0: void proghelp() /* explain program usage here */ ali@0: { ali@0: fputs("V. 0.991. Copyright 2000-2005 Jim Tinsley .\n",stderr); ali@0: fputs("Gutcheck comes wih ABSOLUTELY NO WARRANTY. 
For details, read the file COPYING.\n", stderr); ali@0: fputs("This is Free Software; you may redistribute it under certain conditions (GPL);\n", stderr); ali@0: fputs("read the file COPYING for details.\n\n", stderr); ali@0: fputs("Usage is: gutcheck [-setpxloyhud] filename\n",stderr); ali@0: fputs(" where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",stderr); ali@0: fputs(" -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",stderr); ali@0: fputs(" -o just displays overview without detail, -h echoes header fields\n",stderr); ali@0: fputs(" -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",stderr); ali@0: fputs(" -d ignores DP-specific markup,\n",stderr); ali@0: fputs(" -u uses a file gutcheck.typ to query user-defined possible typos\n",stderr); ali@0: fputs("Sample usage: gutcheck warpeace.txt \n",stderr); ali@0: fputs("\n",stderr); ali@0: fputs("Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n", stderr); ali@0: fputs("Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",stderr); ali@0: fputs("characters like accented letters, lines longer than 75 or shorter than 55,\n",stderr); ali@0: fputs("unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",stderr); ali@0: fputs("HTML tags, some likely typos. 
It is NOT a substitute for human judgement.\n",stderr); ali@0: fputs("\n",stderr); ali@0: } ali@0: ali@0: ali@0: ali@0: /********************************************************************* ali@0: Revision History: ali@0: ali@0: 04/22/01 Cleaned up some stuff and released .10 ali@0: ali@0: --------------- ali@0: ali@0: 05/09/01 Added the typo list, added two extra cases of he/be error, ali@0: added -p switch, OPEN_SINGLE QUOTE char as .11 ali@0: ali@0: --------------- ali@0: ali@0: 05/20/01 Increased the typo list, ali@0: added paranoid mode, ali@0: ANSIfied the code and added some casts ali@0: so the compiler wouldn't keep asking if I knew what I was doing, ali@0: fixed bug in l.s.d. condition (thanks, Dave!), ali@0: standardized spacing when echoing, ali@0: added letter-combo checking code to typo section, ali@0: added more h/b words to typo array. ali@0: Not too sure about putting letter combos outside of the TYPO conditions - ali@0: someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see. ali@0: Released as .12 ali@0: ali@0: --------------- ali@0: ali@0: 06/01/01 Removed duplicate reporting of Tildes, asterisks, etc. ali@0: 06/10/01 Added flgets routine to help with platform-independent ali@0: detection of invalid line-ends. All PG text files should ali@0: have CR/LF (13/10) at end of line, regardless of system. ali@0: Gutcheck now validates this by default. (Thanks, Charles!) ali@0: Released as .13 ali@0: ali@0: --------------- ali@0: ali@0: 06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.) ali@0: Released as .14 ali@0: ali@0: --------------- ali@0: ali@0: 06/23/01 Fixed: 'No',he said. not being flagged. ali@0: ali@0: Improved: better single-quotes checking: ali@0: ali@0: Ignore singlequotes surrounded by alpha, like didn't. (was OK) ali@0: ali@0: If a singlequote is at the END of a word AND the word ends in "s": ali@0: The dogs' tails wagged. 
ali@0: it's probably an apostrophe, but less commonly may be a closequote: ali@0: "These 'pack dogs' of yours look more like wolves." ali@0: ali@0: If it's got punctuation before it and is followed by a space ali@0: or punctuation: ali@0: . . . was a problem,' he said ali@0: . . . was a problem,'" ali@0: it is probably (certainly?) a closequote. ali@0: ali@0: If it's at start of paragraph, it's probably an openquote. ali@0: (but watch dialect) ali@0: ali@0: Words with ' at beginning and end are probably quoted: ali@0: "You have the word 'chivalry' frequently on your lips." ali@0: (Not specifically implemented) ali@0: V.18 I'm glad I didn't implement this, 'cos it jest ain't so ali@0: where the convention is to punctuate outside the quotes. ali@0: 'Come', he said, 'and join the party'. ali@0: ali@0: If it is followed by an alpha, and especially a capital: ali@0: 'Hello,' called he. ali@0: it is either an openquote or dialect. ali@0: ali@0: Dialect breaks ALL the rules: ali@0: A man's a man for a' that. ali@0: "Aye, but 'tis all in the pas' now." ali@0: "'Tis often the way," he said. ali@0: 'Ave a drink on me. ali@0: ali@0: This version looks to be an improvement, and produces ali@0: fewer false positives, but is still not perfect. The ali@0: 'pack dogs' case still fools it, and dialect is still ali@0: a problem. Oh, well, it's an improvement, and I have ali@0: a weighted structure in place for refining guesses at ali@0: closequotes. Maybe next time, I'll add a bit of logic ali@0: where if there is an open quote and one that was guessed ali@0: to be a possessive apostrophe after s, I'll re-guess it ali@0: to be a closequote. Let's see how this one flies, first. ali@0: ali@0: (Afterview: it's still crap. Needs much work, and a deeper insight.) ali@0: ali@0: Released as .15 ali@0: ali@0: TODO: More he/be checks. Can't be perfect - counterexamples: ali@0: I gave my son good advice: be married regardless of the world's opinion. 
ali@0: I gave my son good advice: he married regardless of the world's opinion. ali@0: ali@0: If by "primitive" be meant "crude", we can understand the sentence. ali@0: If by "primitive" he meant "crude", we can understand the sentence. ali@0: ali@0: No matter what be said, I must go on. ali@0: No matter what he said, I must go on. ali@0: ali@0: No value, however great, can be set upon them. ali@0: No value, however great, can he set upon them. ali@0: ali@0: Real-Life one from a DP International Weekly Miscellany: ali@0: He wandered through the forest without fear, sleeping ali@0: much, for in sleep be had companionship--the Great ali@0: Spirit teaching him what he should know in dreams. ali@0: That one found by jeebies, and it turned out to be "he". ali@0: ali@0: ali@0: --------------- ali@0: ali@0: 07/01/01 Added -O option. ali@0: Improved singlequotes by reporting mismatched single quotes ali@0: only if an open_single_quotes was found. ali@0: ali@0: Released as .16 ali@0: ali@0: --------------- ali@0: ali@0: 08/27/01 Added -Y switch for Robert Rowe to allow his app to ali@0: catch the error output. ali@0: ali@0: Released as .17 ali@0: ali@0: --------------- ali@0: ali@0: 09/08/01 Added checking Capitals at start of paragraph, but not ali@0: checking them at start of sentence. ali@0: ali@0: TODO: Parse sentences out so can check reliably for start of ali@0: sentence. Need a whole different approach for that. ali@0: (Can't just rely on periods, since they are also ali@0: used for abbreviations, etc.) ali@0: ali@0: Added checking for all vowels or all consonants in a word. ali@0: ali@0: While I was in, I added "ii" checking and "tl" at start of word. ali@0: ali@0: Added echoing of first line of paragraph when reporting ali@0: mismatched quoted or brackets (thanks to David Widger for the ali@0: suggestion) ali@0: ali@0: Not querying L at start of a number (used for British pounds). 
ali@0: ali@0: The spelling changes are sort of half-done but released anyway ali@0: Skipped .18 because I had given out a couple of test versions ali@0: with that number. ali@0: ali@0: 09/25/01 Released as .19 ali@0: ali@0: --------------- ali@0: ali@0: TODO: ali@0: Use the logic from my new version of safewrap to stop querying ali@0: short lines like poems and TOCs. ali@0: Ignore non-standard ellipses like . . . or ... ali@0: ali@0: ali@0: --------------- ali@0: 10/01/01 Made any line over 80 a VERY long line (was 85). ali@0: Recognized openquotes on indented paragraphs as continuations ali@0: of the same speech. ali@0: Added "cf" to the okword list (how did I forget _that_?) and a few others. ali@0: Moved abbrev to okword and made it more general. ali@0: Removed requirement that PG_space_emdash be greater than ali@0: ten before turning off warnings about spaced dashes. ali@0: Added period to list of characters that might constitute a separator line. ali@0: Now checking for double punctuation (Thanks, David!) ali@0: Now if two spaced em-dashes on a line, reports both. (DW) ali@0: Bug: Wasn't catching spaced punctuation at line-end since I ali@0: added flgets in version .13 - fixed. ali@0: Bug: Wasn't catching spaced singlequotes - fixed ali@0: Now reads punctuated numbers like 1,000 as a single word. ali@0: (Used to give "standalone 1" type queries) ali@0: Changed paranoid mode - not including s and p options. -ex is now quite usable. ali@0: Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed ali@0: Bug: Sometimes gave _next_ line number for queried word at end of line - fixed ali@0: ali@0: 10/22/01 Released as .20 ali@0: ali@0: --------------- ali@0: ali@0: Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!) ali@0: Reduced the number of hi-bit letters needed to stop reporting them ali@0: from 1/20 to 1/100 or 200 in total. ali@0: Added PG footer check. ali@0: Added the -h switch. 
ali@0: Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10 ali@0: Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23" ali@0: Added unspaced brackets check when surrounded by alpha. ali@0: Removed all typo reporting unless the typo switch is on. ali@0: Added gcisalpha to ease over-reporting of 8-bit queries. ali@0: ECHO_SWITCH is now ON by default! ali@0: PARANOID_SWITCH is now ON by default! ali@0: Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg) ali@0: Checking for standalone lowercase "l" ali@0: Checking for standalone lowercase "s" ali@0: Considering "is be" and "be is" "be was" "was be" as he/be errors ali@0: Looking at punct at end of para ali@0: ali@0: 01/20/02 Released as .21 ali@0: ali@0: --------------- ali@0: ali@0: Added VERBOSE_SWITCH to make it list everything. (George Davis) ali@0: ali@0: --------------- ali@0: ali@0: 02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have. ali@0: after which ali@0: This line caused a coredump on Solaris - fixed. ali@0: Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe ali@0: 03/09/02 Changed header recognition for another header change ali@0: Called it .24 ali@0: 03/29/02 Added qword[][] so I can suppress massive overreporting ali@0: of queried "words" like "FN", "Wm.", "th'", people's ali@0: initials, chemical formulae and suchlike in some texts. ali@0: Called it .25 ali@0: 04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed. ali@0: Added linecounts in overview mode. ali@0: Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done. ali@0: "m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that? ali@0: 07/07/02 Added GPL. ali@0: Added checking for broken em-dash at line-end (enddash) ali@0: Released as 0.95 ali@0: 08/17/02 Fixed a bug that treated some hi-bit characters as spaces. 
Thanks, Carlo. ali@0: Released as 0.96 ali@0: 10/10/02 Suppressing some annoying multiple reports by default: ali@0: Standalone Ones, Asterisks, Square Brackets. ali@0: Digit 1 occurs often in many scientific texts. ali@0: Asterisk occurs often in multi-footnoted texts. ali@0: Mismatch Square Brackets occurs often in multi-para footnotes. ali@0: Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil. ali@0: . . . but it does more or less work for the main cases. ali@0: Removed uppercase within a word as a separate category so ali@0: that names like VanAllen get reported only once, like other ali@0: suspected typos. ali@0: 11/24/02 Fixed - -m switch wasn't looking at htmlnum in ali@0: loseentities (Thanks, Brett!) ali@0: Fixed bug which occasionally gave false warning of ali@0: paragraph starting with lowercase. ali@0: Added underscore as character not to query around doublequotes. ali@0: Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859" ali@0: . . . this is to help detect things like CP1252 characters. ali@0: Released as 0.97 ali@0: ali@0: 12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell, ali@0: for doublequotes only. Replaces "Spaced quote", since it also covers that ali@0: case. ali@0: Added "warn_hyphen" to ease over-reporting of hyphens. ali@0: ali@0: 12/20/02 Added "extra period" checks. ali@0: Added single character line check ali@0: Added I" check - is usually an exclam ali@0: Released as 0.98 ali@0: ali@0: 1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile() ali@0: from when I was looking at ways to identify markup. Refuses to ali@0: open files for *nix users with upcase in the filenames. Removed. ali@0: Fixed quickly and released as 0.981 ali@0: ali@0: 1/8/03 Added "arid" to the list of typos, slightly against my better ali@0: judgement, but the DP gang are all excited about it. :-) ali@0: Added a check for comma followed by capital letter, where ali@0: a period has OCRed into a comma. 
(DW). Not sure about this ali@0: either; we'll see. ali@0: Compiling for Win32 to allow long filenames. ali@0: ali@0: 6/1/04 A messy test release for DW to include the "gutcheck.typ" ali@0: process. And the gutcheck.jee trials. Removed "arid" -- ali@0: it can go in gutcheck.typ ali@0: ali@0: Added checks for carets ^ and slants / but disabling slant ali@0: queries if more than 20 of them, because some people use them ali@0: for /italics/. Slants are commonly mistaken italic "I"s. ali@0: ali@0: Later: removed gutcheck.jee -- wrote jeebies instead. ali@0: ali@0: Random TODO: ali@0: Check brackets more closely, like quotes, so that it becomes ali@0: easy to find the error in long paragraphs full of brackets. ali@0: ali@0: ali@0: 11/4/04 Assorted cleanup. Fixed case where text started with an ali@0: unbalanced paragraph. ali@0: ali@0: 1/2/05 Has it really been that long? Added "nocomma", "noperiod" check. ali@0: Bits and pieces: improved isroman(). Added gcisletter(). ali@0: Other stuff I never noted before this. ali@0: ali@0: 7/3/05 Stuck in a quick start on DP-markup ignoring ali@0: at BillFlis's suggestion. ali@0: ali@0: 1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in? ali@0: Don't count footer for dotcomma etc. 
ali@0: ali@0: ali@0: 1 I ali@0: ail all ali@0: arc are ali@0: arid and ali@0: bad had ali@0: ball hall ali@0: band hand ali@0: bar her ali@0: bat but ali@0: be he ali@0: bead head ali@0: beads heads ali@0: bear hear ali@0: bit hit ali@0: bo be ali@0: boon been ali@0: borne home ali@0: bow how ali@0: bumbled humbled ali@0: car ear ali@0: carnage carriage ali@0: carne came ali@0: cast east ali@0: cat cut ali@0: cat eat ali@0: cheek check ali@0: clay day ali@0: coining coming ali@0: comer corner ali@0: die she ali@0: docs does ali@0: ease case ali@0: fail fall ali@0: fee he ali@0: haying having ali@0: ho he ali@0: ho who ali@0: hut but ali@0: is as ali@0: lie he ali@0: lime time ali@0: loth 10th ali@0: m in ali@0: modem modern ali@0: Ms his ali@0: ray away ali@0: ray my ali@0: ringer finger ali@0: ringers fingers ali@0: rioted noted ali@0: tho the ali@0: tie he ali@0: tie the ali@0: tier her ali@0: tight right ali@0: tile the ali@0: tiling thing ali@0: tip up ali@0: tram train ali@0: tune time ali@0: u " ali@0: wen well ali@0: yon you ali@0: ali@0: *********************************************************************/ ali@0: