ali@0: /*************************************************************************/ ali@40: /* bookloupe--check for assorted weirdnesses in a PG candidate text file */ ali@0: /* */ ali@0: /* Copyright 2000-2005 Jim Tinsley */ ali@40: /* Copyright 2012- J. Ali Harlow */ ali@0: /* */ ali@0: /* This program is free software; you can redistribute it and/or modify */ ali@0: /* it under the terms of the GNU General Public License as published by */ ali@0: /* the Free Software Foundation; either version 2 of the License, or */ ali@0: /* (at your option) any later version. */ ali@0: /* */ ali@0: /* This program is distributed in the hope that it will be useful, */ ali@0: /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ ali@40: /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ ali@0: /* GNU General Public License for more details. */ ali@0: /* */ ali@0: /* You should have received a copy of the GNU General Public License */ ali@40: /* along with this program. If not, see . */ ali@0: /*************************************************************************/ ali@0: ali@0: #include ali@0: #include ali@0: #include ali@0: #include ali@0: ali@0: #define MAXWORDLEN 80 /* max length of one word */ ali@0: #define LINEBUFSIZE 2048 /* buffer size for an input line */ ali@0: ali@0: #define MAX_USER_TYPOS 1000 ali@0: #define USERTYPO_FILE "gutcheck.typ" ali@0: ali@0: #ifndef MAX_PATH ali@0: #define MAX_PATH 16384 ali@0: #endif ali@0: ali@0: char aline[LINEBUFSIZE]; ali@0: char prevline[LINEBUFSIZE]; ali@0: ali@40: /* Common typos. */ ali@40: char *typo[] = { ali@40: "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", ali@40: "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", ali@40: "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", ali@40: "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse", ali@40: "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", ali@40: "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign", ali@40: "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis", ali@40: "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", ali@40: "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", ali@40: "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices", ali@40: "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem", ali@40: "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", ali@40: "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath", ali@40: "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier", ali@40: "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", ali@40: "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", ali@40: "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta", ali@40: "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats", ali@40: "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", ali@40: "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve", ali@40: "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf", ali@40: "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped", ali@40: "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", ali@40: "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", ali@40: "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond", ali@40: "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile", ali@40: "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic", ali@40: "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", ali@40: "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee", ali@40: "se", "" ali@40: }; ali@0: ali@0: char *usertypo[MAX_USER_TYPOS]; ali@0: ali@40: /* Common abbreviations and other OK words not to query as typos. */ ali@40: char *okword[] = { ali@40: "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", ali@40: "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", ali@40: "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats", ali@40: "outbid", "outbids", "frostbite", "frostbitten", "" ali@40: }; ali@0: ali@40: /* Common abbreviations that cause otherwise unexplained periods. */ ali@40: char *abbrev[] = { ali@40: "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", ali@40: "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", "" ali@40: }; ali@0: ali@40: /* ali@40: * Two-Letter combinations that rarely if ever start words, ali@40: * but are common scannos or otherwise common letter combinations. ali@40: */ ali@40: char *nostart[] = { ali@40: "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", "" ali@40: }; ali@0: ali@40: /* ali@40: * Two-Letter combinations that rarely if ever end words, ali@40: * but are common scannos or otherwise common letter combinations. ali@40: */ ali@40: char *noend[] = { ali@40: "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl", ali@40: "sw", "gr", "sl", "cl", "iy", "" ali@40: }; ali@0: ali@40: char *markup[] = { ali@40: "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em", ali@40: "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", ali@40: "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub", ali@40: "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", "" ali@40: }; ali@0: ali@40: char *DPmarkup[] = { ali@40: "", "", "/*", "*/", "/#", "#/", "/$", "$/", "", "" ali@40: }; ali@0: ali@40: char *nocomma[] = { ali@40: "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose", ali@40: "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm", ali@40: "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm", ali@40: "during", "let", "toward", "among", "" ali@40: }; ali@0: ali@40: char *noperiod[] = { ali@40: "every", "i'm", "during", "that's", "their", "your", "our", "my", "or", ali@40: "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether", ali@40: "i'll", "whose", "who", "because", "when", "let", "till", "very", "an", ali@40: "among", "those", "into", "whom", "having", "thence", "" ali@40: }; ali@0: ali@40: char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; ali@0: ali@0: struct { ali@0: char *htmlent; ali@0: char *htmlnum; ali@0: char *textent; ali@40: } entities[] = { ali@40: "&", "&", "&", ali@40: "<", "<", "<", ali@40: ">", ">", ">", ali@40: "°", "°", " degrees", ali@40: "£", "£", "L", ali@40: """, """, "\"", /* quotation mark = APL quote */ ali@40: "Œ", "Œ", "OE", /* latin capital ligature OE */ ali@40: "œ", "œ", "oe", /* latin small ligature oe */ ali@40: "Š", "Š", "S", /* latin capital letter S with caron */ ali@40: "š", "š", "s", /* latin small letter s with caron */ ali@40: "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */ ali@40: "ˆ", "ˆ", "", /* modifier letter circumflex accent */ ali@40: "˜", "˜", "~", /* small tilde, U+02DC ISOdia */ ali@40: " ", " ", " ", /* en space, U+2002 ISOpub */ ali@40: " ", " ", " ", /* em space, U+2003 ISOpub */ ali@40: " ", " ", " ", /* thin space, U+2009 ISOpub */ ali@40: "–", "–", "-", /* en dash, U+2013 ISOpub */ ali@40: "—", "—", "--", /* em dash, U+2014 ISOpub */ ali@40: "’", "’", "'", /* right single quotation mark */ ali@40: "‚", "‚", "'", /* single low-9 quotation mark */ ali@40: "“", "“", "\"", /* left double quotation mark */ ali@40: "”", "”", "\"", /* right double quotation mark */ ali@40: "„", "„", "\"", /* double low-9 quotation mark */ ali@40: "‹", "‹", "\"", /* single left-pointing angle quotation mark */ ali@40: "›", "›", "\"", /* single right-pointing angle quotation mark */ ali@40: " ", " ", " ", /* no-break space = non-breaking space, */ ali@40: "¡", "¡", "!", /* inverted exclamation mark */ ali@40: "¢", "¢", "c", /* cent sign */ ali@40: "£", "£", "L", /* pound sign */ ali@40: "¤", "¤", "$", /* currency sign */ ali@40: "¥", "¥", "Y", /* yen sign = yuan sign */ ali@40: "§", "§", "--", /* section sign */ ali@40: "¨", "¨", " ", /* diaeresis = spacing diaeresis */ ali@40: "©", "©", "(C) ", /* copyright sign */ ali@40: "ª", "ª", " ", /* feminine ordinal indicator */ ali@40: "«", "«", "\"", /* left-pointing double angle quotation mark */ ali@40: "­", "­", "-", /* soft hyphen = discretionary hyphen */ ali@40: "®", "®", "(R) ", /* registered sign = registered trade mark sign */ ali@40: "¯", "¯", " ", /* macron = spacing macron = overline */ ali@40: "°", "°", " degrees", /* degree sign */ ali@40: "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */ ali@40: "²", "²", "2", /* superscript two = superscript digit two */ ali@40: "³", "³", "3", /* superscript three = superscript digit three */ ali@40: "´", "´", " ", /* acute accent = spacing acute */ ali@40: "µ", "µ", "m", /* micro sign */ ali@40: "¶", "¶", "--", /* pilcrow sign = paragraph sign */ ali@40: "¸", "¸", " ", /* cedilla = spacing cedilla */ ali@40: "¹", "¹", "1", /* superscript one = superscript digit one */ ali@40: "º", "º", " ", /* masculine ordinal indicator */ ali@40: "»", "»", "\"", /* right-pointing double angle quotation mark */ ali@40: "¼", "¼", "1/4", /* vulgar fraction one quarter */ ali@40: "½", "½", "1/2", /* vulgar fraction one half */ ali@40: "¾", "¾", "3/4", /* vulgar fraction three quarters */ ali@40: "¿", "¿", "?", /* inverted question mark */ ali@40: "À", "À", "A", /* latin capital letter A with grave */ ali@40: "Á", "Á", "A", /* latin capital letter A with acute */ ali@40: "Â", "Â", "A", /* latin capital letter A with circumflex */ ali@40: "Ã", "Ã", "A", /* latin capital letter A with tilde */ ali@40: "Ä", "Ä", "A", /* latin capital letter A with diaeresis */ ali@40: "Å", "Å", "A", /* latin capital letter A with ring above */ ali@40: "Æ", "Æ", "AE", /* latin capital letter AE */ ali@40: "Ç", "Ç", "C", /* latin capital letter C with cedilla */ ali@40: "È", "È", "E", /* latin capital letter E with grave */ ali@40: "É", "É", "E", /* latin capital letter E with acute */ ali@40: "Ê", "Ê", "E", /* latin capital letter E with circumflex */ ali@40: "Ë", "Ë", "E", /* latin capital letter E with diaeresis */ ali@40: "Ì", "Ì", "I", /* latin capital letter I with grave */ ali@40: "Í", "Í", "I", /* latin capital letter I with acute */ ali@40: "Î", "Î", "I", /* latin capital letter I with circumflex */ ali@40: "Ï", "Ï", "I", /* latin capital letter I with diaeresis */ ali@40: "Ð", "Ð", "E", /* latin capital letter ETH */ ali@40: "Ñ", "Ñ", "N", /* latin capital letter N with tilde */ ali@40: "Ò", "Ò", "O", /* latin capital letter O with grave */ ali@40: "Ó", "Ó", "O", /* latin capital letter O with acute */ ali@40: "Ô", "Ô", "O", /* latin capital letter O with circumflex */ ali@40: "Õ", "Õ", "O", /* latin capital letter O with tilde */ ali@40: "Ö", "Ö", "O", /* latin capital letter O with diaeresis */ ali@40: "×", "×", "*", /* multiplication sign */ ali@40: "Ø", "Ø", "O", /* latin capital letter O with stroke */ ali@40: "Ù", "Ù", "U", /* latin capital letter U with grave */ ali@40: "Ú", "Ú", "U", /* latin capital letter U with acute */ ali@40: "Û", "Û", "U", /* latin capital letter U with circumflex */ ali@40: "Ü", "Ü", "U", /* latin capital letter U with diaeresis */ ali@40: "Ý", "Ý", "Y", /* latin capital letter Y with acute */ ali@40: "Þ", "Þ", "TH", /* latin capital letter THORN */ ali@40: "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */ ali@40: "à", "à", "a", /* latin small letter a with grave */ ali@40: "á", "á", "a", /* latin small letter a with acute */ ali@40: "â", "â", "a", /* latin small letter a with circumflex */ ali@40: "ã", "ã", "a", /* latin small letter a with tilde */ ali@40: "ä", "ä", "a", /* latin small letter a with diaeresis */ ali@40: "å", "å", "a", /* latin small letter a with ring above */ ali@40: "æ", "æ", "ae", /* latin small letter ae */ ali@40: "ç", "ç", "c", /* latin small letter c with cedilla */ ali@40: "è", "è", "e", /* latin small letter e with grave */ ali@40: "é", "é", "e", /* latin small letter e with acute */ ali@40: "ê", "ê", "e", /* latin small letter e with circumflex */ ali@40: "ë", "ë", "e", /* latin small letter e with diaeresis */ ali@40: "ì", "ì", "i", /* latin small letter i with grave */ ali@40: "í", "í", "i", /* latin small letter i with acute */ ali@40: "î", "î", "i", /* latin small letter i with circumflex */ ali@40: "ï", "ï", "i", /* latin small letter i with diaeresis */ ali@40: "ð", "ð", "eth", /* latin small letter eth */ ali@40: "ñ", "ñ", "n", /* latin small letter n with tilde */ ali@40: "ò", "ò", "o", /* latin small letter o with grave */ ali@40: "ó", "ó", "o", /* latin small letter o with acute */ ali@40: "ô", "ô", "o", /* latin small letter o with circumflex */ ali@40: "õ", "õ", "o", /* latin small letter o with tilde */ ali@40: "ö", "ö", "o", /* latin small letter o with diaeresis */ ali@40: "÷", "÷", "/", /* division sign */ ali@40: "ø", "ø", "o", /* latin small letter o with stroke */ ali@40: "ù", "ù", "u", /* latin small letter u with grave */ ali@40: "ú", "ú", "u", /* latin small letter u with acute */ ali@40: "û", "û", "u", /* latin small letter u with circumflex */ ali@40: "ü", "ü", "u", /* latin small letter u with diaeresis */ ali@40: "ý", "ý", "y", /* latin small letter y with acute */ ali@40: "þ", "þ", "th", /* latin small letter thorn */ ali@40: "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */ ali@40: "", "" ali@40: }; ali@40: ali@40: /* special characters */ ali@0: #define CHAR_SPACE 32 ali@0: #define CHAR_TAB 9 ali@0: #define CHAR_LF 10 ali@0: #define CHAR_CR 13 ali@0: #define CHAR_DQUOTE 34 ali@0: #define CHAR_SQUOTE 39 ali@0: #define CHAR_OPEN_SQUOTE 96 ali@0: #define CHAR_TILDE 126 ali@0: #define CHAR_ASTERISK 42 ali@0: #define CHAR_FORESLASH 47 ali@0: #define CHAR_CARAT 94 ali@0: ali@0: #define CHAR_UNDERSCORE '_' ali@0: #define CHAR_OPEN_CBRACK '{' ali@0: #define CHAR_CLOSE_CBRACK '}' ali@0: #define CHAR_OPEN_RBRACK '(' ali@0: #define CHAR_CLOSE_RBRACK ')' ali@0: #define CHAR_OPEN_SBRACK '[' ali@0: #define CHAR_CLOSE_SBRACK ']' ali@0: ali@40: /* longest and shortest normal PG line lengths */ ali@0: #define LONGEST_PG_LINE 75 ali@0: #define WAY_TOO_LONG 80 ali@0: #define SHORTEST_PG_LINE 55 ali@0: ali@0: #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */ ali@0: /* D - ignore DP-specific markup */ ali@0: /* E - echo queried line */ ali@0: /* S - check single quotes */ ali@0: /* T - check common typos */ ali@0: /* P - require closure of quotes on */ ali@0: /* every paragraph */ ali@0: /* X - "Trust no one" :-) Paranoid! */ ali@0: /* Queries everything */ ali@0: /* L - line end checking defaults on */ ali@0: /* -L turns it off */ ali@0: /* O - overview. Just shows counts. */ ali@0: /* Y - puts errors to stdout */ ali@0: /* instead of stderr */ ali@0: /* H - Echoes header fields */ ali@0: /* M - Ignore markup in < > */ ali@0: /* U - Use file of User-defined Typos*/ ali@0: /* W - Defaults for use on Web upload*/ ali@0: /* V - Verbose - list EVERYTHING! */ ali@0: #define SWITNO 14 /* max number of switch parms */ ali@0: /* - used for defining array-size */ ali@0: #define MINARGS 1 /* minimum no of args excl switches */ ali@0: #define MAXARGS 1 /* maximum no of args excl switches */ ali@0: ali@0: int pswit[SWITNO]; /* program switches set by SWITCHES */ ali@0: ali@0: #define ECHO_SWITCH 0 ali@0: #define SQUOTE_SWITCH 1 ali@0: #define TYPO_SWITCH 2 ali@0: #define QPARA_SWITCH 3 ali@0: #define PARANOID_SWITCH 4 ali@0: #define LINE_END_SWITCH 5 ali@0: #define OVERVIEW_SWITCH 6 ali@0: #define STDOUT_SWITCH 7 ali@0: #define HEADER_SWITCH 8 ali@0: #define WEB_SWITCH 9 ali@0: #define VERBOSE_SWITCH 10 ali@0: #define MARKUP_SWITCH 11 ali@0: #define USERTYPO_SWITCH 12 ali@0: #define DP_SWITCH 13 ali@0: ali@0: long cnt_dquot; /* for overview mode, count of doublequote queries */ ali@0: long cnt_squot; /* for overview mode, count of singlequote queries */ ali@0: long cnt_brack; /* for overview mode, count of brackets queries */ ali@0: long cnt_bin; /* for overview mode, count of non-ASCII queries */ ali@0: long cnt_odd; /* for overview mode, count of odd character queries */ ali@0: long cnt_long; /* for overview mode, count of long line errors */ ali@0: long cnt_short; /* for overview mode, count of short line queries */ ali@0: long cnt_punct; /* for overview mode, count of punctuation and spacing queries */ ali@0: long cnt_dash; /* for overview mode, count of dash-related queries */ ali@0: long cnt_word; /* for overview mode, count of word queries */ ali@0: long cnt_html; /* for overview mode, count of html queries */ ali@0: long cnt_lineend; /* for overview mode, count of line-end queries */ ali@40: long cnt_spacend; /* count of lines with space at end */ ali@0: long linecnt; /* count of total lines in the file */ ali@40: long checked_linecnt; /* count of lines actually checked */ ali@0: ali@0: void proghelp(void); ali@0: void procfile(char *); ali@0: ali@0: #define LOW_THRESHOLD 0 ali@0: #define HIGH_THRESHOLD 1 ali@0: ali@0: #define START 0 ali@0: #define END 1 ali@0: #define PREV 0 ali@0: #define NEXT 1 ali@0: #define FIRST_OF_PAIR 0 ali@0: #define SECOND_OF_PAIR 1 ali@0: ali@0: #define MAX_WORDPAIR 1000 ali@0: ali@0: char running_from[MAX_PATH]; ali@0: ali@0: int mixdigit(char *); ali@54: const char *getaword(const char *,char *); ali@40: int matchword(char *,char *); ali@40: char *flgets(char *,int,FILE *,long); ali@0: void lowerit(char *); ali@0: int gcisalpha(unsigned char); ali@0: int gcisdigit(unsigned char); ali@0: int gcisletter(unsigned char); ali@40: char *gcstrchr(char *s,char c); ali@0: void postprocess_for_HTML(char *); ali@0: char *linehasmarkup(char *); ali@0: char *losemarkup(char *); ali@40: int tagcomp(char *,char *); ali@0: char *loseentities(char *); ali@0: int isroman(char *); ali@0: int usertypo_count; ali@0: void postprocess_for_DP(char *); ali@0: ali@0: char wrk[LINEBUFSIZE]; ali@0: ali@40: #define MAX_QWORD 50 ali@40: #define MAX_QWORD_LENGTH 40 ali@0: char qword[MAX_QWORD][MAX_QWORD_LENGTH]; ali@0: signed int dupcnt[MAX_QWORD]; ali@0: ali@40: int main(int argc,char **argv) ali@0: { ali@40: char *argsw,*s; ali@40: int i,switno,invarg; ali@0: char usertypo_file[MAX_PATH]; ali@0: FILE *usertypofile; ali@40: if (strlen(argv[0])=running_from;s--) ali@40: *s=0; ali@40: switno=strlen(SWITCHES); ali@40: for (i=switno;--i>0;) ali@40: pswit[i]=0; /* initialise switches */ ali@40: /* ali@40: * Standard loop to extract switches. ali@40: * When we come out of this loop, the arguments will be ali@40: * in argv[0] upwards and the switches used will be ali@40: * represented by their equivalent elements in pswit[] ali@40: */ ali@40: while (--argc>0 && **++argv=='-') ali@40: for (argsw=argv[0]+1;*argsw!='\0';argsw++) ali@40: for (i=switno,invarg=1;(--i>=0) && invarg==1;) ali@40: if ((toupper(*argsw))==SWITCHES[i]) ali@40: { ali@40: invarg=0; ali@40: pswit[i]=1; ali@40: } ali@40: /* Paranoid checking is turned OFF, not on, by its switch */ ali@40: pswit[PARANOID_SWITCH]^=1; ali@40: if (pswit[PARANOID_SWITCH]) ali@40: /* if running in paranoid mode force typo checks as well */ ali@40: pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1; ali@40: /* Line-end checking is turned OFF, not on, by its switch */ ali@40: pswit[LINE_END_SWITCH]^=1; ali@40: /* Echoing is turned OFF, not on, by its switch */ ali@40: pswit[ECHO_SWITCH]^=1; ali@40: if (pswit[OVERVIEW_SWITCH]) ali@40: /* just print summary; don't echo */ ali@40: pswit[ECHO_SWITCH]=0; ali@40: /* ali@40: * Web uploads - for the moment, this is really just a placeholder ali@40: * until we decide what processing we really want to do on web uploads ali@40: */ ali@40: if (pswit[WEB_SWITCH]) ali@40: { ali@40: /* specific override for web uploads */ ali@40: pswit[ECHO_SWITCH]=1; ali@40: pswit[SQUOTE_SWITCH]=0; ali@40: pswit[TYPO_SWITCH]=1; ali@40: pswit[QPARA_SWITCH]=0; ali@40: pswit[PARANOID_SWITCH]=1; ali@40: pswit[LINE_END_SWITCH]=0; ali@40: pswit[OVERVIEW_SWITCH]=0; ali@40: pswit[STDOUT_SWITCH]=0; ali@40: pswit[HEADER_SWITCH]=1; ali@40: pswit[VERBOSE_SWITCH]=0; ali@40: pswit[MARKUP_SWITCH]=0; ali@40: pswit[USERTYPO_SWITCH]=0; ali@40: pswit[DP_SWITCH]=0; ali@40: } ali@40: if (argcMAXARGS) ali@40: { ali@40: /* check number of args */ ali@0: proghelp(); ali@40: return 1; ali@40: } ali@0: /* read in the user-defined stealth scanno list */ ali@40: if (pswit[USERTYPO_SWITCH]) ali@40: { ali@40: /* ... we were told we had one! */ ali@40: usertypofile=fopen(USERTYPO_FILE,"rb"); ali@40: if (!usertypofile) ali@40: { ali@40: /* not in cwd. try excuteable directory. */ ali@40: strcpy(usertypo_file,running_from); ali@40: strcat(usertypo_file,USERTYPO_FILE); ali@40: usertypofile=fopen(usertypo_file,"rb"); ali@40: if (!usertypofile) { ali@40: /* we ain't got no user typo file! */ ali@40: printf(" --> I couldn't find gutcheck.typ " ali@40: "-- proceeding without user typos.\n"); ali@40: } ali@40: } ali@40: usertypo_count=0; ali@40: if (usertypofile) ali@40: { ali@40: /* we managed to open a User Typo File! */ ali@40: if (pswit[USERTYPO_SWITCH]) ali@40: { ali@40: while (flgets(aline,LINEBUFSIZE-1,usertypofile, ali@40: (long)usertypo_count)) ali@40: { ali@40: if (strlen(aline)>1) ali@40: { ali@40: if ((int)*aline>33) ali@40: { ali@40: s=malloc(strlen(aline)+1); ali@40: if (!s) ali@40: { ali@40: fprintf(stderr,"bookloupe: cannot get enough " ali@40: "memory for user typo file!\n"); ali@0: exit(1); ali@40: } ali@40: strcpy(s,aline); ali@40: usertypo[usertypo_count]=s; ali@0: usertypo_count++; ali@40: if (usertypo_count>=MAX_USER_TYPOS) ali@40: { ali@40: printf(" --> Only %d user-defined typos " ali@42: "allowed: ignoring the rest\n", ali@42: MAX_USER_TYPOS); ali@0: break; ali@40: } ali@40: } ali@40: } ali@40: } ali@40: } ali@0: fclose(usertypofile); ali@40: } ali@40: } ali@40: fprintf(stderr,"bookloupe: Check and report on an e-text\n"); ali@40: cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long= ali@40: cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend= ali@40: cnt_spacend=0; ali@0: procfile(argv[0]); ali@40: if (pswit[OVERVIEW_SWITCH]) ali@40: { ali@40: printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n", ali@40: checked_linecnt,linecnt,linecnt-checked_linecnt); ali@40: printf(" --------------- Queries found --------------\n"); ali@40: if (cnt_long) ali@40: printf(" Long lines: %14ld\n",cnt_long); ali@40: if (cnt_short) ali@40: printf(" Short lines: %14ld\n",cnt_short); ali@40: if (cnt_lineend) ali@40: printf(" Line-end problems: %14ld\n",cnt_lineend); ali@40: if (cnt_word) ali@40: printf(" Common typos: %14ld\n",cnt_word); ali@40: if (cnt_dquot) ali@40: printf(" Unmatched quotes: %14ld\n",cnt_dquot); ali@40: if (cnt_squot) ali@40: printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot); ali@40: if (cnt_brack) ali@40: printf(" Unmatched brackets: %14ld\n",cnt_brack); ali@40: if (cnt_bin) ali@40: printf(" Non-ASCII characters: %14ld\n",cnt_bin); ali@40: if (cnt_odd) ali@40: printf(" Proofing characters: %14ld\n",cnt_odd); ali@40: if (cnt_punct) ali@40: printf(" Punctuation & spacing queries: %14ld\n",cnt_punct); ali@40: if (cnt_dash) ali@40: printf(" Non-standard dashes: %14ld\n",cnt_dash); ali@40: if (cnt_html) ali@40: printf(" Possible HTML tags: %14ld\n",cnt_html); ali@0: printf("\n"); ali@40: printf(" TOTAL QUERIES %14ld\n", ali@40: cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+ ali@40: cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend); ali@40: } ali@40: return 0; ali@0: } ali@0: ali@41: struct first_pass_results { ali@41: long firstline,astline; ali@41: long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma; ali@41: long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit; ali@41: long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash; ali@41: signed int Dutchcount,Frenchcount; ali@41: }; ali@41: ali@40: /* ali@41: * first_pass: ali@40: * ali@41: * Run a first pass - verify that it's a valid PG ali@41: * file, decide whether to report some things that ali@41: * occur many times in the text like long or short ali@41: * lines, non-standard dashes, etc. ali@40: */ ali@41: struct first_pass_results *first_pass(FILE *infile) ali@0: { ali@54: char laststart=CHAR_SPACE; ali@54: const char *s; ali@41: signed int i,llen; ali@41: unsigned int lastlen=0,lastblen=0; ali@41: long spline=0,nspline=0; ali@41: static struct first_pass_results results={0}; ali@41: char inword[MAXWORDLEN]=""; ali@40: while (fgets(aline,LINEBUFSIZE-1,infile)) ali@40: { ali@40: while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13) ali@40: aline[strlen(aline)-1]=0; ali@0: linecnt++; ali@40: if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") && ali@40: (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT"))) ali@40: { ali@0: if (spline) ali@0: printf(" --> Duplicate header?\n"); ali@40: spline=linecnt+1; /* first line of non-header text, that is */ ali@40: } ali@40: if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG")) ali@40: { ali@0: if (nspline) ali@0: printf(" --> Duplicate header?\n"); ali@40: nspline=linecnt+1; /* first line of non-header text, that is */ ali@40: } ali@40: if (spline || nspline) ali@40: { ali@0: lowerit(aline); ali@40: if (strstr(aline,"end") && strstr(aline,"project gutenberg")) ali@40: { ali@40: if (strstr(aline,"end") Duplicate footer?\n"); ali@40: } ali@40: else ali@41: results.footerline=linecnt; ali@40: } ali@40: } ali@40: } ali@40: if (spline) ali@41: results.firstline=spline; ali@40: if (nspline) ali@41: results.firstline=nspline; /* override with new */ ali@41: if (results.footerline) ali@40: continue; /* don't count the boilerplate in the footer */ ali@40: llen=strlen(aline); ali@41: results.totlen+=llen; ali@40: for (i=0;i127) ali@41: results.binlen++; ali@40: if (gcisalpha(aline[i])) ali@41: results.alphalen++; ali@40: if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1])) ali@41: results.endquote_count++; ali@40: } ali@40: if (strlen(aline)>2 && lastlen>2 && lastlen2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE) ali@41: results.shortline++; ali@40: if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE) ali@40: cnt_spacend++; ali@40: if (strstr(aline,".,")) ali@41: results.dotcomma++; ali@40: /* only count ast lines for ignoring purposes where there is */ ali@0: /* locase text on the line */ ali@40: if (strstr(aline,"*")) ali@40: { ali@40: for (s=aline;*s;s++) ali@40: if (*s>='a' && *s<='z') ali@0: break; ali@40: if (*s) ali@41: results.astline++; ali@40: } ali@40: if (strstr(aline,"/")) ali@41: results.fslashline++; ali@40: for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--) ali@40: ; ali@40: if (aline[i]=='-' && aline[i-1]!='-') ali@41: results.hyphens++; ali@40: if (llen>LONGEST_PG_LINE) ali@41: results.longline++; ali@40: if (llen>WAY_TOO_LONG) ali@41: results.verylongline++; ali@40: if (strstr(aline,"<") && strstr(aline,">")) ali@40: { ali@40: i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1); ali@40: if (i>0) ali@41: results.htmcount++; ali@40: if (strstr(aline,"")) ali@41: results.htmcount+=4; /* bonus marks! */ ali@40: } ali@0: /* Check for spaced em-dashes */ ali@40: if (strstr(aline,"--")) ali@40: { ali@41: results.emdash++; ali@40: if (*(strstr(aline,"--")-1)==CHAR_SPACE || ali@40: (*(strstr(aline,"--")+2)==CHAR_SPACE)) ali@41: results.space_emdash++; ali@40: if (*(strstr(aline,"--")-1)==CHAR_SPACE && ali@40: (*(strstr(aline,"--")+2)==CHAR_SPACE)) ali@40: /* count of em-dashes with spaces both sides */ ali@41: results.non_PG_space_emdash++; ali@40: if (*(strstr(aline,"--")-1)!=CHAR_SPACE && ali@40: (*(strstr(aline,"--")+2)!=CHAR_SPACE)) ali@40: /* count of PG-type em-dashes with no spaces */ ali@41: results.PG_space_emdash++; ali@40: } ali@40: for (s=aline;*s;) ali@40: { ali@40: s=getaword(s,inword); ali@40: if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) ali@41: results.Dutchcount++; ali@40: if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) ali@41: results.Frenchcount++; ali@40: if (!strcmp(inword,"0") || !strcmp(inword,"1")) ali@41: results.standalone_digit++; ali@40: } ali@0: /* Check for spaced dashes */ ali@40: if (strstr(aline," -") && *(strstr(aline," -")+2)!='-') ali@41: results.spacedash++; ali@40: lastblen=lastlen; ali@40: lastlen=strlen(aline); ali@40: laststart=aline[0]; ali@40: } ali@41: return &results; ali@41: } ali@41: ali@42: struct warnings { ali@42: signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen; ali@42: signed int endquote,isDutch,isFrench; ali@42: }; ali@42: ali@42: /* ali@42: * report_first_pass: ali@42: * ali@42: * Make some snap decisions based on the first pass results. ali@42: */ ali@42: struct warnings *report_first_pass(struct first_pass_results *results) ali@42: { ali@42: static struct warnings warnings={0}; ali@42: if (cnt_spacend>0) ali@42: printf(" --> %ld lines in this file have white space at end\n", ali@42: cnt_spacend); ali@42: warnings.dotcomma=1; ali@42: if (results->dotcomma>5) ali@42: { ali@42: warnings.dotcomma=0; ali@42: printf(" --> %ld lines in this file contain '.,'. " ali@42: "Not reporting them.\n",results->dotcomma); ali@42: } ali@42: /* ali@42: * If more than 50 lines, or one-tenth, are short, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.shortline=1; ali@42: if (results->shortline>50 || results->shortline*10>linecnt) ali@42: { ali@42: warnings.shortline=0; ali@42: printf(" --> %ld lines in this file are short. " ali@42: "Not reporting short lines.\n",results->shortline); ali@42: } ali@42: /* ali@42: * If more than 50 lines, or one-tenth, are long, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.longline=1; ali@42: if (results->longline>50 || results->longline*10>linecnt) ali@42: { ali@42: warnings.longline=0; ali@42: printf(" --> %ld lines in this file are long. " ali@42: "Not reporting long lines.\n",results->longline); ali@42: } ali@42: /* If more than 10 lines contain asterisks, don't bother reporting them. */ ali@42: warnings.ast=1; ali@42: if (results->astline>10) ali@42: { ali@42: warnings.ast=0; ali@42: printf(" --> %ld lines in this file contain asterisks. " ali@42: "Not reporting them.\n",results->astline); ali@42: } ali@42: /* ali@42: * If more than 10 lines contain forward slashes, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.fslash=1; ali@42: if (results->fslashline>10) ali@42: { ali@42: warnings.fslash=0; ali@42: printf(" --> %ld lines in this file contain forward slashes. " ali@42: "Not reporting them.\n",results->fslashline); ali@42: } ali@42: /* ali@42: * If more than 20 lines contain unpunctuated endquotes, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.endquote=1; ali@42: if (results->endquote_count>20) ali@42: { ali@42: warnings.endquote=0; ali@42: printf(" --> %ld lines in this file contain unpunctuated endquotes. " ali@42: "Not reporting them.\n",results->endquote_count); ali@42: } ali@42: /* ali@42: * If more than 15 lines contain standalone digits, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.digit=1; ali@42: if (results->standalone_digit>10) ali@42: { ali@42: warnings.digit=0; ali@42: printf(" --> %ld lines in this file contain standalone 0s and 1s. " ali@42: "Not reporting them.\n",results->standalone_digit); ali@42: } ali@42: /* ali@42: * If more than 20 lines contain hyphens at end, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.hyphen=1; ali@42: if (results->hyphens>20) ali@42: { ali@42: warnings.hyphen=0; ali@42: printf(" --> %ld lines in this file have hyphens at end. " ali@42: "Not reporting them.\n",results->hyphens); ali@42: } ali@42: if (results->htmcount>20 && !pswit[MARKUP_SWITCH]) ali@42: { ali@42: printf(" --> Looks like this is HTML. Switching HTML mode ON.\n"); ali@42: pswit[MARKUP_SWITCH]=1; ali@42: } ali@42: if (results->verylongline>0) ali@42: printf(" --> %ld lines in this file are VERY long!\n", ali@42: results->verylongline); ali@42: /* ali@42: * If there are more non-PG spaced dashes than PG em-dashes, ali@42: * assume it's deliberate. ali@42: * Current PG guidelines say don't use them, but older texts do, ali@42: * and some people insist on them whatever the guidelines say. ali@42: */ ali@42: warnings.dash=1; ali@42: if (results->spacedash+results->non_PG_space_emdash> ali@42: results->PG_space_emdash) ali@42: { ali@42: warnings.dash=0; ali@42: printf(" --> There are %ld spaced dashes and em-dashes. " ali@42: "Not reporting them.\n", ali@42: results->spacedash+results->non_PG_space_emdash); ali@42: } ali@42: /* If more than a quarter of characters are hi-bit, bug out. */ ali@42: warnings.bin=1; ali@42: if (results->binlen*4>results->totlen) ali@42: { ali@42: printf(" --> This file does not appear to be ASCII. " ali@42: "Terminating. Best of luck with it!\n"); ali@42: exit(1); ali@42: } ali@42: if (results->alphalen*4totlen) ali@42: { ali@42: printf(" --> This file does not appear to be text. " ali@42: "Terminating. Best of luck with it!\n"); ali@42: exit(1); ali@42: } ali@42: if (results->binlen*100>results->totlen || results->binlen>100) ali@42: { ali@42: printf(" --> There are a lot of foreign letters here. " ali@42: "Not reporting them.\n"); ali@42: warnings.bin=0; ali@42: } ali@42: warnings.isDutch=0; ali@42: if (results->Dutchcount>50) ali@42: { ali@42: warnings.isDutch=1; ali@42: printf(" --> This looks like Dutch - " ali@42: "switching off dashes and warnings for 's Middags case.\n"); ali@42: } ali@42: warnings.isFrench=0; ali@42: if (results->Frenchcount>50) ali@42: { ali@42: warnings.isFrench=1; ali@42: printf(" --> This looks like French - " ali@42: "switching off some doublepunct.\n"); ali@42: } ali@42: if (results->firstline && results->footerline) ali@42: printf(" The PG header and footer appear to be already on.\n"); ali@42: else ali@42: { ali@42: if (results->firstline) ali@42: printf(" The PG header is on - no footer.\n"); ali@42: if (results->footerline) ali@42: printf(" The PG footer is on - no header.\n"); ali@42: } ali@42: printf("\n"); ali@42: if (pswit[VERBOSE_SWITCH]) ali@42: { ali@42: warnings.bin=1; ali@42: warnings.shortline=1; ali@42: warnings.dotcomma=1; ali@42: warnings.longline=1; ali@42: warnings.dash=1; ali@42: warnings.digit=1; ali@42: warnings.ast=1; ali@42: warnings.fslash=1; ali@42: warnings.hyphen=1; ali@42: warnings.endquote=1; ali@42: printf(" *** Verbose output is ON -- you asked for it! ***\n"); ali@42: } ali@42: if (warnings.isDutch) ali@42: warnings.dash=0; ali@42: if (results->footerline>0 && results->firstline>0 && ali@42: results->footerline>results->firstline && ali@42: results->footerline-results->firstline<100) ali@42: { ali@42: printf(" --> I don't really know where this text starts. \n"); ali@42: printf(" There are no reference points.\n"); ali@42: printf(" I'm going to have to report the header and footer " ali@42: "as well.\n"); ali@42: results->firstline=0; ali@42: } ali@42: return &warnings; ali@42: } ali@42: ali@43: struct counters { ali@43: long quot; ali@43: signed int c_unders,c_brack,s_brack,r_brack; ali@43: signed int open_single_quote,close_single_quote; ali@43: }; ali@43: ali@43: /* ali@43: * analyse_quotes: ali@43: * ali@43: * Look along the line, accumulate the count of quotes, and see ali@43: * if this is an empty line - i.e. a line with nothing on it ali@43: * but spaces. ali@43: * If line has just spaces, period, * and/or - on it, don't ali@43: * count it, since empty lines with asterisks or dashes to ali@43: * separate sections are common. ali@43: * ali@43: * Returns: Non-zero if the line is empty. ali@43: */ ali@43: int analyse_quotes(const char *s,struct counters *counters) ali@43: { ali@43: signed int guessquote=0; ali@43: int isemptyline=1; /* assume the line is empty until proven otherwise */ ali@43: while (*s) ali@43: { ali@43: if (*s==CHAR_DQUOTE) ali@43: counters->quot++; ali@43: if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) ali@43: { ali@43: if (s==aline) ali@43: { ali@43: /* ali@43: * At start of line, it can only be an openquote. ali@43: * Hardcode a very common exception! ali@43: */ ali@43: if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3)) ali@43: counters->open_single_quote++; ali@43: } ali@43: else if (gcisalpha(s[-1]) && gcisalpha(s[1])) ali@43: /* Do nothing! it's definitely an apostrophe, not a quote */ ali@43: ; ali@43: /* it's outside a word - let's check it out */ ali@43: else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1])) ali@43: { ali@43: /* it damwell better BE an openquote */ ali@43: if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3)) ali@43: /* hardcode a very common exception! */ ali@43: counters->open_single_quote++; ali@43: } ali@43: else ali@43: { ali@43: /* now - is it a closequote? */ ali@43: guessquote=0; /* accumulate clues */ ali@43: if (gcisalpha(s[-1])) ali@43: { ali@43: /* it follows a letter - could be either */ ali@43: guessquote++; ali@43: if (s[-1]=='s') ali@43: { ali@43: /* looks like a plural apostrophe */ ali@43: guessquote-=3; ali@43: if (s[1]==CHAR_SPACE) /* bonus marks! */ ali@43: guessquote-=2; ali@43: } ali@43: } ali@43: /* it doesn't have a letter either side */ ali@43: else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1])) ali@43: guessquote+=8; /* looks like a closequote */ ali@43: else ali@43: guessquote++; ali@43: if (counters->open_single_quote>counters->close_single_quote) ali@43: /* ali@43: * Give it the benefit of some doubt, ali@43: * if a squote is already open. ali@43: */ ali@43: guessquote++; ali@43: else ali@43: guessquote--; ali@43: if (guessquote>=0) ali@43: counters->close_single_quote++; ali@43: } ali@43: } ali@43: if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK && ali@43: *s!=13 && *s!=10) ali@43: isemptyline=0; /* ignore lines like * * * as spacers */ ali@43: if (*s==CHAR_UNDERSCORE) ali@43: counters->c_unders++; ali@43: if (*s==CHAR_OPEN_CBRACK) ali@43: counters->c_brack++; ali@43: if (*s==CHAR_CLOSE_CBRACK) ali@43: counters->c_brack--; ali@43: if (*s==CHAR_OPEN_RBRACK) ali@43: counters->r_brack++; ali@43: if (*s==CHAR_CLOSE_RBRACK) ali@43: counters->r_brack--; ali@43: if (*s==CHAR_OPEN_SBRACK) ali@43: counters->s_brack++; ali@43: if (*s==CHAR_CLOSE_SBRACK) ali@43: counters->s_brack--; ali@43: s++; ali@43: } ali@43: return isemptyline; ali@43: } ali@43: ali@41: /* ali@44: * check_for_odd_characters: ali@44: * ali@44: * Check for binary and other odd characters. ali@44: */ ali@44: void check_for_odd_characters(const char *aline,const struct warnings *warnings, ali@44: int isemptyline) ali@44: { ali@44: /* Don't repeat multiple warnings on one line. */ ali@44: signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0; ali@44: const char *s; ali@44: unsigned char c; ali@44: for (s=aline;*s;s++) ali@44: { ali@44: c=*(unsigned char *)s; ali@44: if (!eNon_A && (*s127)) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@44: printf("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@44: if (c>127 && c<160) ali@44: printf(" Line %ld column %d - " ali@44: "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c); ali@44: else ali@44: printf(" Line %ld column %d - Non-ASCII character %d\n", ali@44: linecnt,(int)(s-aline)+1,c); ali@44: else ali@44: cnt_bin++; ali@44: eNon_A=1; ali@44: } ali@44: if (!eTab && *s==CHAR_TAB) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@44: printf("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@44: printf(" Line %ld column %d - Tab character?\n", ali@44: linecnt,(int)(s-aline)+1); ali@44: else ali@44: cnt_odd++; ali@44: eTab=1; ali@44: } ali@44: if (!eTilde && *s==CHAR_TILDE) ali@44: { ali@44: /* ali@44: * Often used by OCR software to indicate an ali@44: * unrecognizable character. ali@44: */ ali@44: if (pswit[ECHO_SWITCH]) ali@44: printf("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@44: printf(" Line %ld column %d - Tilde character?\n", ali@44: linecnt,(int)(s-aline)+1); ali@44: else ali@44: cnt_odd++; ali@44: eTilde=1; ali@44: } ali@44: if (!eCarat && *s==CHAR_CARAT) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@44: printf("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@44: printf(" Line %ld column %d - Carat character?\n", ali@44: linecnt,(int)(s-aline)+1); ali@44: else ali@44: cnt_odd++; ali@44: eCarat=1; ali@44: } ali@44: if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@44: printf("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@44: printf(" Line %ld column %d - Forward slash?\n", ali@44: linecnt,(int)(s-aline)+1); ali@44: else ali@44: cnt_odd++; ali@44: eFSlash=1; ali@44: } ali@44: /* ali@44: * Report asterisks only in paranoid mode, ali@44: * since they're often deliberate. ali@44: */ ali@44: if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline && ali@44: *s==CHAR_ASTERISK) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@44: printf("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@44: printf(" Line %ld column %d - Asterisk?\n", ali@44: linecnt,(int)(s-aline)+1); ali@44: else ali@44: cnt_odd++; ali@44: eAst=1; ali@44: } ali@44: } ali@44: } ali@44: ali@44: /* ali@45: * check_for_long_line: ali@45: * ali@45: * Check for line too long. ali@45: */ ali@45: void check_for_long_line(const char *aline) ali@45: { ali@45: if (strlen(aline)>LONGEST_PG_LINE) ali@45: { ali@45: if (pswit[ECHO_SWITCH]) ali@45: printf("\n%s\n",aline); ali@45: if (!pswit[OVERVIEW_SWITCH]) ali@45: printf(" Line %ld column %d - Long line %d\n", ali@45: linecnt,strlen(aline),strlen(aline)); ali@45: else ali@45: cnt_long++; ali@45: } ali@45: } ali@45: ali@45: struct line_properties { ali@45: unsigned int len,blen; ali@45: char start; ali@45: }; ali@45: ali@45: /* ali@45: * check_for_short_line: ali@45: * ali@45: * Check for line too short. ali@45: * ali@45: * This one is a bit trickier to implement: we don't want to ali@45: * flag the last line of a paragraph for being short, so we ali@45: * have to wait until we know that our current line is a ali@45: * "normal" line, then report the _previous_ line if it was too ali@45: * short. We also don't want to report indented lines like ali@45: * chapter heads or formatted quotations. We therefore keep ali@45: * last->len as the length of the last line examined, and ali@45: * last->blen as the length of the last but one, and try to ali@45: * suppress unnecessary warnings by checking that both were of ali@45: * "normal" length. We keep the first character of the last ali@45: * line in last->start, and if it was a space, we assume that ali@45: * the formatting is deliberate. I can't figure out a way to ali@45: * distinguish something like a quoted verse left-aligned or ali@45: * the header or footer of a letter from a paragraph of short ali@45: * lines - maybe if I examined the whole paragraph, and if the ali@45: * para has less than, say, 8 lines and if all lines are short, ali@45: * then just assume it's OK? Need to look at some texts to see ali@45: * how often a formula like this would get the right result. ali@45: */ ali@45: void check_for_short_line(const char *aline,const struct line_properties *last) ali@45: { ali@45: if (strlen(aline)>1 && last->len>1 && last->lenblen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE) ali@45: { ali@45: if (pswit[ECHO_SWITCH]) ali@45: printf("\n%s\n",prevline); ali@45: if (!pswit[OVERVIEW_SWITCH]) ali@45: printf(" Line %ld column %d - Short line %d?\n", ali@45: linecnt-1,strlen(prevline),strlen(prevline)); ali@45: else ali@45: cnt_short++; ali@45: } ali@45: } ali@45: ali@45: /* ali@46: * check_for_starting_punctuation: ali@46: * ali@46: * Look for punctuation other than full ellipses at start of line. ali@46: */ ali@46: void check_for_starting_punctuation(const char *aline) ali@46: { ali@46: if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5)) ali@46: { ali@46: if (pswit[ECHO_SWITCH]) ali@46: printf("\n%s\n",aline); ali@46: if (!pswit[OVERVIEW_SWITCH]) ali@46: printf(" Line %ld column 1 - Begins with punctuation?\n", ali@46: linecnt); ali@46: else ali@46: cnt_punct++; ali@46: } ali@46: } ali@46: ali@46: /* ali@47: * check_for_spaced_emdash: ali@47: * ali@47: * Check for spaced em-dashes. ali@47: * ali@47: * We must check _all_ occurrences of "--" on the line ali@47: * hence the loop - even if the first double-dash is OK ali@47: * there may be another that's wrong later on. ali@47: */ ali@47: void check_for_spaced_emdash(const char *aline) ali@47: { ali@47: const char *s,*t; ali@47: s=aline; ali@47: while ((t=strstr(s,"--"))) ali@47: { ali@47: if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE) ali@47: { ali@47: if (pswit[ECHO_SWITCH]) ali@47: printf("\n%s\n",aline); ali@47: if (!pswit[OVERVIEW_SWITCH]) ali@47: printf(" Line %ld column %d - Spaced em-dash?\n", ali@47: linecnt,(int)(t-aline)+1); ali@47: else ali@47: cnt_dash++; ali@47: } ali@47: s=t+2; ali@47: } ali@47: } ali@47: ali@47: /* ali@47: * check_for_spaced_dash: ali@47: * ali@47: * Check for spaced dashes. ali@47: */ ali@47: void check_for_spaced_dash(const char *aline) ali@47: { ali@47: const char *s; ali@47: if ((s=strstr(aline," -"))) ali@47: { ali@47: if (s[2]!='-') ali@47: { ali@47: if (pswit[ECHO_SWITCH]) ali@47: printf("\n%s\n",aline); ali@47: if (!pswit[OVERVIEW_SWITCH]) ali@47: printf(" Line %ld column %d - Spaced dash?\n", ali@47: linecnt,(int)(s-aline)+1); ali@47: else ali@47: cnt_dash++; ali@47: } ali@47: } ali@47: else if ((s=strstr(aline,"- "))) ali@47: { ali@47: if (s==aline || s[-1]!='-') ali@47: { ali@47: if (pswit[ECHO_SWITCH]) ali@47: printf("\n%s\n",aline); ali@47: if (!pswit[OVERVIEW_SWITCH]) ali@47: printf(" Line %ld column %d - Spaced dash?\n", ali@47: linecnt,(int)(s-aline)+1); ali@47: else ali@47: cnt_dash++; ali@47: } ali@47: } ali@47: } ali@47: ali@47: /* ali@48: * check_for_unmarked_paragraphs: ali@48: * ali@48: * Check for unmarked paragraphs indicated by separate speakers. ali@48: * ali@48: * May well be false positive: ali@48: * "Bravo!" "Wonderful!" called the crowd. ali@48: * but useful all the same. ali@48: */ ali@48: void check_for_unmarked_paragraphs(const char *aline) ali@48: { ali@48: const char *s; ali@48: s=strstr(aline,"\" \""); ali@48: if (!s) ali@48: s=strstr(aline,"\" \""); ali@48: if (s) ali@48: { ali@48: if (pswit[ECHO_SWITCH]) ali@48: printf("\n%s\n",aline); ali@48: if (!pswit[OVERVIEW_SWITCH]) ali@48: printf(" Line %ld column %d - Query missing paragraph break?\n", ali@48: linecnt,(int)(s-aline)+1); ali@48: else ali@48: cnt_punct++; ali@48: } ali@48: } ali@48: ali@48: /* ali@49: * check_for_jeebies: ali@49: * ali@49: * Check for "to he" and other easy h/b errors. ali@49: * ali@49: * This is a very inadequate effort on the h/b problem, ali@49: * but the phrase "to he" is always an error, whereas "to ali@49: * be" is quite common. ali@49: * Similarly, '"Quiet!", be said.' is a non-be error ali@49: * "to he" is _not_ always an error!: ali@49: * "Where they went to he couldn't say." ali@49: * Another false positive: ali@49: * What would "Cinderella" be without the . . . ali@49: * and another: "If he wants to he can see for himself." ali@49: */ ali@49: void check_for_jeebies(const char *aline) ali@49: { ali@49: const char *s; ali@49: s=strstr(aline," be could "); ali@49: if (!s) ali@49: s=strstr(aline," be would "); ali@49: if (!s) ali@49: s=strstr(aline," was be "); ali@49: if (!s) ali@49: s=strstr(aline," be is "); ali@49: if (!s) ali@49: s=strstr(aline," is be "); ali@49: if (!s) ali@49: s=strstr(aline,"\", be "); ali@49: if (!s) ali@49: s=strstr(aline,"\" be "); ali@49: if (!s) ali@49: s=strstr(aline,"\" be "); ali@49: if (!s) ali@49: s=strstr(aline," to he "); ali@49: if (s) ali@49: { ali@49: if (pswit[ECHO_SWITCH]) ali@49: printf("\n%s\n",aline); ali@49: if (!pswit[OVERVIEW_SWITCH]) ali@49: printf(" Line %ld column %d - Query he/be error?\n", ali@49: linecnt,(int)(s-aline)+1); ali@49: else ali@49: cnt_word++; ali@49: } ali@49: s=strstr(aline," the had "); ali@49: if (!s) ali@49: s=strstr(aline," a had "); ali@49: if (!s) ali@49: s=strstr(aline," they bad "); ali@49: if (!s) ali@49: s=strstr(aline," she bad "); ali@49: if (!s) ali@49: s=strstr(aline," he bad "); ali@49: if (!s) ali@49: s=strstr(aline," you bad "); ali@49: if (!s) ali@49: s=strstr(aline," i bad "); ali@49: if (s) ali@49: { ali@49: if (pswit[ECHO_SWITCH]) ali@49: printf("\n%s\n",aline); ali@49: if (!pswit[OVERVIEW_SWITCH]) ali@49: printf(" Line %ld column %d - Query had/bad error?\n", ali@49: linecnt,(int)(s-aline)+1); ali@49: else ali@49: cnt_word++; ali@49: } ali@49: s=strstr(aline,"; hut "); ali@49: if (!s) ali@49: s=strstr(aline,", hut "); ali@49: if (s) ali@49: { ali@49: if (pswit[ECHO_SWITCH]) ali@49: printf("\n%s\n",aline); ali@49: if (!pswit[OVERVIEW_SWITCH]) ali@49: printf(" Line %ld column %d - Query hut/but error?\n", ali@49: linecnt,(int)(s-aline)+1); ali@49: else ali@49: cnt_word++; ali@49: } ali@49: } ali@49: ali@49: /* ali@50: * check_for_mta_from: ali@50: * ali@50: * Special case - angled bracket in front of "From" placed there by an ali@50: * MTA when sending an e-mail. ali@50: */ ali@50: void check_for_mta_from(const char *aline) ali@50: { ali@50: const char *s; ali@50: s=strstr(aline,">From"); ali@50: if (s) ali@50: { ali@50: if (pswit[ECHO_SWITCH]) ali@50: printf("\n%s\n",aline); ali@50: if (!pswit[OVERVIEW_SWITCH]) ali@50: printf(" Line %ld column %d - Query angled bracket with From\n", ali@50: linecnt,(int)(s-aline)+1); ali@50: else ali@50: cnt_punct++; ali@50: } ali@50: } ali@50: ali@50: /* ali@51: * check_for_orphan_character: ali@51: * ali@51: * Check for a single character line - ali@51: * often an overflow from bad wrapping. ali@51: */ ali@51: void check_for_orphan_character(const char *aline) ali@51: { ali@51: if (*aline && !aline[1]) ali@51: { ali@51: if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' || ali@51: gcisdigit(*aline)) ali@51: ; /* Nothing - ignore numerals alone on a line. */ ali@51: else ali@51: { ali@51: if (pswit[ECHO_SWITCH]) ali@51: printf("\n%s\n",aline); ali@51: if (!pswit[OVERVIEW_SWITCH]) ali@51: printf(" Line %ld column 1 - Query single character line\n", ali@51: linecnt); ali@51: else ali@51: cnt_punct++; ali@51: } ali@51: } ali@51: } ali@51: ali@51: /* ali@52: * check_for_pling_scanno: ali@52: * ali@52: * Check for I" - often should be ! ali@52: */ ali@52: void check_for_pling_scanno(const char *aline) ali@52: { ali@52: const char *s; ali@52: s=strstr(aline," I\""); ali@52: if (s) ali@52: { ali@52: if (pswit[ECHO_SWITCH]) ali@52: printf("\n%s\n",aline); ali@52: if (!pswit[OVERVIEW_SWITCH]) ali@52: printf(" Line %ld column %ld - Query I=exclamation mark?\n", ali@52: linecnt,s-aline); ali@52: else ali@52: cnt_punct++; ali@52: } ali@52: } ali@52: ali@52: /* ali@53: * check_for_extra_period: ali@53: * ali@53: * Check for period without a capital letter. Cut-down from gutspell. ali@53: * Only works when it happens on a single line. ali@53: */ ali@53: void check_for_extra_period(const char *aline,const struct warnings *warnings) ali@53: { ali@53: const char *s,*t,*s1; ali@53: signed int i,istypo,isdup; ali@53: static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH]; ali@53: static int qperiod_index=0; ali@53: char testword[MAXWORDLEN]=""; ali@53: if (pswit[PARANOID_SWITCH]) ali@53: { ali@53: for (t=s=aline;strstr(t,". ");) ali@53: { ali@53: t=strstr(t,". "); ali@53: if (t==s) ali@53: { ali@53: t++; ali@53: /* start of line punctuation is handled elsewhere */ ali@53: continue; ali@53: } ali@53: if (!gcisalpha(t[-1])) ali@53: { ali@53: t++; ali@53: continue; ali@53: } ali@53: if (warnings->isDutch) ali@53: { ali@53: /* For Frank & Jeroen -- 's Middags case */ ali@53: if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' && ali@53: t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z') ali@53: { ali@53: t++; ali@53: continue; ali@53: } ali@53: } ali@53: s1=t+2; ali@53: while (*s1 && !gcisalpha(*s1) && !isdigit(*s1)) ali@53: s1++; ali@53: if (*s1>='a' && *s1<='z') ali@53: { ali@53: /* we have something to investigate */ ali@53: istypo=1; ali@53: /* so let's go back and find out */ ali@53: for (s1=t-1;s1>=s && ali@53: (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE && ali@53: gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--) ali@53: ; ali@53: s1++; ali@53: for (i=0;*s1 && *s1!='.';s1++,i++) ali@53: testword[i]=*s1; ali@53: testword[i]=0; ali@53: for (i=0;*abbrev[i];i++) ali@53: if (!strcmp(testword,abbrev[i])) ali@53: istypo=0; ali@53: if (gcisdigit(*testword)) ali@53: istypo=0; ali@53: if (!testword[1]) ali@53: istypo=0; ali@53: if (isroman(testword)) ali@53: istypo=0; ali@53: if (istypo) ali@53: { ali@53: istypo=0; ali@53: for (i=0;testword[i];i++) ali@53: if (strchr(vowels,testword[i])) ali@53: istypo=1; ali@53: } ali@53: if (istypo) ali@53: { ali@53: isdup=0; ali@53: if (strlen(testword)='a' && testword[i]<='z') ali@55: alower=1; ali@55: if (alower && testword[i]>='A' && testword[i]<='Z') ali@55: { ali@55: /* ali@55: * We have an uppercase mid-word. However, there are ali@55: * common cases: ali@55: * Mac and Mc like McGill ali@55: * French contractions like l'Abbe ali@55: */ ali@55: if (i==2 && testword[0]=='m' && testword[1]=='c' || ali@55: i==3 && testword[0]=='m' && testword[1]=='a' && ali@55: testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE) ali@55: ; /* do nothing! */ ali@55: else ali@55: istypo=1; ali@55: } ali@55: testword[i]=(char)tolower(testword[i]); ali@55: } ali@55: /* ali@55: * Check for certain unlikely two-letter combinations at word ali@55: * start and end. ali@55: */ ali@55: if (strlen(testword)>1) ali@55: { ali@55: for (i=0;*nostart[i];i++) ali@55: if (!strncmp(testword,nostart[i],2)) ali@55: istypo=1; ali@55: for (i=0;*noend[i];i++) ali@55: if (!strncmp(testword+strlen(testword)-2,noend[i],2)) ali@55: istypo=1; ali@55: } ali@55: /* ght is common, gbt never. Like that. */ ali@55: if (strstr(testword,"cb")) ali@55: istypo=1; ali@55: if (strstr(testword,"gbt")) ali@55: istypo=1; ali@55: if (strstr(testword,"pbt")) ali@55: istypo=1; ali@55: if (strstr(testword,"tbs")) ali@55: istypo=1; ali@55: if (strstr(testword,"mrn")) ali@55: istypo=1; ali@55: if (strstr(testword,"ahle")) ali@55: istypo=1; ali@55: if (strstr(testword,"ihle")) ali@55: istypo=1; ali@55: /* ali@55: * "TBE" does happen - like HEARTBEAT - but uncommon. ali@55: * Also "TBI" - frostbite, outbid - but uncommon. ali@55: * Similarly "ii" like Hawaii, or Pompeii, and in Roman ali@55: * numerals, but "ii" is a common scanno. ali@55: */ ali@55: if (strstr(testword,"tbi")) ali@55: istypo=1; ali@55: if (strstr(testword,"tbe")) ali@55: istypo=1; ali@55: if (strstr(testword,"ii")) ali@55: istypo=1; ali@55: /* ali@55: * Check for no vowels or no consonants. ali@55: * If none, flag a typo. ali@55: */ ali@55: if (!istypo && strlen(testword)>1) ali@55: { ali@55: vowel=consonant=0; ali@55: for (i=0;testword[i];i++) ali@55: { ali@55: if (testword[i]=='y' || gcisdigit(testword[i])) ali@55: { ali@55: /* Yah, this is loose. */ ali@55: vowel++; ali@55: consonant++; ali@55: } ali@55: else if (strchr(vowels,testword[i])) ali@55: vowel++; ali@55: else ali@55: consonant++; ali@55: } ali@55: if (!vowel || !consonant) ali@55: istypo=1; ali@55: } ali@55: /* ali@55: * Now exclude the word from being reported if it's in ali@55: * the okword list. ali@55: */ ali@55: for (i=0;*okword[i];i++) ali@55: if (!strcmp(testword,okword[i])) ali@55: istypo=0; ali@55: /* ali@55: * What looks like a typo may be a Roman numeral. ali@55: * Exclude these. ali@55: */ ali@55: if (istypo && isroman(testword)) ali@55: istypo=0; ali@55: /* Check the manual list of typos. */ ali@55: if (!istypo) ali@55: for (i=0;*typo[i];i++) ali@55: if (!strcmp(testword,typo[i])) ali@55: istypo=1; ali@55: /* ali@55: * Check lowercase s, l, i and m - special cases. ali@55: * "j" - often a semi-colon gone wrong. ali@55: * "d" for a missing apostrophe - he d ali@55: * "n" for "in" ali@55: */ ali@55: if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword)) ali@55: istypo=1; ali@55: if (istypo) ali@55: { ali@55: isdup=0; ali@55: if (strlen(testword)digit) ali@55: { ali@55: /* In paranoid mode, query all 0 and 1 standing alone. */ ali@55: if (!strcmp(inword,"0") || !strcmp(inword,"1")) ali@55: { ali@55: if (pswit[ECHO_SWITCH]) ali@55: printf("\n%s\n",aline); ali@55: if (!pswit[OVERVIEW_SWITCH]) ali@55: printf(" Line %ld column %d - Query standalone %s\n", ali@55: linecnt,(int)(wordstart-aline)+2,inword); ali@55: else ali@55: cnt_word++; ali@55: } ali@55: } ali@55: } ali@55: } ali@55: ali@56: struct parities { ali@56: int dquote,squote; ali@56: }; ali@56: ali@56: /* ali@56: * check_for_misspaced_punctuation: ali@56: * ali@56: * Look for added or missing spaces around punctuation and quotes. ali@56: * If there is a punctuation character like ! with no space on ali@56: * either side, suspect a missing!space. If there are spaces on ali@56: * both sides , assume a typo. If we see a double quote with no ali@56: * space or punctuation on either side of it, assume unspaced ali@56: * quotes "like"this. ali@56: */ ali@56: void check_for_misspaced_punctuation(const char *aline, ali@56: struct parities *parities,int isemptyline) ali@56: { ali@56: int i,llen,isacro,isellipsis; ali@56: const char *s; ali@56: llen=strlen(aline); ali@56: for (i=1;i2 && aline[i-2]=='.') ali@56: isacro=1; ali@56: if (i+22 && aline[i-2]=='.') ali@56: isellipsis=1; ali@56: if (i+2dquote=!parities->dquote; ali@56: if (!parities->dquote) ali@56: { ali@56: /* parity even */ ali@56: if (!strchr("_-.'`/,;:!?)]} ",s[1])) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@56: printf("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@56: printf(" Line %ld column %d - " ali@56: "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: else ali@56: { ali@56: /* parity odd */ ali@56: if (!gcisalpha(s[1]) && !isdigit(s[1]) && ali@56: !strchr("_-/.'`([{$",s[1]) || !s[1]) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@56: printf("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@56: printf(" Line %ld column %d - " ali@56: "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: } ali@56: } ali@56: if (*aline==CHAR_DQUOTE) ali@56: { ali@56: if (strchr(",;:!?)]} ",aline[1])) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@56: printf("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@56: printf(" Line %ld column 1 - Wrongspaced quotes?\n", ali@56: linecnt); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: if (pswit[SQUOTE_SWITCH]) ali@56: { ali@56: for (s=aline;*s;s++) ali@56: { ali@56: if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) && ali@56: (s==aline || s>aline && !gcisalpha(s[-1]) || ali@56: !gcisalpha(s[1]))) ali@56: { ali@56: parities->squote=!parities->squote; ali@56: if (!parities->squote) ali@56: { ali@56: /* parity even */ ali@56: if (!strchr("_-.'`/\",;:!?)]} ",s[1])) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@56: printf("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@56: printf(" Line %ld column %d - " ali@56: "Wrongspaced singlequotes?\n", ali@56: linecnt,(int)(s-aline)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: else ali@56: { ali@56: /* parity odd */ ali@56: if (!gcisalpha(s[1]) && !isdigit(s[1]) && ali@56: !strchr("_-/\".'`",s[1]) || !s[1]) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@56: printf("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@56: printf(" Line %ld column %d - " ali@56: "Wrongspaced singlequotes?\n", ali@56: linecnt,(int)(s-aline)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: } ali@56: } ali@56: } ali@56: } ali@56: ali@55: /* ali@57: * check_for_double_punctuation: ali@57: * ali@57: * Look for double punctuation like ,. or ,, ali@57: * Thanks to DW for the suggestion! ali@57: * In books with references, ".," and ".;" are common ali@57: * e.g. "etc., etc.," and vol. 1.; vol 3.; ali@57: * OTOH, from my initial tests, there are also fairly ali@57: * common errors. What to do? Make these cases paranoid? ali@57: * ".," is the most common, so warnings->dotcomma is used ali@57: * to suppress detailed reporting if it occurs often. ali@57: */ ali@57: void check_for_double_punctuation(const char *aline,struct warnings *warnings) ali@57: { ali@57: int i,llen; ali@57: llen=strlen(aline); ali@57: for (i=0;idotcomma && aline[i]=='.' && aline[i+1]==',' || ali@57: warnings->isFrench && !strncmp(aline+i,",...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...,",4) || ali@57: warnings->isFrench && !strncmp(aline+i,";...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...;",4) || ali@57: warnings->isFrench && !strncmp(aline+i,":...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...:",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"!...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...!",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"?...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...?",4)) ali@57: { ali@57: if (warnings->isFrench && !strncmp(aline+i,",...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...,",4) || ali@57: warnings->isFrench && !strncmp(aline+i,";...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...;",4) || ali@57: warnings->isFrench && !strncmp(aline+i,":...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...:",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"!...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...!",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"?...",4) || ali@57: warnings->isFrench && !strncmp(aline+i,"...?",4)) ali@57: i+=4; ali@57: ; /* do nothing for .. !! and ?? which can be legit */ ali@57: } ali@57: else ali@57: { ali@57: if (pswit[ECHO_SWITCH]) ali@57: printf("\n%s\n",aline); ali@57: if (!pswit[OVERVIEW_SWITCH]) ali@57: printf(" Line %ld column %d - Double punctuation?\n", ali@57: linecnt,i+1); ali@57: else ali@57: cnt_punct++; ali@57: } ali@57: } ali@57: } ali@57: } ali@57: ali@57: /* ali@58: * check_for_spaced_quotes: ali@58: */ ali@58: void check_for_spaced_quotes(const char *aline) ali@58: { ali@58: const char *s,*t; ali@58: s=aline; ali@58: while ((t=strstr(s," \" "))) ali@58: { ali@58: if (pswit[ECHO_SWITCH]) ali@58: printf("\n%s\n",aline); ali@58: if (!pswit[OVERVIEW_SWITCH]) ali@58: printf(" Line %ld column %d - Spaced doublequote?\n", ali@58: linecnt,(int)(t-aline+1)); ali@58: else ali@58: cnt_punct++; ali@58: s=t+2; ali@58: } ali@58: s=aline; ali@58: while ((t=strstr(s," ' "))) ali@58: { ali@58: if (pswit[ECHO_SWITCH]) ali@58: printf("\n%s\n",aline); ali@58: if (!pswit[OVERVIEW_SWITCH]) ali@58: printf(" Line %ld column %d - Spaced singlequote?\n", ali@58: linecnt,(int)(t-aline+1)); ali@58: else ali@58: cnt_punct++; ali@58: s=t+2; ali@58: } ali@58: s=aline; ali@58: while ((t=strstr(s," ` "))) ali@58: { ali@58: if (pswit[ECHO_SWITCH]) ali@58: printf("\n%s\n",aline); ali@58: if (!pswit[OVERVIEW_SWITCH]) ali@58: printf(" Line %ld column %d - Spaced singlequote?\n", ali@58: linecnt,(int)(t-aline+1)); ali@58: else ali@58: cnt_punct++; ali@58: s=t+2; ali@58: } ali@58: } ali@58: ali@58: /* ali@59: * check_for_miscased_genative: ali@59: * ali@59: * Check special case of 'S instead of 's at end of word. ali@59: */ ali@59: void check_for_miscased_genative(const char *aline) ali@59: { ali@59: const char *s; ali@59: s=aline+1; ali@59: while (*s) ali@59: { ali@59: if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z') ali@59: { ali@59: if (pswit[ECHO_SWITCH]) ali@59: printf("\n%s\n",aline); ali@59: if (!pswit[OVERVIEW_SWITCH]) ali@59: printf(" Line %ld column %d - Capital \"S\"?\n", ali@59: linecnt,(int)(s-aline+2)); ali@59: else ali@59: cnt_punct++; ali@59: } ali@59: s++; ali@59: } ali@59: } ali@59: ali@59: /* ali@41: * procfile: ali@41: * ali@41: * Process one file. ali@41: */ ali@41: void procfile(char *filename) ali@41: { ali@55: const char *s,*t; ali@41: char parastart[81]; /* first line of current para */ ali@41: FILE *infile; ali@41: struct first_pass_results *first_pass_results; ali@42: struct warnings *warnings; ali@43: struct counters counters={0}; ali@45: struct line_properties last={0}; ali@56: struct parities parities={0}; ali@43: int isemptyline; ali@43: long squot,start_para_line; ali@55: signed int i,llen,isacro,isellipsis; ali@55: signed int isnewpara; ali@41: char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80], ali@41: cbrack_err[80],unders_err[80]; ali@41: signed int enddash; ali@45: last.start=CHAR_SPACE; ali@41: *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err= ali@41: *unders_err=*prevline=0; ali@41: linecnt=checked_linecnt=start_para_line=0; ali@43: squot=0; ali@53: i=llen=isacro=isellipsis=0; ali@55: isnewpara=enddash=0; ali@41: infile=fopen(filename,"rb"); ali@41: if (!infile) ali@41: { ali@41: if (pswit[STDOUT_SWITCH]) ali@41: fprintf(stdout,"bookloupe: cannot open %s\n",filename); ali@41: else ali@41: fprintf(stderr,"bookloupe: cannot open %s\n",filename); ali@41: exit(1); ali@41: } ali@41: fprintf(stdout,"\n\nFile: %s\n\n",filename); ali@41: first_pass_results=first_pass(infile); ali@42: warnings=report_first_pass(first_pass_results); ali@42: rewind(infile); ali@40: /* ali@40: * Here we go with the main pass. Hold onto yer hat! ali@40: * Re-init some variables we've dirtied. ali@40: */ ali@43: squot=linecnt=0; ali@40: while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1)) ali@40: { ali@0: linecnt++; ali@40: if (linecnt==1) ali@40: isnewpara=1; ali@40: if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11)) ali@40: continue; // skip DP page separators completely ali@41: if (linecntfirstline || ali@41: (first_pass_results->footerline>0 && ali@41: linecnt>first_pass_results->footerline)) ali@40: { ali@40: if (pswit[HEADER_SWITCH]) ali@40: { ali@40: if (!strncmp(aline,"Title:",6)) ali@40: printf(" %s\n",aline); ali@40: if (!strncmp(aline,"Author:",7)) ali@40: printf(" %s\n",aline); ali@40: if (!strncmp(aline,"Release Date:",13)) ali@40: printf(" %s\n",aline); ali@40: if (!strncmp(aline,"Edition:",8)) ali@40: printf(" %s\n\n",aline); ali@40: } ali@0: continue; /* skip through the header */ ali@40: } ali@0: checked_linecnt++; ali@40: s=aline; ali@40: /* ali@40: * If we are in a state of unbalanced quotes, and this line ali@40: * doesn't begin with a quote, output the stored error message. ali@40: * If the -P switch was used, print the warning even if the ali@40: * new para starts with quotes. ali@40: */ ali@40: t=s; ali@40: while (*t==' ') ali@40: t++; ali@0: if (*dquote_err) ali@40: if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH]) ali@40: { ali@40: if (!pswit[OVERVIEW_SWITCH]) ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",parastart); ali@0: printf(dquote_err); ali@40: } ali@0: else ali@0: cnt_dquot++; ali@0: } ali@40: if (*squote_err) ali@40: { ali@40: if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE || ali@40: pswit[QPARA_SWITCH] || squot) ali@40: { ali@40: if (!pswit[OVERVIEW_SWITCH]) ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",parastart); ali@0: printf(squote_err); ali@40: } ali@0: else ali@0: cnt_squot++; ali@40: } ali@40: squot=0; ali@40: } ali@40: if (*rbrack_err) ali@40: { ali@40: if (!pswit[OVERVIEW_SWITCH]) ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",parastart); ali@0: printf(rbrack_err); ali@40: } ali@0: else ali@0: cnt_brack++; ali@40: } ali@40: if (*sbrack_err) ali@40: { ali@40: if (!pswit[OVERVIEW_SWITCH]) ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",parastart); ali@0: printf(sbrack_err); ali@40: } ali@0: else ali@0: cnt_brack++; ali@40: } ali@40: if (*cbrack_err) ali@40: { ali@40: if (!pswit[OVERVIEW_SWITCH]) ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",parastart); ali@0: printf(cbrack_err); ali@40: } ali@0: else ali@0: cnt_brack++; ali@40: } ali@40: if (*unders_err) ali@40: { ali@40: if (!pswit[OVERVIEW_SWITCH]) ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",parastart); ali@0: printf(unders_err); ali@40: } ali@0: else ali@0: cnt_brack++; ali@40: } ali@40: *dquote_err=*squote_err=*rbrack_err=*cbrack_err= ali@40: *sbrack_err=*unders_err=0; ali@43: isemptyline=analyse_quotes(aline,&counters); ali@40: if (isnewpara && !isemptyline) ali@40: { ali@40: /* This line is the start of a new paragraph. */ ali@40: start_para_line=linecnt; ali@40: /* Capture its first line in case we want to report it later. */ ali@40: strncpy(parastart,aline,80); ali@40: parastart[79]=0; ali@56: memset(&parities,0,sizeof(parities)); /* restart the quote count */ ali@40: s=aline; ali@40: while (!gcisalpha(*s) && !gcisdigit(*s) && *s) ali@40: s++; ali@40: if (*s>='a' && *s<='z') ali@40: { ali@40: /* and its first letter is lowercase */ ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@40: printf(" Line %ld column %d - " ali@40: "Paragraph starts with lower-case\n", ali@40: linecnt,(int)(s-aline)+1); ali@0: else ali@0: cnt_punct++; ali@40: } ali@40: isnewpara=0; /* Signal the end of new para processing. */ ali@40: } ali@40: /* Check for an em-dash broken at line end. */ ali@40: if (enddash && *aline=='-') ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@40: printf(" Line %ld column 1 - Broken em-dash?\n",linecnt); ali@0: else ali@0: cnt_punct++; ali@40: } ali@40: enddash=0; ali@40: for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--) ali@40: ; ali@40: if (s>=aline && *s=='-') ali@40: enddash=1; ali@40: /* ali@40: * Check for invalid or questionable characters in the line ali@40: * Anything above 127 is invalid for plain ASCII, and ali@40: * non-printable control characters should also be flagged. ali@40: * Tabs should generally not be there. ali@40: */ ali@40: for (s=aline;*s;s++) ali@40: { ali@40: i=(unsigned char)*s; ali@40: if (ibin) ali@44: check_for_odd_characters(aline,warnings,isemptyline); ali@42: if (warnings->longline) ali@45: check_for_long_line(aline); ali@45: if (warnings->shortline) ali@45: check_for_short_line(aline,&last); ali@45: last.blen=last.len; ali@45: last.len=strlen(aline); ali@45: last.start=aline[0]; ali@46: check_for_starting_punctuation(aline); ali@42: if (warnings->dash) ali@40: { ali@47: check_for_spaced_emdash(aline); ali@47: check_for_spaced_dash(aline); ali@40: } ali@48: check_for_unmarked_paragraphs(aline); ali@49: check_for_jeebies(aline); ali@50: check_for_mta_from(aline); ali@51: check_for_orphan_character(aline); ali@52: check_for_pling_scanno(aline); ali@53: check_for_extra_period(aline,warnings); ali@54: check_for_following_punctuation(aline); ali@55: check_for_typos(aline,warnings); ali@56: check_for_misspaced_punctuation(aline,&parities,isemptyline); ali@57: check_for_double_punctuation(aline,warnings); ali@58: check_for_spaced_quotes(aline); ali@59: check_for_miscased_genative(aline); ali@40: /* ali@40: * Now check special cases - start and end of line - ali@40: * for single and double quotes. Start is sometimes [sic] ali@40: * but better to query it anyway. ali@40: * While we're here, check for dash at end of line. ali@40: */ ali@40: llen=strlen(aline); ali@40: if (llen>1) ali@40: { ali@40: if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE || ali@40: aline[llen-1]==CHAR_OPEN_SQUOTE) ali@40: if (aline[llen-2]==CHAR_SPACE) ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@40: printf(" Line %ld column %d - Spaced quote?\n", ali@40: linecnt,llen); ali@0: else ali@0: cnt_punct++; ali@40: } ali@40: if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) && ali@40: aline[1]==CHAR_SPACE) ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",aline); ali@40: if (!pswit[OVERVIEW_SWITCH]) ali@40: printf(" Line %ld column 1 - Spaced quote?\n",linecnt); ali@40: else ali@40: cnt_punct++; ali@40: } ali@40: /* ali@40: * Dash at end of line may well be legit - paranoid mode only ali@40: * and don't report em-dash at line-end. ali@40: */ ali@42: if (pswit[PARANOID_SWITCH] && warnings->hyphen) ali@40: { ali@40: for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--) ali@40: ; ali@40: if (aline[i]=='-' && aline[i-1]!='-') ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@40: printf(" Line %ld column %d - " ali@40: "Hyphen at end of line?\n",linecnt,i); ali@40: } ali@40: } ali@40: } ali@40: /* ali@40: * Brackets are often unspaced, but shouldn't be surrounded by alpha. ali@40: * If so, suspect a scanno like "a]most". ali@40: */ ali@40: llen=strlen(aline); ali@40: for (i=1;iendquote) ali@40: { ali@40: for (i=1;i. ali@40: * If there is a < in the line, followed at some point ali@40: * by a > then we suspect HTML. ali@40: */ ali@40: if (strstr(aline,"<") && strstr(aline,">")) ali@40: { ali@40: i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1); ali@40: if (i>0) ali@40: { ali@40: strncpy(wrk,strstr(aline,"<"),i); ali@40: wrk[i]=0; ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@40: printf(" Line %ld column %d - HTML Tag? %s \n", ali@40: linecnt,(int)(strstr(aline,"<")-aline)+1,wrk); ali@0: else ali@0: cnt_html++; ali@40: } ali@40: } ali@40: /* ali@40: * Check for &symbol; HTML. ali@40: * If there is a & in the line, followed at ali@40: * some point by a ; then we suspect HTML. ali@40: */ ali@40: if (strstr(aline,"&") && strstr(aline,";")) ali@40: { ali@40: i=(int)(strstr(aline,";")-strstr(aline,"&")+1); ali@40: for (s=strstr(aline,"&");s0) ali@40: { ali@40: strncpy(wrk,strstr(aline,"&"),i); ali@40: wrk[i]=0; ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",aline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@40: printf(" Line %ld column %d - HTML symbol? %s \n", ali@40: linecnt,(int)(strstr(aline,"&")-aline)+1,wrk); ali@0: else ali@0: cnt_html++; ali@40: } ali@40: } ali@40: /* ali@40: * At end of paragraph, check for mismatched quotes. ali@40: * We don't want to report an error immediately, since it is a ali@40: * common convention to omit the quotes at end of paragraph if ali@40: * the next paragraph is a continuation of the same speaker. ali@40: * Where this is the case, the next para should begin with a ali@40: * quote, so we store the warning message and only display it ali@40: * at the top of the next iteration if the new para doesn't ali@40: * start with a quote. ali@40: * The -p switch overrides this default, and warns of unclosed ali@40: * quotes on _every_ paragraph, whether the next begins with a ali@40: * quote or not. ali@40: */ ali@40: if (isemptyline) ali@40: { ali@40: /* end of para - add up the totals */ ali@43: if (counters.quot%2) ali@40: sprintf(dquote_err," Line %ld - Mismatched quotes\n", ali@40: linecnt); ali@43: if (pswit[SQUOTE_SWITCH] && counters.open_single_quote && ali@43: counters.open_single_quote!=counters.close_single_quote) ali@40: sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", ali@40: linecnt); ali@43: if (pswit[SQUOTE_SWITCH] && counters.open_single_quote && ali@43: counters.open_single_quote!=counters.close_single_quote && ali@43: counters.open_single_quote!=counters.close_single_quote+1) ali@40: /* ali@40: * Flag it to be noted regardless of the ali@40: * first char of the next para. ali@40: */ ali@40: squot=1; ali@43: if (counters.r_brack) ali@40: sprintf(rbrack_err," Line %ld - " ali@40: "Mismatched round brackets?\n",linecnt); ali@43: if (counters.s_brack) ali@40: sprintf(sbrack_err," Line %ld - " ali@40: "Mismatched square brackets?\n",linecnt); ali@43: if (counters.c_brack) ali@40: sprintf(cbrack_err," Line %ld - " ali@40: "Mismatched curly brackets?\n",linecnt); ali@43: if (counters.c_unders%2) ali@40: sprintf(unders_err," Line %ld - Mismatched underscores?\n", ali@40: linecnt); ali@43: memset(&counters,0,sizeof(counters)); ali@40: /* let the next iteration know that it's starting a new para */ ali@40: isnewpara=1; ali@40: } ali@40: /* ali@40: * Check for omitted punctuation at end of paragraph by working back ali@40: * through prevline. DW. ali@40: * Need to check this only for "normal" paras. ali@40: * So what is a "normal" para? ali@40: * Not normal if one-liner (chapter headings, etc.) ali@40: * Not normal if doesn't contain at least one locase letter ali@40: * Not normal if starts with space ali@40: */ ali@40: if (isemptyline) ali@40: { ali@40: /* end of para */ ali@40: for (s=prevline,i=0;*s && !i;s++) ali@0: if (gcisletter(*s)) ali@40: /* use i to indicate the presence of a letter on the line */ ali@40: i=1; ali@40: /* ali@40: * This next "if" is a problem. ali@40: * If we say "start_para_line <= linecnt - 1", that includes ali@40: * one-line "paragraphs" like chapter heads. Lotsa false positives. ali@40: * If we say "start_para_line < linecnt - 1" it doesn't, but then it ali@40: * misses genuine one-line paragraphs. ali@40: */ ali@45: if (i && last.blen>2 && start_para_lineCHAR_SPACE) ali@40: { ali@40: for (i=strlen(prevline)-1; ali@40: (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) && ali@40: prevline[i]>CHAR_SPACE && i>0; ali@40: i--) ali@40: ; ali@40: for (;i>0;i--) ali@40: { ali@40: if (gcisalpha(prevline[i])) ali@40: { ali@40: if (pswit[ECHO_SWITCH]) ali@40: printf("\n%s\n",prevline); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@40: printf(" Line %ld column %d - " ali@40: "No punctuation at para end?\n", ali@40: linecnt-1,strlen(prevline)); ali@0: else ali@0: cnt_punct++; ali@0: break; ali@40: } ali@40: if (strchr("-.:!([{?}])",prevline[i])) ali@0: break; ali@40: } ali@40: } ali@40: } ali@40: strcpy(prevline,aline); ali@0: } ali@40: fclose(infile); ali@0: if (!pswit[OVERVIEW_SWITCH]) ali@40: for (i=0;i='A' && *theline<='Z') ali@40: *theline+=32; ali@0: } ali@0: ali@40: /* ali@40: * isroman: ali@40: * ali@40: * Is this word a Roman Numeral? ali@40: * ali@40: * It doesn't actually validate that the number is a valid Roman Numeral--for ali@40: * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not ali@40: * what we're here to do. If it passes this, it LOOKS like a Roman numeral. ali@40: * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or ali@40: * expressions thereof, except when it came to taxes. Allow any number of M, ali@40: * an optional D, an optional CM or CD, any number of optional Cs, an optional ali@40: * XL or an optional XC, an optional IX or IV, an optional V and any number ali@40: * of optional Is. ali@40: */ ali@0: int isroman(char *t) ali@0: { ali@0: char *s; ali@40: if (!t || !*t) ali@40: return 0; ali@40: s=t; ali@40: while (*t=='m' && *t) ali@40: t++; ali@40: if (*t=='d') ali@40: t++; ali@40: if (*t=='c' && t[1]=='m') ali@40: t+=2; ali@40: if (*t=='c' && t[1]=='d') ali@40: t+=2; ali@40: while (*t=='c' && *t) ali@40: t++; ali@40: if (*t=='x' && t[1]=='l') ali@40: t+=2; ali@40: if (*t=='x' && t[1]=='c') ali@40: t+=2; ali@40: if (*t=='l') ali@40: t++; ali@40: while (*t=='x' && *t) ali@40: t++; ali@40: if (*t=='i' && t[1]=='x') ali@40: t+=2; ali@40: if (*t=='i' && t[1]=='v') ali@40: t+=2; ali@40: if (*t=='v') ali@40: t++; ali@40: while (*t=='i' && *t) ali@40: t++; ali@40: return !*t; ali@0: } ali@0: ali@40: /* ali@40: * gcisalpha: ali@40: * ali@40: * A version of isalpha() that is somewhat lenient on 8-bit texts. ali@40: * If we use the standard function, 8-bit accented characters break ali@40: * words, so that tete with accented characters appears to be two words, "t" ali@40: * and "t", with 8-bit characters between them. This causes over-reporting of ali@40: * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows) ali@40: * and ISO-8859-1 character sets, which are the most common PG 8-bit types. ali@40: */ ali@0: int gcisalpha(unsigned char c) ali@0: { ali@40: if (c>='a' && c<='z') ali@40: return 1; ali@40: if (c>='A' && c<='Z') ali@40: return 1; ali@40: if (c<140) ali@40: return 0; ali@40: if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254) ali@40: return 1; ali@40: if (c==140 || c==142 || c==156 || c==158 || c==159) ali@40: return 1; ali@40: return 0; ali@0: } ali@0: ali@40: /* ali@40: * gcisdigit: ali@40: * ali@40: * A version of isdigit() that doesn't get confused in 8-bit texts. ali@40: */ ali@0: int gcisdigit(unsigned char c) ali@0: { ali@40: return c>='0' && c<='9'; ali@0: } ali@0: ali@40: /* ali@40: * gcisletter: ali@40: * ali@40: * A version of isletter() that doesn't get confused in 8-bit texts. ali@40: * NB: this is ISO-8891-1-specific. ali@40: */ ali@0: int gcisletter(unsigned char c) ali@0: { ali@40: return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192; ali@0: } ali@0: ali@40: /* ali@40: * gcstrchr: ali@40: * ali@40: * Wraps strchr to return NULL if the character being searched for is zero. ali@40: */ ali@40: char *gcstrchr(char *s,char c) ali@0: { ali@40: if (!c) ali@40: return NULL; ali@40: return strchr(s,c); ali@0: } ali@0: ali@40: /* ali@40: * postprocess_for_DP: ali@40: * ali@40: * Invoked with the -d switch from flgets(). ali@40: * It simply "removes" from the line a hard-coded set of common ali@40: * DP-specific tags, so that the line passed to the main routine has ali@40: * been pre-cleaned of DP markup. ali@40: */ ali@0: void postprocess_for_DP(char *theline) ali@0: { ali@40: char *s,*t; ali@0: int i; ali@0: if (!*theline) ali@0: return; ali@40: for (i=0;*DPmarkup[i];i++) ali@40: { ali@40: s=strstr(theline,DPmarkup[i]); ali@40: while (s) ali@40: { ali@40: t=s+strlen(DPmarkup[i]); ali@40: while (*t) ali@40: { ali@40: *s=*t; ali@40: t++; ali@40: s++; ali@40: } ali@40: *s=0; ali@40: s=strstr(theline,DPmarkup[i]); ali@40: } ali@40: } ali@0: } ali@0: ali@40: /* ali@40: * postprocess_for_HTML: ali@40: * ali@40: * Invoked with the -m switch from flgets(). ali@40: * It simply "removes" from the line a hard-coded set of common ali@40: * HTML tags and "replaces" a hard-coded set of common HTML ali@40: * entities, so that the line passed to the main routine has ali@40: * been pre-cleaned of HTML. ali@40: */ ali@0: void postprocess_for_HTML(char *theline) ali@0: { ali@40: if (strstr(theline,"<") && strstr(theline,">")) ali@0: while (losemarkup(theline)) ali@0: ; ali@0: while (loseentities(theline)) ali@0: ; ali@0: } ali@0: ali@0: char *losemarkup(char *theline) ali@0: { ali@40: char *s,*t; ali@0: int i; ali@0: if (!*theline) ali@40: return NULL; ali@40: s=strstr(theline,"<"); ali@40: t=strstr(theline,">"); ali@40: if (!s || !t) ali@40: return NULL; ali@40: for (i=0;*markup[i];i++) ali@40: if (!tagcomp(s+1,markup[i])) ali@40: { ali@40: if (!t[1]) ali@40: { ali@40: *s=0; ali@40: return s; ali@40: } ali@40: else if (t>s) ali@40: { ali@40: strcpy(s,t+1); ali@40: return s; ali@40: } ali@0: } ali@40: /* It's an unrecognized . */ ali@40: return NULL; ali@0: } ali@0: ali@0: char *loseentities(char *theline) ali@0: { ali@0: int i; ali@40: char *s,*t; ali@0: if (!*theline) ali@40: return NULL; ali@40: for (i=0;*entities[i].htmlent;i++) ali@40: { ali@40: s=strstr(theline,entities[i].htmlent); ali@40: if (s) ali@40: { ali@40: t=malloc((size_t)strlen(s)); ali@40: if (!t) ali@40: return NULL; ali@40: strcpy(t,s+strlen(entities[i].htmlent)); ali@40: strcpy(s,entities[i].textent); ali@40: strcat(s,t); ali@0: free(t); ali@40: return theline; ali@40: } ali@40: } ali@40: for (i=0;*entities[i].htmlnum;i++) ali@40: { ali@40: s=strstr(theline,entities[i].htmlnum); ali@40: if (s) ali@40: { ali@40: t=malloc((size_t)strlen(s)); ali@40: if (!t) ali@40: return NULL; ali@40: strcpy(t,s+strlen(entities[i].htmlnum)); ali@40: strcpy(s,entities[i].textent); ali@40: strcat(s,t); ali@0: free(t); ali@40: return theline; ali@40: } ali@40: } ali@40: return NULL; ali@0: } ali@0: ali@40: int tagcomp(char *strin,char *basetag) ali@0: { ali@40: char *s,*t; ali@40: s=basetag; ali@40: t=strin; ali@40: if (*t=='/') ali@40: t++; /* ignore a slash */ ali@40: while (*s && *t) ali@40: { ali@40: if (tolower(*s)!=tolower(*t)) ali@40: return 1; ali@40: s++; ali@40: t++; ali@40: } ali@40: return 0; ali@0: } ali@0: ali@40: void proghelp() ali@0: { ali@40: fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr); ali@40: fputs("Copyright 2000-2005 Jim Tinsley .\n",stderr); ali@40: fputs("Copyright 2012- J. Ali Harlow .\n",stderr); ali@40: fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. " ali@40: "For details, read the file COPYING.\n",stderr); ali@40: fputs("This is Free Software; " ali@40: "you may redistribute it under certain conditions (GPL);\n",stderr); ali@40: fputs("read the file COPYING for details.\n\n",stderr); ali@40: fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr); ali@40: fputs(" where -s checks single quotes, -e suppresses echoing lines, " ali@40: "-t checks typos\n",stderr); ali@40: fputs(" -x (paranoid) switches OFF -t and extra checks, " ali@40: "-l turns OFF line-end checks\n",stderr); ali@40: fputs(" -o just displays overview without detail, " ali@40: "-h echoes header fields\n",stderr); ali@40: fputs(" -v (verbose) unsuppresses duplicate reporting, " ali@40: "-m suppresses markup\n",stderr); ali@0: fputs(" -d ignores DP-specific markup,\n",stderr); ali@40: fputs(" -u uses a file gutcheck.typ to query user-defined " ali@40: "possible typos\n",stderr); ali@40: fputs("Sample usage: bookloupe warpeace.txt \n",stderr); ali@0: fputs("\n",stderr); ali@40: fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n", ali@40: stderr); ali@40: fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; " ali@40: "non-ASCII\n",stderr); ali@40: fputs("characters like accented letters, " ali@40: "lines longer than 75 or shorter than 55,\n",stderr); ali@40: fputs("unbalanced quotes or brackets, " ali@40: "a variety of badly formatted punctuation, \n",stderr); ali@40: fputs("HTML tags, some likely typos. " ali@40: "It is NOT a substitute for human judgement.\n",stderr); ali@0: fputs("\n",stderr); ali@0: }