ali@0: /*************************************************************************/
ali@40: /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
ali@0: /*                                                                       */
ali@0: /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */
ali@40: /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */
ali@0: /*                                                                       */
ali@0: /* This program is free software; you can redistribute it and/or modify  */
ali@0: /* it under the terms of the GNU General Public License as published by  */
ali@0: /* the Free Software Foundation; either version 2 of the License, or     */
ali@0: /* (at your option) any later version.                                   */
ali@0: /*                                                                       */
ali@0: /* This program is distributed in the hope that it will be useful,       */
ali@0: /* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
ali@40: /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
ali@0: /* GNU General Public License for more details.                          */
ali@0: /*                                                                       */
ali@0: /* You should have received a copy of the GNU General Public License     */
ali@40: /* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
ali@0: /*************************************************************************/
ali@0: 
ali@0: #include <stdio.h>
ali@0: #include <stdlib.h>
ali@0: #include <string.h>
ali@0: #include <ctype.h>
ali@0: 
ali@0: #define MAXWORDLEN    80    /* max length of one word             */
ali@0: #define LINEBUFSIZE 2048    /* buffer size for an input line      */
ali@0: 
ali@0: #define MAX_USER_TYPOS 1000 /* number of slots in usertypo[]      */
ali@0: #define USERTYPO_FILE "gutcheck.typ" /* name of the user-defined typo file */
ali@0: 
ali@0: #ifndef MAX_PATH            /* only defined here if nothing has defined it already */
ali@0: #define MAX_PATH 16384
ali@0: #endif
ali@0: 
ali@0: char aline[LINEBUFSIZE];    /* the line most recently read from the input */
ali@0: char prevline[LINEBUFSIZE]; /* NOTE(review): presumably the line before aline; not used in the visible code */
ali@40: /*
ali@40:  * Common typos (and OCR misreadings), all in lower case.
ali@40:  * The list is terminated by an empty string.
ali@40:  */
ali@40: char *typo[] = {
ali@40:     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
ali@40:     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
ali@40:     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
ali@40:     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
ali@40:     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
ali@40:     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
ali@40:     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
ali@40:     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
ali@40:     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
ali@40:     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
ali@40:     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
ali@40:     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
ali@40:     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
ali@40:     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
ali@40:     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
ali@40:     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
ali@40:     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
ali@40:     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
ali@40:     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
ali@40:     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
ali@40:     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
ali@40:     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
ali@40:     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
ali@40:     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
ali@40:     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
ali@40:     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
ali@40:     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
ali@40:     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
ali@40:     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
ali@40:     "se", ""
ali@40: };
ali@0: 
ali@0: char *usertypo[MAX_USER_TYPOS]; /* user-defined typos; main() stores malloc'd copies in the first usertypo_count slots */
ali@0: 
ali@40: /*
ali@40:  * Common abbreviations and other OK words not to query as typos.
ali@40:  * Lower case; the list is terminated by an empty string.
ali@40:  * NOTE(review): "mssrs" may have been meant as "messrs" - confirm.
ali@40:  */
ali@40: char *okword[] = {
ali@40:     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
ali@40:     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
ali@40:     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
ali@40:     "outbid", "outbids", "frostbite", "frostbitten", ""
ali@40: };
ali@0: 
ali@40: /*
ali@40:  * Common abbreviations that cause otherwise unexplained periods.
ali@40:  * Lower case; the list is terminated by an empty string.
ali@40:  */
ali@40: char *abbrev[] = {
ali@40:     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
ali@40:     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
ali@40: };
ali@0: 
ali@40: /*
ali@40:  * Two-Letter combinations that rarely if ever start words,
ali@40:  * but are common scannos or otherwise common letter combinations.
ali@40:  * The list is terminated by an empty string.
ali@40:  */
ali@40: char *nostart[] = {
ali@40:     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
ali@40: };
ali@0: 
ali@40: /*
ali@40:  * Two-Letter combinations that rarely if ever end words,
ali@40:  * but are common scannos or otherwise common letter combinations.
ali@40:  * The list is terminated by an empty string.
ali@40:  */
ali@40: char *noend[] = {
ali@40:     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
ali@40:     "sw", "gr", "sl", "cl", "iy", ""
ali@40: };
ali@0: 
ali@40: char *markup[] = {      /* HTML element names, lower case, without the
ali@40:                          * angle brackets; terminated by an empty string.
ali@40:                          * NOTE(review): presumably consulted by the
ali@40:                          * markup-handling helpers declared further down -
ali@40:                          * their bodies are outside this view. */
ali@40:     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
ali@40:     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
ali@40:     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
ali@40:     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
ali@40: };
ali@0: 
ali@40: char *DPmarkup[] = {    /* DP-specific markup tokens (cf. the D switch);
ali@40:                          * terminated by an empty string.
ali@40:                          * NOTE(review): DP presumably stands for
ali@40:                          * Distributed Proofreaders - confirm. */
ali@40:     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
ali@40: };
ali@0: 
ali@40: char *nocomma[] = {     /* lower-case words, terminated by an empty string.
ali@40:                          * NOTE(review): going by the name, words that
ali@40:                          * should not be followed by a comma; the code that
ali@40:                          * reads this list is outside this view. "mrs" is
ali@40:                          * listed twice. */
ali@40:     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
ali@40:     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
ali@40:     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
ali@40:     "during", "let", "toward", "among", ""
ali@40: };
ali@0: 
ali@40: char *noperiod[] = {    /* lower-case words, terminated by an empty string.
ali@40:                          * NOTE(review): going by the name, words that
ali@40:                          * should not be followed by a period; the code
ali@40:                          * that reads this list is outside this view. */
ali@40:     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
ali@40:     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
ali@40:     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
ali@40:     "among", "those", "into", "whom", "having", "thence", ""
ali@40: }; 
ali@0: 
ali@40: char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* lower-case vowels incl. accented forms; NOTE(review): the bytes stored depend on this file's encoding - confirm it matches the input text */
ali@0: 
ali@0: struct {                /* One row per HTML entity. The initialiser below
ali@0:                          * is flat (no inner braces), so every three
ali@0:                          * strings fill one row in member order. "&deg;"
ali@0:                          * and "&pound;" each appear twice with the same
ali@0:                          * replacement text. */
ali@0:     char *htmlent;      /* named form, e.g. "&amp;" */
ali@0:     char *htmlnum;      /* numeric form, e.g. "&#38;" */
ali@0:     char *textent;      /* plain-text replacement, e.g. "&" */
ali@40: } entities[] = {
ali@40:     "&amp;",	"&#38;",     "&", 
ali@40:     "&lt;",	"&#60;",     "<",
ali@40:     "&gt;",	"&#62;",     ">",
ali@40:     "&deg;",	"&#176;",    " degrees",
ali@40:     "&pound;",	"&#163;",    "L",
ali@40:     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */
ali@40:     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */
ali@40:     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */
ali@40:     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */
ali@40:     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */
ali@40:     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */
ali@40:     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */
ali@40:     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */
ali@40:     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */
ali@40:     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */
ali@40:     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */
ali@40:     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */
ali@40:     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */
ali@40:     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */
ali@40:     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */
ali@40:     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */
ali@40:     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */
ali@40:     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */
ali@40:     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */
ali@40:     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */
ali@40:     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */
ali@40:     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */
ali@40:     "&cent;",	"&#162;",    "c", /* cent sign */
ali@40:     "&pound;",	"&#163;",    "L", /* pound sign */
ali@40:     "&curren;",	"&#164;",    "$", /* currency sign */
ali@40:     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */
ali@40:     "&sect;",	"&#167;",    "--", /* section sign */
ali@40:     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */
ali@40:     "&copy;",	"&#169;",    "(C) ", /* copyright sign */
ali@40:     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */
ali@40:     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */
ali@40:     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */
ali@40:     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */
ali@40:     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */
ali@40:     "&deg;",	"&#176;",    " degrees", /* degree sign */
ali@40:     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */
ali@40:     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */
ali@40:     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */
ali@40:     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */
ali@40:     "&micro;",	"&#181;",    "m", /* micro sign */
ali@40:     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */
ali@40:     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */
ali@40:     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */
ali@40:     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */
ali@40:     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */
ali@40:     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */
ali@40:     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */
ali@40:     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */
ali@40:     "&iquest;",	"&#191;",    "?", /* inverted question mark */
ali@40:     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */
ali@40:     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */
ali@40:     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */
ali@40:     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */
ali@40:     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */
ali@40:     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */
ali@40:     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */
ali@40:     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */
ali@40:     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */
ali@40:     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */
ali@40:     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */
ali@40:     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */
ali@40:     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */
ali@40:     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */
ali@40:     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */
ali@40:     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */
ali@40:     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */
ali@40:     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */
ali@40:     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */
ali@40:     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */
ali@40:     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */
ali@40:     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */
ali@40:     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */
ali@40:     "&times;",	"&#215;",    "*", /* multiplication sign */
ali@40:     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */
ali@40:     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */
ali@40:     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */
ali@40:     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */
ali@40:     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */
ali@40:     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */
ali@40:     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */
ali@40:     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */
ali@40:     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */
ali@40:     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */
ali@40:     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */
ali@40:     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */
ali@40:     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */
ali@40:     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */
ali@40:     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */
ali@40:     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */
ali@40:     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */
ali@40:     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */
ali@40:     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */
ali@40:     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */
ali@40:     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */
ali@40:     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */
ali@40:     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */
ali@40:     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */
ali@40:     "&eth;",	"&#240;",    "eth", /* latin small letter eth */
ali@40:     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */
ali@40:     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */
ali@40:     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */
ali@40:     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */
ali@40:     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */
ali@40:     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */
ali@40:     "&divide;",	"&#247;",    "/", /* division sign */
ali@40:     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */
ali@40:     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */
ali@40:     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */
ali@40:     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */
ali@40:     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */
ali@40:     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */
ali@40:     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */
ali@40:     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */
ali@40:     "", ""      /* terminator row: empty htmlent and htmlnum; textent is not given and so is a null pointer */
ali@40: };
ali@40: 
ali@40: /* special characters (the numeric values are the ASCII codes) */
ali@0: #define CHAR_SPACE        32
ali@0: #define CHAR_TAB           9
ali@0: #define CHAR_LF           10
ali@0: #define CHAR_CR           13
ali@0: #define CHAR_DQUOTE       34
ali@0: #define CHAR_SQUOTE       39
ali@0: #define CHAR_OPEN_SQUOTE  96
ali@0: #define CHAR_TILDE       126
ali@0: #define CHAR_ASTERISK     42
ali@0: #define CHAR_FORESLASH    47
ali@0: #define CHAR_CARAT        94  /* the caret, '^' */
ali@0: 
ali@0: #define CHAR_UNDERSCORE    '_'
ali@0: #define CHAR_OPEN_CBRACK   '{'
ali@0: #define CHAR_CLOSE_CBRACK  '}'
ali@0: #define CHAR_OPEN_RBRACK   '('
ali@0: #define CHAR_CLOSE_RBRACK  ')'
ali@0: #define CHAR_OPEN_SBRACK   '['
ali@0: #define CHAR_CLOSE_SBRACK  ']'
ali@0: 
ali@40: /* longest and shortest normal PG line lengths, in characters */
ali@0: #define LONGEST_PG_LINE   75
ali@0: #define WAY_TOO_LONG      80
ali@0: #define SHORTEST_PG_LINE  55
ali@0: 
ali@0: #define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */
ali@0:                                   /*     D - ignore DP-specific markup     */
ali@0:                                   /*     E - echo queried line             */
ali@0:                                   /*     S - check single quotes           */
ali@0:                                   /*     T - check common typos            */
ali@0:                                   /*     P - require closure of quotes on  */
ali@0:                                   /*         every paragraph               */
ali@0:                                   /*     X - "Trust no one" :-) Paranoid!  */
ali@0:                                   /*         Queries everything            */
ali@0:                                   /*     L - line end checking defaults on */
ali@0:                                   /*         -L turns it off               */
ali@0:                                   /*     O - overview. Just shows counts.  */
ali@0:                                   /*     Y - puts errors to stdout         */
ali@0:                                   /*         instead of stderr             */
ali@0:                                   /*     H - Echoes header fields          */
ali@0:                                   /*     M - Ignore markup in < >          */
ali@0:                                   /*     U - Use file of User-defined Typos*/
ali@0:                                   /*     W - Defaults for use on Web upload*/
ali@0:                                   /*     V - Verbose - list EVERYTHING!    */
ali@0: #define SWITNO 14                 /* max number of switch parms            */
ali@0:                                   /*        - used for defining array-size */
ali@0:                                   /*        - must be at least the number  */
ali@0:                                   /*          of letters in SWITCHES       */
ali@0: #define MINARGS   1               /* minimum no of args excl switches      */
ali@0: #define MAXARGS   1               /* maximum no of args excl switches      */
ali@0: 
ali@0: int pswit[SWITNO];                /* program switches set by SWITCHES      */
ali@0: 
ali@0: #define ECHO_SWITCH      0        /* Indices into pswit[]. main() sets     */
ali@0: #define SQUOTE_SWITCH    1        /* pswit[i] when it sees the letter      */
ali@0: #define TYPO_SWITCH      2        /* SWITCHES[i], so each value here must  */
ali@0: #define QPARA_SWITCH     3        /* equal the position of its letter in   */
ali@0: #define PARANOID_SWITCH  4        /* the SWITCHES string.                  */
ali@0: #define LINE_END_SWITCH  5
ali@0: #define OVERVIEW_SWITCH  6
ali@0: #define STDOUT_SWITCH    7
ali@0: #define HEADER_SWITCH    8
ali@0: #define WEB_SWITCH       9
ali@0: #define VERBOSE_SWITCH   10
ali@0: #define MARKUP_SWITCH    11
ali@0: #define USERTYPO_SWITCH  12
ali@0: #define DP_SWITCH        13
ali@0: 
ali@0: long cnt_dquot;       /* for overview mode, count of doublequote queries */
ali@0: long cnt_squot;       /* for overview mode, count of singlequote queries */
ali@0: long cnt_brack;       /* for overview mode, count of brackets queries */
ali@0: long cnt_bin;         /* for overview mode, count of non-ASCII queries */
ali@0: long cnt_odd;         /* for overview mode, count of odd character queries */
ali@0: long cnt_long;        /* for overview mode, count of long line errors */
ali@0: long cnt_short;       /* for overview mode, count of short line queries */
ali@0: long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */
ali@0: long cnt_dash;        /* for overview mode, count of dash-related queries */
ali@0: long cnt_word;        /* for overview mode, count of word queries */
ali@0: long cnt_html;        /* for overview mode, count of html queries */
ali@0: long cnt_lineend;     /* for overview mode, count of line-end queries */
ali@40: long cnt_spacend;     /* count of lines with space at end; not part of the overview's TOTAL QUERIES */
ali@0: long linecnt;         /* count of total lines in the file */
ali@40: long checked_linecnt; /* count of lines actually checked */
ali@0: 
ali@0: void proghelp(void);    /* called by main() when the argument count is wrong */
ali@0: void procfile(char *);  /* called by main() with the file-name argument */
ali@0: 
ali@0: #define LOW_THRESHOLD    0
ali@0: #define HIGH_THRESHOLD   1
ali@0: 
ali@0: #define START 0
ali@0: #define END 1
ali@0: #define PREV 0
ali@0: #define NEXT 1
ali@0: #define FIRST_OF_PAIR 0
ali@0: #define SECOND_OF_PAIR 1
ali@0: 
ali@0: #define MAX_WORDPAIR 1000
ali@0: 
ali@0: char running_from[MAX_PATH];    /* main() reduces argv[0] to its directory part (with trailing separator) here */
ali@0: 
ali@0: int mixdigit(char *);
ali@54: const char *getaword(const char *,char *);
ali@40: int matchword(char *,char *);
ali@40: char *flgets(char *,int,FILE *,long);   /* line reader used by main() for the user typo file */
ali@0: void lowerit(char *);
ali@0: int gcisalpha(unsigned char);
ali@0: int gcisdigit(unsigned char);
ali@0: int gcisletter(unsigned char);
ali@40: char *gcstrchr(char *s,char c);
ali@0: void postprocess_for_HTML(char *);
ali@0: char *linehasmarkup(char *);
ali@0: char *losemarkup(char *);
ali@40: int tagcomp(char *,char *);
ali@0: char *loseentities(char *);
ali@0: int isroman(char *);
ali@0: int usertypo_count;     /* number of entries main() has stored in usertypo[] */
ali@0: void postprocess_for_DP(char *);
ali@0: 
ali@0: char wrk[LINEBUFSIZE];  /* NOTE(review): presumably a scratch line buffer; not used in the visible code */
ali@0: 
ali@40: #define MAX_QWORD 50
ali@40: #define MAX_QWORD_LENGTH 40
ali@0: char qword[MAX_QWORD][MAX_QWORD_LENGTH];    /* NOTE(review): purpose not visible in this part of the file */
ali@0: signed int dupcnt[MAX_QWORD];               /* one counter per qword[] row - TODO confirm what it counts */
ali@40: /*
ali@40:  * main:
ali@40:  *
ali@40:  * Parse the command-line switches into pswit[], optionally load the
ali@40:  * user-defined typo list (-u) into usertypo[], run procfile() on the
ali@40:  * single file argument and, in overview mode (-o), print the counts
ali@40:  * of queries found.
ali@40:  *
ali@40:  * Returns: 0 normally; 1 (after calling proghelp()) when the number of
ali@40:  * non-switch arguments is not exactly one. The program exits with
ali@40:  * status 1 if memory for a user typo cannot be allocated.
ali@40:  */
ali@40: int main(int argc,char **argv)
ali@0: {
ali@40:     char *argsw,*s;
ali@40:     int i,switno,invarg;
ali@0:     char usertypo_file[MAX_PATH];
ali@0:     FILE *usertypofile;
ali@40:     /* save the path to the executable, if we were given one that fits */
ali@40:     if (argc>0 && argv[0] && strlen(argv[0])<sizeof(running_from))
ali@40:         strcpy(running_from,argv[0]);
ali@40:     /*
ali@40:      * Find out what directory we're running from: wipe everything
ali@40:      * after the last path separator. The bounds test comes first so
ali@40:      * that, when there is no separator at all, we stop at the start
ali@40:      * of the buffer instead of looking at the byte in front of it.
ali@40:      */
ali@40:     s=running_from+strlen(running_from);
ali@40:     while (s>running_from && s[-1]!='/' && s[-1]!='\\')
ali@40:         *--s=0;
ali@40:     switno=strlen(SWITCHES);
ali@40:     for (i=switno;--i>=0;)
ali@40:         pswit[i]=0;           /* initialise switches, pswit[0] included */
ali@40:     /*
ali@40:      * Standard loop to extract switches.
ali@40:      * When we come out of this loop, the arguments will be
ali@40:      * in argv[0] upwards and the switches used will be
ali@40:      * represented by their equivalent elements in pswit[].
ali@40:      * Letters that are not in SWITCHES are silently ignored.
ali@40:      */
ali@40:     while (--argc>0 && **++argv=='-')
ali@40:         for (argsw=argv[0]+1;*argsw!='\0';argsw++)
ali@40:             for (i=switno,invarg=1;(--i>=0) && invarg==1;)
ali@40:                 /* toupper() needs a value in unsigned char range */
ali@40:                 if (toupper((unsigned char)*argsw)==SWITCHES[i])
ali@40:                 {
ali@40:                     invarg=0;
ali@40:                     pswit[i]=1;
ali@40:                 }
ali@40:     /* Paranoid checking is turned OFF, not on, by its switch */
ali@40:     pswit[PARANOID_SWITCH]^=1;
ali@40:     if (pswit[PARANOID_SWITCH])
ali@40:         /*
ali@40:          * In paranoid mode the typo switch is inverted: typo checks
ali@40:          * are on unless -t was given.
ali@40:          */
ali@40:         pswit[TYPO_SWITCH]^=1;
ali@40:     /* Line-end checking is turned OFF, not on, by its switch */
ali@40:     pswit[LINE_END_SWITCH]^=1;
ali@40:     /* Echoing is turned OFF, not on, by its switch */
ali@40:     pswit[ECHO_SWITCH]^=1;
ali@40:     if (pswit[OVERVIEW_SWITCH])
ali@40:         /* just print summary; don't echo */
ali@40:         pswit[ECHO_SWITCH]=0;
ali@40:     /*
ali@40:      * Web uploads - for the moment, this is really just a placeholder
ali@40:      * until we decide what processing we really want to do on web uploads
ali@40:      */
ali@40:     if (pswit[WEB_SWITCH])
ali@40:     {
ali@40:         /* specific override for web uploads */
ali@40:         pswit[ECHO_SWITCH]=1;
ali@40:         pswit[SQUOTE_SWITCH]=0;
ali@40:         pswit[TYPO_SWITCH]=1;
ali@40:         pswit[QPARA_SWITCH]=0;
ali@40:         pswit[PARANOID_SWITCH]=1;
ali@40:         pswit[LINE_END_SWITCH]=0;
ali@40:         pswit[OVERVIEW_SWITCH]=0;
ali@40:         pswit[STDOUT_SWITCH]=0;
ali@40:         pswit[HEADER_SWITCH]=1;
ali@40:         pswit[VERBOSE_SWITCH]=0;
ali@40:         pswit[MARKUP_SWITCH]=0;
ali@40:         pswit[USERTYPO_SWITCH]=0;
ali@40:         pswit[DP_SWITCH]=0;
ali@40:     }
ali@40:     if (argc<MINARGS || argc>MAXARGS)
ali@40:     {
ali@40:         /* wrong number of args */
ali@0:         proghelp();
ali@40:         return 1;
ali@40:     }
ali@0:     /* read in the user-defined stealth scanno list */
ali@40:     if (pswit[USERTYPO_SWITCH])
ali@40:     {
ali@40:         /* ... we were told we had one! */
ali@40:         usertypofile=fopen(USERTYPO_FILE,"rb");
ali@40:         if (!usertypofile)
ali@40:         {
ali@40:             /*
ali@40:              * not in cwd. try executable directory - but only if
ali@40:              * directory plus file name fit in usertypo_file.
ali@40:              */
ali@40:             if (strlen(running_from)+strlen(USERTYPO_FILE)<
ali@40:               sizeof(usertypo_file))
ali@40:             {
ali@40:                 strcpy(usertypo_file,running_from);
ali@40:                 strcat(usertypo_file,USERTYPO_FILE);
ali@40:                 usertypofile=fopen(usertypo_file,"rb");
ali@40:             }
ali@40:             if (!usertypofile)
ali@40:                 /* we ain't got no user typo file! */
ali@40:                 printf("   --> I couldn't find gutcheck.typ "
ali@40:                   "-- proceeding without user typos.\n");
ali@40:         }
ali@40:         usertypo_count=0;
ali@40:         if (usertypofile)
ali@40:         {
ali@40:             /* we managed to open a User Typo File! */
ali@40:             while (flgets(aline,LINEBUFSIZE-1,usertypofile,
ali@40:               (long)usertypo_count))
ali@40:             {
ali@40:                 /*
ali@40:                  * keep only lines of two or more characters whose
ali@40:                  * first character comes after '!' (code 33)
ali@40:                  */
ali@40:                 if (strlen(aline)>1 && (int)*aline>33)
ali@40:                 {
ali@40:                     s=malloc(strlen(aline)+1);
ali@40:                     if (!s)
ali@40:                     {
ali@40:                         fprintf(stderr,"bookloupe: cannot get enough "
ali@40:                           "memory for user typo file!\n");
ali@0:                         exit(1);
ali@40:                     }
ali@40:                     strcpy(s,aline);
ali@40:                     usertypo[usertypo_count]=s;
ali@0:                     usertypo_count++;
ali@40:                     if (usertypo_count>=MAX_USER_TYPOS)
ali@40:                     {
ali@40:                         printf("   --> Only %d user-defined typos "
ali@42:                           "allowed: ignoring the rest\n",
ali@42:                           MAX_USER_TYPOS);
ali@0:                         break;
ali@40:                     }
ali@40:                 }
ali@40:             }
ali@0:             fclose(usertypofile);
ali@40:         }
ali@40:     }
ali@40:     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@40:     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
ali@40:     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
ali@40:     cnt_spacend=0;
ali@0:     procfile(argv[0]);
ali@40:     if (pswit[OVERVIEW_SWITCH])
ali@40:     {
ali@40:         printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40:           checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@40:         printf("    --------------- Queries found --------------\n");
ali@40:         /* categories with no queries are left out of the list */
ali@40:         if (cnt_long)
ali@40:             printf("    Long lines:                    %14ld\n",cnt_long);
ali@40:         if (cnt_short)
ali@40:             printf("    Short lines:                   %14ld\n",cnt_short);
ali@40:         if (cnt_lineend)
ali@40:             printf("    Line-end problems:             %14ld\n",cnt_lineend);
ali@40:         if (cnt_word)
ali@40:             printf("    Common typos:                  %14ld\n",cnt_word);
ali@40:         if (cnt_dquot)
ali@40:             printf("    Unmatched quotes:              %14ld\n",cnt_dquot);
ali@40:         if (cnt_squot)
ali@40:             printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);
ali@40:         if (cnt_brack)
ali@40:             printf("    Unmatched brackets:            %14ld\n",cnt_brack);
ali@40:         if (cnt_bin)
ali@40:             printf("    Non-ASCII characters:          %14ld\n",cnt_bin);
ali@40:         if (cnt_odd)
ali@40:             printf("    Proofing characters:           %14ld\n",cnt_odd);
ali@40:         if (cnt_punct)
ali@40:             printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@40:         if (cnt_dash)
ali@40:             printf("    Non-standard dashes:           %14ld\n",cnt_dash);
ali@40:         if (cnt_html)
ali@40:             printf("    Possible HTML tags:            %14ld\n",cnt_html);
ali@0:         printf("\n");
ali@40:         printf("    TOTAL QUERIES                  %14ld\n",
ali@40:           cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
ali@40:           cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40:     }
ali@40:     return 0;
ali@0: }
ali@0: 
ali@41: struct first_pass_results {     /* Tallies gathered by first_pass() over
ali@41:                                  * the whole input file. The comments
ali@41:                                  * below cover only the members whose
ali@41:                                  * updates are visible in the opening part
ali@41:                                  * of first_pass(); the rest are set
ali@41:                                  * further on - TODO document from there. */
ali@41:     long firstline,astline;     /* firstline: line after the header marker; astline: lines with '*' and a lower-case letter */
ali@41:     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;   /* footerline: line no. of the footer; totlen: sum of line lengths; binlen: bytes above 127; dotcomma: lines containing ".," */
ali@41:     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;   /* fslashline: lines containing '/'; longline/verylongline: lines longer than LONGEST_PG_LINE/WAY_TOO_LONG */
ali@41:     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
ali@41:     signed int Dutchcount,Frenchcount;
ali@41: };
ali@41: 
ali@40: /*
ali@41:  * first_pass:
ali@40:  *
ali@41:  * Run a first pass - verify that it's a valid PG
ali@41:  * file, decide whether to report some things that
ali@41:  * occur many times in the text like long or short
ali@41:  * lines, non-standard dashes, etc.
ali@40:  */
ali@41: struct first_pass_results *first_pass(FILE *infile)
ali@0: {
ali@54:     char laststart=CHAR_SPACE;
ali@54:     const char *s;
ali@41:     signed int i,llen;
ali@41:     unsigned int lastlen=0,lastblen=0;
ali@41:     long spline=0,nspline=0;
ali@41:     static struct first_pass_results results={0};
ali@41:     char inword[MAXWORDLEN]="";
ali@40:     while (fgets(aline,LINEBUFSIZE-1,infile))
ali@40:     {
ali@40:         while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
ali@40: 	    aline[strlen(aline)-1]=0;
ali@0:         linecnt++;
ali@40:         if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
ali@40: 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
ali@40: 	{
ali@0:             if (spline)
ali@0:                 printf("   --> Duplicate header?\n");
ali@40:             spline=linecnt+1;   /* first line of non-header text, that is */
ali@40: 	}
ali@40:         if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
ali@40: 	{
ali@0:             if (nspline)
ali@0:                 printf("   --> Duplicate header?\n");
ali@40:             nspline=linecnt+1;   /* first line of non-header text, that is */
ali@40: 	}
ali@40:         if (spline || nspline)
ali@40: 	{
ali@0:             lowerit(aline);
ali@40:             if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
ali@40: 	    {
ali@40:                 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
ali@40: 		{
ali@41:                     if (results.footerline)
ali@40: 		    {
ali@40: 			/* it's an old-form header - we can detect duplicates */
ali@40:                         if (!nspline)
ali@0:                             printf("   --> Duplicate footer?\n");
ali@40: 		    }
ali@40:                     else
ali@41:                         results.footerline=linecnt;
ali@40: 		}
ali@40: 	    }
ali@40: 	}
ali@40:         if (spline)
ali@41: 	    results.firstline=spline;
ali@40:         if (nspline)
ali@41: 	    results.firstline=nspline;  /* override with new */
ali@41:         if (results.footerline)
ali@40: 	    continue;    /* don't count the boilerplate in the footer */
ali@40:         llen=strlen(aline);
ali@41:         results.totlen+=llen;
ali@40:         for (i=0;i<llen;i++)
ali@40: 	{
ali@40:             if ((unsigned char)aline[i]>127)
ali@41: 		results.binlen++;
ali@40:             if (gcisalpha(aline[i]))
ali@41: 		results.alphalen++;
ali@40:             if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@41: 		results.endquote_count++;
ali@40: 	}
ali@40:         if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
ali@40: 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41: 	    results.shortline++;
ali@40:         if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
ali@40: 	    cnt_spacend++;
ali@40:         if (strstr(aline,".,"))
ali@41: 	    results.dotcomma++;
ali@40:         /* only count ast lines for ignoring purposes where there is */
ali@0:         /* locase text on the line */
ali@40:         if (strstr(aline,"*"))
ali@40: 	{
ali@40:             for (s=aline;*s;s++)
ali@40:                 if (*s>='a' && *s<='z')
ali@0:                     break;
ali@40:              if (*s)
ali@41: 		results.astline++;
ali@40: 	}
ali@40:         if (strstr(aline,"/"))
ali@41:             results.fslashline++;
ali@40:         for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@40: 	    ;
ali@40:         if (aline[i]=='-' && aline[i-1]!='-')
ali@41: 	    results.hyphens++;
ali@40:         if (llen>LONGEST_PG_LINE)
ali@41: 	    results.longline++;
ali@40:         if (llen>WAY_TOO_LONG)
ali@41: 	    results.verylongline++;
ali@40:         if (strstr(aline,"<") && strstr(aline,">"))
ali@40: 	{
ali@40:             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
ali@40:             if (i>0)
ali@41:                 results.htmcount++;
ali@40:             if (strstr(aline,"<i>"))
ali@41: 		results.htmcount+=4; /* bonus marks! */
ali@40: 	}
ali@0:         /* Check for spaced em-dashes */
ali@40:         if (strstr(aline,"--"))
ali@40: 	{
ali@41:             results.emdash++;
ali@40:             if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
ali@40:                (*(strstr(aline,"--")+2)==CHAR_SPACE))
ali@41: 		results.space_emdash++;
ali@40:             if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
ali@40:                (*(strstr(aline,"--")+2)==CHAR_SPACE))
ali@40: 		/* count of em-dashes with spaces both sides */
ali@41: 		results.non_PG_space_emdash++;
ali@40:             if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
ali@40:                (*(strstr(aline,"--")+2)!=CHAR_SPACE))
ali@40: 		/* count of PG-type em-dashes with no spaces */
ali@41: 		results.PG_space_emdash++;
ali@40: 	}
ali@40:         for (s=aline;*s;)
ali@40: 	{
ali@40:             s=getaword(s,inword);
ali@40:             if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
ali@41:                 results.Dutchcount++;
ali@40:             if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
ali@41:                 results.Frenchcount++;
ali@40:             if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
ali@41:                 results.standalone_digit++;
ali@40: 	}
ali@0:         /* Check for spaced dashes */
ali@40:         if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
ali@41: 	    results.spacedash++;
ali@40:         lastblen=lastlen;
ali@40:         lastlen=strlen(aline);
ali@40:         laststart=aline[0];
ali@40:     }
ali@41:     return &results;
ali@41: }
ali@41: 
/*
 * Reporting switches decided by report_first_pass().
 * For the per-category members, non-zero means "report this kind
 * of problem"; isDutch and isFrench are language guesses.
 */
struct warnings {
    signed int shortline;
    signed int longline;
    signed int bin;
    signed int dash;
    signed int dotcomma;
    signed int ast;
    signed int fslash;
    signed int digit;
    signed int hyphen;
    signed int endquote;
    signed int isDutch;
    signed int isFrench;
};
ali@42: 
ali@42: /*
ali@42:  * report_first_pass:
ali@42:  *
ali@42:  * Make some snap decisions based on the first pass results.
ali@42:  */
ali@42: struct warnings *report_first_pass(struct first_pass_results *results)
ali@42: {
ali@42:     static struct warnings warnings={0};
ali@42:     if (cnt_spacend>0)
ali@42:         printf("   --> %ld lines in this file have white space at end\n",
ali@42: 	  cnt_spacend);
ali@42:     warnings.dotcomma=1;
ali@42:     if (results->dotcomma>5)
ali@42:     {
ali@42:         warnings.dotcomma=0;
ali@42:         printf("   --> %ld lines in this file contain '.,'. "
ali@42: 	  "Not reporting them.\n",results->dotcomma);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 50 lines, or one-tenth, are short,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.shortline=1;
ali@42:     if (results->shortline>50 || results->shortline*10>linecnt)
ali@42:     {
ali@42:         warnings.shortline=0;
ali@42:         printf("   --> %ld lines in this file are short. "
ali@42: 	  "Not reporting short lines.\n",results->shortline);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 50 lines, or one-tenth, are long,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.longline=1;
ali@42:     if (results->longline>50 || results->longline*10>linecnt)
ali@42:     {
ali@42:         warnings.longline=0;
ali@42:         printf("   --> %ld lines in this file are long. "
ali@42: 	  "Not reporting long lines.\n",results->longline);
ali@42:     }
ali@42:     /* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42:     warnings.ast=1;
ali@42:     if (results->astline>10)
ali@42:     {
ali@42:         warnings.ast=0;
ali@42:         printf("   --> %ld lines in this file contain asterisks. "
ali@42: 	  "Not reporting them.\n",results->astline);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 10 lines contain forward slashes,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.fslash=1;
ali@42:     if (results->fslashline>10)
ali@42:     {
ali@42:         warnings.fslash=0;
ali@42:         printf("   --> %ld lines in this file contain forward slashes. "
ali@42: 	  "Not reporting them.\n",results->fslashline);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 20 lines contain unpunctuated endquotes,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.endquote=1;
ali@42:     if (results->endquote_count>20)
ali@42:     {
ali@42:         warnings.endquote=0;
ali@42:         printf("   --> %ld lines in this file contain unpunctuated endquotes. "
ali@42: 	  "Not reporting them.\n",results->endquote_count);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 15 lines contain standalone digits,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.digit=1;
ali@42:     if (results->standalone_digit>10)
ali@42:     {
ali@42:         warnings.digit=0;
ali@42:         printf("   --> %ld lines in this file contain standalone 0s and 1s. "
ali@42: 	  "Not reporting them.\n",results->standalone_digit);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 20 lines contain hyphens at end,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.hyphen=1;
ali@42:     if (results->hyphens>20)
ali@42:     {
ali@42:         warnings.hyphen=0;
ali@42:         printf("   --> %ld lines in this file have hyphens at end. "
ali@42: 	  "Not reporting them.\n",results->hyphens);
ali@42:     }
ali@42:     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42:     {
ali@42:         printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@42:         pswit[MARKUP_SWITCH]=1;
ali@42:     }
ali@42:     if (results->verylongline>0)
ali@42:         printf("   --> %ld lines in this file are VERY long!\n",
ali@42: 	  results->verylongline);
ali@42:     /*
ali@42:      * If there are more non-PG spaced dashes than PG em-dashes,
ali@42:      * assume it's deliberate.
ali@42:      * Current PG guidelines say don't use them, but older texts do,
ali@42:      * and some people insist on them whatever the guidelines say.
ali@42:      */
ali@42:     warnings.dash=1;
ali@42:     if (results->spacedash+results->non_PG_space_emdash>
ali@42:       results->PG_space_emdash)
ali@42:     {
ali@42:         warnings.dash=0;
ali@42:         printf("   --> There are %ld spaced dashes and em-dashes. "
ali@42: 	  "Not reporting them.\n",
ali@42: 	  results->spacedash+results->non_PG_space_emdash);
ali@42:     }
ali@42:     /* If more than a quarter of characters are hi-bit, bug out. */
ali@42:     warnings.bin=1;
ali@42:     if (results->binlen*4>results->totlen)
ali@42:     {
ali@42:         printf("   --> This file does not appear to be ASCII. "
ali@42: 	  "Terminating. Best of luck with it!\n");
ali@42:         exit(1);
ali@42:     }
ali@42:     if (results->alphalen*4<results->totlen)
ali@42:     {
ali@42:         printf("   --> This file does not appear to be text. "
ali@42: 	  "Terminating. Best of luck with it!\n");
ali@42:         exit(1);
ali@42:     }
ali@42:     if (results->binlen*100>results->totlen || results->binlen>100)
ali@42:     {
ali@42:         printf("   --> There are a lot of foreign letters here. "
ali@42: 	  "Not reporting them.\n");
ali@42:         warnings.bin=0;
ali@42:     }
ali@42:     warnings.isDutch=0;
ali@42:     if (results->Dutchcount>50)
ali@42:     {
ali@42:         warnings.isDutch=1;
ali@42:         printf("   --> This looks like Dutch - "
ali@42: 	  "switching off dashes and warnings for 's Middags case.\n");
ali@42:     }
ali@42:     warnings.isFrench=0;
ali@42:     if (results->Frenchcount>50)
ali@42:     {
ali@42:         warnings.isFrench=1;
ali@42:         printf("   --> This looks like French - "
ali@42: 	  "switching off some doublepunct.\n");
ali@42:     }
ali@42:     if (results->firstline && results->footerline)
ali@42:         printf("    The PG header and footer appear to be already on.\n");
ali@42:     else
ali@42:     {
ali@42:         if (results->firstline)
ali@42:             printf("    The PG header is on - no footer.\n");
ali@42:         if (results->footerline)
ali@42:             printf("    The PG footer is on - no header.\n");
ali@42:     }
ali@42:     printf("\n");
ali@42:     if (pswit[VERBOSE_SWITCH])
ali@42:     {
ali@42:         warnings.bin=1;
ali@42:         warnings.shortline=1;
ali@42:         warnings.dotcomma=1;
ali@42:         warnings.longline=1;
ali@42:         warnings.dash=1;
ali@42:         warnings.digit=1;
ali@42:         warnings.ast=1;
ali@42:         warnings.fslash=1;
ali@42:         warnings.hyphen=1;
ali@42:         warnings.endquote=1;
ali@42:         printf("   *** Verbose output is ON -- you asked for it! ***\n");
ali@42:     }
ali@42:     if (warnings.isDutch)
ali@42:         warnings.dash=0;
ali@42:     if (results->footerline>0 && results->firstline>0 &&
ali@42:       results->footerline>results->firstline &&
ali@42:       results->footerline-results->firstline<100)
ali@42:     {
ali@42:         printf("   --> I don't really know where this text starts. \n");
ali@42:         printf("       There are no reference points.\n");
ali@42:         printf("       I'm going to have to report the header and footer "
ali@42: 	  "as well.\n");
ali@42:         results->firstline=0;
ali@42:     }
ali@42:     return &warnings;
ali@42: }
ali@42: 
/*
 * Running tallies of quotes, underscores and brackets,
 * updated by analyse_quotes() for each line it examines.
 */
struct counters {
    long quot;                      /* double quotes seen */
    signed int c_unders;            /* underscores seen */
    signed int c_brack;             /* open minus close curly brackets */
    signed int s_brack;             /* open minus close square brackets */
    signed int r_brack;             /* open minus close round brackets */
    signed int open_single_quote;
    signed int close_single_quote;
};
ali@43: 
ali@43: /*
ali@43:  * analyse_quotes:
ali@43:  *
ali@43:  * Look along the line, accumulate the count of quotes, and see
ali@43:  * if this is an empty line - i.e. a line with nothing on it
ali@43:  * but spaces.
ali@43:  * If line has just spaces, period, * and/or - on it, don't
ali@43:  * count it, since empty lines with asterisks or dashes to
ali@43:  * separate sections are common.
ali@43:  *
ali@43:  * Returns: Non-zero if the line is empty.
ali@43:  */
ali@43: int analyse_quotes(const char *s,struct counters *counters)
ali@43: {
ali@43:     signed int guessquote=0;
ali@43:     int isemptyline=1;    /* assume the line is empty until proven otherwise */
ali@43:     while (*s)
ali@43:     {
ali@43: 	if (*s==CHAR_DQUOTE)
ali@43: 	    counters->quot++;
ali@43: 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
ali@43: 	{
ali@43: 	    if (s==aline)
ali@43: 	    {
ali@43: 		/*
ali@43: 		 * At start of line, it can only be an openquote.
ali@43: 		 * Hardcode a very common exception!
ali@43: 		 */
ali@43: 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
ali@43: 		    counters->open_single_quote++;
ali@43: 	    }
ali@43: 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
ali@43: 		/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43: 		;
ali@43: 	    /* it's outside a word - let's check it out */
ali@43: 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
ali@43: 	    {
ali@43: 		/* it damwell better BE an openquote */
ali@43: 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
ali@43: 		    /* hardcode a very common exception! */
ali@43: 		    counters->open_single_quote++;
ali@43: 	    }
ali@43: 	    else
ali@43: 	    {
ali@43: 		/* now - is it a closequote? */
ali@43: 		guessquote=0;   /* accumulate clues */
ali@43: 		if (gcisalpha(s[-1]))
ali@43: 		{
ali@43: 		    /* it follows a letter - could be either */
ali@43: 		    guessquote++;
ali@43: 		    if (s[-1]=='s')
ali@43: 		    {
ali@43: 			/* looks like a plural apostrophe */
ali@43: 			guessquote-=3;
ali@43: 			if (s[1]==CHAR_SPACE)  /* bonus marks! */
ali@43: 			    guessquote-=2;
ali@43: 		    }
ali@43: 		}
ali@43: 		/* it doesn't have a letter either side */
ali@43: 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
ali@43: 		    guessquote+=8; /* looks like a closequote */
ali@43: 		else
ali@43: 		    guessquote++;
ali@43: 		if (counters->open_single_quote>counters->close_single_quote)
ali@43: 		    /*
ali@43: 		     * Give it the benefit of some doubt,
ali@43: 		     * if a squote is already open.
ali@43: 		     */
ali@43: 		    guessquote++;
ali@43: 		else
ali@43: 		    guessquote--;
ali@43: 		if (guessquote>=0)
ali@43: 		    counters->close_single_quote++;
ali@43: 	    }
ali@43: 	}
ali@43: 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
ali@43: 	  *s!=13 && *s!=10)
ali@43: 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */
ali@43: 	if (*s==CHAR_UNDERSCORE)
ali@43: 	    counters->c_unders++;
ali@43: 	if (*s==CHAR_OPEN_CBRACK)
ali@43: 	    counters->c_brack++;
ali@43: 	if (*s==CHAR_CLOSE_CBRACK)
ali@43: 	    counters->c_brack--;
ali@43: 	if (*s==CHAR_OPEN_RBRACK)
ali@43: 	    counters->r_brack++;
ali@43: 	if (*s==CHAR_CLOSE_RBRACK)
ali@43: 	    counters->r_brack--;
ali@43: 	if (*s==CHAR_OPEN_SBRACK)
ali@43: 	    counters->s_brack++;
ali@43: 	if (*s==CHAR_CLOSE_SBRACK)
ali@43: 	    counters->s_brack--;
ali@43: 	s++;
ali@43:     }
ali@43:     return isemptyline;
ali@43: }
ali@43: 
ali@41: /*
ali@44:  * check_for_odd_characters:
ali@44:  *
ali@44:  * Check for binary and other odd characters.
ali@44:  */
ali@44: void check_for_odd_characters(const char *aline,const struct warnings *warnings,
ali@44:   int isemptyline)
ali@44: {
ali@44:     /* Don't repeat multiple warnings on one line. */
ali@44:     signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
ali@44:     const char *s;
ali@44:     unsigned char c;
ali@44:     for (s=aline;*s;s++)
ali@44:     {
ali@44: 	c=*(unsigned char *)s;
ali@44: 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
ali@44: 	{
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@44: 		printf("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@44: 		if (c>127 && c<160)
ali@44: 		    printf("    Line %ld column %d - "
ali@44: 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
ali@44: 		else
ali@44: 		    printf("    Line %ld column %d - Non-ASCII character %d\n",
ali@44: 		      linecnt,(int)(s-aline)+1,c);
ali@44: 	    else
ali@44: 		cnt_bin++;
ali@44: 	    eNon_A=1;
ali@44: 	}
ali@44: 	if (!eTab && *s==CHAR_TAB)
ali@44: 	{
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@44: 		printf("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@44: 		printf("    Line %ld column %d - Tab character?\n",
ali@44: 		  linecnt,(int)(s-aline)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@44: 	    eTab=1;
ali@44: 	}
ali@44: 	if (!eTilde && *s==CHAR_TILDE)
ali@44: 	{
ali@44: 	    /*
ali@44: 	     * Often used by OCR software to indicate an
ali@44: 	     * unrecognizable character.
ali@44: 	     */
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@44: 		printf("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@44: 		printf("    Line %ld column %d - Tilde character?\n",
ali@44: 		  linecnt,(int)(s-aline)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@44: 	    eTilde=1;
ali@44: 	}
ali@44: 	if (!eCarat && *s==CHAR_CARAT)
ali@44: 	{  
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@44: 		printf("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@44: 		printf("    Line %ld column %d - Carat character?\n",
ali@44: 		  linecnt,(int)(s-aline)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@44: 	    eCarat=1;
ali@44: 	}
ali@44: 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
ali@44: 	{  
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@44: 		printf("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@44: 		printf("    Line %ld column %d - Forward slash?\n",
ali@44: 		  linecnt,(int)(s-aline)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@44: 	    eFSlash=1;
ali@44: 	}
ali@44: 	/*
ali@44: 	 * Report asterisks only in paranoid mode,
ali@44: 	 * since they're often deliberate.
ali@44: 	 */
ali@44: 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@44: 	  *s==CHAR_ASTERISK)
ali@44: 	{
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@44: 		printf("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@44: 		printf("    Line %ld column %d - Asterisk?\n",
ali@44: 		  linecnt,(int)(s-aline)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@44: 	    eAst=1;
ali@44: 	}
ali@44:     }
ali@44: }
ali@44: 
ali@44: /*
ali@45:  * check_for_long_line:
ali@45:  *
ali@45:  * Check for line too long.
ali@45:  */
ali@45: void check_for_long_line(const char *aline)
ali@45: {
ali@45:     if (strlen(aline)>LONGEST_PG_LINE)
ali@45:     {
ali@45: 	if (pswit[ECHO_SWITCH])
ali@45: 	    printf("\n%s\n",aline);
ali@45: 	if (!pswit[OVERVIEW_SWITCH])
ali@45: 	    printf("    Line %ld column %d - Long line %d\n",
ali@45: 	      linecnt,strlen(aline),strlen(aline));
ali@45: 	else
ali@45: 	    cnt_long++;
ali@45:     }
ali@45: }
ali@45: 
/*
 * What check_for_short_line() needs to know about earlier lines.
 */
struct line_properties {
    unsigned int len;   /* length of the last line examined */
    unsigned int blen;  /* length of the last line but one */
    char start;         /* first character of the last line */
};
ali@45: 
ali@45: /*
ali@45:  * check_for_short_line:
ali@45:  *
ali@45:  * Check for line too short.
ali@45:  *
ali@45:  * This one is a bit trickier to implement: we don't want to
ali@45:  * flag the last line of a paragraph for being short, so we
ali@45:  * have to wait until we know that our current line is a
ali@45:  * "normal" line, then report the _previous_ line if it was too
ali@45:  * short. We also don't want to report indented lines like
ali@45:  * chapter heads or formatted quotations. We therefore keep
ali@45:  * last->len as the length of the last line examined, and
ali@45:  * last->blen as the length of the last but one, and try to
ali@45:  * suppress unnecessary warnings by checking that both were of
ali@45:  * "normal" length. We keep the first character of the last
ali@45:  * line in last->start, and if it was a space, we assume that
ali@45:  * the formatting is deliberate. I can't figure out a way to
ali@45:  * distinguish something like a quoted verse left-aligned or
ali@45:  * the header or footer of a letter from a paragraph of short
ali@45:  * lines - maybe if I examined the whole paragraph, and if the
ali@45:  * para has less than, say, 8 lines and if all lines are short,
ali@45:  * then just assume it's OK? Need to look at some texts to see
ali@45:  * how often a formula like this would get the right result.
ali@45:  */
ali@45: void check_for_short_line(const char *aline,const struct line_properties *last)
ali@45: {
ali@45:     if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
ali@45:       last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45:     {
ali@45: 	if (pswit[ECHO_SWITCH])
ali@45: 	    printf("\n%s\n",prevline);
ali@45: 	if (!pswit[OVERVIEW_SWITCH])
ali@45: 	    printf("    Line %ld column %d - Short line %d?\n",
ali@45: 	      linecnt-1,strlen(prevline),strlen(prevline));
ali@45: 	else
ali@45: 	    cnt_short++;
ali@45:     }
ali@45: }
ali@45: 
ali@45: /*
ali@46:  * check_for_starting_punctuation:
ali@46:  *
ali@46:  * Look for punctuation other than full ellipses at start of line.
ali@46:  */
ali@46: void check_for_starting_punctuation(const char *aline)
ali@46: {
ali@46:     if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
ali@46:     {
ali@46: 	if (pswit[ECHO_SWITCH])
ali@46: 	    printf("\n%s\n",aline);
ali@46: 	if (!pswit[OVERVIEW_SWITCH])
ali@46: 	    printf("    Line %ld column 1 - Begins with punctuation?\n",
ali@46: 	      linecnt);
ali@46: 	else
ali@46: 	    cnt_punct++;
ali@46:     }
ali@46: }
ali@46: 
ali@46: /*
ali@47:  * check_for_spaced_emdash:
ali@47:  *
ali@47:  * Check for spaced em-dashes.
ali@47:  *
ali@47:  * We must check _all_ occurrences of "--" on the line
ali@47:  * hence the loop - even if the first double-dash is OK
ali@47:  * there may be another that's wrong later on.
ali@47:  */
ali@47: void check_for_spaced_emdash(const char *aline)
ali@47: {
ali@47:     const char *s,*t;
ali@47:     s=aline;
ali@47:     while ((t=strstr(s,"--")))
ali@47:     {
ali@47: 	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
ali@47: 	{
ali@47: 	    if (pswit[ECHO_SWITCH])
ali@47: 		printf("\n%s\n",aline);
ali@47: 	    if (!pswit[OVERVIEW_SWITCH])
ali@47: 		printf("    Line %ld column %d - Spaced em-dash?\n",
ali@47: 		  linecnt,(int)(t-aline)+1);
ali@47: 	    else
ali@47: 		cnt_dash++;
ali@47: 	}
ali@47: 	s=t+2;
ali@47:     }
ali@47: }
ali@47: 
ali@47: /*
ali@47:  * check_for_spaced_dash:
ali@47:  *
ali@47:  * Check for spaced dashes.
ali@47:  */
ali@47: void check_for_spaced_dash(const char *aline)
ali@47: {
ali@47:     const char *s;
ali@47:     if ((s=strstr(aline," -")))
ali@47:     {
ali@47: 	if (s[2]!='-')
ali@47: 	{
ali@47: 	    if (pswit[ECHO_SWITCH])
ali@47: 		printf("\n%s\n",aline);
ali@47: 	    if (!pswit[OVERVIEW_SWITCH])
ali@47: 		printf("    Line %ld column %d - Spaced dash?\n",
ali@47: 		  linecnt,(int)(s-aline)+1);
ali@47: 	    else
ali@47: 		cnt_dash++;
ali@47: 	}
ali@47:     }
ali@47:     else if ((s=strstr(aline,"- ")))
ali@47:     {
ali@47: 	if (s==aline || s[-1]!='-')
ali@47: 	{
ali@47: 	    if (pswit[ECHO_SWITCH])
ali@47: 		printf("\n%s\n",aline);
ali@47: 	    if (!pswit[OVERVIEW_SWITCH])
ali@47: 		printf("    Line %ld column %d - Spaced dash?\n",
ali@47: 		  linecnt,(int)(s-aline)+1);
ali@47: 	    else
ali@47: 		cnt_dash++;
ali@47: 	}
ali@47:     }
ali@47: }
ali@47: 
ali@47: /*
ali@48:  * check_for_unmarked_paragraphs:
ali@48:  *
ali@48:  * Check for unmarked paragraphs indicated by separate speakers.
ali@48:  *
ali@48:  * May well be false positive:
ali@48:  * "Bravo!" "Wonderful!" called the crowd.
ali@48:  * but useful all the same.
ali@48:  */
ali@48: void check_for_unmarked_paragraphs(const char *aline)
ali@48: {
ali@48:     const char *s;
ali@48:     s=strstr(aline,"\"  \"");
ali@48:     if (!s)
ali@48: 	s=strstr(aline,"\" \"");
ali@48:     if (s)
ali@48:     {
ali@48: 	if (pswit[ECHO_SWITCH])
ali@48: 	    printf("\n%s\n",aline);
ali@48: 	if (!pswit[OVERVIEW_SWITCH])
ali@48: 	    printf("    Line %ld column %d - Query missing paragraph break?\n",
ali@48: 	      linecnt,(int)(s-aline)+1);
ali@48: 	else
ali@48: 	    cnt_punct++;
ali@48:     }
ali@48: }
ali@48: 
ali@48: /*
ali@49:  * check_for_jeebies:
ali@49:  *
ali@49:  * Check for "to he" and other easy h/b errors.
ali@49:  *
ali@49:  * This is a very inadequate effort on the h/b problem,
ali@49:  * but the phrase "to he" is always an error, whereas "to
ali@49:  * be" is quite common.
ali@49:  * Similarly, '"Quiet!", be said.' is a non-be error
ali@49:  * "to he" is _not_ always an error!:
ali@49:  *       "Where they went to he couldn't say."
ali@49:  * Another false positive:
ali@49:  *       What would "Cinderella" be without the . . .
ali@49:  * and another: "If he wants to he can see for himself."
ali@49:  */
ali@49: void check_for_jeebies(const char *aline)
ali@49: {
ali@49:     const char *s;
ali@49:     s=strstr(aline," be could ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," be would ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," was be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," be is ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," is be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline,"\", be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline,"\" be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline,"\" be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," to he ");
ali@49:     if (s)
ali@49:     {
ali@49: 	if (pswit[ECHO_SWITCH])
ali@49: 	    printf("\n%s\n",aline);
ali@49: 	if (!pswit[OVERVIEW_SWITCH])
ali@49: 	    printf("    Line %ld column %d - Query he/be error?\n",
ali@49: 	      linecnt,(int)(s-aline)+1);
ali@49: 	else
ali@49: 	    cnt_word++;
ali@49:     }
ali@49:     s=strstr(aline," the had ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," a had ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," they bad ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," she bad ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," he bad ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," you bad ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," i bad ");
ali@49:     if (s)
ali@49:     {
ali@49: 	if (pswit[ECHO_SWITCH])
ali@49: 	    printf("\n%s\n",aline);
ali@49: 	if (!pswit[OVERVIEW_SWITCH])
ali@49: 	    printf("    Line %ld column %d - Query had/bad error?\n",
ali@49: 	      linecnt,(int)(s-aline)+1);
ali@49: 	else
ali@49: 	    cnt_word++;
ali@49:     }
ali@49:     s=strstr(aline,"; hut ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline,", hut ");
ali@49:     if (s)
ali@49:     {
ali@49: 	if (pswit[ECHO_SWITCH])
ali@49: 	    printf("\n%s\n",aline);
ali@49: 	if (!pswit[OVERVIEW_SWITCH])
ali@49: 	    printf("    Line %ld column %d - Query hut/but error?\n",
ali@49: 	      linecnt,(int)(s-aline)+1);
ali@49: 	else
ali@49: 	    cnt_word++;
ali@49:     }
ali@49: }
ali@49: 
ali@49: /*
ali@50:  * check_for_mta_from:
ali@50:  *
ali@50:  * Special case - angled bracket in front of "From" placed there by an
ali@50:  * MTA when sending an e-mail.
ali@50:  */
ali@50: void check_for_mta_from(const char *aline)
ali@50: {
ali@50:     const char *s;
ali@50:     s=strstr(aline,">From");
ali@50:     if (s)
ali@50:     {
ali@50: 	if (pswit[ECHO_SWITCH])
ali@50: 	    printf("\n%s\n",aline);
ali@50: 	if (!pswit[OVERVIEW_SWITCH])
ali@50: 	    printf("    Line %ld column %d - Query angled bracket with From\n",
ali@50: 	      linecnt,(int)(s-aline)+1);
ali@50: 	else
ali@50: 	    cnt_punct++;
ali@50:     }
ali@50: }
ali@50: 
ali@50: /*
ali@51:  * check_for_orphan_character:
ali@51:  *
ali@51:  * Check for a single character line -
ali@51:  * often an overflow from bad wrapping.
ali@51:  */
ali@51: void check_for_orphan_character(const char *aline)
ali@51: {
ali@51:     if (*aline && !aline[1])
ali@51:     {
ali@51: 	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
ali@51: 	  gcisdigit(*aline))
ali@51: 	    ; /* Nothing - ignore numerals alone on a line. */
ali@51: 	else
ali@51: 	{
ali@51: 	    if (pswit[ECHO_SWITCH])
ali@51: 		printf("\n%s\n",aline);
ali@51: 	    if (!pswit[OVERVIEW_SWITCH])
ali@51: 		printf("    Line %ld column 1 - Query single character line\n",
ali@51: 		  linecnt);
ali@51: 	    else
ali@51: 		cnt_punct++;
ali@51: 	}
ali@51:     }
ali@51: }
ali@51: 
ali@51: /*
ali@52:  * check_for_pling_scanno:
ali@52:  *
ali@52:  * Check for I" - often should be !
ali@52:  */
ali@52: void check_for_pling_scanno(const char *aline)
ali@52: {
ali@52:     const char *s;
ali@52:     s=strstr(aline," I\"");
ali@52:     if (s)
ali@52:     {
ali@52: 	if (pswit[ECHO_SWITCH])
ali@52: 	    printf("\n%s\n",aline);
ali@52: 	if (!pswit[OVERVIEW_SWITCH])
ali@52: 	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",
ali@52: 	      linecnt,s-aline);
ali@52: 	else
ali@52: 	    cnt_punct++;
ali@52:     }
ali@52: }
ali@52: 
ali@52: /*
ali@53:  * check_for_extra_period:
ali@53:  *
ali@53:  * Check for period without a capital letter. Cut-down from gutspell.
ali@53:  * Only works when it happens on a single line.
ali@53:  */
ali@53: void check_for_extra_period(const char *aline,const struct warnings *warnings)
ali@53: {
ali@53:     const char *s,*t,*s1;
ali@53:     signed int i,istypo,isdup;
ali@53:     static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
ali@53:     static int qperiod_index=0;
ali@53:     char testword[MAXWORDLEN]="";
ali@53:     if (pswit[PARANOID_SWITCH])
ali@53:     {
ali@53: 	for (t=s=aline;strstr(t,". ");)
ali@53: 	{
ali@53: 	    t=strstr(t,". ");
ali@53: 	    if (t==s)
ali@53: 	    {
ali@53: 		t++;
ali@53: 		/* start of line punctuation is handled elsewhere */
ali@53: 		continue;
ali@53: 	    }
ali@53: 	    if (!gcisalpha(t[-1]))
ali@53: 	    {
ali@53: 		t++;
ali@53: 		continue;
ali@53: 	    }
ali@53: 	    if (warnings->isDutch)
ali@53: 	    {
ali@53: 		/* For Frank & Jeroen -- 's Middags case */
ali@53: 		if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
ali@53: 		  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
ali@53: 		{
ali@53: 		    t++;
ali@53: 		    continue;
ali@53: 		}
ali@53: 	    }
ali@53: 	    s1=t+2;
ali@53: 	    while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
ali@53: 		s1++;
ali@53: 	    if (*s1>='a' && *s1<='z')
ali@53: 	    {
ali@53: 		/* we have something to investigate */
ali@53: 		istypo=1;
ali@53: 		/* so let's go back and find out */
ali@53: 		for (s1=t-1;s1>=s &&
ali@53: 		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
ali@53: 		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
ali@53: 		    ;
ali@53: 		s1++;
ali@53: 		for (i=0;*s1 && *s1!='.';s1++,i++)
ali@53: 		    testword[i]=*s1;
ali@53: 		testword[i]=0;
ali@53: 		for (i=0;*abbrev[i];i++)
ali@53: 		    if (!strcmp(testword,abbrev[i]))
ali@53: 			istypo=0;
ali@53: 		if (gcisdigit(*testword))
ali@53: 		    istypo=0;
ali@53: 		if (!testword[1])
ali@53: 		    istypo=0;
ali@53: 		if (isroman(testword))
ali@53: 		    istypo=0;
ali@53: 		if (istypo)
ali@53: 		{
ali@53: 		    istypo=0;
ali@53: 		    for (i=0;testword[i];i++)
ali@53: 			if (strchr(vowels,testword[i]))
ali@53: 			    istypo=1;
ali@53: 		}
ali@53: 		if (istypo)
ali@53: 		{
ali@53: 		    isdup=0;
ali@53: 		    if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@53: 		      !pswit[VERBOSE_SWITCH])
ali@53: 			for (i=0;i<qperiod_index;i++)
ali@53: 			    if (!strcmp(testword,qperiod[i]))
ali@53: 				isdup=1;
ali@53: 		    if (!isdup)
ali@53: 		    {
ali@53: 			if (qperiod_index<MAX_QWORD &&
ali@53: 			  strlen(testword)<MAX_QWORD_LENGTH)
ali@53: 			{
ali@53: 			    strcpy(qperiod[qperiod_index],testword);
ali@53: 			    qperiod_index++;
ali@53: 			}
ali@53: 			if (pswit[ECHO_SWITCH])
ali@53: 			    printf("\n%s\n",aline);
ali@53: 			if (!pswit[OVERVIEW_SWITCH])
ali@53: 			    printf("    Line %ld column %d - Extra period?\n",
ali@53: 			      linecnt,(int)(t-aline)+1);
ali@53: 			else
ali@53: 			    cnt_punct++;
ali@53: 		    }
ali@53: 		}
ali@53: 	    }
ali@53: 	    t++;
ali@53: 	}
ali@53:     }
ali@53: }
ali@53: 
ali@53: /*
ali@54:  * query_punctuation_after:
ali@54:  *
ali@54:  * Echo the line if requested, then either print a query for the
ali@54:  * punctuation found at position after (following word) or, in
ali@54:  * overview mode, just count it.
ali@54:  */
ali@54: static void query_punctuation_after(const char *aline,const char *after,
ali@54:   const char *word)
ali@54: {
ali@54:     if (pswit[ECHO_SWITCH])
ali@54: 	printf("\n%s\n",aline);
ali@54:     if (pswit[OVERVIEW_SWITCH])
ali@54: 	cnt_punct++;
ali@54:     else
ali@54: 	printf("    Line %ld column %d - "
ali@54: 	  "Query punctuation after %s?\n",
ali@54: 	  linecnt,(int)(after-aline)+1,word);
ali@54: }
ali@54: 
ali@54: /*
ali@54:  * check_for_following_punctuation:
ali@54:  *
ali@54:  * Check for words usually not followed by punctuation: a word found
ali@54:  * in the nocomma list is queried when the character after it is
ali@54:  * ',', ';' or ':', and a word found in the noperiod list is queried
ali@54:  * when the character after it is '.' or '!'. Words are lowercased
ali@54:  * before being looked up. Nothing is checked unless
ali@54:  * pswit[TYPO_SWITCH] is set.
ali@54:  */
ali@54: void check_for_following_punctuation(const char *aline)
ali@54: {
ali@54:     const char *next;
ali@54:     char word[MAXWORDLEN];
ali@54:     int n;
ali@54:     if (!pswit[TYPO_SWITCH])
ali@54: 	return;
ali@54:     next=aline;
ali@54:     while (*next)
ali@54:     {
ali@54: 	/* next is left pointing at whatever follows the word. */
ali@54: 	next=getaword(next,word);
ali@54: 	if (*word=='\0')
ali@54: 	    continue;
ali@54: 	lowerit(word);
ali@54: 	if (*next==',' || *next==';' || *next==':')
ali@54: 	{
ali@54: 	    for (n=0;*nocomma[n];n++)
ali@54: 		if (strcmp(word,nocomma[n])==0)
ali@54: 		    query_punctuation_after(aline,next,word);
ali@54: 	}
ali@54: 	if (*next=='.' || *next=='!')
ali@54: 	{
ali@54: 	    for (n=0;*noperiod[n];n++)
ali@54: 		if (strcmp(word,noperiod[n])==0)
ali@54: 		    query_punctuation_after(aline,next,word);
ali@54: 	}
ali@54:     }
ali@54: }
ali@54: 
ali@54: /*
ali@55:  * check_for_typos:
ali@55:  *
ali@55:  * Check for commonly mistyped words,
ali@55:  * and digits like 0 for O in a word.
ali@55:  *
ali@55:  * Each word of aline is put through up to four queries:
ali@55:  *   - a word for which mixdigit() is true is always queried;
ali@55:  *   - with pswit[TYPO_SWITCH], the built-in heuristics and the typo
ali@55:  *     list are applied; unless pswit[VERBOSE_SWITCH] is set, a word
ali@55:  *     already recorded in qword[] is counted in dupcnt[] rather than
ali@55:  *     reported again;
ali@55:  *   - a word not flagged above is looked up in the user's typo list;
ali@55:  *   - with pswit[PARANOID_SWITCH] and warnings->digit, a standalone
ali@55:  *     0 or 1 is queried.
ali@55:  */
ali@55: void check_for_typos(const char *aline,struct warnings *warnings)
ali@55: {
ali@55:     const char *s,*wordstart;
ali@55:     char inword[MAXWORDLEN],testword[MAXWORDLEN];
ali@55:     int i,len,istypo,isdup,alower,midupper,vowel,consonant;
ali@55:     static int qword_index=0;	/* entries of qword[] in use so far */
ali@55:     for (s=aline;*s;)
ali@55:     {
ali@55: 	wordstart=s;
ali@55: 	s=getaword(s,inword);
ali@55: 	if (!*inword)
ali@55: 	    continue; /* don't bother with empty lines */
ali@55: 	if (mixdigit(inword))
ali@55: 	{
ali@55: 	    if (pswit[ECHO_SWITCH])
ali@55: 		printf("\n%s\n",aline);
ali@55: 	    if (!pswit[OVERVIEW_SWITCH])
ali@55: 		printf("    Line %ld column %d - Query digit in %s\n",
ali@55: 		  linecnt,(int)(wordstart-aline)+1,inword);
ali@55: 	    else
ali@55: 		cnt_word++;
ali@55: 	}
ali@55: 	/*
ali@55: 	 * Build the lowercase test copy of the word, noting on the way
ali@55: 	 * whether it has a suspicious uppercase letter mid-word. This is
ali@55: 	 * done even when typo checking is switched off, because the user
ali@55: 	 * typo lookup further down reads both testword and istypo and
ali@55: 	 * must never see them unset.
ali@55: 	 */
ali@55: 	istypo=0;
ali@55: 	strcpy(testword,inword);
ali@55: 	alower=midupper=0;
ali@55: 	len=(int)strlen(testword);
ali@55: 	for (i=0;i<len;i++)
ali@55: 	{
ali@55: 	    if (testword[i]>='a' && testword[i]<='z')
ali@55: 		alower=1;
ali@55: 	    if (alower && testword[i]>='A' && testword[i]<='Z')
ali@55: 	    {
ali@55: 		/*
ali@55: 		 * We have an uppercase mid-word. However, there are
ali@55: 		 * common cases:
ali@55: 		 *   Mac and Mc like McGill
ali@55: 		 *   French contractions like l'Abbe
ali@55: 		 * The earlier letters have already been lowercased,
ali@55: 		 * hence the comparison with 'm', 'a' and 'c'.
ali@55: 		 */
ali@55: 		if ((i==2 && testword[0]=='m' && testword[1]=='c') ||
ali@55: 		  (i==3 && testword[0]=='m' && testword[1]=='a' &&
ali@55: 		  testword[2]=='c') ||
ali@55: 		  (i>0 && testword[i-1]==CHAR_SQUOTE))
ali@55: 		    ; /* do nothing! */
ali@55: 		else
ali@55: 		    midupper=1;
ali@55: 	    }
ali@55: 	    /*
ali@55: 	     * tolower() is only defined for unsigned char values (and
ali@55: 	     * EOF); plain char may be negative for 8-bit characters.
ali@55: 	     */
ali@55: 	    testword[i]=(char)tolower((unsigned char)testword[i]);
ali@55: 	}
ali@55: 	/*
ali@55: 	 * Put the word through a series of tests for likely typos and OCR
ali@55: 	 * errors.
ali@55: 	 */
ali@55: 	if (pswit[TYPO_SWITCH])
ali@55: 	{
ali@55: 	    istypo=midupper;
ali@55: 	    /*
ali@55: 	     * Check for certain unlikely two-letter combinations at word
ali@55: 	     * start and end.
ali@55: 	     */
ali@55: 	    if (len>1)
ali@55: 	    {
ali@55: 		for (i=0;*nostart[i];i++)
ali@55: 		    if (!strncmp(testword,nostart[i],2))
ali@55: 			istypo=1;
ali@55: 		for (i=0;*noend[i];i++)
ali@55: 		    if (!strncmp(testword+len-2,noend[i],2))
ali@55: 			istypo=1;
ali@55: 	    }
ali@55: 	    /* ght is common, gbt never. Like that. */
ali@55: 	    if (strstr(testword,"cb"))
ali@55: 		istypo=1;
ali@55: 	    if (strstr(testword,"gbt"))
ali@55: 		istypo=1;
ali@55: 	    if (strstr(testword,"pbt"))
ali@55: 		istypo=1;
ali@55: 	    if (strstr(testword,"tbs"))
ali@55: 		istypo=1;
ali@55: 	    if (strstr(testword,"mrn"))
ali@55: 		istypo=1;
ali@55: 	    if (strstr(testword,"ahle"))
ali@55: 		istypo=1;
ali@55: 	    if (strstr(testword,"ihle"))
ali@55: 		istypo=1;
ali@55: 	    /*
ali@55: 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55: 	     * Also "TBI" - frostbite, outbid - but uncommon.
ali@55: 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55: 	     * numerals, but "ii" is a common scanno.
ali@55: 	     */
ali@55: 	    if (strstr(testword,"tbi"))
ali@55: 		istypo=1;
ali@55: 	    if (strstr(testword,"tbe"))
ali@55: 		istypo=1;
ali@55: 	    if (strstr(testword,"ii"))
ali@55: 		istypo=1;
ali@55: 	    /*
ali@55: 	     * Check for no vowels or no consonants.
ali@55: 	     * If none, flag a typo.
ali@55: 	     */
ali@55: 	    if (!istypo && len>1)
ali@55: 	    {
ali@55: 		vowel=consonant=0;
ali@55: 		for (i=0;testword[i];i++)
ali@55: 		{
ali@55: 		    if (testword[i]=='y' || gcisdigit(testword[i]))
ali@55: 		    {
ali@55: 			/* Yah, this is loose. */
ali@55: 			vowel++;
ali@55: 			consonant++;
ali@55: 		    }
ali@55: 		    else if (strchr(vowels,testword[i]))
ali@55: 			vowel++;
ali@55: 		    else
ali@55: 			consonant++;
ali@55: 		}
ali@55: 		if (!vowel || !consonant)
ali@55: 		    istypo=1;
ali@55: 	    }
ali@55: 	    /*
ali@55: 	     * Now exclude the word from being reported if it's in
ali@55: 	     * the okword list.
ali@55: 	     */
ali@55: 	    for (i=0;*okword[i];i++)
ali@55: 		if (!strcmp(testword,okword[i]))
ali@55: 		    istypo=0;
ali@55: 	    /*
ali@55: 	     * What looks like a typo may be a Roman numeral.
ali@55: 	     * Exclude these.
ali@55: 	     */
ali@55: 	    if (istypo && isroman(testword))
ali@55: 		istypo=0;
ali@55: 	    /* Check the manual list of typos. */
ali@55: 	    if (!istypo)
ali@55: 		for (i=0;*typo[i];i++)
ali@55: 		    if (!strcmp(testword,typo[i]))
ali@55: 			istypo=1;
ali@55: 	    /*
ali@55: 	     * Check lowercase s, l, i and m - special cases.
ali@55: 	     *   "j" - often a semi-colon gone wrong.
ali@55: 	     *   "d" for a missing apostrophe - he d
ali@55: 	     *   "n" for "in"
ali@55: 	     * The test is made on inword so that only a letter that was
ali@55: 	     * lowercase in the text is flagged.
ali@55: 	     */
ali@55: 	    if (!istypo && len==1 && strchr("slmijdn",*inword))
ali@55: 		istypo=1;
ali@55: 	    if (istypo)
ali@55: 	    {
ali@55: 		isdup=0;
ali@55: 		if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@55: 		  !pswit[VERBOSE_SWITCH])
ali@55: 		    for (i=0;i<qword_index;i++)
ali@55: 			if (!strcmp(testword,qword[i]))
ali@55: 			{
ali@55: 			    isdup=1;
ali@55: 			    ++dupcnt[i];
ali@55: 			}
ali@55: 		if (!isdup)
ali@55: 		{
ali@55: 		    /* Remember the word if there is room for it. */
ali@55: 		    if (qword_index<MAX_QWORD &&
ali@55: 		      strlen(testword)<MAX_QWORD_LENGTH)
ali@55: 		    {
ali@55: 			strcpy(qword[qword_index],testword);
ali@55: 			qword_index++;
ali@55: 		    }
ali@55: 		    if (pswit[ECHO_SWITCH])
ali@55: 			printf("\n%s\n",aline);
ali@55: 		    if (!pswit[OVERVIEW_SWITCH])
ali@55: 		    {
ali@55: 			printf("    Line %ld column %d - Query word %s",
ali@55: 			  linecnt,(int)(wordstart-aline)+1,inword);
ali@55: 			if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@55: 			  !pswit[VERBOSE_SWITCH])
ali@55: 			    printf(" - not reporting duplicates");
ali@55: 			printf("\n");
ali@55: 		    }
ali@55: 		    else
ali@55: 			cnt_word++;
ali@55: 		}
ali@55: 	    }
ali@55: 	}
ali@55: 	/*
ali@55: 	 * check the user's list of typos
ali@55: 	 * (istypo is 0 here whenever typo checking is switched off)
ali@55: 	 */
ali@55: 	if (!istypo && usertypo_count)
ali@55: 	    for (i=0;i<usertypo_count;i++)
ali@55: 		if (!strcmp(testword,usertypo[i]))
ali@55: 		{
ali@55: 		    if (pswit[ECHO_SWITCH])
ali@55: 			printf("\n%s\n",aline);
ali@55: 		    if (!pswit[OVERVIEW_SWITCH])
ali@55: 			printf("    Line %ld column %d - "
ali@55: 			  "Query possible scanno %s\n",
ali@55: 			  linecnt,(int)(wordstart-aline)+2,inword);
ali@55: 		}
ali@55: 	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55: 	{
ali@55: 	    /* In paranoid mode, query all 0 and 1 standing alone. */
ali@55: 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
ali@55: 	    {
ali@55: 		if (pswit[ECHO_SWITCH])
ali@55: 		    printf("\n%s\n",aline);
ali@55: 		if (!pswit[OVERVIEW_SWITCH])
ali@55: 		    printf("    Line %ld column %d - Query standalone %s\n",
ali@55: 		      linecnt,(int)(wordstart-aline)+2,inword);
ali@55: 		else
ali@55: 		    cnt_word++;
ali@55: 	    }
ali@55: 	}
ali@55:     }
ali@55: }
ali@55: 
ali@56: /*
ali@56:  * Quote parity carried from one line to the next. Each member is
ali@56:  * flipped between 0 and 1 by check_for_misspaced_punctuation as it
ali@56:  * counts quotes, and procfile zeroes the whole structure at the start
ali@56:  * of every paragraph.
ali@56:  */
ali@56: struct parities {
ali@56:     int dquote;		/* parity of double quotes counted so far */
ali@56:     int squote;		/* parity of single quotes counted so far */
ali@56: };
ali@56: 
ali@56: /*
ali@56:  * check_for_misspaced_punctuation:
ali@56:  *
ali@56:  * Look for added or missing spaces around punctuation and quotes.
ali@56:  * If there is a punctuation character like ! with no space on
ali@56:  * either side, suspect a missing!space. If there are spaces on
ali@56:  * both sides , assume a typo. If we see a double quote with no
ali@56:  * space or punctuation on either side of it, assume unspaced
ali@56:  * quotes "like"this.
ali@56:  *
ali@56:  * parities holds the running parity of double and single quotes and
ali@56:  * is updated here for each quote counted; single quotes are only
ali@56:  * examined when pswit[SQUOTE_SWITCH] is set. A non-zero isemptyline
ali@56:  * suppresses the "Spaced punctuation?" queries of the first two
ali@56:  * passes. The passes are kept separate so that the queries for a
ali@56:  * line are emitted in the same order as before.
ali@56:  */
ali@56: void check_for_misspaced_punctuation(const char *aline,
ali@56:   struct parities *parities,int isemptyline)
ali@56: {
ali@56:     int i,llen,isacro,isellipsis;
ali@56:     const char *s;
ali@56:     /* aline is not modified, so its length serves every pass below. */
ali@56:     llen=strlen(aline);
ali@56:     for (i=1;i<llen;i++)
ali@56:     {
ali@56: 	/* For each character in the line after the first. */
ali@56: 	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
ali@56: 	{
ali@56: 	    /* we need to suppress warnings for acronyms like M.D. */
ali@56: 	    isacro=0;
ali@56: 	    /* we need to suppress warnings for ellipsis . . . */
ali@56: 	    isellipsis=0;
ali@56: 	    /* if there are letters on both sides of it or ... */
ali@56: 	    if ((gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) ||
ali@56: 	       (gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i])))
ali@56: 	    {
ali@56: 		/* ...if it's strict punctuation followed by an alpha */
ali@56: 		if (aline[i]=='.')
ali@56: 		{
ali@56: 		    /* another period two characters away: an acronym */
ali@56: 		    if (i>2 && aline[i-2]=='.')
ali@56: 			isacro=1;
ali@56: 		    if (i+2<llen && aline[i+2]=='.')
ali@56: 			isacro=1;
ali@56: 		}
ali@56: 		if (!isacro)
ali@56: 		{
ali@56: 		    if (pswit[ECHO_SWITCH])
ali@56: 			printf("\n%s\n",aline);
ali@56: 		    if (!pswit[OVERVIEW_SWITCH])
ali@56: 			printf("    Line %ld column %d - Missing space?\n",
ali@56: 			  linecnt,i+1);
ali@56: 		    else
ali@56: 			cnt_punct++;
ali@56: 		}
ali@56: 	    }
ali@56: 	    if (aline[i-1]==CHAR_SPACE &&
ali@56: 	      (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
ali@56: 	    {
ali@56: 		/*
ali@56: 		 * If there are spaces on both sides,
ali@56: 		 * or space before and end of line.
ali@56: 		 */
ali@56: 		if (aline[i]=='.')
ali@56: 		{
ali@56: 		    if (i>2 && aline[i-2]=='.')
ali@56: 			isellipsis=1;
ali@56: 		    if (i+2<llen && aline[i+2]=='.')
ali@56: 			isellipsis=1;
ali@56: 		}
ali@56: 		if (!isemptyline && !isellipsis)
ali@56: 		{
ali@56: 		    if (pswit[ECHO_SWITCH])
ali@56: 			printf("\n%s\n",aline);
ali@56: 		    if (!pswit[OVERVIEW_SWITCH])
ali@56: 			printf("    Line %ld column %d - "
ali@56: 			  "Spaced punctuation?\n",linecnt,i+1);
ali@56: 		    else
ali@56: 			cnt_punct++;
ali@56: 		}
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56:     /* Split out the characters that CANNOT be preceded by space. */
ali@56:     for (i=1;i<llen;i++)
ali@56:     {
ali@56: 	/* for each character in the line after the first */
ali@56: 	if (strchr("?!,;:",aline[i]))
ali@56: 	{
ali@56: 	    /* if it's punctuation that _cannot_ have a space before it */
ali@56: 	    if (aline[i-1]==CHAR_SPACE && !isemptyline &&
ali@56: 	      aline[i+1]!=CHAR_SPACE)
ali@56: 	    {
ali@56: 		/*
ali@56: 		 * If aline[i+1] DOES == space,
ali@56: 		 * it was already reported just above.
ali@56: 		 */
ali@56: 		if (pswit[ECHO_SWITCH])
ali@56: 		    printf("\n%s\n",aline);
ali@56: 		if (!pswit[OVERVIEW_SWITCH])
ali@56: 		    printf("    Line %ld column %d - Spaced punctuation?\n",
ali@56: 		      linecnt,i+1);
ali@56: 		else
ali@56: 		    cnt_punct++;
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56:     /*
ali@56:      * Special case " .X" where X is any alpha.
ali@56:      * This plugs a hole in the acronym code above.
ali@56:      * Inelegant, but maintainable.
ali@56:      */
ali@56:     for (i=1;i<llen;i++)
ali@56:     {
ali@56: 	/* for each character in the line after the first */
ali@56: 	if (aline[i]=='.')
ali@56: 	{
ali@56: 	    /* if it's a period */
ali@56: 	    if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
ali@56: 	    {
ali@56: 		/*
ali@56: 		 * If the period follows a space and
ali@56: 		 * is followed by a letter.
ali@56: 		 */
ali@56: 		if (pswit[ECHO_SWITCH])
ali@56: 		    printf("\n%s\n",aline);
ali@56: 		if (!pswit[OVERVIEW_SWITCH])
ali@56: 		    printf("    Line %ld column %d - Spaced punctuation?\n",
ali@56: 		      linecnt,i+1);
ali@56: 		else
ali@56: 		    cnt_punct++;
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56:     for (i=1;i<llen;i++)
ali@56:     {
ali@56: 	/* for each character in the line after the first */
ali@56: 	if (aline[i]==CHAR_DQUOTE)
ali@56: 	{
ali@56: 	    /*
ali@56: 	     * Either no space or punctuation on both sides (and not at
ali@56: 	     * end of line), or a letter straight after a quote that is
ali@56: 	     * not preceded by an opening character.
ali@56: 	     */
ali@56: 	    if ((!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
ali@56: 	      !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1]) ||
ali@56: 	      (!strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1])))
ali@56: 	    {
ali@56: 		if (pswit[ECHO_SWITCH])
ali@56: 		    printf("\n%s\n",aline);
ali@56: 		if (!pswit[OVERVIEW_SWITCH])
ali@56: 		    printf("    Line %ld column %d - Unspaced quotes?\n",
ali@56: 		      linecnt,i+1);
ali@56: 		else
ali@56: 		    cnt_punct++;
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56:     /* Check parity of quotes. */
ali@56:     for (s=aline;*s;s++)
ali@56:     {
ali@56: 	if (*s==CHAR_DQUOTE)
ali@56: 	{
ali@56: 	    parities->dquote=!parities->dquote;
ali@56: 	    if (!parities->dquote)
ali@56: 	    {
ali@56: 		/* parity even */
ali@56: 		if (!strchr("_-.'`/,;:!?)]} ",s[1]))
ali@56: 		{
ali@56: 		    if (pswit[ECHO_SWITCH])
ali@56: 			printf("\n%s\n",aline);
ali@56: 		    if (!pswit[OVERVIEW_SWITCH])
ali@56: 			printf("    Line %ld column %d - "
ali@56: 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56: 		    else
ali@56: 			cnt_punct++;
ali@56: 		}
ali@56: 	    }
ali@56: 	    else
ali@56: 	    {
ali@56: 		/*
ali@56: 		 * parity odd
ali@56: 		 * (isdigit() is only defined for unsigned char values;
ali@56: 		 * plain char may be negative for 8-bit characters)
ali@56: 		 */
ali@56: 		if ((!gcisalpha(s[1]) && !isdigit((unsigned char)s[1]) &&
ali@56: 		  !strchr("_-/.'`([{$",s[1])) || !s[1])
ali@56: 		{
ali@56: 		    if (pswit[ECHO_SWITCH])
ali@56: 			printf("\n%s\n",aline);
ali@56: 		    if (!pswit[OVERVIEW_SWITCH])
ali@56: 			printf("    Line %ld column %d - "
ali@56: 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56: 		    else
ali@56: 			cnt_punct++;
ali@56: 		}
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56:     /* A quote opening the line should not be followed by these. */
ali@56:     if (*aline==CHAR_DQUOTE)
ali@56:     {
ali@56: 	if (strchr(",;:!?)]} ",aline[1]))
ali@56: 	{
ali@56: 	    if (pswit[ECHO_SWITCH])
ali@56: 		printf("\n%s\n",aline);
ali@56: 	    if (!pswit[OVERVIEW_SWITCH])
ali@56: 		printf("    Line %ld column 1 - Wrongspaced quotes?\n",
ali@56: 		  linecnt);
ali@56: 	    else
ali@56: 		cnt_punct++;
ali@56: 	}
ali@56:     }
ali@56:     if (pswit[SQUOTE_SWITCH])
ali@56:     {
ali@56: 	for (s=aline;*s;s++)
ali@56: 	{
ali@56: 	    /* Only count single quotes that are not inside a word. */
ali@56: 	    if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
ali@56: 	      (s==aline || (s>aline && !gcisalpha(s[-1])) ||
ali@56: 	      !gcisalpha(s[1])))
ali@56: 	    {
ali@56: 		parities->squote=!parities->squote;
ali@56: 		if (!parities->squote)
ali@56: 		{
ali@56: 		    /* parity even */
ali@56: 		    if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
ali@56: 		    {
ali@56: 			if (pswit[ECHO_SWITCH])
ali@56: 			    printf("\n%s\n",aline);
ali@56: 			if (!pswit[OVERVIEW_SWITCH])
ali@56: 			    printf("    Line %ld column %d - "
ali@56: 			      "Wrongspaced singlequotes?\n",
ali@56: 			      linecnt,(int)(s-aline)+1);
ali@56: 			else
ali@56: 			    cnt_punct++;
ali@56: 		    }
ali@56: 		}
ali@56: 		else
ali@56: 		{
ali@56: 		    /* parity odd */
ali@56: 		    if ((!gcisalpha(s[1]) &&
ali@56: 		      !isdigit((unsigned char)s[1]) &&
ali@56: 		      !strchr("_-/\".'`",s[1])) || !s[1])
ali@56: 		    {
ali@56: 			if (pswit[ECHO_SWITCH])
ali@56: 			    printf("\n%s\n",aline);
ali@56: 			if (!pswit[OVERVIEW_SWITCH])
ali@56: 			    printf("    Line %ld column %d - "
ali@56: 			      "Wrongspaced singlequotes?\n",
ali@56: 			      linecnt,(int)(s-aline)+1);
ali@56: 			else
ali@56: 			    cnt_punct++;
ali@56: 		    }
ali@56: 		}
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56: }
ali@56: 
ali@55: /*
ali@57:  * check_for_double_punctuation:
ali@57:  *
ali@57:  * Query two adjacent punctuation characters like ,. or ,,
ali@57:  * (suggested by DW). Not queried:
ali@57:  *   - a doubled period, question mark or exclamation mark;
ali@57:  *   - ".," while warnings->dotcomma is not set. Books with references
ali@57:  *     are full of "etc., etc.," so detailed reporting of that pair is
ali@57:  *     suppressed when it occurs often;
ali@57:  *   - when warnings->isFrench is set, an ellipsis joined to one of
ali@57:  *     , ; : ! ? on either side; scanning then skips ahead.
ali@57:  */
ali@57: void check_for_double_punctuation(const char *aline,struct warnings *warnings)
ali@57: {
ali@57:     static const char *const french_ellipsis[]={
ali@57: 	",...","...,",";...","...;",":...","...:","!...","...!","?...","...?"
ali@57:     };
ali@57:     const int ncombos=sizeof(french_ellipsis)/sizeof(french_ellipsis[0]);
ali@57:     const int length=strlen(aline);
ali@57:     int pos,k,isellipsis;
ali@57:     char here,next;
ali@57:     for (pos=0;pos<length;pos++)
ali@57:     {
ali@57: 	here=aline[pos];
ali@57: 	next=aline[pos+1];
ali@57: 	/* Only a pair of punctuation characters is of interest. */
ali@57: 	if (!here || !next)
ali@57: 	    continue;
ali@57: 	if (!strchr(".?!,;:",here) || !strchr(".?!,;:",next))
ali@57: 	    continue;
ali@57: 	isellipsis=0;
ali@57: 	if (warnings->isFrench)
ali@57: 	    for (k=0;k<ncombos;k++)
ali@57: 		if (!strncmp(aline+pos,french_ellipsis[k],4))
ali@57: 		    isellipsis=1;
ali@57: 	if (isellipsis)
ali@57: 	{
ali@57: 	    /* Jump ahead; the loop increment is applied on top of this. */
ali@57: 	    pos+=4;
ali@57: 	    continue;
ali@57: 	}
ali@57: 	/* do nothing for .. !! and ?? which can be legit */
ali@57: 	if (here==next && (here=='.' || here=='?' || here=='!'))
ali@57: 	    continue;
ali@57: 	if (!warnings->dotcomma && here=='.' && next==',')
ali@57: 	    continue;
ali@57: 	if (pswit[ECHO_SWITCH])
ali@57: 	    printf("\n%s\n",aline);
ali@57: 	if (pswit[OVERVIEW_SWITCH])
ali@57: 	    cnt_punct++;
ali@57: 	else
ali@57: 	    printf("    Line %ld column %d - Double punctuation?\n",
ali@57: 	      linecnt,pos+1);
ali@57:     }
ali@57: }
ali@57: 
ali@57: /*
ali@58:  * check_for_spaced_quotes:
ali@58:  *
ali@58:  * Query every double quote, single quote or backquote that has a
ali@58:  * space on both sides. The column given is that of the space before
ali@58:  * the quote. All double quotes are handled first, then single quotes,
ali@58:  * then backquotes.
ali@58:  */
ali@58: void check_for_spaced_quotes(const char *aline)
ali@58: {
ali@58:     const char *hit;
ali@58:     /*
ali@58:      * Each search resumes at the trailing space of the previous hit,
ali@58:      * which may itself be the leading space of the next one.
ali@58:      */
ali@58:     for (hit=strstr(aline," \" ");hit;hit=strstr(hit+2," \" "))
ali@58:     {
ali@58: 	if (pswit[ECHO_SWITCH])
ali@58: 	    printf("\n%s\n",aline);
ali@58: 	if (pswit[OVERVIEW_SWITCH])
ali@58: 	    cnt_punct++;
ali@58: 	else
ali@58: 	    printf("    Line %ld column %d - Spaced doublequote?\n",
ali@58: 	      linecnt,(int)(hit-aline+1));
ali@58:     }
ali@58:     for (hit=strstr(aline," ' ");hit;hit=strstr(hit+2," ' "))
ali@58:     {
ali@58: 	if (pswit[ECHO_SWITCH])
ali@58: 	    printf("\n%s\n",aline);
ali@58: 	if (pswit[OVERVIEW_SWITCH])
ali@58: 	    cnt_punct++;
ali@58: 	else
ali@58: 	    printf("    Line %ld column %d - Spaced singlequote?\n",
ali@58: 	      linecnt,(int)(hit-aline+1));
ali@58:     }
ali@58:     for (hit=strstr(aline," ` ");hit;hit=strstr(hit+2," ` "))
ali@58:     {
ali@58: 	if (pswit[ECHO_SWITCH])
ali@58: 	    printf("\n%s\n",aline);
ali@58: 	if (pswit[OVERVIEW_SWITCH])
ali@58: 	    cnt_punct++;
ali@58: 	else
ali@58: 	    printf("    Line %ld column %d - Spaced singlequote?\n",
ali@58: 	      linecnt,(int)(hit-aline+1));
ali@58:     }
ali@58: }
ali@58: 
ali@58: /*
ali@59:  * check_for_miscased_genative:
ali@59:  *
ali@59:  * Check special case of 'S instead of 's at end of word: query every
ali@59:  * apostrophe that follows a lowercase letter and is followed by a
ali@59:  * capital S. The column given is that of the S.
ali@59:  */
ali@59: void check_for_miscased_genative(const char *aline)
ali@59: {
ali@59:     const char *s;
ali@59:     /*
ali@59:      * The scan starts at the second character so that s[-1] is always
ali@59:      * valid. On an empty line that would begin beyond the terminator
ali@59:      * and examine whatever follows it in memory, so stop here instead.
ali@59:      */
ali@59:     if (!*aline)
ali@59: 	return;
ali@59:     for (s=aline+1;*s;s++)
ali@59:     {
ali@59: 	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
ali@59: 	{
ali@59: 	    if (pswit[ECHO_SWITCH])
ali@59: 		printf("\n%s\n",aline);
ali@59: 	    if (!pswit[OVERVIEW_SWITCH])
ali@59: 		printf("    Line %ld column %d - Capital \"S\"?\n",
ali@59: 		  linecnt,(int)(s-aline+2));
ali@59: 	    else
ali@59: 		cnt_punct++;
ali@59: 	}
ali@59:     }
ali@59: }
ali@59: 
ali@59: /*
ali@41:  * procfile:
ali@41:  *
ali@41:  * Process one file.
ali@41:  */
ali@41: void procfile(char *filename)
ali@41: {
ali@55:     const char *s,*t;
ali@41:     char parastart[81];     /* first line of current para */
ali@41:     FILE *infile;
ali@41:     struct first_pass_results *first_pass_results;
ali@42:     struct warnings *warnings;
ali@43:     struct counters counters={0};
ali@45:     struct line_properties last={0};
ali@56:     struct parities parities={0};
ali@43:     int isemptyline;
ali@43:     long squot,start_para_line;
ali@55:     signed int i,llen,isacro,isellipsis;
ali@55:     signed int isnewpara;
ali@41:     char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
ali@41:       cbrack_err[80],unders_err[80];
ali@41:     signed int enddash;
ali@45:     last.start=CHAR_SPACE;
ali@41:     *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
ali@41:       *unders_err=*prevline=0;
ali@41:     linecnt=checked_linecnt=start_para_line=0;
ali@43:     squot=0;
ali@53:     i=llen=isacro=isellipsis=0;
ali@55:     isnewpara=enddash=0;
ali@41:     infile=fopen(filename,"rb");
ali@41:     if (!infile)
ali@41:     {
ali@41:         if (pswit[STDOUT_SWITCH])
ali@41:             fprintf(stdout,"bookloupe: cannot open %s\n",filename);
ali@41:         else
ali@41:             fprintf(stderr,"bookloupe: cannot open %s\n",filename);
ali@41: 	exit(1);
ali@41:     }
ali@41:     fprintf(stdout,"\n\nFile: %s\n\n",filename);
ali@41:     first_pass_results=first_pass(infile);
ali@42:     warnings=report_first_pass(first_pass_results);
ali@42:     rewind(infile);
ali@40:     /*
ali@40:      * Here we go with the main pass. Hold onto yer hat!
ali@40:      * Re-init some variables we've dirtied.
ali@40:      */
ali@43:     squot=linecnt=0;
ali@40:     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
ali@40:     {
ali@0:         linecnt++;
ali@40:         if (linecnt==1)
ali@40: 	    isnewpara=1;
ali@40:         if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
ali@40: 	    continue;    // skip DP page separators completely
ali@41:         if (linecnt<first_pass_results->firstline ||
ali@41: 	  (first_pass_results->footerline>0 &&
ali@41: 	  linecnt>first_pass_results->footerline))
ali@40: 	{
ali@40:             if (pswit[HEADER_SWITCH])
ali@40: 	    {
ali@40:                 if (!strncmp(aline,"Title:",6))
ali@40:                     printf("    %s\n",aline);
ali@40:                 if (!strncmp(aline,"Author:",7))
ali@40:                     printf("    %s\n",aline);
ali@40:                 if (!strncmp(aline,"Release Date:",13))
ali@40:                     printf("    %s\n",aline);
ali@40:                 if (!strncmp(aline,"Edition:",8))
ali@40:                     printf("    %s\n\n",aline);
ali@40: 	    }
ali@0:             continue;                /* skip through the header */
ali@40: 	}
ali@0:         checked_linecnt++;
ali@40:         s=aline;
ali@40:         /*
ali@40: 	 * If we are in a state of unbalanced quotes, and this line
ali@40:          * doesn't begin with a quote, output the stored error message.
ali@40:          * If the -P switch was used, print the warning even if the
ali@40:          * new para starts with quotes.
ali@40: 	 */
ali@40:         t=s;
ali@40:         while (*t==' ')
ali@40: 	    t++;
ali@0:         if (*dquote_err)
ali@40:             if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
ali@40: 	    {
ali@40:                 if (!pswit[OVERVIEW_SWITCH])
ali@40: 		{
ali@40:                     if (pswit[ECHO_SWITCH])
ali@40: 			printf("\n%s\n",parastart);
ali@0:                     printf(dquote_err);
ali@40: 		}
ali@0:                 else
ali@0:                     cnt_dquot++;
ali@0:             }
ali@40:         if (*squote_err)
ali@40: 	{
ali@40:             if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
ali@40: 	      pswit[QPARA_SWITCH] || squot)
ali@40: 	    {
ali@40:                 if (!pswit[OVERVIEW_SWITCH])
ali@40: 		{
ali@40:                     if (pswit[ECHO_SWITCH])
ali@40: 			printf("\n%s\n",parastart);
ali@0:                     printf(squote_err);
ali@40: 		}
ali@0:                 else
ali@0:                     cnt_squot++;
ali@40: 	    }
ali@40:             squot=0;
ali@40: 	}
ali@40:         if (*rbrack_err)
ali@40: 	{
ali@40:             if (!pswit[OVERVIEW_SWITCH])
ali@40: 	    {
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",parastart);
ali@0:                 printf(rbrack_err);
ali@40: 	    }
ali@0:             else
ali@0:                 cnt_brack++;
ali@40: 	}
ali@40:         if (*sbrack_err)
ali@40: 	{
ali@40:             if (!pswit[OVERVIEW_SWITCH])
ali@40: 	    {
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",parastart);
ali@0:                 printf(sbrack_err);
ali@40: 	    }
ali@0:             else
ali@0:                 cnt_brack++;
ali@40: 	}
ali@40:         if (*cbrack_err)
ali@40: 	{
ali@40:             if (!pswit[OVERVIEW_SWITCH])
ali@40: 	    {
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",parastart);
ali@0:                 printf(cbrack_err);
ali@40: 	    }
ali@0:             else
ali@0:                 cnt_brack++;
ali@40: 	}
ali@40:         if (*unders_err)
ali@40: 	{
ali@40:             if (!pswit[OVERVIEW_SWITCH])
ali@40: 	    {
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",parastart);
ali@0:                 printf(unders_err);
ali@40: 	    }
ali@0:             else
ali@0:                 cnt_brack++;
ali@40: 	}
ali@40:         *dquote_err=*squote_err=*rbrack_err=*cbrack_err= 
ali@40: 	  *sbrack_err=*unders_err=0;
ali@43: 	isemptyline=analyse_quotes(aline,&counters);
ali@40:         if (isnewpara && !isemptyline)
ali@40: 	{
ali@40: 	    /* This line is the start of a new paragraph. */
ali@40:             start_para_line=linecnt;
ali@40: 	    /* Capture its first line in case we want to report it later. */
ali@40:             strncpy(parastart,aline,80);
ali@40:             parastart[79]=0;
ali@56: 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
ali@40:             s=aline;
ali@40:             while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
ali@40: 		s++;
ali@40:             if (*s>='a' && *s<='z')
ali@40: 	    {
ali@40: 		/* and its first letter is lowercase */
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",aline);
ali@0:                 if (!pswit[OVERVIEW_SWITCH])
ali@40:                     printf("    Line %ld column %d - "
ali@40: 		      "Paragraph starts with lower-case\n",
ali@40: 		      linecnt,(int)(s-aline)+1);
ali@0:                 else
ali@0:                     cnt_punct++;
ali@40: 	    }
ali@40:             isnewpara=0; /* Signal the end of new para processing. */
ali@40: 	}
ali@40:         /* Check for an em-dash broken at line end. */
ali@40:         if (enddash && *aline=='-')
ali@40: 	{
ali@40:             if (pswit[ECHO_SWITCH])
ali@40: 		printf("\n%s\n",aline);
ali@0:             if (!pswit[OVERVIEW_SWITCH])
ali@40:                 printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@0:             else
ali@0:                 cnt_punct++;
ali@40: 	}
ali@40:         enddash=0;
ali@40:         for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
ali@40: 	    ;
ali@40:         if (s>=aline && *s=='-')
ali@40:             enddash=1;
ali@40: 	/*
ali@40:          * Check for invalid or questionable characters in the line
ali@40:          * Anything above 127 is invalid for plain ASCII, and
ali@40:          * non-printable control characters should also be flagged.
ali@40:          * Tabs should generally not be there.
ali@40: 	 */
ali@40:         for (s=aline;*s;s++)
ali@40: 	{
ali@40:             i=(unsigned char)*s;
ali@40:             if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
ali@40: 	    {
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",aline);
ali@0:                 if (!pswit[OVERVIEW_SWITCH])
ali@40:                     printf("    Line %ld column %d - Control character %d\n",
ali@40: 		      linecnt,(int)(s-aline)+1,i);
ali@0:                 else
ali@0:                     cnt_bin++;
ali@40: 	    }
ali@40: 	}
ali@42:         if (warnings->bin)
ali@44: 	    check_for_odd_characters(aline,warnings,isemptyline);
ali@42:         if (warnings->longline)
ali@45: 	    check_for_long_line(aline);
ali@45:         if (warnings->shortline)
ali@45: 	    check_for_short_line(aline,&last);
ali@45:         last.blen=last.len;
ali@45:         last.len=strlen(aline);
ali@45:         last.start=aline[0];
ali@46: 	check_for_starting_punctuation(aline);
ali@42:         if (warnings->dash)
ali@40: 	{
ali@47: 	    check_for_spaced_emdash(aline);
ali@47: 	    check_for_spaced_dash(aline);
ali@40: 	}
ali@48: 	check_for_unmarked_paragraphs(aline);
ali@49: 	check_for_jeebies(aline);
ali@50: 	check_for_mta_from(aline);
ali@51: 	check_for_orphan_character(aline);
ali@52: 	check_for_pling_scanno(aline);
ali@53: 	check_for_extra_period(aline,warnings);
ali@54: 	check_for_following_punctuation(aline);
ali@55: 	check_for_typos(aline,warnings);
ali@56: 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57: 	check_for_double_punctuation(aline,warnings);
ali@58: 	check_for_spaced_quotes(aline);
ali@59: 	check_for_miscased_genative(aline);
ali@40:         /*
ali@40: 	 * Now check special cases - start and end of line -
ali@40:          * for single and double quotes. Start is sometimes [sic]
ali@40:          * but better to query it anyway.
ali@40:          * While we're here, check for dash at end of line.
ali@40: 	 */
ali@40:         llen=strlen(aline);
ali@40:         if (llen>1)
ali@40: 	{
ali@40:             if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
ali@40: 	      aline[llen-1]==CHAR_OPEN_SQUOTE)
ali@40:                 if (aline[llen-2]==CHAR_SPACE)
ali@40: 		{
ali@40:                     if (pswit[ECHO_SWITCH])
ali@40: 			printf("\n%s\n",aline);
ali@0:                     if (!pswit[OVERVIEW_SWITCH])
ali@40:                         printf("    Line %ld column %d - Spaced quote?\n",
ali@40: 			  linecnt,llen);
ali@0:                     else
ali@0:                         cnt_punct++;
ali@40: 		}
ali@40:             if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
ali@40: 	      aline[1]==CHAR_SPACE)
ali@40: 	    {
ali@40: 		if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",aline);
ali@40: 		if (!pswit[OVERVIEW_SWITCH])
ali@40: 		    printf("    Line %ld column 1 - Spaced quote?\n",linecnt);
ali@40: 		else
ali@40: 		    cnt_punct++;
ali@40: 	    }
ali@40:             /*
ali@40: 	     * Dash at end of line may well be legit - paranoid mode only
ali@40:              * and don't report em-dash at line-end.
ali@40: 	     */
ali@42:             if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@40: 	    {
ali@40:                 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@40: 		    ;
ali@40:                 if (aline[i]=='-' && aline[i-1]!='-')
ali@40: 		{
ali@40:                     if (pswit[ECHO_SWITCH])
ali@40: 			printf("\n%s\n",aline);
ali@0:                     if (!pswit[OVERVIEW_SWITCH])
ali@40:                         printf("    Line %ld column %d - "
ali@40: 			  "Hyphen at end of line?\n",linecnt,i);
ali@40: 		}
ali@40: 	    }
ali@40: 	}
ali@40:         /*
ali@40: 	 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@40:          * If so, suspect a scanno like "a]most".
ali@40: 	 */
ali@40:         llen=strlen(aline);
ali@40:         for (i=1;i<llen-1;i++)
ali@40: 	{
ali@40: 	    /* for each bracket character in the line except 1st & last */
ali@40:             if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
ali@40: 	      gcisalpha(aline[i+1]))
ali@40: 	    {
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",aline);
ali@0:                 if (!pswit[OVERVIEW_SWITCH])
ali@40:                     printf("    Line %ld column %d - Unspaced bracket?\n",
ali@40: 		      linecnt,i);
ali@0:                 else
ali@0:                     cnt_punct++;
ali@40: 	    }
ali@40: 	}
ali@40:         llen=strlen(aline);
ali@42:         if (warnings->endquote)
ali@40: 	{
ali@40:             for (i=1;i<llen;i++)
ali@40: 	    {
ali@40: 		/* for each character in the line except 1st */
ali@40:                 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@40: 		{
ali@40: 		    if (pswit[ECHO_SWITCH])
ali@40: 			printf("\n%s\n",aline);
ali@40: 		    if (!pswit[OVERVIEW_SWITCH])
ali@40: 			printf("    Line %ld column %d - "
ali@40: 			  "endquote missing punctuation?\n",linecnt,i);
ali@40: 		    else
ali@40: 			cnt_punct++;
ali@40: 		}
ali@40: 	    }
ali@40: 	}
ali@40: 	/*
ali@40:          * Check for <HTML TAG>.
ali@40:          * If there is a < in the line, followed at some point
ali@40:          * by a > then we suspect HTML.
ali@40: 	 */
ali@40:         if (strstr(aline,"<") && strstr(aline,">"))
ali@40: 	{
ali@40:             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
ali@40:             if (i>0)
ali@40: 	    {
ali@40:                 strncpy(wrk,strstr(aline,"<"),i);
ali@40:                 wrk[i]=0;
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",aline);
ali@0:                 if (!pswit[OVERVIEW_SWITCH])
ali@40:                     printf("    Line %ld column %d - HTML Tag? %s \n",
ali@40: 		      linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
ali@0:                 else
ali@0:                     cnt_html++;
ali@40: 	    }
ali@40: 	}
ali@40:         /*
ali@40: 	 * Check for &symbol; HTML.
ali@40:          * If there is a & in the line, followed at
ali@40:          * some point by a ; then we suspect HTML.
ali@40: 	 */
ali@40:         if (strstr(aline,"&") && strstr(aline,";"))
ali@40: 	{
ali@40:             i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
ali@40:             for (s=strstr(aline,"&");s<strstr(aline,";");s++)   
ali@40:                 if (*s==CHAR_SPACE)
ali@40: 		    i=0;                /* Don't report "Jones & Son;" */
ali@40:             if (i>0)
ali@40: 	    {
ali@40:                 strncpy(wrk,strstr(aline,"&"),i);
ali@40:                 wrk[i]=0;
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",aline);
ali@0:                 if (!pswit[OVERVIEW_SWITCH])
ali@40:                     printf("    Line %ld column %d - HTML symbol? %s \n",
ali@40: 		      linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
ali@0:                 else
ali@0:                     cnt_html++;
ali@40: 	    }
ali@40: 	}
ali@40:         /*
ali@40: 	 * At end of paragraph, check for mismatched quotes.
ali@40:          * We don't want to report an error immediately, since it is a
ali@40:          * common convention to omit the quotes at end of paragraph if
ali@40:          * the next paragraph is a continuation of the same speaker.
ali@40:          * Where this is the case, the next para should begin with a
ali@40:          * quote, so we store the warning message and only display it
ali@40:          * at the top of the next iteration if the new para doesn't
ali@40:          * start with a quote.
ali@40:          * The -p switch overrides this default, and warns of unclosed
ali@40:          * quotes on _every_ paragraph, whether the next begins with a
ali@40:          * quote or not.
ali@40: 	 */
ali@40:         if (isemptyline)
ali@40: 	{
ali@40: 	    /* end of para - add up the totals */
ali@43:             if (counters.quot%2)
ali@40:                 sprintf(dquote_err,"    Line %ld - Mismatched quotes\n",
ali@40: 		  linecnt);
ali@43:             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
ali@43: 	      counters.open_single_quote!=counters.close_single_quote)
ali@40:                 sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n",
ali@40: 		  linecnt);
ali@43:             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
ali@43: 	      counters.open_single_quote!=counters.close_single_quote &&
ali@43: 	      counters.open_single_quote!=counters.close_single_quote+1)
ali@40: 		/*
ali@40: 		 * Flag it to be noted regardless of the
ali@40: 		 * first char of the next para.
ali@40: 		 */
ali@40:                 squot=1;
ali@43:             if (counters.r_brack)
ali@40:                 sprintf(rbrack_err,"    Line %ld - "
ali@40: 		  "Mismatched round brackets?\n",linecnt);
ali@43:             if (counters.s_brack)
ali@40:                 sprintf(sbrack_err,"    Line %ld - "
ali@40: 		  "Mismatched square brackets?\n",linecnt);
ali@43:             if (counters.c_brack)
ali@40:                 sprintf(cbrack_err,"    Line %ld - "
ali@40: 		  "Mismatched curly brackets?\n",linecnt);
ali@43:             if (counters.c_unders%2)
ali@40:                 sprintf(unders_err,"    Line %ld - Mismatched underscores?\n",
ali@40: 		  linecnt);
ali@43: 	    memset(&counters,0,sizeof(counters));
ali@40: 	    /* let the next iteration know that it's starting a new para */
ali@40:             isnewpara=1;
ali@40: 	}
ali@40:         /*
ali@40: 	 * Check for omitted punctuation at end of paragraph by working back
ali@40: 	 * through prevline. DW.
ali@40:          * Need to check this only for "normal" paras.
ali@40:          * So what is a "normal" para?
ali@40:          *    Not normal if one-liner (chapter headings, etc.)
ali@40:          *    Not normal if doesn't contain at least one locase letter
ali@40:          *    Not normal if starts with space
ali@40: 	 */
ali@40:         if (isemptyline)
ali@40: 	{
ali@40: 	    /* end of para */
ali@40:             for (s=prevline,i=0;*s && !i;s++)
ali@0:                 if (gcisletter(*s))
ali@40: 		    /* use i to indicate the presence of a letter on the line */
ali@40:                     i=1;
ali@40:             /*
ali@40: 	     * This next "if" is a problem.
ali@40:              * If we say "start_para_line <= linecnt - 1", that includes
ali@40: 	     * one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@40:              * If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@40:              * misses genuine one-line paragraphs.
ali@40: 	     */
ali@45:             if (i && last.blen>2 && start_para_line<linecnt-1 &&
ali@40: 	      *prevline>CHAR_SPACE)
ali@40: 	    {
ali@40:                 for (i=strlen(prevline)-1;
ali@40: 		  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
ali@40: 		  prevline[i]>CHAR_SPACE && i>0;
ali@40: 		  i--)
ali@40: 		    ;
ali@40:                 for (;i>0;i--)
ali@40: 		{
ali@40:                     if (gcisalpha(prevline[i]))
ali@40: 		    {
ali@40:                         if (pswit[ECHO_SWITCH])
ali@40: 			    printf("\n%s\n",prevline);
ali@0:                         if (!pswit[OVERVIEW_SWITCH])
ali@40:                             printf("    Line %ld column %d - "
ali@40: 			      "No punctuation at para end?\n",
ali@40: 			      linecnt-1,strlen(prevline));
ali@0:                         else
ali@0:                             cnt_punct++;
ali@0:                         break;
ali@40: 		    }
ali@40:                     if (strchr("-.:!([{?}])",prevline[i]))
ali@0:                         break;
ali@40: 		}
ali@40: 	    }
ali@40: 	}
ali@40:         strcpy(prevline,aline);
ali@0:     }
ali@40:     fclose(infile);
ali@0:     if (!pswit[OVERVIEW_SWITCH])
ali@40:         for (i=0;i<MAX_QWORD;i++)
ali@0:             if (dupcnt[i])
ali@40:                 printf("\nNote: Queried word %s was duplicated %d time%s\n",
ali@40: 		  qword[i],dupcnt[i],"s");
ali@0: }
ali@0: 
ali@40: /*
ali@40:  * flgets:
ali@40:  *
ali@40:  * Get one line from the input stream, checking for
ali@40:  * the existence of exactly one CR/LF line-end per line.
ali@40:  *
ali@40:  * Returns: a pointer to the line.
ali@40:  */
ali@40: char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
ali@0: {
ali@0:     char c;
ali@40:     int len,isCR,cint;
ali@40:     *theline=0;
ali@40:     len=isCR=0;
ali@40:     c=cint=fgetc(thefile);
ali@40:     do
ali@40:     {
ali@40:         if (cint==EOF)
ali@40:             return NULL;
ali@40: 	/* either way, it's end of line */
ali@40:         if (c==10)
ali@40: 	{
ali@0:             if (isCR)
ali@0:                 break;
ali@40:             else
ali@40: 	    {
ali@40: 		/* Error - a LF without a preceding CR */
ali@40:                 if (pswit[LINE_END_SWITCH])
ali@40: 		{
ali@40:                     if (pswit[ECHO_SWITCH])
ali@40: 			printf("\n%s\n",theline);
ali@0:                     if (!pswit[OVERVIEW_SWITCH])
ali@40:                         printf("    Line %ld - No CR?\n",lcnt);
ali@0:                     else
ali@0:                         cnt_lineend++;
ali@40: 		}
ali@0:                 break;
ali@40: 	    }
ali@40: 	}
ali@40:         if (c==13)
ali@40: 	{
ali@40:             if (isCR)
ali@40: 	    {
ali@40: 		/* Error - two successive CRs */
ali@40:                 if (pswit[LINE_END_SWITCH])
ali@40: 		{
ali@40:                     if (pswit[ECHO_SWITCH])
ali@40: 			printf("\n%s\n",theline);
ali@0:                     if (!pswit[OVERVIEW_SWITCH])
ali@40:                         printf("    Line %ld - Two successive CRs?\n",lcnt);
ali@0:                     else
ali@0:                         cnt_lineend++;
ali@40: 		}
ali@40: 	    }
ali@40:             isCR=1;
ali@40: 	}
ali@40:         else
ali@40: 	{
ali@40:             if (pswit[LINE_END_SWITCH] && isCR)
ali@40: 	    {
ali@40:                 if (pswit[ECHO_SWITCH])
ali@40: 		    printf("\n%s\n",theline);
ali@0:                 if (!pswit[OVERVIEW_SWITCH])
ali@40:                     printf("    Line %ld column %d - CR without LF?\n",
ali@40: 		      lcnt,len+1);
ali@0:                 else
ali@0:                     cnt_lineend++;
ali@40: 	    }
ali@40:             theline[len]=c;
ali@40:             len++;
ali@40:             theline[len]=0;
ali@40:             isCR=0;
ali@40: 	}
ali@40:         c=cint=fgetc(thefile);
ali@40:     } while(len<maxlen);
ali@0:     if (pswit[MARKUP_SWITCH])  
ali@0:         postprocess_for_HTML(theline);
ali@0:     if (pswit[DP_SWITCH])  
ali@0:         postprocess_for_DP(theline);
ali@40:     return theline;
ali@0: }
ali@0: 
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it contains a
 * mixture of letters and digits. Generally, this is an error, but the
 * following forms are accepted as legitimate:
 *
 *   - leading digits followed by st, rd, nd or th (any case), by the
 *     plurals sts, rds, nds, ths, or by stly, rdly, ndly, thly;
 *   - leading digits followed by a single l, L, s or d (as in
 *     "12l. 3s. 11d.");
 *   - a capital L followed by something that is not itself queried,
 *     such as L500 for British pounds.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    int wehaveadigit=0,wehavealetter=0,firstdigits,tail;
    char *s,*suffix;
    for (s=checkword;*s;s++)
    {
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    }
    if (!wehaveadigit || !wehavealetter)
        return 0;
    /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
    for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
        ;
    /* suffix is whatever follows the leading digits; tail is its length */
    suffix=checkword+firstdigits;
    tail=(int)strlen(suffix);
    /* digits, ending in st, rd, nd, th of either case */
    if (tail==2 && (matchword(suffix,"st") || matchword(suffix,"rd") ||
      matchword(suffix,"nd") || matchword(suffix,"th")))
        return 0;
    if (tail==3 && (matchword(suffix,"sts") || matchword(suffix,"rds") ||
      matchword(suffix,"nds") || matchword(suffix,"ths")))
        return 0;
    /*
     * The four-letter forms need a four-character suffix. Comparing them
     * against a three-character one can never match and would step back
     * before the start of a short word.
     */
    if (tail==4 && (matchword(suffix,"stly") || matchword(suffix,"rdly") ||
      matchword(suffix,"ndly") || matchword(suffix,"thly")))
        return 0;
    /* digits, ending in l, L, s or d */
    if (tail==1 && (*suffix=='l' || *suffix=='L' || *suffix=='s' ||
      *suffix=='d'))
        return 0;
    /*
     * L at the start of a number, representing British pounds, like L500.
     * We know the current word mixes letters and digits. If the first
     * letter is L and the rest of the word is not itself queried, the rest
     * can only be digits (possibly in one of the accepted forms above),
     * and we accept that as a non-error.
     */
    if (checkword[0]=='L' && !mixdigit(checkword+1))
        return 0;
    return 1;
}
ali@0: 
ali@40: /*
ali@40:  * getaword:
ali@40:  *
ali@40:  * Extracts the first/next "word" from the line, and puts
ali@40:  * it into "thisword". A word is defined as one English word unit--or
ali@40:  * at least that's the aim.
ali@40:  *
ali@40:  * Returns: a pointer to the position in the line where we will start
ali@40:  *          looking for the next word.
ali@40:  */
ali@54: const char *getaword(const char *fromline,char *thisword)
ali@0: {
ali@40:     int i,wordlen;
ali@54:     const char *s;
ali@40:     wordlen=0;
ali@40:     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
ali@40:       fromline++)
ali@40: 	;
ali@40:     /*
ali@40:      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40:      * Especially yucky is the case of L1,000
ali@40:      * This section looks for a pattern of characters including a digit
ali@40:      * followed by a comma or period followed by one or more digits.
ali@40:      * If found, it returns this whole pattern as a word; otherwise we discard
ali@40:      * the results and resume our normal programming.
ali@40:      */
ali@40:     s=fromline;
ali@40:     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
ali@40:       wordlen<MAXWORDLEN;s++)
ali@40:     {
ali@40: 	thisword[wordlen]=*s;
ali@0:         wordlen++;
ali@40:     }
ali@40:     thisword[wordlen]=0;
ali@40:     for (i=1;i<wordlen-1;i++)
ali@40:     {
ali@40:         if (thisword[i]=='.' || thisword[i]==',')
ali@40: 	{
ali@40:             if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
ali@40: 	    {
ali@40:                 fromline=s;
ali@40:                 return fromline;
ali@40: 	    }
ali@40: 	}
ali@40:     }
ali@0:     /* we didn't find a punctuated number - do the regular getword thing */
ali@40:     wordlen=0;
ali@40:     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
ali@40:       wordlen<MAXWORDLEN;fromline++)
ali@40:     {
ali@40:         thisword[wordlen]=*fromline;
ali@0:         wordlen++;
ali@40:     }
ali@40:     thisword[wordlen]=0;
ali@40:     return fromline;
ali@0: }
ali@0: 
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * Returns: 1 if the two strings have the same length and are equal when
 *          case is ignored, 0 otherwise.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len;
    len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /*
         * <ctype.h> functions need a value representable as unsigned char;
         * a plain (possibly negative) char from an 8-bit text is not.
         */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0;
    }
    return 1;
}
ali@0: 
/*
 * lowerit:
 *
 * Convert the ASCII capitals A-Z in the line to lower case, in place.
 * All other characters, including 8-bit ones, are left untouched.
 */
void lowerit(char *theline)
{
    char *p=theline;
    while (*p!='\0')
    {
        if ('A'<=*p && *p<='Z')
            *p=(char)(*p-'A'+'a');
        p++;
    }
}
ali@0: 
/*
 * isroman:
 *
 * Does this (lower-case) word LOOK like a Roman numeral?
 *
 * This is deliberately not a validator--it will, for example, happily
 * accept mxxxxxxxxxx. The word is matched from left to right against the
 * following groups, in this order: any number of m, an optional d, an
 * optional cm, an optional cd, any number of c, an optional xl, an
 * optional xc, an optional l, any number of x, an optional ix, an
 * optional iv, an optional v and any number of i. The word passes if
 * nothing is left over at the end.
 *
 * Returns: 1 if the word passes, 0 if it doesn't or if it is NULL or empty.
 */
int isroman(char *t)
{
    /* the groups in matching order; "many" marks the repeatable ones */
    static const struct
    {
        const char *group;
        int many;
    } grammar[]=
    {
        {"m",1},{"d",0},{"cm",0},{"cd",0},{"c",1},{"xl",0},{"xc",0},
        {"l",0},{"x",1},{"ix",0},{"iv",0},{"v",0},{"i",1}
    };
    size_t step,glen;
    if (t==NULL || *t=='\0')
        return 0;
    for (step=0;step<sizeof grammar/sizeof grammar[0];step++)
    {
        glen=strlen(grammar[step].group);
        while (strncmp(t,grammar[step].group,glen)==0)
        {
            t+=glen;
            if (!grammar[step].many)
                break;
        }
    }
    return *t=='\0';
}
ali@0: 
/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * The standard function treats 8-bit accented characters as word breaks,
 * so that a word such as tete written with accents would be split into
 * fragments and errors would be over-reported. gcisalpha() therefore also
 * accepts the accented letters of the CP1252 (Windows) and ISO-8859-1
 * character sets, which are the most common PG 8-bit types.
 *
 * Returns: 1 for A-Z, a-z, the codes 140, 142, 156, 158 and 159, and every
 *          code from 192 up except 208, 215, 222, 240, 247 and 254;
 *          0 for anything else.
 */
int gcisalpha(unsigned char c)
{
    if ((c>='A' && c<='Z') || (c>='a' && c<='z'))
        return 1;
    switch (c)
    {
    /* the letters that live below 192 */
    case 140: case 142: case 156: case 158: case 159:
        return 1;
    /* codes in the upper range that are not accepted */
    case 208: case 215: case 222: case 240: case 247: case 254:
        return 0;
    default:
        return c>=192;
    }
}
ali@0: 
/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts.
 *
 * Returns: 1 for the characters '0' to '9', 0 for anything else.
 */
int gcisdigit(unsigned char c)
{
    if (c<'0')
        return 0;
    return c<='9';
}
ali@0: 
/*
 * gcisletter:
 *
 * A letter test that doesn't get confused in 8-bit texts.
 * NB: this is ISO-8859-1-specific. Unlike gcisalpha(), every code from
 * 192 upwards counts as a letter here, with no exceptions.
 *
 * Returns: 1 for A-Z, a-z and any code >= 192, 0 for anything else.
 */
int gcisletter(unsigned char c)
{
    if (c>=192)
        return 1;
    if (c>='a')
        return c<='z';
    return c>='A' && c<='Z';
}
ali@0: 
/*
 * gcstrchr:
 *
 * strchr() with one difference: searching for the zero character yields
 * NULL rather than a pointer to the string's terminator.
 */
char *gcstrchr(char *s,char c)
{
    return c ? strchr(s,c) : NULL;
}
ali@0: 
ali@40: /*
ali@40:  * postprocess_for_DP:
ali@40:  *
ali@40:  * Invoked with the -d switch from flgets().
ali@40:  * It simply "removes" from the line a hard-coded set of common
ali@40:  * DP-specific tags, so that the line passed to the main routine has
ali@40:  * been pre-cleaned of DP markup.
ali@40:  */
ali@0: void postprocess_for_DP(char *theline)
ali@0: {
ali@40:     char *s,*t;
ali@0:     int i;
ali@0:     if (!*theline) 
ali@0:         return;
ali@40:     for (i=0;*DPmarkup[i];i++)
ali@40:     {
ali@40:         s=strstr(theline,DPmarkup[i]);
ali@40:         while (s)
ali@40: 	{
ali@40:             t=s+strlen(DPmarkup[i]);
ali@40:             while (*t)
ali@40: 	    {
ali@40:                 *s=*t;
ali@40:                 t++;
ali@40: 		s++;
ali@40: 	    }
ali@40:             *s=0;
ali@40:             s=strstr(theline,DPmarkup[i]);
ali@40: 	}
ali@40:     }
ali@0: }
ali@0: 
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Pre-cleans the line of HTML before it reaches the main routine:
 * losemarkup() is called repeatedly to strip tags (only attempted when
 * the line contains both a '<' and a '>'), then loseentities() is called
 * repeatedly to substitute entities, each until it reports nothing more
 * to do.
 */
void postprocess_for_HTML(char *theline)
{
    if (strchr(theline,'<') && strchr(theline,'>'))
    {
        for (;;)
        {
            if (!losemarkup(theline))
                break;
        }
    }
    for (;;)
    {
        if (!loseentities(theline))
            break;
    }
}
ali@0: 
ali@0: char *losemarkup(char *theline)
ali@0: {
ali@40:     char *s,*t;
ali@0:     int i;
ali@0:     if (!*theline) 
ali@40:         return NULL;
ali@40:     s=strstr(theline,"<");
ali@40:     t=strstr(theline,">");
ali@40:     if (!s || !t)
ali@40: 	return NULL;
ali@40:     for (i=0;*markup[i];i++)
ali@40:         if (!tagcomp(s+1,markup[i]))
ali@40: 	{
ali@40:             if (!t[1])
ali@40: 	    {
ali@40:                 *s=0;
ali@40:                 return s;
ali@40: 	    }
ali@40:             else if (t>s)
ali@40: 	    {
ali@40: 		strcpy(s,t+1);
ali@40: 		return s;
ali@40: 	    }
ali@0:         }
ali@40:     /* It's an unrecognized <xxx>. */
ali@40:     return NULL;
ali@0: }
ali@0: 
ali@0: char *loseentities(char *theline)
ali@0: {
ali@0:     int i;
ali@40:     char *s,*t;
ali@0:     if (!*theline) 
ali@40:         return NULL;
ali@40:     for (i=0;*entities[i].htmlent;i++)
ali@40:     {
ali@40:         s=strstr(theline,entities[i].htmlent);
ali@40:         if (s)
ali@40: 	{
ali@40:             t=malloc((size_t)strlen(s));
ali@40:             if (!t)
ali@40: 		return NULL;
ali@40:             strcpy(t,s+strlen(entities[i].htmlent));
ali@40:             strcpy(s,entities[i].textent);
ali@40:             strcat(s,t);
ali@0:             free(t);
ali@40:             return theline;
ali@40: 	}
ali@40:     }
ali@40:     for (i=0;*entities[i].htmlnum;i++)
ali@40:     {
ali@40:         s=strstr(theline,entities[i].htmlnum);
ali@40:         if (s)
ali@40: 	{
ali@40:             t=malloc((size_t)strlen(s));
ali@40:             if (!t)
ali@40: 		return NULL;
ali@40:             strcpy(t,s+strlen(entities[i].htmlnum));
ali@40:             strcpy(s,entities[i].textent);
ali@40:             strcat(s,t);
ali@0:             free(t);
ali@40:             return theline;
ali@40: 	}
ali@40:     }
ali@40:     return NULL;
ali@0: }
ali@0: 
/*
 * tagcomp:
 *
 * Checks whether strin, the text following a '<', starts with the tag
 * name basetag. One leading slash in strin is ignored, so closing tags
 * match too, and case is not significant. Whatever follows the tag name
 * in strin (attributes, the closing '>') is not examined.
 *
 * Returns: 0 if strin starts with basetag, 1 if it doesn't (which
 *          includes strin ending before the whole of basetag was seen).
 */
int tagcomp(char *strin,char *basetag)
{
    if (*strin=='/')
        strin++; /* ignore a slash */
    for (;*basetag;strin++,basetag++)
    {
        /*
         * The casts keep 8-bit characters within the range <ctype.h>
         * accepts. If strin runs out early, its NUL differs from the
         * tag character and we correctly report a mismatch.
         */
        if (tolower((unsigned char)*strin)!=tolower((unsigned char)*basetag))
            return 1;
    }
    return 0;
}
ali@0: 
ali@40: void proghelp()
ali@0: {
ali@40:     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40:     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
ali@40:     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
ali@40:     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40:       "For details, read the file COPYING.\n",stderr);
ali@40:     fputs("This is Free Software; "
ali@40:       "you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40:     fputs("read the file COPYING for details.\n\n",stderr);
ali@40:     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
ali@40:     fputs("  where -s checks single quotes, -e suppresses echoing lines, "
ali@40:       "-t checks typos\n",stderr);
ali@40:     fputs("  -x (paranoid) switches OFF -t and extra checks, "
ali@40:       "-l turns OFF line-end checks\n",stderr);
ali@40:     fputs("  -o just displays overview without detail, "
ali@40:       "-h echoes header fields\n",stderr);
ali@40:     fputs("  -v (verbose) unsuppresses duplicate reporting, "
ali@40:       "-m suppresses markup\n",stderr);
ali@0:     fputs("  -d ignores DP-specific markup,\n",stderr);
ali@40:     fputs("  -u uses a file gutcheck.typ to query user-defined "
ali@40:       "possible typos\n",stderr);
ali@40:     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
ali@0:     fputs("\n",stderr);
ali@40:     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
ali@40:       stderr);
ali@40:     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40:       "non-ASCII\n",stderr);
ali@40:     fputs("characters like accented letters, "
ali@40:       "lines longer than 75 or shorter than 55,\n",stderr);
ali@40:     fputs("unbalanced quotes or brackets, "
ali@40:       "a variety of badly formatted punctuation, \n",stderr);
ali@40:     fputs("HTML tags, some likely typos. "
ali@40:       "It is NOT a substitute for human judgement.\n",stderr);
ali@0:     fputs("\n",stderr);
ali@0: }