gutcheck/gutcheck.c
author ali <ali@juiblex.co.uk>
Tue Jan 24 23:54:05 2012 +0000 (2012-01-24)
changeset 0 c2f4c0285180
permissions -rw-r--r--
Initial version
ali@0
     1
/*************************************************************************/
ali@0
     2
/* gutcheck - check for assorted weirdnesses in a PG candidate text file */
ali@0
     3
/*                                                                       */
ali@0
     4
/* Version 0.991                                                         */
ali@0
     5
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */
ali@0
     6
/*                                                                       */
ali@0
     7
/* This program is free software; you can redistribute it and/or modify  */
ali@0
     8
/* it under the terms of the GNU General Public License as published by  */
ali@0
     9
/* the Free Software Foundation; either version 2 of the License, or     */
ali@0
    10
/* (at your option) any later version.                                   */
ali@0
    11
/*                                                                       */
ali@0
    12
/* This program is distributed in the hope that it will be useful,       */
ali@0
    13
/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
ali@0
    14
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         */
ali@0
    15
/* GNU General Public License for more details.                          */
ali@0
    16
/*                                                                       */
ali@0
    17
/* You should have received a copy of the GNU General Public License     */
ali@0
    18
/* along with this program; if not, write to the                         */
ali@0
    19
/*      Free Software Foundation, Inc.,                                  */
ali@0
    20
/*      59 Temple Place,                                                 */
ali@0
    21
/*      Suite 330,                                                       */
ali@0
    22
/*      Boston, MA  02111-1307  USA                                      */
ali@0
    23
/*                                                                       */
ali@0
    24
/*                                                                       */
ali@0
    25
/*                                                                       */
ali@0
    26
/* Overview comments:                                                    */
ali@0
    27
/*                                                                       */
ali@0
    28
/* If you're reading this, you're either interested in how to detect     */
ali@0
    29
/* formatting errors, or very very bored.                                */
ali@0
    30
/*                                                                       */
ali@0
    31
/* Gutcheck is a homebrew formatting checker specifically for            */
ali@0
    32
/* spotting common formatting problems in a PG e-text. I typically       */
ali@0
    33
/* run it once or twice on a file I'm about to submit; it usually        */
ali@0
    34
/* finds a few formatting problems. It also usually finds lots of        */
ali@0
    35
/* queries that aren't problems at all; it _really_ doesn't like         */
ali@0
    36
/* the standard PG header, for example.  It's optimized for straight     */
ali@0
    37
/* prose; poetry and non-fiction involving tables tend to trigger        */
ali@0
    38
/* false alarms.                                                         */
ali@0
    39
/*                                                                       */
ali@0
    40
/* The code of gutcheck is not very interesting, but the experience      */
ali@0
    41
/* of what constitutes a possible error may be, and the best way to      */
ali@0
    42
/* illustrate that is by example.                                        */
ali@0
    43
/*                                                                       */
ali@0
    44
/*                                                                       */
ali@0
    45
/* Here are some common typos found in PG texts that gutcheck            */
ali@0
    46
/* will flag as errors:                                                  */
ali@0
    47
/*                                                                       */
ali@0
    48
/* "Look!John , over there!"                                             */
ali@0
    49
/* <this is a HTML tag>                                                  */
ali@0
    50
/* &so is this;                                                          */
ali@0
    51
/* Margaret said: " Now you should start for school."                    */
ali@0
    52
/* Margaret said: "Now you should start for school. (if end of para)     */
ali@0
    53
/* The horse is said to he worth a lot.                                  */
ali@0
    54
/* 0K - this'11 make you look close1y.                                   */
ali@0
    55
/* "If you do. you'll regret it!"                                        */
ali@0
    56
/*                                                                       */
ali@0
    57
/* There are some complications . The extra space left around that       */
ali@0
    58
/* period was an error . . . but that ellipsis wasn't.                   */
ali@0
    59
/*                                                                       */
ali@0
    60
/* The last line of a paragraph                                          */
ali@0
    61
/* is usually short.                                                     */
ali@0
    62
/*                                                                       */
ali@0
    63
/* This period is an error.But the periods in a.m. aren't.               */
ali@0
    64
/*                                                                       */
ali@0
    65
/* Checks that are do-able but not (well) implemented are:               */
ali@0
    66
/*        Single-quote checking.                                         */
ali@0
    67
/*          Despite 3 attempts at it, singlequote checking is still      */
ali@0
    68
/*          crap in gutcheck. It may not be possible without analysis    */
ali@0
    69
/*          of the whole paragraph.                                      */
ali@0
    70
/*                                                                       */
ali@0
    71
/*************************************************************************/
ali@0
    72
ali@0
    73
ali@0
    74
#include <stdio.h>
ali@0
    75
#include <stdlib.h>
ali@0
    76
#include <string.h>
ali@0
    77
#include <ctype.h>
ali@0
    78
ali@0
    79
#define MAXWORDLEN    80    /* max length of one word             */
ali@0
    80
#define LINEBUFSIZE 2048    /* buffer size for an input line      */
ali@0
    81
ali@0
    82
#define MAX_USER_TYPOS 1000
ali@0
    83
#define USERTYPO_FILE "gutcheck.typ"
ali@0
    84
ali@0
    85
#ifndef MAX_PATH
ali@0
    86
#define MAX_PATH 16384
ali@0
    87
#endif
ali@0
    88
ali@0
    89
/* Two file-scope line buffers, each LINEBUFSIZE bytes long.             */
/* NOTE(review): presumably aline holds the line currently being checked */
/* and prevline the one before it -- confirm against the code that fills */
/* them, which is not visible here.                                      */
char aline[LINEBUFSIZE];
ali@0
    90
char prevline[LINEBUFSIZE];
ali@0
    91
ali@0
    92
                 /* Common typos. */
ali@0
    93
/* Built-in table of suspect words. Every entry is lower case and the    */
/* table is terminated by an empty-string sentinel ("").                 */
char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad",
ali@0
    94
                "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om",
ali@0
    95
                "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
ali@0
    96
                "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign",
ali@0
    97
                "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere",
ali@0
    98
                "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev",
ali@0
    99
                "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk",
ali@0
   100
                "owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
ali@0
   101
                "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry",
ali@0
   102
                "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
ali@0
   103
                "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae",
ali@0
   104
                "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
ali@0
   105
                "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
ali@0
   106
                "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera",
ali@0
   107
                "yeras", "yersa", "yoiu", "youve", "ytou", "yuor",
ali@0
   108
                /* added h/b words for version 12 - removed a few with "tbe" v.25 */
ali@0
   109
                "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind", 
ali@0
   110
                "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates", 
ali@0
   111
                "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing", 
ali@0
   112
                "helieve", "henefit", "hetter", "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh", 
ali@0
   113
                "meanwbile", "memher", "memhers", "numher", "numhers", 
ali@0
   114
                "perbaps", "prohlem", "puhlic", "witbout", 
ali@0
   115
                /* and a few more for .18 */
ali@0
   116
                "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo",
ali@0
   117
                "heside", "chapteb", "chaptee", "se",
ali@0
   118
                 ""};
ali@0
   119
ali@0
   120
/* Room for up to MAX_USER_TYPOS user-supplied typo words.               */
/* NOTE(review): presumably filled from USERTYPO_FILE when the U switch  */
/* is given -- confirm against the loader, which is not visible here.    */
char *usertypo[MAX_USER_TYPOS];
ali@0
   121
ali@0
   122
                 /* Common abbreviations and other OK words not to query as typos. */
ali@0
   123
                 /* 0.99 last-minute - removed "ms"      */
ali@0
   124
/* Lower-case words that are acceptable as they stand; the table is      */
/* terminated by an empty-string sentinel ("").                          */
char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br",
ali@0
   125
                  "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian",
ali@0
   126
                  "hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten",
ali@0
   127
                  ""};
ali@0
   128
ali@0
   129
                 /* Common abbreviations that cause otherwise unexplained periods. */
ali@0
   130
/* Lower-case abbreviations, listed without their trailing period; the   */
/* table is terminated by an empty-string sentinel ("").                 */
char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit",
ali@0
   131
                  "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
ali@0
   132
                  ""};
ali@0
   133
                 /* Two-Letter combinations that rarely if ever start words, */
ali@0
   134
                 /* but are common scannos or otherwise common letter        */
ali@0
   135
                 /* combinations.                                            */
ali@0
   136
/* Every entry is exactly two lower-case letters; the table is           */
/* terminated by an empty-string sentinel ("").                          */
char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl",
ali@0
   137
                    "tn", "rn", "lt", "tj",
ali@0
   138
                    "" };
ali@0
   139
ali@0
   140
                 /* Two-Letter combinations that rarely if ever end words    */
ali@0
   141
                 /* but are common scannos or otherwise common letter        */
ali@0
   142
                 /* combinations                                             */
ali@0
   143
/* Every entry is exactly two lower-case letters; the table is           */
/* terminated by an empty-string sentinel ("").                          */
char *noend[]   = { "cb", "gb", "pb", "sb", "tb", 
ali@0
   144
                    "wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl",
ali@0
   145
                    "iy",
ali@0
   146
                    ""};
ali@0
   147
ali@0
   148
/* Lower-case HTML element names, stored without angle brackets and      */
/* terminated by an empty-string sentinel ("").                          */
/* NOTE(review): presumably the set of tags recognised as markup when    */
/* the M switch ("Ignore markup in < >") is in effect -- confirm.        */
char *markup[]  = { "a", "b", "big", "blockquote", "body", "br", "center", 
ali@0
   149
                    "col", "div", "em", "font", "h1", "h2", "h3", "h4", 
ali@0
   150
                    "h5", "h6", "head", "hr", "html", "i", "img", "li", 
ali@0
   151
                    "meta", "ol", "p", "pre", "small", "span", "strong", 
ali@0
   152
                    "sub", "sup", "table", "td", "tfoot", "thead", "title", 
ali@0
   153
                    "tr", "tt", "u", "ul", 
ali@0
   154
                    ""};
ali@0
   155
ali@0
   156
/* Literal markup strings, stored complete with their delimiters and     */
/* terminated by an empty-string sentinel ("").                          */
/* NOTE(review): presumably the "DP-specific markup" that the D switch   */
/* ignores -- confirm against the code that scans this table.            */
char *DPmarkup[] = { "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>",
ali@0
   157
                    ""}; /* <tb> added .991 */
ali@0
   158
ali@0
   159
/* Lower-case word list terminated by an empty-string sentinel ("").     */
/* NOTE(review): judging by the name, these are words that should not be */
/* directly followed by a comma -- confirm against the punctuation check.*/
/* NOTE(review): "mrs" is listed twice (first and third rows); harmless  */
/* for a linear lookup, but redundant.                                   */
char *nocomma[]  = { "the", "it's", "their", "an", "mrs", "a", "our", "that's",
ali@0
   160
                     "its", "whose", "every", "i'll", "your", "my", 
ali@0
   161
                     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd", 
ali@0
   162
                     "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", 
ali@0
   163
                     "i'm", "during", "let", "toward", "among",
ali@0
   164
                     ""};
ali@0
   165
ali@0
   166
ali@0
   167
/* Lower-case word list terminated by an empty-string sentinel ("").     */
/* NOTE(review): judging by the name, these are words that should not be */
/* directly followed by a period -- confirm against the punctuation      */
/* check.                                                                */
char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or", 
ali@0
   168
                     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether", 
ali@0
   169
                     "i'll", "whose", "who", "because", "when", "let", "till", "very",
ali@0
   170
                     "an", "among", "those", "into", "whom", "having", "thence",
ali@0
   171
                     ""}; 
ali@0
   172
ali@0
   173
ali@0
   174
/* Lower-case vowels: the five plain ASCII ones followed by accented     */
/* forms.                                                                */
/* NOTE(review): the accented letters are raw non-ASCII bytes in the     */
/* literal, so the array's contents depend on the encoding this source   */
/* file is saved and compiled in -- confirm it is kept as single-byte    */
/* ISO-8859-1 rather than UTF-8.                                         */
char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";  /* Carlo's old suggestion, updated .991 */
ali@0
   175
ali@0
   176
/* Table of HTML character entities. Each row gives the named form, the  */
/* equivalent numeric form, and a plain-text replacement string.         */
/* The initialiser is written without inner braces, so every three       */
/* consecutive strings make up one row.                                  */
/* The final row supplies only two empty strings: it is the end-of-table */
/* sentinel, and its textent member is left as a null pointer.           */
/* NOTE(review): "&pound;" and "&deg;" each appear twice with identical  */
/* values (near the top and again in numeric order); redundant but       */
/* harmless if lookups stop at the first match -- confirm.               */
struct {
ali@0
   177
    char *htmlent;   /* named form, e.g. "&amp;"              */
ali@0
   178
    char *htmlnum;   /* numeric form, e.g. "&#38;"            */
ali@0
   179
    char *textent;   /* plain-text replacement, e.g. "&"      */
ali@0
   180
    } entities[] = { "&amp;",           "&#38;",        "&", 
ali@0
   181
                     "&lt;",            "&#60;",        "<",
ali@0
   182
                     "&gt;",            "&#62;",        ">",
ali@0
   183
                     "&deg;",           "&#176;",       " degrees",
ali@0
   184
                     "&pound;",         "&#163;",       "L",
ali@0
   185
                     "&quot;",          "&#34;",        "\"",   /* -- quotation mark = APL quote, */
ali@0
   186
                     "&OElig;",         "&#338;",       "OE",  /* -- latin capital ligature OE, */
ali@0
   187
                     "&oelig;",         "&#339;",       "oe",  /* -- latin small ligature oe, U+0153 ISOlat2 --> */
ali@0
   188
                     "&Scaron;",        "&#352;",       "S",  /* -- latin capital letter S with caron, */
ali@0
   189
                     "&scaron;",        "&#353;",       "s",  /* -- latin small letter s with caron, */
ali@0
   190
                     "&Yuml;",          "&#376;",       "Y",  /* -- latin capital letter Y with diaeresis, */
ali@0
   191
                     "&circ;",          "&#710;",       "",  /* -- modifier letter circumflex accent, */
ali@0
   192
                     "&tilde;",         "&#732;",       "~",  /* -- small tilde, U+02DC ISOdia --> */
ali@0
   193
                     "&ensp;",          "&#8194;",      " ", /* -- en space, U+2002 ISOpub --> */
ali@0
   194
                     "&emsp;",          "&#8195;",      " ", /* -- em space, U+2003 ISOpub --> */
ali@0
   195
                     "&thinsp;",        "&#8201;",      " ", /* -- thin space, U+2009 ISOpub --> */
ali@0
   196
                     "&ndash;",         "&#8211;",      "-", /* -- en dash, U+2013 ISOpub --> */
ali@0
   197
                     "&mdash;",         "&#8212;",      "--", /* -- em dash, U+2014 ISOpub --> */
ali@0
   198
                     "&lsquo;",         "&#8216;",      "'", /* -- left single quotation mark, */
ali@0
   199
                     "&rsquo;",         "&#8217;",      "'", /* -- right single quotation mark, */
ali@0
   200
                     "&sbquo;",         "&#8218;",      "'", /* -- single low-9 quotation mark, U+201A NEW --> */
ali@0
   201
                     "&ldquo;",         "&#8220;",      "\"", /* -- left double quotation mark, */
ali@0
   202
                     "&rdquo;",         "&#8221;",      "\"", /* -- right double quotation mark, */
ali@0
   203
                     "&bdquo;",         "&#8222;",      "\"", /* -- double low-9 quotation mark, U+201E NEW --> */
ali@0
   204
                     "&lsaquo;",        "&#8249;",      "\"", /* -- single left-pointing angle quotation mark, */
ali@0
   205
                     "&rsaquo;",        "&#8250;",      "\"", /* -- single right-pointing angle quotation mark, */
ali@0
   206
                     "&nbsp;",          "&#160;",       " ", /* -- no-break space = non-breaking space, */
ali@0
   207
                     "&iexcl;",         "&#161;",       "!", /* -- inverted exclamation mark, U+00A1 ISOnum --> */
ali@0
   208
                     "&cent;",          "&#162;",       "c", /* -- cent sign, U+00A2 ISOnum --> */
ali@0
   209
                     "&pound;",         "&#163;",       "L", /* -- pound sign, U+00A3 ISOnum --> */
ali@0
   210
                     "&curren;",        "&#164;",       "$", /* -- currency sign, U+00A4 ISOnum --> */
ali@0
   211
                     "&yen;",           "&#165;",       "Y", /* -- yen sign = yuan sign, U+00A5 ISOnum --> */
ali@0
   212
                     "&sect;",          "&#167;",       "--", /* -- section sign, U+00A7 ISOnum --> */
ali@0
   213
                     "&uml;",           "&#168;",       " ", /* -- diaeresis = spacing diaeresis, */
ali@0
   214
                     "&copy;",          "&#169;",       "(C) ", /* -- copyright sign, U+00A9 ISOnum --> */
ali@0
   215
                     "&ordf;",          "&#170;",       " ", /* -- feminine ordinal indicator, U+00AA ISOnum --> */
ali@0
   216
                     "&laquo;",         "&#171;",       "\"", /* -- left-pointing double angle quotation mark */
ali@0
   217
                     "&shy;",           "&#173;",       "-", /* -- soft hyphen = discretionary hyphen, */
ali@0
   218
                     "&reg;",           "&#174;",       "(R) ", /* -- registered sign = registered trade mark sign, */
ali@0
   219
                     "&macr;",          "&#175;",       " ", /* -- macron = spacing macron = overline */
ali@0
   220
                     "&deg;",           "&#176;",       " degrees", /* -- degree sign, U+00B0 ISOnum --> */
ali@0
   221
                     "&plusmn;",        "&#177;",       "+-", /* -- plus-minus sign = plus-or-minus sign, */
ali@0
   222
                     "&sup2;",          "&#178;",       "2", /* -- superscript two = superscript digit two */
ali@0
   223
                     "&sup3;",          "&#179;",       "3", /* -- superscript three = superscript digit three */
ali@0
   224
                     "&acute;",         "&#180;",       " ", /* -- acute accent = spacing acute, */
ali@0
   225
                     "&micro;",         "&#181;",       "m", /* -- micro sign, U+00B5 ISOnum --> */
ali@0
   226
                     "&para;",          "&#182;",       "--", /* -- pilcrow sign = paragraph sign, */
ali@0
   227
                     "&cedil;",         "&#184;",       " ", /* -- cedilla = spacing cedilla, U+00B8 ISOdia --> */
ali@0
   228
                     "&sup1;",          "&#185;",       "1", /* -- superscript one = superscript digit one, */
ali@0
   229
                     "&ordm;",          "&#186;",       " ", /* -- masculine ordinal indicator, */
ali@0
   230
                     "&raquo;",         "&#187;",       "\"", /* -- right-pointing double angle quotation mark */
ali@0
   231
                     "&frac14;",        "&#188;",       "1/4", /* -- vulgar fraction one quarter */
ali@0
   232
                     "&frac12;",        "&#189;",       "1/2", /* -- vulgar fraction one half */
ali@0
   233
                     "&frac34;",        "&#190;",       "3/4", /* -- vulgar fraction three quarters */
ali@0
   234
                     "&iquest;",        "&#191;",       "?", /* -- inverted question mark */
ali@0
   235
                     "&Agrave;",        "&#192;",       "A", /* -- latin capital letter A with grave */
ali@0
   236
                     "&Aacute;",        "&#193;",       "A", /* -- latin capital letter A with acute, */
ali@0
   237
                     "&Acirc;",         "&#194;",       "A", /* -- latin capital letter A with circumflex, */
ali@0
   238
                     "&Atilde;",        "&#195;",       "A", /* -- latin capital letter A with tilde, */
ali@0
   239
                     "&Auml;",          "&#196;",       "A", /* -- latin capital letter A with diaeresis, */
ali@0
   240
                     "&Aring;",         "&#197;",       "A", /* -- latin capital letter A with ring above */
ali@0
   241
                     "&AElig;",         "&#198;",       "AE", /* -- latin capital letter AE */
ali@0
   242
                     "&Ccedil;",        "&#199;",       "C", /* -- latin capital letter C with cedilla, */
ali@0
   243
                     "&Egrave;",        "&#200;",       "E", /* -- latin capital letter E with grave, */
ali@0
   244
                     "&Eacute;",        "&#201;",       "E", /* -- latin capital letter E with acute, */
ali@0
   245
                     "&Ecirc;",         "&#202;",       "E", /* -- latin capital letter E with circumflex, */
ali@0
   246
                     "&Euml;",          "&#203;",       "E", /* -- latin capital letter E with diaeresis, */
ali@0
   247
                     "&Igrave;",        "&#204;",       "I", /* -- latin capital letter I with grave, */
ali@0
   248
                     "&Iacute;",        "&#205;",       "I", /* -- latin capital letter I with acute, */
ali@0
   249
                     "&Icirc;",         "&#206;",       "I", /* -- latin capital letter I with circumflex, */
ali@0
   250
                     "&Iuml;",          "&#207;",       "I", /* -- latin capital letter I with diaeresis, */
ali@0
   251
                     "&ETH;",           "&#208;",       "E", /* -- latin capital letter ETH, U+00D0 ISOlat1 --> */
ali@0
   252
                     "&Ntilde;",        "&#209;",       "N", /* -- latin capital letter N with tilde, */
ali@0
   253
                     "&Ograve;",        "&#210;",       "O", /* -- latin capital letter O with grave, */
ali@0
   254
                     "&Oacute;",        "&#211;",       "O", /* -- latin capital letter O with acute, */
ali@0
   255
                     "&Ocirc;",         "&#212;",       "O", /* -- latin capital letter O with circumflex, */
ali@0
   256
                     "&Otilde;",        "&#213;",       "O", /* -- latin capital letter O with tilde, */
ali@0
   257
                     "&Ouml;",          "&#214;",       "O", /* -- latin capital letter O with diaeresis, */
ali@0
   258
                     "&times;",         "&#215;",       "*", /* -- multiplication sign, U+00D7 ISOnum --> */
ali@0
   259
                     "&Oslash;",        "&#216;",       "O", /* -- latin capital letter O with stroke */
ali@0
   260
                     "&Ugrave;",        "&#217;",       "U", /* -- latin capital letter U with grave, */
ali@0
   261
                     "&Uacute;",        "&#218;",       "U", /* -- latin capital letter U with acute, */
ali@0
   262
                     "&Ucirc;",         "&#219;",       "U", /* -- latin capital letter U with circumflex, */
ali@0
   263
                     "&Uuml;",          "&#220;",       "U", /* -- latin capital letter U with diaeresis, */
ali@0
   264
                     "&Yacute;",        "&#221;",       "Y", /* -- latin capital letter Y with acute, */
ali@0
   265
                     "&THORN;",         "&#222;",       "TH", /* -- latin capital letter THORN, */
ali@0
   266
                     "&szlig;",         "&#223;",       "sz", /* -- latin small letter sharp s = ess-zed, */
ali@0
   267
                     "&agrave;",        "&#224;",       "a", /* -- latin small letter a with grave */
ali@0
   268
                     "&aacute;",        "&#225;",       "a", /* -- latin small letter a with acute, */
ali@0
   269
                     "&acirc;",         "&#226;",       "a", /* -- latin small letter a with circumflex, */
ali@0
   270
                     "&atilde;",        "&#227;",       "a", /* -- latin small letter a with tilde, */
ali@0
   271
                     "&auml;",          "&#228;",       "a", /* -- latin small letter a with diaeresis, */
ali@0
   272
                     "&aring;",         "&#229;",       "a", /* -- latin small letter a with ring above */
ali@0
   273
                     "&aelig;",         "&#230;",       "ae", /* -- latin small letter ae */
ali@0
   274
                     "&ccedil;",        "&#231;",       "c", /* -- latin small letter c with cedilla, */
ali@0
   275
                     "&egrave;",        "&#232;",       "e", /* -- latin small letter e with grave, */
ali@0
   276
                     "&eacute;",        "&#233;",       "e", /* -- latin small letter e with acute, */
ali@0
   277
                     "&ecirc;",         "&#234;",       "e", /* -- latin small letter e with circumflex, */
ali@0
   278
                     "&euml;",          "&#235;",       "e", /* -- latin small letter e with diaeresis, */
ali@0
   279
                     "&igrave;",        "&#236;",       "i", /* -- latin small letter i with grave, */
ali@0
   280
                     "&iacute;",        "&#237;",       "i", /* -- latin small letter i with acute, */
ali@0
   281
                     "&icirc;",         "&#238;",       "i", /* -- latin small letter i with circumflex, */
ali@0
   282
                     "&iuml;",          "&#239;",       "i", /* -- latin small letter i with diaeresis, */
ali@0
   283
                     "&eth;",           "&#240;",       "eth", /* -- latin small letter eth, U+00F0 ISOlat1 --> */
ali@0
   284
                     "&ntilde;",        "&#241;",       "n", /* -- latin small letter n with tilde, */
ali@0
   285
                     "&ograve;",        "&#242;",       "o", /* -- latin small letter o with grave, */
ali@0
   286
                     "&oacute;",        "&#243;",       "o", /* -- latin small letter o with acute, */
ali@0
   287
                     "&ocirc;",         "&#244;",       "o", /* -- latin small letter o with circumflex, */
ali@0
   288
                     "&otilde;",        "&#245;",       "o", /* -- latin small letter o with tilde, */
ali@0
   289
                     "&ouml;",          "&#246;",       "o", /* -- latin small letter o with diaeresis, */
ali@0
   290
                     "&divide;",        "&#247;",       "/", /* -- division sign, U+00F7 ISOnum --> */
ali@0
   291
                     "&oslash;",        "&#248;",       "o", /* -- latin small letter o with stroke, */
ali@0
   292
                     "&ugrave;",        "&#249;",       "u", /* -- latin small letter u with grave, */
ali@0
   293
                     "&uacute;",        "&#250;",       "u", /* -- latin small letter u with acute, */
ali@0
   294
                     "&ucirc;",         "&#251;",       "u", /* -- latin small letter u with circumflex, */
ali@0
   295
                     "&uuml;",          "&#252;",       "u", /* -- latin small letter u with diaeresis, */
ali@0
   296
                     "&yacute;",        "&#253;",       "y", /* -- latin small letter y with acute, */
ali@0
   297
                     "&thorn;",         "&#254;",       "th", /* -- latin small letter thorn, */
ali@0
   298
                     "&yuml;",          "&#255;",       "y", /* -- latin small letter y with diaeresis, */
ali@0
   299
                      "", "" };
ali@0
   300
                    
ali@0
   301
/* ---- list of special characters ---- */
ali@0
   302
#define CHAR_SPACE        32
ali@0
   303
#define CHAR_TAB           9
ali@0
   304
#define CHAR_LF           10
ali@0
   305
#define CHAR_CR           13
ali@0
   306
#define CHAR_DQUOTE       34
ali@0
   307
#define CHAR_SQUOTE       39
ali@0
   308
#define CHAR_OPEN_SQUOTE  96
ali@0
   309
#define CHAR_TILDE       126
ali@0
   310
#define CHAR_ASTERISK     42
ali@0
   311
#define CHAR_FORESLASH    47
ali@0
   312
#define CHAR_CARAT        94
ali@0
   313
ali@0
   314
#define CHAR_UNDERSCORE    '_'
ali@0
   315
#define CHAR_OPEN_CBRACK   '{'
ali@0
   316
#define CHAR_CLOSE_CBRACK  '}'
ali@0
   317
#define CHAR_OPEN_RBRACK   '('
ali@0
   318
#define CHAR_CLOSE_RBRACK  ')'
ali@0
   319
#define CHAR_OPEN_SBRACK   '['
ali@0
   320
#define CHAR_CLOSE_SBRACK  ']'
ali@0
   321
ali@0
   322
ali@0
   323
ali@0
   324
ali@0
   325
ali@0
   326
/* ---- longest and shortest normal PG line lengths ----*/
ali@0
   327
#define LONGEST_PG_LINE   75
ali@0
   328
#define WAY_TOO_LONG      80
ali@0
   329
#define SHORTEST_PG_LINE  55
ali@0
   330
ali@0
   331
#define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */
ali@0
   332
                                  /*     D - ignore DP-specific markup     */
ali@0
   333
                                  /*     E - echo queried line             */
ali@0
   334
                                  /*     S - check single quotes           */
ali@0
   335
                                  /*     T - check common typos            */
ali@0
   336
                                  /*     P - require closure of quotes on  */
ali@0
   337
                                  /*         every paragraph               */
ali@0
   338
                                  /*     X - "Trust no one" :-) Paranoid!  */
ali@0
   339
                                  /*         Queries everything            */
ali@0
   340
                                  /*     L - line end checking defaults on */
ali@0
   341
                                  /*         -L turns it off               */
ali@0
   342
                                  /*     O - overview. Just shows counts.  */
ali@0
   343
                                  /*     Y - puts errors to stdout         */
ali@0
   344
                                  /*         instead of stderr             */
ali@0
   345
                                  /*     H - Echoes header fields          */
ali@0
   346
                                  /*     M - Ignore markup in < >          */
ali@0
   347
                                  /*     U - Use file of User-defined Typos*/
ali@0
   348
                                  /*     W - Defaults for use on Web upload*/
ali@0
   349
                                  /*     V - Verbose - list EVERYTHING!    */
ali@0
   350
#define SWITNO 14                 /* max number of switch parms            */
ali@0
   351
                                  /*        - used for defining array-size */
ali@0
   352
#define MINARGS   1               /* minimum no of args excl switches      */
ali@0
   353
#define MAXARGS   1               /* maximum no of args excl switches      */
ali@0
   354
ali@0
   355
int pswit[SWITNO];                /* program switches set by SWITCHES      */
ali@0
   356
ali@0
   357
/* One constant per switch letter. Each value equals the zero-based      */
/* position of that switch's letter in SWITCHES ("ESTPXLOYHWVMUD"):      */
/* E=0, S=1, T=2, P=3, X=4, L=5, O=6, Y=7, H=8, W=9, V=10, M=11, U=12,   */
/* D=13. Adding a switch therefore means extending SWITCHES, SWITNO and  */
/* this list together.                                                   */
/* NOTE(review): presumably used as indices into pswit[] -- confirm      */
/* against the argument-parsing code, which is not visible here.         */
#define ECHO_SWITCH      0
ali@0
   358
#define SQUOTE_SWITCH    1
ali@0
   359
#define TYPO_SWITCH      2
ali@0
   360
#define QPARA_SWITCH     3
ali@0
   361
#define PARANOID_SWITCH  4
ali@0
   362
#define LINE_END_SWITCH  5
ali@0
   363
#define OVERVIEW_SWITCH  6
ali@0
   364
#define STDOUT_SWITCH    7
ali@0
   365
#define HEADER_SWITCH    8
ali@0
   366
#define WEB_SWITCH       9
ali@0
   367
#define VERBOSE_SWITCH   10
ali@0
   368
#define MARKUP_SWITCH    11
ali@0
   369
#define USERTYPO_SWITCH  12
ali@0
   370
#define DP_SWITCH        13
ali@0
   371
ali@0
   372
ali@0
   373
ali@0
   374
/* Query tallies reported by main() when the overview switch is set.    */
long cnt_dquot;       /* unmatched double quotes                         */
long cnt_squot;       /* unmatched single quotes                         */
long cnt_brack;       /* unmatched brackets                              */
long cnt_bin;         /* non-ASCII characters                            */
long cnt_odd;         /* odd ("proofing") characters                     */
long cnt_long;        /* long lines                                      */
long cnt_short;       /* short lines                                     */
long cnt_punct;       /* punctuation and spacing queries                 */
long cnt_dash;        /* non-standard dashes                             */
long cnt_word;        /* word queries (reported as "Common typos")       */
long cnt_html;        /* possible HTML tags                              */
long cnt_lineend;     /* line-end problems                               */

/* Line statistics.                                                      */
long cnt_spacend;     /* lines with white space at the end  V .21        */
long linecnt;         /* total lines in the file                         */
long checked_linecnt; /* lines actually gutchecked  V .26                */
ali@0
   389
ali@0
   390
void proghelp(void);              /* called by main() on a bad arg count */
void procfile(char *filename);    /* process one file                    */

/* Paired 0/1 selector constants. */
#define LOW_THRESHOLD    0
#define HIGH_THRESHOLD   1

#define START            0
#define END              1

#define PREV             0
#define NEXT             1

#define FIRST_OF_PAIR    0
#define SECOND_OF_PAIR   1

#define MAX_WORDPAIR     1000
ali@0
   404
ali@0
   405
char running_from[MAX_PATH];
ali@0
   406
ali@0
   407
int mixdigit(char *);
ali@0
   408
char *getaword(char *, char *);
ali@0
   409
int matchword(char *, char *);
ali@0
   410
char *flgets(char *, int, FILE *, long);
ali@0
   411
void lowerit(char *);
ali@0
   412
int gcisalpha(unsigned char);
ali@0
   413
int gcisdigit(unsigned char);
ali@0
   414
int gcisletter(unsigned char);
ali@0
   415
char *gcstrchr(char *s, char c);
ali@0
   416
void postprocess_for_HTML(char *);
ali@0
   417
char *linehasmarkup(char *);
ali@0
   418
char *losemarkup(char *);
ali@0
   419
int tagcomp(char *, char *);
ali@0
   420
char *loseentities(char *);
ali@0
   421
int isroman(char *);
ali@0
   422
int usertypo_count;
ali@0
   423
void postprocess_for_DP(char *);
ali@0
   424
ali@0
   425
char wrk[LINEBUFSIZE];
ali@0
   426
ali@0
   427
/* Fixed-capacity word tables: room for MAX_QWORD entries of up to     */
/* MAX_QWORD_LENGTH bytes each. Statically sized out of laziness, but  */
/* with no 16-bit restrictions any more a couple of K costs nothing.   */
#define MAX_QWORD           50
#define MAX_QWORD_LENGTH    40

char qword[MAX_QWORD][MAX_QWORD_LENGTH];
char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
int dupcnt[MAX_QWORD];            /* one counter per table slot */
ali@0
   434
ali@0
   435
ali@0
   436
ali@0
   437
ali@0
   438
int main(int argc, char **argv)
ali@0
   439
{
ali@0
   440
    char *argsw, *s;
ali@0
   441
    int i, switno, invarg;
ali@0
   442
    char usertypo_file[MAX_PATH];
ali@0
   443
    FILE *usertypofile;
ali@0
   444
ali@0
   445
ali@0
   446
    if (strlen(argv[0]) < sizeof(running_from))
ali@0
   447
        strcpy(running_from, argv[0]);  /* save the path to the executable gutcheck */
ali@0
   448
ali@0
   449
    /* find out what directory we're running from */
ali@0
   450
    for (s = running_from + strlen(running_from); *s != '/' && *s != '\\' && s >= running_from; s--)
ali@0
   451
        *s = 0;
ali@0
   452
ali@0
   453
ali@0
   454
    switno = strlen(SWITCHES);
ali@0
   455
    for (i = switno ; --i >0 ; )
ali@0
   456
        pswit[i] = 0;           /* initialise switches */
ali@0
   457
ali@0
   458
    /* Standard loop to extract switches.                   */
ali@0
   459
    /* When we come out of this loop, the arguments will be */
ali@0
   460
    /* in argv[0] upwards and the switches used will be     */
ali@0
   461
    /* represented by their equivalent elements in pswit[]  */
ali@0
   462
    while ( --argc > 0 && **++argv == '-')
ali@0
   463
        for (argsw = argv[0]+1; *argsw !='\0'; argsw++)
ali@0
   464
            for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; )
ali@0
   465
                if ((toupper(*argsw)) == SWITCHES[i] ) {
ali@0
   466
                    invarg = 0;
ali@0
   467
                    pswit[i] = 1;
ali@0
   468
                    }
ali@0
   469
ali@0
   470
    pswit[PARANOID_SWITCH] ^= 1;         /* Paranoid checking is turned OFF, not on, by its switch */
ali@0
   471
ali@0
   472
    if (pswit[PARANOID_SWITCH]) {                         /* if running in paranoid mode */
ali@0
   473
        pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1;      /* force typo checks as well   */
ali@0
   474
        }                                                 /* v.20 removed s and p switches from paranoid mode */
ali@0
   475
ali@0
   476
    pswit[LINE_END_SWITCH] ^= 1;         /* Line-end checking is turned OFF, not on, by its switch */
ali@0
   477
    pswit[ECHO_SWITCH] ^= 1;             /* V.21 Echoing is turned OFF, not on, by its switch      */
ali@0
   478
ali@0
   479
    if (pswit[OVERVIEW_SWITCH])       /* just print summary; don't echo */
ali@0
   480
        pswit[ECHO_SWITCH] = 0;
ali@0
   481
ali@0
   482
    /* Web uploads - for the moment, this is really just a placeholder     */
ali@0
   483
    /* until we decide what processing we really want to do on web uploads */
ali@0
   484
    if (pswit[WEB_SWITCH]) {          /* specific override for web uploads */
ali@0
   485
        pswit[ECHO_SWITCH] =     1;
ali@0
   486
        pswit[SQUOTE_SWITCH] =   0;
ali@0
   487
        pswit[TYPO_SWITCH] =     1;
ali@0
   488
        pswit[QPARA_SWITCH] =    0;
ali@0
   489
        pswit[PARANOID_SWITCH] = 1;
ali@0
   490
        pswit[LINE_END_SWITCH] = 0;
ali@0
   491
        pswit[OVERVIEW_SWITCH] = 0;
ali@0
   492
        pswit[STDOUT_SWITCH] =   0;
ali@0
   493
        pswit[HEADER_SWITCH] =   1;
ali@0
   494
        pswit[VERBOSE_SWITCH] =  0;
ali@0
   495
        pswit[MARKUP_SWITCH] =   0;
ali@0
   496
        pswit[USERTYPO_SWITCH] = 0;
ali@0
   497
        pswit[DP_SWITCH] = 0;
ali@0
   498
        }
ali@0
   499
ali@0
   500
ali@0
   501
    if (argc < MINARGS || argc > MAXARGS) {  /* check number of args */
ali@0
   502
        proghelp();
ali@0
   503
        return(1);            /* exit */
ali@0
   504
        }
ali@0
   505
ali@0
   506
ali@0
   507
    /* read in the user-defined stealth scanno list */
ali@0
   508
ali@0
   509
    if (pswit[USERTYPO_SWITCH]) {                    /* ... we were told we had one! */
ali@0
   510
        if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) {   /* not in cwd. try gutcheck directory. */
ali@0
   511
            strcpy(usertypo_file, running_from);
ali@0
   512
            strcat(usertypo_file, USERTYPO_FILE);
ali@0
   513
            if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) {  /* we ain't got no user typo file! */
ali@0
   514
                printf("   --> I couldn't find gutcheck.typ -- proceeding without user typos.\n");
ali@0
   515
                }
ali@0
   516
            }
ali@0
   517
ali@0
   518
        usertypo_count = 0;
ali@0
   519
        if (usertypofile) {  /* we managed to open a User Typo File! */
ali@0
   520
            if (pswit[USERTYPO_SWITCH]) {
ali@0
   521
                while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) {
ali@0
   522
                    if (strlen(aline) > 1) {
ali@0
   523
                        if ((int)*aline > 33) {
ali@0
   524
                            s = malloc(strlen(aline)+1);
ali@0
   525
                            if (!s) {
ali@0
   526
                                fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n");
ali@0
   527
                                exit(1);
ali@0
   528
                                }
ali@0
   529
                            strcpy(s, aline);
ali@0
   530
                            usertypo[usertypo_count] = s;
ali@0
   531
                            usertypo_count++;
ali@0
   532
                            if (usertypo_count >= MAX_USER_TYPOS) {
ali@0
   533
                                printf("   --> Only %d user-defined typos allowed: ignoring the rest\n");
ali@0
   534
                                break;
ali@0
   535
                                }
ali@0
   536
                            }
ali@0
   537
                        }
ali@0
   538
                    }
ali@0
   539
                }
ali@0
   540
            fclose(usertypofile);
ali@0
   541
            }
ali@0
   542
        }
ali@0
   543
ali@0
   544
ali@0
   545
ali@0
   546
ali@0
   547
    fprintf(stderr, "gutcheck: Check and report on an e-text\n");
ali@0
   548
ali@0
   549
    cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long =
ali@0
   550
    cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend =
ali@0
   551
    cnt_spacend = 0;
ali@0
   552
ali@0
   553
    procfile(argv[0]);
ali@0
   554
ali@0
   555
    if (pswit[OVERVIEW_SWITCH]) {
ali@0
   556
                         printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@0
   557
                            checked_linecnt, linecnt, linecnt - checked_linecnt);
ali@0
   558
                         printf("    --------------- Queries found --------------\n");
ali@0
   559
        if (cnt_long)    printf("    Long lines:                             %5ld\n",cnt_long);
ali@0
   560
        if (cnt_short)   printf("    Short lines:                            %5ld\n",cnt_short);
ali@0
   561
        if (cnt_lineend) printf("    Line-end problems:                      %5ld\n",cnt_lineend);
ali@0
   562
        if (cnt_word)    printf("    Common typos:                           %5ld\n",cnt_word);
ali@0
   563
        if (cnt_dquot)   printf("    Unmatched quotes:                       %5ld\n",cnt_dquot);
ali@0
   564
        if (cnt_squot)   printf("    Unmatched SingleQuotes:                 %5ld\n",cnt_squot);
ali@0
   565
        if (cnt_brack)   printf("    Unmatched brackets:                     %5ld\n",cnt_brack);
ali@0
   566
        if (cnt_bin)     printf("    Non-ASCII characters:                   %5ld\n",cnt_bin);
ali@0
   567
        if (cnt_odd)     printf("    Proofing characters:                    %5ld\n",cnt_odd);
ali@0
   568
        if (cnt_punct)   printf("    Punctuation & spacing queries:          %5ld\n",cnt_punct);
ali@0
   569
        if (cnt_dash)    printf("    Non-standard dashes:                    %5ld\n",cnt_dash);
ali@0
   570
        if (cnt_html)    printf("    Possible HTML tags:                     %5ld\n",cnt_html);
ali@0
   571
        printf("\n");
ali@0
   572
        printf("    TOTAL QUERIES                           %5ld\n",
ali@0
   573
            cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long +
ali@0
   574
            cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend);
ali@0
   575
        }
ali@0
   576
ali@0
   577
    return(0);
ali@0
   578
}
ali@0
   579
ali@0
   580
ali@0
   581
ali@0
   582
/* procfile - process one file */
ali@0
   583
ali@0
   584
void procfile(char *filename)
ali@0
   585
{
ali@0
   586
ali@0
   587
    char *s, *t, *s1, laststart, *wordstart;
ali@0
   588
    char inword[MAXWORDLEN], testword[MAXWORDLEN];
ali@0
   589
    char parastart[81];     /* first line of current para */
ali@0
   590
    FILE *infile;
ali@0
   591
    long quot, squot, firstline, alphalen, totlen, binlen,
ali@0
   592
         shortline, longline, verylongline, spacedash, emdash,
ali@0
   593
         space_emdash, non_PG_space_emdash, PG_space_emdash,
ali@0
   594
         footerline, dotcomma, start_para_line, astline, fslashline,
ali@0
   595
         standalone_digit, hyphens, htmcount, endquote_count;
ali@0
   596
    long spline, nspline;
ali@0
   597
    signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower,
ali@0
   598
         eNon_A, eTab, eTilde, eAst, eFSlash, eCarat;
ali@0
   599
    signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma,
ali@0
   600
         warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote;
ali@0
   601
    unsigned int lastlen, lastblen;
ali@0
   602
    signed int s_brack, c_brack, r_brack, c_unders;
ali@0
   603
    signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar;
ali@0
   604
    signed int isnewpara, vowel, consonant;
ali@0
   605
    char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80],
ali@0
   606
         unders_err[80];
ali@0
   607
    signed int qword_index, qperiod_index, isdup;
ali@0
   608
    signed int enddash;
ali@0
   609
    signed int Dutchcount, isDutch, Frenchcount, isFrench;
ali@0
   610
ali@0
   611
ali@0
   612
    
ali@0
   613
ali@0
   614
ali@0
   615
    laststart = CHAR_SPACE;
ali@0
   616
    lastlen = lastblen = 0;
ali@0
   617
    *dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err =
ali@0
   618
        *unders_err = *prevline = 0;
ali@0
   619
    linecnt = firstline = alphalen = totlen = binlen =
ali@0
   620
        shortline = longline = spacedash = emdash = checked_linecnt =
ali@0
   621
        space_emdash = non_PG_space_emdash = PG_space_emdash =
ali@0
   622
        footerline = dotcomma = start_para_line = astline = fslashline = 
ali@0
   623
        standalone_digit = hyphens = htmcount = endquote_count = 0;
ali@0
   624
    quot = squot = s_brack = c_brack = r_brack = c_unders = 0;
ali@0
   625
    i = llen = isemptyline = isacro = isellipsis = istypo = 0;
ali@0
   626
    warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma = 
ali@0
   627
        warn_ast = warn_fslash = warn_digit = warn_endquote = 0;
ali@0
   628
    isnewpara = vowel = consonant = enddash = 0;
ali@0
   629
    spline = nspline = 0;
ali@0
   630
    qword_index = qperiod_index = isdup = 0;
ali@0
   631
    *inword = *testword = 0;
ali@0
   632
    open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0;
ali@0
   633
    Dutchcount = isDutch = Frenchcount = isFrench = 0;
ali@0
   634
ali@0
   635
ali@0
   636
    for (j = 0; j < MAX_QWORD; j++) {
ali@0
   637
        dupcnt[j] = 0;
ali@0
   638
        for (i = 0; i < MAX_QWORD_LENGTH; i++)
ali@0
   639
            qword[i][j] = 0;
ali@0
   640
            qperiod[i][j] = 0;
ali@0
   641
            }
ali@0
   642
ali@0
   643
ali@0
   644
    if ((infile = fopen(filename, "rb")) == NULL) {
ali@0
   645
        if (pswit[STDOUT_SWITCH])
ali@0
   646
            fprintf(stdout, "gutcheck: cannot open %s\n", filename);
ali@0
   647
        else
ali@0
   648
            fprintf(stderr, "gutcheck: cannot open %s\n", filename);
ali@0
   649
        exit(1);
ali@0
   650
        }
ali@0
   651
ali@0
   652
    fprintf(stdout, "\n\nFile: %s\n\n", filename);
ali@0
   653
    firstline = shortline = longline = verylongline = 0;
ali@0
   654
ali@0
   655
ali@0
   656
    /*****************************************************/
ali@0
   657
    /*                                                   */
ali@0
   658
    /*  Run a first pass - verify that it's a valid PG   */
ali@0
   659
    /*  file, decide whether to report some things that  */
ali@0
   660
    /*  occur many times in the text like long or short  */
ali@0
   661
    /*  lines, non-standard dashes, and other good stuff */
ali@0
   662
    /*  I'll doubtless think of later.                   */
ali@0
   663
    /*                                                   */
ali@0
   664
    /*****************************************************/
ali@0
   665
ali@0
   666
    /*****************************************************/
ali@0
   667
    /* V.24  Sigh. Yet Another Header Change             */
ali@0
   668
    /*****************************************************/
ali@0
   669
ali@0
   670
    while (fgets(aline, LINEBUFSIZE-1, infile)) {
ali@0
   671
        while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0;
ali@0
   672
        linecnt++;
ali@0
   673
        if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) {
ali@0
   674
            if (spline)
ali@0
   675
                printf("   --> Duplicate header?\n");
ali@0
   676
            spline = linecnt + 1;   /* first line of non-header text, that is */
ali@0
   677
            }
ali@0
   678
        if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) {
ali@0
   679
            if (nspline)
ali@0
   680
                printf("   --> Duplicate header?\n");
ali@0
   681
            nspline = linecnt + 1;   /* first line of non-header text, that is */
ali@0
   682
            }
ali@0
   683
        if (spline || nspline) {
ali@0
   684
            lowerit(aline);
ali@0
   685
            if (strstr(aline, "end") && strstr(aline, "project gutenberg")) {
ali@0
   686
                if (strstr(aline, "end") < strstr(aline, "project gutenberg")) {
ali@0
   687
                    if (footerline) {
ali@0
   688
                        if (!nspline) /* it's an old-form header - we can detect duplicates */
ali@0
   689
                            printf("   --> Duplicate footer?\n");
ali@0
   690
                        else 
ali@0
   691
                            ;
ali@0
   692
                        }
ali@0
   693
                    else {
ali@0
   694
                        footerline = linecnt;
ali@0
   695
                        }
ali@0
   696
                    }
ali@0
   697
                }
ali@0
   698
            }
ali@0
   699
        if (spline) firstline = spline;
ali@0
   700
        if (nspline) firstline = nspline;  /* override with new */
ali@0
   701
ali@0
   702
        if (footerline) continue;    /* 0.99+ don't count the boilerplate in the footer */
ali@0
   703
ali@0
   704
        llen = strlen(aline);
ali@0
   705
        totlen += llen;
ali@0
   706
        for (i = 0; i < llen; i++) {
ali@0
   707
            if ((unsigned char)aline[i] > 127) binlen++;
ali@0
   708
            if (gcisalpha(aline[i])) alphalen++;
ali@0
   709
            if (i > 0)
ali@0
   710
                if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1]))
ali@0
   711
                    endquote_count++;
ali@0
   712
            }
ali@0
   713
        if (strlen(aline) > 2
ali@0
   714
            && lastlen > 2 && lastlen < SHORTEST_PG_LINE
ali@0
   715
            && lastblen > 2 && lastblen > SHORTEST_PG_LINE
ali@0
   716
            && laststart != CHAR_SPACE)
ali@0
   717
                shortline++;
ali@0
   718
ali@0
   719
        if (*aline) /* fixed line below for 0.96 */
ali@0
   720
            if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++;
ali@0
   721
ali@0
   722
        if (strstr(aline, ".,")) dotcomma++;
ali@0
   723
        /* 0.98 only count ast lines for ignoring purposes where there is */
ali@0
   724
        /* locase text on the line */
ali@0
   725
        if (strstr(aline, "*")) {
ali@0
   726
            for (s = aline; *s; s++)
ali@0
   727
                if (*s >='a' && *s <= 'z')
ali@0
   728
                    break;
ali@0
   729
             if (*s) astline++;
ali@0
   730
             }
ali@0
   731
        if (strstr(aline, "/"))
ali@0
   732
            fslashline++;
ali@0
   733
        for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
ali@0
   734
        if (aline[i] == '-' && aline[i-1] != '-') hyphens++;
ali@0
   735
ali@0
   736
        if (llen > LONGEST_PG_LINE) longline++;
ali@0
   737
        if (llen > WAY_TOO_LONG) verylongline++;
ali@0
   738
ali@0
   739
        if (strstr(aline, "<") && strstr(aline, ">")) {
ali@0
   740
            i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
ali@0
   741
            if (i > 0) 
ali@0
   742
                htmcount++;
ali@0
   743
            if (strstr(aline, "<i>")) htmcount +=4; /* bonus marks! */
ali@0
   744
            }
ali@0
   745
ali@0
   746
        /* Check for spaced em-dashes */
ali@0
   747
        if (strstr(aline,"--")) {
ali@0
   748
            emdash++;
ali@0
   749
            if (*(strstr(aline, "--")-1) == CHAR_SPACE ||
ali@0
   750
               (*(strstr(aline, "--")+2) == CHAR_SPACE))
ali@0
   751
                    space_emdash++;
ali@0
   752
            if (*(strstr(aline, "--")-1) == CHAR_SPACE &&
ali@0
   753
               (*(strstr(aline, "--")+2) == CHAR_SPACE))
ali@0
   754
                    non_PG_space_emdash++;             /* count of em-dashes with spaces both sides */
ali@0
   755
            if (*(strstr(aline, "--")-1) != CHAR_SPACE &&
ali@0
   756
               (*(strstr(aline, "--")+2) != CHAR_SPACE))
ali@0
   757
                    PG_space_emdash++;                 /* count of PG-type em-dashes with no spaces */
ali@0
   758
            }
ali@0
   759
ali@0
   760
        for (s = aline; *s;) {
ali@0
   761
            s = getaword(s, inword);
ali@0
   762
            if (!strcmp(inword, "hij") || !strcmp(inword, "niet")) 
ali@0
   763
                Dutchcount++;
ali@0
   764
            if (!strcmp(inword, "dans") || !strcmp(inword, "avec")) 
ali@0
   765
                Frenchcount++;
ali@0
   766
            if (!strcmp(inword, "0") || !strcmp(inword, "1")) 
ali@0
   767
                standalone_digit++;
ali@0
   768
            }
ali@0
   769
ali@0
   770
        /* Check for spaced dashes */
ali@0
   771
        if (strstr(aline," -"))
ali@0
   772
            if (*(strstr(aline, " -")+2) != '-')
ali@0
   773
                    spacedash++;
ali@0
   774
        lastblen = lastlen;
ali@0
   775
        lastlen = strlen(aline);
ali@0
   776
        laststart = aline[0];
ali@0
   777
ali@0
   778
        }
ali@0
   779
    fclose(infile);
ali@0
   780
ali@0
   781
ali@0
   782
    /* now, based on this quick view, make some snap decisions */
ali@0
   783
    if (cnt_spacend > 0) {
ali@0
   784
        printf("   --> %ld lines in this file have white space at end\n", cnt_spacend);
ali@0
   785
        }
ali@0
   786
ali@0
   787
    warn_dotcomma = 1;
ali@0
   788
    if (dotcomma > 5) {
ali@0
   789
        warn_dotcomma = 0;
ali@0
   790
        printf("   --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma);
ali@0
   791
        }
ali@0
   792
ali@0
   793
    /* if more than 50 lines, or one-tenth, are short, don't bother reporting them */
ali@0
   794
    warn_short = 1;
ali@0
   795
    if (shortline > 50 || shortline * 10 > linecnt) {
ali@0
   796
        warn_short = 0;
ali@0
   797
        printf("   --> %ld lines in this file are short. Not reporting short lines.\n", shortline);
ali@0
   798
        }
ali@0
   799
ali@0
   800
    /* if more than 50 lines, or one-tenth, are long, don't bother reporting them */
ali@0
   801
    warn_long = 1;
ali@0
   802
    if (longline > 50 || longline * 10 > linecnt) {
ali@0
   803
        warn_long = 0;
ali@0
   804
        printf("   --> %ld lines in this file are long. Not reporting long lines.\n", longline);
ali@0
   805
        }
ali@0
   806
ali@0
   807
    /* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */
ali@0
   808
    warn_ast = 1;
ali@0
   809
    if (astline > 10 ) {
ali@0
   810
        warn_ast = 0;
ali@0
   811
        printf("   --> %ld lines in this file contain asterisks. Not reporting them.\n", astline);
ali@0
   812
        }
ali@0
   813
ali@0
   814
    /* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */
ali@0
   815
    warn_fslash = 1;
ali@0
   816
    if (fslashline > 10 ) {
ali@0
   817
        warn_fslash = 0;
ali@0
   818
        printf("   --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline);
ali@0
   819
        }
ali@0
   820
ali@0
   821
    /* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */
ali@0
   822
    warn_endquote = 1;
ali@0
   823
    if (endquote_count > 20 ) {
ali@0
   824
        warn_endquote = 0;
ali@0
   825
        printf("   --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count);
ali@0
   826
        }
ali@0
   827
ali@0
   828
    /* if more than 15 lines contain standalone digits, don't bother reporting them V.0.97 */
ali@0
   829
    warn_digit = 1;
ali@0
   830
    if (standalone_digit > 10 ) {
ali@0
   831
        warn_digit = 0;
ali@0
   832
        printf("   --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit);
ali@0
   833
        }
ali@0
   834
ali@0
   835
    /* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */
ali@0
   836
    warn_hyphen = 1;
ali@0
   837
    if (hyphens > 20 ) {
ali@0
   838
        warn_hyphen = 0;
ali@0
   839
        printf("   --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens);
ali@0
   840
        }
ali@0
   841
ali@0
   842
    if (htmcount > 20 && !pswit[MARKUP_SWITCH]) {
ali@0
   843
        printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@0
   844
        pswit[MARKUP_SWITCH] = 1;
ali@0
   845
        }
ali@0
   846
        
ali@0
   847
    if (verylongline > 0) {
ali@0
   848
        printf("   --> %ld lines in this file are VERY long!\n", verylongline);
ali@0
   849
        }
ali@0
   850
ali@0
   851
    /* If there are more non-PG spaced dashes than PG em-dashes,    */
ali@0
   852
    /* assume it's deliberate                                       */
ali@0
   853
    /* Current PG guidelines say don't use them, but older texts do,*/
ali@0
   854
    /* and some people insist on them whatever the guidelines say.  */
ali@0
   855
    /* V.20 removed requirement that PG_space_emdash be greater than*/
ali@0
   856
    /* ten before turning off warnings about spaced dashes.         */
ali@0
   857
    warn_dash = 1;
ali@0
   858
    if (spacedash + non_PG_space_emdash > PG_space_emdash) {
ali@0
   859
        warn_dash = 0;
ali@0
   860
        printf("   --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash);
ali@0
   861
        }
ali@0
   862
ali@0
   863
    /* if more than a quarter of characters are hi-bit, bug out */
ali@0
   864
    warn_bin = 1;
ali@0
   865
    if (binlen * 4 > totlen) {
ali@0
   866
        printf("   --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n");
ali@0
   867
        exit(1);
ali@0
   868
        }
ali@0
   869
    if (alphalen * 4 < totlen) {
ali@0
   870
        printf("   --> This file does not appear to be text. Terminating. Best of luck with it!\n");
ali@0
   871
        exit(1);
ali@0
   872
        }
ali@0
   873
    if ((binlen * 100 > totlen) || (binlen > 100)) {
ali@0
   874
        printf("   --> There are a lot of foreign letters here. Not reporting them.\n");
ali@0
   875
        warn_bin = 0;
ali@0
   876
        }
ali@0
   877
ali@0
   878
    /* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */
ali@0
   879
    isDutch = 0;
ali@0
   880
    if (Dutchcount > 50) {
ali@0
   881
        isDutch = 1;
ali@0
   882
        printf("   --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n");
ali@0
   883
        }
ali@0
   884
ali@0
   885
    isFrench = 0;
ali@0
   886
    if (Frenchcount > 50) {
ali@0
   887
        isFrench = 1;
ali@0
   888
        printf("   --> This looks like French - switching off some doublepunct.\n");
ali@0
   889
        }
ali@0
   890
ali@0
   891
    if (firstline && footerline)
ali@0
   892
        printf("    The PG header and footer appear to be already on.\n");
ali@0
   893
    else {
ali@0
   894
        if (firstline)
ali@0
   895
            printf("    The PG header is on - no footer.\n");
ali@0
   896
        if (footerline)
ali@0
   897
            printf("    The PG footer is on - no header.\n");
ali@0
   898
        }
ali@0
   899
    printf("\n");
ali@0
   900
ali@0
   901
    /* V.22 George Davis asked for an override switch to force it to list everything */
ali@0
   902
    if (pswit[VERBOSE_SWITCH]) {
ali@0
   903
        warn_bin = 1;
ali@0
   904
        warn_short = 1;
ali@0
   905
        warn_dotcomma = 1;
ali@0
   906
        warn_long = 1;
ali@0
   907
        warn_dash = 1;
ali@0
   908
        warn_digit = 1;
ali@0
   909
        warn_ast = 1;
ali@0
   910
        warn_fslash = 1;
ali@0
   911
        warn_hyphen = 1;
ali@0
   912
        warn_endquote = 1;
ali@0
   913
        printf("   *** Verbose output is ON -- you asked for it! ***\n");
ali@0
   914
        }
ali@0
   915
ali@0
   916
    if (isDutch)
ali@0
   917
        warn_dash = 0;  /* Frank suggested turning it REALLY off for Dutch */
ali@0
   918
ali@0
   919
    if ((infile = fopen(filename, "rb")) == NULL) {
ali@0
   920
        if (pswit[STDOUT_SWITCH])
ali@0
   921
            fprintf(stdout, "gutcheck: cannot open %s\n", filename);
ali@0
   922
        else
ali@0
   923
            fprintf(stderr, "gutcheck: cannot open %s\n", filename);
ali@0
   924
        exit(1);
ali@0
   925
        }
ali@0
   926
ali@0
   927
    if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */
ali@0
   928
        printf("   --> I don't really know where this text starts. \n");
ali@0
   929
        printf("       There are no reference points.\n");
ali@0
   930
        printf("       I'm going to have to report the header and footer as well.\n");
ali@0
   931
        firstline=0;
ali@0
   932
        }
ali@0
   933
        
ali@0
   934
ali@0
   935
ali@0
   936
    /*****************************************************/
ali@0
   937
    /*                                                   */
ali@0
   938
    /* Here we go with the main pass. Hold onto yer hat! */
ali@0
   939
    /*                                                   */
ali@0
   940
    /*****************************************************/
ali@0
   941
ali@0
   942
    /* Re-init some variables we've dirtied */
ali@0
   943
    quot = squot = linecnt = 0;
ali@0
   944
    laststart = CHAR_SPACE;
ali@0
   945
    lastlen = lastblen = 0;
ali@0
   946
ali@0
   947
    while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) {
ali@0
   948
        linecnt++;
ali@0
   949
        if (linecnt == 1) isnewpara = 1;
ali@0
   950
        if (pswit[DP_SWITCH])
ali@0
   951
            if (!strncmp(aline, "-----File: ", 11))
ali@0
   952
                continue;    // skip DP page separators completely
ali@0
   953
        if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) {
ali@0
   954
            if (pswit[HEADER_SWITCH]) {
ali@0
   955
                if (!strncmp(aline, "Title:", 6))
ali@0
   956
                    printf("    %s\n", aline);
ali@0
   957
                if (!strncmp (aline, "Author:", 7))
ali@0
   958
                    printf("    %s\n", aline);
ali@0
   959
                if (!strncmp(aline, "Release Date:", 13))
ali@0
   960
                    printf("    %s\n", aline);
ali@0
   961
                if (!strncmp(aline, "Edition:", 8))
ali@0
   962
                    printf("    %s\n\n", aline);
ali@0
   963
                }
ali@0
   964
            continue;                /* skip through the header */
ali@0
   965
            }
ali@0
   966
        checked_linecnt++;
ali@0
   967
        s = aline;
ali@0
   968
        isemptyline = 1;      /* assume the line is empty until proven otherwise */
ali@0
   969
ali@0
   970
        /* If we are in a state of unbalanced quotes, and this line    */
ali@0
   971
        /* doesn't begin with a quote, output the stored error message */
ali@0
   972
        /* If the -P switch was used, print the warning even if the    */
ali@0
   973
        /* new para starts with quotes                                 */
ali@0
   974
        /* Version .20 - if the new paragraph does start with a quote, */
ali@0
   975
        /* but is indented, I was giving a spurious error. Need to     */
ali@0
   976
        /* check the first _non-space_ character on the line rather    */
ali@0
   977
        /* than the first character when deciding whether the para     */
ali@0
   978
        /* starts with a quote. Using *t for this.                     */
ali@0
   979
        t = s;
ali@0
   980
        while (*t == ' ') t++;
ali@0
   981
        if (*dquote_err)
ali@0
   982
            if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) {
ali@0
   983
                if (!pswit[OVERVIEW_SWITCH]) {
ali@0
   984
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0
   985
                    printf(dquote_err);
ali@0
   986
                    }
ali@0
   987
                else
ali@0
   988
                    cnt_dquot++;
ali@0
   989
            }
ali@0
   990
        if (*squote_err) {
ali@0
   991
            if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) {
ali@0
   992
                if (!pswit[OVERVIEW_SWITCH]) {
ali@0
   993
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0
   994
                    printf(squote_err);
ali@0
   995
                    }
ali@0
   996
                else
ali@0
   997
                    cnt_squot++;
ali@0
   998
                }
ali@0
   999
            squot = 0;
ali@0
  1000
            }
ali@0
  1001
        if (*rbrack_err) {
ali@0
  1002
            if (!pswit[OVERVIEW_SWITCH]) {
ali@0
  1003
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0
  1004
                printf(rbrack_err);
ali@0
  1005
                }
ali@0
  1006
            else
ali@0
  1007
                cnt_brack++;
ali@0
  1008
            }
ali@0
  1009
        if (*sbrack_err) {
ali@0
  1010
            if (!pswit[OVERVIEW_SWITCH]) {
ali@0
  1011
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0
  1012
                printf(sbrack_err);
ali@0
  1013
                }
ali@0
  1014
            else
ali@0
  1015
                cnt_brack++;
ali@0
  1016
            }
ali@0
  1017
        if (*cbrack_err) {
ali@0
  1018
            if (!pswit[OVERVIEW_SWITCH]) {
ali@0
  1019
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0
  1020
                printf(cbrack_err);
ali@0
  1021
                }
ali@0
  1022
            else
ali@0
  1023
                cnt_brack++;
ali@0
  1024
            }
ali@0
  1025
        if (*unders_err) {
ali@0
  1026
            if (!pswit[OVERVIEW_SWITCH]) {
ali@0
  1027
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0
  1028
                printf(unders_err);
ali@0
  1029
                }
ali@0
  1030
            else
ali@0
  1031
                cnt_brack++;
ali@0
  1032
            }
ali@0
  1033
ali@0
  1034
        *dquote_err = *squote_err = *rbrack_err = *cbrack_err = 
ali@0
  1035
            *sbrack_err = *unders_err = 0;
ali@0
  1036
ali@0
  1037
ali@0
  1038
        /* look along the line, accumulate the count of quotes, and see */
ali@0
  1039
        /* if this is an empty line - i.e. a line with nothing on it    */
ali@0
  1040
        /* but spaces.                                                  */
ali@0
  1041
        /* V .12 also if line has just spaces, * and/or - on it, don't  */
ali@0
  1042
        /* count it, since empty lines with asterisks or dashes to      */
ali@0
  1043
        /* separate sections are common.                                */
ali@0
  1044
        /* V .15 new single-quote checking - has to be better than the  */
ali@0
  1045
        /* previous version, but how much better? fingers crossed!      */
ali@0
  1046
        /* V .20 add period to * and - as characters on a separator line*/
ali@0
  1047
        s = aline;
ali@0
  1048
        while (*s) {
ali@0
  1049
            if (*s == CHAR_DQUOTE) quot++;
ali@0
  1050
            if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
ali@0
  1051
                if (s == aline) { /* at start of line, it can only be an openquote */
ali@0
  1052
                    if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */
ali@0
  1053
                        open_single_quote++;
ali@0
  1054
                    }
ali@0
  1055
                else
ali@0
  1056
                    if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
ali@0
  1057
                        ; /* do nothing! - it's definitely an apostrophe, not a quote */
ali@0
  1058
                    else        /* it's outside a word - let's check it out */
ali@0
  1059
                        if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */
ali@0
  1060
                            if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */
ali@0
  1061
                                open_single_quote++;
ali@0
  1062
                            }
ali@0
  1063
                        else { /* now - is it a closequote? */
ali@0
  1064
                            guessquote = 0;   /* accumulate clues */
ali@0
  1065
                            if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */
ali@0
  1066
                                guessquote += 1;
ali@0
  1067
                                if (*(s-1) == 's') { /* looks like a plural apostrophe */
ali@0
  1068
                                    guessquote -= 3;
ali@0
  1069
                                    if (*(s+1) == CHAR_SPACE)  /* bonus marks! */
ali@0
  1070
                                        guessquote -= 2;
ali@0
  1071
                                    }
ali@0
  1072
                                }
ali@0
  1073
                            else /* it doesn't have a letter either side */
ali@0
  1074
                                if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1))))
ali@0
  1075
                                    guessquote += 8; /* looks like a closequote */
ali@0
  1076
                                else
ali@0
  1077
                                    guessquote += 1;
ali@0
  1078
                            if (open_single_quote > close_single_quote)
ali@0
  1079
                                guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */
ali@0
  1080
                            else
ali@0
  1081
                                guessquote -= 1;
ali@0
  1082
                            if (guessquote >= 0)
ali@0
  1083
                                close_single_quote++;
ali@0
  1084
                            }
ali@0
  1085
ali@0
  1086
            if (*s != CHAR_SPACE
ali@0
  1087
                && *s != '-'
ali@0
  1088
                && *s != '.'
ali@0
  1089
                && *s != CHAR_ASTERISK
ali@0
  1090
                && *s != 13
ali@0
  1091
                && *s != 10) isemptyline = 0;  /* ignore lines like  *  *  *  as spacers */
ali@0
  1092
            if (*s == CHAR_UNDERSCORE) c_unders++;
ali@0
  1093
            if (*s == CHAR_OPEN_CBRACK) c_brack++;
ali@0
  1094
            if (*s == CHAR_CLOSE_CBRACK) c_brack--;
ali@0
  1095
            if (*s == CHAR_OPEN_RBRACK) r_brack++;
ali@0
  1096
            if (*s == CHAR_CLOSE_RBRACK) r_brack--;
ali@0
  1097
            if (*s == CHAR_OPEN_SBRACK) s_brack++;
ali@0
  1098
            if (*s == CHAR_CLOSE_SBRACK) s_brack--;
ali@0
  1099
            s++;
ali@0
  1100
            }
ali@0
  1101
ali@0
  1102
        if (isnewpara && !isemptyline) {   /* This line is the start of a new paragraph */
ali@0
  1103
            start_para_line = linecnt;
ali@0
  1104
            strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */
ali@0
  1105
            parastart[79] = 0;
ali@0
  1106
            dquotepar = squotepar = 0; /* restart the quote count 0.98 */
ali@0
  1107
            s = aline;
ali@0
  1108
            while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++;    /* V.97 fixed bug - overran line and gave false warning - rare */
ali@0
  1109
            if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */
ali@0
  1110
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1111
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  1112
                    printf("    Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1);
ali@0
  1113
                else
ali@0
  1114
                    cnt_punct++;
ali@0
  1115
                }
ali@0
  1116
            isnewpara = 0; /* Signal the end of new para processing */
ali@0
  1117
            }
ali@0
  1118
ali@0
  1119
        /* Check for an em-dash broken at line end */
ali@0
  1120
        if (enddash && *aline == '-') {
ali@0
  1121
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1122
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1123
                printf("    Line %ld column 1 - Broken em-dash?\n", linecnt);
ali@0
  1124
            else
ali@0
  1125
                cnt_punct++;
ali@0
  1126
            }
ali@0
  1127
        enddash = 0;
ali@0
  1128
        for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--);
ali@0
  1129
        if (s >= aline && *s == '-')
ali@0
  1130
            enddash = 1;
ali@0
  1131
            
ali@0
  1132
ali@0
  1133
        /* Check for invalid or questionable characters in the line */
ali@0
  1134
        /* Anything above 127 is invalid for plain ASCII,  and      */
ali@0
  1135
        /* non-printable control characters should also be flagged. */
ali@0
  1136
        /* Tabs should generally not be there.                      */
ali@0
  1137
        /* Jan 06, in 0.99: Hm. For some strange reason, I either   */
ali@0
  1138
        /* never created or deleted the check for unprintable       */
ali@0
  1139
        /* control characters. They should be reported even if      */
ali@0
  1140
        /* warn_bin is on, I think, and in full.                    */
ali@0
  1141
ali@0
  1142
        for (s = aline; *s; s++) {
ali@0
  1143
            i = (unsigned char) *s;
ali@0
  1144
            if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) {
ali@0
  1145
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1146
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  1147
                    printf("    Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i);
ali@0
  1148
                else
ali@0
  1149
                    cnt_bin++;
ali@0
  1150
                }
ali@0
  1151
            }
ali@0
  1152
ali@0
  1153
        if (warn_bin) {
ali@0
  1154
            eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0;  /* don't repeat multiple warnings on one line */
ali@0
  1155
            for (s = aline; *s; s++) {
ali@0
  1156
                if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) {
ali@0
  1157
                    i = *s;                           /* annoying kludge for signed chars */
ali@0
  1158
                    if (i < 0) i += 256;
ali@0
  1159
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1160
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1161
                        if (i > 127 && i < 160)
ali@0
  1162
                            printf("    Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i);
ali@0
  1163
                        else
ali@0
  1164
                            printf("    Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i);
ali@0
  1165
                    else
ali@0
  1166
                        cnt_bin++;
ali@0
  1167
                    eNon_A = 1;
ali@0
  1168
                    }
ali@0
  1169
                if (!eTab && *s == CHAR_TAB) {
ali@0
  1170
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1171
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1172
                        printf("    Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1);
ali@0
  1173
                    else
ali@0
  1174
                        cnt_odd++;
ali@0
  1175
                    eTab = 1;
ali@0
  1176
                    }
ali@0
  1177
                if (!eTilde && *s == CHAR_TILDE) {  /* often used by OCR software to indicate an unrecognizable character */
ali@0
  1178
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1179
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1180
                        printf("    Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1);
ali@0
  1181
                    else
ali@0
  1182
                        cnt_odd++;
ali@0
  1183
                    eTilde = 1;
ali@0
  1184
                    }
ali@0
  1185
                if (!eCarat && *s == CHAR_CARAT) {  
ali@0
  1186
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1187
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1188
                        printf("    Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1);
ali@0
  1189
                    else
ali@0
  1190
                        cnt_odd++;
ali@0
  1191
                    eCarat = 1;
ali@0
  1192
                    }
ali@0
  1193
                if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) {  
ali@0
  1194
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1195
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1196
                        printf("    Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1);
ali@0
  1197
                    else
ali@0
  1198
                        cnt_odd++;
ali@0
  1199
                    eFSlash = 1;
ali@0
  1200
                    }
ali@0
  1201
                /* report asterisks only in paranoid mode, since they're often deliberate */
ali@0
  1202
                if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) {
ali@0
  1203
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1204
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1205
                        printf("    Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1);
ali@0
  1206
                    else
ali@0
  1207
                        cnt_odd++;
ali@0
  1208
                    eAst = 1;
ali@0
  1209
                    }
ali@0
  1210
                }
ali@0
  1211
            }
ali@0
  1212
ali@0
  1213
        /* Check for line too long */
ali@0
  1214
        if (warn_long) {
ali@0
  1215
            if (strlen(aline) > LONGEST_PG_LINE) {
ali@0
  1216
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1217
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  1218
                    printf("    Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline));
ali@0
  1219
                else
ali@0
  1220
                    cnt_long++;
ali@0
  1221
                }
ali@0
  1222
            }
ali@0
  1223
ali@0
  1224
        /* Check for line too short.                                     */
ali@0
  1225
        /* This one is a bit trickier to implement: we don't want to     */
ali@0
  1226
        /* flag the last line of a paragraph for being short, so we      */
ali@0
  1227
        /* have to wait until we know that our current line is a         */
ali@0
  1228
        /* "normal" line, then report the _previous_ line if it was too  */
ali@0
  1229
        /* short. We also don't want to report indented lines like       */
ali@0
  1230
        /* chapter heads or formatted quotations. We therefore keep      */
ali@0
  1231
        /* lastlen as the length of the last line examined, and          */
ali@0
  1232
        /* lastblen as the length of the last but one, and try to        */
ali@0
  1233
        /* suppress unnecessary warnings by checking that both were of   */
ali@0
  1234
        /* "normal" length. We keep the first character of the last      */
ali@0
  1235
        /* line in laststart, and if it was a space, we assume that the  */
ali@0
  1236
        /* formatting is deliberate. I can't figure out a way to         */
ali@0
  1237
        /* distinguish something like a quoted verse left-aligned or     */
ali@0
  1238
        /* the header or footer of a letter from a paragraph of short    */
ali@0
  1239
        /* lines - maybe if I examined the whole paragraph, and if the   */
ali@0
  1240
        /* para has less than, say, 8 lines and if all lines are short,  */
ali@0
  1241
        /* then just assume it's OK? Need to look at some texts to see   */
ali@0
  1242
        /* how often a formula like this would get the right result.     */
ali@0
  1243
        /* V0.99 changed the tolerance for length to ignore from 2 to 1  */
ali@0
  1244
        if (warn_short) {
ali@0
  1245
            if (strlen(aline) > 1
ali@0
  1246
                && lastlen > 1 && lastlen < SHORTEST_PG_LINE
ali@0
  1247
                && lastblen > 1 && lastblen > SHORTEST_PG_LINE
ali@0
  1248
                && laststart != CHAR_SPACE) {
ali@0
  1249
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
ali@0
  1250
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1251
                        printf("    Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline));
ali@0
  1252
                    else
ali@0
  1253
                        cnt_short++;
ali@0
  1254
                    }
ali@0
  1255
            }
ali@0
  1256
        lastblen = lastlen;
ali@0
  1257
        lastlen = strlen(aline);
ali@0
  1258
        laststart = aline[0];
ali@0
  1259
ali@0
  1260
        /* look for punctuation at start of line */
ali@0
  1261
        if  (*aline && strchr(".?!,;:",  aline[0]))  {            /* if it's punctuation */
ali@0
  1262
            if (strncmp(". . .", aline, 5)) {   /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */
ali@0
  1263
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1264
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  1265
                    printf("    Line %ld column 1 - Begins with punctuation?\n", linecnt);
ali@0
  1266
                else
ali@0
  1267
                    cnt_punct++;
ali@0
  1268
                }
ali@0
  1269
            }
ali@0
  1270
ali@0
  1271
        /* Check for spaced em-dashes                            */
ali@0
  1272
        /* V.20 must check _all_ occurrences of "--" on the line */
ali@0
  1273
        /* hence the loop - even if the first double-dash is OK  */
ali@0
  1274
        /* there may be another that's wrong later on.           */
ali@0
  1275
        if (warn_dash) {
ali@0
  1276
            s = aline;
ali@0
  1277
            while (strstr(s,"--")) {
ali@0
  1278
                if (*(strstr(s, "--")-1) == CHAR_SPACE ||
ali@0
  1279
                   (*(strstr(s, "--")+2) == CHAR_SPACE)) {
ali@0
  1280
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1281
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1282
                        printf("    Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1);
ali@0
  1283
                    else
ali@0
  1284
                        cnt_dash++;
ali@0
  1285
                    }
ali@0
  1286
                s = strstr(s,"--") + 2;
ali@0
  1287
                }
ali@0
  1288
            }
ali@0
  1289
ali@0
  1290
        /* Check for spaced dashes */
ali@0
  1291
        if (warn_dash)
ali@0
  1292
            if (strstr(aline," -")) {
ali@0
  1293
                if (*(strstr(aline, " -")+2) != '-') {
ali@0
  1294
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1295
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1296
                        printf("    Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1);
ali@0
  1297
                    else
ali@0
  1298
                        cnt_dash++;
ali@0
  1299
                    }
ali@0
  1300
                }
ali@0
  1301
            else
ali@0
  1302
                if (strstr(aline,"- ")) {
ali@0
  1303
                    if (*(strstr(aline, "- ")-1) != '-') {
ali@0
  1304
                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1305
                        if (!pswit[OVERVIEW_SWITCH])
ali@0
  1306
                            printf("    Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1);
ali@0
  1307
                        else
ali@0
  1308
                            cnt_dash++;
ali@0
  1309
                        }
ali@0
  1310
                    }
ali@0
  1311
ali@0
  1312
        /* v 0.99                                                       */
ali@0
  1313
        /* Check for unmarked paragraphs indicated by separate speakers */
ali@0
  1314
        /* May well be false positive:                                  */
ali@0
  1315
        /* "Bravo!" "Wonderful!" called the crowd.                      */
ali@0
  1316
        /* but useful all the same.                                     */
ali@0
  1317
        s = wrk;
ali@0
  1318
        *s = 0;
ali@0
  1319
        if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
ali@0
  1320
        if (strstr(aline, "\"  \"")) s = strstr(aline, "\"  \"");
ali@0
  1321
        if (*s) {
ali@0
  1322
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1323
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1324
                printf("    Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1);
ali@0
  1325
            else
ali@0
  1326
                cnt_punct++;
ali@0
  1327
            }
ali@0
  1328
ali@0
  1329
ali@0
  1330
ali@0
  1331
        /* Check for "to he" and other easy he/be errors          */
ali@0
  1332
        /* This is a very inadequate effort on the he/be problem, */
ali@0
  1333
        /* but the phrase "to he" is always an error, whereas "to */
ali@0
  1334
        /* be" is quite common. I chuckle when it does catch one! */
ali@0
  1335
        /* Similarly, '"Quiet!", be said.' is a non-be error      */
ali@0
  1336
        /* V .18 - "to he" is _not_ always an error!:             */
ali@0
  1337
        /*           "Where they went to he couldn't say."        */
ali@0
  1338
        /* but I'm leaving it in anyway.                          */
ali@0
  1339
        /* V .20 Another false positive:                          */
ali@0
  1340
        /*       What would "Cinderella" be without the . . .     */
ali@0
  1341
        /* and another "If he wants to he can see for himself."   */
ali@0
  1342
        /* V .21 Added " is be " and " be is " and " be was "     */
ali@0
  1343
        /* V .99 Added jeebies code -- removed again.             */
ali@0
  1344
        /*       Is jeebies code worth adding? Rare to see he/be  */
ali@0
  1345
        /*       errors with modern OCR. Separate program? Yes!   */
ali@0
  1346
        /*       jeebies does the job without cluttering up this. */
ali@0
  1347
        /*       We do get a few more queryable pairs from the    */
ali@0
  1348
        /*       project though -- they're cheap to implement.    */
ali@0
  1349
        /*       Also added a column number for guiguts.          */
ali@0
  1350
ali@0
  1351
        s = wrk;
ali@0
  1352
        *s = 0;
ali@0
  1353
        if (strstr(aline," to he ")) s = strstr(aline," to he ");
ali@0
  1354
        if (strstr(aline,"\" be ")) s = strstr(aline,"\" be ");
ali@0
  1355
        if (strstr(aline,"\", be ")) s = strstr(aline,"\", be ");
ali@0
  1356
        if (strstr(aline," is be ")) s = strstr(aline," is be ");
ali@0
  1357
        if (strstr(aline," be is ")) s = strstr(aline," be is ");
ali@0
  1358
        if (strstr(aline," was be ")) s = strstr(aline," was be ");
ali@0
  1359
        if (strstr(aline," be would ")) s = strstr(aline," be would ");
ali@0
  1360
        if (strstr(aline," be could ")) s = strstr(aline," be could ");
ali@0
  1361
        if (*s) {
ali@0
  1362
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1363
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1364
                printf("    Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1);
ali@0
  1365
            else
ali@0
  1366
                cnt_word++;
ali@0
  1367
            }
ali@0
  1368
ali@0
  1369
        s = wrk;
ali@0
  1370
        *s = 0;
ali@0
  1371
        if (strstr(aline," i bad ")) s = strstr(aline," i bad ");
ali@0
  1372
        if (strstr(aline," you bad ")) s = strstr(aline," you bad ");
ali@0
  1373
        if (strstr(aline," he bad ")) s = strstr(aline," he bad ");
ali@0
  1374
        if (strstr(aline," she bad ")) s = strstr(aline," she bad ");
ali@0
  1375
        if (strstr(aline," they bad ")) s = strstr(aline," they bad ");
ali@0
  1376
        if (strstr(aline," a had ")) s = strstr(aline," a had ");
ali@0
  1377
        if (strstr(aline," the had ")) s = strstr(aline," the had ");
ali@0
  1378
        if (*s) {
ali@0
  1379
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1380
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1381
                printf("    Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1);
ali@0
  1382
            else
ali@0
  1383
                cnt_word++;
ali@0
  1384
            }
ali@0
  1385
ali@0
  1386
ali@0
  1387
        /* V .97 Added ", hut "  Not too common, hut pretty certain   */
ali@0
  1388
        /* V.99 changed to add a column number for guiguts            */
ali@0
  1389
        s = wrk;
ali@0
  1390
        *s = 0;
ali@0
  1391
        if (strstr(aline,", hut ")) s = strstr(aline,", hut ");
ali@0
  1392
        if (strstr(aline,"; hut ")) s = strstr(aline,"; hut ");
ali@0
  1393
        if (*s) {
ali@0
  1394
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1395
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1396
                printf("    Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1);
ali@0
  1397
            else
ali@0
  1398
                cnt_word++;
ali@0
  1399
            }
ali@0
  1400
ali@0
  1401
        /* Special case - angled bracket in front of "From" placed there by an MTA */
ali@0
  1402
        /* when sending an e-mail.  V .21                                          */
ali@0
  1403
        if (strstr(aline, ">From")) {
ali@0
  1404
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1405
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1406
                printf("    Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1);
ali@0
  1407
            else
ali@0
  1408
                cnt_punct++;
ali@0
  1409
            }
ali@0
  1410
ali@0
  1411
        /* V 0.98 Check for a single character line - often an overflow from bad wrapping. */
ali@0
  1412
        if (*aline && !*(aline+1)) {
ali@0
  1413
            if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline))
ali@0
  1414
                ; /* nothing - ignore numerals alone on a line. */
ali@0
  1415
            else {
ali@0
  1416
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1417
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  1418
                    printf("    Line %ld column 1 - Query single character line\n", linecnt);
ali@0
  1419
                else
ali@0
  1420
                    cnt_punct++;
ali@0
  1421
                }
ali@0
  1422
            }
ali@0
  1423
ali@0
  1424
        /* V 0.98 Check for I" - often should be ! */
ali@0
  1425
        if (strstr(aline, " I\"")) {
ali@0
  1426
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1427
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1428
                printf("    Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline);
ali@0
  1429
            else
ali@0
  1430
                cnt_punct++;
ali@0
  1431
            }
ali@0
  1432
ali@0
  1433
        /* V 0.98 Check for period without a capital letter. Cut-down from gutspell */
ali@0
  1434
        /*        Only works when it happens on a single line.                      */
ali@0
  1435
ali@0
  1436
        if (pswit[PARANOID_SWITCH])
ali@0
  1437
            for (t = s = aline; strstr(t,". ");) {
ali@0
  1438
                t = strstr(t, ". ");
ali@0
  1439
                if (t == s)  {
ali@0
  1440
                    t++;
ali@0
  1441
                    continue; /* start of line punctuation is handled elsewhere */
ali@0
  1442
                    }
ali@0
  1443
                if (!gcisalpha(*(t-1))) {
ali@0
  1444
                    t++;
ali@0
  1445
                    continue;
ali@0
  1446
                    }
ali@0
  1447
                if (isDutch) {  /* For Frank & Jeroen -- 's Middags case */
ali@0
  1448
                    if (*(t+2) == CHAR_SQUOTE &&
ali@0
  1449
                      *(t+3)>='a' && *(t+3)<='z' &&
ali@0
  1450
                      *(t+4) == CHAR_SPACE &&
ali@0
  1451
                      *(t+5)>='A' && *(t+5)<='Z') {
ali@0
  1452
                        t++;
ali@0
  1453
                        continue;
ali@0
  1454
                        }
ali@0
  1455
                      }
ali@0
  1456
                s1 = t+2;
ali@0
  1457
                while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
ali@0
  1458
                    s1++;
ali@0
  1459
                if (*s1 >= 'a' && *s1 <= 'z') {  /* we have something to investigate */
ali@0
  1460
                    istypo = 1;
ali@0
  1461
                    for (s1 = t - 1; s1 >= s && 
ali@0
  1462
                        (gcisalpha(*s1) || gcisdigit(*s1) || 
ali@0
  1463
                        (*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */
ali@0
  1464
                    s1++;
ali@0
  1465
                    for (i = 0; *s1 && *s1 != '.'; s1++, i++)
ali@0
  1466
                        testword[i] = *s1;
ali@0
  1467
                    testword[i] = 0;
ali@0
  1468
                    for (i = 0; *abbrev[i]; i++)
ali@0
  1469
                        if (!strcmp(testword, abbrev[i]))
ali@0
  1470
                            istypo = 0;
ali@0
  1471
//                    if (*testword >= 'A' && *testword <= 'Z') 
ali@0
  1472
//                        istypo = 0;
ali@0
  1473
                    if (gcisdigit(*testword)) istypo = 0;
ali@0
  1474
                    if (!*(testword+1)) istypo = 0;
ali@0
  1475
                    if (isroman(testword)) istypo = 0;
ali@0
  1476
                    if (istypo) {
ali@0
  1477
                        istypo = 0;
ali@0
  1478
                        for (i = 0; testword[i]; i++)
ali@0
  1479
                            if (strchr(vowels, testword[i]))
ali@0
  1480
                                istypo = 1;
ali@0
  1481
                        }
ali@0
  1482
                    if (istypo) {
ali@0
  1483
                        isdup = 0;
ali@0
  1484
                        if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
ali@0
  1485
                            for (i = 0; i < qperiod_index; i++)
ali@0
  1486
                                if (!strcmp(testword, qperiod[i])) {
ali@0
  1487
                                    isdup = 1;
ali@0
  1488
                                    }
ali@0
  1489
                        if (!isdup) {
ali@0
  1490
                            if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
ali@0
  1491
                                strcpy(qperiod[qperiod_index], testword);
ali@0
  1492
                                qperiod_index++;
ali@0
  1493
                                }
ali@0
  1494
                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1495
                            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1496
                                printf("    Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1);
ali@0
  1497
                            else
ali@0
  1498
                                cnt_punct++;
ali@0
  1499
                            }
ali@0
  1500
                        }
ali@0
  1501
                    }
ali@0
  1502
                t++;
ali@0
  1503
                }
ali@0
  1504
ali@0
  1505
ali@0
  1506
        if (pswit[TYPO_SWITCH]) {    /* Should have put this condition in at the start of 0.99. Duh! */
ali@0
  1507
            /* Check for words usually not followed by punctuation 0.99 */
ali@0
  1508
            for (s = aline; *s;) {
ali@0
  1509
                wordstart = s;
ali@0
  1510
                s = getaword(s, inword);
ali@0
  1511
                if (!*inword) continue;
ali@0
  1512
                lowerit(inword);
ali@0
  1513
                for (i = 0; *nocomma[i]; i++)
ali@0
  1514
                    if (!strcmp(inword, nocomma[i])) {
ali@0
  1515
                        if (*s == ',' || *s == ';' || *s == ':') {
ali@0
  1516
                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1517
                            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1518
                                printf("    Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
ali@0
  1519
                            else
ali@0
  1520
                                cnt_punct++;
ali@0
  1521
                            }
ali@0
  1522
                        }
ali@0
  1523
                for (i = 0; *noperiod[i]; i++)
ali@0
  1524
                    if (!strcmp(inword, noperiod[i])) {
ali@0
  1525
                        if (*s == '.' || *s == '!') {
ali@0
  1526
                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1527
                            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1528
                                printf("    Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
ali@0
  1529
                            else
ali@0
  1530
                                cnt_punct++;
ali@0
  1531
                            }
ali@0
  1532
                        }
ali@0
  1533
                }
ali@0
  1534
            }
ali@0
  1535
ali@0
  1536
ali@0
  1537
ali@0
  1538
        /* Check for commonly mistyped words, and digits like 0 for O in a word */
ali@0
  1539
        for (s = aline; *s;) {
ali@0
  1540
            wordstart = s;
ali@0
  1541
            s = getaword(s, inword);
ali@0
  1542
            if (!*inword) continue; /* don't bother with empty lines */
ali@0
  1543
            if (mixdigit(inword)) {
ali@0
  1544
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1545
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  1546
                    printf("    Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword);
ali@0
  1547
                else
ali@0
  1548
                    cnt_word++;
ali@0
  1549
                }
ali@0
  1550
ali@0
  1551
            /* put the word through a series of tests for likely typos and OCR errors */
ali@0
  1552
            /* V.21 I had allowed lots of typo-checking even with the typo switch     */
ali@0
  1553
            /* turned off, but I really should disallow reporting of them when        */
ali@0
  1554
            /* the switch is off. Hence the "if" below.                               */
ali@0
  1555
            if (pswit[TYPO_SWITCH]) {
ali@0
  1556
                istypo = 0;
ali@0
  1557
                strcpy(testword, inword);
ali@0
  1558
                alower = 0;
ali@0
  1559
                for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */
ali@0
  1560
                    if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1;
ali@0
  1561
                    if (alower && testword[i] >= 'A' && testword[i] <= 'Z') {
ali@0
  1562
                        /* we have an uppercase mid-word. However, there are common cases: */
ali@0
  1563
                        /*   Mac and Mc like McGill                                        */
ali@0
  1564
                        /*   French contractions like l'Abbe                               */
ali@0
  1565
                        if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') ||
ali@0
  1566
                            (i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') ||
ali@0
  1567
                            (i > 0 && testword[i-1] == CHAR_SQUOTE))
ali@0
  1568
                                ; /* do nothing! */
ali@0
  1569
ali@0
  1570
                        else {  /* V.97 - remove separate case of uppercase within word so that         */
ali@0
  1571
                                /* names like VanAllen fall into qword_index and get reported only once */
ali@0
  1572
                            istypo = 1;
ali@0
  1573
                            }
ali@0
  1574
                        }
ali@0
  1575
                    testword[i] = (char)tolower(testword[i]);
ali@0
  1576
                    }
ali@0
  1577
ali@0
  1578
                /* check for certain unlikely two-letter combinations at word start and end */
ali@0
  1579
                /* V.0.97 - this replaces individual hardcoded checks in previous versions */
ali@0
  1580
                if (strlen(testword) > 1) {
ali@0
  1581
                    for (i = 0; *nostart[i]; i++)
ali@0
  1582
                        if (!strncmp(testword, nostart[i], 2))
ali@0
  1583
                            istypo = 1;
ali@0
  1584
                    for (i = 0; *noend[i]; i++)
ali@0
  1585
                        if (!strncmp(testword + strlen(testword) -2, noend[i], 2))
ali@0
  1586
                            istypo = 1;
ali@0
  1587
                    }
ali@0
  1588
ali@0
  1589
ali@0
  1590
                /* ght is common, gbt never. Like that. */
ali@0
  1591
                if (strstr(testword, "cb")) istypo = 1;
ali@0
  1592
                if (strstr(testword, "gbt")) istypo = 1;
ali@0
  1593
                if (strstr(testword, "pbt")) istypo = 1;
ali@0
  1594
                if (strstr(testword, "tbs")) istypo = 1;
ali@0
  1595
                if (strstr(testword, "mrn")) istypo = 1;
ali@0
  1596
                if (strstr(testword, "ahle")) istypo = 1;
ali@0
  1597
                if (strstr(testword, "ihle")) istypo = 1;
ali@0
  1598
ali@0
  1599
                /* "TBE" does happen - like HEARTBEAT - but uncommon.                    */
ali@0
  1600
                /*  Also "TBI" - frostbite, outbid - but uncommon.                       */
ali@0
  1601
                /*  Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals,       */
ali@0
  1602
                /*  but these are covered in V.20. "ii" is a common scanno.              */
ali@0
  1603
                if (strstr(testword, "tbi")) istypo = 1;
ali@0
  1604
                if (strstr(testword, "tbe")) istypo = 1;
ali@0
  1605
                if (strstr(testword, "ii")) istypo = 1;
ali@0
  1606
ali@0
  1607
                /* check for no vowels or no consonants. */
ali@0
  1608
                /* If none, flag a typo                  */
ali@0
  1609
                if (!istypo && strlen(testword)>1) {
ali@0
  1610
                    vowel = consonant = 0;
ali@0
  1611
                    for (i = 0; testword[i]; i++)
ali@0
  1612
                        if (testword[i] == 'y' || gcisdigit(testword[i])) {  /* Yah, this is loose. */
ali@0
  1613
                            vowel++;
ali@0
  1614
                            consonant++;
ali@0
  1615
                            }
ali@0
  1616
                        else
ali@0
  1617
                            if  (strchr(vowels, testword[i])) vowel++;
ali@0
  1618
                            else consonant++;
ali@0
  1619
                    if (!vowel || !consonant) {
ali@0
  1620
                        istypo = 1;
ali@0
  1621
                        }
ali@0
  1622
                    }
ali@0
  1623
ali@0
  1624
                /* now exclude the word from being reported if it's in */
ali@0
  1625
                /* the okword list                                     */
ali@0
  1626
                for (i = 0; *okword[i]; i++)
ali@0
  1627
                    if (!strcmp(testword, okword[i]))
ali@0
  1628
                        istypo = 0;
ali@0
  1629
ali@0
  1630
                /* what looks like a typo may be a Roman numeral. Exclude these */
ali@0
  1631
                if (istypo)
ali@0
  1632
                    if (isroman(testword))
ali@0
  1633
                        istypo = 0;
ali@0
  1634
ali@0
  1635
                /* check the manual list of typos */
ali@0
  1636
                if (!istypo)
ali@0
  1637
                    for (i = 0; *typo[i]; i++)
ali@0
  1638
                        if (!strcmp(testword, typo[i]))
ali@0
  1639
                            istypo = 1;
ali@0
  1640
ali@0
  1641
ali@0
  1642
                /* V.21 - check lowercase s and l - special cases */
ali@0
  1643
                /* V.98 - added "i" and "m"                       */
ali@0
  1644
                /* V.99 - added "j" often a semi-colon gone wrong */
ali@0
  1645
                /*      - and "d" for a missing apostrophe - he d */
ali@0
  1646
                /*      - and "n" for "in"                        */
ali@0
  1647
                if (!istypo && strlen(testword) == 1)
ali@0
  1648
                    if (strchr("slmijdn", *inword))
ali@0
  1649
                        istypo = 1;
ali@0
  1650
ali@0
  1651
ali@0
  1652
                if (istypo) {
ali@0
  1653
                    isdup = 0;
ali@0
  1654
                    if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
ali@0
  1655
                        for (i = 0; i < qword_index; i++)
ali@0
  1656
                            if (!strcmp(testword, qword[i])) {
ali@0
  1657
                                isdup = 1;
ali@0
  1658
                                ++dupcnt[i];
ali@0
  1659
                                }
ali@0
  1660
                    if (!isdup) {
ali@0
  1661
                        if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
ali@0
  1662
                            strcpy(qword[qword_index], testword);
ali@0
  1663
                            qword_index++;
ali@0
  1664
                            }
ali@0
  1665
                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1666
                        if (!pswit[OVERVIEW_SWITCH]) {
ali@0
  1667
                            printf("    Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword);
ali@0
  1668
                            if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
ali@0
  1669
                                printf(" - not reporting duplicates");
ali@0
  1670
                            printf("\n");
ali@0
  1671
                            }
ali@0
  1672
                        else
ali@0
  1673
                            cnt_word++;
ali@0
  1674
                        }
ali@0
  1675
                    }
ali@0
  1676
                }        /* end of typo-checking */
ali@0
  1677
ali@0
  1678
                /* check the user's list of typos */
ali@0
  1679
                if (!istypo)
ali@0
  1680
                    if (usertypo_count)
ali@0
  1681
                        for (i = 0; i < usertypo_count; i++)
ali@0
  1682
                            if (!strcmp(testword, usertypo[i])) {
ali@0
  1683
                                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1684
                                if (!pswit[OVERVIEW_SWITCH])  
ali@0
  1685
                                    printf("    Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
ali@0
  1686
                                }
ali@0
  1687
ali@0
  1688
ali@0
  1689
ali@0
  1690
            if (pswit[PARANOID_SWITCH] && warn_digit) {   /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/
ali@0
  1691
                if (!strcmp(inword, "0") || !strcmp(inword, "1")) {
ali@0
  1692
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1693
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1694
                        printf("    Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
ali@0
  1695
                    else
ali@0
  1696
                        cnt_word++;
ali@0
  1697
                    }
ali@0
  1698
                }
ali@0
  1699
            }
ali@0
  1700
ali@0
  1701
        /* look for added or missing spaces around punctuation and quotes */
ali@0
  1702
        /* If there is a punctuation character like ! with no space on    */
ali@0
  1703
        /* either side, suspect a missing!space. If there are spaces on   */
ali@0
  1704
        /* both sides , assume a typo. If we see a double quote with no   */
ali@0
  1705
        /* space or punctuation on either side of it, assume unspaced     */
ali@0
  1706
        /* quotes "like"this.                                             */
ali@0
  1707
        llen = strlen(aline);
ali@0
  1708
        for (i = 1; i < llen; i++) {                               /* for each character in the line after the first */
ali@0
  1709
            if  (strchr(".?!,;:_", aline[i])) {                    /* if it's punctuation */
ali@0
  1710
                isacro = 0;                       /* we need to suppress warnings for acronyms like M.D. */
ali@0
  1711
                isellipsis = 0;                   /* we need to suppress warnings for ellipsis . . . */
ali@0
  1712
                if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) ||     /* if there are letters on both sides of it or ... */
ali@0
  1713
                   (gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */
ali@0
  1714
                    if (aline[i] == '.') {
ali@0
  1715
                        if (i > 2)
ali@0
  1716
                            if (aline[i-2] == '.') isacro = 1;
ali@0
  1717
                        if (i + 2 < llen)
ali@0
  1718
                            if (aline[i+2] == '.') isacro = 1;
ali@0
  1719
                        }
ali@0
  1720
                    if (!isacro) {
ali@0
  1721
                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1722
                        if (!pswit[OVERVIEW_SWITCH])
ali@0
  1723
                            printf("    Line %ld column %d - Missing space?\n", linecnt, i+1);
ali@0
  1724
                        else
ali@0
  1725
                            cnt_punct++;
ali@0
  1726
                        }
ali@0
  1727
                    }
ali@0
  1728
                if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */
ali@0
  1729
                    if (aline[i] == '.') {
ali@0
  1730
                        if (i > 2)
ali@0
  1731
                            if (aline[i-2] == '.') isellipsis = 1;
ali@0
  1732
                        if (i + 2 < llen)
ali@0
  1733
                            if (aline[i+2] == '.') isellipsis = 1;
ali@0
  1734
                        }
ali@0
  1735
                    if (!isemptyline && !isellipsis) {
ali@0
  1736
                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1737
                        if (!pswit[OVERVIEW_SWITCH])
ali@0
  1738
                            printf("    Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
ali@0
  1739
                        else
ali@0
  1740
                            cnt_punct++;
ali@0
  1741
                        }
ali@0
  1742
                    }
ali@0
  1743
                }
ali@0
  1744
            }
ali@0
  1745
ali@0
  1746
        /* 0.98 -- split out the characters that CANNOT be preceded by space */
ali@0
  1747
        llen = strlen(aline);
ali@0
  1748
        for (i = 1; i < llen; i++) {                             /* for each character in the line after the first */
ali@0
  1749
            if  (strchr("?!,;:", aline[i])) {                    /* if it's punctuation that _cannot_ have a space before it */
ali@0
  1750
                if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */
ali@0
  1751
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1752
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1753
                        printf("    Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
ali@0
  1754
                    else
ali@0
  1755
                        cnt_punct++;
ali@0
  1756
                    }
ali@0
  1757
                }
ali@0
  1758
            }
ali@0
  1759
ali@0
  1760
ali@0
  1761
        /* 0.99 -- special case " .X" where X is any alpha. */
ali@0
  1762
        /* This plugs a hole in the acronym code above. Inelegant, but maintainable. */
ali@0
  1763
        llen = strlen(aline);
ali@0
  1764
        for (i = 1; i < llen; i++) {             /* for each character in the line after the first */
ali@0
  1765
            if  (aline[i] == '.') {              /* if it's a period */
ali@0
  1766
                if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */
ali@0
  1767
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1768
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1769
                        printf("    Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
ali@0
  1770
                    else
ali@0
  1771
                        cnt_punct++;
ali@0
  1772
                    }
ali@0
  1773
                }
ali@0
  1774
            }
ali@0
  1775
ali@0
  1776
ali@0
  1777
ali@0
  1778
ali@0
  1779
        /* v.21 breaking out the search for unspaced doublequotes        */
ali@0
  1780
        /* This is not as efficient, but it's more maintainable          */
ali@0
  1781
        /* V.97 added underscore to the list of characters not to query, */
ali@0
  1782
        /* since underscores are commonly used as italics indicators.    */
ali@0
  1783
        /* V.98 Added slash as well, same reason.                        */
ali@0
  1784
        for (i = 1; i < llen; i++) {                               /* for each character in the line after the first */
ali@0
  1785
            if (aline[i] == CHAR_DQUOTE) {
ali@0
  1786
                if ((!strchr(" _-.'`,;:!/([{?}])",  aline[i-1]) &&
ali@0
  1787
                     !strchr(" _-.'`,;:!/([{?}])",  aline[i+1]) &&
ali@0
  1788
                     aline[i+1] != 0
ali@0
  1789
                     || (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) {
ali@0
  1790
                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1791
                        if (!pswit[OVERVIEW_SWITCH])
ali@0
  1792
                            printf("    Line %ld column %d - Unspaced quotes?\n", linecnt, i+1);
ali@0
  1793
                        else
ali@0
  1794
                            cnt_punct++;
ali@0
  1795
                        }
ali@0
  1796
                }
ali@0
  1797
            }
ali@0
  1798
ali@0
  1799
ali@0
  1800
        /* v.98 check parity of quotes                             */
ali@0
  1801
        /* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". */
ali@0
  1802
        for (s = aline; *s; s++) {
ali@0
  1803
            if (*s == CHAR_DQUOTE) {
ali@0
  1804
                if (!(dquotepar = !dquotepar)) {    /* parity even */
ali@0
  1805
                    if (!strchr("_-.'`/,;:!?)]} ",  *(s+1))) {
ali@0
  1806
                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1807
                        if (!pswit[OVERVIEW_SWITCH])
ali@0
  1808
                            printf("    Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
ali@0
  1809
                        else
ali@0
  1810
                            cnt_punct++;
ali@0
  1811
                        }
ali@0
  1812
                    }
ali@0
  1813
                else {                              /* parity odd */
ali@0
  1814
                    if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$",  *(s+1)) || !*(s+1)) {
ali@0
  1815
                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1816
                        if (!pswit[OVERVIEW_SWITCH])
ali@0
  1817
                            printf("    Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
ali@0
  1818
                        else
ali@0
  1819
                            cnt_punct++;
ali@0
  1820
                        }
ali@0
  1821
                    }
ali@0
  1822
                }
ali@0
  1823
            }
ali@0
  1824
ali@0
  1825
            if (*aline == CHAR_DQUOTE) {
ali@0
  1826
                if (strchr(",;:!?)]} ", aline[1])) {
ali@0
  1827
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1828
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1829
                        printf("    Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
ali@0
  1830
                    else
ali@0
  1831
                        cnt_punct++;
ali@0
  1832
                    }
ali@0
  1833
                }
ali@0
  1834
ali@0
  1835
        if (pswit[SQUOTE_SWITCH])
ali@0
  1836
            for (s = aline; *s; s++) {
ali@0
  1837
                if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
ali@0
  1838
                     && ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) {
ali@0
  1839
                    if (!(squotepar = !squotepar)) {    /* parity even */
ali@0
  1840
                        if (!strchr("_-.'`/\",;:!?)]} ",  *(s+1))) {
ali@0
  1841
                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1842
                            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1843
                                printf("    Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
ali@0
  1844
                            else
ali@0
  1845
                                cnt_punct++;
ali@0
  1846
                            }
ali@0
  1847
                        }
ali@0
  1848
                    else {                              /* parity odd */
ali@0
  1849
                        if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`",  *(s+1)) || !*(s+1)) {
ali@0
  1850
                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1851
                            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1852
                                printf("    Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
ali@0
  1853
                            else
ali@0
  1854
                                cnt_punct++;
ali@0
  1855
                            }
ali@0
  1856
                        }
ali@0
  1857
                    }
ali@0
  1858
                }
ali@0
  1859
                    
ali@0
  1860
ali@0
  1861
        /* v.20 also look for double punctuation like ,. or ,,     */
ali@0
  1862
        /* Thanks to DW for the suggestion!                        */
ali@0
  1863
        /* I'm putting this in a separate loop for clarity         */
ali@0
  1864
        /* In books with references, ".," and ".;" are common      */
ali@0
  1865
        /* e.g. "etc., etc.," and vol. 1.; vol 3.;                 */
ali@0
  1866
        /* OTOH, from my initial tests, there are also fairly      */
ali@0
  1867
        /* common errors. What to do? Make these cases paranoid?   */
ali@0
  1868
        /* V.21 ".," is the most common, so invented warn_dotcomma */
ali@0
  1869
        /* to suppress detailed reporting if it occurs often       */
ali@0
  1870
        llen = strlen(aline);
ali@0
  1871
        for (i = 0; i < llen; i++)                  /* for each character in the line */
ali@0
  1872
            if (strchr(".?!,;:", aline[i])          /* if it's punctuation */
ali@0
  1873
            && (strchr(".?!,;:", aline[i+1]))
ali@0
  1874
            && aline[i] && aline[i+1])      /* followed by punctuation, it's a query, unless . . . */
ali@0
  1875
                if (
ali@0
  1876
                  (aline[i] == aline[i+1]
ali@0
  1877
                  && (aline[i] == '.' || aline[i] == '?' || aline[i] == '!'))
ali@0
  1878
                  || (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',')
ali@0
  1879
                  || (isFrench && !strncmp(aline+i, ",...", 4))
ali@0
  1880
                  || (isFrench && !strncmp(aline+i, "...,", 4))
ali@0
  1881
                  || (isFrench && !strncmp(aline+i, ";...", 4))
ali@0
  1882
                  || (isFrench && !strncmp(aline+i, "...;", 4))
ali@0
  1883
                  || (isFrench && !strncmp(aline+i, ":...", 4))
ali@0
  1884
                  || (isFrench && !strncmp(aline+i, "...:", 4))
ali@0
  1885
                  || (isFrench && !strncmp(aline+i, "!...", 4))
ali@0
  1886
                  || (isFrench && !strncmp(aline+i, "...!", 4))
ali@0
  1887
                  || (isFrench && !strncmp(aline+i, "?...", 4))
ali@0
  1888
                  || (isFrench && !strncmp(aline+i, "...?", 4))
ali@0
  1889
                ) {
ali@0
  1890
                if ((isFrench && !strncmp(aline+i, ",...", 4))    /* could this BE any more awkward? */
ali@0
  1891
                  || (isFrench && !strncmp(aline+i, "...,", 4))
ali@0
  1892
                  || (isFrench && !strncmp(aline+i, ";...", 4))
ali@0
  1893
                  || (isFrench && !strncmp(aline+i, "...;", 4))
ali@0
  1894
                  || (isFrench && !strncmp(aline+i, ":...", 4))
ali@0
  1895
                  || (isFrench && !strncmp(aline+i, "...:", 4))
ali@0
  1896
                  || (isFrench && !strncmp(aline+i, "!...", 4))
ali@0
  1897
                  || (isFrench && !strncmp(aline+i, "...!", 4))
ali@0
  1898
                  || (isFrench && !strncmp(aline+i, "?...", 4))
ali@0
  1899
                  || (isFrench && !strncmp(aline+i, "...?", 4)))
ali@0
  1900
                    i +=4;
ali@0
  1901
                        ; /* do nothing for .. !! and ?? which can be legit */
ali@0
  1902
                    }
ali@0
  1903
                else {
ali@0
  1904
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1905
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1906
                        printf("    Line %ld column %d - Double punctuation?\n", linecnt, i+1);
ali@0
  1907
                    else
ali@0
  1908
                        cnt_punct++;
ali@0
  1909
                    }
ali@0
  1910
ali@0
  1911
        /* v.21 breaking out the search for spaced doublequotes */
ali@0
  1912
        /* This is not as efficient, but it's more maintainable */
ali@0
  1913
        s = aline;
ali@0
  1914
        while (strstr(s," \" ")) {
ali@0
  1915
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1916
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1917
                printf("    Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1));
ali@0
  1918
            else
ali@0
  1919
                cnt_punct++;
ali@0
  1920
            s = strstr(s," \" ") + 2;
ali@0
  1921
            }
ali@0
  1922
ali@0
  1923
        /* v.20 also look for spaced singlequotes ' and `  */
ali@0
  1924
        s = aline;
ali@0
  1925
        while (strstr(s," ' ")) {
ali@0
  1926
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1927
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1928
                printf("    Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1));
ali@0
  1929
            else
ali@0
  1930
                cnt_punct++;
ali@0
  1931
            s = strstr(s," ' ") + 2;
ali@0
  1932
            }
ali@0
  1933
ali@0
  1934
        s = aline;
ali@0
  1935
        while (strstr(s," ` ")) {
ali@0
  1936
            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1937
            if (!pswit[OVERVIEW_SWITCH])
ali@0
  1938
                printf("    Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1));
ali@0
  1939
            else
ali@0
  1940
                cnt_punct++;
ali@0
  1941
            s = strstr(s," ` ") + 2;
ali@0
  1942
            }
ali@0
  1943
ali@0
  1944
        /* v.99 check special case of 'S instead of 's at end of word */
ali@0
  1945
        s = aline + 1;
ali@0
  1946
        while (*s) {
ali@0
  1947
            if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z')  {
ali@0
  1948
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1949
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  1950
                    printf("    Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2));
ali@0
  1951
                else
ali@0
  1952
                    cnt_punct++;
ali@0
  1953
                }
ali@0
  1954
            s++;
ali@0
  1955
            }
ali@0
  1956
ali@0
  1957
ali@0
  1958
        /* v.21 Now check special cases - start and end of line - */
ali@0
  1959
        /* for single and double quotes. Start is sometimes [sic] */
ali@0
  1960
        /* but better to query it anyway.                         */
ali@0
  1961
        /* While I'm here, check for dash at end of line          */
ali@0
  1962
        llen = strlen(aline);
ali@0
  1963
        if (llen > 1) {
ali@0
  1964
            if (aline[llen-1] == CHAR_DQUOTE ||
ali@0
  1965
                aline[llen-1] == CHAR_SQUOTE ||
ali@0
  1966
                aline[llen-1] == CHAR_OPEN_SQUOTE)
ali@0
  1967
                if (aline[llen-2] == CHAR_SPACE) {
ali@0
  1968
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1969
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1970
                        printf("    Line %ld column %d - Spaced quote?\n", linecnt, llen);
ali@0
  1971
                    else
ali@0
  1972
                        cnt_punct++;
ali@0
  1973
                    }
ali@0
  1974
            
ali@0
  1975
            /* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */
ali@0
  1976
            /* Wrongspaced quotes test also catches it for "                     */
ali@0
  1977
            if (aline[0] == CHAR_SQUOTE ||
ali@0
  1978
                aline[0] == CHAR_OPEN_SQUOTE)
ali@0
  1979
                if (aline[1] == CHAR_SPACE) {
ali@0
  1980
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1981
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1982
                        printf("    Line %ld column 1 - Spaced quote?\n", linecnt);
ali@0
  1983
                    else
ali@0
  1984
                        cnt_punct++;
ali@0
  1985
                    }
ali@0
  1986
            /* dash at end of line may well be legit - paranoid mode only */
ali@0
  1987
            /* and don't report em-dash at line-end                       */
ali@0
  1988
            if (pswit[PARANOID_SWITCH] && warn_hyphen) {
ali@0
  1989
                for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
ali@0
  1990
                if (aline[i] == '-' && aline[i-1] != '-') {
ali@0
  1991
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  1992
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  1993
                        printf("    Line %ld column %d - Hyphen at end of line?\n", linecnt, i);
ali@0
  1994
                    }
ali@0
  1995
                }
ali@0
  1996
            }
ali@0
  1997
ali@0
  1998
        /* v.21 also look for brackets surrounded by alpha                    */
ali@0
  1999
        /* Brackets are often unspaced, but shouldn't be surrounded by alpha. */
ali@0
  2000
        /* If so, suspect a scanno like "a]most"                              */
ali@0
  2001
        llen = strlen(aline);
ali@0
  2002
        for (i = 1; i < llen-1; i++) {           /* for each character in the line except 1st & last*/
ali@0
  2003
            if (strchr("{[()]}", aline[i])         /* if it's a bracket */
ali@0
  2004
                && gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) {
ali@0
  2005
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  2006
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  2007
                    printf("    Line %ld column %d - Unspaced bracket?\n", linecnt, i);
ali@0
  2008
                else
ali@0
  2009
                    cnt_punct++;
ali@0
  2010
                }
ali@0
  2011
            }
ali@0
  2012
        /* The "Cinderella" case, back in again! :-S Give it another shot */
ali@0
  2013
        if (warn_endquote) {
ali@0
  2014
            llen = strlen(aline);
ali@0
  2015
            for (i = 1; i < llen; i++) {           /* for each character in the line except 1st */
ali@0
  2016
                if (aline[i] == CHAR_DQUOTE)
ali@0
  2017
                    if (isalpha(aline[i-1])) {
ali@0
  2018
                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  2019
                        if (!pswit[OVERVIEW_SWITCH])
ali@0
  2020
                            printf("    Line %ld column %d - endquote missing punctuation?\n", linecnt, i);
ali@0
  2021
                        else
ali@0
  2022
                            cnt_punct++;
ali@0
  2023
                        }
ali@0
  2024
                }
ali@0
  2025
            }
ali@0
  2026
ali@0
  2027
        llen = strlen(aline);
ali@0
  2028
ali@0
  2029
        /* Check for <HTML TAG> */
ali@0
  2030
        /* If there is a < in the line, followed at some point  */
ali@0
  2031
        /* by a > then we suspect HTML                          */
ali@0
  2032
        if (strstr(aline, "<") && strstr(aline, ">")) {
ali@0
  2033
            i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
ali@0
  2034
            if (i > 0) {
ali@0
  2035
                strncpy(wrk, strstr(aline, "<"), i);
ali@0
  2036
                wrk[i] = 0;
ali@0
  2037
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  2038
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  2039
                    printf("    Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk);
ali@0
  2040
                else
ali@0
  2041
                    cnt_html++;
ali@0
  2042
                }
ali@0
  2043
            }
ali@0
  2044
ali@0
  2045
        /* Check for &symbol; HTML                   */
ali@0
  2046
        /* If there is a & in the line, followed at  */
ali@0
  2047
        /* some point by a ; then we suspect HTML    */
ali@0
  2048
        if (strstr(aline, "&") && strstr(aline, ";")) {
ali@0
  2049
            i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1);
ali@0
  2050
            for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++)   
ali@0
  2051
                if (*s == CHAR_SPACE) i = 0;                /* 0.99 don't report "Jones & Son;" */
ali@0
  2052
            if (i > 0) {
ali@0
  2053
                strncpy(wrk, strstr(aline,"&"), i);
ali@0
  2054
                wrk[i] = 0;
ali@0
  2055
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0
  2056
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  2057
                    printf("    Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk);
ali@0
  2058
                else
ali@0
  2059
                    cnt_html++;
ali@0
  2060
                }
ali@0
  2061
            }
ali@0
  2062
ali@0
  2063
        /* At end of paragraph, check for mismatched quotes.           */
ali@0
  2064
        /* We don't want to report an error immediately, since it is a */
ali@0
  2065
        /* common convention to omit the quotes at end of paragraph if */
ali@0
  2066
        /* the next paragraph is a continuation of the same speaker.   */
ali@0
  2067
        /* Where this is the case, the next para should begin with a   */
ali@0
  2068
        /* quote, so we store the warning message and only display it  */
ali@0
  2069
        /* at the top of the next iteration if the new para doesn't    */
ali@0
  2070
        /* start with a quote.                                         */
ali@0
  2071
        /* The -p switch overrides this default, and warns of unclosed */
ali@0
  2072
        /* quotes on _every_ paragraph, whether the next begins with a */
ali@0
  2073
        /* quote or not.                                               */
ali@0
  2074
        /* Version .16 - only report mismatched single quotes if       */
ali@0
  2075
        /* an open_single_quotes was found.                            */
ali@0
  2076
ali@0
  2077
        if (isemptyline) {          /* end of para - add up the totals */
ali@0
  2078
            if (quot % 2)
ali@0
  2079
                sprintf(dquote_err, "    Line %ld - Mismatched quotes\n", linecnt);
ali@0
  2080
            if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) )
ali@0
  2081
                sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n", linecnt);
ali@0
  2082
            if (pswit[SQUOTE_SWITCH] && open_single_quote
ali@0
  2083
                                     && (open_single_quote != close_single_quote)
ali@0
  2084
                                     && (open_single_quote != close_single_quote +1) )
ali@0
  2085
                squot = 1;    /* flag it to be noted regardless of the first char of the next para */
ali@0
  2086
            if (r_brack)
ali@0
  2087
                sprintf(rbrack_err, "    Line %ld - Mismatched round brackets?\n", linecnt);
ali@0
  2088
            if (s_brack)
ali@0
  2089
                sprintf(sbrack_err, "    Line %ld - Mismatched square brackets?\n", linecnt);
ali@0
  2090
            if (c_brack)
ali@0
  2091
                sprintf(cbrack_err, "    Line %ld - Mismatched curly brackets?\n", linecnt);
ali@0
  2092
            if (c_unders % 2)
ali@0
  2093
                sprintf(unders_err, "    Line %ld - Mismatched underscores?\n", linecnt);
ali@0
  2094
            quot = s_brack = c_brack = r_brack = c_unders =
ali@0
  2095
                open_single_quote = close_single_quote = 0;
ali@0
  2096
            isnewpara = 1;     /* let the next iteration know that it's starting a new para */
ali@0
  2097
            }
ali@0
  2098
ali@0
  2099
        /* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */
ali@0
  2100
        /*      by working back through prevline. DW.                      */
ali@0
  2101
        /* Hmmm. Need to check this only for "normal" paras.               */
ali@0
  2102
        /* So what is a "normal" para? ouch!                               */
ali@0
  2103
        /* Not normal if one-liner (chapter headings, etc.)                */
ali@0
  2104
        /* Not normal if doesn't contain at least one locase letter        */
ali@0
  2105
        /* Not normal if starts with space                                 */
ali@0
  2106
ali@0
  2107
        /* 0.99 tighten up on para end checks. Disallow comma and */
ali@0
  2108
        /* semi-colon. Check for legit para end before quotes.    */
ali@0
  2109
        if (isemptyline) {          /* end of para */
ali@0
  2110
            for (s = prevline, i = 0; *s && !i; s++)
ali@0
  2111
                if (gcisletter(*s))
ali@0
  2112
                    i = 1;    /* use i to indicate the presence of a letter on the line */
ali@0
  2113
            /* This next "if" is a problem.                                             */
ali@0
  2114
            /* If I say "start_para_line <= linecnt - 1", that includes one-line        */
ali@0
  2115
            /* "paragraphs" like chapter heads. Lotsa false positives.                  */
ali@0
  2116
            /* If I say "start_para_line < linecnt - 1" it doesn't, but then it         */
ali@0
  2117
            /* misses genuine one-line paragraphs.                                      */
ali@0
  2118
            /* So what do I do? */
ali@0
  2119
            if (i
ali@0
  2120
                && lastblen > 2
ali@0
  2121
                && start_para_line < linecnt - 1
ali@0
  2122
                && *prevline > CHAR_SPACE
ali@0
  2123
                ) {
ali@0
  2124
                for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--);
ali@0
  2125
                for (  ; i > 0; i--) {
ali@0
  2126
                    if (gcisalpha(prevline[i])) {
ali@0
  2127
                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
ali@0
  2128
                        if (!pswit[OVERVIEW_SWITCH])
ali@0
  2129
                            printf("    Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline));
ali@0
  2130
                        else
ali@0
  2131
                            cnt_punct++;
ali@0
  2132
                        break;
ali@0
  2133
                        }
ali@0
  2134
                    if (strchr("-.:!([{?}])", prevline[i]))
ali@0
  2135
                        break;
ali@0
  2136
                    }
ali@0
  2137
                }
ali@0
  2138
            }
ali@0
  2139
        strcpy(prevline, aline);
ali@0
  2140
    }
ali@0
  2141
    fclose (infile);
ali@0
  2142
    if (!pswit[OVERVIEW_SWITCH])
ali@0
  2143
        for (i = 0; i < MAX_QWORD; i++)
ali@0
  2144
            if (dupcnt[i])
ali@0
  2145
                printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s");
ali@0
  2146
}
ali@0
  2147
ali@0
  2148
ali@0
  2149
ali@0
  2150
/* flgets - get one line from the input stream, checking for   */
ali@0
  2151
/* the existence of exactly one CR/LF line-end per line.       */
ali@0
  2152
/* Returns a pointer to the line.                              */
ali@0
  2153
ali@0
  2154
char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt)
ali@0
  2155
{
ali@0
  2156
    char c;
ali@0
  2157
    int len, isCR, cint;
ali@0
  2158
ali@0
  2159
    *theline = 0;
ali@0
  2160
    len = isCR = 0;
ali@0
  2161
    c = cint = fgetc(thefile);
ali@0
  2162
    do {
ali@0
  2163
        if (cint == EOF)
ali@0
  2164
            return (NULL);
ali@0
  2165
        if (c == 10)  /* either way, it's end of line */
ali@0
  2166
            if (isCR)
ali@0
  2167
                break;
ali@0
  2168
            else {   /* Error - a LF without a preceding CR */
ali@0
  2169
                if (pswit[LINE_END_SWITCH]) {
ali@0
  2170
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
ali@0
  2171
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  2172
                        printf("    Line %ld - No CR?\n", lcnt);
ali@0
  2173
                    else
ali@0
  2174
                        cnt_lineend++;
ali@0
  2175
                    }
ali@0
  2176
                break;
ali@0
  2177
                }
ali@0
  2178
        if (c == 13) {
ali@0
  2179
            if (isCR) { /* Error - two successive CRs */
ali@0
  2180
                if (pswit[LINE_END_SWITCH]) {
ali@0
  2181
                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
ali@0
  2182
                    if (!pswit[OVERVIEW_SWITCH])
ali@0
  2183
                        printf("    Line %ld - Two successive CRs?\n", lcnt);
ali@0
  2184
                    else
ali@0
  2185
                        cnt_lineend++;
ali@0
  2186
                    }
ali@0
  2187
                }
ali@0
  2188
            isCR = 1;
ali@0
  2189
            }
ali@0
  2190
        else {
ali@0
  2191
            if (pswit[LINE_END_SWITCH] && isCR) {
ali@0
  2192
                if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
ali@0
  2193
                if (!pswit[OVERVIEW_SWITCH])
ali@0
  2194
                    printf("    Line %ld column %d - CR without LF?\n", lcnt, len+1);
ali@0
  2195
                else
ali@0
  2196
                    cnt_lineend++;
ali@0
  2197
                }
ali@0
  2198
             theline[len] = c;
ali@0
  2199
             len++;
ali@0
  2200
             theline[len] = 0;
ali@0
  2201
             isCR = 0;
ali@0
  2202
             }
ali@0
  2203
        c = cint = fgetc(thefile);
ali@0
  2204
    } while(len < maxlen);
ali@0
  2205
    if (pswit[MARKUP_SWITCH])  
ali@0
  2206
        postprocess_for_HTML(theline);
ali@0
  2207
    if (pswit[DP_SWITCH])  
ali@0
  2208
        postprocess_for_DP(theline);
ali@0
  2209
    return(theline);
ali@0
  2210
}
ali@0
  2211
ali@0
  2212
ali@0
  2213
ali@0
  2214
ali@0
  2215
/* mixdigit - takes a "word" as a parameter, and checks whether it   */
/* contains a mixture of alpha and digits. Generally, this is an     */
/* error, but may not be for cases like 4th or L5 12s. 3d.           */
/*                                                                   */
/* Accepted (not queried) mixtures:                                  */
/*   - leading digits + st/rd/nd/th, optionally followed by "s" or   */
/*     "ly" (21st, 1900ths, 2ndly), in either case;                  */
/*   - leading digits + a single l, L, s or d (12l. 3s. 11d.);       */
/*   - a capital L followed by something that is itself not a        */
/*     queried mixture (L500).                                       */
/*                                                                   */
/* Returns 0 if no error found, 1 if error.                          */

int mixdigit(char *checkword)   /* check for digits like 1 or 0 in words */
{
    int wehaveadigit, wehavealetter, firstdigits, query, wl;
    char *s;

    wehaveadigit = wehavealetter = 0;
    for (s = checkword; *s; s++) {
        if (gcisalpha(*s))
            wehavealetter = 1;
        else if (gcisdigit(*s))
            wehaveadigit = 1;
    }
    if (!wehaveadigit || !wehavealetter)
        return (0);

    /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
    query = 1;
    wl = (int)strlen(checkword);
    for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++)
        ;

    /* In each test below the suffix starts exactly at checkword+firstdigits, */
    /* so the pointer handed to matchword never precedes the word.            */

    /* digits, ending in st, rd, nd, th of either case */
    if (firstdigits + 2 == wl &&
          (matchword(checkword + wl - 2, "st")
        || matchword(checkword + wl - 2, "rd")
        || matchword(checkword + wl - 2, "nd")
        || matchword(checkword + wl - 2, "th")))
        query = 0;

    /* digits, ending in sts, rds, nds, ths */
    if (firstdigits + 3 == wl &&
          (matchword(checkword + wl - 3, "sts")
        || matchword(checkword + wl - 3, "rds")
        || matchword(checkword + wl - 3, "nds")
        || matchword(checkword + wl - 3, "ths")))
        query = 0;

    /* digits, ending in stly, rdly, ndly, thly: the suffix is four */
    /* characters long, so the word is firstdigits + 4 long.        */
    if (firstdigits + 4 == wl &&
          (matchword(checkword + wl - 4, "stly")
        || matchword(checkword + wl - 4, "rdly")
        || matchword(checkword + wl - 4, "ndly")
        || matchword(checkword + wl - 4, "thly")))
        query = 0;

    /* digits, ending in l, L, s or d */
    if (firstdigits + 1 == wl &&
        (checkword[wl-1] == 'l'
        || checkword[wl-1] == 'L'
        || checkword[wl-1] == 's'
        || checkword[wl-1] == 'd'))
        query = 0;

    /* L at the start of a number, representing British pounds, like L500 */
    /* This is cute. We know the current word is mixeddigit. If the first */
    /* letter is L, there must be at least one digit following. If both   */
    /* digits and letters follow, we have a genuine error, else we have a */
    /* capital L followed by digits, and we accept that as a non-error.   */
    if (checkword[0] == 'L' && !mixdigit(checkword + 1))
        query = 0;

    return (query);
}
ali@0
  2282
ali@0
  2283
ali@0
  2284
ali@0
  2285
ali@0
  2286
/* getaword - extracts the first/next "word" from the line, and puts */
ali@0
  2287
/* it into "thisword". A word is defined as one English word unit    */
ali@0
  2288
/* -- or at least that's what I'm trying for.                        */
ali@0
  2289
/* Returns a pointer to the position in the line where we will start */
ali@0
  2290
/* looking for the next word.                                        */
ali@0
  2291
ali@0
  2292
char *getaword(char *fromline, char *thisword)
ali@0
  2293
{
ali@0
  2294
    int i, wordlen;
ali@0
  2295
    char *s;
ali@0
  2296
ali@0
  2297
    wordlen = 0;
ali@0
  2298
    for ( ; !gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline ; fromline++ );
ali@0
  2299
ali@0
  2300
    /* V .20                                                                   */
ali@0
  2301
    /* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35.  */
ali@0
  2302
    /* Especially yucky is the case of L1,000                                  */
ali@0
  2303
    /* I hate this, and I see other ways, but I don't see that any is _better_.*/
ali@0
  2304
    /* This section looks for a pattern of characters including a digit        */
ali@0
  2305
    /* followed by a comma or period followed by one or more digits.           */
ali@0
  2306
    /* If found, it returns this whole pattern as a word; otherwise we discard */
ali@0
  2307
    /* the results and resume our normal programming.                          */
ali@0
  2308
    s = fromline;
ali@0
  2309
    for (  ; (gcisdigit(*s) || gcisalpha(*s) || *s == ',' || *s == '.') && wordlen < MAXWORDLEN ; s++ ) {
ali@0
  2310
        thisword[wordlen] = *s;
ali@0
  2311
        wordlen++;
ali@0
  2312
        }
ali@0
  2313
    thisword[wordlen] = 0;
ali@0
  2314
    for (i = 1; i < wordlen -1; i++) {
ali@0
  2315
        if (thisword[i] == '.' || thisword[i] == ',') {
ali@0
  2316
            if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1])) {   /* we have one of the damned things */
ali@0
  2317
                fromline = s;
ali@0
  2318
                return(fromline);
ali@0
  2319
                }
ali@0
  2320
            }
ali@0
  2321
        }
ali@0
  2322
ali@0
  2323
    /* we didn't find a punctuated number - do the regular getword thing */
ali@0
  2324
    wordlen = 0;
ali@0
  2325
    for (  ; (gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
ali@0
  2326
        thisword[wordlen] = *fromline;
ali@0
  2327
        wordlen++;
ali@0
  2328
        }
ali@0
  2329
    thisword[wordlen] = 0;
ali@0
  2330
    return(fromline);
ali@0
  2331
}
ali@0
  2332
ali@0
  2333
ali@0
  2334
ali@0
  2335
ali@0
  2336
ali@0
  2337
/* matchword - just a case-insensitive string matcher.             */
/* Returns 1 when the two strings have the same length and are     */
/* equal ignoring case, 0 otherwise. Note that this is the         */
/* opposite sense to strcmp.                                       */

int matchword(char *checkfor, char *thisword)
{
    size_t len, i;

    len = strlen(checkfor);
    if (len != strlen(thisword))
        return (0);

    for (i = 0; i < len; i++) {
        /* toupper is only defined for unsigned char values (and EOF), */
        /* so 8-bit characters must not be passed as negative chars.   */
        if (toupper((unsigned char)checkfor[i]) != toupper((unsigned char)thisword[i]))
            return (0);
    }
    return (1);
}
ali@0
  2353
ali@0
  2354
ali@0
  2355
ali@0
  2356
ali@0
  2357
ali@0
  2358
/* lowerit - lowercase the line in place. Yes, strlwr does the   */
/* same job, but not on all platforms, and I'm a bit paranoid    */
/* about what some implementations of tolower might do to hi-bit */
/* characters, which shouldn't matter, but better safe than      */
/* sorry. Only the letters 'A'..'Z' are changed; every other     */
/* character is left untouched.                                  */

void lowerit(char *theline)
{
    for ( ; *theline; theline++) {
        if (*theline >= 'A' && *theline <= 'Z')
            *theline += 'a' - 'A';
    }
}
ali@0
  2369
ali@0
  2370
ali@0
  2371
/* Is this word a Roman Numeral?                                    */
/* v 0.99 improved to be better. It still doesn't actually          */
/* validate that the number is a valid Roman Numeral -- for example */
/* it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not*/
/* what we're here to do. If it passes this, it LOOKS like a Roman  */
/* numeral.                                                         */
/* Only lower-case letters are recognized. In order, the word may   */
/* contain: any number of m, an optional d, an optional cm, an      */
/* optional cd, any number of c, an optional xl, an optional xc, an */
/* optional l, any number of x, an optional ix, an optional iv, an  */
/* optional v and any number of i.                                  */
/* Returns 1 if the whole word is consumed by that pattern, 0 if    */
/* not, or if the word is NULL or empty.                            */

int isroman(char *t)
{
    if (!t || !*t) return (0);

    while (*t == 'm') t++;
    if (*t == 'd') t++;
    if (*t == 'c' && *(t+1) == 'm') t += 2;
    if (*t == 'c' && *(t+1) == 'd') t += 2;
    while (*t == 'c') t++;
    if (*t == 'x' && *(t+1) == 'l') t += 2;
    if (*t == 'x' && *(t+1) == 'c') t += 2;
    if (*t == 'l') t++;
    while (*t == 'x') t++;
    if (*t == 'i' && *(t+1) == 'x') t += 2;
    if (*t == 'i' && *(t+1) == 'v') t += 2;
    if (*t == 'v') t++;
    while (*t == 'i') t++;

    /* anything left over means it was not (only) a numeral */
    return (*t ? 0 : 1);
}
ali@0
  2408
ali@0
  2409
ali@0
  2410
ali@0
  2411
ali@0
  2412
/* gcisalpha is a special version that is somewhat lenient on 8-bit texts.     */
/* If we use the standard isalpha() function, 8-bit accented characters break  */
/* words, so that tete with accented characters appears to be two words, "t"   */
/* and "t", with 8-bit characters between them. This causes over-reporting of  */
/* errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)   */
/* and ISO-8859-1 character sets, which are the most common PG 8-bit types.    */
/* Returns 1 for a letter, 0 for anything else.                                */

int gcisalpha(unsigned char c)
{
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
        return (1);

    if (c >= 192) {
        /* Upper block: everything counts as a letter except these six codes */
        /* (ISO-8859-1: Eth, multiplication sign, Thorn, eth, division sign, */
        /* thorn).                                                           */
        switch (c) {
        case 208: case 215: case 222:
        case 240: case 247: case 254:
            return (0);
        default:
            return (1);
        }
    }

    /* Below 192 only these five codes are accepted (CP1252: OE, Z-caron, */
    /* oe, z-caron, Y-diaeresis).                                         */
    switch (c) {
    case 140: case 142: case 156: case 158: case 159:
        return (1);
    default:
        return (0);
    }
}
ali@0
  2428
ali@0
  2429
/* gcisdigit is a special version that doesn't get confused in 8-bit texts.    */
/* Returns 1 for the characters '0' to '9' only, 0 for anything else.          */
int gcisdigit(unsigned char c)
{
    return (c >= '0' && c <= '9') ? 1 : 0;
}
ali@0
  2435
ali@0
  2436
/* gcisletter is a special version that doesn't get confused in 8-bit texts.    */
/* Yeah, we're ISO-8859-1-specific. So sue me.                                  */
/* Returns 1 for 'A'-'Z', 'a'-'z' and for every character code from 192 up      */
/* (with no exclusions), 0 for anything else.                                   */
int gcisletter(unsigned char c)
{
    if (c >= 192)
        return (1);
    if (c >= 'A' && c <= 'Z')
        return (1);
    if (c >= 'a' && c <= 'z')
        return (1);
    return (0);
}
ali@0
  2443
ali@0
  2444
ali@0
  2445
ali@0
  2446
ali@0
  2447
/* gcstrchr wraps strchr to return NULL if the character being searched for is zero. */
/* (Plain strchr would return a pointer to the string's terminator in that case.)    */
/* For any other character the result is exactly that of strchr(s, c).               */

char *gcstrchr(char *s, char c)
{
    return (c == 0) ? NULL : strchr(s, c);
}
ali@0
  2454
ali@0
  2455
/* postprocess_for_DP is derived from postprocess_for_HTML          */
ali@0
  2456
/* It is invoked with the -d switch from flgets().                  */
ali@0
  2457
/* It simply "removes" from the line a hard-coded set of common     */
ali@0
  2458
/* DP-specific tags, so that the line passed to the main routine has*/
ali@0
  2459
/* been pre-cleaned of DP markup.                                   */
ali@0
  2460
ali@0
  2461
void postprocess_for_DP(char *theline)
ali@0
  2462
{
ali@0
  2463
ali@0
  2464
    char *s, *t;
ali@0
  2465
    int i;
ali@0
  2466
ali@0
  2467
    if (!*theline) 
ali@0
  2468
        return;
ali@0
  2469
ali@0
  2470
    for (i = 0; *DPmarkup[i]; i++) {
ali@0
  2471
        s = strstr(theline, DPmarkup[i]);
ali@0
  2472
        while (s) {
ali@0
  2473
            t = s + strlen(DPmarkup[i]);
ali@0
  2474
            while (*t) {
ali@0
  2475
                *s = *t;
ali@0
  2476
                t++; s++;
ali@0
  2477
                }
ali@0
  2478
            *s = 0;
ali@0
  2479
            s = strstr(theline, DPmarkup[i]);
ali@0
  2480
            }
ali@0
  2481
        }
ali@0
  2482
ali@0
  2483
}
ali@0
  2484
ali@0
  2485
ali@0
  2486
/* postprocess_for_HTML is, at the moment (0.97), a very nasty      */
/* short-term fix for Charlz. Nasty, nasty, nasty.                  */
/* It is invoked with the -m switch from flgets().                  */
/* It simply "removes" from the line a hard-coded set of common     */
/* HTML tags and "replaces" a hard-coded set of common HTML         */
/* entities, so that the line passed to the main routine has        */
/* been pre-cleaned of HTML. This is _so_ not the right way to      */
/* deal with HTML, but what Charlz needs now is not HTML handling   */
/* proper: just ignoring <i> tags and some others.                  */
/* To be revisited in future releases!                              */
/*                                                                  */
/* The line is edited in place. losemarkup() and loseentities() are */
/* each called repeatedly until they return NULL, i.e. until they   */
/* find nothing more to change.                                     */

void postprocess_for_HTML(char *theline)
{
    /* only bother with tags if the line has both a '<' and a '>' */
    if (strstr(theline, "<") && strstr(theline, ">")) {
        while (losemarkup(theline))
            ;
    }
    while (loseentities(theline))
        ;
}
ali@0
  2506
ali@0
  2507
char *losemarkup(char *theline)
ali@0
  2508
{
ali@0
  2509
    char *s, *t;
ali@0
  2510
    int i;
ali@0
  2511
ali@0
  2512
    if (!*theline) 
ali@0
  2513
        return(NULL);
ali@0
  2514
ali@0
  2515
    s = strstr(theline, "<");
ali@0
  2516
    t = strstr(theline, ">");
ali@0
  2517
    if (!s || !t) return(NULL);
ali@0
  2518
    for (i = 0; *markup[i]; i++)
ali@0
  2519
        if (!tagcomp(s+1, markup[i])) {
ali@0
  2520
            if (!*(t+1)) {
ali@0
  2521
                *s = 0;
ali@0
  2522
                return(s);
ali@0
  2523
                }
ali@0
  2524
            else
ali@0
  2525
                if (t > s) {
ali@0
  2526
                    strcpy(s, t+1);
ali@0
  2527
                    return(s);
ali@0
  2528
                    }
ali@0
  2529
        }
ali@0
  2530
    /* it's an unrecognized <xxx> */
ali@0
  2531
    return(NULL);
ali@0
  2532
}
ali@0
  2533
ali@0
  2534
char *loseentities(char *theline)
ali@0
  2535
{
ali@0
  2536
    int i;
ali@0
  2537
    char *s, *t;
ali@0
  2538
ali@0
  2539
    if (!*theline) 
ali@0
  2540
        return(NULL);
ali@0
  2541
ali@0
  2542
    for (i = 0; *entities[i].htmlent; i++) {
ali@0
  2543
        s = strstr(theline, entities[i].htmlent);
ali@0
  2544
        if (s) {
ali@0
  2545
            t = malloc((size_t)strlen(s));
ali@0
  2546
            if (!t) return(NULL);
ali@0
  2547
            strcpy(t, s + strlen(entities[i].htmlent));
ali@0
  2548
            strcpy(s, entities[i].textent);
ali@0
  2549
            strcat(s, t);
ali@0
  2550
            free(t);
ali@0
  2551
            return(theline);
ali@0
  2552
            }
ali@0
  2553
        }
ali@0
  2554
ali@0
  2555
    /* V0.97 Duh. Forgot to check the htmlnum member */
ali@0
  2556
    for (i = 0; *entities[i].htmlnum; i++) {
ali@0
  2557
        s = strstr(theline, entities[i].htmlnum);
ali@0
  2558
        if (s) {
ali@0
  2559
            t = malloc((size_t)strlen(s));
ali@0
  2560
            if (!t) return(NULL);
ali@0
  2561
            strcpy(t, s + strlen(entities[i].htmlnum));
ali@0
  2562
            strcpy(s, entities[i].textent);
ali@0
  2563
            strcat(s, t);
ali@0
  2564
            free(t);
ali@0
  2565
            return(theline);
ali@0
  2566
            }
ali@0
  2567
        }
ali@0
  2568
    return(NULL);
ali@0
  2569
}
ali@0
  2570
ali@0
  2571
ali@0
  2572
/* tagcomp - does the text following a '<' start with the given tag */
/* name? A single leading '/' in strin is ignored, and case is      */
/* ignored. Only the length of basetag is compared, so "img src"    */
/* starts with "i" just as "i>" does.                               */
/* Returns 0 if strin starts with basetag, 1 if it does not         */
/* (strcmp-style sense: zero means match).                          */

int tagcomp(char *strin, char *basetag)
{
    char *s, *t;

    s = basetag;
    t = strin;
    if (*t == '/') t++; /* ignore a slash */
    while (*s && *t) {
        /* tolower is only defined for unsigned char values (and EOF) */
        if (tolower((unsigned char)*s) != tolower((unsigned char)*t))
            return (1);
        s++; t++;
    }
    /* If the input ran out before the whole tag name was compared, */
    /* it cannot be that tag.                                       */
    if (*s)
        return (1);

    /* OK, we have < followed by a valid tag start  */
    /* should I do something about length?          */
    /* this is messy. The length of an <i> tag is   */
    /* limited, but a <table> could go on for miles */
    /* so I'd have to parse the tags . . . ugh.     */
    /* It isn't what Charlz needs now, so mark it   */
    /* as 'pending'.                                */
    return (0);
}
ali@0
  2592
ali@0
  2593
/*
 * proghelp - explain program usage.
 *
 * Writes the version/copyright banner, the command-line usage summary
 * and a short description of what gutcheck checks to stderr.
 * Takes no arguments and returns nothing.
 *
 * NOTE(review): the usage line lists "-setpxloyhud" but the text below
 * also describes -v and -m, and does not describe -p or -y - confirm
 * against the option parser before changing the usage string.
 */
void proghelp()                  /* explain program usage here */
{
    fputs("V. 0.991. Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
    fputs("Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n", stderr);
    fputs("This is Free Software; you may redistribute it under certain conditions (GPL);\n", stderr);
    fputs("read the file COPYING for details.\n\n", stderr);
    fputs("Usage is: gutcheck [-setpxloyhud] filename\n",stderr);
    fputs("  where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",stderr);
    fputs("  -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",stderr);
    fputs("  -o just displays overview without detail, -h echoes header fields\n",stderr);
    fputs("  -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",stderr);
    fputs("  -d ignores DP-specific markup,\n",stderr);
    fputs("  -u uses a file gutcheck.typ to query user-defined possible typos\n",stderr);
    fputs("Sample usage: gutcheck warpeace.txt \n",stderr);
    fputs("\n",stderr);
    fputs("Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n", stderr);
    fputs("Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",stderr);
    fputs("characters like accented letters, lines longer than 75 or shorter than 55,\n",stderr);
    fputs("unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",stderr);
    fputs("HTML tags, some likely typos. It is NOT a substitute for human judgement.\n",stderr);
    fputs("\n",stderr);
}
ali@0
  2615
ali@0
  2616
ali@0
  2617
ali@0
  2618
/*********************************************************************
ali@0
  2619
  Revision History:
ali@0
  2620
ali@0
  2621
  04/22/01 Cleaned up some stuff and released .10
ali@0
  2622
ali@0
  2623
           ---------------
ali@0
  2624
ali@0
  2625
  05/09/01 Added the typo list, added two extra cases of he/be error,
ali@0
  2626
           added -p switch, OPEN_SINGLE QUOTE char as .11
ali@0
  2627
ali@0
  2628
           ---------------
ali@0
  2629
ali@0
  2630
  05/20/01 Increased the typo list,
ali@0
  2631
           added paranoid mode,
ali@0
  2632
           ANSIfied the code and added some casts
ali@0
  2633
              so the compiler wouldn't keep asking if I knew what I was doing,
ali@0
  2634
           fixed bug in l.s.d. condition (thanks, Dave!),
ali@0
  2635
           standardized spacing when echoing,
ali@0
  2636
           added letter-combo checking code to typo section,
ali@0
  2637
           added more h/b words to typo array.
ali@0
  2638
           Not too sure about putting letter combos outside of the TYPO conditions -
ali@0
  2639
           someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see.
ali@0
  2640
           Released as .12
ali@0
  2641
ali@0
  2642
           ---------------
ali@0
  2643
ali@0
  2644
  06/01/01 Removed duplicate reporting of Tildes, asterisks, etc.
ali@0
  2645
  06/10/01 Added flgets routine to help with platform-independent
ali@0
  2646
           detection of invalid line-ends. All PG text files should
ali@0
  2647
           have CR/LF (13/10) at end of line, regardless of system.
ali@0
  2648
           Gutcheck now validates this by default. (Thanks, Charles!)
ali@0
  2649
           Released as .13
ali@0
  2650
ali@0
  2651
           ---------------
ali@0
  2652
ali@0
  2653
  06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.)
ali@0
  2654
           Released as .14
ali@0
  2655
ali@0
  2656
           ---------------
ali@0
  2657
ali@0
  2658
  06/23/01 Fixed: 'No',he said. not being flagged.
ali@0
  2659
ali@0
  2660
           Improved: better single-quotes checking:
ali@0
  2661
ali@0
  2662
           Ignore singlequotes surrounded by alpha, like didn't. (was OK)
ali@0
  2663
ali@0
  2664
           If a singlequote is at the END of a word AND the word ends in "s":
ali@0
  2665
                  The dogs' tails wagged.
ali@0
  2666
           it's probably an apostrophe, but less commonly may be a closequote:
ali@0
  2667
                  "These 'pack dogs' of yours look more like wolves."
ali@0
  2668
ali@0
  2669
           If it's got punctuation before it and is followed by a space
ali@0
  2670
           or punctuation:
ali@0
  2671
              . . . was a problem,' he said
ali@0
  2672
              . . . was a problem,'"
ali@0
  2673
           it is probably (certainly?) a closequote.
ali@0
  2674
ali@0
  2675
           If it's at start of paragraph, it's probably an openquote.
ali@0
  2676
              (but watch dialect)
ali@0
  2677
ali@0
  2678
           Words with ' at beginning and end are probably quoted:
ali@0
  2679
               "You have the word 'chivalry' frequently on your lips."
ali@0
  2680
               (Not specifically implemented)
ali@0
  2681
           V.18 I'm glad I didn't implement this, 'cos it jest ain't so
ali@0
  2682
           where the convention is to punctuate outside the quotes.
ali@0
  2683
               'Come', he said, 'and join the party'.
ali@0
  2684
ali@0
  2685
           If it is followed by an alpha, and especially a capital:
ali@0
  2686
              'Hello,' called he.
ali@0
  2687
           it is either an openquote or dialect.
ali@0
  2688
ali@0
  2689
           Dialect breaks ALL the rules:
ali@0
  2690
                  A man's a man for a' that.
ali@0
  2691
                  "Aye, but 'tis all in the pas' now."
ali@0
  2692
                  "'Tis often the way," he said.
ali@0
  2693
                  'Ave a drink on me.
ali@0
  2694
ali@0
  2695
           This version looks to be an improvement, and produces
ali@0
  2696
           fewer false positives, but is still not perfect. The
ali@0
  2697
           'pack dogs' case still fools it, and dialect is still
ali@0
  2698
           a problem. Oh, well, it's an improvement, and I have
ali@0
  2699
           a weighted structure in place for refining guesses at
ali@0
  2700
           closequotes. Maybe next time, I'll add a bit of logic
ali@0
  2701
           where if there is an open quote and one that was guessed
ali@0
  2702
           to be a possessive apostrophe after s, I'll re-guess it
ali@0
  2703
           to be a closequote. Let's see how this one flies, first.
ali@0
  2704
ali@0
  2705
           (Afterview: it's still crap. Needs much work, and a deeper insight.)
ali@0
  2706
ali@0
  2707
           Released as .15
ali@0
  2708
ali@0
  2709
           TODO: More he/be checks. Can't be perfect - counterexamples:
ali@0
  2710
              I gave my son good advice: be married regardless of the world's opinion.
ali@0
  2711
              I gave my son good advice: he married regardless of the world's opinion.
ali@0
  2712
ali@0
  2713
              If by "primitive" be meant "crude", we can understand the sentence.
ali@0
  2714
              If by "primitive" he meant "crude", we can understand the sentence.
ali@0
  2715
ali@0
  2716
              No matter what be said, I must go on.
ali@0
  2717
              No matter what he said, I must go on.
ali@0
  2718
ali@0
  2719
              No value, however great, can be set upon them.
ali@0
  2720
              No value, however great, can he set upon them.
ali@0
  2721
ali@0
  2722
              Real-Life one from a DP International Weekly Miscellany:
ali@0
  2723
                He wandered through the forest without fear, sleeping
ali@0
  2724
                much, for in sleep be had companionship--the Great
ali@0
  2725
                Spirit teaching him what he should know in dreams.
ali@0
  2726
                That one found by jeebies, and it turned out to be "he".
ali@0
  2727
ali@0
  2728
ali@0
  2729
           ---------------
ali@0
  2730
ali@0
  2731
  07/01/01 Added -O option.
ali@0
  2732
           Improved singlequotes by reporting mismatched single quotes
ali@0
  2733
           only if an open_single_quotes was found.
ali@0
  2734
ali@0
  2735
           Released as .16
ali@0
  2736
ali@0
  2737
           ---------------
ali@0
  2738
ali@0
  2739
  08/27/01 Added -Y switch for Robert Rowe to allow his app to
ali@0
  2740
           catch the error output.
ali@0
  2741
ali@0
  2742
           Released as .17
ali@0
  2743
ali@0
  2744
           ---------------
ali@0
  2745
ali@0
  2746
  09/08/01 Added checking Capitals at start of paragraph, but not
ali@0
  2747
           checking them at start of sentence.
ali@0
  2748
ali@0
  2749
           TODO: Parse sentences out so can check reliably for start of
ali@0
  2750
                 sentence. Need a whole different approach for that.
ali@0
  2751
                 (Can't just rely on periods, since they are also
ali@0
  2752
                 used for abbreviations, etc.)
ali@0
  2753
ali@0
  2754
           Added checking for all vowels or all consonants in a word.
ali@0
  2755
ali@0
  2756
           While I was in, I added "ii" checking and "tl" at start of word.
ali@0
  2757
ali@0
  2758
           Added echoing of first line of paragraph when reporting
ali@0
  2759
           mismatched quotes or brackets (thanks to David Widger for the
ali@0
  2760
           suggestion)
ali@0
  2761
ali@0
  2762
           Not querying L at start of a number (used for British pounds).
ali@0
  2763
ali@0
  2764
           The spelling changes are sort of half-done but released anyway
ali@0
  2765
           Skipped .18 because I had given out a couple of test versions
ali@0
  2766
           with that number.
ali@0
  2767
ali@0
  2768
  09/25/01 Released as .19
ali@0
  2769
ali@0
  2770
           ---------------
ali@0
  2771
ali@0
  2772
           TODO:
ali@0
  2773
           Use the logic from my new version of safewrap to stop querying
ali@0
  2774
             short lines like poems and TOCs.
ali@0
  2775
           Ignore non-standard ellipses like .  .  . or ...
ali@0
  2776
ali@0
  2777
ali@0
  2778
           ---------------
ali@0
  2779
  10/01/01 Made any line over 80 a VERY long line (was 85).
ali@0
  2780
           Recognized openquotes on indented paragraphs as continuations
ali@0
  2781
               of the same speech.
ali@0
  2782
           Added "cf" to the okword list (how did I forget _that_?) and a few others.
ali@0
  2783
           Moved abbrev to okword and made it more general.
ali@0
  2784
           Removed requirement that PG_space_emdash be greater than
ali@0
  2785
               ten before turning off warnings about spaced dashes.
ali@0
  2786
           Added period to list of characters that might constitute a separator line.
ali@0
  2787
           Now checking for double punctuation (Thanks, David!)
ali@0
  2788
           Now if two spaced em-dashes on a line, reports both. (DW)
ali@0
  2789
           Bug: Wasn't catching spaced punctuation at line-end since I
ali@0
  2790
               added flgets in version .13 - fixed.
ali@0
  2791
           Bug: Wasn't catching spaced singlequotes - fixed
ali@0
  2792
           Now reads punctuated numbers like 1,000 as a single word.
ali@0
  2793
               (Used to give "standalone 1" type  queries)
ali@0
  2794
           Changed paranoid mode - not including s and p options. -ex is now quite usable.
ali@0
  2795
           Bug: was calling `"For it is perfectly impossible,"    Unspaced Quotes - fixed
ali@0
  2796
           Bug: Sometimes gave _next_ line number for queried word at end of line - fixed
ali@0
  2797
ali@0
  2798
  10/22/01 Released as .20
ali@0
  2799
ali@0
  2800
           ---------------
ali@0
  2801
ali@0
  2802
           Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!)
ali@0
  2803
           Reduced the number of hi-bit letters needed to stop reporting them
ali@0
  2804
               from 1/20 to 1/100 or 200 in total.
ali@0
  2805
           Added PG footer check.
ali@0
  2806
           Added the -h switch.
ali@0
  2807
           Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10
ali@0
  2808
           Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23"
ali@0
  2809
           Added unspaced brackets check when surrounded by alpha.
ali@0
  2810
           Removed all typo reporting unless the typo switch is on.
ali@0
  2811
           Added gcisalpha to ease over-reporting of 8-bit queries.
ali@0
  2812
           ECHO_SWITCH is now ON by default!
ali@0
  2813
           PARANOID_SWITCH is now ON by default!
ali@0
  2814
           Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg)
ali@0
  2815
           Checking for standalone lowercase "l"
ali@0
  2816
           Checking for standalone lowercase "s"
ali@0
  2817
           Considering "is be" and "be is" "be was" "was be" as he/be errors
ali@0
  2818
           Looking at punct at end of para
ali@0
  2819
ali@0
  2820
  01/20/02 Released as .21
ali@0
  2821
ali@0
  2822
           ---------------
ali@0
  2823
ali@0
  2824
           Added VERBOSE_SWITCH to make it list everything. (George Davis)
ali@0
  2825
ali@0
  2826
           ---------------
ali@0
  2827
ali@0
  2828
  02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have.
ali@0
  2829
           after which
ali@0
  2830
           This line caused a coredump on Solaris - fixed.
ali@0
  2831
                Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe
ali@0
  2832
  03/09/02 Changed header recognition for another header change
ali@0
  2833
           Called it .24
ali@0
  2834
  03/29/02 Added qword[][] so I can suppress massive overreporting
ali@0
  2835
           of queried "words" like "FN", "Wm.", "th'", people's 
ali@0
  2836
           initials, chemical formulae and suchlike in some texts.
ali@0
  2837
           Called it .25
ali@0
  2838
  04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed.
ali@0
  2839
           Added linecounts in overview mode.
ali@0
  2840
           Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done.
ali@0
  2841
           "m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that?
ali@0
  2842
  07/07/02 Added GPL.
ali@0
  2843
           Added checking for broken em-dash at line-end (enddash)
ali@0
  2844
           Released as 0.95
ali@0
  2845
  08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo.
ali@0
  2846
           Released as 0.96
ali@0
  2847
  10/10/02 Suppressing some annoying multiple reports by default:
ali@0
  2848
           Standalone Ones, Asterisks, Square Brackets.
ali@0
  2849
              Digit 1 occurs often in many scientific texts.
ali@0
  2850
              Asterisk occurs often in multi-footnoted texts.
ali@0
  2851
              Mismatch Square Brackets occurs often in multi-para footnotes.
ali@0
  2852
           Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil.
ali@0
  2853
              . . . but it does more or less work for the main cases.
ali@0
  2854
           Removed uppercase within a word as a separate category so
ali@0
  2855
           that names like VanAllen get reported only once, like other
ali@0
  2856
           suspected typos.
ali@0
  2857
  11/24/02 Fixed - -m switch wasn't looking at htmlnum in
ali@0
  2858
           loseentities (Thanks, Brett!)
ali@0
  2859
           Fixed bug which occasionally gave false warning of
ali@0
  2860
           paragraph starting with lowercase.
ali@0
  2861
           Added underscore as character not to query around doublequotes.
ali@0
  2862
           Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859"
ali@0
  2863
           . . . this is to help detect things like CP1252 characters.
ali@0
  2864
           Released as 0.97
ali@0
  2865
ali@0
  2866
  12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell,
ali@0
  2867
           for doublequotes only. Replaces "Spaced quote", since it also covers that
ali@0
  2868
           case.
ali@0
  2869
           Added "warn_hyphen" to ease over-reporting of hyphens.
ali@0
  2870
ali@0
  2871
  12/20/02 Added "extra period" checks.
ali@0
  2872
           Added single character line check
ali@0
  2873
           Added I" check - is usually an exclam
ali@0
  2874
           Released as 0.98
ali@0
  2875
ali@0
  2876
  1/5/03   Eeek! Left in a lowerit(argv[0]) at the start before procfile()
ali@0
  2877
           from when I was looking at ways to identify markup. Refuses to
ali@0
  2878
           open files for *nix users with upcase in the filenames. Removed.
ali@0
  2879
           Fixed quickly and released as 0.981
ali@0
  2880
ali@0
  2881
  1/8/03   Added "arid" to the list of typos, slightly against my better
ali@0
  2882
           judgement, but the DP gang are all excited about it. :-)
ali@0
  2883
           Added a check for comma followed by capital letter, where
ali@0
  2884
           a period has OCRed into a comma. (DW). Not sure about this
ali@0
  2885
           either; we'll see.
ali@0
  2886
           Compiling for Win32 to allow longfilenames.
ali@0
  2887
ali@0
  2888
  6/1/04   A messy test release for DW to include the "gutcheck.typ"
ali@0
  2889
           process. And the gutcheck.jee trials. Removed "arid" --
ali@0
  2890
           it can go in gutcheck.typ
ali@0
  2891
ali@0
  2892
           Added checks for carets ^ and slants / but disabling slant
ali@0
  2893
           queries if more than 20 of them, because some people use them
ali@0
  2894
           for /italics/. Slants are commonly mistaken italic "I"s.
ali@0
  2895
ali@0
  2896
           Later: removed gutcheck.jee -- wrote jeebies instead.
ali@0
  2897
ali@0
  2898
Random TODO: 
ali@0
  2899
           Check brackets more closely, like quotes, so that it becomes
ali@0
  2900
           easy to find the error in long paragraphs full of brackets.
ali@0
  2901
ali@0
  2902
ali@0
  2903
  11/4/04  Assorted cleanup. Fixed case where text started with an
ali@0
  2904
           unbalanced paragraph.
ali@0
  2905
ali@0
  2906
  1/2/05   Has it really been that long? Added "nocomma", "noperiod" check.
ali@0
  2907
           Bits and pieces: improved isroman(). Added isletter().
ali@0
  2908
           Other stuff I never noted before this.
ali@0
  2909
ali@0
  2910
  7/3/05   Stuck in a quick start on DP-markup ignoring 
ali@0
  2911
           at BillFlis's suggestion.
ali@0
  2912
ali@0
  2913
  1/23/06  Took out nocomma etc if typos are off. Why did I ever leave that in?
ali@0
  2914
           Don't count footer for dotcomma etc.
ali@0
  2915
ali@0
  2916
ali@0
  2917
1       I
ali@0
  2918
ail     all
ali@0
  2919
arc     are
ali@0
  2920
arid    and
ali@0
  2921
bad     had
ali@0
  2922
ball    hall
ali@0
  2923
band    hand
ali@0
  2924
bar     her
ali@0
  2925
bat     but
ali@0
  2926
be      he
ali@0
  2927
bead    head
ali@0
  2928
beads   heads
ali@0
  2929
bear    hear
ali@0
  2930
bit     hit
ali@0
  2931
bo      be
ali@0
  2932
boon    been
ali@0
  2933
borne   home
ali@0
  2934
bow     how
ali@0
  2935
bumbled humbled
ali@0
  2936
car     ear
ali@0
  2937
carnage carriage
ali@0
  2938
carne   came
ali@0
  2939
cast    east
ali@0
  2940
cat     cut
ali@0
  2941
cat     eat
ali@0
  2942
cheek   check
ali@0
  2943
clay    day
ali@0
  2944
coining coming
ali@0
  2945
comer   corner
ali@0
  2946
die     she
ali@0
  2947
docs    does
ali@0
  2948
ease    case
ali@0
  2949
fail    fall
ali@0
  2950
fee     he
ali@0
  2951
haying  having
ali@0
  2952
ho      he
ali@0
  2953
ho      who
ali@0
  2954
hut     but
ali@0
  2955
is      as
ali@0
  2956
lie     he
ali@0
  2957
lime    time
ali@0
  2958
loth    10th
ali@0
  2959
m       in
ali@0
  2960
modem   modern
ali@0
  2961
Ms      his
ali@0
  2962
ray     away
ali@0
  2963
ray     my
ali@0
  2964
ringer  finger
ali@0
  2965
ringers fingers
ali@0
  2966
rioted  noted
ali@0
  2967
tho     the
ali@0
  2968
tie     he
ali@0
  2969
tie     the
ali@0
  2970
tier    her
ali@0
  2971
tight   right
ali@0
  2972
tile    the
ali@0
  2973
tiling  thing
ali@0
  2974
tip     up
ali@0
  2975
tram    train
ali@0
  2976
tune    time
ali@0
  2977
u       "
ali@0
  2978
wen     well
ali@0
  2979
yon     you
ali@0
  2980
ali@0
  2981
*********************************************************************/
ali@0
  2982