bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sun May 26 21:33:49 2013 +0100 (2013-05-26)
changeset 63 0a4f8d73b27f
parent 62 d4a66d961a69
child 64 a83ca5ff8511
permissions -rw-r--r--
Break check_for_html_tag() out
ali@0
     1
/*************************************************************************/
ali@40
     2
/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
ali@0
     3
/*                                                                       */
ali@0
     4
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */
ali@40
     5
/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */
ali@0
     6
/*                                                                       */
ali@0
     7
/* This program is free software; you can redistribute it and/or modify  */
ali@0
     8
/* it under the terms of the GNU General Public License as published by  */
ali@0
     9
/* the Free Software Foundation; either version 2 of the License, or     */
ali@0
    10
/* (at your option) any later version.                                   */
ali@0
    11
/*                                                                       */
ali@0
    12
/* This program is distributed in the hope that it will be useful,       */
ali@0
    13
/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
ali@40
    14
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
ali@0
    15
/* GNU General Public License for more details.                          */
ali@0
    16
/*                                                                       */
ali@0
    17
/* You should have received a copy of the GNU General Public License     */
ali@40
    18
/* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
ali@0
    19
/*************************************************************************/
ali@0
    20
ali@0
    21
#include <stdio.h>
ali@0
    22
#include <stdlib.h>
ali@0
    23
#include <string.h>
ali@0
    24
#include <ctype.h>
ali@0
    25
ali@0
    26
#define MAXWORDLEN    80    /* max length of one word             */
ali@0
    27
#define LINEBUFSIZE 2048    /* buffer size for an input line      */
ali@0
    28
ali@0
    29
#define MAX_USER_TYPOS 1000
ali@0
    30
#define USERTYPO_FILE "gutcheck.typ"
ali@0
    31
ali@0
    32
#ifndef MAX_PATH
ali@0
    33
#define MAX_PATH 16384
ali@0
    34
#endif
ali@0
    35
ali@0
    36
char aline[LINEBUFSIZE];
ali@0
    37
char prevline[LINEBUFSIZE];
ali@0
    38
ali@40
    39
/*
 * Built-in list of common typos and scannos, all in lower case.
 * The list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot",
    "fo", "thet", "ane", "nad", "te", "ig", "acn", "ahve",
    "alot", "anbd", "andt", "awya", "aywa", "bakc", "om", "btu",
    "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt",
    "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse", "esle", "eyt",
    "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat",
    "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat",
    "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth",
    "jsut", "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo",
    "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk", "owuld",
    "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent",
    "poeple",
    "porblem", "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae",
    "smoe", "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht",
    "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
    "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta",
    "timne", "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou",
    "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic",
    "whcih", "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull",
    "wnat", "wnated", "wnats", "woh", "wohle", "wokr", "woudl", "wriet",
    "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera", "yeras",
    "yersa", "yoiu", "youve", "ytou", "yuor", "abead", "ahle", "ahout",
    "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind",
    "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome",
    "bouse", "bowever", "buge", "dehates", "deht", "han", "hecause", "hecome",
    "heen", "hefore", "hegan", "hegin", "heing", "helieve", "henefit",
    "hetter",
    "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn",
    "joh", "meanwbile", "memher", "memhers", "numher", "numhers", "perbaps",
    "prohlem",
    "puhlic", "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd",
    "aud", "prornise", "prornised", "modem", "bo", "heside", "chapteb",
    "chaptee",
    "se",
    ""    /* end-of-list marker */
};
ali@0
    72
ali@0
    73
char *usertypo[MAX_USER_TYPOS];
ali@0
    74
ali@40
    75
/*
 * Words that must not be queried as typos: common abbreviations and
 * a handful of other acceptable words. Lower case, terminated by an
 * empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr",
    "hmm", "h'm", "hmmm", "rd", "sh", "br", "pp", "hm",
    "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii", "hawaii", "hawaiian",
    "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten",
    ""    /* end-of-list marker */
};
ali@0
    82
ali@40
    83
/*
 * Abbreviations that legitimately end in a period and so account for
 * periods that would otherwise look unexplained. Lower case, terminated
 * by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid",
    "ed", "al", "etc", "op", "cit", "deg",
    "min", "chap", "oz", "mme", "mlle", "mssrs",
    ""    /* end-of-list marker */
};
ali@0
    88
ali@40
    89
/*
 * Letter pairs that rarely, if ever, begin a word but which turn up
 * as common scannos or otherwise common letter combinations.
 * Terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb",
    "tl", "tn", "rn", "lt", "tj",
    ""    /* end-of-list marker */
};
ali@0
    96
ali@40
    97
/*
 * Letter pairs that rarely, if ever, finish a word but which turn up
 * as common scannos or otherwise common letter combinations.
 * Terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh",
    "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy",
    ""    /* end-of-list marker */
};
ali@0
   105
ali@40
   106
/*
 * HTML element names, in lower case and without angle brackets.
 * Terminated by an empty string.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col",
    "div", "em", "font",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "head", "hr", "html", "i", "img", "li", "meta", "ol",
    "p", "pre", "small", "span", "strong", "sub", "sup",
    "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul",
    ""    /* end-of-list marker */
};
ali@0
   112
ali@40
   113
/*
 * DP-specific markup sequences (see the D switch, "ignore DP-specific
 * markup"). Terminated by an empty string.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>",
    "/*", "*/",
    "/#", "#/",
    "/$", "$/",
    "<tb>",
    ""    /* end-of-list marker */
};
ali@0
   116
ali@40
   117
/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): presumably words that should not be directly followed
 * by a comma -- confirm against the check that consults this table.
 *
 * "mrs" used to be listed twice; the redundant second entry has been
 * dropped.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among",
    ""    /* end-of-list marker */
};
ali@0
   123
ali@40
   124
/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): presumably words that should not be directly followed
 * by a period -- confirm against the check that consults this table.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our",
    "my", "or", "and", "but", "as", "if", "the",
    "its", "it's", "until", "than", "whether", "i'll", "whose",
    "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence",
    ""    /* end-of-list marker */
};
ali@0
   130
ali@40
   131
/*
 * Lower-case vowels: the five ASCII vowels followed by the accented
 * vowels and the ae ligature, spelled out as ISO-8859-1 byte values so
 * that the compiled table does not depend on the encoding this source
 * file happens to be saved in.
 * NOTE(review): assumes text is handled as single 8-bit ISO-8859-1
 * characters (the gcis*() helpers take unsigned char) -- confirm.
 */
char vowels[] = "aeiou"
    "\xe0\xe1\xe2\xe3\xe4\xe6"    /* a: grave acute circumflex tilde diaeresis; ae */
    "\xe8\xe9\xea\xeb"            /* e: grave acute circumflex diaeresis */
    "\xec\xed\xee\xef"            /* i: grave acute circumflex diaeresis */
    "\xf2\xf3\xf4\xf5\xf6"        /* o: grave acute circumflex tilde diaeresis */
    "\xf9\xfa\xfb\xfc";           /* u: grave acute circumflex diaeresis */
ali@0
   132
ali@0
   133
/*
 * HTML character entities and their plain-text stand-ins.
 *
 *   htmlent - the named form, e.g. "&amp;"
 *   htmlnum - the numeric form, e.g. "&#38;"
 *   textent - the text to use in place of either form
 *
 * The table ends with a row of empty strings. Every row is fully braced
 * and the terminating row initialises all three members, so no member is
 * ever a null pointer. "&deg;" and "&pound;" each appear twice with
 * identical replacements; the order of rows is unchanged.
 */
struct {
    char *htmlent;
    char *htmlnum;
    char *textent;
} entities[] = {
    { "&amp;",    "&#38;",   "&" },
    { "&lt;",     "&#60;",   "<" },
    { "&gt;",     "&#62;",   ">" },
    { "&deg;",    "&#176;",  " degrees" },
    { "&pound;",  "&#163;",  "L" },
    { "&quot;",   "&#34;",   "\"" },  /* quotation mark = APL quote */
    { "&OElig;",  "&#338;",  "OE" },  /* latin capital ligature OE */
    { "&oelig;",  "&#339;",  "oe" },  /* latin small ligature oe */
    { "&Scaron;", "&#352;",  "S" },   /* latin capital letter S with caron */
    { "&scaron;", "&#353;",  "s" },   /* latin small letter s with caron */
    { "&Yuml;",   "&#376;",  "Y" },   /* latin capital letter Y with diaeresis */
    { "&circ;",   "&#710;",  "" },    /* modifier letter circumflex accent */
    { "&tilde;",  "&#732;",  "~" },   /* small tilde, U+02DC ISOdia */
    { "&ensp;",   "&#8194;", " " },   /* en space, U+2002 ISOpub */
    { "&emsp;",   "&#8195;", " " },   /* em space, U+2003 ISOpub */
    { "&thinsp;", "&#8201;", " " },   /* thin space, U+2009 ISOpub */
    { "&ndash;",  "&#8211;", "-" },   /* en dash, U+2013 ISOpub */
    { "&mdash;",  "&#8212;", "--" },  /* em dash, U+2014 ISOpub */
    { "&rsquo;",  "&#8217;", "'" },   /* right single quotation mark */
    { "&sbquo;",  "&#8218;", "'" },   /* single low-9 quotation mark */
    { "&ldquo;",  "&#8220;", "\"" },  /* left double quotation mark */
    { "&rdquo;",  "&#8221;", "\"" },  /* right double quotation mark */
    { "&bdquo;",  "&#8222;", "\"" },  /* double low-9 quotation mark */
    { "&lsaquo;", "&#8249;", "\"" },  /* single left-pointing angle quotation mark */
    { "&rsaquo;", "&#8250;", "\"" },  /* single right-pointing angle quotation mark */
    { "&nbsp;",   "&#160;",  " " },   /* no-break space = non-breaking space, */
    { "&iexcl;",  "&#161;",  "!" },   /* inverted exclamation mark */
    { "&cent;",   "&#162;",  "c" },   /* cent sign */
    { "&pound;",  "&#163;",  "L" },   /* pound sign */
    { "&curren;", "&#164;",  "$" },   /* currency sign */
    { "&yen;",    "&#165;",  "Y" },   /* yen sign = yuan sign */
    { "&sect;",   "&#167;",  "--" },  /* section sign */
    { "&uml;",    "&#168;",  " " },   /* diaeresis = spacing diaeresis */
    { "&copy;",   "&#169;",  "(C) " }, /* copyright sign */
    { "&ordf;",   "&#170;",  " " },   /* feminine ordinal indicator */
    { "&laquo;",  "&#171;",  "\"" },  /* left-pointing double angle quotation mark */
    { "&shy;",    "&#173;",  "-" },   /* soft hyphen = discretionary hyphen */
    { "&reg;",    "&#174;",  "(R) " }, /* registered sign = registered trade mark sign */
    { "&macr;",   "&#175;",  " " },   /* macron = spacing macron = overline */
    { "&deg;",    "&#176;",  " degrees" }, /* degree sign */
    { "&plusmn;", "&#177;",  "+-" },  /* plus-minus sign = plus-or-minus sign */
    { "&sup2;",   "&#178;",  "2" },   /* superscript two = superscript digit two */
    { "&sup3;",   "&#179;",  "3" },   /* superscript three = superscript digit three */
    { "&acute;",  "&#180;",  " " },   /* acute accent = spacing acute */
    { "&micro;",  "&#181;",  "m" },   /* micro sign */
    { "&para;",   "&#182;",  "--" },  /* pilcrow sign = paragraph sign */
    { "&cedil;",  "&#184;",  " " },   /* cedilla = spacing cedilla */
    { "&sup1;",   "&#185;",  "1" },   /* superscript one = superscript digit one */
    { "&ordm;",   "&#186;",  " " },   /* masculine ordinal indicator */
    { "&raquo;",  "&#187;",  "\"" },  /* right-pointing double angle quotation mark */
    { "&frac14;", "&#188;",  "1/4" }, /* vulgar fraction one quarter */
    { "&frac12;", "&#189;",  "1/2" }, /* vulgar fraction one half */
    { "&frac34;", "&#190;",  "3/4" }, /* vulgar fraction three quarters */
    { "&iquest;", "&#191;",  "?" },   /* inverted question mark */
    { "&Agrave;", "&#192;",  "A" },   /* latin capital letter A with grave */
    { "&Aacute;", "&#193;",  "A" },   /* latin capital letter A with acute */
    { "&Acirc;",  "&#194;",  "A" },   /* latin capital letter A with circumflex */
    { "&Atilde;", "&#195;",  "A" },   /* latin capital letter A with tilde */
    { "&Auml;",   "&#196;",  "A" },   /* latin capital letter A with diaeresis */
    { "&Aring;",  "&#197;",  "A" },   /* latin capital letter A with ring above */
    { "&AElig;",  "&#198;",  "AE" },  /* latin capital letter AE */
    { "&Ccedil;", "&#199;",  "C" },   /* latin capital letter C with cedilla */
    { "&Egrave;", "&#200;",  "E" },   /* latin capital letter E with grave */
    { "&Eacute;", "&#201;",  "E" },   /* latin capital letter E with acute */
    { "&Ecirc;",  "&#202;",  "E" },   /* latin capital letter E with circumflex */
    { "&Euml;",   "&#203;",  "E" },   /* latin capital letter E with diaeresis */
    { "&Igrave;", "&#204;",  "I" },   /* latin capital letter I with grave */
    { "&Iacute;", "&#205;",  "I" },   /* latin capital letter I with acute */
    { "&Icirc;",  "&#206;",  "I" },   /* latin capital letter I with circumflex */
    { "&Iuml;",   "&#207;",  "I" },   /* latin capital letter I with diaeresis */
    { "&ETH;",    "&#208;",  "E" },   /* latin capital letter ETH */
    { "&Ntilde;", "&#209;",  "N" },   /* latin capital letter N with tilde */
    { "&Ograve;", "&#210;",  "O" },   /* latin capital letter O with grave */
    { "&Oacute;", "&#211;",  "O" },   /* latin capital letter O with acute */
    { "&Ocirc;",  "&#212;",  "O" },   /* latin capital letter O with circumflex */
    { "&Otilde;", "&#213;",  "O" },   /* latin capital letter O with tilde */
    { "&Ouml;",   "&#214;",  "O" },   /* latin capital letter O with diaeresis */
    { "&times;",  "&#215;",  "*" },   /* multiplication sign */
    { "&Oslash;", "&#216;",  "O" },   /* latin capital letter O with stroke */
    { "&Ugrave;", "&#217;",  "U" },   /* latin capital letter U with grave */
    { "&Uacute;", "&#218;",  "U" },   /* latin capital letter U with acute */
    { "&Ucirc;",  "&#219;",  "U" },   /* latin capital letter U with circumflex */
    { "&Uuml;",   "&#220;",  "U" },   /* latin capital letter U with diaeresis */
    { "&Yacute;", "&#221;",  "Y" },   /* latin capital letter Y with acute */
    { "&THORN;",  "&#222;",  "TH" },  /* latin capital letter THORN */
    { "&szlig;",  "&#223;",  "sz" },  /* latin small letter sharp s = ess-zed */
    { "&agrave;", "&#224;",  "a" },   /* latin small letter a with grave */
    { "&aacute;", "&#225;",  "a" },   /* latin small letter a with acute */
    { "&acirc;",  "&#226;",  "a" },   /* latin small letter a with circumflex */
    { "&atilde;", "&#227;",  "a" },   /* latin small letter a with tilde */
    { "&auml;",   "&#228;",  "a" },   /* latin small letter a with diaeresis */
    { "&aring;",  "&#229;",  "a" },   /* latin small letter a with ring above */
    { "&aelig;",  "&#230;",  "ae" },  /* latin small letter ae */
    { "&ccedil;", "&#231;",  "c" },   /* latin small letter c with cedilla */
    { "&egrave;", "&#232;",  "e" },   /* latin small letter e with grave */
    { "&eacute;", "&#233;",  "e" },   /* latin small letter e with acute */
    { "&ecirc;",  "&#234;",  "e" },   /* latin small letter e with circumflex */
    { "&euml;",   "&#235;",  "e" },   /* latin small letter e with diaeresis */
    { "&igrave;", "&#236;",  "i" },   /* latin small letter i with grave */
    { "&iacute;", "&#237;",  "i" },   /* latin small letter i with acute */
    { "&icirc;",  "&#238;",  "i" },   /* latin small letter i with circumflex */
    { "&iuml;",   "&#239;",  "i" },   /* latin small letter i with diaeresis */
    { "&eth;",    "&#240;",  "eth" }, /* latin small letter eth */
    { "&ntilde;", "&#241;",  "n" },   /* latin small letter n with tilde */
    { "&ograve;", "&#242;",  "o" },   /* latin small letter o with grave */
    { "&oacute;", "&#243;",  "o" },   /* latin small letter o with acute */
    { "&ocirc;",  "&#244;",  "o" },   /* latin small letter o with circumflex */
    { "&otilde;", "&#245;",  "o" },   /* latin small letter o with tilde */
    { "&ouml;",   "&#246;",  "o" },   /* latin small letter o with diaeresis */
    { "&divide;", "&#247;",  "/" },   /* division sign */
    { "&oslash;", "&#248;",  "o" },   /* latin small letter o with stroke */
    { "&ugrave;", "&#249;",  "u" },   /* latin small letter u with grave */
    { "&uacute;", "&#250;",  "u" },   /* latin small letter u with acute */
    { "&ucirc;",  "&#251;",  "u" },   /* latin small letter u with circumflex */
    { "&uuml;",   "&#252;",  "u" },   /* latin small letter u with diaeresis */
    { "&yacute;", "&#253;",  "y" },   /* latin small letter y with acute */
    { "&thorn;",  "&#254;",  "th" },  /* latin small letter thorn */
    { "&yuml;",   "&#255;",  "y" },   /* latin small letter y with diaeresis */
    { "",         "",        "" }     /* end-of-table marker */
};
ali@40
   258
ali@40
   259
/*
 * Special characters, written as character literals.
 * The values are the ASCII codes noted alongside.
 */
#define CHAR_SPACE         ' '     /*  32 */
#define CHAR_TAB           '\t'    /*   9 */
#define CHAR_LF            '\n'    /*  10 */
#define CHAR_CR            '\r'    /*  13 */
#define CHAR_DQUOTE        '"'     /*  34 */
#define CHAR_SQUOTE        '\''    /*  39 */
#define CHAR_OPEN_SQUOTE   '`'     /*  96 */
#define CHAR_TILDE         '~'     /* 126 */
#define CHAR_ASTERISK      '*'     /*  42 */
#define CHAR_FORESLASH     '/'     /*  47 */
#define CHAR_CARAT         '^'     /*  94 */

/* underscore and the three kinds of bracket */
#define CHAR_UNDERSCORE    '_'
#define CHAR_OPEN_CBRACK   '{'
#define CHAR_CLOSE_CBRACK  '}'
#define CHAR_OPEN_RBRACK   '('
#define CHAR_CLOSE_RBRACK  ')'
#define CHAR_OPEN_SBRACK   '['
#define CHAR_CLOSE_SBRACK  ']'
ali@0
   279
ali@40
   280
/* longest and shortest normal PG line lengths */
ali@0
   281
#define LONGEST_PG_LINE   75
ali@0
   282
#define WAY_TOO_LONG      80
ali@0
   283
#define SHORTEST_PG_LINE  55
ali@0
   284
ali@0
   285
/*
 * Command-line switch letters. The position of a letter in SWITCHES is
 * the index of the corresponding flag in pswit[].
 *
 *     D - ignore DP-specific markup
 *     E - echo queried line
 *     S - check single quotes
 *     T - check common typos
 *     P - require closure of quotes on every paragraph
 *     X - "Trust no one" :-) Paranoid! Queries everything
 *     L - line end checking defaults on; -L turns it off
 *     O - overview. Just shows counts.
 *     Y - puts errors to stdout instead of stderr
 *     H - Echoes header fields
 *     M - Ignore markup in < >
 *     U - Use file of User-defined Typos
 *     W - Defaults for use on Web upload
 *     V - Verbose - list EVERYTHING!
 */
#define SWITCHES "ESTPXLOYHWVMUD"

/*
 * Number of switches, used for defining array sizes. Derived from
 * SWITCHES so the two cannot drift apart; it is still an integer
 * constant expression of type int.
 */
#define SWITNO ((int)(sizeof(SWITCHES)-1))

#define MINARGS   1               /* minimum no of args excl switches      */
#define MAXARGS   1               /* maximum no of args excl switches      */
ali@0
   308
ali@0
   309
int pswit[SWITNO];                /* program switches set by SWITCHES      */
ali@0
   310
ali@0
   311
/*
 * Indices into pswit[]. Each value is the position of the matching
 * letter in the SWITCHES string "ESTPXLOYHWVMUD".
 */
#define ECHO_SWITCH       0    /* E */
#define SQUOTE_SWITCH     1    /* S */
#define TYPO_SWITCH       2    /* T */
#define QPARA_SWITCH      3    /* P */
#define PARANOID_SWITCH   4    /* X */
#define LINE_END_SWITCH   5    /* L */
#define OVERVIEW_SWITCH   6    /* O */
#define STDOUT_SWITCH     7    /* Y */
#define HEADER_SWITCH     8    /* H */
#define WEB_SWITCH        9    /* W */
#define VERBOSE_SWITCH   10    /* V */
#define MARKUP_SWITCH    11    /* M */
#define USERTYPO_SWITCH  12    /* U */
#define DP_SWITCH        13    /* D */
ali@0
   325
ali@0
   326
long cnt_dquot;       /* for overview mode, count of doublequote queries */
ali@0
   327
long cnt_squot;       /* for overview mode, count of singlequote queries */
ali@0
   328
long cnt_brack;       /* for overview mode, count of brackets queries */
ali@0
   329
long cnt_bin;         /* for overview mode, count of non-ASCII queries */
ali@0
   330
long cnt_odd;         /* for overview mode, count of odd character queries */
ali@0
   331
long cnt_long;        /* for overview mode, count of long line errors */
ali@0
   332
long cnt_short;       /* for overview mode, count of short line queries */
ali@0
   333
long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */
ali@0
   334
long cnt_dash;        /* for overview mode, count of dash-related queries */
ali@0
   335
long cnt_word;        /* for overview mode, count of word queries */
ali@0
   336
long cnt_html;        /* for overview mode, count of html queries */
ali@0
   337
long cnt_lineend;     /* for overview mode, count of line-end queries */
ali@40
   338
long cnt_spacend;     /* count of lines with space at end */
ali@0
   339
long linecnt;         /* count of total lines in the file */
ali@40
   340
long checked_linecnt; /* count of lines actually checked */
ali@0
   341
ali@0
   342
void proghelp(void);
ali@0
   343
void procfile(char *);
ali@0
   344
ali@0
   345
#define LOW_THRESHOLD    0
ali@0
   346
#define HIGH_THRESHOLD   1
ali@0
   347
ali@0
   348
#define START 0
ali@0
   349
#define END 1
ali@0
   350
#define PREV 0
ali@0
   351
#define NEXT 1
ali@0
   352
#define FIRST_OF_PAIR 0
ali@0
   353
#define SECOND_OF_PAIR 1
ali@0
   354
ali@0
   355
#define MAX_WORDPAIR 1000
ali@0
   356
ali@0
   357
char running_from[MAX_PATH];
ali@0
   358
ali@0
   359
int mixdigit(char *);
ali@54
   360
const char *getaword(const char *,char *);
ali@40
   361
int matchword(char *,char *);
ali@40
   362
char *flgets(char *,int,FILE *,long);
ali@0
   363
void lowerit(char *);
ali@0
   364
int gcisalpha(unsigned char);
ali@0
   365
int gcisdigit(unsigned char);
ali@0
   366
int gcisletter(unsigned char);
ali@40
   367
char *gcstrchr(char *s,char c);
ali@0
   368
void postprocess_for_HTML(char *);
ali@0
   369
char *linehasmarkup(char *);
ali@0
   370
char *losemarkup(char *);
ali@40
   371
int tagcomp(char *,char *);
ali@0
   372
char *loseentities(char *);
ali@0
   373
int isroman(char *);
ali@0
   374
int usertypo_count;
ali@0
   375
void postprocess_for_DP(char *);
ali@0
   376
ali@0
   377
char wrk[LINEBUFSIZE];
ali@0
   378
ali@40
   379
#define MAX_QWORD 50
ali@40
   380
#define MAX_QWORD_LENGTH 40
ali@0
   381
char qword[MAX_QWORD][MAX_QWORD_LENGTH];
ali@0
   382
signed int dupcnt[MAX_QWORD];
ali@0
   383
ali@40
   384
int main(int argc,char **argv)
ali@0
   385
{
ali@40
   386
    char *argsw,*s;
ali@40
   387
    int i,switno,invarg;
ali@0
   388
    char usertypo_file[MAX_PATH];
ali@0
   389
    FILE *usertypofile;
ali@40
   390
    if (strlen(argv[0])<sizeof(running_from))
ali@40
   391
	/* save the path to the executable */
ali@40
   392
        strcpy(running_from,argv[0]);
ali@0
   393
    /* find out what directory we're running from */
ali@40
   394
    s=running_from+strlen(running_from);
ali@40
   395
    for (;*s!='/' && *s!='\\' && s>=running_from;s--)
ali@40
   396
        *s=0;
ali@40
   397
    switno=strlen(SWITCHES);
ali@40
   398
    for (i=switno;--i>0;)
ali@40
   399
        pswit[i]=0;           /* initialise switches */
ali@40
   400
    /*
ali@40
   401
     * Standard loop to extract switches.
ali@40
   402
     * When we come out of this loop, the arguments will be
ali@40
   403
     * in argv[0] upwards and the switches used will be
ali@40
   404
     * represented by their equivalent elements in pswit[]
ali@40
   405
     */
ali@40
   406
    while (--argc>0 && **++argv=='-')
ali@40
   407
        for (argsw=argv[0]+1;*argsw!='\0';argsw++)
ali@40
   408
            for (i=switno,invarg=1;(--i>=0) && invarg==1;)
ali@40
   409
                if ((toupper(*argsw))==SWITCHES[i])
ali@40
   410
		{
ali@40
   411
                    invarg=0;
ali@40
   412
                    pswit[i]=1;
ali@40
   413
		}
ali@40
   414
    /* Paranoid checking is turned OFF, not on, by its switch */
ali@40
   415
    pswit[PARANOID_SWITCH]^=1;
ali@40
   416
    if (pswit[PARANOID_SWITCH])
ali@40
   417
	/* if running in paranoid mode force typo checks as well   */
ali@40
   418
        pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
ali@40
   419
    /* Line-end checking is turned OFF, not on, by its switch */
ali@40
   420
    pswit[LINE_END_SWITCH]^=1;
ali@40
   421
    /* Echoing is turned OFF, not on, by its switch */
ali@40
   422
    pswit[ECHO_SWITCH]^=1;
ali@40
   423
    if (pswit[OVERVIEW_SWITCH])
ali@40
   424
	/* just print summary; don't echo */
ali@40
   425
        pswit[ECHO_SWITCH]=0;
ali@40
   426
    /*
ali@40
   427
     * Web uploads - for the moment, this is really just a placeholder
ali@40
   428
     * until we decide what processing we really want to do on web uploads
ali@40
   429
     */
ali@40
   430
    if (pswit[WEB_SWITCH])
ali@40
   431
    {
ali@40
   432
	/* specific override for web uploads */
ali@40
   433
        pswit[ECHO_SWITCH]=1;
ali@40
   434
        pswit[SQUOTE_SWITCH]=0;
ali@40
   435
        pswit[TYPO_SWITCH]=1;
ali@40
   436
        pswit[QPARA_SWITCH]=0;
ali@40
   437
        pswit[PARANOID_SWITCH]=1;
ali@40
   438
        pswit[LINE_END_SWITCH]=0;
ali@40
   439
        pswit[OVERVIEW_SWITCH]=0;
ali@40
   440
        pswit[STDOUT_SWITCH]=0;
ali@40
   441
        pswit[HEADER_SWITCH]=1;
ali@40
   442
        pswit[VERBOSE_SWITCH]=0;
ali@40
   443
        pswit[MARKUP_SWITCH]=0;
ali@40
   444
        pswit[USERTYPO_SWITCH]=0;
ali@40
   445
        pswit[DP_SWITCH]=0;
ali@40
   446
    }
ali@40
   447
    if (argc<MINARGS || argc>MAXARGS)
ali@40
   448
    {
ali@40
   449
	/* check number of args */
ali@0
   450
        proghelp();
ali@40
   451
        return 1;
ali@40
   452
    }
ali@0
   453
    /* read in the user-defined stealth scanno list */
ali@40
   454
    if (pswit[USERTYPO_SWITCH])
ali@40
   455
    {
ali@40
   456
	/* ... we were told we had one! */
ali@40
   457
        usertypofile=fopen(USERTYPO_FILE,"rb");
ali@40
   458
        if (!usertypofile)
ali@40
   459
	{
ali@40
   460
	    /* not in cwd. try excuteable directory. */
ali@40
   461
            strcpy(usertypo_file,running_from);
ali@40
   462
            strcat(usertypo_file,USERTYPO_FILE);
ali@40
   463
            usertypofile=fopen(usertypo_file,"rb");
ali@40
   464
            if (!usertypofile) {
ali@40
   465
		/* we ain't got no user typo file! */
ali@40
   466
                printf("   --> I couldn't find gutcheck.typ "
ali@40
   467
		  "-- proceeding without user typos.\n");
ali@40
   468
	    }
ali@40
   469
	}
ali@40
   470
        usertypo_count=0;
ali@40
   471
        if (usertypofile)
ali@40
   472
	{
ali@40
   473
	    /* we managed to open a User Typo File! */
ali@40
   474
            if (pswit[USERTYPO_SWITCH])
ali@40
   475
	    {
ali@40
   476
                while (flgets(aline,LINEBUFSIZE-1,usertypofile,
ali@40
   477
		  (long)usertypo_count))
ali@40
   478
		{
ali@40
   479
                    if (strlen(aline)>1)
ali@40
   480
		    {
ali@40
   481
                        if ((int)*aline>33)
ali@40
   482
			{
ali@40
   483
                            s=malloc(strlen(aline)+1);
ali@40
   484
                            if (!s)
ali@40
   485
			    {
ali@40
   486
                                fprintf(stderr,"bookloupe: cannot get enough "
ali@40
   487
				  "memory for user typo file!\n");
ali@0
   488
                                exit(1);
ali@40
   489
			    }
ali@40
   490
                            strcpy(s,aline);
ali@40
   491
                            usertypo[usertypo_count]=s;
ali@0
   492
                            usertypo_count++;
ali@40
   493
                            if (usertypo_count>=MAX_USER_TYPOS)
ali@40
   494
			    {
ali@40
   495
                                printf("   --> Only %d user-defined typos "
ali@42
   496
				  "allowed: ignoring the rest\n",
ali@42
   497
				  MAX_USER_TYPOS);
ali@0
   498
                                break;
ali@40
   499
			    }
ali@40
   500
			}
ali@40
   501
		    }
ali@40
   502
		}
ali@40
   503
	    }
ali@0
   504
            fclose(usertypofile);
ali@40
   505
	}
ali@40
   506
    }
ali@40
   507
    fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@40
   508
    cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
ali@40
   509
    cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
ali@40
   510
    cnt_spacend=0;
ali@0
   511
    procfile(argv[0]);
ali@40
   512
    if (pswit[OVERVIEW_SWITCH])
ali@40
   513
    {
ali@40
   514
	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40
   515
	  checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@40
   516
        printf("    --------------- Queries found --------------\n");
ali@40
   517
        if (cnt_long)
ali@40
   518
	    printf("    Long lines:                    %14ld\n",cnt_long);
ali@40
   519
        if (cnt_short)
ali@40
   520
	    printf("    Short lines:                   %14ld\n",cnt_short);
ali@40
   521
        if (cnt_lineend)
ali@40
   522
	    printf("    Line-end problems:             %14ld\n",cnt_lineend);
ali@40
   523
        if (cnt_word)
ali@40
   524
	    printf("    Common typos:                  %14ld\n",cnt_word);
ali@40
   525
        if (cnt_dquot)
ali@40
   526
	    printf("    Unmatched quotes:              %14ld\n",cnt_dquot);
ali@40
   527
        if (cnt_squot)
ali@40
   528
	    printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);
ali@40
   529
        if (cnt_brack)
ali@40
   530
	    printf("    Unmatched brackets:            %14ld\n",cnt_brack);
ali@40
   531
        if (cnt_bin)
ali@40
   532
	    printf("    Non-ASCII characters:          %14ld\n",cnt_bin);
ali@40
   533
        if (cnt_odd)
ali@40
   534
	    printf("    Proofing characters:           %14ld\n",cnt_odd);
ali@40
   535
        if (cnt_punct)
ali@40
   536
	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@40
   537
        if (cnt_dash)
ali@40
   538
	    printf("    Non-standard dashes:           %14ld\n",cnt_dash);
ali@40
   539
        if (cnt_html)
ali@40
   540
	    printf("    Possible HTML tags:            %14ld\n",cnt_html);
ali@0
   541
        printf("\n");
ali@40
   542
        printf("    TOTAL QUERIES                  %14ld\n",
ali@40
   543
          cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
ali@40
   544
          cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40
   545
    }
ali@40
   546
    return 0;
ali@0
   547
}
ali@0
   548
ali@41
   549
/*
 * Tallies gathered by first_pass() while scanning the input file.
 * Apart from firstline and footerline (which are line numbers) every
 * member is a running count.
 */
struct first_pass_results {
    long firstline;             /* line after the last header marker seen */
    long astline;               /* lines with a '*' and a lowercase letter */
    long footerline;            /* line on which the footer was detected */
    long totlen;                /* total length of all tallied lines */
    long binlen;                /* characters with a value above 127 */
    long alphalen;              /* characters accepted by gcisalpha() */
    long endquote_count;        /* double quotes directly after a letter */
    long shortline;             /* short lines inside normal-length text */
    long dotcomma;              /* lines containing ".," */
    long fslashline;            /* lines containing '/' */
    long hyphens;               /* lines ending in a single hyphen */
    long longline;              /* lines longer than LONGEST_PG_LINE */
    long verylongline;          /* lines longer than WAY_TOO_LONG */
    long htmcount;              /* score for lines that look like markup */
    long standalone_digit;      /* words that are just "0" or "1" */
    long spacedash;             /* lines whose first " -" isn't " --" */
    long emdash;                /* lines containing "--" */
    long space_emdash;          /* first "--" has a space on either side */
    long non_PG_space_emdash;   /* first "--" has a space on both sides */
    long PG_space_emdash;       /* first "--" has no space on either side */
    signed int Dutchcount;      /* occurrences of "hij" or "niet" */
    signed int Frenchcount;     /* occurrences of "dans" or "avec" */
};
ali@41
   556
ali@40
   557
/*
ali@41
   558
 * first_pass:
ali@40
   559
 *
ali@41
   560
 * Run a first pass - verify that it's a valid PG
ali@41
   561
 * file, decide whether to report some things that
ali@41
   562
 * occur many times in the text like long or short
ali@41
   563
 * lines, non-standard dashes, etc.
ali@40
   564
 */
ali@41
   565
struct first_pass_results *first_pass(FILE *infile)
ali@0
   566
{
ali@54
   567
    char laststart=CHAR_SPACE;
ali@54
   568
    const char *s;
ali@41
   569
    signed int i,llen;
ali@41
   570
    unsigned int lastlen=0,lastblen=0;
ali@41
   571
    long spline=0,nspline=0;
ali@41
   572
    static struct first_pass_results results={0};
ali@41
   573
    char inword[MAXWORDLEN]="";
ali@40
   574
    while (fgets(aline,LINEBUFSIZE-1,infile))
ali@40
   575
    {
ali@40
   576
        while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
ali@40
   577
	    aline[strlen(aline)-1]=0;
ali@0
   578
        linecnt++;
ali@40
   579
        if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
ali@40
   580
	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
ali@40
   581
	{
ali@0
   582
            if (spline)
ali@0
   583
                printf("   --> Duplicate header?\n");
ali@40
   584
            spline=linecnt+1;   /* first line of non-header text, that is */
ali@40
   585
	}
ali@40
   586
        if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
ali@40
   587
	{
ali@0
   588
            if (nspline)
ali@0
   589
                printf("   --> Duplicate header?\n");
ali@40
   590
            nspline=linecnt+1;   /* first line of non-header text, that is */
ali@40
   591
	}
ali@40
   592
        if (spline || nspline)
ali@40
   593
	{
ali@0
   594
            lowerit(aline);
ali@40
   595
            if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
ali@40
   596
	    {
ali@40
   597
                if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
ali@40
   598
		{
ali@41
   599
                    if (results.footerline)
ali@40
   600
		    {
ali@40
   601
			/* it's an old-form header - we can detect duplicates */
ali@40
   602
                        if (!nspline)
ali@0
   603
                            printf("   --> Duplicate footer?\n");
ali@40
   604
		    }
ali@40
   605
                    else
ali@41
   606
                        results.footerline=linecnt;
ali@40
   607
		}
ali@40
   608
	    }
ali@40
   609
	}
ali@40
   610
        if (spline)
ali@41
   611
	    results.firstline=spline;
ali@40
   612
        if (nspline)
ali@41
   613
	    results.firstline=nspline;  /* override with new */
ali@41
   614
        if (results.footerline)
ali@40
   615
	    continue;    /* don't count the boilerplate in the footer */
ali@40
   616
        llen=strlen(aline);
ali@41
   617
        results.totlen+=llen;
ali@40
   618
        for (i=0;i<llen;i++)
ali@40
   619
	{
ali@40
   620
            if ((unsigned char)aline[i]>127)
ali@41
   621
		results.binlen++;
ali@40
   622
            if (gcisalpha(aline[i]))
ali@41
   623
		results.alphalen++;
ali@40
   624
            if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@41
   625
		results.endquote_count++;
ali@40
   626
	}
ali@40
   627
        if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
ali@40
   628
	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41
   629
	    results.shortline++;
ali@40
   630
        if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
ali@40
   631
	    cnt_spacend++;
ali@40
   632
        if (strstr(aline,".,"))
ali@41
   633
	    results.dotcomma++;
ali@40
   634
        /* only count ast lines for ignoring purposes where there is */
ali@0
   635
        /* locase text on the line */
ali@40
   636
        if (strstr(aline,"*"))
ali@40
   637
	{
ali@40
   638
            for (s=aline;*s;s++)
ali@40
   639
                if (*s>='a' && *s<='z')
ali@0
   640
                    break;
ali@40
   641
             if (*s)
ali@41
   642
		results.astline++;
ali@40
   643
	}
ali@40
   644
        if (strstr(aline,"/"))
ali@41
   645
            results.fslashline++;
ali@40
   646
        for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@40
   647
	    ;
ali@40
   648
        if (aline[i]=='-' && aline[i-1]!='-')
ali@41
   649
	    results.hyphens++;
ali@40
   650
        if (llen>LONGEST_PG_LINE)
ali@41
   651
	    results.longline++;
ali@40
   652
        if (llen>WAY_TOO_LONG)
ali@41
   653
	    results.verylongline++;
ali@40
   654
        if (strstr(aline,"<") && strstr(aline,">"))
ali@40
   655
	{
ali@40
   656
            i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
ali@40
   657
            if (i>0)
ali@41
   658
                results.htmcount++;
ali@40
   659
            if (strstr(aline,"<i>"))
ali@41
   660
		results.htmcount+=4; /* bonus marks! */
ali@40
   661
	}
ali@0
   662
        /* Check for spaced em-dashes */
ali@40
   663
        if (strstr(aline,"--"))
ali@40
   664
	{
ali@41
   665
            results.emdash++;
ali@40
   666
            if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
ali@40
   667
               (*(strstr(aline,"--")+2)==CHAR_SPACE))
ali@41
   668
		results.space_emdash++;
ali@40
   669
            if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
ali@40
   670
               (*(strstr(aline,"--")+2)==CHAR_SPACE))
ali@40
   671
		/* count of em-dashes with spaces both sides */
ali@41
   672
		results.non_PG_space_emdash++;
ali@40
   673
            if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
ali@40
   674
               (*(strstr(aline,"--")+2)!=CHAR_SPACE))
ali@40
   675
		/* count of PG-type em-dashes with no spaces */
ali@41
   676
		results.PG_space_emdash++;
ali@40
   677
	}
ali@40
   678
        for (s=aline;*s;)
ali@40
   679
	{
ali@40
   680
            s=getaword(s,inword);
ali@40
   681
            if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
ali@41
   682
                results.Dutchcount++;
ali@40
   683
            if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
ali@41
   684
                results.Frenchcount++;
ali@40
   685
            if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
ali@41
   686
                results.standalone_digit++;
ali@40
   687
	}
ali@0
   688
        /* Check for spaced dashes */
ali@40
   689
        if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
ali@41
   690
	    results.spacedash++;
ali@40
   691
        lastblen=lastlen;
ali@40
   692
        lastlen=strlen(aline);
ali@40
   693
        laststart=aline[0];
ali@40
   694
    }
ali@41
   695
    return &results;
ali@41
   696
}
ali@41
   697
ali@42
   698
/*
 * Decisions taken by report_first_pass(). Each reporting flag is 1
 * when that class of query should be reported and 0 when it has been
 * suppressed; isDutch and isFrench are 1 when that language has been
 * detected.
 */
struct warnings {
    signed int shortline;       /* report short lines */
    signed int longline;        /* report long lines */
    signed int bin;             /* report hi-bit characters */
    signed int dash;            /* report spaced dashes and em-dashes */
    signed int dotcomma;        /* report ".," */
    signed int ast;             /* report asterisks */
    signed int fslash;          /* report forward slashes */
    signed int digit;           /* report standalone 0s and 1s */
    signed int hyphen;          /* report hyphens at end of line */
    signed int endquote;        /* report unpunctuated endquotes */
    signed int isDutch;         /* text looks like Dutch */
    signed int isFrench;        /* text looks like French */
};
ali@42
   702
ali@42
   703
/*
ali@42
   704
 * report_first_pass:
ali@42
   705
 *
ali@42
   706
 * Make some snap decisions based on the first pass results.
ali@42
   707
 */
ali@42
   708
struct warnings *report_first_pass(struct first_pass_results *results)
ali@42
   709
{
ali@42
   710
    static struct warnings warnings={0};
ali@42
   711
    if (cnt_spacend>0)
ali@42
   712
        printf("   --> %ld lines in this file have white space at end\n",
ali@42
   713
	  cnt_spacend);
ali@42
   714
    warnings.dotcomma=1;
ali@42
   715
    if (results->dotcomma>5)
ali@42
   716
    {
ali@42
   717
        warnings.dotcomma=0;
ali@42
   718
        printf("   --> %ld lines in this file contain '.,'. "
ali@42
   719
	  "Not reporting them.\n",results->dotcomma);
ali@42
   720
    }
ali@42
   721
    /*
ali@42
   722
     * If more than 50 lines, or one-tenth, are short,
ali@42
   723
     * don't bother reporting them.
ali@42
   724
     */
ali@42
   725
    warnings.shortline=1;
ali@42
   726
    if (results->shortline>50 || results->shortline*10>linecnt)
ali@42
   727
    {
ali@42
   728
        warnings.shortline=0;
ali@42
   729
        printf("   --> %ld lines in this file are short. "
ali@42
   730
	  "Not reporting short lines.\n",results->shortline);
ali@42
   731
    }
ali@42
   732
    /*
ali@42
   733
     * If more than 50 lines, or one-tenth, are long,
ali@42
   734
     * don't bother reporting them.
ali@42
   735
     */
ali@42
   736
    warnings.longline=1;
ali@42
   737
    if (results->longline>50 || results->longline*10>linecnt)
ali@42
   738
    {
ali@42
   739
        warnings.longline=0;
ali@42
   740
        printf("   --> %ld lines in this file are long. "
ali@42
   741
	  "Not reporting long lines.\n",results->longline);
ali@42
   742
    }
ali@42
   743
    /* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42
   744
    warnings.ast=1;
ali@42
   745
    if (results->astline>10)
ali@42
   746
    {
ali@42
   747
        warnings.ast=0;
ali@42
   748
        printf("   --> %ld lines in this file contain asterisks. "
ali@42
   749
	  "Not reporting them.\n",results->astline);
ali@42
   750
    }
ali@42
   751
    /*
ali@42
   752
     * If more than 10 lines contain forward slashes,
ali@42
   753
     * don't bother reporting them.
ali@42
   754
     */
ali@42
   755
    warnings.fslash=1;
ali@42
   756
    if (results->fslashline>10)
ali@42
   757
    {
ali@42
   758
        warnings.fslash=0;
ali@42
   759
        printf("   --> %ld lines in this file contain forward slashes. "
ali@42
   760
	  "Not reporting them.\n",results->fslashline);
ali@42
   761
    }
ali@42
   762
    /*
ali@42
   763
     * If more than 20 lines contain unpunctuated endquotes,
ali@42
   764
     * don't bother reporting them.
ali@42
   765
     */
ali@42
   766
    warnings.endquote=1;
ali@42
   767
    if (results->endquote_count>20)
ali@42
   768
    {
ali@42
   769
        warnings.endquote=0;
ali@42
   770
        printf("   --> %ld lines in this file contain unpunctuated endquotes. "
ali@42
   771
	  "Not reporting them.\n",results->endquote_count);
ali@42
   772
    }
ali@42
   773
    /*
ali@42
   774
     * If more than 15 lines contain standalone digits,
ali@42
   775
     * don't bother reporting them.
ali@42
   776
     */
ali@42
   777
    warnings.digit=1;
ali@42
   778
    if (results->standalone_digit>10)
ali@42
   779
    {
ali@42
   780
        warnings.digit=0;
ali@42
   781
        printf("   --> %ld lines in this file contain standalone 0s and 1s. "
ali@42
   782
	  "Not reporting them.\n",results->standalone_digit);
ali@42
   783
    }
ali@42
   784
    /*
ali@42
   785
     * If more than 20 lines contain hyphens at end,
ali@42
   786
     * don't bother reporting them.
ali@42
   787
     */
ali@42
   788
    warnings.hyphen=1;
ali@42
   789
    if (results->hyphens>20)
ali@42
   790
    {
ali@42
   791
        warnings.hyphen=0;
ali@42
   792
        printf("   --> %ld lines in this file have hyphens at end. "
ali@42
   793
	  "Not reporting them.\n",results->hyphens);
ali@42
   794
    }
ali@42
   795
    if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42
   796
    {
ali@42
   797
        printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@42
   798
        pswit[MARKUP_SWITCH]=1;
ali@42
   799
    }
ali@42
   800
    if (results->verylongline>0)
ali@42
   801
        printf("   --> %ld lines in this file are VERY long!\n",
ali@42
   802
	  results->verylongline);
ali@42
   803
    /*
ali@42
   804
     * If there are more non-PG spaced dashes than PG em-dashes,
ali@42
   805
     * assume it's deliberate.
ali@42
   806
     * Current PG guidelines say don't use them, but older texts do,
ali@42
   807
     * and some people insist on them whatever the guidelines say.
ali@42
   808
     */
ali@42
   809
    warnings.dash=1;
ali@42
   810
    if (results->spacedash+results->non_PG_space_emdash>
ali@42
   811
      results->PG_space_emdash)
ali@42
   812
    {
ali@42
   813
        warnings.dash=0;
ali@42
   814
        printf("   --> There are %ld spaced dashes and em-dashes. "
ali@42
   815
	  "Not reporting them.\n",
ali@42
   816
	  results->spacedash+results->non_PG_space_emdash);
ali@42
   817
    }
ali@42
   818
    /* If more than a quarter of characters are hi-bit, bug out. */
ali@42
   819
    warnings.bin=1;
ali@42
   820
    if (results->binlen*4>results->totlen)
ali@42
   821
    {
ali@42
   822
        printf("   --> This file does not appear to be ASCII. "
ali@42
   823
	  "Terminating. Best of luck with it!\n");
ali@42
   824
        exit(1);
ali@42
   825
    }
ali@42
   826
    if (results->alphalen*4<results->totlen)
ali@42
   827
    {
ali@42
   828
        printf("   --> This file does not appear to be text. "
ali@42
   829
	  "Terminating. Best of luck with it!\n");
ali@42
   830
        exit(1);
ali@42
   831
    }
ali@42
   832
    if (results->binlen*100>results->totlen || results->binlen>100)
ali@42
   833
    {
ali@42
   834
        printf("   --> There are a lot of foreign letters here. "
ali@42
   835
	  "Not reporting them.\n");
ali@42
   836
        warnings.bin=0;
ali@42
   837
    }
ali@42
   838
    warnings.isDutch=0;
ali@42
   839
    if (results->Dutchcount>50)
ali@42
   840
    {
ali@42
   841
        warnings.isDutch=1;
ali@42
   842
        printf("   --> This looks like Dutch - "
ali@42
   843
	  "switching off dashes and warnings for 's Middags case.\n");
ali@42
   844
    }
ali@42
   845
    warnings.isFrench=0;
ali@42
   846
    if (results->Frenchcount>50)
ali@42
   847
    {
ali@42
   848
        warnings.isFrench=1;
ali@42
   849
        printf("   --> This looks like French - "
ali@42
   850
	  "switching off some doublepunct.\n");
ali@42
   851
    }
ali@42
   852
    if (results->firstline && results->footerline)
ali@42
   853
        printf("    The PG header and footer appear to be already on.\n");
ali@42
   854
    else
ali@42
   855
    {
ali@42
   856
        if (results->firstline)
ali@42
   857
            printf("    The PG header is on - no footer.\n");
ali@42
   858
        if (results->footerline)
ali@42
   859
            printf("    The PG footer is on - no header.\n");
ali@42
   860
    }
ali@42
   861
    printf("\n");
ali@42
   862
    if (pswit[VERBOSE_SWITCH])
ali@42
   863
    {
ali@42
   864
        warnings.bin=1;
ali@42
   865
        warnings.shortline=1;
ali@42
   866
        warnings.dotcomma=1;
ali@42
   867
        warnings.longline=1;
ali@42
   868
        warnings.dash=1;
ali@42
   869
        warnings.digit=1;
ali@42
   870
        warnings.ast=1;
ali@42
   871
        warnings.fslash=1;
ali@42
   872
        warnings.hyphen=1;
ali@42
   873
        warnings.endquote=1;
ali@42
   874
        printf("   *** Verbose output is ON -- you asked for it! ***\n");
ali@42
   875
    }
ali@42
   876
    if (warnings.isDutch)
ali@42
   877
        warnings.dash=0;
ali@42
   878
    if (results->footerline>0 && results->firstline>0 &&
ali@42
   879
      results->footerline>results->firstline &&
ali@42
   880
      results->footerline-results->firstline<100)
ali@42
   881
    {
ali@42
   882
        printf("   --> I don't really know where this text starts. \n");
ali@42
   883
        printf("       There are no reference points.\n");
ali@42
   884
        printf("       I'm going to have to report the header and footer "
ali@42
   885
	  "as well.\n");
ali@42
   886
        results->firstline=0;
ali@42
   887
    }
ali@42
   888
    return &warnings;
ali@42
   889
}
ali@42
   890
ali@43
   891
/*
 * Running totals maintained by analyse_quotes().
 */
struct counters {
    long quot;                      /* double quotes seen */
    signed int c_unders;            /* underscores seen */
    signed int c_brack;             /* CHAR_OPEN_CBRACK minus CHAR_CLOSE_CBRACK */
    signed int s_brack;             /* CHAR_OPEN_SBRACK minus CHAR_CLOSE_SBRACK */
    signed int r_brack;             /* CHAR_OPEN_RBRACK minus CHAR_CLOSE_RBRACK */
    signed int open_single_quote;   /* single quotes judged to be opening */
    signed int close_single_quote;  /* single quotes judged to be closing */
};
ali@43
   896
ali@43
   897
/*
ali@43
   898
 * analyse_quotes:
ali@43
   899
 *
ali@43
   900
 * Look along the line, accumulate the count of quotes, and see
ali@43
   901
 * if this is an empty line - i.e. a line with nothing on it
ali@43
   902
 * but spaces.
ali@43
   903
 * If line has just spaces, period, * and/or - on it, don't
ali@43
   904
 * count it, since empty lines with asterisks or dashes to
ali@43
   905
 * separate sections are common.
ali@43
   906
 *
ali@43
   907
 * Returns: Non-zero if the line is empty.
ali@43
   908
 */
ali@43
   909
int analyse_quotes(const char *s,struct counters *counters)
ali@43
   910
{
ali@43
   911
    signed int guessquote=0;
ali@43
   912
    int isemptyline=1;    /* assume the line is empty until proven otherwise */
ali@43
   913
    while (*s)
ali@43
   914
    {
ali@43
   915
	if (*s==CHAR_DQUOTE)
ali@43
   916
	    counters->quot++;
ali@43
   917
	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
ali@43
   918
	{
ali@43
   919
	    if (s==aline)
ali@43
   920
	    {
ali@43
   921
		/*
ali@43
   922
		 * At start of line, it can only be an openquote.
ali@43
   923
		 * Hardcode a very common exception!
ali@43
   924
		 */
ali@43
   925
		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
ali@43
   926
		    counters->open_single_quote++;
ali@43
   927
	    }
ali@43
   928
	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
ali@43
   929
		/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43
   930
		;
ali@43
   931
	    /* it's outside a word - let's check it out */
ali@43
   932
	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
ali@43
   933
	    {
ali@43
   934
		/* it damwell better BE an openquote */
ali@43
   935
		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
ali@43
   936
		    /* hardcode a very common exception! */
ali@43
   937
		    counters->open_single_quote++;
ali@43
   938
	    }
ali@43
   939
	    else
ali@43
   940
	    {
ali@43
   941
		/* now - is it a closequote? */
ali@43
   942
		guessquote=0;   /* accumulate clues */
ali@43
   943
		if (gcisalpha(s[-1]))
ali@43
   944
		{
ali@43
   945
		    /* it follows a letter - could be either */
ali@43
   946
		    guessquote++;
ali@43
   947
		    if (s[-1]=='s')
ali@43
   948
		    {
ali@43
   949
			/* looks like a plural apostrophe */
ali@43
   950
			guessquote-=3;
ali@43
   951
			if (s[1]==CHAR_SPACE)  /* bonus marks! */
ali@43
   952
			    guessquote-=2;
ali@43
   953
		    }
ali@43
   954
		}
ali@43
   955
		/* it doesn't have a letter either side */
ali@43
   956
		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
ali@43
   957
		    guessquote+=8; /* looks like a closequote */
ali@43
   958
		else
ali@43
   959
		    guessquote++;
ali@43
   960
		if (counters->open_single_quote>counters->close_single_quote)
ali@43
   961
		    /*
ali@43
   962
		     * Give it the benefit of some doubt,
ali@43
   963
		     * if a squote is already open.
ali@43
   964
		     */
ali@43
   965
		    guessquote++;
ali@43
   966
		else
ali@43
   967
		    guessquote--;
ali@43
   968
		if (guessquote>=0)
ali@43
   969
		    counters->close_single_quote++;
ali@43
   970
	    }
ali@43
   971
	}
ali@43
   972
	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
ali@43
   973
	  *s!=13 && *s!=10)
ali@43
   974
	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */
ali@43
   975
	if (*s==CHAR_UNDERSCORE)
ali@43
   976
	    counters->c_unders++;
ali@43
   977
	if (*s==CHAR_OPEN_CBRACK)
ali@43
   978
	    counters->c_brack++;
ali@43
   979
	if (*s==CHAR_CLOSE_CBRACK)
ali@43
   980
	    counters->c_brack--;
ali@43
   981
	if (*s==CHAR_OPEN_RBRACK)
ali@43
   982
	    counters->r_brack++;
ali@43
   983
	if (*s==CHAR_CLOSE_RBRACK)
ali@43
   984
	    counters->r_brack--;
ali@43
   985
	if (*s==CHAR_OPEN_SBRACK)
ali@43
   986
	    counters->s_brack++;
ali@43
   987
	if (*s==CHAR_CLOSE_SBRACK)
ali@43
   988
	    counters->s_brack--;
ali@43
   989
	s++;
ali@43
   990
    }
ali@43
   991
    return isemptyline;
ali@43
   992
}
ali@43
   993
ali@41
   994
/*
ali@44
   995
 * check_for_odd_characters:
ali@44
   996
 *
ali@44
   997
 * Check for binary and other odd characters.
ali@44
   998
 */
ali@44
   999
void check_for_odd_characters(const char *aline,const struct warnings *warnings,
ali@44
  1000
  int isemptyline)
ali@44
  1001
{
ali@44
  1002
    /* Don't repeat multiple warnings on one line. */
ali@44
  1003
    signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
ali@44
  1004
    const char *s;
ali@44
  1005
    unsigned char c;
ali@44
  1006
    for (s=aline;*s;s++)
ali@44
  1007
    {
ali@44
  1008
	c=*(unsigned char *)s;
ali@44
  1009
	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
ali@44
  1010
	{
ali@44
  1011
	    if (pswit[ECHO_SWITCH])
ali@44
  1012
		printf("\n%s\n",aline);
ali@44
  1013
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1014
		if (c>127 && c<160)
ali@44
  1015
		    printf("    Line %ld column %d - "
ali@44
  1016
		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
ali@44
  1017
		else
ali@44
  1018
		    printf("    Line %ld column %d - Non-ASCII character %d\n",
ali@44
  1019
		      linecnt,(int)(s-aline)+1,c);
ali@44
  1020
	    else
ali@44
  1021
		cnt_bin++;
ali@44
  1022
	    eNon_A=1;
ali@44
  1023
	}
ali@44
  1024
	if (!eTab && *s==CHAR_TAB)
ali@44
  1025
	{
ali@44
  1026
	    if (pswit[ECHO_SWITCH])
ali@44
  1027
		printf("\n%s\n",aline);
ali@44
  1028
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1029
		printf("    Line %ld column %d - Tab character?\n",
ali@44
  1030
		  linecnt,(int)(s-aline)+1);
ali@44
  1031
	    else
ali@44
  1032
		cnt_odd++;
ali@44
  1033
	    eTab=1;
ali@44
  1034
	}
ali@44
  1035
	if (!eTilde && *s==CHAR_TILDE)
ali@44
  1036
	{
ali@44
  1037
	    /*
ali@44
  1038
	     * Often used by OCR software to indicate an
ali@44
  1039
	     * unrecognizable character.
ali@44
  1040
	     */
ali@44
  1041
	    if (pswit[ECHO_SWITCH])
ali@44
  1042
		printf("\n%s\n",aline);
ali@44
  1043
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1044
		printf("    Line %ld column %d - Tilde character?\n",
ali@44
  1045
		  linecnt,(int)(s-aline)+1);
ali@44
  1046
	    else
ali@44
  1047
		cnt_odd++;
ali@44
  1048
	    eTilde=1;
ali@44
  1049
	}
ali@44
  1050
	if (!eCarat && *s==CHAR_CARAT)
ali@44
  1051
	{  
ali@44
  1052
	    if (pswit[ECHO_SWITCH])
ali@44
  1053
		printf("\n%s\n",aline);
ali@44
  1054
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1055
		printf("    Line %ld column %d - Carat character?\n",
ali@44
  1056
		  linecnt,(int)(s-aline)+1);
ali@44
  1057
	    else
ali@44
  1058
		cnt_odd++;
ali@44
  1059
	    eCarat=1;
ali@44
  1060
	}
ali@44
  1061
	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
ali@44
  1062
	{  
ali@44
  1063
	    if (pswit[ECHO_SWITCH])
ali@44
  1064
		printf("\n%s\n",aline);
ali@44
  1065
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1066
		printf("    Line %ld column %d - Forward slash?\n",
ali@44
  1067
		  linecnt,(int)(s-aline)+1);
ali@44
  1068
	    else
ali@44
  1069
		cnt_odd++;
ali@44
  1070
	    eFSlash=1;
ali@44
  1071
	}
ali@44
  1072
	/*
ali@44
  1073
	 * Report asterisks only in paranoid mode,
ali@44
  1074
	 * since they're often deliberate.
ali@44
  1075
	 */
ali@44
  1076
	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@44
  1077
	  *s==CHAR_ASTERISK)
ali@44
  1078
	{
ali@44
  1079
	    if (pswit[ECHO_SWITCH])
ali@44
  1080
		printf("\n%s\n",aline);
ali@44
  1081
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1082
		printf("    Line %ld column %d - Asterisk?\n",
ali@44
  1083
		  linecnt,(int)(s-aline)+1);
ali@44
  1084
	    else
ali@44
  1085
		cnt_odd++;
ali@44
  1086
	    eAst=1;
ali@44
  1087
	}
ali@44
  1088
    }
ali@44
  1089
}
ali@44
  1090
ali@44
  1091
/*
ali@45
  1092
 * check_for_long_line:
ali@45
  1093
 *
ali@45
  1094
 * Check for line too long.
ali@45
  1095
 */
ali@45
  1096
void check_for_long_line(const char *aline)
ali@45
  1097
{
ali@45
  1098
    if (strlen(aline)>LONGEST_PG_LINE)
ali@45
  1099
    {
ali@45
  1100
	if (pswit[ECHO_SWITCH])
ali@45
  1101
	    printf("\n%s\n",aline);
ali@45
  1102
	if (!pswit[OVERVIEW_SWITCH])
ali@45
  1103
	    printf("    Line %ld column %d - Long line %d\n",
ali@45
  1104
	      linecnt,strlen(aline),strlen(aline));
ali@45
  1105
	else
ali@45
  1106
	    cnt_long++;
ali@45
  1107
    }
ali@45
  1108
}
ali@45
  1109
ali@45
  1110
/*
 * What check_for_short_line() needs to remember about earlier lines.
 */
struct line_properties {
    unsigned int len;       /* length of the last line examined */
    unsigned int blen;      /* length of the last but one */
    char start;             /* first character of the last line */
};
ali@45
  1114
ali@45
  1115
/*
ali@45
  1116
 * check_for_short_line:
ali@45
  1117
 *
ali@45
  1118
 * Check for line too short.
ali@45
  1119
 *
ali@45
  1120
 * This one is a bit trickier to implement: we don't want to
ali@45
  1121
 * flag the last line of a paragraph for being short, so we
ali@45
  1122
 * have to wait until we know that our current line is a
ali@45
  1123
 * "normal" line, then report the _previous_ line if it was too
ali@45
  1124
 * short. We also don't want to report indented lines like
ali@45
  1125
 * chapter heads or formatted quotations. We therefore keep
ali@45
  1126
 * last->len as the length of the last line examined, and
ali@45
  1127
 * last->blen as the length of the last but one, and try to
ali@45
  1128
 * suppress unnecessary warnings by checking that both were of
ali@45
  1129
 * "normal" length. We keep the first character of the last
ali@45
  1130
 * line in last->start, and if it was a space, we assume that
ali@45
  1131
 * the formatting is deliberate. I can't figure out a way to
ali@45
  1132
 * distinguish something like a quoted verse left-aligned or
ali@45
  1133
 * the header or footer of a letter from a paragraph of short
ali@45
  1134
 * lines - maybe if I examined the whole paragraph, and if the
ali@45
  1135
 * para has less than, say, 8 lines and if all lines are short,
ali@45
  1136
 * then just assume it's OK? Need to look at some texts to see
ali@45
  1137
 * how often a formula like this would get the right result.
ali@45
  1138
 */
ali@45
  1139
void check_for_short_line(const char *aline,const struct line_properties *last)
ali@45
  1140
{
ali@45
  1141
    if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
ali@45
  1142
      last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45
  1143
    {
ali@45
  1144
	if (pswit[ECHO_SWITCH])
ali@45
  1145
	    printf("\n%s\n",prevline);
ali@45
  1146
	if (!pswit[OVERVIEW_SWITCH])
ali@45
  1147
	    printf("    Line %ld column %d - Short line %d?\n",
ali@45
  1148
	      linecnt-1,strlen(prevline),strlen(prevline));
ali@45
  1149
	else
ali@45
  1150
	    cnt_short++;
ali@45
  1151
    }
ali@45
  1152
}
ali@45
  1153
ali@45
  1154
/*
ali@46
  1155
 * check_for_starting_punctuation:
ali@46
  1156
 *
ali@46
  1157
 * Look for punctuation other than full ellipses at start of line.
ali@46
  1158
 */
ali@46
  1159
void check_for_starting_punctuation(const char *aline)
ali@46
  1160
{
ali@46
  1161
    if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
ali@46
  1162
    {
ali@46
  1163
	if (pswit[ECHO_SWITCH])
ali@46
  1164
	    printf("\n%s\n",aline);
ali@46
  1165
	if (!pswit[OVERVIEW_SWITCH])
ali@46
  1166
	    printf("    Line %ld column 1 - Begins with punctuation?\n",
ali@46
  1167
	      linecnt);
ali@46
  1168
	else
ali@46
  1169
	    cnt_punct++;
ali@46
  1170
    }
ali@46
  1171
}
ali@46
  1172
ali@46
  1173
/*
ali@47
  1174
 * check_for_spaced_emdash:
ali@47
  1175
 *
ali@47
  1176
 * Check for spaced em-dashes.
ali@47
  1177
 *
ali@47
  1178
 * We must check _all_ occurrences of "--" on the line
ali@47
  1179
 * hence the loop - even if the first double-dash is OK
ali@47
  1180
 * there may be another that's wrong later on.
ali@47
  1181
 */
ali@47
  1182
void check_for_spaced_emdash(const char *aline)
ali@47
  1183
{
ali@47
  1184
    const char *s,*t;
ali@47
  1185
    s=aline;
ali@47
  1186
    while ((t=strstr(s,"--")))
ali@47
  1187
    {
ali@47
  1188
	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
ali@47
  1189
	{
ali@47
  1190
	    if (pswit[ECHO_SWITCH])
ali@47
  1191
		printf("\n%s\n",aline);
ali@47
  1192
	    if (!pswit[OVERVIEW_SWITCH])
ali@47
  1193
		printf("    Line %ld column %d - Spaced em-dash?\n",
ali@47
  1194
		  linecnt,(int)(t-aline)+1);
ali@47
  1195
	    else
ali@47
  1196
		cnt_dash++;
ali@47
  1197
	}
ali@47
  1198
	s=t+2;
ali@47
  1199
    }
ali@47
  1200
}
ali@47
  1201
ali@47
  1202
/*
ali@47
  1203
 * check_for_spaced_dash:
ali@47
  1204
 *
ali@47
  1205
 * Check for spaced dashes.
ali@47
  1206
 */
ali@47
  1207
void check_for_spaced_dash(const char *aline)
ali@47
  1208
{
ali@47
  1209
    const char *s;
ali@47
  1210
    if ((s=strstr(aline," -")))
ali@47
  1211
    {
ali@47
  1212
	if (s[2]!='-')
ali@47
  1213
	{
ali@47
  1214
	    if (pswit[ECHO_SWITCH])
ali@47
  1215
		printf("\n%s\n",aline);
ali@47
  1216
	    if (!pswit[OVERVIEW_SWITCH])
ali@47
  1217
		printf("    Line %ld column %d - Spaced dash?\n",
ali@47
  1218
		  linecnt,(int)(s-aline)+1);
ali@47
  1219
	    else
ali@47
  1220
		cnt_dash++;
ali@47
  1221
	}
ali@47
  1222
    }
ali@47
  1223
    else if ((s=strstr(aline,"- ")))
ali@47
  1224
    {
ali@47
  1225
	if (s==aline || s[-1]!='-')
ali@47
  1226
	{
ali@47
  1227
	    if (pswit[ECHO_SWITCH])
ali@47
  1228
		printf("\n%s\n",aline);
ali@47
  1229
	    if (!pswit[OVERVIEW_SWITCH])
ali@47
  1230
		printf("    Line %ld column %d - Spaced dash?\n",
ali@47
  1231
		  linecnt,(int)(s-aline)+1);
ali@47
  1232
	    else
ali@47
  1233
		cnt_dash++;
ali@47
  1234
	}
ali@47
  1235
    }
ali@47
  1236
}
ali@47
  1237
ali@47
  1238
/*
ali@48
  1239
 * check_for_unmarked_paragraphs:
ali@48
  1240
 *
ali@48
  1241
 * Check for unmarked paragraphs indicated by separate speakers.
ali@48
  1242
 *
ali@48
  1243
 * May well be false positive:
ali@48
  1244
 * "Bravo!" "Wonderful!" called the crowd.
ali@48
  1245
 * but useful all the same.
ali@48
  1246
 */
ali@48
  1247
void check_for_unmarked_paragraphs(const char *aline)
ali@48
  1248
{
ali@48
  1249
    const char *s;
ali@48
  1250
    s=strstr(aline,"\"  \"");
ali@48
  1251
    if (!s)
ali@48
  1252
	s=strstr(aline,"\" \"");
ali@48
  1253
    if (s)
ali@48
  1254
    {
ali@48
  1255
	if (pswit[ECHO_SWITCH])
ali@48
  1256
	    printf("\n%s\n",aline);
ali@48
  1257
	if (!pswit[OVERVIEW_SWITCH])
ali@48
  1258
	    printf("    Line %ld column %d - Query missing paragraph break?\n",
ali@48
  1259
	      linecnt,(int)(s-aline)+1);
ali@48
  1260
	else
ali@48
  1261
	    cnt_punct++;
ali@48
  1262
    }
ali@48
  1263
}
ali@48
  1264
ali@48
  1265
/*
ali@49
  1266
 * check_for_jeebies:
ali@49
  1267
 *
ali@49
  1268
 * Check for "to he" and other easy h/b errors.
ali@49
  1269
 *
ali@49
  1270
 * This is a very inadequate effort on the h/b problem,
ali@49
  1271
 * but the phrase "to he" is always an error, whereas "to
ali@49
  1272
 * be" is quite common.
ali@49
  1273
 * Similarly, '"Quiet!", be said.' is a non-be error
ali@49
  1274
 * "to he" is _not_ always an error!:
ali@49
  1275
 *       "Where they went to he couldn't say."
ali@49
  1276
 * Another false positive:
ali@49
  1277
 *       What would "Cinderella" be without the . . .
ali@49
  1278
 * and another: "If he wants to he can see for himself."
ali@49
  1279
 */
ali@49
  1280
void check_for_jeebies(const char *aline)
ali@49
  1281
{
ali@49
  1282
    const char *s;
ali@49
  1283
    s=strstr(aline," be could ");
ali@49
  1284
    if (!s)
ali@49
  1285
	s=strstr(aline," be would ");
ali@49
  1286
    if (!s)
ali@49
  1287
	s=strstr(aline," was be ");
ali@49
  1288
    if (!s)
ali@49
  1289
	s=strstr(aline," be is ");
ali@49
  1290
    if (!s)
ali@49
  1291
	s=strstr(aline," is be ");
ali@49
  1292
    if (!s)
ali@49
  1293
	s=strstr(aline,"\", be ");
ali@49
  1294
    if (!s)
ali@49
  1295
	s=strstr(aline,"\" be ");
ali@49
  1296
    if (!s)
ali@49
  1297
	s=strstr(aline,"\" be ");
ali@49
  1298
    if (!s)
ali@49
  1299
	s=strstr(aline," to he ");
ali@49
  1300
    if (s)
ali@49
  1301
    {
ali@49
  1302
	if (pswit[ECHO_SWITCH])
ali@49
  1303
	    printf("\n%s\n",aline);
ali@49
  1304
	if (!pswit[OVERVIEW_SWITCH])
ali@49
  1305
	    printf("    Line %ld column %d - Query he/be error?\n",
ali@49
  1306
	      linecnt,(int)(s-aline)+1);
ali@49
  1307
	else
ali@49
  1308
	    cnt_word++;
ali@49
  1309
    }
ali@49
  1310
    s=strstr(aline," the had ");
ali@49
  1311
    if (!s)
ali@49
  1312
	s=strstr(aline," a had ");
ali@49
  1313
    if (!s)
ali@49
  1314
	s=strstr(aline," they bad ");
ali@49
  1315
    if (!s)
ali@49
  1316
	s=strstr(aline," she bad ");
ali@49
  1317
    if (!s)
ali@49
  1318
	s=strstr(aline," he bad ");
ali@49
  1319
    if (!s)
ali@49
  1320
	s=strstr(aline," you bad ");
ali@49
  1321
    if (!s)
ali@49
  1322
	s=strstr(aline," i bad ");
ali@49
  1323
    if (s)
ali@49
  1324
    {
ali@49
  1325
	if (pswit[ECHO_SWITCH])
ali@49
  1326
	    printf("\n%s\n",aline);
ali@49
  1327
	if (!pswit[OVERVIEW_SWITCH])
ali@49
  1328
	    printf("    Line %ld column %d - Query had/bad error?\n",
ali@49
  1329
	      linecnt,(int)(s-aline)+1);
ali@49
  1330
	else
ali@49
  1331
	    cnt_word++;
ali@49
  1332
    }
ali@49
  1333
    s=strstr(aline,"; hut ");
ali@49
  1334
    if (!s)
ali@49
  1335
	s=strstr(aline,", hut ");
ali@49
  1336
    if (s)
ali@49
  1337
    {
ali@49
  1338
	if (pswit[ECHO_SWITCH])
ali@49
  1339
	    printf("\n%s\n",aline);
ali@49
  1340
	if (!pswit[OVERVIEW_SWITCH])
ali@49
  1341
	    printf("    Line %ld column %d - Query hut/but error?\n",
ali@49
  1342
	      linecnt,(int)(s-aline)+1);
ali@49
  1343
	else
ali@49
  1344
	    cnt_word++;
ali@49
  1345
    }
ali@49
  1346
}
ali@49
  1347
ali@49
  1348
/*
ali@50
  1349
 * check_for_mta_from:
ali@50
  1350
 *
ali@50
  1351
 * Special case - angled bracket in front of "From" placed there by an
ali@50
  1352
 * MTA when sending an e-mail.
ali@50
  1353
 */
ali@50
  1354
void check_for_mta_from(const char *aline)
ali@50
  1355
{
ali@50
  1356
    const char *s;
ali@50
  1357
    s=strstr(aline,">From");
ali@50
  1358
    if (s)
ali@50
  1359
    {
ali@50
  1360
	if (pswit[ECHO_SWITCH])
ali@50
  1361
	    printf("\n%s\n",aline);
ali@50
  1362
	if (!pswit[OVERVIEW_SWITCH])
ali@50
  1363
	    printf("    Line %ld column %d - Query angled bracket with From\n",
ali@50
  1364
	      linecnt,(int)(s-aline)+1);
ali@50
  1365
	else
ali@50
  1366
	    cnt_punct++;
ali@50
  1367
    }
ali@50
  1368
}
ali@50
  1369
ali@50
  1370
/*
ali@51
  1371
 * check_for_orphan_character:
ali@51
  1372
 *
ali@51
  1373
 * Check for a single character line -
ali@51
  1374
 * often an overflow from bad wrapping.
ali@51
  1375
 */
ali@51
  1376
void check_for_orphan_character(const char *aline)
ali@51
  1377
{
ali@51
  1378
    if (*aline && !aline[1])
ali@51
  1379
    {
ali@51
  1380
	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
ali@51
  1381
	  gcisdigit(*aline))
ali@51
  1382
	    ; /* Nothing - ignore numerals alone on a line. */
ali@51
  1383
	else
ali@51
  1384
	{
ali@51
  1385
	    if (pswit[ECHO_SWITCH])
ali@51
  1386
		printf("\n%s\n",aline);
ali@51
  1387
	    if (!pswit[OVERVIEW_SWITCH])
ali@51
  1388
		printf("    Line %ld column 1 - Query single character line\n",
ali@51
  1389
		  linecnt);
ali@51
  1390
	    else
ali@51
  1391
		cnt_punct++;
ali@51
  1392
	}
ali@51
  1393
    }
ali@51
  1394
}
ali@51
  1395
ali@51
  1396
/*
ali@52
  1397
 * check_for_pling_scanno:
ali@52
  1398
 *
ali@52
  1399
 * Check for I" - often should be !
ali@52
  1400
 */
ali@52
  1401
void check_for_pling_scanno(const char *aline)
ali@52
  1402
{
ali@52
  1403
    const char *s;
ali@52
  1404
    s=strstr(aline," I\"");
ali@52
  1405
    if (s)
ali@52
  1406
    {
ali@52
  1407
	if (pswit[ECHO_SWITCH])
ali@52
  1408
	    printf("\n%s\n",aline);
ali@52
  1409
	if (!pswit[OVERVIEW_SWITCH])
ali@52
  1410
	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",
ali@52
  1411
	      linecnt,s-aline);
ali@52
  1412
	else
ali@52
  1413
	    cnt_punct++;
ali@52
  1414
    }
ali@52
  1415
}
ali@52
  1416
ali@52
  1417
/*
ali@53
  1418
 * check_for_extra_period:
ali@53
  1419
 *
ali@53
  1420
 * Check for period without a capital letter. Cut-down from gutspell.
ali@53
  1421
 * Only works when it happens on a single line.
ali@53
  1422
 */
ali@53
  1423
void check_for_extra_period(const char *aline,const struct warnings *warnings)
ali@53
  1424
{
ali@53
  1425
    const char *s,*t,*s1;
ali@53
  1426
    signed int i,istypo,isdup;
ali@53
  1427
    static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
ali@53
  1428
    static int qperiod_index=0;
ali@53
  1429
    char testword[MAXWORDLEN]="";
ali@53
  1430
    if (pswit[PARANOID_SWITCH])
ali@53
  1431
    {
ali@53
  1432
	for (t=s=aline;strstr(t,". ");)
ali@53
  1433
	{
ali@53
  1434
	    t=strstr(t,". ");
ali@53
  1435
	    if (t==s)
ali@53
  1436
	    {
ali@53
  1437
		t++;
ali@53
  1438
		/* start of line punctuation is handled elsewhere */
ali@53
  1439
		continue;
ali@53
  1440
	    }
ali@53
  1441
	    if (!gcisalpha(t[-1]))
ali@53
  1442
	    {
ali@53
  1443
		t++;
ali@53
  1444
		continue;
ali@53
  1445
	    }
ali@53
  1446
	    if (warnings->isDutch)
ali@53
  1447
	    {
ali@53
  1448
		/* For Frank & Jeroen -- 's Middags case */
ali@53
  1449
		if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
ali@53
  1450
		  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
ali@53
  1451
		{
ali@53
  1452
		    t++;
ali@53
  1453
		    continue;
ali@53
  1454
		}
ali@53
  1455
	    }
ali@53
  1456
	    s1=t+2;
ali@53
  1457
	    while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
ali@53
  1458
		s1++;
ali@53
  1459
	    if (*s1>='a' && *s1<='z')
ali@53
  1460
	    {
ali@53
  1461
		/* we have something to investigate */
ali@53
  1462
		istypo=1;
ali@53
  1463
		/* so let's go back and find out */
ali@53
  1464
		for (s1=t-1;s1>=s &&
ali@53
  1465
		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
ali@53
  1466
		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
ali@53
  1467
		    ;
ali@53
  1468
		s1++;
ali@53
  1469
		for (i=0;*s1 && *s1!='.';s1++,i++)
ali@53
  1470
		    testword[i]=*s1;
ali@53
  1471
		testword[i]=0;
ali@53
  1472
		for (i=0;*abbrev[i];i++)
ali@53
  1473
		    if (!strcmp(testword,abbrev[i]))
ali@53
  1474
			istypo=0;
ali@53
  1475
		if (gcisdigit(*testword))
ali@53
  1476
		    istypo=0;
ali@53
  1477
		if (!testword[1])
ali@53
  1478
		    istypo=0;
ali@53
  1479
		if (isroman(testword))
ali@53
  1480
		    istypo=0;
ali@53
  1481
		if (istypo)
ali@53
  1482
		{
ali@53
  1483
		    istypo=0;
ali@53
  1484
		    for (i=0;testword[i];i++)
ali@53
  1485
			if (strchr(vowels,testword[i]))
ali@53
  1486
			    istypo=1;
ali@53
  1487
		}
ali@53
  1488
		if (istypo)
ali@53
  1489
		{
ali@53
  1490
		    isdup=0;
ali@53
  1491
		    if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@53
  1492
		      !pswit[VERBOSE_SWITCH])
ali@53
  1493
			for (i=0;i<qperiod_index;i++)
ali@53
  1494
			    if (!strcmp(testword,qperiod[i]))
ali@53
  1495
				isdup=1;
ali@53
  1496
		    if (!isdup)
ali@53
  1497
		    {
ali@53
  1498
			if (qperiod_index<MAX_QWORD &&
ali@53
  1499
			  strlen(testword)<MAX_QWORD_LENGTH)
ali@53
  1500
			{
ali@53
  1501
			    strcpy(qperiod[qperiod_index],testword);
ali@53
  1502
			    qperiod_index++;
ali@53
  1503
			}
ali@53
  1504
			if (pswit[ECHO_SWITCH])
ali@53
  1505
			    printf("\n%s\n",aline);
ali@53
  1506
			if (!pswit[OVERVIEW_SWITCH])
ali@53
  1507
			    printf("    Line %ld column %d - Extra period?\n",
ali@53
  1508
			      linecnt,(int)(t-aline)+1);
ali@53
  1509
			else
ali@53
  1510
			    cnt_punct++;
ali@53
  1511
		    }
ali@53
  1512
		}
ali@53
  1513
	    }
ali@53
  1514
	    t++;
ali@53
  1515
	}
ali@53
  1516
    }
ali@53
  1517
}
ali@53
  1518
ali@53
  1519
/*
ali@54
  1520
 * check_for_following_punctuation:
ali@54
  1521
 *
ali@54
  1522
 * Check for words usually not followed by punctuation.
ali@54
  1523
 */
ali@54
  1524
void check_for_following_punctuation(const char *aline)
ali@54
  1525
{
ali@54
  1526
    int i;
ali@54
  1527
    const char *s,*wordstart;
ali@54
  1528
    char inword[MAXWORDLEN];
ali@54
  1529
    if (pswit[TYPO_SWITCH])
ali@54
  1530
    {
ali@54
  1531
	for (s=aline;*s;)
ali@54
  1532
	{
ali@54
  1533
	    wordstart=s;
ali@54
  1534
	    s=getaword(s,inword);
ali@54
  1535
	    if (!*inword)
ali@54
  1536
		continue;
ali@54
  1537
	    lowerit(inword);
ali@54
  1538
	    for (i=0;*nocomma[i];i++)
ali@54
  1539
		if (!strcmp(inword,nocomma[i]))
ali@54
  1540
		{
ali@54
  1541
		    if (*s==',' || *s==';' || *s==':')
ali@54
  1542
		    {
ali@54
  1543
			if (pswit[ECHO_SWITCH])
ali@54
  1544
			    printf("\n%s\n",aline);
ali@54
  1545
			if (!pswit[OVERVIEW_SWITCH])
ali@54
  1546
			    printf("    Line %ld column %d - "
ali@54
  1547
			      "Query punctuation after %s?\n",
ali@54
  1548
			      linecnt,(int)(s-aline)+1,inword);
ali@54
  1549
			else
ali@54
  1550
			    cnt_punct++;
ali@54
  1551
		    }
ali@54
  1552
		}
ali@54
  1553
	    for (i=0;*noperiod[i];i++)
ali@54
  1554
		if (!strcmp(inword,noperiod[i]))
ali@54
  1555
		{
ali@54
  1556
		    if (*s=='.' || *s=='!')
ali@54
  1557
		    {
ali@54
  1558
			if (pswit[ECHO_SWITCH])
ali@54
  1559
			    printf("\n%s\n",aline);
ali@54
  1560
			if (!pswit[OVERVIEW_SWITCH])
ali@54
  1561
			    printf("    Line %ld column %d - "
ali@54
  1562
			      "Query punctuation after %s?\n",
ali@54
  1563
			      linecnt,(int)(s-aline)+1,inword);
ali@54
  1564
			else
ali@54
  1565
			    cnt_punct++;
ali@54
  1566
		    }
ali@54
  1567
		}
ali@54
  1568
	}
ali@54
  1569
    }
ali@54
  1570
}
ali@54
  1571
ali@54
  1572
/*
ali@55
  1573
 * check_for_typos:
ali@55
  1574
 *
ali@55
  1575
 * Check for commonly mistyped words,
ali@55
  1576
 * and digits like 0 for O in a word.
ali@55
  1577
 */
ali@55
  1578
void check_for_typos(const char *aline,struct warnings *warnings)
ali@55
  1579
{
ali@55
  1580
    const char *s,*wordstart;
ali@55
  1581
    char inword[MAXWORDLEN],testword[MAXWORDLEN];
ali@55
  1582
    int i,istypo,isdup,alower,vowel,consonant;
ali@55
  1583
    static int qword_index=0;
ali@55
  1584
    for (s=aline;*s;)
ali@55
  1585
    {
ali@55
  1586
	wordstart=s;
ali@55
  1587
	s=getaword(s,inword);
ali@55
  1588
	if (!*inword)
ali@55
  1589
	    continue; /* don't bother with empty lines */
ali@55
  1590
	if (mixdigit(inword))
ali@55
  1591
	{
ali@55
  1592
	    if (pswit[ECHO_SWITCH])
ali@55
  1593
		printf("\n%s\n",aline);
ali@55
  1594
	    if (!pswit[OVERVIEW_SWITCH])
ali@55
  1595
		printf("    Line %ld column %d - Query digit in %s\n",
ali@55
  1596
		  linecnt,(int)(wordstart-aline)+1,inword);
ali@55
  1597
	    else
ali@55
  1598
		cnt_word++;
ali@55
  1599
	}
ali@55
  1600
	/*
ali@55
  1601
	 * Put the word through a series of tests for likely typos and OCR
ali@55
  1602
	 * errors.
ali@55
  1603
	 */
ali@55
  1604
	if (pswit[TYPO_SWITCH])
ali@55
  1605
	{
ali@55
  1606
	    istypo=0;
ali@55
  1607
	    strcpy(testword,inword);
ali@55
  1608
	    alower=0;
ali@55
  1609
	    for (i=0;i<(signed int)strlen(testword);i++)
ali@55
  1610
	    {
ali@55
  1611
		/* lowercase for testing */
ali@55
  1612
		if (testword[i]>='a' && testword[i]<='z')
ali@55
  1613
		    alower=1;
ali@55
  1614
		if (alower && testword[i]>='A' && testword[i]<='Z')
ali@55
  1615
		{
ali@55
  1616
		    /*
ali@55
  1617
		     * We have an uppercase mid-word. However, there are
ali@55
  1618
		     * common cases:
ali@55
  1619
		     *   Mac and Mc like McGill
ali@55
  1620
		     *   French contractions like l'Abbe
ali@55
  1621
		     */
ali@55
  1622
		    if (i==2 && testword[0]=='m' && testword[1]=='c' ||
ali@55
  1623
		      i==3 && testword[0]=='m' && testword[1]=='a' &&
ali@55
  1624
		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
ali@55
  1625
			; /* do nothing! */
ali@55
  1626
		    else
ali@55
  1627
			istypo=1;
ali@55
  1628
		}
ali@55
  1629
		testword[i]=(char)tolower(testword[i]);
ali@55
  1630
	    }
ali@55
  1631
	    /*
ali@55
  1632
	     * Check for certain unlikely two-letter combinations at word
ali@55
  1633
	     * start and end.
ali@55
  1634
	     */
ali@55
  1635
	    if (strlen(testword)>1)
ali@55
  1636
	    {
ali@55
  1637
		for (i=0;*nostart[i];i++)
ali@55
  1638
		    if (!strncmp(testword,nostart[i],2))
ali@55
  1639
			istypo=1;
ali@55
  1640
		for (i=0;*noend[i];i++)
ali@55
  1641
		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))
ali@55
  1642
			istypo=1;
ali@55
  1643
	    }
ali@55
  1644
	    /* ght is common, gbt never. Like that. */
ali@55
  1645
	    if (strstr(testword,"cb"))
ali@55
  1646
		istypo=1;
ali@55
  1647
	    if (strstr(testword,"gbt"))
ali@55
  1648
		istypo=1;
ali@55
  1649
	    if (strstr(testword,"pbt"))
ali@55
  1650
		istypo=1;
ali@55
  1651
	    if (strstr(testword,"tbs"))
ali@55
  1652
		istypo=1;
ali@55
  1653
	    if (strstr(testword,"mrn"))
ali@55
  1654
		istypo=1;
ali@55
  1655
	    if (strstr(testword,"ahle"))
ali@55
  1656
		istypo=1;
ali@55
  1657
	    if (strstr(testword,"ihle"))
ali@55
  1658
		istypo=1;
ali@55
  1659
	    /*
ali@55
  1660
	     * "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55
  1661
	     * Also "TBI" - frostbite, outbid - but uncommon.
ali@55
  1662
	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55
  1663
	     * numerals, but "ii" is a common scanno.
ali@55
  1664
	     */
ali@55
  1665
	    if (strstr(testword,"tbi"))
ali@55
  1666
		istypo=1;
ali@55
  1667
	    if (strstr(testword,"tbe"))
ali@55
  1668
		istypo=1;
ali@55
  1669
	    if (strstr(testword,"ii"))
ali@55
  1670
		istypo=1;
ali@55
  1671
	    /*
ali@55
  1672
	     * Check for no vowels or no consonants.
ali@55
  1673
	     * If none, flag a typo.
ali@55
  1674
	     */
ali@55
  1675
	    if (!istypo && strlen(testword)>1)
ali@55
  1676
	    {
ali@55
  1677
		vowel=consonant=0;
ali@55
  1678
		for (i=0;testword[i];i++)
ali@55
  1679
		{
ali@55
  1680
		    if (testword[i]=='y' || gcisdigit(testword[i]))
ali@55
  1681
		    {
ali@55
  1682
			/* Yah, this is loose. */
ali@55
  1683
			vowel++;
ali@55
  1684
			consonant++;
ali@55
  1685
		    }
ali@55
  1686
		    else if (strchr(vowels,testword[i]))
ali@55
  1687
			vowel++;
ali@55
  1688
		    else
ali@55
  1689
			consonant++;
ali@55
  1690
		}
ali@55
  1691
		if (!vowel || !consonant)
ali@55
  1692
		    istypo=1;
ali@55
  1693
	    }
ali@55
  1694
	    /*
ali@55
  1695
	     * Now exclude the word from being reported if it's in
ali@55
  1696
	     * the okword list.
ali@55
  1697
	     */
ali@55
  1698
	    for (i=0;*okword[i];i++)
ali@55
  1699
		if (!strcmp(testword,okword[i]))
ali@55
  1700
		    istypo=0;
ali@55
  1701
	    /*
ali@55
  1702
	     * What looks like a typo may be a Roman numeral.
ali@55
  1703
	     * Exclude these.
ali@55
  1704
	     */
ali@55
  1705
	    if (istypo && isroman(testword))
ali@55
  1706
		istypo=0;
ali@55
  1707
	    /* Check the manual list of typos. */
ali@55
  1708
	    if (!istypo)
ali@55
  1709
		for (i=0;*typo[i];i++)
ali@55
  1710
		    if (!strcmp(testword,typo[i]))
ali@55
  1711
			istypo=1;
ali@55
  1712
	    /*
ali@55
  1713
	     * Check lowercase s, l, i and m - special cases.
ali@55
  1714
	     *   "j" - often a semi-colon gone wrong.
ali@55
  1715
	     *   "d" for a missing apostrophe - he d
ali@55
  1716
	     *   "n" for "in"
ali@55
  1717
	     */
ali@55
  1718
	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
ali@55
  1719
		istypo=1;
ali@55
  1720
	    if (istypo)
ali@55
  1721
	    {
ali@55
  1722
		isdup=0;
ali@55
  1723
		if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@55
  1724
		  !pswit[VERBOSE_SWITCH])
ali@55
  1725
		    for (i=0;i<qword_index;i++)
ali@55
  1726
			if (!strcmp(testword,qword[i]))
ali@55
  1727
			{
ali@55
  1728
			    isdup=1;
ali@55
  1729
			    ++dupcnt[i];
ali@55
  1730
			}
ali@55
  1731
		if (!isdup)
ali@55
  1732
		{
ali@55
  1733
		    if (qword_index<MAX_QWORD &&
ali@55
  1734
		      strlen(testword)<MAX_QWORD_LENGTH)
ali@55
  1735
		    {
ali@55
  1736
			strcpy(qword[qword_index],testword);
ali@55
  1737
			qword_index++;
ali@55
  1738
		    }
ali@55
  1739
		    if (pswit[ECHO_SWITCH])
ali@55
  1740
			printf("\n%s\n",aline);
ali@55
  1741
		    if (!pswit[OVERVIEW_SWITCH])
ali@55
  1742
		    {
ali@55
  1743
			printf("    Line %ld column %d - Query word %s",
ali@55
  1744
			  linecnt,(int)(wordstart-aline)+1,inword);
ali@55
  1745
			if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@55
  1746
			  !pswit[VERBOSE_SWITCH])
ali@55
  1747
			    printf(" - not reporting duplicates");
ali@55
  1748
			printf("\n");
ali@55
  1749
		    }
ali@55
  1750
		    else
ali@55
  1751
			cnt_word++;
ali@55
  1752
		}
ali@55
  1753
	    }
ali@55
  1754
	}
ali@55
  1755
	/* check the user's list of typos */
ali@55
  1756
	if (!istypo && usertypo_count)
ali@55
  1757
	    for (i=0;i<usertypo_count;i++)
ali@55
  1758
		if (!strcmp(testword,usertypo[i]))
ali@55
  1759
		{
ali@55
  1760
		    if (pswit[ECHO_SWITCH])
ali@55
  1761
			printf("\n%s\n",aline);
ali@55
  1762
		    if (!pswit[OVERVIEW_SWITCH])  
ali@55
  1763
			printf("    Line %ld column %d - "
ali@55
  1764
			  "Query possible scanno %s\n",
ali@55
  1765
			  linecnt,(int)(wordstart-aline)+2,inword);
ali@55
  1766
		}
ali@55
  1767
	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55
  1768
	{
ali@55
  1769
	    /* In paranoid mode, query all 0 and 1 standing alone. */
ali@55
  1770
	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
ali@55
  1771
	    {
ali@55
  1772
		if (pswit[ECHO_SWITCH])
ali@55
  1773
		    printf("\n%s\n",aline);
ali@55
  1774
		if (!pswit[OVERVIEW_SWITCH])
ali@55
  1775
		    printf("    Line %ld column %d - Query standalone %s\n",
ali@55
  1776
		      linecnt,(int)(wordstart-aline)+2,inword);
ali@55
  1777
		else
ali@55
  1778
		    cnt_word++;
ali@55
  1779
	    }
ali@55
  1780
	}
ali@55
  1781
    }
ali@55
  1782
}
ali@55
  1783
ali@56
  1784
/*
 * Running quote parity, carried from one line to the next by
 * check_for_misspaced_punctuation(). Each member is flipped between
 * zero and non-zero every time a quote of that kind is counted, so a
 * non-zero value means an odd number has been seen so far.
 */
struct parities {
    int dquote;		/* parity of double quotes */
    int squote;		/* parity of single quotes */
};
ali@56
  1787
ali@56
  1788
/*
ali@56
  1789
 * check_for_misspaced_punctuation:
ali@56
  1790
 *
ali@56
  1791
 * Look for added or missing spaces around punctuation and quotes.
ali@56
  1792
 * If there is a punctuation character like ! with no space on
ali@56
  1793
 * either side, suspect a missing!space. If there are spaces on
ali@56
  1794
 * both sides , assume a typo. If we see a double quote with no
ali@56
  1795
 * space or punctuation on either side of it, assume unspaced
ali@56
  1796
 * quotes "like"this.
ali@56
  1797
 */
ali@56
  1798
void check_for_misspaced_punctuation(const char *aline,
ali@56
  1799
  struct parities *parities,int isemptyline)
ali@56
  1800
{
ali@56
  1801
    int i,llen,isacro,isellipsis;
ali@56
  1802
    const char *s;
ali@56
  1803
    llen=strlen(aline);
ali@56
  1804
    for (i=1;i<llen;i++)
ali@56
  1805
    {
ali@56
  1806
	/* For each character in the line after the first. */
ali@56
  1807
	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
ali@56
  1808
	{
ali@56
  1809
	    /* we need to suppress warnings for acronyms like M.D. */
ali@56
  1810
	    isacro=0;
ali@56
  1811
	    /* we need to suppress warnings for ellipsis . . . */
ali@56
  1812
	    isellipsis=0;
ali@56
  1813
	    /* if there are letters on both sides of it or ... */
ali@56
  1814
	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
ali@56
  1815
	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
ali@56
  1816
	    {
ali@56
  1817
		/* ...if it's strict punctuation followed by an alpha */
ali@56
  1818
		if (aline[i]=='.')
ali@56
  1819
		{
ali@56
  1820
		    if (i>2 && aline[i-2]=='.')
ali@56
  1821
			isacro=1;
ali@56
  1822
		    if (i+2<llen && aline[i+2]=='.')
ali@56
  1823
			isacro=1;
ali@56
  1824
		}
ali@56
  1825
		if (!isacro)
ali@56
  1826
		{
ali@56
  1827
		    if (pswit[ECHO_SWITCH])
ali@56
  1828
			printf("\n%s\n",aline);
ali@56
  1829
		    if (!pswit[OVERVIEW_SWITCH])
ali@56
  1830
			printf("    Line %ld column %d - Missing space?\n",
ali@56
  1831
			  linecnt,i+1);
ali@56
  1832
		    else
ali@56
  1833
			cnt_punct++;
ali@56
  1834
		}
ali@56
  1835
	    }
ali@56
  1836
	    if (aline[i-1]==CHAR_SPACE &&
ali@56
  1837
	      (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
ali@56
  1838
	    {
ali@56
  1839
		/*
ali@56
  1840
		 * If there are spaces on both sides,
ali@56
  1841
		 * or space before and end of line.
ali@56
  1842
		 */
ali@56
  1843
		if (aline[i]=='.')
ali@56
  1844
		{
ali@56
  1845
		    if (i>2 && aline[i-2]=='.')
ali@56
  1846
			isellipsis=1;
ali@56
  1847
		    if (i+2<llen && aline[i+2]=='.')
ali@56
  1848
			isellipsis=1;
ali@56
  1849
		}
ali@56
  1850
		if (!isemptyline && !isellipsis)
ali@56
  1851
		{
ali@56
  1852
		    if (pswit[ECHO_SWITCH])
ali@56
  1853
			printf("\n%s\n",aline);
ali@56
  1854
		    if (!pswit[OVERVIEW_SWITCH])
ali@56
  1855
			printf("    Line %ld column %d - "
ali@56
  1856
			  "Spaced punctuation?\n",linecnt,i+1);
ali@56
  1857
		    else
ali@56
  1858
			cnt_punct++;
ali@56
  1859
		}
ali@56
  1860
	    }
ali@56
  1861
	}
ali@56
  1862
    }
ali@56
  1863
    /* Split out the characters that CANNOT be preceded by space. */
ali@56
  1864
    llen=strlen(aline);
ali@56
  1865
    for (i=1;i<llen;i++)
ali@56
  1866
    {
ali@56
  1867
	/* for each character in the line after the first */
ali@56
  1868
	if (strchr("?!,;:",aline[i]))
ali@56
  1869
	{
ali@56
  1870
	    /* if it's punctuation that _cannot_ have a space before it */
ali@56
  1871
	    if (aline[i-1]==CHAR_SPACE && !isemptyline &&
ali@56
  1872
	      aline[i+1]!=CHAR_SPACE)
ali@56
  1873
	    {
ali@56
  1874
		/*
ali@56
  1875
		 * If aline[i+1) DOES == space,
ali@56
  1876
		 * it was already reported just above.
ali@56
  1877
		 */
ali@56
  1878
		if (pswit[ECHO_SWITCH])
ali@56
  1879
		    printf("\n%s\n",aline);
ali@56
  1880
		if (!pswit[OVERVIEW_SWITCH])
ali@56
  1881
		    printf("    Line %ld column %d - Spaced punctuation?\n",
ali@56
  1882
		      linecnt,i+1);
ali@56
  1883
		else
ali@56
  1884
		    cnt_punct++;
ali@56
  1885
	    }
ali@56
  1886
	}
ali@56
  1887
    }
ali@56
  1888
    /*
ali@56
  1889
     * Special case " .X" where X is any alpha.
ali@56
  1890
     * This plugs a hole in the acronym code above.
ali@56
  1891
     * Inelegant, but maintainable.
ali@56
  1892
     */
ali@56
  1893
    llen=strlen(aline);
ali@56
  1894
    for (i=1;i<llen;i++)
ali@56
  1895
    {
ali@56
  1896
	/* for each character in the line after the first */
ali@56
  1897
	if (aline[i]=='.')
ali@56
  1898
	{
ali@56
  1899
	    /* if it's a period */
ali@56
  1900
	    if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
ali@56
  1901
	    {
ali@56
  1902
		/*
ali@56
  1903
		 * If the period follows a space and
ali@56
  1904
		 * is followed by a letter.
ali@56
  1905
		 */
ali@56
  1906
		if (pswit[ECHO_SWITCH])
ali@56
  1907
		    printf("\n%s\n",aline);
ali@56
  1908
		if (!pswit[OVERVIEW_SWITCH])
ali@56
  1909
		    printf("    Line %ld column %d - Spaced punctuation?\n",
ali@56
  1910
		      linecnt,i+1);
ali@56
  1911
		else
ali@56
  1912
		    cnt_punct++;
ali@56
  1913
	    }
ali@56
  1914
	}
ali@56
  1915
    }
ali@56
  1916
    for (i=1;i<llen;i++)
ali@56
  1917
    {
ali@56
  1918
	/* for each character in the line after the first */
ali@56
  1919
	if (aline[i]==CHAR_DQUOTE)
ali@56
  1920
	{
ali@56
  1921
	    if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
ali@56
  1922
	      !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
ali@56
  1923
	      !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
ali@56
  1924
	    {
ali@56
  1925
		if (pswit[ECHO_SWITCH])
ali@56
  1926
		    printf("\n%s\n",aline);
ali@56
  1927
		if (!pswit[OVERVIEW_SWITCH])
ali@56
  1928
		    printf("    Line %ld column %d - Unspaced quotes?\n",
ali@56
  1929
		      linecnt,i+1);
ali@56
  1930
		else
ali@56
  1931
		    cnt_punct++;
ali@56
  1932
	    }
ali@56
  1933
	}
ali@56
  1934
    }
ali@56
  1935
    /* Check parity of quotes. */
ali@56
  1936
    for (s=aline;*s;s++)
ali@56
  1937
    {
ali@56
  1938
	if (*s==CHAR_DQUOTE)
ali@56
  1939
	{
ali@56
  1940
	    parities->dquote=!parities->dquote;
ali@56
  1941
	    if (!parities->dquote)
ali@56
  1942
	    {
ali@56
  1943
		/* parity even */
ali@56
  1944
		if (!strchr("_-.'`/,;:!?)]} ",s[1]))
ali@56
  1945
		{
ali@56
  1946
		    if (pswit[ECHO_SWITCH])
ali@56
  1947
			printf("\n%s\n",aline);
ali@56
  1948
		    if (!pswit[OVERVIEW_SWITCH])
ali@56
  1949
			printf("    Line %ld column %d - "
ali@56
  1950
			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56
  1951
		    else
ali@56
  1952
			cnt_punct++;
ali@56
  1953
		}
ali@56
  1954
	    }
ali@56
  1955
	    else
ali@56
  1956
	    {
ali@56
  1957
		/* parity odd */
ali@56
  1958
		if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@56
  1959
		  !strchr("_-/.'`([{$",s[1]) || !s[1])
ali@56
  1960
		{
ali@56
  1961
		    if (pswit[ECHO_SWITCH])
ali@56
  1962
			printf("\n%s\n",aline);
ali@56
  1963
		    if (!pswit[OVERVIEW_SWITCH])
ali@56
  1964
			printf("    Line %ld column %d - "
ali@56
  1965
			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56
  1966
		    else
ali@56
  1967
			cnt_punct++;
ali@56
  1968
		}
ali@56
  1969
	    }
ali@56
  1970
	}
ali@56
  1971
    }
ali@56
  1972
    if (*aline==CHAR_DQUOTE)
ali@56
  1973
    {
ali@56
  1974
	if (strchr(",;:!?)]} ",aline[1]))
ali@56
  1975
	{
ali@56
  1976
	    if (pswit[ECHO_SWITCH])
ali@56
  1977
		printf("\n%s\n",aline);
ali@56
  1978
	    if (!pswit[OVERVIEW_SWITCH])
ali@56
  1979
		printf("    Line %ld column 1 - Wrongspaced quotes?\n",
ali@56
  1980
		  linecnt);
ali@56
  1981
	    else
ali@56
  1982
		cnt_punct++;
ali@56
  1983
	}
ali@56
  1984
    }
ali@56
  1985
    if (pswit[SQUOTE_SWITCH])
ali@56
  1986
    {
ali@56
  1987
	for (s=aline;*s;s++)
ali@56
  1988
	{
ali@56
  1989
	    if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
ali@56
  1990
	      (s==aline || s>aline && !gcisalpha(s[-1]) ||
ali@56
  1991
	      !gcisalpha(s[1])))
ali@56
  1992
	    {
ali@56
  1993
		parities->squote=!parities->squote;
ali@56
  1994
		if (!parities->squote)
ali@56
  1995
		{
ali@56
  1996
		    /* parity even */
ali@56
  1997
		    if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
ali@56
  1998
		    {
ali@56
  1999
			if (pswit[ECHO_SWITCH])
ali@56
  2000
			    printf("\n%s\n",aline);
ali@56
  2001
			if (!pswit[OVERVIEW_SWITCH])
ali@56
  2002
			    printf("    Line %ld column %d - "
ali@56
  2003
			      "Wrongspaced singlequotes?\n",
ali@56
  2004
			      linecnt,(int)(s-aline)+1);
ali@56
  2005
			else
ali@56
  2006
			    cnt_punct++;
ali@56
  2007
		    }
ali@56
  2008
		}
ali@56
  2009
		else
ali@56
  2010
		{
ali@56
  2011
		    /* parity odd */
ali@56
  2012
		    if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@56
  2013
		      !strchr("_-/\".'`",s[1]) || !s[1])
ali@56
  2014
		    {
ali@56
  2015
			if (pswit[ECHO_SWITCH])
ali@56
  2016
			    printf("\n%s\n",aline);
ali@56
  2017
			if (!pswit[OVERVIEW_SWITCH])
ali@56
  2018
			    printf("    Line %ld column %d - "
ali@56
  2019
			      "Wrongspaced singlequotes?\n",
ali@56
  2020
			      linecnt,(int)(s-aline)+1);
ali@56
  2021
			else
ali@56
  2022
			    cnt_punct++;
ali@56
  2023
		    }
ali@56
  2024
		}
ali@56
  2025
	    }
ali@56
  2026
	}
ali@56
  2027
    }
ali@56
  2028
}
ali@56
  2029
ali@55
  2030
/*
ali@57
  2031
 * check_for_double_punctuation:
ali@57
  2032
 *
ali@57
  2033
 * Look for double punctuation like ,. or ,,
ali@57
  2034
 * Thanks to DW for the suggestion!
ali@57
  2035
 * In books with references, ".," and ".;" are common
ali@57
  2036
 * e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57
  2037
 * OTOH, from my initial tests, there are also fairly
ali@57
  2038
 * common errors. What to do? Make these cases paranoid?
ali@57
  2039
 * ".," is the most common, so warnings->dotcomma is used
ali@57
  2040
 * to suppress detailed reporting if it occurs often.
ali@57
  2041
 */
ali@57
  2042
void check_for_double_punctuation(const char *aline,struct warnings *warnings)
ali@57
  2043
{
ali@57
  2044
    int i,llen;
ali@57
  2045
    llen=strlen(aline);
ali@57
  2046
    for (i=0;i<llen;i++)
ali@57
  2047
    {
ali@57
  2048
	/* for each punctuation character in the line */
ali@57
  2049
	if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&
ali@57
  2050
	  aline[i] && aline[i+1])
ali@57
  2051
	{
ali@57
  2052
	    /* followed by punctuation, it's a query, unless . . . */
ali@57
  2053
	    if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
ali@57
  2054
	      aline[i]=='!') ||
ali@57
  2055
	      !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
ali@57
  2056
	      warnings->isFrench && !strncmp(aline+i,",...",4) ||
ali@57
  2057
	      warnings->isFrench && !strncmp(aline+i,"...,",4) ||
ali@57
  2058
	      warnings->isFrench && !strncmp(aline+i,";...",4) ||
ali@57
  2059
	      warnings->isFrench && !strncmp(aline+i,"...;",4) ||
ali@57
  2060
	      warnings->isFrench && !strncmp(aline+i,":...",4) ||
ali@57
  2061
	      warnings->isFrench && !strncmp(aline+i,"...:",4) ||
ali@57
  2062
	      warnings->isFrench && !strncmp(aline+i,"!...",4) ||
ali@57
  2063
	      warnings->isFrench && !strncmp(aline+i,"...!",4) ||
ali@57
  2064
	      warnings->isFrench && !strncmp(aline+i,"?...",4) ||
ali@57
  2065
	      warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@57
  2066
	    {
ali@57
  2067
		if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
ali@57
  2068
		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||
ali@57
  2069
		  warnings->isFrench && !strncmp(aline+i,";...",4) ||
ali@57
  2070
		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||
ali@57
  2071
		  warnings->isFrench && !strncmp(aline+i,":...",4) ||
ali@57
  2072
		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||
ali@57
  2073
		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||
ali@57
  2074
		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||
ali@57
  2075
		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||
ali@57
  2076
		  warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@57
  2077
		    i+=4;
ali@57
  2078
		; /* do nothing for .. !! and ?? which can be legit */
ali@57
  2079
	    }
ali@57
  2080
	    else
ali@57
  2081
	    {
ali@57
  2082
		if (pswit[ECHO_SWITCH])
ali@57
  2083
		    printf("\n%s\n",aline);
ali@57
  2084
		if (!pswit[OVERVIEW_SWITCH])
ali@57
  2085
		    printf("    Line %ld column %d - Double punctuation?\n",
ali@57
  2086
		      linecnt,i+1);
ali@57
  2087
		else
ali@57
  2088
		    cnt_punct++;
ali@57
  2089
	    }
ali@57
  2090
	}
ali@57
  2091
    }
ali@57
  2092
}
ali@57
  2093
ali@57
  2094
/*
ali@58
  2095
 * check_for_spaced_quotes:
ali@58
  2096
 */
ali@58
  2097
void check_for_spaced_quotes(const char *aline)
ali@58
  2098
{
ali@58
  2099
    const char *s,*t;
ali@58
  2100
    s=aline;
ali@58
  2101
    while ((t=strstr(s," \" ")))
ali@58
  2102
    {
ali@58
  2103
	if (pswit[ECHO_SWITCH])
ali@58
  2104
	    printf("\n%s\n",aline);
ali@58
  2105
	if (!pswit[OVERVIEW_SWITCH])
ali@58
  2106
	    printf("    Line %ld column %d - Spaced doublequote?\n",
ali@58
  2107
	      linecnt,(int)(t-aline+1));
ali@58
  2108
	else
ali@58
  2109
	    cnt_punct++;
ali@58
  2110
	s=t+2;
ali@58
  2111
    }
ali@58
  2112
    s=aline;
ali@58
  2113
    while ((t=strstr(s," ' ")))
ali@58
  2114
    {
ali@58
  2115
	if (pswit[ECHO_SWITCH])
ali@58
  2116
	    printf("\n%s\n",aline);
ali@58
  2117
	if (!pswit[OVERVIEW_SWITCH])
ali@58
  2118
	    printf("    Line %ld column %d - Spaced singlequote?\n",
ali@58
  2119
	      linecnt,(int)(t-aline+1));
ali@58
  2120
	else
ali@58
  2121
	    cnt_punct++;
ali@58
  2122
	s=t+2;
ali@58
  2123
    }
ali@58
  2124
    s=aline;
ali@58
  2125
    while ((t=strstr(s," ` ")))
ali@58
  2126
    {
ali@58
  2127
	if (pswit[ECHO_SWITCH])
ali@58
  2128
	    printf("\n%s\n",aline);
ali@58
  2129
	if (!pswit[OVERVIEW_SWITCH])
ali@58
  2130
	    printf("    Line %ld column %d - Spaced singlequote?\n",
ali@58
  2131
	      linecnt,(int)(t-aline+1));
ali@58
  2132
	else
ali@58
  2133
	    cnt_punct++;
ali@58
  2134
	s=t+2;
ali@58
  2135
    }
ali@58
  2136
}
ali@58
  2137
ali@58
  2138
/*
ali@59
  2139
 * check_for_miscased_genative:
ali@59
  2140
 *
ali@59
  2141
 * Check special case of 'S instead of 's at end of word.
ali@59
  2142
 */
ali@59
  2143
void check_for_miscased_genative(const char *aline)
ali@59
  2144
{
ali@59
  2145
    const char *s;
ali@59
  2146
    s=aline+1;
ali@59
  2147
    while (*s)
ali@59
  2148
    {
ali@59
  2149
	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
ali@59
  2150
	{
ali@59
  2151
	    if (pswit[ECHO_SWITCH])
ali@59
  2152
		printf("\n%s\n",aline);
ali@59
  2153
	    if (!pswit[OVERVIEW_SWITCH])
ali@59
  2154
		printf("    Line %ld column %d - Capital \"S\"?\n",
ali@59
  2155
		  linecnt,(int)(s-aline+2));
ali@59
  2156
	    else
ali@59
  2157
		cnt_punct++;
ali@59
  2158
	}
ali@59
  2159
	s++;
ali@59
  2160
    }
ali@59
  2161
}
ali@59
  2162
ali@59
  2163
/*
ali@60
  2164
 * check_end_of_line:
ali@60
  2165
 *
ali@60
  2166
 * Now check special cases - start and end of line -
ali@60
  2167
 * for single and double quotes. Start is sometimes [sic]
ali@60
  2168
 * but better to query it anyway.
ali@60
  2169
 * While we're here, check for dash at end of line.
ali@60
  2170
 */
ali@60
  2171
void check_end_of_line(const char *aline,struct warnings *warnings)
ali@60
  2172
{
ali@60
  2173
    int i,llen;
ali@60
  2174
    llen=strlen(aline);
ali@60
  2175
    if (llen>1)
ali@60
  2176
    {
ali@60
  2177
	if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
ali@60
  2178
	  aline[llen-1]==CHAR_OPEN_SQUOTE)
ali@60
  2179
	    if (aline[llen-2]==CHAR_SPACE)
ali@60
  2180
	    {
ali@60
  2181
		if (pswit[ECHO_SWITCH])
ali@60
  2182
		    printf("\n%s\n",aline);
ali@60
  2183
		if (!pswit[OVERVIEW_SWITCH])
ali@60
  2184
		    printf("    Line %ld column %d - Spaced quote?\n",
ali@60
  2185
		      linecnt,llen);
ali@60
  2186
		else
ali@60
  2187
		    cnt_punct++;
ali@60
  2188
	    }
ali@60
  2189
	if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
ali@60
  2190
	  aline[1]==CHAR_SPACE)
ali@60
  2191
	{
ali@60
  2192
	    if (pswit[ECHO_SWITCH])
ali@60
  2193
		printf("\n%s\n",aline);
ali@60
  2194
	    if (!pswit[OVERVIEW_SWITCH])
ali@60
  2195
		printf("    Line %ld column 1 - Spaced quote?\n",linecnt);
ali@60
  2196
	    else
ali@60
  2197
		cnt_punct++;
ali@60
  2198
	}
ali@60
  2199
	/*
ali@60
  2200
	 * Dash at end of line may well be legit - paranoid mode only
ali@60
  2201
	 * and don't report em-dash at line-end.
ali@60
  2202
	 */
ali@60
  2203
	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@60
  2204
	{
ali@60
  2205
	    for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@60
  2206
		;
ali@60
  2207
	    if (aline[i]=='-' && aline[i-1]!='-')
ali@60
  2208
	    {
ali@60
  2209
		if (pswit[ECHO_SWITCH])
ali@60
  2210
		    printf("\n%s\n",aline);
ali@60
  2211
		if (!pswit[OVERVIEW_SWITCH])
ali@60
  2212
		    printf("    Line %ld column %d - Hyphen at end of line?\n",
ali@60
  2213
		      linecnt,i);
ali@60
  2214
	    }
ali@60
  2215
	}
ali@60
  2216
    }
ali@60
  2217
}
ali@60
  2218
ali@60
  2219
/*
ali@61
  2220
 * check_for_unspaced_bracket:
ali@61
  2221
 *
ali@61
  2222
 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@61
  2223
 * If so, suspect a scanno like "a]most".
ali@61
  2224
 */
ali@61
  2225
void check_for_unspaced_bracket(const char *aline)
ali@61
  2226
{
ali@61
  2227
    int i,llen;
ali@61
  2228
    llen=strlen(aline);
ali@61
  2229
    for (i=1;i<llen-1;i++)
ali@61
  2230
    {
ali@61
  2231
	/* for each bracket character in the line except 1st & last */
ali@61
  2232
	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
ali@61
  2233
	  gcisalpha(aline[i+1]))
ali@61
  2234
	{
ali@61
  2235
	    if (pswit[ECHO_SWITCH])
ali@61
  2236
		printf("\n%s\n",aline);
ali@61
  2237
	    if (!pswit[OVERVIEW_SWITCH])
ali@61
  2238
		printf("    Line %ld column %d - Unspaced bracket?\n",
ali@61
  2239
		  linecnt,i);
ali@61
  2240
	    else
ali@61
  2241
		cnt_punct++;
ali@61
  2242
	}
ali@61
  2243
    }
ali@61
  2244
}
ali@61
  2245
ali@61
  2246
/*
ali@62
  2247
 * check_for_unpunctuated_endquote:
ali@62
  2248
 */
ali@62
  2249
void check_for_unpunctuated_endquote(const char *aline)
ali@62
  2250
{
ali@62
  2251
    int i,llen;
ali@62
  2252
    llen=strlen(aline);
ali@62
  2253
    for (i=1;i<llen;i++)
ali@62
  2254
    {
ali@62
  2255
	/* for each character in the line except 1st */
ali@62
  2256
	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@62
  2257
	{
ali@62
  2258
	    if (pswit[ECHO_SWITCH])
ali@62
  2259
		printf("\n%s\n",aline);
ali@62
  2260
	    if (!pswit[OVERVIEW_SWITCH])
ali@62
  2261
		printf("    Line %ld column %d - "
ali@62
  2262
		  "endquote missing punctuation?\n",linecnt,i);
ali@62
  2263
	    else
ali@62
  2264
		cnt_punct++;
ali@62
  2265
	}
ali@62
  2266
    }
ali@62
  2267
}
ali@62
  2268
ali@62
  2269
/*
ali@63
  2270
 * check_for_html_tag:
ali@63
  2271
 *
ali@63
  2272
 * Check for <HTML TAG>.
ali@63
  2273
 *
ali@63
  2274
 * If there is a < in the line, followed at some point
ali@63
  2275
 * by a > then we suspect HTML.
ali@63
  2276
 */
ali@63
  2277
void check_for_html_tag(const char *aline)
ali@63
  2278
{
ali@63
  2279
    int i;
ali@63
  2280
    const char *open,*close;
ali@63
  2281
    open=strstr(aline,"<");
ali@63
  2282
    if (open)
ali@63
  2283
    {
ali@63
  2284
	close=strstr(aline,">");
ali@63
  2285
	if (close)
ali@63
  2286
	{
ali@63
  2287
	    i=(signed int)(close-open+1);
ali@63
  2288
	    if (i>0)
ali@63
  2289
	    {
ali@63
  2290
		strncpy(wrk,open,i);
ali@63
  2291
		wrk[i]=0;
ali@63
  2292
		if (pswit[ECHO_SWITCH])
ali@63
  2293
		    printf("\n%s\n",aline);
ali@63
  2294
		if (!pswit[OVERVIEW_SWITCH])
ali@63
  2295
		    printf("    Line %ld column %d - HTML Tag? %s \n",
ali@63
  2296
		      linecnt,(int)(open-aline)+1,wrk);
ali@63
  2297
		else
ali@63
  2298
		    cnt_html++;
ali@63
  2299
	    }
ali@63
  2300
	}
ali@63
  2301
    }
ali@63
  2302
}
ali@63
  2303
ali@63
  2304
/*
ali@41
  2305
 * procfile:
ali@41
  2306
 *
ali@41
  2307
 * Process one file.
ali@41
  2308
 */
ali@41
  2309
void procfile(char *filename)
ali@41
  2310
{
ali@55
  2311
    const char *s,*t;
ali@41
  2312
    char parastart[81];     /* first line of current para */
ali@41
  2313
    FILE *infile;
ali@41
  2314
    struct first_pass_results *first_pass_results;
ali@42
  2315
    struct warnings *warnings;
ali@43
  2316
    struct counters counters={0};
ali@45
  2317
    struct line_properties last={0};
ali@56
  2318
    struct parities parities={0};
ali@43
  2319
    int isemptyline;
ali@43
  2320
    long squot,start_para_line;
ali@55
  2321
    signed int i,llen,isacro,isellipsis;
ali@55
  2322
    signed int isnewpara;
ali@41
  2323
    char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
ali@41
  2324
      cbrack_err[80],unders_err[80];
ali@41
  2325
    signed int enddash;
ali@45
  2326
    last.start=CHAR_SPACE;
ali@41
  2327
    *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
ali@41
  2328
      *unders_err=*prevline=0;
ali@41
  2329
    linecnt=checked_linecnt=start_para_line=0;
ali@43
  2330
    squot=0;
ali@53
  2331
    i=llen=isacro=isellipsis=0;
ali@55
  2332
    isnewpara=enddash=0;
ali@41
  2333
    infile=fopen(filename,"rb");
ali@41
  2334
    if (!infile)
ali@41
  2335
    {
ali@41
  2336
        if (pswit[STDOUT_SWITCH])
ali@41
  2337
            fprintf(stdout,"bookloupe: cannot open %s\n",filename);
ali@41
  2338
        else
ali@41
  2339
            fprintf(stderr,"bookloupe: cannot open %s\n",filename);
ali@41
  2340
	exit(1);
ali@41
  2341
    }
ali@41
  2342
    fprintf(stdout,"\n\nFile: %s\n\n",filename);
ali@41
  2343
    first_pass_results=first_pass(infile);
ali@42
  2344
    warnings=report_first_pass(first_pass_results);
ali@42
  2345
    rewind(infile);
ali@40
  2346
    /*
ali@40
  2347
     * Here we go with the main pass. Hold onto yer hat!
ali@40
  2348
     * Re-init some variables we've dirtied.
ali@40
  2349
     */
ali@43
  2350
    squot=linecnt=0;
ali@40
  2351
    while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
ali@40
  2352
    {
ali@0
  2353
        linecnt++;
ali@40
  2354
        if (linecnt==1)
ali@40
  2355
	    isnewpara=1;
ali@40
  2356
        if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
ali@40
  2357
	    continue;    // skip DP page separators completely
ali@41
  2358
        if (linecnt<first_pass_results->firstline ||
ali@41
  2359
	  (first_pass_results->footerline>0 &&
ali@41
  2360
	  linecnt>first_pass_results->footerline))
ali@40
  2361
	{
ali@40
  2362
            if (pswit[HEADER_SWITCH])
ali@40
  2363
	    {
ali@40
  2364
                if (!strncmp(aline,"Title:",6))
ali@40
  2365
                    printf("    %s\n",aline);
ali@40
  2366
                if (!strncmp(aline,"Author:",7))
ali@40
  2367
                    printf("    %s\n",aline);
ali@40
  2368
                if (!strncmp(aline,"Release Date:",13))
ali@40
  2369
                    printf("    %s\n",aline);
ali@40
  2370
                if (!strncmp(aline,"Edition:",8))
ali@40
  2371
                    printf("    %s\n\n",aline);
ali@40
  2372
	    }
ali@0
  2373
            continue;                /* skip through the header */
ali@40
  2374
	}
ali@0
  2375
        checked_linecnt++;
ali@40
  2376
        s=aline;
ali@40
  2377
        /*
ali@40
  2378
	 * If we are in a state of unbalanced quotes, and this line
ali@40
  2379
         * doesn't begin with a quote, output the stored error message.
ali@40
  2380
         * If the -P switch was used, print the warning even if the
ali@40
  2381
         * new para starts with quotes.
ali@40
  2382
	 */
ali@40
  2383
        t=s;
ali@40
  2384
        while (*t==' ')
ali@40
  2385
	    t++;
ali@0
  2386
        if (*dquote_err)
ali@40
  2387
            if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
ali@40
  2388
	    {
ali@40
  2389
                if (!pswit[OVERVIEW_SWITCH])
ali@40
  2390
		{
ali@40
  2391
                    if (pswit[ECHO_SWITCH])
ali@40
  2392
			printf("\n%s\n",parastart);
ali@0
  2393
                    printf(dquote_err);
ali@40
  2394
		}
ali@0
  2395
                else
ali@0
  2396
                    cnt_dquot++;
ali@0
  2397
            }
ali@40
  2398
        if (*squote_err)
ali@40
  2399
	{
ali@40
  2400
            if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
ali@40
  2401
	      pswit[QPARA_SWITCH] || squot)
ali@40
  2402
	    {
ali@40
  2403
                if (!pswit[OVERVIEW_SWITCH])
ali@40
  2404
		{
ali@40
  2405
                    if (pswit[ECHO_SWITCH])
ali@40
  2406
			printf("\n%s\n",parastart);
ali@0
  2407
                    printf(squote_err);
ali@40
  2408
		}
ali@0
  2409
                else
ali@0
  2410
                    cnt_squot++;
ali@40
  2411
	    }
ali@40
  2412
            squot=0;
ali@40
  2413
	}
ali@40
  2414
        if (*rbrack_err)
ali@40
  2415
	{
ali@40
  2416
            if (!pswit[OVERVIEW_SWITCH])
ali@40
  2417
	    {
ali@40
  2418
                if (pswit[ECHO_SWITCH])
ali@40
  2419
		    printf("\n%s\n",parastart);
ali@0
  2420
                printf(rbrack_err);
ali@40
  2421
	    }
ali@0
  2422
            else
ali@0
  2423
                cnt_brack++;
ali@40
  2424
	}
ali@40
  2425
        if (*sbrack_err)
ali@40
  2426
	{
ali@40
  2427
            if (!pswit[OVERVIEW_SWITCH])
ali@40
  2428
	    {
ali@40
  2429
                if (pswit[ECHO_SWITCH])
ali@40
  2430
		    printf("\n%s\n",parastart);
ali@0
  2431
                printf(sbrack_err);
ali@40
  2432
	    }
ali@0
  2433
            else
ali@0
  2434
                cnt_brack++;
ali@40
  2435
	}
ali@40
  2436
        if (*cbrack_err)
ali@40
  2437
	{
ali@40
  2438
            if (!pswit[OVERVIEW_SWITCH])
ali@40
  2439
	    {
ali@40
  2440
                if (pswit[ECHO_SWITCH])
ali@40
  2441
		    printf("\n%s\n",parastart);
ali@0
  2442
                printf(cbrack_err);
ali@40
  2443
	    }
ali@0
  2444
            else
ali@0
  2445
                cnt_brack++;
ali@40
  2446
	}
ali@40
  2447
        if (*unders_err)
ali@40
  2448
	{
ali@40
  2449
            if (!pswit[OVERVIEW_SWITCH])
ali@40
  2450
	    {
ali@40
  2451
                if (pswit[ECHO_SWITCH])
ali@40
  2452
		    printf("\n%s\n",parastart);
ali@0
  2453
                printf(unders_err);
ali@40
  2454
	    }
ali@0
  2455
            else
ali@0
  2456
                cnt_brack++;
ali@40
  2457
	}
ali@40
  2458
        *dquote_err=*squote_err=*rbrack_err=*cbrack_err= 
ali@40
  2459
	  *sbrack_err=*unders_err=0;
ali@43
  2460
	isemptyline=analyse_quotes(aline,&counters);
ali@40
  2461
        if (isnewpara && !isemptyline)
ali@40
  2462
	{
ali@40
  2463
	    /* This line is the start of a new paragraph. */
ali@40
  2464
            start_para_line=linecnt;
ali@40
  2465
	    /* Capture its first line in case we want to report it later. */
ali@40
  2466
            strncpy(parastart,aline,80);
ali@40
  2467
            parastart[79]=0;
ali@56
  2468
	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
ali@40
  2469
            s=aline;
ali@40
  2470
            while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
ali@40
  2471
		s++;
ali@40
  2472
            if (*s>='a' && *s<='z')
ali@40
  2473
	    {
ali@40
  2474
		/* and its first letter is lowercase */
ali@40
  2475
                if (pswit[ECHO_SWITCH])
ali@40
  2476
		    printf("\n%s\n",aline);
ali@0
  2477
                if (!pswit[OVERVIEW_SWITCH])
ali@40
  2478
                    printf("    Line %ld column %d - "
ali@40
  2479
		      "Paragraph starts with lower-case\n",
ali@40
  2480
		      linecnt,(int)(s-aline)+1);
ali@0
  2481
                else
ali@0
  2482
                    cnt_punct++;
ali@40
  2483
	    }
ali@40
  2484
            isnewpara=0; /* Signal the end of new para processing. */
ali@40
  2485
	}
ali@40
  2486
        /* Check for an em-dash broken at line end. */
ali@40
  2487
        if (enddash && *aline=='-')
ali@40
  2488
	{
ali@40
  2489
            if (pswit[ECHO_SWITCH])
ali@40
  2490
		printf("\n%s\n",aline);
ali@0
  2491
            if (!pswit[OVERVIEW_SWITCH])
ali@40
  2492
                printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@0
  2493
            else
ali@0
  2494
                cnt_punct++;
ali@40
  2495
	}
ali@40
  2496
        enddash=0;
ali@40
  2497
        for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
ali@40
  2498
	    ;
ali@40
  2499
        if (s>=aline && *s=='-')
ali@40
  2500
            enddash=1;
ali@40
  2501
	/*
ali@40
  2502
         * Check for invalid or questionable characters in the line
ali@40
  2503
         * Anything above 127 is invalid for plain ASCII, and
ali@40
  2504
         * non-printable control characters should also be flagged.
ali@40
  2505
         * Tabs should generally not be there.
ali@40
  2506
	 */
ali@40
  2507
        for (s=aline;*s;s++)
ali@40
  2508
	{
ali@40
  2509
            i=(unsigned char)*s;
ali@40
  2510
            if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
ali@40
  2511
	    {
ali@40
  2512
                if (pswit[ECHO_SWITCH])
ali@40
  2513
		    printf("\n%s\n",aline);
ali@0
  2514
                if (!pswit[OVERVIEW_SWITCH])
ali@40
  2515
                    printf("    Line %ld column %d - Control character %d\n",
ali@40
  2516
		      linecnt,(int)(s-aline)+1,i);
ali@0
  2517
                else
ali@0
  2518
                    cnt_bin++;
ali@40
  2519
	    }
ali@40
  2520
	}
ali@42
  2521
        if (warnings->bin)
ali@44
  2522
	    check_for_odd_characters(aline,warnings,isemptyline);
ali@42
  2523
        if (warnings->longline)
ali@45
  2524
	    check_for_long_line(aline);
ali@45
  2525
        if (warnings->shortline)
ali@45
  2526
	    check_for_short_line(aline,&last);
ali@45
  2527
        last.blen=last.len;
ali@45
  2528
        last.len=strlen(aline);
ali@45
  2529
        last.start=aline[0];
ali@46
  2530
	check_for_starting_punctuation(aline);
ali@42
  2531
        if (warnings->dash)
ali@40
  2532
	{
ali@47
  2533
	    check_for_spaced_emdash(aline);
ali@47
  2534
	    check_for_spaced_dash(aline);
ali@40
  2535
	}
ali@48
  2536
	check_for_unmarked_paragraphs(aline);
ali@49
  2537
	check_for_jeebies(aline);
ali@50
  2538
	check_for_mta_from(aline);
ali@51
  2539
	check_for_orphan_character(aline);
ali@52
  2540
	check_for_pling_scanno(aline);
ali@53
  2541
	check_for_extra_period(aline,warnings);
ali@54
  2542
	check_for_following_punctuation(aline);
ali@55
  2543
	check_for_typos(aline,warnings);
ali@56
  2544
	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57
  2545
	check_for_double_punctuation(aline,warnings);
ali@58
  2546
	check_for_spaced_quotes(aline);
ali@59
  2547
	check_for_miscased_genative(aline);
ali@60
  2548
	check_end_of_line(aline,warnings);
ali@61
  2549
	check_for_unspaced_bracket(aline);
ali@42
  2550
        if (warnings->endquote)
ali@62
  2551
	    check_for_unpunctuated_endquote(aline);
ali@63
  2552
	check_for_html_tag(aline);
ali@40
  2553
        /*
ali@40
  2554
	 * Check for &symbol; HTML.
ali@40
  2555
         * If there is a & in the line, followed at
ali@40
  2556
         * some point by a ; then we suspect HTML.
ali@40
  2557
	 */
ali@40
  2558
        if (strstr(aline,"&") && strstr(aline,";"))
ali@40
  2559
	{
ali@40
  2560
            i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
ali@40
  2561
            for (s=strstr(aline,"&");s<strstr(aline,";");s++)   
ali@40
  2562
                if (*s==CHAR_SPACE)
ali@40
  2563
		    i=0;                /* Don't report "Jones & Son;" */
ali@40
  2564
            if (i>0)
ali@40
  2565
	    {
ali@40
  2566
                strncpy(wrk,strstr(aline,"&"),i);
ali@40
  2567
                wrk[i]=0;
ali@40
  2568
                if (pswit[ECHO_SWITCH])
ali@40
  2569
		    printf("\n%s\n",aline);
ali@0
  2570
                if (!pswit[OVERVIEW_SWITCH])
ali@40
  2571
                    printf("    Line %ld column %d - HTML symbol? %s \n",
ali@40
  2572
		      linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
ali@0
  2573
                else
ali@0
  2574
                    cnt_html++;
ali@40
  2575
	    }
ali@40
  2576
	}
ali@40
  2577
        /*
ali@40
  2578
	 * At end of paragraph, check for mismatched quotes.
ali@40
  2579
         * We don't want to report an error immediately, since it is a
ali@40
  2580
         * common convention to omit the quotes at end of paragraph if
ali@40
  2581
         * the next paragraph is a continuation of the same speaker.
ali@40
  2582
         * Where this is the case, the next para should begin with a
ali@40
  2583
         * quote, so we store the warning message and only display it
ali@40
  2584
         * at the top of the next iteration if the new para doesn't
ali@40
  2585
         * start with a quote.
ali@40
  2586
         * The -p switch overrides this default, and warns of unclosed
ali@40
  2587
         * quotes on _every_ paragraph, whether the next begins with a
ali@40
  2588
         * quote or not.
ali@40
  2589
	 */
ali@40
  2590
        if (isemptyline)
ali@40
  2591
	{
ali@40
  2592
	    /* end of para - add up the totals */
ali@43
  2593
            if (counters.quot%2)
ali@40
  2594
                sprintf(dquote_err,"    Line %ld - Mismatched quotes\n",
ali@40
  2595
		  linecnt);
ali@43
  2596
            if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
ali@43
  2597
	      counters.open_single_quote!=counters.close_single_quote)
ali@40
  2598
                sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n",
ali@40
  2599
		  linecnt);
ali@43
  2600
            if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
ali@43
  2601
	      counters.open_single_quote!=counters.close_single_quote &&
ali@43
  2602
	      counters.open_single_quote!=counters.close_single_quote+1)
ali@40
  2603
		/*
ali@40
  2604
		 * Flag it to be noted regardless of the
ali@40
  2605
		 * first char of the next para.
ali@40
  2606
		 */
ali@40
  2607
                squot=1;
ali@43
  2608
            if (counters.r_brack)
ali@40
  2609
                sprintf(rbrack_err,"    Line %ld - "
ali@40
  2610
		  "Mismatched round brackets?\n",linecnt);
ali@43
  2611
            if (counters.s_brack)
ali@40
  2612
                sprintf(sbrack_err,"    Line %ld - "
ali@40
  2613
		  "Mismatched square brackets?\n",linecnt);
ali@43
  2614
            if (counters.c_brack)
ali@40
  2615
                sprintf(cbrack_err,"    Line %ld - "
ali@40
  2616
		  "Mismatched curly brackets?\n",linecnt);
ali@43
  2617
            if (counters.c_unders%2)
ali@40
  2618
                sprintf(unders_err,"    Line %ld - Mismatched underscores?\n",
ali@40
  2619
		  linecnt);
ali@43
  2620
	    memset(&counters,0,sizeof(counters));
ali@40
  2621
	    /* let the next iteration know that it's starting a new para */
ali@40
  2622
            isnewpara=1;
ali@40
  2623
	}
ali@40
  2624
        /*
ali@40
  2625
	 * Check for omitted punctuation at end of paragraph by working back
ali@40
  2626
	 * through prevline. DW.
ali@40
  2627
         * Need to check this only for "normal" paras.
ali@40
  2628
         * So what is a "normal" para?
ali@40
  2629
         *    Not normal if one-liner (chapter headings, etc.)
ali@40
  2630
         *    Not normal if doesn't contain at least one locase letter
ali@40
  2631
         *    Not normal if starts with space
ali@40
  2632
	 */
ali@40
  2633
        if (isemptyline)
ali@40
  2634
	{
ali@40
  2635
	    /* end of para */
ali@40
  2636
            for (s=prevline,i=0;*s && !i;s++)
ali@0
  2637
                if (gcisletter(*s))
ali@40
  2638
		    /* use i to indicate the presence of a letter on the line */
ali@40
  2639
                    i=1;
ali@40
  2640
            /*
ali@40
  2641
	     * This next "if" is a problem.
ali@40
  2642
             * If we say "start_para_line <= linecnt - 1", that includes
ali@40
  2643
	     * one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@40
  2644
             * If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@40
  2645
             * misses genuine one-line paragraphs.
ali@40
  2646
	     */
ali@45
  2647
            if (i && last.blen>2 && start_para_line<linecnt-1 &&
ali@40
  2648
	      *prevline>CHAR_SPACE)
ali@40
  2649
	    {
ali@40
  2650
                for (i=strlen(prevline)-1;
ali@40
  2651
		  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
ali@40
  2652
		  prevline[i]>CHAR_SPACE && i>0;
ali@40
  2653
		  i--)
ali@40
  2654
		    ;
ali@40
  2655
                for (;i>0;i--)
ali@40
  2656
		{
ali@40
  2657
                    if (gcisalpha(prevline[i]))
ali@40
  2658
		    {
ali@40
  2659
                        if (pswit[ECHO_SWITCH])
ali@40
  2660
			    printf("\n%s\n",prevline);
ali@0
  2661
                        if (!pswit[OVERVIEW_SWITCH])
ali@40
  2662
                            printf("    Line %ld column %d - "
ali@40
  2663
			      "No punctuation at para end?\n",
ali@40
  2664
			      linecnt-1,strlen(prevline));
ali@0
  2665
                        else
ali@0
  2666
                            cnt_punct++;
ali@0
  2667
                        break;
ali@40
  2668
		    }
ali@40
  2669
                    if (strchr("-.:!([{?}])",prevline[i]))
ali@0
  2670
                        break;
ali@40
  2671
		}
ali@40
  2672
	    }
ali@40
  2673
	}
ali@40
  2674
        strcpy(prevline,aline);
ali@0
  2675
    }
ali@40
  2676
    fclose(infile);
ali@0
  2677
    if (!pswit[OVERVIEW_SWITCH])
ali@40
  2678
        for (i=0;i<MAX_QWORD;i++)
ali@0
  2679
            if (dupcnt[i])
ali@40
  2680
                printf("\nNote: Queried word %s was duplicated %d time%s\n",
ali@40
  2681
		  qword[i],dupcnt[i],"s");
ali@0
  2682
}
ali@0
  2683
ali@40
  2684
/*
ali@40
  2685
 * flgets:
ali@40
  2686
 *
ali@40
  2687
 * Get one line from the input stream, checking for
ali@40
  2688
 * the existence of exactly one CR/LF line-end per line.
ali@40
  2689
 *
ali@40
  2690
 * Returns: a pointer to the line.
ali@40
  2691
 */
ali@40
  2692
char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
ali@0
  2693
{
ali@0
  2694
    char c;
ali@40
  2695
    int len,isCR,cint;
ali@40
  2696
    *theline=0;
ali@40
  2697
    len=isCR=0;
ali@40
  2698
    c=cint=fgetc(thefile);
ali@40
  2699
    do
ali@40
  2700
    {
ali@40
  2701
        if (cint==EOF)
ali@40
  2702
            return NULL;
ali@40
  2703
	/* either way, it's end of line */
ali@40
  2704
        if (c==10)
ali@40
  2705
	{
ali@0
  2706
            if (isCR)
ali@0
  2707
                break;
ali@40
  2708
            else
ali@40
  2709
	    {
ali@40
  2710
		/* Error - a LF without a preceding CR */
ali@40
  2711
                if (pswit[LINE_END_SWITCH])
ali@40
  2712
		{
ali@40
  2713
                    if (pswit[ECHO_SWITCH])
ali@40
  2714
			printf("\n%s\n",theline);
ali@0
  2715
                    if (!pswit[OVERVIEW_SWITCH])
ali@40
  2716
                        printf("    Line %ld - No CR?\n",lcnt);
ali@0
  2717
                    else
ali@0
  2718
                        cnt_lineend++;
ali@40
  2719
		}
ali@0
  2720
                break;
ali@40
  2721
	    }
ali@40
  2722
	}
ali@40
  2723
        if (c==13)
ali@40
  2724
	{
ali@40
  2725
            if (isCR)
ali@40
  2726
	    {
ali@40
  2727
		/* Error - two successive CRs */
ali@40
  2728
                if (pswit[LINE_END_SWITCH])
ali@40
  2729
		{
ali@40
  2730
                    if (pswit[ECHO_SWITCH])
ali@40
  2731
			printf("\n%s\n",theline);
ali@0
  2732
                    if (!pswit[OVERVIEW_SWITCH])
ali@40
  2733
                        printf("    Line %ld - Two successive CRs?\n",lcnt);
ali@0
  2734
                    else
ali@0
  2735
                        cnt_lineend++;
ali@40
  2736
		}
ali@40
  2737
	    }
ali@40
  2738
            isCR=1;
ali@40
  2739
	}
ali@40
  2740
        else
ali@40
  2741
	{
ali@40
  2742
            if (pswit[LINE_END_SWITCH] && isCR)
ali@40
  2743
	    {
ali@40
  2744
                if (pswit[ECHO_SWITCH])
ali@40
  2745
		    printf("\n%s\n",theline);
ali@0
  2746
                if (!pswit[OVERVIEW_SWITCH])
ali@40
  2747
                    printf("    Line %ld column %d - CR without LF?\n",
ali@40
  2748
		      lcnt,len+1);
ali@0
  2749
                else
ali@0
  2750
                    cnt_lineend++;
ali@40
  2751
	    }
ali@40
  2752
            theline[len]=c;
ali@40
  2753
            len++;
ali@40
  2754
            theline[len]=0;
ali@40
  2755
            isCR=0;
ali@40
  2756
	}
ali@40
  2757
        c=cint=fgetc(thefile);
ali@40
  2758
    } while(len<maxlen);
ali@0
  2759
    if (pswit[MARKUP_SWITCH])  
ali@0
  2760
        postprocess_for_HTML(theline);
ali@0
  2761
    if (pswit[DP_SWITCH])  
ali@0
  2762
        postprocess_for_DP(theline);
ali@40
  2763
    return theline;
ali@0
  2764
}
ali@0
  2765
ali@40
  2766
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it
 * contains a mixture of alpha and digits. Generally, this is an
 * error, but may not be for cases like 4th or L5 12s. 3d.
 *
 * The accepted forms are: leading digits followed by exactly one of
 * st/rd/nd/th, sts/rds/nds/ths or stly/rdly/ndly/thly (compared with
 * matchword()); leading digits followed by a single l, L, s or d; and
 * a capital L followed by something that is not itself a mixture.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    char *s;
    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
        if (gcisalpha(*s))
            wehavealetter=1;
        else
            if (gcisdigit(*s))
                wehaveadigit=1;
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        wl=strlen(checkword);
        /* count the digits the word starts with */
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /* digits, ending in st, rd, nd, th of either case */
        if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
          matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
          matchword(checkword+wl-2,"th")))
            query=0;
        if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
          matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
          matchword(checkword+wl-3,"ths")))
            query=0;
        /*
         * The suffix here is four characters long, so the word must be
         * the leading digits plus four. Requiring that also guarantees
         * wl>=4, so checkword+wl-4 cannot point in front of the word.
         */
        if (firstdigits+4==wl && (matchword(checkword+wl-4,"stly") ||
          matchword(checkword+wl-4,"rdly") ||
          matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
            query=0;
        /* digits, ending in l, L, s or d */
        if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
          checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
ali@0
  2822
ali@40
  2823
/*
ali@40
  2824
 * getaword:
ali@40
  2825
 *
ali@40
  2826
 * Extracts the first/next "word" from the line, and puts
ali@40
  2827
 * it into "thisword". A word is defined as one English word unit--or
ali@40
  2828
 * at least that's the aim.
ali@40
  2829
 *
ali@40
  2830
 * Returns: a pointer to the position in the line where we will start
ali@40
  2831
 *          looking for the next word.
ali@40
  2832
 */
ali@54
  2833
const char *getaword(const char *fromline,char *thisword)
ali@0
  2834
{
ali@40
  2835
    int i,wordlen;
ali@54
  2836
    const char *s;
ali@40
  2837
    wordlen=0;
ali@40
  2838
    for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
ali@40
  2839
      fromline++)
ali@40
  2840
	;
ali@40
  2841
    /*
ali@40
  2842
     * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40
  2843
     * Especially yucky is the case of L1,000
ali@40
  2844
     * This section looks for a pattern of characters including a digit
ali@40
  2845
     * followed by a comma or period followed by one or more digits.
ali@40
  2846
     * If found, it returns this whole pattern as a word; otherwise we discard
ali@40
  2847
     * the results and resume our normal programming.
ali@40
  2848
     */
ali@40
  2849
    s=fromline;
ali@40
  2850
    for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
ali@40
  2851
      wordlen<MAXWORDLEN;s++)
ali@40
  2852
    {
ali@40
  2853
	thisword[wordlen]=*s;
ali@0
  2854
        wordlen++;
ali@40
  2855
    }
ali@40
  2856
    thisword[wordlen]=0;
ali@40
  2857
    for (i=1;i<wordlen-1;i++)
ali@40
  2858
    {
ali@40
  2859
        if (thisword[i]=='.' || thisword[i]==',')
ali@40
  2860
	{
ali@40
  2861
            if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
ali@40
  2862
	    {
ali@40
  2863
                fromline=s;
ali@40
  2864
                return fromline;
ali@40
  2865
	    }
ali@40
  2866
	}
ali@40
  2867
    }
ali@0
  2868
    /* we didn't find a punctuated number - do the regular getword thing */
ali@40
  2869
    wordlen=0;
ali@40
  2870
    for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
ali@40
  2871
      wordlen<MAXWORDLEN;fromline++)
ali@40
  2872
    {
ali@40
  2873
        thisword[wordlen]=*fromline;
ali@0
  2874
        wordlen++;
ali@40
  2875
    }
ali@40
  2876
    thisword[wordlen]=0;
ali@40
  2877
    return fromline;
ali@0
  2878
}
ali@0
  2879
ali@40
  2880
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * Both strings must have the same length and be equal character by
 * character once each character has been passed through toupper().
 *
 * Returns: 1 if the strings match, 0 otherwise.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len;
    len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /*
         * toupper() is only defined for values representable as
         * unsigned char (or EOF), so 8-bit characters held in a plain,
         * possibly signed, char must be converted first.
         */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0;
    }
    return 1;
}
ali@0
  2896
ali@40
  2897
/*
 * lowerit:
 *
 * Converts the line to lower case in place. Only the characters
 * 'A'..'Z' are changed (by adding 32 to them); every other character,
 * including any 8-bit character, is left untouched.
 */
void lowerit(char *theline)
{
    const int case_offset = 32;   /* amount added to each 'A'..'Z' */
    char *p = theline;

    while (*p != '\0')
    {
        if ('A' <= *p && *p <= 'Z')
            *p = (char)(*p + case_offset);
        p++;
    }
}
ali@0
  2909
ali@40
  2910
/*
 * isroman:
 *
 * Does this (lower-case) word LOOK like a Roman numeral?
 *
 * This is a shape check, not a validator: MXXXXXXXXXX is accepted, for
 * example. The word is consumed left to right, each item being optional
 * and tried exactly in this order:
 *
 *   any number of m, one d, cm, cd, any number of c,
 *   xl, xc, one l, any number of x,
 *   ix, iv, one v, any number of i.
 *
 * Only lower-case letters are recognised.
 *
 * Returns: 1 if the whole word was consumed, 0 if anything is left over
 *          or if the word is NULL or empty.
 */
int isroman(char *t)
{
    if (t == NULL || *t == '\0')
        return 0;

    /* Thousands and hundreds. */
    for (; *t == 'm'; t++)
        ;
    if (*t == 'd')
        t += 1;
    if (t[0] == 'c' && t[1] == 'm')
        t += 2;
    if (t[0] == 'c' && t[1] == 'd')
        t += 2;
    for (; *t == 'c'; t++)
        ;

    /* Tens. */
    if (t[0] == 'x' && t[1] == 'l')
        t += 2;
    if (t[0] == 'x' && t[1] == 'c')
        t += 2;
    if (*t == 'l')
        t += 1;
    for (; *t == 'x'; t++)
        ;

    /* Units. */
    if (t[0] == 'i' && t[1] == 'x')
        t += 2;
    if (t[0] == 'i' && t[1] == 'v')
        t += 2;
    if (*t == 'v')
        t += 1;
    for (; *t == 'i'; t++)
        ;

    /* A Roman-looking word leaves nothing behind. */
    return *t == '\0';
}
ali@0
  2958
ali@40
  2959
/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * With the standard function, 8-bit accented characters break words, so
 * that tete with accented characters appears to be two words, "t" and
 * "t", with 8-bit characters between them, which causes over-reporting
 * of errors. gcisalpha() therefore also recognizes accented letters from
 * the CP1252 (Windows) and ISO-8859-1 character sets, which are the most
 * common PG 8-bit types.
 *
 * Accepted: a-z, A-Z, the codes 140, 142, 156, 158 and 159, and every
 * code from 192 upwards except 208, 215, 222, 240, 247 and 254.
 *
 * Returns: 1 if c is treated as a letter, 0 otherwise.
 */
int gcisalpha(unsigned char c)
{
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
        return 1;

    if (c >= 192)
    {
        /* The upper block is letters apart from a few exceptions. */
        switch (c)
        {
        case 208:
        case 215:
        case 222:
        case 240:
        case 247:
        case 254:
            return 0;
        default:
            return 1;
        }
    }

    /* Below 192 only a handful of isolated codes count as letters. */
    switch (c)
    {
    case 140:
    case 142:
    case 156:
    case 158:
    case 159:
        return 1;
    default:
        return 0;
    }
}
ali@0
  2983
ali@40
  2984
/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts:
 * only the characters '0' to '9' count as digits.
 *
 * Returns: 1 if c is a digit, 0 otherwise.
 */
int gcisdigit(unsigned char c)
{
    if (c < '0')
        return 0;
    if (c > '9')
        return 0;
    return 1;
}
ali@0
  2993
ali@40
  2994
/*
 * gcisletter:
 *
 * A letter test that doesn't get confused in 8-bit texts.
 * NB: this is ISO-8859-1-specific.
 *
 * Accepts A-Z, a-z and every code from 192 upwards (with no exceptions,
 * unlike gcisalpha()).
 *
 * Returns: 1 if c is treated as a letter, 0 otherwise.
 */
int gcisletter(unsigned char c)
{
    if (c >= 192)
        return 1;
    if (c >= 'a')
        return c <= 'z';
    if (c >= 'A')
        return c <= 'Z';
    return 0;
}
ali@0
  3004
ali@40
  3005
/*
 * gcstrchr:
 *
 * Wraps strchr() so that searching for the NUL character yields NULL
 * instead of a pointer to the string's terminator.
 *
 * Returns: a pointer to the first occurrence of c in s, or NULL if c is
 *          zero or does not occur.
 */
char *gcstrchr(char *s,char c)
{
    return (c != '\0') ? strchr(s, c) : NULL;
}
ali@0
  3016
ali@40
  3017
/*
ali@40
  3018
 * postprocess_for_DP:
ali@40
  3019
 *
ali@40
  3020
 * Invoked with the -d switch from flgets().
ali@40
  3021
 * It simply "removes" from the line a hard-coded set of common
ali@40
  3022
 * DP-specific tags, so that the line passed to the main routine has
ali@40
  3023
 * been pre-cleaned of DP markup.
ali@40
  3024
 */
ali@0
  3025
void postprocess_for_DP(char *theline)
ali@0
  3026
{
ali@40
  3027
    char *s,*t;
ali@0
  3028
    int i;
ali@0
  3029
    if (!*theline) 
ali@0
  3030
        return;
ali@40
  3031
    for (i=0;*DPmarkup[i];i++)
ali@40
  3032
    {
ali@40
  3033
        s=strstr(theline,DPmarkup[i]);
ali@40
  3034
        while (s)
ali@40
  3035
	{
ali@40
  3036
            t=s+strlen(DPmarkup[i]);
ali@40
  3037
            while (*t)
ali@40
  3038
	    {
ali@40
  3039
                *s=*t;
ali@40
  3040
                t++;
ali@40
  3041
		s++;
ali@40
  3042
	    }
ali@40
  3043
            *s=0;
ali@40
  3044
            s=strstr(theline,DPmarkup[i]);
ali@40
  3045
	}
ali@40
  3046
    }
ali@0
  3047
}
ali@0
  3048
ali@40
  3049
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Pre-cleans the line of HTML, in place, before it reaches the main
 * routine:
 *
 *  - if the line contains both a '<' and a '>', losemarkup() is called
 *    repeatedly until it reports nothing more to remove;
 *  - loseentities() is then called repeatedly until it reports nothing
 *    more to replace.
 */
void postprocess_for_HTML(char *theline)
{
    if (strchr(theline, '<') != NULL && strchr(theline, '>') != NULL)
    {
        while (losemarkup(theline) != NULL)
            continue;
    }

    while (loseentities(theline) != NULL)
        continue;
}
ali@0
  3066
ali@0
  3067
char *losemarkup(char *theline)
ali@0
  3068
{
ali@40
  3069
    char *s,*t;
ali@0
  3070
    int i;
ali@0
  3071
    if (!*theline) 
ali@40
  3072
        return NULL;
ali@40
  3073
    s=strstr(theline,"<");
ali@40
  3074
    t=strstr(theline,">");
ali@40
  3075
    if (!s || !t)
ali@40
  3076
	return NULL;
ali@40
  3077
    for (i=0;*markup[i];i++)
ali@40
  3078
        if (!tagcomp(s+1,markup[i]))
ali@40
  3079
	{
ali@40
  3080
            if (!t[1])
ali@40
  3081
	    {
ali@40
  3082
                *s=0;
ali@40
  3083
                return s;
ali@40
  3084
	    }
ali@40
  3085
            else if (t>s)
ali@40
  3086
	    {
ali@40
  3087
		strcpy(s,t+1);
ali@40
  3088
		return s;
ali@40
  3089
	    }
ali@0
  3090
        }
ali@40
  3091
    /* It's an unrecognized <xxx>. */
ali@40
  3092
    return NULL;
ali@0
  3093
}
ali@0
  3094
ali@0
  3095
char *loseentities(char *theline)
ali@0
  3096
{
ali@0
  3097
    int i;
ali@40
  3098
    char *s,*t;
ali@0
  3099
    if (!*theline) 
ali@40
  3100
        return NULL;
ali@40
  3101
    for (i=0;*entities[i].htmlent;i++)
ali@40
  3102
    {
ali@40
  3103
        s=strstr(theline,entities[i].htmlent);
ali@40
  3104
        if (s)
ali@40
  3105
	{
ali@40
  3106
            t=malloc((size_t)strlen(s));
ali@40
  3107
            if (!t)
ali@40
  3108
		return NULL;
ali@40
  3109
            strcpy(t,s+strlen(entities[i].htmlent));
ali@40
  3110
            strcpy(s,entities[i].textent);
ali@40
  3111
            strcat(s,t);
ali@0
  3112
            free(t);
ali@40
  3113
            return theline;
ali@40
  3114
	}
ali@40
  3115
    }
ali@40
  3116
    for (i=0;*entities[i].htmlnum;i++)
ali@40
  3117
    {
ali@40
  3118
        s=strstr(theline,entities[i].htmlnum);
ali@40
  3119
        if (s)
ali@40
  3120
	{
ali@40
  3121
            t=malloc((size_t)strlen(s));
ali@40
  3122
            if (!t)
ali@40
  3123
		return NULL;
ali@40
  3124
            strcpy(t,s+strlen(entities[i].htmlnum));
ali@40
  3125
            strcpy(s,entities[i].textent);
ali@40
  3126
            strcat(s,t);
ali@0
  3127
            free(t);
ali@40
  3128
            return theline;
ali@40
  3129
	}
ali@40
  3130
    }
ali@40
  3131
    return NULL;
ali@0
  3132
}
ali@0
  3133
ali@40
  3134
/*
 * tagcomp:
 *
 * Case-insensitively compares the start of a tag's text (what follows
 * the '<') against a base tag name. One leading '/' in "strin" is
 * ignored, so closing tags match too.
 *
 * The comparison stops at the end of whichever string is shorter, so
 * this is a prefix test: "img>" matches base tag "i", and an empty
 * "strin" matches anything.
 *
 * Returns: 0 if no difference was found (strcmp-style), 1 otherwise.
 */
int tagcomp(char *strin,char *basetag)
{
    const unsigned char *s,*t;
    /*
     * Work with unsigned char: tolower() is only defined for values
     * representable as unsigned char (or EOF), and the line may hold
     * 8-bit characters in a plain, possibly signed, char.
     */
    s=(const unsigned char *)basetag;
    t=(const unsigned char *)strin;
    if (*t=='/')
        t++; /* ignore a slash */
    for (;*s && *t;s++,t++)
        if (tolower(*s)!=tolower(*t))
            return 1;
    return 0;
}
ali@0
  3150
ali@40
  3151
void proghelp()
ali@0
  3152
{
ali@40
  3153
    fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40
  3154
    fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
ali@40
  3155
    fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
ali@40
  3156
    fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40
  3157
      "For details, read the file COPYING.\n",stderr);
ali@40
  3158
    fputs("This is Free Software; "
ali@40
  3159
      "you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40
  3160
    fputs("read the file COPYING for details.\n\n",stderr);
ali@40
  3161
    fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
ali@40
  3162
    fputs("  where -s checks single quotes, -e suppresses echoing lines, "
ali@40
  3163
      "-t checks typos\n",stderr);
ali@40
  3164
    fputs("  -x (paranoid) switches OFF -t and extra checks, "
ali@40
  3165
      "-l turns OFF line-end checks\n",stderr);
ali@40
  3166
    fputs("  -o just displays overview without detail, "
ali@40
  3167
      "-h echoes header fields\n",stderr);
ali@40
  3168
    fputs("  -v (verbose) unsuppresses duplicate reporting, "
ali@40
  3169
      "-m suppresses markup\n",stderr);
ali@0
  3170
    fputs("  -d ignores DP-specific markup,\n",stderr);
ali@40
  3171
    fputs("  -u uses a file gutcheck.typ to query user-defined "
ali@40
  3172
      "possible typos\n",stderr);
ali@40
  3173
    fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
ali@0
  3174
    fputs("\n",stderr);
ali@40
  3175
    fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
ali@40
  3176
      stderr);
ali@40
  3177
    fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40
  3178
      "non-ASCII\n",stderr);
ali@40
  3179
    fputs("characters like accented letters, "
ali@40
  3180
      "lines longer than 75 or shorter than 55,\n",stderr);
ali@40
  3181
    fputs("unbalanced quotes or brackets, "
ali@40
  3182
      "a variety of badly formatted punctuation, \n",stderr);
ali@40
  3183
    fputs("HTML tags, some likely typos. "
ali@40
  3184
      "It is NOT a substitute for human judgement.\n",stderr);
ali@0
  3185
    fputs("\n",stderr);
ali@0
  3186
}