bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Tue May 28 15:17:19 2013 +0100 (2013-05-28)
changeset 69 1016349e619f
parent 68 adb087007d08
child 70 aa916da2e452
permissions -rw-r--r--
Use GLib functions and data types
ali@0
     1
/*************************************************************************/
ali@40
     2
/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
ali@68
     3
/*									 */
ali@68
     4
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
ali@68
     5
/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
ali@68
     6
/*									 */
ali@0
     7
/* This program is free software; you can redistribute it and/or modify  */
ali@0
     8
/* it under the terms of the GNU General Public License as published by  */
ali@0
     9
/* the Free Software Foundation; either version 2 of the License, or     */
ali@68
    10
/* (at your option) any later version.					 */
ali@68
    11
/*									 */
ali@0
    12
/* This program is distributed in the hope that it will be useful,       */
ali@68
    13
/* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
ali@68
    14
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
ali@68
    15
/* GNU General Public License for more details.				 */
ali@68
    16
/*									 */
ali@68
    17
/* You should have received a copy of the GNU General Public License	 */
ali@68
    18
/* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
ali@0
    19
/*************************************************************************/
ali@0
    20
ali@0
    21
#include <stdio.h>
ali@0
    22
#include <stdlib.h>
ali@0
    23
#include <string.h>
ali@0
    24
#include <ctype.h>
ali@69
    25
#include <glib.h>
ali@69
    26
#include <bl/bl.h>
ali@0
    27
ali@69
    28
gchar *prevline;
ali@0
    29
ali@40
    30
/*
 * Common typos and scannos, all in lower case (for example "teh",
 * "abead" and "modem"). The list ends with an empty string.
 */
char *typo[] = {
    "teh","th","og","fi","ro","adn","yuo","ot","fo","thet","ane",
    "nad","te","ig","acn","ahve","alot","anbd","andt","awya","aywa",
    "bakc","om","btu","byt","cna","cxan","coudl","dont","didnt",
    "couldnt","wouldnt","doesnt","shouldnt","doign","ehr","hmi","hse",
    "esle","eyt","fitrs","firts","foudn","frmo","fromt","fwe","gaurd",
    "gerat","goign","gruop","haev","hda","hearign","seeign","sayign",
    "herat","hge","hsa","hsi","hte","htere","htese","htey","htis",
    "hvae","hwich","idae","ihs","iits","int","iwll","iwth","jsut",
    "loev","sefl","myu","nkow","nver","nwe","nwo","ocur","ohter",
    "omre","onyl","otehr","otu","owrk","owuld","peice","peices",
    "peolpe","peopel","perhasp","perhpas","pleasent","poeple","porblem",
    "porblems","rwite","saidt","saidh","saids","seh","smae","smoe",
    "sohw","stnad","stopry","stoyr","stpo","tahn","taht","tath",
    "tehy","tghe","tghis","theri","theyll","thgat","thge","thier",
    "thna","thne","thnig","thnigs","thsi","thsoe","thta","timne",
    "tirne","tkae","tthe","tyhat","tyhe","veyr","vou","vour","vrey",
    "waht","wasnt","awtn","watn","wehn","whic","whcih","whihc","whta",
    "wihch","wief","wiht","witha","wiull","wnat","wnated","wnats",
    "woh","wohle","wokr","woudl","wriet","wrod","wroet","wroking",
    "wtih","wuould","wya","yera","yeras","yersa","yoiu","youve",
    "ytou","yuor","abead","ahle","ahout","ahove","altbough","balf",
    "bardly","bas","bave","baving","bebind","beld","belp","belped",
    "ber","bere","bim","bis","bome","bouse","bowever","buge",
    "dehates","deht","han","hecause","hecome","heen","hefore","hegan",
    "hegin","heing","helieve","henefit","hetter","hetween","heyond",
    "hig","higber","huild","huy","hy","jobn","joh","meanwbile",
    "memher","memhers","numher","numhers","perbaps","prohlem","puhlic",
    "witbout","arn","hin","hirn","wrok","wroked","amd","aud",
    "prornise","prornised","modem","bo","heside","chapteb","chaptee",
    "se",
    ""
};
ali@0
    63
ali@69
    64
GTree *usertypo;
ali@0
    65
ali@40
    66
/*
 * Lower-case abbreviations and other acceptable words that must not be
 * queried as typos. The list ends with an empty string.
 */
char *okword[] = {
    "mr","mrs","mss","mssrs","ft","pm","st","dr","hmm","h'm","hmmm","rd",
    "sh","br","pp","hm","cf","jr","sr","vs","lb","lbs","ltd","pompeii",
    "hawaii","hawaiian","hotbed","heartbeat","heartbeats","outbid",
    "outbids","frostbite","frostbitten",""
};
ali@0
    73
ali@40
    74
/*
 * Lower-case abbreviations that account for a period which would
 * otherwise look unexplained. The list ends with an empty string.
 */
char *abbrev[] = {
    "cent","cents","viz","vol","vols","vid","ed","al","etc","op","cit",
    "deg","min","chap","oz","mme","mlle","mssrs",""
};
ali@0
    79
ali@40
    80
/*
 * Letter pairs that rarely if ever begin a word but are common scannos
 * or otherwise common letter combinations. Ends with an empty string.
 */
char *nostart[] = {
    "hr","hl","cb","sb","tb","wb",
    "tl","tn","rn","lt","tj",""
};
ali@0
    87
ali@40
    88
/*
 * Letter pairs that rarely if ever finish a word but are common scannos
 * or otherwise common letter combinations. Ends with an empty string.
 */
char *noend[] = {
    "cb","gb","pb","sb","tb","wh","fr","br","qu",
    "tw","gl","fl","sw","gr","sl","cl","iy",""
};
ali@0
    96
ali@40
    97
/* HTML element names, in lower case. The list ends with an empty string. */
char *markup[] = {
    "a","b","big","blockquote","body","br","center","col","div","em","font",
    "h1","h2","h3","h4","h5","h6","head","hr","html","i","img","li","meta",
    "ol","p","pre","small","span","strong","sub","sup","table","td","tfoot",
    "thead","title","tr","tt","u","ul",""
};
ali@0
   103
ali@40
   104
/*
 * Markup sequences specific to DP texts (see the --dp switch).
 * The list ends with an empty string.
 */
char *DPmarkup[] = {
    "<sc>","</sc>",
    "/*","*/",
    "/#","#/",
    "/$","$/",
    "<tb>",
    ""
};
ali@0
   107
ali@40
   108
/*
 * Lower-case word list ending with an empty string.
 * NOTE(review): presumably words that should not be directly followed
 * by a comma -- confirm against the code that consults this table.
 */
char *nocomma[] = {
    "the","it's","their","an","mrs","a","our","that's","its","whose","every",
    "i'll","your","my","mr","mrs","mss","mssrs","ft","pm","st","dr","rd",
    "pp","cf","jr","sr","vs","lb","lbs","ltd","i'm","during","let","toward",
    "among",""
};
ali@0
   114
ali@40
   115
/*
 * Lower-case word list ending with an empty string.
 * NOTE(review): presumably words that should not be directly followed
 * by a period -- confirm against the code that consults this table.
 */
char *noperiod[] = {
    "every","i'm","during","that's","their","your","our","my","or","and",
    "but","as","if","the","its","it's","until","than","whether","i'll",
    "whose","who","because","when","let","till","very","an","among","those",
    "into","whom","having","thence",""
};
ali@0
   121
ali@40
   122
/*
 * Lower-case vowel characters, plain and accented (the ae ligature is
 * included), held as a single NUL-terminated string.
 * NOTE(review): the accented characters are non-ASCII, so the byte values
 * depend on the encoding this source file is saved in -- confirm.
 */
char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
ali@0
   123
ali@0
   124
/*
 * HTML character entities. Each row gives the named form, the numeric
 * form and a plain-text replacement. The table ends with a row whose
 * htmlent and htmlnum are empty strings and whose textent is NULL.
 */
struct {
    char *htmlent;	/* named entity, e.g. "&amp;" */
    char *htmlnum;	/* numeric entity, e.g. "&#38;" */
    char *textent;	/* plain-text replacement */
} entities[] = {
    { "&amp;", "&#38;", "&" },
    { "&lt;", "&#60;", "<" },
    { "&gt;", "&#62;", ">" },
    { "&deg;", "&#176;", " degrees" },
    { "&pound;", "&#163;", "L" },
    { "&quot;", "&#34;", "\"" }, /* quotation mark = APL quote */
    { "&OElig;", "&#338;", "OE" }, /* latin capital ligature OE */
    { "&oelig;", "&#339;", "oe" }, /* latin small ligature oe */
    { "&Scaron;", "&#352;", "S" }, /* latin capital letter S with caron */
    { "&scaron;", "&#353;", "s" }, /* latin small letter s with caron */
    { "&Yuml;", "&#376;", "Y" }, /* latin capital letter Y with diaeresis */
    { "&circ;", "&#710;", "" }, /* modifier letter circumflex accent */
    { "&tilde;", "&#732;", "~" }, /* small tilde, U+02DC ISOdia */
    { "&ensp;", "&#8194;", " " }, /* en space, U+2002 ISOpub */
    { "&emsp;", "&#8195;", " " }, /* em space, U+2003 ISOpub */
    { "&thinsp;", "&#8201;", " " }, /* thin space, U+2009 ISOpub */
    { "&ndash;", "&#8211;", "-" }, /* en dash, U+2013 ISOpub */
    { "&mdash;", "&#8212;", "--" }, /* em dash, U+2014 ISOpub */
    { "&rsquo;", "&#8217;", "'" }, /* right single quotation mark */
    { "&sbquo;", "&#8218;", "'" }, /* single low-9 quotation mark */
    { "&ldquo;", "&#8220;", "\"" }, /* left double quotation mark */
    { "&rdquo;", "&#8221;", "\"" }, /* right double quotation mark */
    { "&bdquo;", "&#8222;", "\"" }, /* double low-9 quotation mark */
    { "&lsaquo;", "&#8249;", "\"" }, /* single left-pointing angle quotation mark */
    { "&rsaquo;", "&#8250;", "\"" }, /* single right-pointing angle quotation mark */
    { "&nbsp;", "&#160;", " " }, /* no-break space = non-breaking space, */
    { "&iexcl;", "&#161;", "!" }, /* inverted exclamation mark */
    { "&cent;", "&#162;", "c" }, /* cent sign */
    { "&pound;", "&#163;", "L" }, /* pound sign */
    { "&curren;", "&#164;", "$" }, /* currency sign */
    { "&yen;", "&#165;", "Y" }, /* yen sign = yuan sign */
    { "&sect;", "&#167;", "--" }, /* section sign */
    { "&uml;", "&#168;", " " }, /* diaeresis = spacing diaeresis */
    { "&copy;", "&#169;", "(C) " }, /* copyright sign */
    { "&ordf;", "&#170;", " " }, /* feminine ordinal indicator */
    { "&laquo;", "&#171;", "\"" }, /* left-pointing double angle quotation mark */
    { "&shy;", "&#173;", "-" }, /* soft hyphen = discretionary hyphen */
    { "&reg;", "&#174;", "(R) " }, /* registered sign = registered trade mark sign */
    { "&macr;", "&#175;", " " }, /* macron = spacing macron = overline */
    { "&deg;", "&#176;", " degrees" }, /* degree sign */
    { "&plusmn;", "&#177;", "+-" }, /* plus-minus sign = plus-or-minus sign */
    { "&sup2;", "&#178;", "2" }, /* superscript two = superscript digit two */
    { "&sup3;", "&#179;", "3" }, /* superscript three = superscript digit three */
    { "&acute;", "&#180;", " " }, /* acute accent = spacing acute */
    { "&micro;", "&#181;", "m" }, /* micro sign */
    { "&para;", "&#182;", "--" }, /* pilcrow sign = paragraph sign */
    { "&cedil;", "&#184;", " " }, /* cedilla = spacing cedilla */
    { "&sup1;", "&#185;", "1" }, /* superscript one = superscript digit one */
    { "&ordm;", "&#186;", " " }, /* masculine ordinal indicator */
    { "&raquo;", "&#187;", "\"" }, /* right-pointing double angle quotation mark */
    { "&frac14;", "&#188;", "1/4" }, /* vulgar fraction one quarter */
    { "&frac12;", "&#189;", "1/2" }, /* vulgar fraction one half */
    { "&frac34;", "&#190;", "3/4" }, /* vulgar fraction three quarters */
    { "&iquest;", "&#191;", "?" }, /* inverted question mark */
    { "&Agrave;", "&#192;", "A" }, /* latin capital letter A with grave */
    { "&Aacute;", "&#193;", "A" }, /* latin capital letter A with acute */
    { "&Acirc;", "&#194;", "A" }, /* latin capital letter A with circumflex */
    { "&Atilde;", "&#195;", "A" }, /* latin capital letter A with tilde */
    { "&Auml;", "&#196;", "A" }, /* latin capital letter A with diaeresis */
    { "&Aring;", "&#197;", "A" }, /* latin capital letter A with ring above */
    { "&AElig;", "&#198;", "AE" }, /* latin capital letter AE */
    { "&Ccedil;", "&#199;", "C" }, /* latin capital letter C with cedilla */
    { "&Egrave;", "&#200;", "E" }, /* latin capital letter E with grave */
    { "&Eacute;", "&#201;", "E" }, /* latin capital letter E with acute */
    { "&Ecirc;", "&#202;", "E" }, /* latin capital letter E with circumflex */
    { "&Euml;", "&#203;", "E" }, /* latin capital letter E with diaeresis */
    { "&Igrave;", "&#204;", "I" }, /* latin capital letter I with grave */
    { "&Iacute;", "&#205;", "I" }, /* latin capital letter I with acute */
    { "&Icirc;", "&#206;", "I" }, /* latin capital letter I with circumflex */
    { "&Iuml;", "&#207;", "I" }, /* latin capital letter I with diaeresis */
    { "&ETH;", "&#208;", "E" }, /* latin capital letter ETH */
    { "&Ntilde;", "&#209;", "N" }, /* latin capital letter N with tilde */
    { "&Ograve;", "&#210;", "O" }, /* latin capital letter O with grave */
    { "&Oacute;", "&#211;", "O" }, /* latin capital letter O with acute */
    { "&Ocirc;", "&#212;", "O" }, /* latin capital letter O with circumflex */
    { "&Otilde;", "&#213;", "O" }, /* latin capital letter O with tilde */
    { "&Ouml;", "&#214;", "O" }, /* latin capital letter O with diaeresis */
    { "&times;", "&#215;", "*" }, /* multiplication sign */
    { "&Oslash;", "&#216;", "O" }, /* latin capital letter O with stroke */
    { "&Ugrave;", "&#217;", "U" }, /* latin capital letter U with grave */
    { "&Uacute;", "&#218;", "U" }, /* latin capital letter U with acute */
    { "&Ucirc;", "&#219;", "U" }, /* latin capital letter U with circumflex */
    { "&Uuml;", "&#220;", "U" }, /* latin capital letter U with diaeresis */
    { "&Yacute;", "&#221;", "Y" }, /* latin capital letter Y with acute */
    { "&THORN;", "&#222;", "TH" }, /* latin capital letter THORN */
    { "&szlig;", "&#223;", "sz" }, /* latin small letter sharp s = ess-zed */
    { "&agrave;", "&#224;", "a" }, /* latin small letter a with grave */
    { "&aacute;", "&#225;", "a" }, /* latin small letter a with acute */
    { "&acirc;", "&#226;", "a" }, /* latin small letter a with circumflex */
    { "&atilde;", "&#227;", "a" }, /* latin small letter a with tilde */
    { "&auml;", "&#228;", "a" }, /* latin small letter a with diaeresis */
    { "&aring;", "&#229;", "a" }, /* latin small letter a with ring above */
    { "&aelig;", "&#230;", "ae" }, /* latin small letter ae */
    { "&ccedil;", "&#231;", "c" }, /* latin small letter c with cedilla */
    { "&egrave;", "&#232;", "e" }, /* latin small letter e with grave */
    { "&eacute;", "&#233;", "e" }, /* latin small letter e with acute */
    { "&ecirc;", "&#234;", "e" }, /* latin small letter e with circumflex */
    { "&euml;", "&#235;", "e" }, /* latin small letter e with diaeresis */
    { "&igrave;", "&#236;", "i" }, /* latin small letter i with grave */
    { "&iacute;", "&#237;", "i" }, /* latin small letter i with acute */
    { "&icirc;", "&#238;", "i" }, /* latin small letter i with circumflex */
    { "&iuml;", "&#239;", "i" }, /* latin small letter i with diaeresis */
    { "&eth;", "&#240;", "eth" }, /* latin small letter eth */
    { "&ntilde;", "&#241;", "n" }, /* latin small letter n with tilde */
    { "&ograve;", "&#242;", "o" }, /* latin small letter o with grave */
    { "&oacute;", "&#243;", "o" }, /* latin small letter o with acute */
    { "&ocirc;", "&#244;", "o" }, /* latin small letter o with circumflex */
    { "&otilde;", "&#245;", "o" }, /* latin small letter o with tilde */
    { "&ouml;", "&#246;", "o" }, /* latin small letter o with diaeresis */
    { "&divide;", "&#247;", "/" }, /* division sign */
    { "&oslash;", "&#248;", "o" }, /* latin small letter o with stroke */
    { "&ugrave;", "&#249;", "u" }, /* latin small letter u with grave */
    { "&uacute;", "&#250;", "u" }, /* latin small letter u with acute */
    { "&ucirc;", "&#251;", "u" }, /* latin small letter u with circumflex */
    { "&uuml;", "&#252;", "u" }, /* latin small letter u with diaeresis */
    { "&yacute;", "&#253;", "y" }, /* latin small letter y with acute */
    { "&thorn;", "&#254;", "th" }, /* latin small letter thorn */
    { "&yuml;", "&#255;", "y" }, /* latin small letter y with diaeresis */
    { "", "", NULL }
};
ali@40
   249
ali@40
   250
/*
 * Special characters, written as character literals.
 * The values are the ASCII codes of the characters shown.
 */
#define CHAR_SPACE         ' '
#define CHAR_TAB           '\t'
#define CHAR_LF            '\n'
#define CHAR_CR            '\r'
#define CHAR_DQUOTE        '"'
#define CHAR_SQUOTE        '\''
#define CHAR_OPEN_SQUOTE   '`'
#define CHAR_TILDE         '~'
#define CHAR_ASTERISK      '*'
#define CHAR_FORESLASH     '/'
#define CHAR_CARAT         '^'

#define CHAR_UNDERSCORE    '_'
#define CHAR_OPEN_CBRACK   '{'
#define CHAR_CLOSE_CBRACK  '}'
#define CHAR_OPEN_RBRACK   '('
#define CHAR_CLOSE_RBRACK  ')'
#define CHAR_OPEN_SBRACK   '['
#define CHAR_CLOSE_SBRACK  ']'
ali@0
   270
ali@40
   271
/*
 * Line-length limits for a PG text, in characters:
 * the longest and shortest normal lines, and a length that is
 * considered far too long.
 */
#define SHORTEST_PG_LINE  55
#define LONGEST_PG_LINE   75
#define WAY_TOO_LONG      80
ali@0
   275
ali@69
   276
/*
 * Indices into pswit[], one per program switch. The comments give the
 * command-line option that sets each one.
 */
enum {
    ECHO_SWITCH = 0,	/* -e, --noecho */
    SQUOTE_SWITCH,	/* -s, --squote */
    TYPO_SWITCH,	/* -t, --typo */
    QPARA_SWITCH,	/* -p, --qpara */
    PARANOID_SWITCH,	/* -x, --relaxed */
    LINE_END_SWITCH,	/* -l, --line-end */
    OVERVIEW_SWITCH,	/* -o, --overview */
    STDOUT_SWITCH,	/* -y, --stdout */
    HEADER_SWITCH,	/* -h, --header */
    WEB_SWITCH,		/* -w, --web */
    VERBOSE_SWITCH,	/* -v, --verbose */
    MARKUP_SWITCH,	/* -m, --markup */
    USERTYPO_SWITCH,	/* -u, --usertypo */
    DP_SWITCH,		/* -d, --dp */
    SWITNO		/* number of switches; must stay last */
};
ali@0
   293
ali@69
   294
gboolean pswit[SWITNO];  /* program switches */
ali@0
   295
ali@69
   296
static GOptionEntry options[]={
ali@69
   297
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@69
   298
      "Ignore DP-specific markup", NULL },
ali@69
   299
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@69
   300
      "Don't echo queried line", NULL },
ali@69
   301
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@69
   302
      "Check single quotes", NULL },
ali@69
   303
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@69
   304
      "Check common typos", NULL },
ali@69
   305
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@69
   306
      "Require closure of quotes on every paragraph", NULL },
ali@69
   307
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@69
   308
      "Disable paranoid querying of everything", NULL },
ali@69
   309
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@69
   310
      "Disable line end checking", NULL },
ali@69
   311
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@69
   312
      "Overview: just show counts", NULL },
ali@69
   313
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@69
   314
      "Output errors to stdout instead of stderr", NULL },
ali@69
   315
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@69
   316
      "Echo header fields", NULL },
ali@69
   317
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@69
   318
      "Ignore markup in < >", NULL },
ali@69
   319
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@69
   320
      "Use file of user-defined typos", NULL },
ali@69
   321
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
ali@69
   322
      "Defaults for use on www upload", NULL },
ali@69
   323
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@69
   324
      "Verbose - list everything", NULL },
ali@69
   325
    { NULL }
ali@69
   326
};
ali@0
   327
ali@68
   328
/* Query counts by category, for overview mode. */
long cnt_dquot;		/* doublequote queries */
long cnt_squot;		/* singlequote queries */
long cnt_brack;		/* brackets queries */
long cnt_bin;		/* non-ASCII queries */
long cnt_odd;		/* odd character queries */
long cnt_long;		/* long line errors */
long cnt_short;		/* short line queries */
long cnt_punct;		/* punctuation and spacing queries */
long cnt_dash;		/* dash-related queries */
long cnt_word;		/* word queries */
long cnt_html;		/* html queries */
long cnt_lineend;	/* line-end queries */

/* Line counts. */
long cnt_spacend;	/* lines with space at end */
long linecnt;		/* total lines in the file */
long checked_linecnt;	/* lines actually checked */
ali@0
   344
ali@69
   345
void proghelp(GOptionContext *context);
ali@69
   346
void procfile(const char *);
ali@0
   347
ali@69
   348
gchar *running_from;
ali@0
   349
ali@69
   350
int mixdigit(const char *);
ali@69
   351
gchar *getaword(const char **);
ali@69
   352
char *flgets(char **,long);
ali@69
   353
gboolean gcisalpha(unsigned char);
ali@69
   354
gboolean gcisdigit(unsigned char);
ali@69
   355
gboolean gcisletter(unsigned char);
ali@0
   356
void postprocess_for_HTML(char *);
ali@0
   357
char *linehasmarkup(char *);
ali@0
   358
char *losemarkup(char *);
ali@69
   359
int tagcomp(const char *,const char *);
ali@0
   360
char *loseentities(char *);
ali@69
   361
gboolean isroman(const char *);
ali@0
   362
void postprocess_for_DP(char *);
ali@0
   363
ali@69
   364
GTree *qword,*qperiod;
ali@68
   365
ali@68
   366
/*
 * Figures collected about a text.
 * NOTE(review): going by the name these are filled in by a first pass
 * over the file; the individual fields are not documented here because
 * the code that sets them is outside this view.
 */
struct first_pass_results {
    long firstline;
    long astline;
    long footerline;
    long totlen;
    long binlen;
    long alphalen;
    long endquote_count;
    long shortline;
    long dotcomma;
    long fslashline;
    long hyphens;
    long longline;
    long verylongline;
    long htmcount;
    long standalone_digit;
    long spacedash;
    long emdash;
    long space_emdash;
    long non_PG_space_emdash;
    long PG_space_emdash;
    int Dutchcount;
    int Frenchcount;
};
ali@68
   373
ali@68
   374
struct warnings {
ali@68
   375
    int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
ali@69
   376
    int endquote;
ali@69
   377
    gboolean isDutch,isFrench;
ali@68
   378
};
ali@68
   379
ali@68
   380
/*
 * Running counts for quotes, underscores and the three bracket kinds.
 * NOTE(review): the scope over which they are counted is set by code
 * outside this view.
 */
struct counters {
    long quot;
    int c_unders;
    int c_brack;
    int s_brack;
    int r_brack;
    int open_single_quote;
    int close_single_quote;
};
ali@68
   385
ali@68
   386
/*
 * Two lengths and a starting character recorded for a line.
 * NOTE(review): the difference between len and blen is not visible from
 * here -- confirm against the code that fills them in.
 */
struct line_properties {
    unsigned int len;
    unsigned int blen;
    char start;
};
ali@68
   390
ali@68
   391
/* One value each for double and single quotes. */
struct parities {
    int dquote;
    int squote;
};
ali@68
   394
ali@68
   395
/*
 * One string pointer per quote, bracket and underscore category, plus a
 * single-quote count.
 * NOTE(review): who owns the strings is not visible from here.
 */
struct pending {
    char *dquote;
    char *squote;
    char *rbrack;
    char *sbrack;
    char *cbrack;
    char *unders;
    long squot;
};
ali@0
   399
ali@69
   400
void parse_options(int *argc,char ***argv)
ali@0
   401
{
ali@69
   402
    GError *err=NULL;
ali@69
   403
    GOptionContext *context;
ali@69
   404
    context=g_option_context_new(
ali@69
   405
      "file - looks for errors in Project Gutenberg(TM) etexts");
ali@69
   406
    g_option_context_add_main_entries(context,options,NULL);
ali@69
   407
    if (!g_option_context_parse(context,argc,argv,&err))
ali@69
   408
    {
ali@69
   409
	g_printerr("Bookloupe: %s\n",err->message);
ali@69
   410
	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
ali@69
   411
	exit(1);
ali@69
   412
    }
ali@40
   413
    /* Paranoid checking is turned OFF, not on, by its switch */
ali@69
   414
    pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
ali@40
   415
    if (pswit[PARANOID_SWITCH])
ali@69
   416
	/* if running in paranoid mode, typo checks default to enabled */
ali@69
   417
	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@40
   418
    /* Line-end checking is turned OFF, not on, by its switch */
ali@69
   419
    pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
ali@40
   420
    /* Echoing is turned OFF, not on, by its switch */
ali@69
   421
    pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
ali@40
   422
    if (pswit[OVERVIEW_SWITCH])
ali@40
   423
	/* just print summary; don't echo */
ali@69
   424
	pswit[ECHO_SWITCH]=FALSE;
ali@40
   425
    /*
ali@40
   426
     * Web uploads - for the moment, this is really just a placeholder
ali@40
   427
     * until we decide what processing we really want to do on web uploads
ali@40
   428
     */
ali@40
   429
    if (pswit[WEB_SWITCH])
ali@40
   430
    {
ali@40
   431
	/* specific override for web uploads */
ali@69
   432
	pswit[ECHO_SWITCH]=TRUE;
ali@69
   433
	pswit[SQUOTE_SWITCH]=FALSE;
ali@69
   434
	pswit[TYPO_SWITCH]=TRUE;
ali@69
   435
	pswit[QPARA_SWITCH]=FALSE;
ali@69
   436
	pswit[PARANOID_SWITCH]=TRUE;
ali@69
   437
	pswit[LINE_END_SWITCH]=FALSE;
ali@69
   438
	pswit[OVERVIEW_SWITCH]=FALSE;
ali@69
   439
	pswit[STDOUT_SWITCH]=FALSE;
ali@69
   440
	pswit[HEADER_SWITCH]=TRUE;
ali@69
   441
	pswit[VERBOSE_SWITCH]=FALSE;
ali@69
   442
	pswit[MARKUP_SWITCH]=FALSE;
ali@69
   443
	pswit[USERTYPO_SWITCH]=FALSE;
ali@69
   444
	pswit[DP_SWITCH]=FALSE;
ali@40
   445
    }
ali@69
   446
    if (*argc<2)
ali@40
   447
    {
ali@69
   448
	proghelp(context);
ali@69
   449
	exit(1);
ali@40
   450
    }
ali@69
   451
    g_option_context_free(context);
ali@69
   452
}
ali@69
   453
ali@69
   454
/*
ali@69
   455
 * read_user_scannos:
ali@69
   456
 *
ali@69
   457
 * Read in the user-defined stealth scanno list.
ali@69
   458
 */
ali@69
   459
void read_user_scannos(void)
ali@69
   460
{
ali@69
   461
    GError *err=NULL;
ali@69
   462
    gchar *usertypo_file;
ali@69
   463
    gboolean okay;
ali@69
   464
    int i;
ali@69
   465
    gsize len;
ali@69
   466
    gchar *contents,**lines;
ali@69
   467
    usertypo_file=g_strdup("bookloupe.typ");
ali@69
   468
    okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   469
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   470
    {
ali@69
   471
	g_clear_error(&err);
ali@69
   472
	g_free(usertypo_file);
ali@69
   473
	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
ali@69
   474
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   475
    }
ali@69
   476
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   477
    {
ali@69
   478
	g_clear_error(&err);
ali@69
   479
	g_free(usertypo_file);
ali@69
   480
	usertypo_file=g_strdup("gutcheck.typ");
ali@69
   481
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   482
    }
ali@69
   483
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   484
    {
ali@69
   485
	g_clear_error(&err);
ali@69
   486
	g_free(usertypo_file);
ali@69
   487
	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
ali@69
   488
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   489
    }
ali@69
   490
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   491
    {
ali@69
   492
	g_free(usertypo_file);
ali@69
   493
	printf("   --> I couldn't find bookloupe.typ "
ali@69
   494
	  "-- proceeding without user typos.\n");
ali@69
   495
	return;
ali@69
   496
    }
ali@69
   497
    else if (!okay)
ali@69
   498
    {
ali@69
   499
	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
ali@69
   500
	g_free(usertypo_file);
ali@69
   501
	g_clear_error(&err);
ali@69
   502
	exit(1);
ali@69
   503
    }
ali@69
   504
    lines=g_strsplit(contents,"\n",0);
ali@69
   505
    usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@69
   506
    for (i=0;lines[i];i++)
ali@69
   507
	if (*(unsigned char *)lines[i]>'!')
ali@69
   508
	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
ali@69
   509
	else
ali@69
   510
	    g_free(lines[i]);
ali@69
   511
    g_free(lines);
ali@69
   512
}
ali@69
   513
ali@69
   514
#if 0
ali@69
   515
/*
ali@69
   516
 * read_etext:
ali@69
   517
 *
ali@69
   518
 * Read an etext returning an array of lines. Lines are normally expected
ali@69
   519
 * to be terminated by CR LF. Solitary LFs delimit lines but are left
ali@69
   520
 * embedded at the end of the line for further processing. Solitary CRs
ali@69
   521
 * do not delimit lines.
ali@69
   522
 */
ali@69
   523
gchar **read_etext(const char *filename,GError **err)
ali@69
   524
{
ali@69
   525
    int i;
ali@69
   526
    const char *s,*t;
ali@69
   527
    gchar *contents;
ali@69
   528
    gchar **raw_lines;
ali@69
   529
    GPtrArray *lines;
ali@69
   530
    gsize len;
ali@69
   531
    if (!g_file_get_contents(filename,&contents,&len,err))
ali@69
   532
	return NULL;
ali@69
   533
    raw_lines=g_strsplit(contents,"\r\n",0);
ali@69
   534
    lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
ali@69
   535
    for (i=0;raw_lines[i];i++)
ali@69
   536
    {
ali@69
   537
	t=strchr(raw_lines[i],'\n');
ali@69
   538
	if (t)
ali@69
   539
	{
ali@69
   540
	    s=raw_lines[i];
ali@69
   541
	    while ((t=strchr(s,'\n')))
ali@69
   542
	    {
ali@69
   543
		g_ptr_array_add(lines,g_strndup(s,t-s+1));
ali@69
   544
		s=t+1;
ali@69
   545
	    }
ali@69
   546
	    g_ptr_array_add(lines,g_strdup(s));
ali@69
   547
	    g_free(raw_lines[i]);
ali@69
   548
	}
ali@69
   549
	else
ali@69
   550
	    g_ptr_array_add(lines,raw_lines[i]);
ali@69
   551
    }
ali@69
   552
    g_free(raw_lines);
ali@69
   553
    g_ptr_array_add(lines,NULL);
ali@69
   554
    return (gchar **)g_ptr_array_free(lines,FALSE);
ali@69
   555
}
ali@69
   556
#else
ali@69
   557
/*
ali@69
   558
 * read_etext:
ali@69
   559
 *
ali@69
   560
 * Read an etext returning a newly allocated string containing the file
ali@69
   561
 * contents or NULL on error.
ali@69
   562
 */
ali@69
   563
gchar *read_etext(const char *filename,GError **err)
ali@69
   564
{
ali@69
   565
    gchar *contents;
ali@69
   566
    gsize len;
ali@69
   567
    if (!g_file_get_contents(filename,&contents,&len,err))
ali@69
   568
	return NULL;
ali@69
   569
    return contents;
ali@69
   570
}
ali@69
   571
#endif
ali@69
   572
ali@69
   573
int main(int argc,char **argv)
ali@69
   574
{
ali@69
   575
    running_from=g_path_get_dirname(argv[0]);
ali@69
   576
    parse_options(&argc,&argv);
ali@40
   577
    if (pswit[USERTYPO_SWITCH])
ali@69
   578
	read_user_scannos();
ali@40
   579
    fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@69
   580
    procfile(argv[1]);
ali@40
   581
    if (pswit[OVERVIEW_SWITCH])
ali@40
   582
    {
ali@40
   583
	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40
   584
	  checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@68
   585
	printf("    --------------- Queries found --------------\n");
ali@68
   586
	if (cnt_long)
ali@68
   587
	    printf("    Long lines:		    %14ld\n",cnt_long);
ali@68
   588
	if (cnt_short)
ali@68
   589
	    printf("    Short lines:		   %14ld\n",cnt_short);
ali@68
   590
	if (cnt_lineend)
ali@68
   591
	    printf("    Line-end problems:	     %14ld\n",cnt_lineend);
ali@68
   592
	if (cnt_word)
ali@68
   593
	    printf("    Common typos:		  %14ld\n",cnt_word);
ali@68
   594
	if (cnt_dquot)
ali@68
   595
	    printf("    Unmatched quotes:	      %14ld\n",cnt_dquot);
ali@68
   596
	if (cnt_squot)
ali@68
   597
	    printf("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
ali@68
   598
	if (cnt_brack)
ali@68
   599
	    printf("    Unmatched brackets:	    %14ld\n",cnt_brack);
ali@68
   600
	if (cnt_bin)
ali@68
   601
	    printf("    Non-ASCII characters:	  %14ld\n",cnt_bin);
ali@68
   602
	if (cnt_odd)
ali@68
   603
	    printf("    Proofing characters:	   %14ld\n",cnt_odd);
ali@68
   604
	if (cnt_punct)
ali@40
   605
	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@68
   606
	if (cnt_dash)
ali@68
   607
	    printf("    Non-standard dashes:	   %14ld\n",cnt_dash);
ali@68
   608
	if (cnt_html)
ali@68
   609
	    printf("    Possible HTML tags:	    %14ld\n",cnt_html);
ali@68
   610
	printf("\n");
ali@68
   611
	printf("    TOTAL QUERIES		  %14ld\n",
ali@68
   612
	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
ali@68
   613
	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40
   614
    }
ali@69
   615
    g_free(running_from);
ali@69
   616
    if (usertypo)
ali@69
   617
	g_tree_unref(usertypo);
ali@40
   618
    return 0;
ali@0
   619
}
ali@0
   620
ali@40
   621
/*
ali@41
   622
 * first_pass:
ali@40
   623
 *
ali@41
   624
 * Run a first pass - verify that it's a valid PG
ali@41
   625
 * file, decide whether to report some things that
ali@41
   626
 * occur many times in the text like long or short
ali@41
   627
 * lines, non-standard dashes, etc.
ali@40
   628
 */
ali@69
   629
struct first_pass_results *first_pass(const char *etext)
ali@0
   630
{
ali@54
   631
    char laststart=CHAR_SPACE;
ali@54
   632
    const char *s;
ali@69
   633
    gchar *lc_line;
ali@69
   634
    int i,j,llen;
ali@69
   635
    gchar **lines;
ali@41
   636
    unsigned int lastlen=0,lastblen=0;
ali@41
   637
    long spline=0,nspline=0;
ali@41
   638
    static struct first_pass_results results={0};
ali@69
   639
    gchar *inword;
ali@69
   640
    lines=g_strsplit(etext,"\n",0);
ali@69
   641
    for (j=0;lines[j];j++)
ali@40
   642
    {
ali@69
   643
	llen=strlen(lines[j]);
ali@69
   644
	while(lines[j][llen-1]=='\r')
ali@69
   645
	    lines[j][llen--]='\0';
ali@68
   646
	linecnt++;
ali@69
   647
	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
ali@69
   648
	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
ali@40
   649
	{
ali@68
   650
	    if (spline)
ali@68
   651
		printf("   --> Duplicate header?\n");
ali@68
   652
	    spline=linecnt+1;   /* first line of non-header text, that is */
ali@40
   653
	}
ali@69
   654
	if (!strncmp(lines[j],"*** START",9) &&
ali@69
   655
	  strstr(lines[j],"PROJECT GUTENBERG"))
ali@40
   656
	{
ali@68
   657
	    if (nspline)
ali@68
   658
		printf("   --> Duplicate header?\n");
ali@68
   659
	    nspline=linecnt+1;   /* first line of non-header text, that is */
ali@40
   660
	}
ali@68
   661
	if (spline || nspline)
ali@40
   662
	{
ali@69
   663
	    lc_line=g_ascii_strdown(lines[j],llen);
ali@69
   664
	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
ali@40
   665
	    {
ali@69
   666
		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
ali@40
   667
		{
ali@68
   668
		    if (results.footerline)
ali@40
   669
		    {
ali@40
   670
			/* it's an old-form header - we can detect duplicates */
ali@68
   671
			if (!nspline)
ali@68
   672
			    printf("   --> Duplicate footer?\n");
ali@40
   673
		    }
ali@68
   674
		    else
ali@68
   675
			results.footerline=linecnt;
ali@40
   676
		}
ali@40
   677
	    }
ali@69
   678
	    g_free(lc_line);
ali@40
   679
	}
ali@68
   680
	if (spline)
ali@41
   681
	    results.firstline=spline;
ali@68
   682
	if (nspline)
ali@41
   683
	    results.firstline=nspline;  /* override with new */
ali@68
   684
	if (results.footerline)
ali@40
   685
	    continue;    /* don't count the boilerplate in the footer */
ali@68
   686
	results.totlen+=llen;
ali@68
   687
	for (i=0;i<llen;i++)
ali@40
   688
	{
ali@69
   689
	    if ((unsigned char)lines[j][i]>127)
ali@41
   690
		results.binlen++;
ali@69
   691
	    if (gcisalpha(lines[j][i]))
ali@41
   692
		results.alphalen++;
ali@69
   693
	    if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))
ali@41
   694
		results.endquote_count++;
ali@40
   695
	}
ali@69
   696
	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
ali@69
   697
	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41
   698
	    results.shortline++;
ali@69
   699
	if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
ali@40
   700
	    cnt_spacend++;
ali@69
   701
	if (strstr(lines[j],".,"))
ali@41
   702
	    results.dotcomma++;
ali@68
   703
	/* only count ast lines for ignoring purposes where there is */
ali@68
   704
	/* locase text on the line */
ali@69
   705
	if (strchr(lines[j],'*'))
ali@40
   706
	{
ali@69
   707
	    for (s=lines[j];*s;s++)
ali@68
   708
		if (*s>='a' && *s<='z')
ali@68
   709
		    break;
ali@68
   710
	     if (*s)
ali@41
   711
		results.astline++;
ali@40
   712
	}
ali@69
   713
	if (strchr(lines[j],'/'))
ali@68
   714
	    results.fslashline++;
ali@69
   715
	for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
ali@40
   716
	    ;
ali@69
   717
	if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')
ali@41
   718
	    results.hyphens++;
ali@68
   719
	if (llen>LONGEST_PG_LINE)
ali@41
   720
	    results.longline++;
ali@68
   721
	if (llen>WAY_TOO_LONG)
ali@41
   722
	    results.verylongline++;
ali@69
   723
	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
ali@40
   724
	{
ali@69
   725
	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
ali@68
   726
	    if (i>0)
ali@68
   727
		results.htmcount++;
ali@69
   728
	    if (strstr(lines[j],"<i>"))
ali@41
   729
		results.htmcount+=4; /* bonus marks! */
ali@40
   730
	}
ali@68
   731
	/* Check for spaced em-dashes */
ali@69
   732
	if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
ali@40
   733
	{
ali@68
   734
	    results.emdash++;
ali@69
   735
	    if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))
ali@41
   736
		results.space_emdash++;
ali@69
   737
	    if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
ali@40
   738
		/* count of em-dashes with spaces both sides */
ali@41
   739
		results.non_PG_space_emdash++;
ali@69
   740
	    if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
ali@40
   741
		/* count of PG-type em-dashes with no spaces */
ali@41
   742
		results.PG_space_emdash++;
ali@40
   743
	}
ali@69
   744
	for (s=lines[j];*s;)
ali@40
   745
	{
ali@69
   746
	    inword=getaword(&s);
ali@68
   747
	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
ali@68
   748
		results.Dutchcount++;
ali@68
   749
	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
ali@68
   750
		results.Frenchcount++;
ali@68
   751
	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
ali@68
   752
		results.standalone_digit++;
ali@69
   753
	    g_free(inword);
ali@40
   754
	}
ali@68
   755
	/* Check for spaced dashes */
ali@69
   756
	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
ali@41
   757
	    results.spacedash++;
ali@68
   758
	lastblen=lastlen;
ali@69
   759
	lastlen=llen;
ali@69
   760
	laststart=lines[j][0];
ali@40
   761
    }
ali@69
   762
    g_strfreev(lines);
ali@41
   763
    return &results;
ali@41
   764
}
ali@41
   765
ali@42
   766
/*
ali@42
   767
 * report_first_pass:
ali@42
   768
 *
ali@42
   769
 * Make some snap decisions based on the first pass results.
ali@42
   770
 */
ali@42
   771
struct warnings *report_first_pass(struct first_pass_results *results)
ali@42
   772
{
ali@42
   773
    static struct warnings warnings={0};
ali@42
   774
    if (cnt_spacend>0)
ali@68
   775
	printf("   --> %ld lines in this file have white space at end\n",
ali@42
   776
	  cnt_spacend);
ali@42
   777
    warnings.dotcomma=1;
ali@42
   778
    if (results->dotcomma>5)
ali@42
   779
    {
ali@68
   780
	warnings.dotcomma=0;
ali@68
   781
	printf("   --> %ld lines in this file contain '.,'. "
ali@42
   782
	  "Not reporting them.\n",results->dotcomma);
ali@42
   783
    }
ali@42
   784
    /*
ali@42
   785
     * If more than 50 lines, or one-tenth, are short,
ali@42
   786
     * don't bother reporting them.
ali@42
   787
     */
ali@42
   788
    warnings.shortline=1;
ali@42
   789
    if (results->shortline>50 || results->shortline*10>linecnt)
ali@42
   790
    {
ali@68
   791
	warnings.shortline=0;
ali@68
   792
	printf("   --> %ld lines in this file are short. "
ali@42
   793
	  "Not reporting short lines.\n",results->shortline);
ali@42
   794
    }
ali@42
   795
    /*
ali@42
   796
     * If more than 50 lines, or one-tenth, are long,
ali@42
   797
     * don't bother reporting them.
ali@42
   798
     */
ali@42
   799
    warnings.longline=1;
ali@42
   800
    if (results->longline>50 || results->longline*10>linecnt)
ali@42
   801
    {
ali@68
   802
	warnings.longline=0;
ali@68
   803
	printf("   --> %ld lines in this file are long. "
ali@42
   804
	  "Not reporting long lines.\n",results->longline);
ali@42
   805
    }
ali@42
   806
    /* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42
   807
    warnings.ast=1;
ali@42
   808
    if (results->astline>10)
ali@42
   809
    {
ali@68
   810
	warnings.ast=0;
ali@68
   811
	printf("   --> %ld lines in this file contain asterisks. "
ali@42
   812
	  "Not reporting them.\n",results->astline);
ali@42
   813
    }
ali@42
   814
    /*
ali@42
   815
     * If more than 10 lines contain forward slashes,
ali@42
   816
     * don't bother reporting them.
ali@42
   817
     */
ali@42
   818
    warnings.fslash=1;
ali@42
   819
    if (results->fslashline>10)
ali@42
   820
    {
ali@68
   821
	warnings.fslash=0;
ali@68
   822
	printf("   --> %ld lines in this file contain forward slashes. "
ali@42
   823
	  "Not reporting them.\n",results->fslashline);
ali@42
   824
    }
ali@42
   825
    /*
ali@42
   826
     * If more than 20 lines contain unpunctuated endquotes,
ali@42
   827
     * don't bother reporting them.
ali@42
   828
     */
ali@42
   829
    warnings.endquote=1;
ali@42
   830
    if (results->endquote_count>20)
ali@42
   831
    {
ali@68
   832
	warnings.endquote=0;
ali@68
   833
	printf("   --> %ld lines in this file contain unpunctuated endquotes. "
ali@42
   834
	  "Not reporting them.\n",results->endquote_count);
ali@42
   835
    }
ali@42
   836
    /*
ali@42
   837
     * If more than 15 lines contain standalone digits,
ali@42
   838
     * don't bother reporting them.
ali@42
   839
     */
ali@42
   840
    warnings.digit=1;
ali@42
   841
    if (results->standalone_digit>10)
ali@42
   842
    {
ali@68
   843
	warnings.digit=0;
ali@68
   844
	printf("   --> %ld lines in this file contain standalone 0s and 1s. "
ali@42
   845
	  "Not reporting them.\n",results->standalone_digit);
ali@42
   846
    }
ali@42
   847
    /*
ali@42
   848
     * If more than 20 lines contain hyphens at end,
ali@42
   849
     * don't bother reporting them.
ali@42
   850
     */
ali@42
   851
    warnings.hyphen=1;
ali@42
   852
    if (results->hyphens>20)
ali@42
   853
    {
ali@68
   854
	warnings.hyphen=0;
ali@68
   855
	printf("   --> %ld lines in this file have hyphens at end. "
ali@42
   856
	  "Not reporting them.\n",results->hyphens);
ali@42
   857
    }
ali@42
   858
    if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42
   859
    {
ali@68
   860
	printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@68
   861
	pswit[MARKUP_SWITCH]=1;
ali@42
   862
    }
ali@42
   863
    if (results->verylongline>0)
ali@68
   864
	printf("   --> %ld lines in this file are VERY long!\n",
ali@42
   865
	  results->verylongline);
ali@42
   866
    /*
ali@42
   867
     * If there are more non-PG spaced dashes than PG em-dashes,
ali@42
   868
     * assume it's deliberate.
ali@42
   869
     * Current PG guidelines say don't use them, but older texts do,
ali@42
   870
     * and some people insist on them whatever the guidelines say.
ali@42
   871
     */
ali@42
   872
    warnings.dash=1;
ali@42
   873
    if (results->spacedash+results->non_PG_space_emdash>
ali@42
   874
      results->PG_space_emdash)
ali@42
   875
    {
ali@68
   876
	warnings.dash=0;
ali@68
   877
	printf("   --> There are %ld spaced dashes and em-dashes. "
ali@42
   878
	  "Not reporting them.\n",
ali@42
   879
	  results->spacedash+results->non_PG_space_emdash);
ali@42
   880
    }
ali@42
   881
    /* If more than a quarter of characters are hi-bit, bug out. */
ali@42
   882
    warnings.bin=1;
ali@42
   883
    if (results->binlen*4>results->totlen)
ali@42
   884
    {
ali@68
   885
	printf("   --> This file does not appear to be ASCII. "
ali@42
   886
	  "Terminating. Best of luck with it!\n");
ali@68
   887
	exit(1);
ali@42
   888
    }
ali@42
   889
    if (results->alphalen*4<results->totlen)
ali@42
   890
    {
ali@68
   891
	printf("   --> This file does not appear to be text. "
ali@42
   892
	  "Terminating. Best of luck with it!\n");
ali@68
   893
	exit(1);
ali@42
   894
    }
ali@42
   895
    if (results->binlen*100>results->totlen || results->binlen>100)
ali@42
   896
    {
ali@68
   897
	printf("   --> There are a lot of foreign letters here. "
ali@42
   898
	  "Not reporting them.\n");
ali@68
   899
	warnings.bin=0;
ali@42
   900
    }
ali@69
   901
    warnings.isDutch=FALSE;
ali@42
   902
    if (results->Dutchcount>50)
ali@42
   903
    {
ali@69
   904
	warnings.isDutch=TRUE;
ali@68
   905
	printf("   --> This looks like Dutch - "
ali@42
   906
	  "switching off dashes and warnings for 's Middags case.\n");
ali@42
   907
    }
ali@69
   908
    warnings.isFrench=FALSE;
ali@42
   909
    if (results->Frenchcount>50)
ali@42
   910
    {
ali@69
   911
	warnings.isFrench=TRUE;
ali@68
   912
	printf("   --> This looks like French - "
ali@42
   913
	  "switching off some doublepunct.\n");
ali@42
   914
    }
ali@42
   915
    if (results->firstline && results->footerline)
ali@68
   916
	printf("    The PG header and footer appear to be already on.\n");
ali@42
   917
    else
ali@42
   918
    {
ali@68
   919
	if (results->firstline)
ali@68
   920
	    printf("    The PG header is on - no footer.\n");
ali@68
   921
	if (results->footerline)
ali@68
   922
	    printf("    The PG footer is on - no header.\n");
ali@42
   923
    }
ali@42
   924
    printf("\n");
ali@42
   925
    if (pswit[VERBOSE_SWITCH])
ali@42
   926
    {
ali@68
   927
	warnings.bin=1;
ali@68
   928
	warnings.shortline=1;
ali@68
   929
	warnings.dotcomma=1;
ali@68
   930
	warnings.longline=1;
ali@68
   931
	warnings.dash=1;
ali@68
   932
	warnings.digit=1;
ali@68
   933
	warnings.ast=1;
ali@68
   934
	warnings.fslash=1;
ali@68
   935
	warnings.hyphen=1;
ali@68
   936
	warnings.endquote=1;
ali@68
   937
	printf("   *** Verbose output is ON -- you asked for it! ***\n");
ali@42
   938
    }
ali@42
   939
    if (warnings.isDutch)
ali@68
   940
	warnings.dash=0;
ali@42
   941
    if (results->footerline>0 && results->firstline>0 &&
ali@42
   942
      results->footerline>results->firstline &&
ali@42
   943
      results->footerline-results->firstline<100)
ali@42
   944
    {
ali@68
   945
	printf("   --> I don't really know where this text starts. \n");
ali@68
   946
	printf("       There are no reference points.\n");
ali@68
   947
	printf("       I'm going to have to report the header and footer "
ali@42
   948
	  "as well.\n");
ali@68
   949
	results->firstline=0;
ali@42
   950
    }
ali@42
   951
    return &warnings;
ali@42
   952
}
ali@42
   953
ali@43
   954
/*
ali@43
   955
 * analyse_quotes:
ali@43
   956
 *
ali@43
   957
 * Look along the line, accumulate the count of quotes, and see
ali@43
   958
 * if this is an empty line - i.e. a line with nothing on it
ali@43
   959
 * but spaces.
ali@43
   960
 * If line has just spaces, period, * and/or - on it, don't
ali@43
   961
 * count it, since empty lines with asterisks or dashes to
ali@43
   962
 * separate sections are common.
ali@43
   963
 *
ali@69
   964
 * Returns: TRUE if the line is empty.
ali@43
   965
 */
ali@69
   966
gboolean analyse_quotes(const char *aline,struct counters *counters)
ali@43
   967
{
ali@68
   968
    int guessquote=0;
ali@69
   969
    /* assume the line is empty until proven otherwise */
ali@69
   970
    gboolean isemptyline=TRUE;
ali@69
   971
    const char *s=aline;
ali@43
   972
    while (*s)
ali@43
   973
    {
ali@43
   974
	if (*s==CHAR_DQUOTE)
ali@43
   975
	    counters->quot++;
ali@43
   976
	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
ali@43
   977
	{
ali@43
   978
	    if (s==aline)
ali@43
   979
	    {
ali@43
   980
		/*
ali@43
   981
		 * At start of line, it can only be an openquote.
ali@43
   982
		 * Hardcode a very common exception!
ali@43
   983
		 */
ali@43
   984
		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
ali@43
   985
		    counters->open_single_quote++;
ali@43
   986
	    }
ali@43
   987
	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
ali@43
   988
		/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43
   989
		;
ali@43
   990
	    /* it's outside a word - let's check it out */
ali@43
   991
	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
ali@43
   992
	    {
ali@43
   993
		/* it damwell better BE an openquote */
ali@43
   994
		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
ali@43
   995
		    /* hardcode a very common exception! */
ali@43
   996
		    counters->open_single_quote++;
ali@43
   997
	    }
ali@43
   998
	    else
ali@43
   999
	    {
ali@43
  1000
		/* now - is it a closequote? */
ali@43
  1001
		guessquote=0;   /* accumulate clues */
ali@43
  1002
		if (gcisalpha(s[-1]))
ali@43
  1003
		{
ali@43
  1004
		    /* it follows a letter - could be either */
ali@43
  1005
		    guessquote++;
ali@43
  1006
		    if (s[-1]=='s')
ali@43
  1007
		    {
ali@43
  1008
			/* looks like a plural apostrophe */
ali@43
  1009
			guessquote-=3;
ali@43
  1010
			if (s[1]==CHAR_SPACE)  /* bonus marks! */
ali@43
  1011
			    guessquote-=2;
ali@43
  1012
		    }
ali@43
  1013
		}
ali@43
  1014
		/* it doesn't have a letter either side */
ali@43
  1015
		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
ali@43
  1016
		    guessquote+=8; /* looks like a closequote */
ali@43
  1017
		else
ali@43
  1018
		    guessquote++;
ali@43
  1019
		if (counters->open_single_quote>counters->close_single_quote)
ali@43
  1020
		    /*
ali@43
  1021
		     * Give it the benefit of some doubt,
ali@43
  1022
		     * if a squote is already open.
ali@43
  1023
		     */
ali@43
  1024
		    guessquote++;
ali@43
  1025
		else
ali@43
  1026
		    guessquote--;
ali@43
  1027
		if (guessquote>=0)
ali@43
  1028
		    counters->close_single_quote++;
ali@43
  1029
	    }
ali@43
  1030
	}
ali@43
  1031
	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
ali@43
  1032
	  *s!=13 && *s!=10)
ali@69
  1033
	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
ali@43
  1034
	if (*s==CHAR_UNDERSCORE)
ali@43
  1035
	    counters->c_unders++;
ali@43
  1036
	if (*s==CHAR_OPEN_CBRACK)
ali@43
  1037
	    counters->c_brack++;
ali@43
  1038
	if (*s==CHAR_CLOSE_CBRACK)
ali@43
  1039
	    counters->c_brack--;
ali@43
  1040
	if (*s==CHAR_OPEN_RBRACK)
ali@43
  1041
	    counters->r_brack++;
ali@43
  1042
	if (*s==CHAR_CLOSE_RBRACK)
ali@43
  1043
	    counters->r_brack--;
ali@43
  1044
	if (*s==CHAR_OPEN_SBRACK)
ali@43
  1045
	    counters->s_brack++;
ali@43
  1046
	if (*s==CHAR_CLOSE_SBRACK)
ali@43
  1047
	    counters->s_brack--;
ali@43
  1048
	s++;
ali@43
  1049
    }
ali@43
  1050
    return isemptyline;
ali@43
  1051
}
ali@43
  1052
ali@41
  1053
/*
ali@67
  1054
 * check_for_control_characters:
ali@67
  1055
 *
ali@67
  1056
 * Check for invalid or questionable characters in the line
ali@67
  1057
 * Anything above 127 is invalid for plain ASCII, and
ali@67
  1058
 * non-printable control characters should also be flagged.
ali@67
  1059
 * Tabs should generally not be there.
ali@67
  1060
 */
ali@67
  1061
void check_for_control_characters(const char *aline)
ali@67
  1062
{
ali@67
  1063
    unsigned char c;
ali@67
  1064
    const char *s;
ali@67
  1065
    for (s=aline;*s;s++)
ali@67
  1066
    {
ali@67
  1067
	c=*(unsigned char *)s;
ali@67
  1068
	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
ali@67
  1069
	{
ali@67
  1070
	    if (pswit[ECHO_SWITCH])
ali@67
  1071
		printf("\n%s\n",aline);
ali@67
  1072
	    if (!pswit[OVERVIEW_SWITCH])
ali@67
  1073
		printf("    Line %ld column %d - Control character %d\n",
ali@67
  1074
		  linecnt,(int)(s-aline)+1,c);
ali@67
  1075
	    else
ali@67
  1076
		cnt_bin++;
ali@67
  1077
	}
ali@67
  1078
    }
ali@67
  1079
}
ali@67
  1080
ali@67
  1081
/*
ali@44
  1082
 * check_for_odd_characters:
ali@44
  1083
 *
ali@44
  1084
 * Check for binary and other odd characters.
ali@44
  1085
 */
ali@44
  1086
void check_for_odd_characters(const char *aline,const struct warnings *warnings,
ali@69
  1087
  gboolean isemptyline)
ali@44
  1088
{
ali@44
  1089
    /* Don't repeat multiple warnings on one line. */
ali@68
  1090
    int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
ali@44
  1091
    const char *s;
ali@44
  1092
    unsigned char c;
ali@44
  1093
    for (s=aline;*s;s++)
ali@44
  1094
    {
ali@44
  1095
	c=*(unsigned char *)s;
ali@44
  1096
	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
ali@44
  1097
	{
ali@44
  1098
	    if (pswit[ECHO_SWITCH])
ali@44
  1099
		printf("\n%s\n",aline);
ali@44
  1100
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1101
		if (c>127 && c<160)
ali@44
  1102
		    printf("    Line %ld column %d - "
ali@44
  1103
		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
ali@44
  1104
		else
ali@44
  1105
		    printf("    Line %ld column %d - Non-ASCII character %d\n",
ali@44
  1106
		      linecnt,(int)(s-aline)+1,c);
ali@44
  1107
	    else
ali@44
  1108
		cnt_bin++;
ali@44
  1109
	    eNon_A=1;
ali@44
  1110
	}
ali@44
  1111
	if (!eTab && *s==CHAR_TAB)
ali@44
  1112
	{
ali@44
  1113
	    if (pswit[ECHO_SWITCH])
ali@44
  1114
		printf("\n%s\n",aline);
ali@44
  1115
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1116
		printf("    Line %ld column %d - Tab character?\n",
ali@44
  1117
		  linecnt,(int)(s-aline)+1);
ali@44
  1118
	    else
ali@44
  1119
		cnt_odd++;
ali@44
  1120
	    eTab=1;
ali@44
  1121
	}
ali@44
  1122
	if (!eTilde && *s==CHAR_TILDE)
ali@44
  1123
	{
ali@44
  1124
	    /*
ali@44
  1125
	     * Often used by OCR software to indicate an
ali@44
  1126
	     * unrecognizable character.
ali@44
  1127
	     */
ali@44
  1128
	    if (pswit[ECHO_SWITCH])
ali@44
  1129
		printf("\n%s\n",aline);
ali@44
  1130
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1131
		printf("    Line %ld column %d - Tilde character?\n",
ali@44
  1132
		  linecnt,(int)(s-aline)+1);
ali@44
  1133
	    else
ali@44
  1134
		cnt_odd++;
ali@44
  1135
	    eTilde=1;
ali@44
  1136
	}
ali@44
  1137
	if (!eCarat && *s==CHAR_CARAT)
ali@44
  1138
	{  
ali@44
  1139
	    if (pswit[ECHO_SWITCH])
ali@44
  1140
		printf("\n%s\n",aline);
ali@44
  1141
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1142
		printf("    Line %ld column %d - Carat character?\n",
ali@44
  1143
		  linecnt,(int)(s-aline)+1);
ali@44
  1144
	    else
ali@44
  1145
		cnt_odd++;
ali@44
  1146
	    eCarat=1;
ali@44
  1147
	}
ali@44
  1148
	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
ali@44
  1149
	{  
ali@44
  1150
	    if (pswit[ECHO_SWITCH])
ali@44
  1151
		printf("\n%s\n",aline);
ali@44
  1152
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1153
		printf("    Line %ld column %d - Forward slash?\n",
ali@44
  1154
		  linecnt,(int)(s-aline)+1);
ali@44
  1155
	    else
ali@44
  1156
		cnt_odd++;
ali@44
  1157
	    eFSlash=1;
ali@44
  1158
	}
ali@44
  1159
	/*
ali@44
  1160
	 * Report asterisks only in paranoid mode,
ali@44
  1161
	 * since they're often deliberate.
ali@44
  1162
	 */
ali@44
  1163
	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@44
  1164
	  *s==CHAR_ASTERISK)
ali@44
  1165
	{
ali@44
  1166
	    if (pswit[ECHO_SWITCH])
ali@44
  1167
		printf("\n%s\n",aline);
ali@44
  1168
	    if (!pswit[OVERVIEW_SWITCH])
ali@44
  1169
		printf("    Line %ld column %d - Asterisk?\n",
ali@44
  1170
		  linecnt,(int)(s-aline)+1);
ali@44
  1171
	    else
ali@44
  1172
		cnt_odd++;
ali@44
  1173
	    eAst=1;
ali@44
  1174
	}
ali@44
  1175
    }
ali@44
  1176
}
ali@44
  1177
ali@44
  1178
/*
ali@45
  1179
 * check_for_long_line:
ali@45
  1180
 *
ali@45
  1181
 * Check for line too long.
ali@45
  1182
 */
ali@45
  1183
void check_for_long_line(const char *aline)
ali@45
  1184
{
ali@45
  1185
    if (strlen(aline)>LONGEST_PG_LINE)
ali@45
  1186
    {
ali@45
  1187
	if (pswit[ECHO_SWITCH])
ali@45
  1188
	    printf("\n%s\n",aline);
ali@45
  1189
	if (!pswit[OVERVIEW_SWITCH])
ali@45
  1190
	    printf("    Line %ld column %d - Long line %d\n",
ali@68
  1191
	      linecnt,(int)strlen(aline),(int)strlen(aline));
ali@45
  1192
	else
ali@45
  1193
	    cnt_long++;
ali@45
  1194
    }
ali@45
  1195
}
ali@45
  1196
ali@45
  1197
/*
ali@45
  1198
 * check_for_short_line:
ali@45
  1199
 *
ali@45
  1200
 * Check for line too short.
ali@45
  1201
 *
ali@45
  1202
 * This one is a bit trickier to implement: we don't want to
ali@45
  1203
 * flag the last line of a paragraph for being short, so we
ali@45
  1204
 * have to wait until we know that our current line is a
ali@45
  1205
 * "normal" line, then report the _previous_ line if it was too
ali@45
  1206
 * short. We also don't want to report indented lines like
ali@45
  1207
 * chapter heads or formatted quotations. We therefore keep
ali@45
  1208
 * last->len as the length of the last line examined, and
ali@45
  1209
 * last->blen as the length of the last but one, and try to
ali@45
  1210
 * suppress unnecessary warnings by checking that both were of
ali@45
  1211
 * "normal" length. We keep the first character of the last
ali@45
  1212
 * line in last->start, and if it was a space, we assume that
ali@45
  1213
 * the formatting is deliberate. I can't figure out a way to
ali@45
  1214
 * distinguish something like a quoted verse left-aligned or
ali@45
  1215
 * the header or footer of a letter from a paragraph of short
ali@45
  1216
 * lines - maybe if I examined the whole paragraph, and if the
ali@45
  1217
 * para has less than, say, 8 lines and if all lines are short,
ali@45
  1218
 * then just assume it's OK? Need to look at some texts to see
ali@45
  1219
 * how often a formula like this would get the right result.
ali@45
  1220
 */
ali@45
  1221
void check_for_short_line(const char *aline,const struct line_properties *last)
ali@45
  1222
{
ali@45
  1223
    if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
ali@45
  1224
      last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45
  1225
    {
ali@45
  1226
	if (pswit[ECHO_SWITCH])
ali@45
  1227
	    printf("\n%s\n",prevline);
ali@45
  1228
	if (!pswit[OVERVIEW_SWITCH])
ali@45
  1229
	    printf("    Line %ld column %d - Short line %d?\n",
ali@68
  1230
	      linecnt-1,(int)strlen(prevline),(int)strlen(prevline));
ali@45
  1231
	else
ali@45
  1232
	    cnt_short++;
ali@45
  1233
    }
ali@45
  1234
}
ali@45
  1235
ali@45
  1236
/*
ali@46
  1237
 * check_for_starting_punctuation:
ali@46
  1238
 *
ali@46
  1239
 * Look for punctuation other than full ellipses at start of line.
ali@46
  1240
 */
ali@46
  1241
void check_for_starting_punctuation(const char *aline)
ali@46
  1242
{
ali@46
  1243
    if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
ali@46
  1244
    {
ali@46
  1245
	if (pswit[ECHO_SWITCH])
ali@46
  1246
	    printf("\n%s\n",aline);
ali@46
  1247
	if (!pswit[OVERVIEW_SWITCH])
ali@46
  1248
	    printf("    Line %ld column 1 - Begins with punctuation?\n",
ali@46
  1249
	      linecnt);
ali@46
  1250
	else
ali@46
  1251
	    cnt_punct++;
ali@46
  1252
    }
ali@46
  1253
}
ali@46
  1254
ali@46
  1255
/*
ali@47
  1256
 * check_for_spaced_emdash:
ali@47
  1257
 *
ali@47
  1258
 * Check for spaced em-dashes.
ali@47
  1259
 *
ali@47
  1260
 * We must check _all_ occurrences of "--" on the line
ali@47
  1261
 * hence the loop - even if the first double-dash is OK
ali@47
  1262
 * there may be another that's wrong later on.
ali@47
  1263
 */
ali@47
  1264
void check_for_spaced_emdash(const char *aline)
ali@47
  1265
{
ali@47
  1266
    const char *s,*t;
ali@47
  1267
    s=aline;
ali@47
  1268
    while ((t=strstr(s,"--")))
ali@47
  1269
    {
ali@47
  1270
	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
ali@47
  1271
	{
ali@47
  1272
	    if (pswit[ECHO_SWITCH])
ali@47
  1273
		printf("\n%s\n",aline);
ali@47
  1274
	    if (!pswit[OVERVIEW_SWITCH])
ali@47
  1275
		printf("    Line %ld column %d - Spaced em-dash?\n",
ali@47
  1276
		  linecnt,(int)(t-aline)+1);
ali@47
  1277
	    else
ali@47
  1278
		cnt_dash++;
ali@47
  1279
	}
ali@47
  1280
	s=t+2;
ali@47
  1281
    }
ali@47
  1282
}
ali@47
  1283
ali@47
  1284
/*
ali@47
  1285
 * check_for_spaced_dash:
ali@47
  1286
 *
ali@47
  1287
 * Check for spaced dashes.
ali@47
  1288
 */
ali@47
  1289
void check_for_spaced_dash(const char *aline)
ali@47
  1290
{
ali@47
  1291
    const char *s;
ali@47
  1292
    if ((s=strstr(aline," -")))
ali@47
  1293
    {
ali@47
  1294
	if (s[2]!='-')
ali@47
  1295
	{
ali@47
  1296
	    if (pswit[ECHO_SWITCH])
ali@47
  1297
		printf("\n%s\n",aline);
ali@47
  1298
	    if (!pswit[OVERVIEW_SWITCH])
ali@47
  1299
		printf("    Line %ld column %d - Spaced dash?\n",
ali@47
  1300
		  linecnt,(int)(s-aline)+1);
ali@47
  1301
	    else
ali@47
  1302
		cnt_dash++;
ali@47
  1303
	}
ali@47
  1304
    }
ali@47
  1305
    else if ((s=strstr(aline,"- ")))
ali@47
  1306
    {
ali@47
  1307
	if (s==aline || s[-1]!='-')
ali@47
  1308
	{
ali@47
  1309
	    if (pswit[ECHO_SWITCH])
ali@47
  1310
		printf("\n%s\n",aline);
ali@47
  1311
	    if (!pswit[OVERVIEW_SWITCH])
ali@47
  1312
		printf("    Line %ld column %d - Spaced dash?\n",
ali@47
  1313
		  linecnt,(int)(s-aline)+1);
ali@47
  1314
	    else
ali@47
  1315
		cnt_dash++;
ali@47
  1316
	}
ali@47
  1317
    }
ali@47
  1318
}
ali@47
  1319
ali@47
  1320
/*
ali@48
  1321
 * check_for_unmarked_paragraphs:
ali@48
  1322
 *
ali@48
  1323
 * Check for unmarked paragraphs indicated by separate speakers.
ali@48
  1324
 *
ali@48
  1325
 * May well be false positive:
ali@48
  1326
 * "Bravo!" "Wonderful!" called the crowd.
ali@48
  1327
 * but useful all the same.
ali@48
  1328
 */
ali@48
  1329
void check_for_unmarked_paragraphs(const char *aline)
ali@48
  1330
{
ali@48
  1331
    const char *s;
ali@48
  1332
    s=strstr(aline,"\"  \"");
ali@48
  1333
    if (!s)
ali@48
  1334
	s=strstr(aline,"\" \"");
ali@48
  1335
    if (s)
ali@48
  1336
    {
ali@48
  1337
	if (pswit[ECHO_SWITCH])
ali@48
  1338
	    printf("\n%s\n",aline);
ali@48
  1339
	if (!pswit[OVERVIEW_SWITCH])
ali@48
  1340
	    printf("    Line %ld column %d - Query missing paragraph break?\n",
ali@48
  1341
	      linecnt,(int)(s-aline)+1);
ali@48
  1342
	else
ali@48
  1343
	    cnt_punct++;
ali@48
  1344
    }
ali@48
  1345
}
ali@48
  1346
ali@48
  1347
/*
ali@49
  1348
 * check_for_jeebies:
ali@49
  1349
 *
ali@49
  1350
 * Check for "to he" and other easy h/b errors.
ali@49
  1351
 *
ali@49
  1352
 * This is a very inadequate effort on the h/b problem,
ali@49
  1353
 * but the phrase "to he" is always an error, whereas "to
ali@49
  1354
 * be" is quite common.
ali@49
  1355
 * Similarly, '"Quiet!", be said.' is a non-be error
ali@49
  1356
 * "to he" is _not_ always an error!:
ali@49
  1357
 *       "Where they went to he couldn't say."
ali@49
  1358
 * Another false positive:
ali@49
  1359
 *       What would "Cinderella" be without the . . .
ali@49
  1360
 * and another: "If he wants to he can see for himself."
ali@49
  1361
 */
ali@49
  1362
void check_for_jeebies(const char *aline)
ali@49
  1363
{
ali@49
  1364
    const char *s;
ali@49
  1365
    s=strstr(aline," be could ");
ali@49
  1366
    if (!s)
ali@49
  1367
	s=strstr(aline," be would ");
ali@49
  1368
    if (!s)
ali@49
  1369
	s=strstr(aline," was be ");
ali@49
  1370
    if (!s)
ali@49
  1371
	s=strstr(aline," be is ");
ali@49
  1372
    if (!s)
ali@49
  1373
	s=strstr(aline," is be ");
ali@49
  1374
    if (!s)
ali@49
  1375
	s=strstr(aline,"\", be ");
ali@49
  1376
    if (!s)
ali@49
  1377
	s=strstr(aline,"\" be ");
ali@49
  1378
    if (!s)
ali@49
  1379
	s=strstr(aline,"\" be ");
ali@49
  1380
    if (!s)
ali@49
  1381
	s=strstr(aline," to he ");
ali@49
  1382
    if (s)
ali@49
  1383
    {
ali@49
  1384
	if (pswit[ECHO_SWITCH])
ali@49
  1385
	    printf("\n%s\n",aline);
ali@49
  1386
	if (!pswit[OVERVIEW_SWITCH])
ali@49
  1387
	    printf("    Line %ld column %d - Query he/be error?\n",
ali@49
  1388
	      linecnt,(int)(s-aline)+1);
ali@49
  1389
	else
ali@49
  1390
	    cnt_word++;
ali@49
  1391
    }
ali@49
  1392
    s=strstr(aline," the had ");
ali@49
  1393
    if (!s)
ali@49
  1394
	s=strstr(aline," a had ");
ali@49
  1395
    if (!s)
ali@49
  1396
	s=strstr(aline," they bad ");
ali@49
  1397
    if (!s)
ali@49
  1398
	s=strstr(aline," she bad ");
ali@49
  1399
    if (!s)
ali@49
  1400
	s=strstr(aline," he bad ");
ali@49
  1401
    if (!s)
ali@49
  1402
	s=strstr(aline," you bad ");
ali@49
  1403
    if (!s)
ali@49
  1404
	s=strstr(aline," i bad ");
ali@49
  1405
    if (s)
ali@49
  1406
    {
ali@49
  1407
	if (pswit[ECHO_SWITCH])
ali@49
  1408
	    printf("\n%s\n",aline);
ali@49
  1409
	if (!pswit[OVERVIEW_SWITCH])
ali@49
  1410
	    printf("    Line %ld column %d - Query had/bad error?\n",
ali@49
  1411
	      linecnt,(int)(s-aline)+1);
ali@49
  1412
	else
ali@49
  1413
	    cnt_word++;
ali@49
  1414
    }
ali@49
  1415
    s=strstr(aline,"; hut ");
ali@49
  1416
    if (!s)
ali@49
  1417
	s=strstr(aline,", hut ");
ali@49
  1418
    if (s)
ali@49
  1419
    {
ali@49
  1420
	if (pswit[ECHO_SWITCH])
ali@49
  1421
	    printf("\n%s\n",aline);
ali@49
  1422
	if (!pswit[OVERVIEW_SWITCH])
ali@49
  1423
	    printf("    Line %ld column %d - Query hut/but error?\n",
ali@49
  1424
	      linecnt,(int)(s-aline)+1);
ali@49
  1425
	else
ali@49
  1426
	    cnt_word++;
ali@49
  1427
    }
ali@49
  1428
}
ali@49
  1429
ali@49
  1430
/*
ali@50
  1431
 * check_for_mta_from:
ali@50
  1432
 *
ali@50
  1433
 * Special case - angled bracket in front of "From" placed there by an
ali@50
  1434
 * MTA when sending an e-mail.
ali@50
  1435
 */
ali@50
  1436
void check_for_mta_from(const char *aline)
ali@50
  1437
{
ali@50
  1438
    const char *s;
ali@50
  1439
    s=strstr(aline,">From");
ali@50
  1440
    if (s)
ali@50
  1441
    {
ali@50
  1442
	if (pswit[ECHO_SWITCH])
ali@50
  1443
	    printf("\n%s\n",aline);
ali@50
  1444
	if (!pswit[OVERVIEW_SWITCH])
ali@50
  1445
	    printf("    Line %ld column %d - Query angled bracket with From\n",
ali@50
  1446
	      linecnt,(int)(s-aline)+1);
ali@50
  1447
	else
ali@50
  1448
	    cnt_punct++;
ali@50
  1449
    }
ali@50
  1450
}
ali@50
  1451
ali@50
  1452
/*
ali@51
  1453
 * check_for_orphan_character:
ali@51
  1454
 *
ali@51
  1455
 * Check for a single character line -
ali@51
  1456
 * often an overflow from bad wrapping.
ali@51
  1457
 */
ali@51
  1458
void check_for_orphan_character(const char *aline)
ali@51
  1459
{
ali@51
  1460
    if (*aline && !aline[1])
ali@51
  1461
    {
ali@51
  1462
	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
ali@51
  1463
	  gcisdigit(*aline))
ali@51
  1464
	    ; /* Nothing - ignore numerals alone on a line. */
ali@51
  1465
	else
ali@51
  1466
	{
ali@51
  1467
	    if (pswit[ECHO_SWITCH])
ali@51
  1468
		printf("\n%s\n",aline);
ali@51
  1469
	    if (!pswit[OVERVIEW_SWITCH])
ali@51
  1470
		printf("    Line %ld column 1 - Query single character line\n",
ali@51
  1471
		  linecnt);
ali@51
  1472
	    else
ali@51
  1473
		cnt_punct++;
ali@51
  1474
	}
ali@51
  1475
    }
ali@51
  1476
}
ali@51
  1477
ali@51
  1478
/*
ali@52
  1479
 * check_for_pling_scanno:
ali@52
  1480
 *
ali@52
  1481
 * Check for I" - often should be !
ali@52
  1482
 */
ali@52
  1483
void check_for_pling_scanno(const char *aline)
ali@52
  1484
{
ali@52
  1485
    const char *s;
ali@52
  1486
    s=strstr(aline," I\"");
ali@52
  1487
    if (s)
ali@52
  1488
    {
ali@52
  1489
	if (pswit[ECHO_SWITCH])
ali@52
  1490
	    printf("\n%s\n",aline);
ali@52
  1491
	if (!pswit[OVERVIEW_SWITCH])
ali@52
  1492
	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",
ali@52
  1493
	      linecnt,s-aline);
ali@52
  1494
	else
ali@52
  1495
	    cnt_punct++;
ali@52
  1496
    }
ali@52
  1497
}
ali@52
  1498
ali@52
  1499
/*
ali@53
  1500
 * check_for_extra_period:
ali@53
  1501
 *
ali@53
  1502
 * Check for period without a capital letter. Cut-down from gutspell.
ali@53
  1503
 * Only works when it happens on a single line.
ali@53
  1504
 */
ali@53
  1505
void check_for_extra_period(const char *aline,const struct warnings *warnings)
ali@53
  1506
{
ali@53
  1507
    const char *s,*t,*s1;
ali@69
  1508
    int i;
ali@69
  1509
    gboolean istypo;
ali@69
  1510
    gchar *testword;
ali@53
  1511
    if (pswit[PARANOID_SWITCH])
ali@53
  1512
    {
ali@69
  1513
	for (t=aline;strstr(t,". ");)
ali@53
  1514
	{
ali@53
  1515
	    t=strstr(t,". ");
ali@69
  1516
	    if (t==aline)
ali@53
  1517
	    {
ali@53
  1518
		t++;
ali@53
  1519
		/* start of line punctuation is handled elsewhere */
ali@53
  1520
		continue;
ali@53
  1521
	    }
ali@53
  1522
	    if (!gcisalpha(t[-1]))
ali@53
  1523
	    {
ali@53
  1524
		t++;
ali@53
  1525
		continue;
ali@53
  1526
	    }
ali@53
  1527
	    if (warnings->isDutch)
ali@53
  1528
	    {
ali@53
  1529
		/* For Frank & Jeroen -- 's Middags case */
ali@53
  1530
		if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
ali@53
  1531
		  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
ali@53
  1532
		{
ali@53
  1533
		    t++;
ali@53
  1534
		    continue;
ali@53
  1535
		}
ali@53
  1536
	    }
ali@53
  1537
	    s1=t+2;
ali@53
  1538
	    while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
ali@53
  1539
		s1++;
ali@53
  1540
	    if (*s1>='a' && *s1<='z')
ali@53
  1541
	    {
ali@53
  1542
		/* we have something to investigate */
ali@69
  1543
		istypo=TRUE;
ali@53
  1544
		/* so let's go back and find out */
ali@69
  1545
		for (s1=t-1;s1>=aline &&
ali@53
  1546
		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
ali@53
  1547
		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
ali@53
  1548
		    ;
ali@53
  1549
		s1++;
ali@69
  1550
		s=strchr(s1,'.');
ali@69
  1551
		if (s)
ali@69
  1552
		    testword=g_strndup(s1,s-s1);
ali@69
  1553
		else
ali@69
  1554
		    testword=g_strdup(s1);
ali@53
  1555
		for (i=0;*abbrev[i];i++)
ali@53
  1556
		    if (!strcmp(testword,abbrev[i]))
ali@69
  1557
			istypo=FALSE;
ali@53
  1558
		if (gcisdigit(*testword))
ali@69
  1559
		    istypo=FALSE;
ali@53
  1560
		if (!testword[1])
ali@69
  1561
		    istypo=FALSE;
ali@53
  1562
		if (isroman(testword))
ali@69
  1563
		    istypo=FALSE;
ali@53
  1564
		if (istypo)
ali@53
  1565
		{
ali@69
  1566
		    istypo=FALSE;
ali@53
  1567
		    for (i=0;testword[i];i++)
ali@53
  1568
			if (strchr(vowels,testword[i]))
ali@69
  1569
			    istypo=TRUE;
ali@53
  1570
		}
ali@69
  1571
		if (istypo &&
ali@69
  1572
		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
ali@53
  1573
		{
ali@69
  1574
		    g_tree_insert(qperiod,g_strdup(testword),
ali@69
  1575
		      GINT_TO_POINTER(1));
ali@69
  1576
		    if (pswit[ECHO_SWITCH])
ali@69
  1577
			printf("\n%s\n",aline);
ali@69
  1578
		    if (!pswit[OVERVIEW_SWITCH])
ali@69
  1579
			printf("    Line %ld column %d - Extra period?\n",
ali@69
  1580
			  linecnt,(int)(t-aline)+1);
ali@69
  1581
		    else
ali@69
  1582
			cnt_punct++;
ali@53
  1583
		}
ali@69
  1584
		g_free(testword);
ali@53
  1585
	    }
ali@53
  1586
	    t++;
ali@53
  1587
	}
ali@53
  1588
    }
ali@53
  1589
}
ali@53
  1590
ali@53
  1591
/*
ali@54
  1592
 * check_for_following_punctuation:
ali@54
  1593
 *
ali@54
  1594
 * Check for words usually not followed by punctuation.
ali@54
  1595
 */
ali@54
  1596
void check_for_following_punctuation(const char *aline)
ali@54
  1597
{
ali@54
  1598
    int i;
ali@54
  1599
    const char *s,*wordstart;
ali@69
  1600
    gchar *inword,*t;
ali@54
  1601
    if (pswit[TYPO_SWITCH])
ali@54
  1602
    {
ali@54
  1603
	for (s=aline;*s;)
ali@54
  1604
	{
ali@54
  1605
	    wordstart=s;
ali@69
  1606
	    t=getaword(&s);
ali@69
  1607
	    if (!*t)
ali@69
  1608
	    {
ali@69
  1609
		g_free(t);
ali@54
  1610
		continue;
ali@69
  1611
	    }
ali@69
  1612
	    inword=g_ascii_strdown(t,-1);
ali@69
  1613
	    g_free(t);
ali@54
  1614
	    for (i=0;*nocomma[i];i++)
ali@54
  1615
		if (!strcmp(inword,nocomma[i]))
ali@54
  1616
		{
ali@54
  1617
		    if (*s==',' || *s==';' || *s==':')
ali@54
  1618
		    {
ali@54
  1619
			if (pswit[ECHO_SWITCH])
ali@54
  1620
			    printf("\n%s\n",aline);
ali@54
  1621
			if (!pswit[OVERVIEW_SWITCH])
ali@54
  1622
			    printf("    Line %ld column %d - "
ali@54
  1623
			      "Query punctuation after %s?\n",
ali@54
  1624
			      linecnt,(int)(s-aline)+1,inword);
ali@54
  1625
			else
ali@54
  1626
			    cnt_punct++;
ali@54
  1627
		    }
ali@54
  1628
		}
ali@54
  1629
	    for (i=0;*noperiod[i];i++)
ali@54
  1630
		if (!strcmp(inword,noperiod[i]))
ali@54
  1631
		{
ali@54
  1632
		    if (*s=='.' || *s=='!')
ali@54
  1633
		    {
ali@54
  1634
			if (pswit[ECHO_SWITCH])
ali@54
  1635
			    printf("\n%s\n",aline);
ali@54
  1636
			if (!pswit[OVERVIEW_SWITCH])
ali@54
  1637
			    printf("    Line %ld column %d - "
ali@54
  1638
			      "Query punctuation after %s?\n",
ali@54
  1639
			      linecnt,(int)(s-aline)+1,inword);
ali@54
  1640
			else
ali@54
  1641
			    cnt_punct++;
ali@54
  1642
		    }
ali@54
  1643
		}
ali@69
  1644
	    g_free(inword);
ali@54
  1645
	}
ali@54
  1646
    }
ali@54
  1647
}
ali@54
  1648
ali@54
  1649
/*
ali@55
  1650
 * check_for_typos:
ali@55
  1651
 *
ali@55
  1652
 * Check for commonly mistyped words,
ali@55
  1653
 * and digits like 0 for O in a word.
ali@55
  1654
 */
ali@55
  1655
void check_for_typos(const char *aline,struct warnings *warnings)
ali@55
  1656
{
ali@55
  1657
    const char *s,*wordstart;
ali@69
  1658
    gchar *inword,*testword;
ali@69
  1659
    int i,alower,vowel,consonant,*dupcnt;
ali@69
  1660
    gboolean isdup,istypo;
ali@55
  1661
    for (s=aline;*s;)
ali@55
  1662
    {
ali@55
  1663
	wordstart=s;
ali@69
  1664
	inword=getaword(&s);
ali@55
  1665
	if (!*inword)
ali@69
  1666
	{
ali@69
  1667
	    g_free(inword);
ali@55
  1668
	    continue; /* don't bother with empty lines */
ali@69
  1669
	}
ali@55
  1670
	if (mixdigit(inword))
ali@55
  1671
	{
ali@55
  1672
	    if (pswit[ECHO_SWITCH])
ali@55
  1673
		printf("\n%s\n",aline);
ali@55
  1674
	    if (!pswit[OVERVIEW_SWITCH])
ali@55
  1675
		printf("    Line %ld column %d - Query digit in %s\n",
ali@55
  1676
		  linecnt,(int)(wordstart-aline)+1,inword);
ali@55
  1677
	    else
ali@55
  1678
		cnt_word++;
ali@55
  1679
	}
ali@55
  1680
	/*
ali@55
  1681
	 * Put the word through a series of tests for likely typos and OCR
ali@55
  1682
	 * errors.
ali@55
  1683
	 */
ali@69
  1684
	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
ali@55
  1685
	{
ali@69
  1686
	    istypo=FALSE;
ali@69
  1687
	    testword=g_strdup(inword);
ali@55
  1688
	    alower=0;
ali@68
  1689
	    for (i=0;i<(int)strlen(testword);i++)
ali@55
  1690
	    {
ali@55
  1691
		/* lowercase for testing */
ali@55
  1692
		if (testword[i]>='a' && testword[i]<='z')
ali@55
  1693
		    alower=1;
ali@55
  1694
		if (alower && testword[i]>='A' && testword[i]<='Z')
ali@55
  1695
		{
ali@55
  1696
		    /*
ali@55
  1697
		     * We have an uppercase mid-word. However, there are
ali@55
  1698
		     * common cases:
ali@55
  1699
		     *   Mac and Mc like McGill
ali@55
  1700
		     *   French contractions like l'Abbe
ali@55
  1701
		     */
ali@55
  1702
		    if (i==2 && testword[0]=='m' && testword[1]=='c' ||
ali@55
  1703
		      i==3 && testword[0]=='m' && testword[1]=='a' &&
ali@55
  1704
		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
ali@55
  1705
			; /* do nothing! */
ali@55
  1706
		    else
ali@69
  1707
			istypo=TRUE;
ali@55
  1708
		}
ali@55
  1709
		testword[i]=(char)tolower(testword[i]);
ali@55
  1710
	    }
ali@69
  1711
	}
ali@69
  1712
	if (pswit[TYPO_SWITCH])
ali@69
  1713
	{
ali@55
  1714
	    /*
ali@55
  1715
	     * Check for certain unlikely two-letter combinations at word
ali@55
  1716
	     * start and end.
ali@55
  1717
	     */
ali@55
  1718
	    if (strlen(testword)>1)
ali@55
  1719
	    {
ali@55
  1720
		for (i=0;*nostart[i];i++)
ali@55
  1721
		    if (!strncmp(testword,nostart[i],2))
ali@69
  1722
			istypo=TRUE;
ali@55
  1723
		for (i=0;*noend[i];i++)
ali@55
  1724
		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))
ali@69
  1725
			istypo=TRUE;
ali@55
  1726
	    }
ali@55
  1727
	    /* ght is common, gbt never. Like that. */
ali@55
  1728
	    if (strstr(testword,"cb"))
ali@69
  1729
		istypo=TRUE;
ali@55
  1730
	    if (strstr(testword,"gbt"))
ali@69
  1731
		istypo=TRUE;
ali@55
  1732
	    if (strstr(testword,"pbt"))
ali@69
  1733
		istypo=TRUE;
ali@55
  1734
	    if (strstr(testword,"tbs"))
ali@69
  1735
		istypo=TRUE;
ali@55
  1736
	    if (strstr(testword,"mrn"))
ali@69
  1737
		istypo=TRUE;
ali@55
  1738
	    if (strstr(testword,"ahle"))
ali@69
  1739
		istypo=TRUE;
ali@55
  1740
	    if (strstr(testword,"ihle"))
ali@69
  1741
		istypo=TRUE;
ali@55
  1742
	    /*
ali@55
  1743
	     * "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55
  1744
	     * Also "TBI" - frostbite, outbid - but uncommon.
ali@55
  1745
	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55
  1746
	     * numerals, but "ii" is a common scanno.
ali@55
  1747
	     */
ali@55
  1748
	    if (strstr(testword,"tbi"))
ali@69
  1749
		istypo=TRUE;
ali@55
  1750
	    if (strstr(testword,"tbe"))
ali@69
  1751
		istypo=TRUE;
ali@55
  1752
	    if (strstr(testword,"ii"))
ali@69
  1753
		istypo=TRUE;
ali@55
  1754
	    /*
ali@55
  1755
	     * Check for no vowels or no consonants.
ali@55
  1756
	     * If none, flag a typo.
ali@55
  1757
	     */
ali@55
  1758
	    if (!istypo && strlen(testword)>1)
ali@55
  1759
	    {
ali@55
  1760
		vowel=consonant=0;
ali@55
  1761
		for (i=0;testword[i];i++)
ali@55
  1762
		{
ali@55
  1763
		    if (testword[i]=='y' || gcisdigit(testword[i]))
ali@55
  1764
		    {
ali@55
  1765
			/* Yah, this is loose. */
ali@55
  1766
			vowel++;
ali@55
  1767
			consonant++;
ali@55
  1768
		    }
ali@55
  1769
		    else if (strchr(vowels,testword[i]))
ali@55
  1770
			vowel++;
ali@55
  1771
		    else
ali@55
  1772
			consonant++;
ali@55
  1773
		}
ali@55
  1774
		if (!vowel || !consonant)
ali@69
  1775
		    istypo=TRUE;
ali@55
  1776
	    }
ali@55
  1777
	    /*
ali@55
  1778
	     * Now exclude the word from being reported if it's in
ali@55
  1779
	     * the okword list.
ali@55
  1780
	     */
ali@55
  1781
	    for (i=0;*okword[i];i++)
ali@55
  1782
		if (!strcmp(testword,okword[i]))
ali@69
  1783
		    istypo=FALSE;
ali@55
  1784
	    /*
ali@55
  1785
	     * What looks like a typo may be a Roman numeral.
ali@55
  1786
	     * Exclude these.
ali@55
  1787
	     */
ali@55
  1788
	    if (istypo && isroman(testword))
ali@69
  1789
		istypo=FALSE;
ali@55
  1790
	    /* Check the manual list of typos. */
ali@55
  1791
	    if (!istypo)
ali@55
  1792
		for (i=0;*typo[i];i++)
ali@55
  1793
		    if (!strcmp(testword,typo[i]))
ali@69
  1794
			istypo=TRUE;
ali@55
  1795
	    /*
ali@55
  1796
	     * Check lowercase s, l, i and m - special cases.
ali@55
  1797
	     *   "j" - often a semi-colon gone wrong.
ali@55
  1798
	     *   "d" for a missing apostrophe - he d
ali@55
  1799
	     *   "n" for "in"
ali@55
  1800
	     */
ali@55
  1801
	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
ali@69
  1802
		istypo=TRUE;
ali@55
  1803
	    if (istypo)
ali@55
  1804
	    {
ali@69
  1805
		dupcnt=g_tree_lookup(qword,testword);
ali@69
  1806
		if (dupcnt)
ali@69
  1807
		{
ali@69
  1808
		    (*dupcnt)++;
ali@69
  1809
		    isdup=!pswit[VERBOSE_SWITCH];
ali@69
  1810
		}
ali@69
  1811
		else
ali@69
  1812
		{
ali@69
  1813
		    dupcnt=g_new0(int,1);
ali@69
  1814
		    g_tree_insert(qword,g_strdup(testword),dupcnt);
ali@69
  1815
		    isdup=FALSE;
ali@69
  1816
		}
ali@55
  1817
		if (!isdup)
ali@55
  1818
		{
ali@55
  1819
		    if (pswit[ECHO_SWITCH])
ali@55
  1820
			printf("\n%s\n",aline);
ali@55
  1821
		    if (!pswit[OVERVIEW_SWITCH])
ali@55
  1822
		    {
ali@55
  1823
			printf("    Line %ld column %d - Query word %s",
ali@55
  1824
			  linecnt,(int)(wordstart-aline)+1,inword);
ali@69
  1825
			if (!pswit[VERBOSE_SWITCH])
ali@55
  1826
			    printf(" - not reporting duplicates");
ali@55
  1827
			printf("\n");
ali@55
  1828
		    }
ali@55
  1829
		    else
ali@55
  1830
			cnt_word++;
ali@55
  1831
		}
ali@55
  1832
	    }
ali@55
  1833
	}
ali@55
  1834
	/* check the user's list of typos */
ali@69
  1835
	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
ali@69
  1836
	{
ali@69
  1837
	    if (pswit[ECHO_SWITCH])
ali@69
  1838
		printf("\n%s\n",aline);
ali@69
  1839
	    if (!pswit[OVERVIEW_SWITCH])  
ali@69
  1840
		printf("    Line %ld column %d - Query possible scanno %s\n",
ali@69
  1841
		  linecnt,(int)(wordstart-aline)+2,inword);
ali@69
  1842
	}
ali@69
  1843
	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
ali@69
  1844
	    g_free(testword);
ali@55
  1845
	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55
  1846
	{
ali@55
  1847
	    /* In paranoid mode, query all 0 and 1 standing alone. */
ali@55
  1848
	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
ali@55
  1849
	    {
ali@55
  1850
		if (pswit[ECHO_SWITCH])
ali@55
  1851
		    printf("\n%s\n",aline);
ali@55
  1852
		if (!pswit[OVERVIEW_SWITCH])
ali@55
  1853
		    printf("    Line %ld column %d - Query standalone %s\n",
ali@55
  1854
		      linecnt,(int)(wordstart-aline)+2,inword);
ali@55
  1855
		else
ali@55
  1856
		    cnt_word++;
ali@55
  1857
	    }
ali@55
  1858
	}
ali@69
  1859
	g_free(inword);
ali@55
  1860
    }
ali@55
  1861
}
ali@55
  1862
ali@56
  1863
/*
ali@56
  1864
 * check_for_misspaced_punctuation:
ali@56
  1865
 *
ali@56
  1866
 * Look for added or missing spaces around punctuation and quotes.
ali@56
  1867
 * If there is a punctuation character like ! with no space on
ali@56
  1868
 * either side, suspect a missing!space. If there are spaces on
ali@56
  1869
 * both sides , assume a typo. If we see a double quote with no
ali@56
  1870
 * space or punctuation on either side of it, assume unspaced
ali@56
  1871
 * quotes "like"this.
ali@56
  1872
 */
ali@56
  1873
void check_for_misspaced_punctuation(const char *aline,
ali@69
  1874
  struct parities *parities,gboolean isemptyline)
ali@56
  1875
{
ali@69
  1876
    int i,llen;
ali@69
  1877
    gboolean isacro,isellipsis;
ali@56
  1878
    const char *s;
ali@56
  1879
    llen=strlen(aline);
ali@56
  1880
    for (i=1;i<llen;i++)
ali@56
  1881
    {
ali@56
  1882
	/* For each character in the line after the first. */
ali@56
  1883
	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
ali@56
  1884
	{
ali@56
  1885
	    /* we need to suppress warnings for acronyms like M.D. */
ali@69
  1886
	    isacro=FALSE;
ali@56
  1887
	    /* we need to suppress warnings for ellipsis . . . */
ali@69
  1888
	    isellipsis=FALSE;
ali@56
  1889
	    /* if there are letters on both sides of it or ... */
ali@56
  1890
	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
ali@56
  1891
	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
ali@56
  1892
	    {
ali@56
  1893
		/* ...if it's strict punctuation followed by an alpha */
ali@56
  1894
		if (aline[i]=='.')
ali@56
  1895
		{
ali@56
  1896
		    if (i>2 && aline[i-2]=='.')
ali@69
  1897
			isacro=TRUE;
ali@56
  1898
		    if (i+2<llen && aline[i+2]=='.')
ali@69
  1899
			isacro=TRUE;
ali@56
  1900
		}
ali@56
  1901
		if (!isacro)
ali@56
  1902
		{
ali@56
  1903
		    if (pswit[ECHO_SWITCH])
ali@56
  1904
			printf("\n%s\n",aline);
ali@56
  1905
		    if (!pswit[OVERVIEW_SWITCH])
ali@56
  1906
			printf("    Line %ld column %d - Missing space?\n",
ali@56
  1907
			  linecnt,i+1);
ali@56
  1908
		    else
ali@56
  1909
			cnt_punct++;
ali@56
  1910
		}
ali@56
  1911
	    }
ali@56
  1912
	    if (aline[i-1]==CHAR_SPACE &&
ali@56
  1913
	      (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
ali@56
  1914
	    {
ali@56
  1915
		/*
ali@56
  1916
		 * If there are spaces on both sides,
ali@56
  1917
		 * or space before and end of line.
ali@56
  1918
		 */
ali@56
  1919
		if (aline[i]=='.')
ali@56
  1920
		{
ali@56
  1921
		    if (i>2 && aline[i-2]=='.')
ali@69
  1922
			isellipsis=TRUE;
ali@56
  1923
		    if (i+2<llen && aline[i+2]=='.')
ali@69
  1924
			isellipsis=TRUE;
ali@56
  1925
		}
ali@56
  1926
		if (!isemptyline && !isellipsis)
ali@56
  1927
		{
ali@56
  1928
		    if (pswit[ECHO_SWITCH])
ali@56
  1929
			printf("\n%s\n",aline);
ali@56
  1930
		    if (!pswit[OVERVIEW_SWITCH])
ali@56
  1931
			printf("    Line %ld column %d - "
ali@56
  1932
			  "Spaced punctuation?\n",linecnt,i+1);
ali@56
  1933
		    else
ali@56
  1934
			cnt_punct++;
ali@56
  1935
		}
ali@56
  1936
	    }
ali@56
  1937
	}
ali@56
  1938
    }
ali@56
  1939
    /* Split out the characters that CANNOT be preceded by space. */
ali@56
  1940
    llen=strlen(aline);
ali@56
  1941
    for (i=1;i<llen;i++)
ali@56
  1942
    {
ali@56
  1943
	/* for each character in the line after the first */
ali@56
  1944
	if (strchr("?!,;:",aline[i]))
ali@56
  1945
	{
ali@56
  1946
	    /* if it's punctuation that _cannot_ have a space before it */
ali@56
  1947
	    if (aline[i-1]==CHAR_SPACE && !isemptyline &&
ali@56
  1948
	      aline[i+1]!=CHAR_SPACE)
ali@56
  1949
	    {
ali@56
  1950
		/*
ali@56
  1951
		 * If aline[i+1) DOES == space,
ali@56
  1952
		 * it was already reported just above.
ali@56
  1953
		 */
ali@56
  1954
		if (pswit[ECHO_SWITCH])
ali@56
  1955
		    printf("\n%s\n",aline);
ali@56
  1956
		if (!pswit[OVERVIEW_SWITCH])
ali@56
  1957
		    printf("    Line %ld column %d - Spaced punctuation?\n",
ali@56
  1958
		      linecnt,i+1);
ali@56
  1959
		else
ali@56
  1960
		    cnt_punct++;
ali@56
  1961
	    }
ali@56
  1962
	}
ali@56
  1963
    }
ali@56
  1964
    /*
ali@56
  1965
     * Special case " .X" where X is any alpha.
ali@56
  1966
     * This plugs a hole in the acronym code above.
ali@56
  1967
     * Inelegant, but maintainable.
ali@56
  1968
     */
ali@56
  1969
    llen=strlen(aline);
ali@56
  1970
    for (i=1;i<llen;i++)
ali@56
  1971
    {
ali@56
  1972
	/* for each character in the line after the first */
ali@56
  1973
	if (aline[i]=='.')
ali@56
  1974
	{
ali@56
  1975
	    /* if it's a period */
ali@56
  1976
	    if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
ali@56
  1977
	    {
ali@56
  1978
		/*
ali@56
  1979
		 * If the period follows a space and
ali@56
  1980
		 * is followed by a letter.
ali@56
  1981
		 */
ali@56
  1982
		if (pswit[ECHO_SWITCH])
ali@56
  1983
		    printf("\n%s\n",aline);
ali@56
  1984
		if (!pswit[OVERVIEW_SWITCH])
ali@56
  1985
		    printf("    Line %ld column %d - Spaced punctuation?\n",
ali@56
  1986
		      linecnt,i+1);
ali@56
  1987
		else
ali@56
  1988
		    cnt_punct++;
ali@56
  1989
	    }
ali@56
  1990
	}
ali@56
  1991
    }
ali@56
  1992
    for (i=1;i<llen;i++)
ali@56
  1993
    {
ali@56
  1994
	/* for each character in the line after the first */
ali@56
  1995
	if (aline[i]==CHAR_DQUOTE)
ali@56
  1996
	{
ali@56
  1997
	    if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
ali@56
  1998
	      !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
ali@56
  1999
	      !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
ali@56
  2000
	    {
ali@56
  2001
		if (pswit[ECHO_SWITCH])
ali@56
  2002
		    printf("\n%s\n",aline);
ali@56
  2003
		if (!pswit[OVERVIEW_SWITCH])
ali@56
  2004
		    printf("    Line %ld column %d - Unspaced quotes?\n",
ali@56
  2005
		      linecnt,i+1);
ali@56
  2006
		else
ali@56
  2007
		    cnt_punct++;
ali@56
  2008
	    }
ali@56
  2009
	}
ali@56
  2010
    }
ali@56
  2011
    /* Check parity of quotes. */
ali@56
  2012
    for (s=aline;*s;s++)
ali@56
  2013
    {
ali@56
  2014
	if (*s==CHAR_DQUOTE)
ali@56
  2015
	{
ali@56
  2016
	    parities->dquote=!parities->dquote;
ali@56
  2017
	    if (!parities->dquote)
ali@56
  2018
	    {
ali@56
  2019
		/* parity even */
ali@56
  2020
		if (!strchr("_-.'`/,;:!?)]} ",s[1]))
ali@56
  2021
		{
ali@56
  2022
		    if (pswit[ECHO_SWITCH])
ali@56
  2023
			printf("\n%s\n",aline);
ali@56
  2024
		    if (!pswit[OVERVIEW_SWITCH])
ali@56
  2025
			printf("    Line %ld column %d - "
ali@56
  2026
			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56
  2027
		    else
ali@56
  2028
			cnt_punct++;
ali@56
  2029
		}
ali@56
  2030
	    }
ali@56
  2031
	    else
ali@56
  2032
	    {
ali@56
  2033
		/* parity odd */
ali@56
  2034
		if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@56
  2035
		  !strchr("_-/.'`([{$",s[1]) || !s[1])
ali@56
  2036
		{
ali@56
  2037
		    if (pswit[ECHO_SWITCH])
ali@56
  2038
			printf("\n%s\n",aline);
ali@56
  2039
		    if (!pswit[OVERVIEW_SWITCH])
ali@56
  2040
			printf("    Line %ld column %d - "
ali@56
  2041
			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56
  2042
		    else
ali@56
  2043
			cnt_punct++;
ali@56
  2044
		}
ali@56
  2045
	    }
ali@56
  2046
	}
ali@56
  2047
    }
ali@56
  2048
    if (*aline==CHAR_DQUOTE)
ali@56
  2049
    {
ali@56
  2050
	if (strchr(",;:!?)]} ",aline[1]))
ali@56
  2051
	{
ali@56
  2052
	    if (pswit[ECHO_SWITCH])
ali@56
  2053
		printf("\n%s\n",aline);
ali@56
  2054
	    if (!pswit[OVERVIEW_SWITCH])
ali@56
  2055
		printf("    Line %ld column 1 - Wrongspaced quotes?\n",
ali@56
  2056
		  linecnt);
ali@56
  2057
	    else
ali@56
  2058
		cnt_punct++;
ali@56
  2059
	}
ali@56
  2060
    }
ali@56
  2061
    if (pswit[SQUOTE_SWITCH])
ali@56
  2062
    {
ali@56
  2063
	for (s=aline;*s;s++)
ali@56
  2064
	{
ali@56
  2065
	    if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
ali@56
  2066
	      (s==aline || s>aline && !gcisalpha(s[-1]) ||
ali@56
  2067
	      !gcisalpha(s[1])))
ali@56
  2068
	    {
ali@56
  2069
		parities->squote=!parities->squote;
ali@56
  2070
		if (!parities->squote)
ali@56
  2071
		{
ali@56
  2072
		    /* parity even */
ali@56
  2073
		    if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
ali@56
  2074
		    {
ali@56
  2075
			if (pswit[ECHO_SWITCH])
ali@56
  2076
			    printf("\n%s\n",aline);
ali@56
  2077
			if (!pswit[OVERVIEW_SWITCH])
ali@56
  2078
			    printf("    Line %ld column %d - "
ali@56
  2079
			      "Wrongspaced singlequotes?\n",
ali@56
  2080
			      linecnt,(int)(s-aline)+1);
ali@56
  2081
			else
ali@56
  2082
			    cnt_punct++;
ali@56
  2083
		    }
ali@56
  2084
		}
ali@56
  2085
		else
ali@56
  2086
		{
ali@56
  2087
		    /* parity odd */
ali@56
  2088
		    if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@56
  2089
		      !strchr("_-/\".'`",s[1]) || !s[1])
ali@56
  2090
		    {
ali@56
  2091
			if (pswit[ECHO_SWITCH])
ali@56
  2092
			    printf("\n%s\n",aline);
ali@56
  2093
			if (!pswit[OVERVIEW_SWITCH])
ali@56
  2094
			    printf("    Line %ld column %d - "
ali@56
  2095
			      "Wrongspaced singlequotes?\n",
ali@56
  2096
			      linecnt,(int)(s-aline)+1);
ali@56
  2097
			else
ali@56
  2098
			    cnt_punct++;
ali@56
  2099
		    }
ali@56
  2100
		}
ali@56
  2101
	    }
ali@56
  2102
	}
ali@56
  2103
    }
ali@56
  2104
}
ali@56
  2105
ali@55
  2106
/*
ali@57
  2107
 * check_for_double_punctuation:
ali@57
  2108
 *
ali@57
  2109
 * Look for double punctuation like ,. or ,,
ali@57
  2110
 * Thanks to DW for the suggestion!
ali@57
  2111
 * In books with references, ".," and ".;" are common
ali@57
  2112
 * e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57
  2113
 * OTOH, from my initial tests, there are also fairly
ali@57
  2114
 * common errors. What to do? Make these cases paranoid?
ali@57
  2115
 * ".," is the most common, so warnings->dotcomma is used
ali@57
  2116
 * to suppress detailed reporting if it occurs often.
ali@57
  2117
 */
ali@57
  2118
void check_for_double_punctuation(const char *aline,struct warnings *warnings)
ali@57
  2119
{
ali@57
  2120
    int i,llen;
ali@57
  2121
    llen=strlen(aline);
ali@57
  2122
    for (i=0;i<llen;i++)
ali@57
  2123
    {
ali@57
  2124
	/* for each punctuation character in the line */
ali@57
  2125
	if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&
ali@57
  2126
	  aline[i] && aline[i+1])
ali@57
  2127
	{
ali@57
  2128
	    /* followed by punctuation, it's a query, unless . . . */
ali@57
  2129
	    if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
ali@57
  2130
	      aline[i]=='!') ||
ali@57
  2131
	      !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
ali@57
  2132
	      warnings->isFrench && !strncmp(aline+i,",...",4) ||
ali@57
  2133
	      warnings->isFrench && !strncmp(aline+i,"...,",4) ||
ali@57
  2134
	      warnings->isFrench && !strncmp(aline+i,";...",4) ||
ali@57
  2135
	      warnings->isFrench && !strncmp(aline+i,"...;",4) ||
ali@57
  2136
	      warnings->isFrench && !strncmp(aline+i,":...",4) ||
ali@57
  2137
	      warnings->isFrench && !strncmp(aline+i,"...:",4) ||
ali@57
  2138
	      warnings->isFrench && !strncmp(aline+i,"!...",4) ||
ali@57
  2139
	      warnings->isFrench && !strncmp(aline+i,"...!",4) ||
ali@57
  2140
	      warnings->isFrench && !strncmp(aline+i,"?...",4) ||
ali@57
  2141
	      warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@57
  2142
	    {
ali@57
  2143
		if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
ali@57
  2144
		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||
ali@57
  2145
		  warnings->isFrench && !strncmp(aline+i,";...",4) ||
ali@57
  2146
		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||
ali@57
  2147
		  warnings->isFrench && !strncmp(aline+i,":...",4) ||
ali@57
  2148
		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||
ali@57
  2149
		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||
ali@57
  2150
		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||
ali@57
  2151
		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||
ali@57
  2152
		  warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@57
  2153
		    i+=4;
ali@57
  2154
		; /* do nothing for .. !! and ?? which can be legit */
ali@57
  2155
	    }
ali@57
  2156
	    else
ali@57
  2157
	    {
ali@57
  2158
		if (pswit[ECHO_SWITCH])
ali@57
  2159
		    printf("\n%s\n",aline);
ali@57
  2160
		if (!pswit[OVERVIEW_SWITCH])
ali@57
  2161
		    printf("    Line %ld column %d - Double punctuation?\n",
ali@57
  2162
		      linecnt,i+1);
ali@57
  2163
		else
ali@57
  2164
		    cnt_punct++;
ali@57
  2165
	    }
ali@57
  2166
	}
ali@57
  2167
    }
ali@57
  2168
}
ali@57
  2169
ali@57
  2170
/*
ali@58
  2171
 * check_for_spaced_quotes:
ali@58
  2172
 */
ali@58
  2173
void check_for_spaced_quotes(const char *aline)
ali@58
  2174
{
ali@58
  2175
    const char *s,*t;
ali@58
  2176
    s=aline;
ali@58
  2177
    while ((t=strstr(s," \" ")))
ali@58
  2178
    {
ali@58
  2179
	if (pswit[ECHO_SWITCH])
ali@58
  2180
	    printf("\n%s\n",aline);
ali@58
  2181
	if (!pswit[OVERVIEW_SWITCH])
ali@58
  2182
	    printf("    Line %ld column %d - Spaced doublequote?\n",
ali@58
  2183
	      linecnt,(int)(t-aline+1));
ali@58
  2184
	else
ali@58
  2185
	    cnt_punct++;
ali@58
  2186
	s=t+2;
ali@58
  2187
    }
ali@58
  2188
    s=aline;
ali@58
  2189
    while ((t=strstr(s," ' ")))
ali@58
  2190
    {
ali@58
  2191
	if (pswit[ECHO_SWITCH])
ali@58
  2192
	    printf("\n%s\n",aline);
ali@58
  2193
	if (!pswit[OVERVIEW_SWITCH])
ali@58
  2194
	    printf("    Line %ld column %d - Spaced singlequote?\n",
ali@58
  2195
	      linecnt,(int)(t-aline+1));
ali@58
  2196
	else
ali@58
  2197
	    cnt_punct++;
ali@58
  2198
	s=t+2;
ali@58
  2199
    }
ali@58
  2200
    s=aline;
ali@58
  2201
    while ((t=strstr(s," ` ")))
ali@58
  2202
    {
ali@58
  2203
	if (pswit[ECHO_SWITCH])
ali@58
  2204
	    printf("\n%s\n",aline);
ali@58
  2205
	if (!pswit[OVERVIEW_SWITCH])
ali@58
  2206
	    printf("    Line %ld column %d - Spaced singlequote?\n",
ali@58
  2207
	      linecnt,(int)(t-aline+1));
ali@58
  2208
	else
ali@58
  2209
	    cnt_punct++;
ali@58
  2210
	s=t+2;
ali@58
  2211
    }
ali@58
  2212
}
ali@58
  2213
ali@58
  2214
/*
ali@59
  2215
 * check_for_miscased_genative:
ali@59
  2216
 *
ali@59
  2217
 * Check special case of 'S instead of 's at end of word.
ali@59
  2218
 */
ali@59
  2219
void check_for_miscased_genative(const char *aline)
ali@59
  2220
{
ali@59
  2221
    const char *s;
ali@69
  2222
    if (!*aline)
ali@69
  2223
	return;
ali@59
  2224
    s=aline+1;
ali@59
  2225
    while (*s)
ali@59
  2226
    {
ali@59
  2227
	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
ali@59
  2228
	{
ali@59
  2229
	    if (pswit[ECHO_SWITCH])
ali@59
  2230
		printf("\n%s\n",aline);
ali@59
  2231
	    if (!pswit[OVERVIEW_SWITCH])
ali@59
  2232
		printf("    Line %ld column %d - Capital \"S\"?\n",
ali@59
  2233
		  linecnt,(int)(s-aline+2));
ali@59
  2234
	    else
ali@59
  2235
		cnt_punct++;
ali@59
  2236
	}
ali@59
  2237
	s++;
ali@59
  2238
    }
ali@59
  2239
}
ali@59
  2240
ali@59
  2241
/*
ali@60
  2242
 * check_end_of_line:
ali@60
  2243
 *
ali@60
  2244
 * Now check special cases - start and end of line -
ali@60
  2245
 * for single and double quotes. Start is sometimes [sic]
ali@60
  2246
 * but better to query it anyway.
ali@60
  2247
 * While we're here, check for dash at end of line.
ali@60
  2248
 */
ali@60
  2249
void check_end_of_line(const char *aline,struct warnings *warnings)
ali@60
  2250
{
ali@60
  2251
    int i,llen;
ali@60
  2252
    llen=strlen(aline);
ali@60
  2253
    if (llen>1)
ali@60
  2254
    {
ali@60
  2255
	if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
ali@60
  2256
	  aline[llen-1]==CHAR_OPEN_SQUOTE)
ali@60
  2257
	    if (aline[llen-2]==CHAR_SPACE)
ali@60
  2258
	    {
ali@60
  2259
		if (pswit[ECHO_SWITCH])
ali@60
  2260
		    printf("\n%s\n",aline);
ali@60
  2261
		if (!pswit[OVERVIEW_SWITCH])
ali@60
  2262
		    printf("    Line %ld column %d - Spaced quote?\n",
ali@60
  2263
		      linecnt,llen);
ali@60
  2264
		else
ali@60
  2265
		    cnt_punct++;
ali@60
  2266
	    }
ali@60
  2267
	if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
ali@60
  2268
	  aline[1]==CHAR_SPACE)
ali@60
  2269
	{
ali@60
  2270
	    if (pswit[ECHO_SWITCH])
ali@60
  2271
		printf("\n%s\n",aline);
ali@60
  2272
	    if (!pswit[OVERVIEW_SWITCH])
ali@60
  2273
		printf("    Line %ld column 1 - Spaced quote?\n",linecnt);
ali@60
  2274
	    else
ali@60
  2275
		cnt_punct++;
ali@60
  2276
	}
ali@60
  2277
	/*
ali@60
  2278
	 * Dash at end of line may well be legit - paranoid mode only
ali@60
  2279
	 * and don't report em-dash at line-end.
ali@60
  2280
	 */
ali@60
  2281
	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@60
  2282
	{
ali@60
  2283
	    for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@60
  2284
		;
ali@60
  2285
	    if (aline[i]=='-' && aline[i-1]!='-')
ali@60
  2286
	    {
ali@60
  2287
		if (pswit[ECHO_SWITCH])
ali@60
  2288
		    printf("\n%s\n",aline);
ali@60
  2289
		if (!pswit[OVERVIEW_SWITCH])
ali@60
  2290
		    printf("    Line %ld column %d - Hyphen at end of line?\n",
ali@60
  2291
		      linecnt,i);
ali@60
  2292
	    }
ali@60
  2293
	}
ali@60
  2294
    }
ali@60
  2295
}
ali@60
  2296
ali@60
  2297
/*
ali@61
  2298
 * check_for_unspaced_bracket:
ali@61
  2299
 *
ali@61
  2300
 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@61
  2301
 * If so, suspect a scanno like "a]most".
ali@61
  2302
 */
ali@61
  2303
void check_for_unspaced_bracket(const char *aline)
ali@61
  2304
{
ali@61
  2305
    int i,llen;
ali@61
  2306
    llen=strlen(aline);
ali@61
  2307
    for (i=1;i<llen-1;i++)
ali@61
  2308
    {
ali@61
  2309
	/* for each bracket character in the line except 1st & last */
ali@61
  2310
	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
ali@61
  2311
	  gcisalpha(aline[i+1]))
ali@61
  2312
	{
ali@61
  2313
	    if (pswit[ECHO_SWITCH])
ali@61
  2314
		printf("\n%s\n",aline);
ali@61
  2315
	    if (!pswit[OVERVIEW_SWITCH])
ali@61
  2316
		printf("    Line %ld column %d - Unspaced bracket?\n",
ali@61
  2317
		  linecnt,i);
ali@61
  2318
	    else
ali@61
  2319
		cnt_punct++;
ali@61
  2320
	}
ali@61
  2321
    }
ali@61
  2322
}
ali@61
  2323
ali@61
  2324
/*
ali@62
  2325
 * check_for_unpunctuated_endquote:
ali@62
  2326
 */
ali@62
  2327
void check_for_unpunctuated_endquote(const char *aline)
ali@62
  2328
{
ali@62
  2329
    int i,llen;
ali@62
  2330
    llen=strlen(aline);
ali@62
  2331
    for (i=1;i<llen;i++)
ali@62
  2332
    {
ali@62
  2333
	/* for each character in the line except 1st */
ali@62
  2334
	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@62
  2335
	{
ali@62
  2336
	    if (pswit[ECHO_SWITCH])
ali@62
  2337
		printf("\n%s\n",aline);
ali@62
  2338
	    if (!pswit[OVERVIEW_SWITCH])
ali@62
  2339
		printf("    Line %ld column %d - "
ali@62
  2340
		  "endquote missing punctuation?\n",linecnt,i);
ali@62
  2341
	    else
ali@62
  2342
		cnt_punct++;
ali@62
  2343
	}
ali@62
  2344
    }
ali@62
  2345
}
ali@62
  2346
ali@62
  2347
/*
ali@63
  2348
 * check_for_html_tag:
ali@63
  2349
 *
ali@63
  2350
 * Check for <HTML TAG>.
ali@63
  2351
 *
ali@63
  2352
 * If there is a < in the line, followed at some point
ali@63
  2353
 * by a > then we suspect HTML.
ali@63
  2354
 */
ali@63
  2355
void check_for_html_tag(const char *aline)
ali@63
  2356
{
ali@63
  2357
    int i;
ali@63
  2358
    const char *open,*close;
ali@63
  2359
    open=strstr(aline,"<");
ali@63
  2360
    if (open)
ali@63
  2361
    {
ali@63
  2362
	close=strstr(aline,">");
ali@63
  2363
	if (close)
ali@63
  2364
	{
ali@68
  2365
	    i=(int)(close-open+1);
ali@63
  2366
	    if (i>0)
ali@63
  2367
	    {
ali@63
  2368
		if (pswit[ECHO_SWITCH])
ali@63
  2369
		    printf("\n%s\n",aline);
ali@63
  2370
		if (!pswit[OVERVIEW_SWITCH])
ali@69
  2371
		    printf("    Line %ld column %d - HTML Tag? %*.*s \n",
ali@69
  2372
		      linecnt,(int)(open-aline)+1,i,i,open);
ali@63
  2373
		else
ali@63
  2374
		    cnt_html++;
ali@63
  2375
	    }
ali@63
  2376
	}
ali@63
  2377
    }
ali@63
  2378
}
ali@63
  2379
ali@63
  2380
/*
ali@64
  2381
 * check_for_html_entity:
ali@64
  2382
 *
ali@64
  2383
 * Check for &symbol; HTML.
ali@64
  2384
 *
ali@64
  2385
 * If there is a & in the line, followed at
ali@64
  2386
 * some point by a ; then we suspect HTML.
ali@64
  2387
 */
ali@64
  2388
void check_for_html_entity(const char *aline)
ali@64
  2389
{
ali@64
  2390
    int i;
ali@64
  2391
    const char *s,*amp,*scolon;
ali@64
  2392
    amp=strstr(aline,"&");
ali@64
  2393
    if (amp)
ali@64
  2394
    {
ali@64
  2395
	scolon=strstr(aline,";");
ali@64
  2396
	if (scolon)
ali@64
  2397
	{
ali@64
  2398
	    i=(int)(scolon-amp+1);
ali@64
  2399
	    for (s=amp;s<scolon;s++)   
ali@64
  2400
		if (*s==CHAR_SPACE)
ali@68
  2401
		    i=0;		/* Don't report "Jones & Son;" */
ali@64
  2402
	    if (i>0)
ali@64
  2403
	    {
ali@64
  2404
		if (pswit[ECHO_SWITCH])
ali@64
  2405
		    printf("\n%s\n",aline);
ali@64
  2406
		if (!pswit[OVERVIEW_SWITCH])
ali@69
  2407
		    printf("    Line %ld column %d - HTML symbol? %*.*s \n",
ali@69
  2408
		      linecnt,(int)(amp-aline)+1,i,i,amp);
ali@64
  2409
		else
ali@64
  2410
		    cnt_html++;
ali@64
  2411
	    }
ali@64
  2412
	}
ali@64
  2413
    }
ali@64
  2414
}
ali@64
  2415
ali@65
  2416
/*
ali@65
  2417
 * print_pending:
ali@65
  2418
 *
ali@65
  2419
 * If we are in a state of unbalanced quotes, and this line
ali@65
  2420
 * doesn't begin with a quote, output the stored error message.
ali@65
  2421
 * If the -P switch was used, print the warning even if the
ali@65
  2422
 * new para starts with quotes.
ali@65
  2423
 */
ali@65
  2424
void print_pending(const char *aline,const char *parastart,
ali@65
  2425
  struct pending *pending)
ali@65
  2426
{
ali@65
  2427
    const char *s;
ali@65
  2428
    s=aline;
ali@65
  2429
    while (*s==' ')
ali@65
  2430
	s++;
ali@69
  2431
    if (pending->dquote)
ali@69
  2432
    {
ali@65
  2433
	if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
ali@65
  2434
	{
ali@65
  2435
	    if (!pswit[OVERVIEW_SWITCH])
ali@65
  2436
	    {
ali@65
  2437
		if (pswit[ECHO_SWITCH])
ali@65
  2438
		    printf("\n%s\n",parastart);
ali@65
  2439
		puts(pending->dquote);
ali@65
  2440
	    }
ali@65
  2441
	    else
ali@65
  2442
		cnt_dquot++;
ali@65
  2443
	}
ali@69
  2444
	g_free(pending->dquote);
ali@69
  2445
	pending->dquote=NULL;
ali@69
  2446
    }
ali@69
  2447
    if (pending->squote)
ali@65
  2448
    {
ali@65
  2449
	if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
ali@65
  2450
	  pending->squot)
ali@65
  2451
	{
ali@65
  2452
	    if (!pswit[OVERVIEW_SWITCH])
ali@65
  2453
	    {
ali@65
  2454
		if (pswit[ECHO_SWITCH])
ali@65
  2455
		    printf("\n%s\n",parastart);
ali@65
  2456
		puts(pending->squote);
ali@65
  2457
	    }
ali@65
  2458
	    else
ali@65
  2459
		cnt_squot++;
ali@65
  2460
	}
ali@69
  2461
	g_free(pending->squote);
ali@69
  2462
	pending->squote=NULL;
ali@65
  2463
    }
ali@69
  2464
    if (pending->rbrack)
ali@65
  2465
    {
ali@65
  2466
	if (!pswit[OVERVIEW_SWITCH])
ali@65
  2467
	{
ali@65
  2468
	    if (pswit[ECHO_SWITCH])
ali@65
  2469
		printf("\n%s\n",parastart);
ali@65
  2470
	    puts(pending->rbrack);
ali@65
  2471
	}
ali@65
  2472
	else
ali@65
  2473
	    cnt_brack++;
ali@69
  2474
	g_free(pending->rbrack);
ali@69
  2475
	pending->rbrack=NULL;
ali@65
  2476
    }
ali@69
  2477
    if (pending->sbrack)
ali@65
  2478
    {
ali@65
  2479
	if (!pswit[OVERVIEW_SWITCH])
ali@65
  2480
	{
ali@65
  2481
	    if (pswit[ECHO_SWITCH])
ali@65
  2482
		printf("\n%s\n",parastart);
ali@65
  2483
	    puts(pending->sbrack);
ali@65
  2484
	}
ali@65
  2485
	else
ali@65
  2486
	    cnt_brack++;
ali@69
  2487
	g_free(pending->sbrack);
ali@69
  2488
	pending->sbrack=NULL;
ali@65
  2489
    }
ali@69
  2490
    if (pending->cbrack)
ali@65
  2491
    {
ali@65
  2492
	if (!pswit[OVERVIEW_SWITCH])
ali@65
  2493
	{
ali@65
  2494
	    if (pswit[ECHO_SWITCH])
ali@65
  2495
		printf("\n%s\n",parastart);
ali@65
  2496
	    puts(pending->cbrack);
ali@65
  2497
	}
ali@65
  2498
	else
ali@65
  2499
	    cnt_brack++;
ali@69
  2500
	g_free(pending->cbrack);
ali@69
  2501
	pending->cbrack=NULL;
ali@65
  2502
    }
ali@69
  2503
    if (pending->unders)
ali@65
  2504
    {
ali@65
  2505
	if (!pswit[OVERVIEW_SWITCH])
ali@65
  2506
	{
ali@65
  2507
	    if (pswit[ECHO_SWITCH])
ali@65
  2508
		printf("\n%s\n",parastart);
ali@65
  2509
	    puts(pending->unders);
ali@65
  2510
	}
ali@65
  2511
	else
ali@65
  2512
	    cnt_brack++;
ali@69
  2513
	g_free(pending->unders);
ali@69
  2514
	pending->unders=NULL;
ali@65
  2515
    }
ali@65
  2516
}
ali@65
  2517
ali@65
  2518
/*
ali@65
  2519
 * check_for_mismatched_quotes:
ali@65
  2520
 *
ali@65
  2521
 * At end of paragraph, check for mismatched quotes.
ali@65
  2522
 *
ali@65
  2523
 * We don't want to report an error immediately, since it is a
ali@65
  2524
 * common convention to omit the quotes at end of paragraph if
ali@65
  2525
 * the next paragraph is a continuation of the same speaker.
ali@65
  2526
 * Where this is the case, the next para should begin with a
ali@65
  2527
 * quote, so we store the warning message and only display it
ali@65
  2528
 * at the top of the next iteration if the new para doesn't
ali@65
  2529
 * start with a quote.
ali@65
  2530
 * The -p switch overrides this default, and warns of unclosed
ali@65
  2531
 * quotes on _every_ paragraph, whether the next begins with a
ali@65
  2532
 * quote or not.
ali@65
  2533
 */
ali@65
  2534
void check_for_mismatched_quotes(const struct counters *counters,
ali@65
  2535
  struct pending *pending)
ali@65
  2536
{
ali@65
  2537
    if (counters->quot%2)
ali@69
  2538
	pending->dquote=
ali@69
  2539
	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
ali@65
  2540
    if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
ali@65
  2541
      counters->open_single_quote!=counters->close_single_quote)
ali@69
  2542
	pending->squote=
ali@69
  2543
	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
ali@65
  2544
    if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
ali@65
  2545
      counters->open_single_quote!=counters->close_single_quote &&
ali@65
  2546
      counters->open_single_quote!=counters->close_single_quote+1)
ali@65
  2547
	/*
ali@65
  2548
	 * Flag it to be noted regardless of the
ali@65
  2549
	 * first char of the next para.
ali@65
  2550
	 */
ali@65
  2551
	pending->squot=1;
ali@65
  2552
    if (counters->r_brack)
ali@69
  2553
	pending->rbrack=
ali@69
  2554
	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
ali@65
  2555
    if (counters->s_brack)
ali@69
  2556
	pending->sbrack=
ali@69
  2557
	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
ali@65
  2558
    if (counters->c_brack)
ali@69
  2559
	pending->cbrack=
ali@69
  2560
	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
ali@65
  2561
    if (counters->c_unders%2)
ali@69
  2562
	pending->unders=
ali@69
  2563
	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
ali@65
  2564
}
ali@65
  2565
ali@64
  2566
/*
ali@66
  2567
 * check_for_omitted_punctuation:
ali@66
  2568
 *
ali@66
  2569
 * Check for omitted punctuation at end of paragraph by working back
ali@66
  2570
 * through prevline. DW.
ali@66
  2571
 * Need to check this only for "normal" paras.
ali@66
  2572
 * So what is a "normal" para?
ali@66
  2573
 *    Not normal if one-liner (chapter headings, etc.)
ali@66
  2574
 *    Not normal if doesn't contain at least one locase letter
ali@66
  2575
 *    Not normal if starts with space
ali@66
  2576
 */
ali@66
  2577
void check_for_omitted_punctuation(const char *prevline,
ali@66
  2578
  struct line_properties *last,int start_para_line)
ali@66
  2579
{
ali@66
  2580
    int i;
ali@66
  2581
    const char *s;
ali@66
  2582
    for (s=prevline,i=0;*s && !i;s++)
ali@66
  2583
	if (gcisletter(*s))
ali@66
  2584
	    /* use i to indicate the presence of a letter on the line */
ali@66
  2585
	    i=1;
ali@66
  2586
    /*
ali@66
  2587
     * This next "if" is a problem.
ali@66
  2588
     * If we say "start_para_line <= linecnt - 1", that includes
ali@66
  2589
     * one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@66
  2590
     * If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@66
  2591
     * misses genuine one-line paragraphs.
ali@66
  2592
     */
ali@66
  2593
    if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)
ali@66
  2594
    {
ali@66
  2595
	for (i=strlen(prevline)-1;
ali@66
  2596
	  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
ali@66
  2597
	  prevline[i]>CHAR_SPACE && i>0;
ali@66
  2598
	  i--)
ali@66
  2599
	    ;
ali@66
  2600
	for (;i>0;i--)
ali@66
  2601
	{
ali@66
  2602
	    if (gcisalpha(prevline[i]))
ali@66
  2603
	    {
ali@66
  2604
		if (pswit[ECHO_SWITCH])
ali@66
  2605
		    printf("\n%s\n",prevline);
ali@66
  2606
		if (!pswit[OVERVIEW_SWITCH])
ali@66
  2607
		    printf("    Line %ld column %d - "
ali@66
  2608
		      "No punctuation at para end?\n",
ali@68
  2609
		      linecnt-1,(int)strlen(prevline));
ali@66
  2610
		else
ali@66
  2611
		    cnt_punct++;
ali@66
  2612
		break;
ali@66
  2613
	    }
ali@66
  2614
	    if (strchr("-.:!([{?}])",prevline[i]))
ali@66
  2615
		break;
ali@66
  2616
	}
ali@66
  2617
    }
ali@66
  2618
}
ali@66
  2619
ali@69
  2620
/*
 * report_duplicate_queries:
 *
 * Tree traversal callback (passed to g_tree_foreach by procfile).
 * key is a word and value points to an int count; a note is printed
 * for each word with a non-zero count. data is unused.
 *
 * Returns: FALSE always, so that the traversal continues.
 */
gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
{
    int duplicates=*(int *)value;
    if (duplicates!=0)
	printf("\nNote: Queried word %s was duplicated %d times\n",
	  (const char *)key,duplicates);
    return FALSE;
}
ali@69
  2629
ali@66
  2630
/*
ali@41
  2631
 * procfile:
ali@41
  2632
 *
ali@41
  2633
 * Process one file.
ali@41
  2634
 */
ali@69
  2635
void procfile(const char *filename)
ali@41
  2636
{
ali@65
  2637
    const char *s;
ali@69
  2638
    gchar *parastart=NULL;	/* first line of current para */
ali@69
  2639
    gchar *etext,*aline;
ali@69
  2640
    gchar *etext_ptr;
ali@69
  2641
    GError *err=NULL;
ali@41
  2642
    struct first_pass_results *first_pass_results;
ali@42
  2643
    struct warnings *warnings;
ali@43
  2644
    struct counters counters={0};
ali@45
  2645
    struct line_properties last={0};
ali@56
  2646
    struct parities parities={0};
ali@69
  2647
    struct pending pending={0};
ali@69
  2648
    gboolean isemptyline;
ali@68
  2649
    long start_para_line=0;
ali@69
  2650
    gboolean isnewpara=FALSE,enddash=FALSE;
ali@45
  2651
    last.start=CHAR_SPACE;
ali@68
  2652
    linecnt=checked_linecnt=0;
ali@69
  2653
    etext=read_etext(filename,&err);
ali@69
  2654
    if (!etext)
ali@41
  2655
    {
ali@68
  2656
	if (pswit[STDOUT_SWITCH])
ali@69
  2657
	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
ali@68
  2658
	else
ali@69
  2659
	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
ali@41
  2660
	exit(1);
ali@41
  2661
    }
ali@41
  2662
    fprintf(stdout,"\n\nFile: %s\n\n",filename);
ali@69
  2663
    first_pass_results=first_pass(etext);
ali@42
  2664
    warnings=report_first_pass(first_pass_results);
ali@69
  2665
    qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
ali@69
  2666
    qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@40
  2667
    /*
ali@40
  2668
     * Here we go with the main pass. Hold onto yer hat!
ali@40
  2669
     */
ali@65
  2670
    linecnt=0;
ali@69
  2671
    etext_ptr=etext;
ali@69
  2672
    while ((aline=flgets(&etext_ptr,linecnt+1)))
ali@40
  2673
    {
ali@68
  2674
	linecnt++;
ali@68
  2675
	if (linecnt==1)
ali@69
  2676
	    isnewpara=TRUE;
ali@68
  2677
	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
ali@40
  2678
	    continue;    // skip DP page separators completely
ali@68
  2679
	if (linecnt<first_pass_results->firstline ||
ali@41
  2680
	  (first_pass_results->footerline>0 &&
ali@41
  2681
	  linecnt>first_pass_results->footerline))
ali@40
  2682
	{
ali@68
  2683
	    if (pswit[HEADER_SWITCH])
ali@40
  2684
	    {
ali@68
  2685
		if (!strncmp(aline,"Title:",6))
ali@68
  2686
		    printf("    %s\n",aline);
ali@68
  2687
		if (!strncmp(aline,"Author:",7))
ali@68
  2688
		    printf("    %s\n",aline);
ali@68
  2689
		if (!strncmp(aline,"Release Date:",13))
ali@68
  2690
		    printf("    %s\n",aline);
ali@68
  2691
		if (!strncmp(aline,"Edition:",8))
ali@68
  2692
		    printf("    %s\n\n",aline);
ali@40
  2693
	    }
ali@68
  2694
	    continue;		/* skip through the header */
ali@40
  2695
	}
ali@68
  2696
	checked_linecnt++;
ali@65
  2697
	print_pending(aline,parastart,&pending);
ali@65
  2698
	memset(&pending,0,sizeof(pending));
ali@43
  2699
	isemptyline=analyse_quotes(aline,&counters);
ali@68
  2700
	if (isnewpara && !isemptyline)
ali@40
  2701
	{
ali@40
  2702
	    /* This line is the start of a new paragraph. */
ali@68
  2703
	    start_para_line=linecnt;
ali@40
  2704
	    /* Capture its first line in case we want to report it later. */
ali@69
  2705
	    g_free(parastart);
ali@69
  2706
	    parastart=g_strdup(aline);
ali@56
  2707
	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
ali@68
  2708
	    s=aline;
ali@68
  2709
	    while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
ali@40
  2710
		s++;
ali@68
  2711
	    if (*s>='a' && *s<='z')
ali@40
  2712
	    {
ali@40
  2713
		/* and its first letter is lowercase */
ali@68
  2714
		if (pswit[ECHO_SWITCH])
ali@40
  2715
		    printf("\n%s\n",aline);
ali@68
  2716
		if (!pswit[OVERVIEW_SWITCH])
ali@68
  2717
		    printf("    Line %ld column %d - "
ali@40
  2718
		      "Paragraph starts with lower-case\n",
ali@40
  2719
		      linecnt,(int)(s-aline)+1);
ali@68
  2720
		else
ali@68
  2721
		    cnt_punct++;
ali@40
  2722
	    }
ali@69
  2723
	    isnewpara=FALSE; /* Signal the end of new para processing. */
ali@40
  2724
	}
ali@68
  2725
	/* Check for an em-dash broken at line end. */
ali@68
  2726
	if (enddash && *aline=='-')
ali@40
  2727
	{
ali@68
  2728
	    if (pswit[ECHO_SWITCH])
ali@40
  2729
		printf("\n%s\n",aline);
ali@68
  2730
	    if (!pswit[OVERVIEW_SWITCH])
ali@68
  2731
		printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@68
  2732
	    else
ali@68
  2733
		cnt_punct++;
ali@40
  2734
	}
ali@69
  2735
	enddash=FALSE;
ali@68
  2736
	for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
ali@40
  2737
	    ;
ali@68
  2738
	if (s>=aline && *s=='-')
ali@69
  2739
	    enddash=TRUE;
ali@67
  2740
	check_for_control_characters(aline);
ali@68
  2741
	if (warnings->bin)
ali@44
  2742
	    check_for_odd_characters(aline,warnings,isemptyline);
ali@68
  2743
	if (warnings->longline)
ali@45
  2744
	    check_for_long_line(aline);
ali@68
  2745
	if (warnings->shortline)
ali@45
  2746
	    check_for_short_line(aline,&last);
ali@68
  2747
	last.blen=last.len;
ali@68
  2748
	last.len=strlen(aline);
ali@68
  2749
	last.start=aline[0];
ali@46
  2750
	check_for_starting_punctuation(aline);
ali@68
  2751
	if (warnings->dash)
ali@40
  2752
	{
ali@47
  2753
	    check_for_spaced_emdash(aline);
ali@47
  2754
	    check_for_spaced_dash(aline);
ali@40
  2755
	}
ali@48
  2756
	check_for_unmarked_paragraphs(aline);
ali@49
  2757
	check_for_jeebies(aline);
ali@50
  2758
	check_for_mta_from(aline);
ali@51
  2759
	check_for_orphan_character(aline);
ali@52
  2760
	check_for_pling_scanno(aline);
ali@53
  2761
	check_for_extra_period(aline,warnings);
ali@54
  2762
	check_for_following_punctuation(aline);
ali@55
  2763
	check_for_typos(aline,warnings);
ali@56
  2764
	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57
  2765
	check_for_double_punctuation(aline,warnings);
ali@58
  2766
	check_for_spaced_quotes(aline);
ali@59
  2767
	check_for_miscased_genative(aline);
ali@60
  2768
	check_end_of_line(aline,warnings);
ali@61
  2769
	check_for_unspaced_bracket(aline);
ali@68
  2770
	if (warnings->endquote)
ali@62
  2771
	    check_for_unpunctuated_endquote(aline);
ali@63
  2772
	check_for_html_tag(aline);
ali@64
  2773
	check_for_html_entity(aline);
ali@68
  2774
	if (isemptyline)
ali@40
  2775
	{
ali@65
  2776
	    check_for_mismatched_quotes(&counters,&pending);
ali@43
  2777
	    memset(&counters,0,sizeof(counters));
ali@40
  2778
	    /* let the next iteration know that it's starting a new para */
ali@69
  2779
	    isnewpara=TRUE;
ali@69
  2780
	    if (prevline)
ali@69
  2781
		check_for_omitted_punctuation(prevline,&last,start_para_line);
ali@40
  2782
	}
ali@69
  2783
	g_free(prevline);
ali@69
  2784
	prevline=g_strdup(aline);
ali@0
  2785
    }
ali@69
  2786
    if (prevline)
ali@69
  2787
    {
ali@69
  2788
	g_free(prevline);
ali@69
  2789
	prevline=NULL;
ali@69
  2790
    }
ali@69
  2791
    g_free(parastart);
ali@69
  2792
    g_free(prevline);
ali@69
  2793
    g_free(etext);
ali@0
  2794
    if (!pswit[OVERVIEW_SWITCH])
ali@69
  2795
	g_tree_foreach(qword,report_duplicate_queries,NULL);
ali@69
  2796
    g_tree_unref(qword);
ali@69
  2797
    g_tree_unref(qperiod);
ali@0
  2798
}
ali@0
  2799
ali@40
  2800
/*
ali@40
  2801
 * flgets:
ali@40
  2802
 *
ali@69
  2803
 * Get one line from the input text, checking for
ali@40
  2804
 * the existence of exactly one CR/LF line-end per line.
ali@40
  2805
 *
ali@40
  2806
 * Returns: a pointer to the line.
ali@40
  2807
 */
ali@69
  2808
char *flgets(char **etext,long lcnt)
ali@0
  2809
{
ali@0
  2810
    char c;
ali@69
  2811
    int len;
ali@69
  2812
    gboolean isCR=FALSE;
ali@69
  2813
    char *theline=*etext;
ali@69
  2814
    len=0;
ali@69
  2815
    for(;;)
ali@40
  2816
    {
ali@69
  2817
	c=*(*etext)++;
ali@69
  2818
	if (!c)
ali@68
  2819
	    return NULL;
ali@40
  2820
	/* either way, it's end of line */
ali@69
  2821
	if (c=='\n')
ali@40
  2822
	{
ali@68
  2823
	    if (isCR)
ali@68
  2824
		break;
ali@68
  2825
	    else
ali@40
  2826
	    {
ali@40
  2827
		/* Error - a LF without a preceding CR */
ali@68
  2828
		if (pswit[LINE_END_SWITCH])
ali@40
  2829
		{
ali@68
  2830
		    if (pswit[ECHO_SWITCH])
ali@69
  2831
			printf("\n%*.*s\n",len,len,theline);
ali@68
  2832
		    if (!pswit[OVERVIEW_SWITCH])
ali@68
  2833
			printf("    Line %ld - No CR?\n",lcnt);
ali@68
  2834
		    else
ali@68
  2835
			cnt_lineend++;
ali@40
  2836
		}
ali@68
  2837
		break;
ali@40
  2838
	    }
ali@40
  2839
	}
ali@69
  2840
	if (c=='\r')
ali@40
  2841
	{
ali@68
  2842
	    if (isCR)
ali@40
  2843
	    {
ali@40
  2844
		/* Error - two successive CRs */
ali@68
  2845
		if (pswit[LINE_END_SWITCH])
ali@40
  2846
		{
ali@68
  2847
		    if (pswit[ECHO_SWITCH])
ali@69
  2848
			printf("\n%*.*s\n",len,len,theline);
ali@68
  2849
		    if (!pswit[OVERVIEW_SWITCH])
ali@68
  2850
			printf("    Line %ld - Two successive CRs?\n",lcnt);
ali@68
  2851
		    else
ali@68
  2852
			cnt_lineend++;
ali@40
  2853
		}
ali@40
  2854
	    }
ali@69
  2855
	    isCR=TRUE;
ali@40
  2856
	}
ali@68
  2857
	else
ali@40
  2858
	{
ali@68
  2859
	    if (pswit[LINE_END_SWITCH] && isCR)
ali@40
  2860
	    {
ali@68
  2861
		if (pswit[ECHO_SWITCH])
ali@69
  2862
		    printf("\n%*.*s\n",len,len,theline);
ali@68
  2863
		if (!pswit[OVERVIEW_SWITCH])
ali@68
  2864
		    printf("    Line %ld column %d - CR without LF?\n",
ali@40
  2865
		      lcnt,len+1);
ali@68
  2866
		else
ali@68
  2867
		    cnt_lineend++;
ali@69
  2868
		theline[len]=' ';
ali@40
  2869
	    }
ali@69
  2870
	    isCR=FALSE;
ali@68
  2871
	    len++;
ali@40
  2872
	}
ali@69
  2873
    }
ali@69
  2874
    theline[len]='\0';
ali@0
  2875
    if (pswit[MARKUP_SWITCH])  
ali@68
  2876
	postprocess_for_HTML(theline);
ali@0
  2877
    if (pswit[DP_SWITCH])  
ali@68
  2878
	postprocess_for_DP(theline);
ali@40
  2879
    return theline;
ali@0
  2880
}
ali@0
  2881
ali@40
  2882
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it contains a
 * mixture of letters and digits. Generally, this is an error, but the
 * following common forms are accepted as legitimate:
 *
 *   - leading digits followed by an ordinal suffix of either case,
 *     such as 4th, 21st, 2nds or 1stly;
 *   - leading digits followed by a single l, L, s or d (pre-decimal
 *     British currency, as in "12l. 3s. 11d.");
 *   - a capital L followed by something that is not itself queried,
 *     the L standing in for a pound sign as in L500.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(const char *checkword)
{
    static const char *const ordinal_suffix[]={
	"st","rd","nd","th",
	"sts","rds","nds","ths",
	"stly","rdly","ndly","thly"
    };
    int wehaveadigit=0,wehavealetter=0;
    size_t i;
    const char *s,*tail;
    for (s=checkword;*s;s++)
	if (gcisalpha(*s))
	    wehavealetter=1;
	else if (gcisdigit(*s))
	    wehaveadigit=1;
    if (!wehaveadigit || !wehavealetter)
	return 0;
    /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
    for (tail=checkword;gcisdigit(*tail);tail++)
	;
    /*
     * tail is now everything after the leading digits. Comparing the
     * whole of it against each suffix checks the length and the text in
     * one go, and never looks outside the word.
     */
    for (i=0;i<sizeof ordinal_suffix/sizeof ordinal_suffix[0];i++)
	if (!g_ascii_strcasecmp(tail,ordinal_suffix[i]))
	    return 0;
    /* digits, ending in l, L, s or d */
    if (tail[0] && !tail[1] && strchr("lLsd",tail[0]))
	return 0;
    /*
     * L at the start of a number, representing British pounds, like L500.
     * We know the current word is mixed. If the first letter is L, there
     * must be at least one digit following. If both digits and letters
     * follow, the recursive call decides whether that remainder is
     * acceptable; if only digits follow, it reports no error and we
     * accept the word.
     */
    if (checkword[0]=='L' && !mixdigit(checkword+1))
	return 0;
    return 1;
}
ali@0
  2941
ali@40
  2942
/*
ali@40
  2943
 * getaword:
ali@40
  2944
 *
ali@69
  2945
 * Extracts the first/next "word" from the line, and returns it.
ali@69
  2946
 * A word is defined as one English word unit--or at least that's the aim.
ali@69
  2947
 * "ptr" is advanced to the position in the line where we will start
ali@69
  2948
 * looking for the next word.
ali@40
  2949
 *
ali@69
  2950
 * Returns: A newly-allocated string.
ali@40
  2951
 */
ali@69
  2952
gchar *getaword(const char **ptr)
ali@0
  2953
{
ali@69
  2954
    int i;
ali@54
  2955
    const char *s;
ali@69
  2956
    GString *word;
ali@69
  2957
    word=g_string_new(NULL);
ali@69
  2958
    for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)
ali@40
  2959
	;
ali@40
  2960
    /*
ali@40
  2961
     * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40
  2962
     * Especially yucky is the case of L1,000
ali@40
  2963
     * This section looks for a pattern of characters including a digit
ali@40
  2964
     * followed by a comma or period followed by one or more digits.
ali@40
  2965
     * If found, it returns this whole pattern as a word; otherwise we discard
ali@40
  2966
     * the results and resume our normal programming.
ali@40
  2967
     */
ali@69
  2968
    s=*ptr;
ali@69
  2969
    for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)
ali@69
  2970
	g_string_append_c(word,*s);
ali@69
  2971
    for (i=1;i+1<word->len;i++)
ali@40
  2972
    {
ali@69
  2973
	if (word->str[i]=='.' || word->str[i]==',')
ali@40
  2974
	{
ali@69
  2975
	    if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1]))
ali@40
  2976
	    {
ali@69
  2977
		*ptr=s;
ali@69
  2978
		return g_string_free(word,FALSE);
ali@40
  2979
	    }
ali@40
  2980
	}
ali@40
  2981
    }
ali@0
  2982
    /* we didn't find a punctuated number - do the regular getword thing */
ali@69
  2983
    g_string_truncate(word,0);
ali@69
  2984
    for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)
ali@69
  2985
	g_string_append_c(word,**ptr);
ali@69
  2986
    return g_string_free(word,FALSE);
ali@0
  2987
}
ali@0
  2988
ali@40
  2989
/*
ali@40
  2990
 * isroman:
ali@40
  2991
 *
ali@40
  2992
 * Is this word a Roman Numeral?
ali@40
  2993
 *
ali@40
  2994
 * It doesn't actually validate that the number is a valid Roman Numeral--for
ali@40
  2995
 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
ali@40
  2996
 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
ali@40
  2997
 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
ali@40
  2998
 * expressions thereof, except when it came to taxes. Allow any number of M,
ali@40
  2999
 * an optional D, an optional CM or CD, any number of optional Cs, an optional
ali@40
  3000
 * XL or an optional XC, an optional IX or IV, an optional V and any number
ali@40
  3001
 * of optional Is.
ali@40
  3002
 */
ali@69
  3003
gboolean isroman(const char *t)
ali@0
  3004
{
ali@69
  3005
    const char *s;
ali@40
  3006
    if (!t || !*t)
ali@69
  3007
	return FALSE;
ali@40
  3008
    s=t;
ali@40
  3009
    while (*t=='m' && *t)
ali@40
  3010
	t++;
ali@40
  3011
    if (*t=='d')
ali@40
  3012
	t++;
ali@40
  3013
    if (*t=='c' && t[1]=='m')
ali@40
  3014
	t+=2;
ali@40
  3015
    if (*t=='c' && t[1]=='d')
ali@40
  3016
	t+=2;
ali@40
  3017
    while (*t=='c' && *t)
ali@40
  3018
	t++;
ali@40
  3019
    if (*t=='x' && t[1]=='l')
ali@40
  3020
	t+=2;
ali@40
  3021
    if (*t=='x' && t[1]=='c')
ali@40
  3022
	t+=2;
ali@40
  3023
    if (*t=='l')
ali@40
  3024
	t++;
ali@40
  3025
    while (*t=='x' && *t)
ali@40
  3026
	t++;
ali@40
  3027
    if (*t=='i' && t[1]=='x')
ali@40
  3028
	t+=2;
ali@40
  3029
    if (*t=='i' && t[1]=='v')
ali@40
  3030
	t+=2;
ali@40
  3031
    if (*t=='v')
ali@40
  3032
	t++;
ali@40
  3033
    while (*t=='i' && *t)
ali@40
  3034
	t++;
ali@40
  3035
    return !*t;
ali@0
  3036
}
ali@0
  3037
ali@40
  3038
/*
ali@40
  3039
 * gcisalpha:
ali@40
  3040
 *
ali@40
  3041
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
ali@40
  3042
 * If we use the standard function, 8-bit accented characters break
ali@40
  3043
 * words, so that tete with accented characters appears to be two words, "t"
ali@40
  3044
 * and "t", with 8-bit characters between them. This causes over-reporting of
ali@40
  3045
 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
ali@40
  3046
 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
ali@40
  3047
 */
ali@69
  3048
gboolean gcisalpha(unsigned char c)
ali@0
  3049
{
ali@40
  3050
    if (c>='a' && c<='z')
ali@69
  3051
	return TRUE;
ali@40
  3052
    if (c>='A' && c<='Z')
ali@69
  3053
	return TRUE;
ali@40
  3054
    if (c<140)
ali@69
  3055
	return FALSE;
ali@40
  3056
    if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
ali@69
  3057
	return TRUE;
ali@40
  3058
    if (c==140 || c==142 || c==156 || c==158 || c==159)
ali@69
  3059
	return TRUE;
ali@69
  3060
    return FALSE;
ali@0
  3061
}
ali@0
  3062
ali@40
  3063
/*
ali@40
  3064
 * gcisdigit:
ali@40
  3065
 *
ali@40
  3066
 * A version of isdigit() that doesn't get confused in 8-bit texts.
ali@40
  3067
 */
ali@69
  3068
gboolean gcisdigit(unsigned char c)
ali@0
  3069
{   
ali@40
  3070
    return c>='0' && c<='9';
ali@0
  3071
}
ali@0
  3072
ali@40
  3073
/*
ali@40
  3074
 * gcisletter:
ali@40
  3075
 *
ali@40
  3076
 * A version of isletter() that doesn't get confused in 8-bit texts.
ali@40
  3077
 * NB: this is ISO-8891-1-specific.
ali@40
  3078
 */
ali@69
  3079
gboolean gcisletter(unsigned char c)
ali@0
  3080
{   
ali@40
  3081
    return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
ali@0
  3082
}
ali@0
  3083
ali@40
  3084
/*
ali@40
  3085
 * postprocess_for_DP:
ali@40
  3086
 *
ali@40
  3087
 * Invoked with the -d switch from flgets().
ali@40
  3088
 * It simply "removes" from the line a hard-coded set of common
ali@40
  3089
 * DP-specific tags, so that the line passed to the main routine has
ali@40
  3090
 * been pre-cleaned of DP markup.
ali@40
  3091
 */
ali@0
  3092
void postprocess_for_DP(char *theline)
ali@0
  3093
{
ali@40
  3094
    char *s,*t;
ali@0
  3095
    int i;
ali@0
  3096
    if (!*theline) 
ali@68
  3097
	return;
ali@40
  3098
    for (i=0;*DPmarkup[i];i++)
ali@40
  3099
    {
ali@68
  3100
	s=strstr(theline,DPmarkup[i]);
ali@68
  3101
	while (s)
ali@40
  3102
	{
ali@68
  3103
	    t=s+strlen(DPmarkup[i]);
ali@68
  3104
	    while (*t)
ali@40
  3105
	    {
ali@68
  3106
		*s=*t;
ali@68
  3107
		t++;
ali@40
  3108
		s++;
ali@40
  3109
	    }
ali@68
  3110
	    *s=0;
ali@68
  3111
	    s=strstr(theline,DPmarkup[i]);
ali@40
  3112
	}
ali@40
  3113
    }
ali@0
  3114
}
ali@0
  3115
ali@40
  3116
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Pre-cleans the line of HTML, in place, before it is passed to the
 * main routine: losemarkup() is called until it has nothing more to
 * remove, then loseentities() until it has nothing more to replace.
 */
void postprocess_for_HTML(char *theline)
{
    /* A tag needs both delimiters, so only look when both are present. */
    if (strchr(theline,'<') && strchr(theline,'>'))
    {
	while (losemarkup(theline))
	    continue;
    }
    while (loseentities(theline))
	continue;
}
ali@0
  3133
ali@0
  3134
char *losemarkup(char *theline)
ali@0
  3135
{
ali@40
  3136
    char *s,*t;
ali@0
  3137
    int i;
ali@0
  3138
    if (!*theline) 
ali@68
  3139
	return NULL;
ali@40
  3140
    s=strstr(theline,"<");
ali@40
  3141
    t=strstr(theline,">");
ali@40
  3142
    if (!s || !t)
ali@40
  3143
	return NULL;
ali@40
  3144
    for (i=0;*markup[i];i++)
ali@68
  3145
	if (!tagcomp(s+1,markup[i]))
ali@40
  3146
	{
ali@68
  3147
	    if (!t[1])
ali@40
  3148
	    {
ali@68
  3149
		*s=0;
ali@68
  3150
		return s;
ali@40
  3151
	    }
ali@68
  3152
	    else if (t>s)
ali@40
  3153
	    {
ali@40
  3154
		strcpy(s,t+1);
ali@40
  3155
		return s;
ali@40
  3156
	    }
ali@68
  3157
	}
ali@40
  3158
    /* It's an unrecognized <xxx>. */
ali@40
  3159
    return NULL;
ali@0
  3160
}
ali@0
  3161
ali@0
  3162
char *loseentities(char *theline)
ali@0
  3163
{
ali@0
  3164
    int i;
ali@40
  3165
    char *s,*t;
ali@0
  3166
    if (!*theline) 
ali@68
  3167
	return NULL;
ali@40
  3168
    for (i=0;*entities[i].htmlent;i++)
ali@40
  3169
    {
ali@68
  3170
	s=strstr(theline,entities[i].htmlent);
ali@68
  3171
	if (s)
ali@40
  3172
	{
ali@68
  3173
	    t=malloc((size_t)strlen(s));
ali@68
  3174
	    if (!t)
ali@40
  3175
		return NULL;
ali@68
  3176
	    strcpy(t,s+strlen(entities[i].htmlent));
ali@68
  3177
	    strcpy(s,entities[i].textent);
ali@68
  3178
	    strcat(s,t);
ali@68
  3179
	    free(t);
ali@68
  3180
	    return theline;
ali@40
  3181
	}
ali@40
  3182
    }
ali@40
  3183
    for (i=0;*entities[i].htmlnum;i++)
ali@40
  3184
    {
ali@68
  3185
	s=strstr(theline,entities[i].htmlnum);
ali@68
  3186
	if (s)
ali@40
  3187
	{
ali@68
  3188
	    t=malloc((size_t)strlen(s));
ali@68
  3189
	    if (!t)
ali@40
  3190
		return NULL;
ali@68
  3191
	    strcpy(t,s+strlen(entities[i].htmlnum));
ali@68
  3192
	    strcpy(s,entities[i].textent);
ali@68
  3193
	    strcat(s,t);
ali@68
  3194
	    free(t);
ali@68
  3195
	    return theline;
ali@40
  3196
	}
ali@40
  3197
    }
ali@40
  3198
    return NULL;
ali@0
  3199
}
ali@0
  3200
ali@69
  3201
/*
 * tagcomp:
 *
 * Compares the text following a '<' ("strin") with a tag name
 * ("basetag"), ignoring case and one optional leading slash in strin.
 * The comparison stops at the end of whichever string is shorter, so
 * this is a prefix match: "body>" matches "b", and an empty strin
 * matches anything.
 *
 * Returns: 0 if the strings agree as far as the comparison goes,
 * 1 at the first difference.
 */
int tagcomp(const char *strin,const char *basetag)
{
    /*
     * Work in unsigned char: passing a negative plain char (as found in
     * 8-bit texts) to tolower() is undefined.
     */
    const unsigned char *s,*t;
    s=(const unsigned char *)basetag;
    t=(const unsigned char *)strin;
    if (*t=='/')
	t++; /* ignore a slash */
    while (*s && *t)
    {
	if (tolower(*s)!=tolower(*t))
	    return 1;
	s++;
	t++;
    }
    return 0;
}
ali@0
  3217
ali@69
  3218
/*
 * proghelp:
 *
 * Writes the program banner, copyright and licence notices, the option
 * summary generated by GLib for "context", a sample invocation and a
 * short description of what bookloupe checks, all to stderr.
 */
void proghelp(GOptionContext *context)
{
    gchar *help;
    fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
    fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
    fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
    fputs("Bookloupe comes with ABSOLUTELY NO WARRANTY. "
      "For details, read the file COPYING.\n",stderr);
    fputs("This is Free Software; "
      "you may redistribute it under certain conditions (GPL);\n",stderr);
    fputs("read the file COPYING for details.\n\n",stderr);
    /* The help text is allocated for us and must be freed after use. */
    help=g_option_context_get_help(context,TRUE,NULL);
    fputs(help,stderr);
    g_free(help);
    fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
    fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
      "non-ASCII\n",stderr);
    fputs("characters like accented letters, "
      "lines longer than 75 or shorter than 55,\n",stderr);
    fputs("unbalanced quotes or brackets, "
      "a variety of badly formatted punctuation, \n",stderr);
    fputs("HTML tags, some likely typos. "
      "It is NOT a substitute for human judgement.\n",stderr);
    fputs("\n",stderr);
}