ali@0
|
1 |
/*************************************************************************/
|
ali@0
|
2 |
/* gutcheck - check for assorted weirdnesses in a PG candidate text file */
|
ali@0
|
3 |
/* */
|
ali@0
|
4 |
/* Version 0.991 */
|
ali@0
|
5 |
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
|
ali@0
|
6 |
/* */
|
ali@0
|
7 |
/* This program is free software; you can redistribute it and/or modify */
|
ali@0
|
8 |
/* it under the terms of the GNU General Public License as published by */
|
ali@0
|
9 |
/* the Free Software Foundation; either version 2 of the License, or */
|
ali@0
|
10 |
/* (at your option) any later version. */
|
ali@0
|
11 |
/* */
|
ali@0
|
12 |
/* This program is distributed in the hope that it will be useful, */
|
ali@0
|
13 |
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
ali@0
|
14 |
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
ali@0
|
15 |
/* GNU General Public License for more details. */
|
ali@0
|
16 |
/* */
|
ali@0
|
17 |
/* You should have received a copy of the GNU General Public License */
|
ali@0
|
18 |
/* along with this program; if not, write to the */
|
ali@0
|
19 |
/* Free Software Foundation, Inc., */
|
ali@0
|
20 |
/* 59 Temple Place, */
|
ali@0
|
21 |
/* Suite 330, */
|
ali@0
|
22 |
/* Boston, MA 02111-1307 USA */
|
ali@0
|
23 |
/* */
|
ali@0
|
24 |
/* */
|
ali@0
|
25 |
/* */
|
ali@0
|
26 |
/* Overview comments: */
|
ali@0
|
27 |
/* */
|
ali@0
|
28 |
/* If you're reading this, you're either interested in how to detect */
|
ali@0
|
29 |
/* formatting errors, or very very bored. */
|
ali@0
|
30 |
/* */
|
ali@0
|
31 |
/* Gutcheck is a homebrew formatting checker specifically for */
|
ali@0
|
32 |
/* spotting common formatting problems in a PG e-text. I typically */
|
ali@0
|
33 |
/* run it once or twice on a file I'm about to submit; it usually */
|
ali@0
|
34 |
/* finds a few formatting problems. It also usually finds lots of */
|
ali@0
|
35 |
/* queries that aren't problems at all; it _really_ doesn't like */
|
ali@0
|
36 |
/* the standard PG header, for example. It's optimized for straight */
|
ali@0
|
37 |
/* prose; poetry and non-fiction involving tables tend to trigger */
|
ali@0
|
38 |
/* false alarms. */
|
ali@0
|
39 |
/* */
|
ali@0
|
40 |
/* The code of gutcheck is not very interesting, but the experience */
|
ali@0
|
41 |
/* of what constitutes a possible error may be, and the best way to */
|
ali@0
|
42 |
/* illustrate that is by example. */
|
ali@0
|
43 |
/* */
|
ali@0
|
44 |
/* */
|
ali@0
|
45 |
/* Here are some common typos found in PG texts that gutcheck */
|
ali@0
|
46 |
/* will flag as errors: */
|
ali@0
|
47 |
/* */
|
ali@0
|
48 |
/* "Look!John , over there!" */
|
ali@0
|
49 |
/* <this is a HTML tag> */
|
ali@0
|
50 |
/* &so is this; */
|
ali@0
|
51 |
/* Margaret said: " Now you should start for school." */
|
ali@0
|
52 |
/* Margaret said: "Now you should start for school. (if end of para) */
|
ali@0
|
53 |
/* The horse is said to he worth a lot. */
|
ali@0
|
54 |
/* 0K - this'11 make you look close1y. */
|
ali@0
|
55 |
/* "If you do. you'll regret it!" */
|
ali@0
|
56 |
/* */
|
ali@0
|
57 |
/* There are some complications . The extra space left around that */
|
ali@0
|
58 |
/* period was an error . . . but that ellipsis wasn't. */
|
ali@0
|
59 |
/* */
|
ali@0
|
60 |
/* The last line of a paragraph */
|
ali@0
|
61 |
/* is usually short. */
|
ali@0
|
62 |
/* */
|
ali@0
|
63 |
/* This period is an error.But the periods in a.m. aren't. */
|
ali@0
|
64 |
/* */
|
ali@0
|
65 |
/* Checks that are do-able but not (well) implemented are: */
|
ali@0
|
66 |
/* Single-quote checking. */
|
ali@0
|
67 |
/* Despite 3 attempts at it, singlequote checking is still */
|
ali@0
|
68 |
/* crap in gutcheck. It may not be possible without analysis */
|
ali@0
|
69 |
/* of the whole paragraph. */
|
ali@0
|
70 |
/* */
|
ali@0
|
71 |
/*************************************************************************/
|
ali@0
|
72 |
|
ali@0
|
73 |
|
ali@0
|
74 |
#include <stdio.h>
|
ali@0
|
75 |
#include <stdlib.h>
|
ali@0
|
76 |
#include <string.h>
|
ali@0
|
77 |
#include <ctype.h>
|
ali@0
|
78 |
|
ali@0
|
79 |
#define MAXWORDLEN 80 /* max length of one word */
|
ali@0
|
80 |
#define LINEBUFSIZE 2048 /* buffer size for an input line */
|
ali@0
|
81 |
|
ali@0
|
82 |
#define MAX_USER_TYPOS 1000
|
ali@0
|
83 |
#define USERTYPO_FILE "gutcheck.typ"
|
ali@0
|
84 |
|
ali@0
|
85 |
#ifndef MAX_PATH
|
ali@0
|
86 |
#define MAX_PATH 16384
|
ali@0
|
87 |
#endif
|
ali@0
|
88 |
|
ali@0
|
89 |
char aline[LINEBUFSIZE];
|
ali@0
|
90 |
char prevline[LINEBUFSIZE];
|
ali@0
|
91 |
|
ali@0
|
92 |
/*
 * Common typos.
 *
 * Every entry is lower case.  The table is terminated by an empty
 * string rather than by a count.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet",
    "ane", "nad",
    "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om",
    "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt",
    "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
    "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo",
    "fromt", "fwe", "gaurd", "gerat", "goign",
    "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat",
    "hge", "hsa", "hsi", "hte", "htere",
    "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits",
    "int", "iwll", "iwth", "jsut", "loev",
    "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk",
    "owuld", "peice", "peices", "peolpe", "peopel", "perhasp",
    "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae",
    "smoe", "sohw", "stnad", "stopry",
    "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis",
    "theri", "theyll", "thgat", "thge",
    "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe",
    "thta", "timne", "tirne", "tkae",
    "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht",
    "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
    "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull",
    "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet",
    "wroking", "wtih", "wuould", "wya", "yera",
    "yeras", "yersa", "yoiu", "youve", "ytou", "yuor",

    /* h/b confusions: added for version 12; a few with "tbe" removed in v.25 */
    "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly",
    "bas", "bave", "baving", "bebind",
    "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome",
    "bouse", "bowever", "buge", "dehates",
    "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing",
    "helieve", "henefit", "hetter", "hetween", "heyond", "hig",
    "higber", "huild", "huy", "hy", "jobn", "joh",
    "meanwbile", "memher", "memhers", "numher", "numhers",
    "perbaps", "prohlem", "puhlic", "witbout",

    /* a few more, added for .18 */
    "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise",
    "prornised", "modem", "bo",
    "heside", "chapteb", "chaptee", "se",

    ""                                  /* end-of-table marker */
};
|
ali@0
|
119 |
|
ali@0
|
120 |
char *usertypo[MAX_USER_TYPOS];
|
ali@0
|
121 |
|
ali@0
|
122 |
/*
 * Common abbreviations and other acceptable words that must not be
 * queried as typos.  ("ms" was removed at the last minute for 0.99.)
 * The table ends with an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr",
    "hmm", "h'm", "hmmm", "rd", "sh", "br",
    "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii", "hawaii", "hawaiian",
    "hotbed", "heartbeat", "heartbeats", "outbid", "outbids",
    "frostbite", "frostbitten",
    ""                                  /* end-of-table marker */
};
|
ali@0
|
128 |
|
ali@0
|
129 |
/*
 * Common abbreviations that account for otherwise unexplained periods.
 * The table ends with an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid",
    "ed", "al", "etc", "op", "cit",
    "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
    ""                                  /* end-of-table marker */
};
|
ali@0
|
133 |
/*
 * Two-letter combinations that rarely, if ever, begin a word but are
 * common scannos or otherwise common letter combinations.
 * The table ends with an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl",
    "tn", "rn", "lt", "tj",
    ""                                  /* end-of-table marker */
};
|
ali@0
|
139 |
|
ali@0
|
140 |
/*
 * Two-letter combinations that rarely, if ever, end a word but are
 * common scannos or otherwise common letter combinations.
 * The table ends with an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb",
    "wh", "fr", "br", "qu", "tw", "gl",
    "fl", "sw", "gr", "sl", "cl",
    "iy",
    ""                                  /* end-of-table marker */
};
|
ali@0
|
147 |
|
ali@0
|
148 |
/*
 * Lower-case HTML element names, terminated by an empty string.
 * NOTE(review): presumably the tags recognised when markup in < > is
 * being ignored (the M switch) -- confirm against the code that reads
 * this table.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center",
    "col", "div", "em", "font",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "head", "hr", "html", "i", "img", "li",
    "meta", "ol", "p", "pre", "small", "span", "strong",
    "sub", "sup", "table", "td", "tfoot", "thead", "title",
    "tr", "tt", "u", "ul",
    ""                                  /* end-of-table marker */
};
|
ali@0
|
155 |
|
ali@0
|
156 |
/*
 * Markup strings specific to DP texts, terminated by an empty string.
 * "<tb>" was added in .991.
 * NOTE(review): presumably what the D switch ("ignore DP-specific
 * markup") strips or skips -- confirm against the code that reads it.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>",
    "/*", "*/",
    "/#", "#/",
    "/$", "$/",
    "<tb>",
    ""                                  /* end-of-table marker */
};
|
ali@0
|
158 |
|
ali@0
|
159 |
/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): the name suggests these are words that should not be
 * immediately followed by a comma -- confirm against the code that
 * reads this table.  "mrs" appears twice; the duplicate is kept so the
 * table contents are unchanged.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's",
    "its", "whose", "every", "i'll", "your", "my",
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd",
    "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "i'm", "during", "let", "toward", "among",
    ""                                  /* end-of-table marker */
};
|
ali@0
|
165 |
|
ali@0
|
166 |
|
ali@0
|
167 |
/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): the name suggests these are words that should not be
 * immediately followed by a period -- confirm against the code that
 * reads this table.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's",
    "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very",
    "an", "among", "those", "into", "whom", "having", "thence",
    ""                                  /* end-of-table marker */
};
|
ali@0
|
172 |
|
ali@0
|
173 |
|
ali@0
|
174 |
/* Lower-case vowels, plain and accented, held as one string.              */
/* NOTE(review): the accented characters are encoding-dependent; the byte  */
/* values must match the encoding of the texts being checked -- confirm.   */
char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* Carlo's old suggestion, updated .991 */
|
ali@0
|
175 |
|
ali@0
|
176 |
/*
 * HTML character entities and their plain-text replacements.
 *
 *   htmlent - named entity reference,   e.g. "&amp;"
 *   htmlnum - numeric entity reference, e.g. "&#38;"
 *   textent - plain-text substitute,    e.g. "&"
 *
 * The table ends with a row whose htmlent is the empty string; that
 * row's textent is left NULL.
 *
 * The first two columns must contain the literal entity text.  They
 * had been replaced by the characters the entities stand for (making
 * both columns identical, and the quotation-mark row a syntax error),
 * so they are restored here from those characters and from the
 * per-row descriptions.
 * NOTE(review): the numeric column is restored as decimal references
 * ("&#NNN;") -- confirm against a pristine copy of the file.
 */
struct {
    char *htmlent;
    char *htmlnum;
    char *textent;
} entities[] = {
    { "&amp;",    "&#38;",   "&" },
    { "&lt;",     "&#60;",   "<" },
    { "&gt;",     "&#62;",   ">" },
    { "&deg;",    "&#176;",  " degrees" },
    { "&pound;",  "&#163;",  "L" },
    { "&quot;",   "&#34;",   "\"" },        /* quotation mark = APL quote */
    { "&OElig;",  "&#338;",  "OE" },        /* latin capital ligature OE */
    { "&oelig;",  "&#339;",  "oe" },        /* latin small ligature oe, U+0153 */
    { "&Scaron;", "&#352;",  "S" },         /* latin capital letter S with caron */
    { "&scaron;", "&#353;",  "s" },         /* latin small letter s with caron */
    { "&Yuml;",   "&#376;",  "Y" },         /* latin capital letter Y with diaeresis */
    { "&circ;",   "&#710;",  "" },          /* modifier letter circumflex accent */
    { "&tilde;",  "&#732;",  "~" },         /* small tilde, U+02DC */
    { "&ensp;",   "&#8194;", " " },         /* en space, U+2002 */
    { "&emsp;",   "&#8195;", " " },         /* em space, U+2003 */
    { "&thinsp;", "&#8201;", " " },         /* thin space, U+2009 */
    { "&ndash;",  "&#8211;", "-" },         /* en dash, U+2013 */
    { "&mdash;",  "&#8212;", "--" },        /* em dash, U+2014 */
    { "&lsquo;",  "&#8216;", "'" },         /* left single quotation mark */
    { "&rsquo;",  "&#8217;", "'" },         /* right single quotation mark */
    { "&sbquo;",  "&#8218;", "'" },         /* single low-9 quotation mark, U+201A */
    { "&ldquo;",  "&#8220;", "\"" },        /* left double quotation mark */
    { "&rdquo;",  "&#8221;", "\"" },        /* right double quotation mark */
    { "&bdquo;",  "&#8222;", "\"" },        /* double low-9 quotation mark, U+201E */
    { "&lsaquo;", "&#8249;", "\"" },        /* single left-pointing angle quotation mark */
    { "&rsaquo;", "&#8250;", "\"" },        /* single right-pointing angle quotation mark */
    { "&nbsp;",   "&#160;",  " " },         /* no-break space = non-breaking space */
    { "&iexcl;",  "&#161;",  "!" },         /* inverted exclamation mark, U+00A1 */
    { "&cent;",   "&#162;",  "c" },         /* cent sign, U+00A2 */
    { "&pound;",  "&#163;",  "L" },         /* pound sign, U+00A3 */
    { "&curren;", "&#164;",  "$" },         /* currency sign, U+00A4 */
    { "&yen;",    "&#165;",  "Y" },         /* yen sign = yuan sign, U+00A5 */
    { "&sect;",   "&#167;",  "--" },        /* section sign, U+00A7 */
    { "&uml;",    "&#168;",  " " },         /* diaeresis = spacing diaeresis */
    { "&copy;",   "&#169;",  "(C) " },      /* copyright sign, U+00A9 */
    { "&ordf;",   "&#170;",  " " },         /* feminine ordinal indicator, U+00AA */
    { "&laquo;",  "&#171;",  "\"" },        /* left-pointing double angle quotation mark */
    { "&shy;",    "&#173;",  "-" },         /* soft hyphen = discretionary hyphen */
    { "&reg;",    "&#174;",  "(R) " },      /* registered sign = registered trade mark sign */
    { "&macr;",   "&#175;",  " " },         /* macron = spacing macron = overline */
    { "&deg;",    "&#176;",  " degrees" },  /* degree sign, U+00B0 */
    { "&plusmn;", "&#177;",  "+-" },        /* plus-minus sign = plus-or-minus sign */
    { "&sup2;",   "&#178;",  "2" },         /* superscript two = superscript digit two */
    { "&sup3;",   "&#179;",  "3" },         /* superscript three = superscript digit three */
    { "&acute;",  "&#180;",  " " },         /* acute accent = spacing acute */
    { "&micro;",  "&#181;",  "m" },         /* micro sign, U+00B5 */
    { "&para;",   "&#182;",  "--" },        /* pilcrow sign = paragraph sign */
    { "&cedil;",  "&#184;",  " " },         /* cedilla = spacing cedilla, U+00B8 */
    { "&sup1;",   "&#185;",  "1" },         /* superscript one = superscript digit one */
    { "&ordm;",   "&#186;",  " " },         /* masculine ordinal indicator */
    { "&raquo;",  "&#187;",  "\"" },        /* right-pointing double angle quotation mark */
    { "&frac14;", "&#188;",  "1/4" },       /* vulgar fraction one quarter */
    { "&frac12;", "&#189;",  "1/2" },       /* vulgar fraction one half */
    { "&frac34;", "&#190;",  "3/4" },       /* vulgar fraction three quarters */
    { "&iquest;", "&#191;",  "?" },         /* inverted question mark */
    { "&Agrave;", "&#192;",  "A" },         /* latin capital letter A with grave */
    { "&Aacute;", "&#193;",  "A" },         /* latin capital letter A with acute */
    { "&Acirc;",  "&#194;",  "A" },         /* latin capital letter A with circumflex */
    { "&Atilde;", "&#195;",  "A" },         /* latin capital letter A with tilde */
    { "&Auml;",   "&#196;",  "A" },         /* latin capital letter A with diaeresis */
    { "&Aring;",  "&#197;",  "A" },         /* latin capital letter A with ring above */
    { "&AElig;",  "&#198;",  "AE" },        /* latin capital letter AE */
    { "&Ccedil;", "&#199;",  "C" },         /* latin capital letter C with cedilla */
    { "&Egrave;", "&#200;",  "E" },         /* latin capital letter E with grave */
    { "&Eacute;", "&#201;",  "E" },         /* latin capital letter E with acute */
    { "&Ecirc;",  "&#202;",  "E" },         /* latin capital letter E with circumflex */
    { "&Euml;",   "&#203;",  "E" },         /* latin capital letter E with diaeresis */
    { "&Igrave;", "&#204;",  "I" },         /* latin capital letter I with grave */
    { "&Iacute;", "&#205;",  "I" },         /* latin capital letter I with acute */
    { "&Icirc;",  "&#206;",  "I" },         /* latin capital letter I with circumflex */
    { "&Iuml;",   "&#207;",  "I" },         /* latin capital letter I with diaeresis */
    { "&ETH;",    "&#208;",  "E" },         /* latin capital letter ETH, U+00D0 */
    { "&Ntilde;", "&#209;",  "N" },         /* latin capital letter N with tilde */
    { "&Ograve;", "&#210;",  "O" },         /* latin capital letter O with grave */
    { "&Oacute;", "&#211;",  "O" },         /* latin capital letter O with acute */
    { "&Ocirc;",  "&#212;",  "O" },         /* latin capital letter O with circumflex */
    { "&Otilde;", "&#213;",  "O" },         /* latin capital letter O with tilde */
    { "&Ouml;",   "&#214;",  "O" },         /* latin capital letter O with diaeresis */
    { "&times;",  "&#215;",  "*" },         /* multiplication sign, U+00D7 */
    { "&Oslash;", "&#216;",  "O" },         /* latin capital letter O with stroke */
    { "&Ugrave;", "&#217;",  "U" },         /* latin capital letter U with grave */
    { "&Uacute;", "&#218;",  "U" },         /* latin capital letter U with acute */
    { "&Ucirc;",  "&#219;",  "U" },         /* latin capital letter U with circumflex */
    { "&Uuml;",   "&#220;",  "U" },         /* latin capital letter U with diaeresis */
    { "&Yacute;", "&#221;",  "Y" },         /* latin capital letter Y with acute */
    { "&THORN;",  "&#222;",  "TH" },        /* latin capital letter THORN */
    { "&szlig;",  "&#223;",  "sz" },        /* latin small letter sharp s = ess-zed */
    { "&agrave;", "&#224;",  "a" },         /* latin small letter a with grave */
    { "&aacute;", "&#225;",  "a" },         /* latin small letter a with acute */
    { "&acirc;",  "&#226;",  "a" },         /* latin small letter a with circumflex */
    { "&atilde;", "&#227;",  "a" },         /* latin small letter a with tilde */
    { "&auml;",   "&#228;",  "a" },         /* latin small letter a with diaeresis */
    { "&aring;",  "&#229;",  "a" },         /* latin small letter a with ring above */
    { "&aelig;",  "&#230;",  "ae" },        /* latin small letter ae */
    { "&ccedil;", "&#231;",  "c" },         /* latin small letter c with cedilla */
    { "&egrave;", "&#232;",  "e" },         /* latin small letter e with grave */
    { "&eacute;", "&#233;",  "e" },         /* latin small letter e with acute */
    { "&ecirc;",  "&#234;",  "e" },         /* latin small letter e with circumflex */
    { "&euml;",   "&#235;",  "e" },         /* latin small letter e with diaeresis */
    { "&igrave;", "&#236;",  "i" },         /* latin small letter i with grave */
    { "&iacute;", "&#237;",  "i" },         /* latin small letter i with acute */
    { "&icirc;",  "&#238;",  "i" },         /* latin small letter i with circumflex */
    { "&iuml;",   "&#239;",  "i" },         /* latin small letter i with diaeresis */
    { "&eth;",    "&#240;",  "eth" },       /* latin small letter eth, U+00F0 */
    { "&ntilde;", "&#241;",  "n" },         /* latin small letter n with tilde */
    { "&ograve;", "&#242;",  "o" },         /* latin small letter o with grave */
    { "&oacute;", "&#243;",  "o" },         /* latin small letter o with acute */
    { "&ocirc;",  "&#244;",  "o" },         /* latin small letter o with circumflex */
    { "&otilde;", "&#245;",  "o" },         /* latin small letter o with tilde */
    { "&ouml;",   "&#246;",  "o" },         /* latin small letter o with diaeresis */
    { "&divide;", "&#247;",  "/" },         /* division sign, U+00F7 */
    { "&oslash;", "&#248;",  "o" },         /* latin small letter o with stroke */
    { "&ugrave;", "&#249;",  "u" },         /* latin small letter u with grave */
    { "&uacute;", "&#250;",  "u" },         /* latin small letter u with acute */
    { "&ucirc;",  "&#251;",  "u" },         /* latin small letter u with circumflex */
    { "&uuml;",   "&#252;",  "u" },         /* latin small letter u with diaeresis */
    { "&yacute;", "&#253;",  "y" },         /* latin small letter y with acute */
    { "&thorn;",  "&#254;",  "th" },        /* latin small letter thorn */
    { "&yuml;",   "&#255;",  "y" },         /* latin small letter y with diaeresis */
    { "",         "" }                      /* end-of-table marker; textent is NULL */
};
|
ali@0
|
300 |
|
ali@0
|
301 |
/* ---- special characters ---- */

/* given as numeric character codes */
#define CHAR_SPACE          32      /* space                    */
#define CHAR_TAB            9       /* horizontal tab           */
#define CHAR_LF             10      /* line feed                */
#define CHAR_CR             13      /* carriage return          */
#define CHAR_DQUOTE         34      /* double quote             */
#define CHAR_SQUOTE         39      /* single quote, apostrophe */
#define CHAR_OPEN_SQUOTE    96      /* backquote                */
#define CHAR_TILDE          126     /* tilde                    */
#define CHAR_ASTERISK       42      /* asterisk                 */
#define CHAR_FORESLASH      47      /* forward slash            */
#define CHAR_CARAT          94      /* caret                    */

/* given as character constants */
#define CHAR_UNDERSCORE     '_'
#define CHAR_OPEN_CBRACK    '{'     /* curly brackets  */
#define CHAR_CLOSE_CBRACK   '}'
#define CHAR_OPEN_RBRACK    '('     /* round brackets  */
#define CHAR_CLOSE_RBRACK   ')'
#define CHAR_OPEN_SBRACK    '['     /* square brackets */
#define CHAR_CLOSE_SBRACK   ']'
|
ali@0
|
321 |
|
ali@0
|
322 |
|
ali@0
|
323 |
|
ali@0
|
324 |
|
ali@0
|
325 |
|
ali@0
|
326 |
/* ---- longest and shortest normal PG line lengths ----*/
|
ali@0
|
327 |
#define LONGEST_PG_LINE 75
|
ali@0
|
328 |
#define WAY_TOO_LONG 80
|
ali@0
|
329 |
#define SHORTEST_PG_LINE 55
|
ali@0
|
330 |
|
ali@0
|
331 |
#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
|
ali@0
|
332 |
/* D - ignore DP-specific markup */
|
ali@0
|
333 |
/* E - echo queried line */
|
ali@0
|
334 |
/* S - check single quotes */
|
ali@0
|
335 |
/* T - check common typos */
|
ali@0
|
336 |
/* P - require closure of quotes on */
|
ali@0
|
337 |
/* every paragraph */
|
ali@0
|
338 |
/* X - "Trust no one" :-) Paranoid! */
|
ali@0
|
339 |
/* Queries everything */
|
ali@0
|
340 |
/* L - line end checking defaults on */
|
ali@0
|
341 |
/* -L turns it off */
|
ali@0
|
342 |
/* O - overview. Just shows counts. */
|
ali@0
|
343 |
/* Y - puts errors to stdout */
|
ali@0
|
344 |
/* instead of stderr */
|
ali@0
|
345 |
/* H - Echoes header fields */
|
ali@0
|
346 |
/* M - Ignore markup in < > */
|
ali@0
|
347 |
/* U - Use file of User-defined Typos*/
|
ali@0
|
348 |
/* W - Defaults for use on Web upload*/
|
ali@0
|
349 |
/* V - Verbose - list EVERYTHING! */
|
ali@0
|
350 |
#define SWITNO 14 /* max number of switch parms */
|
ali@0
|
351 |
/* - used for defining array-size */
|
ali@0
|
352 |
#define MINARGS 1 /* minimum no of args excl switches */
|
ali@0
|
353 |
#define MAXARGS 1 /* maximum no of args excl switches */
|
ali@0
|
354 |
|
ali@0
|
355 |
int pswit[SWITNO]; /* program switches set by SWITCHES */
|
ali@0
|
356 |
|
ali@0
|
357 |
#define ECHO_SWITCH 0
|
ali@0
|
358 |
#define SQUOTE_SWITCH 1
|
ali@0
|
359 |
#define TYPO_SWITCH 2
|
ali@0
|
360 |
#define QPARA_SWITCH 3
|
ali@0
|
361 |
#define PARANOID_SWITCH 4
|
ali@0
|
362 |
#define LINE_END_SWITCH 5
|
ali@0
|
363 |
#define OVERVIEW_SWITCH 6
|
ali@0
|
364 |
#define STDOUT_SWITCH 7
|
ali@0
|
365 |
#define HEADER_SWITCH 8
|
ali@0
|
366 |
#define WEB_SWITCH 9
|
ali@0
|
367 |
#define VERBOSE_SWITCH 10
|
ali@0
|
368 |
#define MARKUP_SWITCH 11
|
ali@0
|
369 |
#define USERTYPO_SWITCH 12
|
ali@0
|
370 |
#define DP_SWITCH 13
|
ali@0
|
371 |
|
ali@0
|
372 |
|
ali@0
|
373 |
|
ali@0
|
374 |
long cnt_dquot; /* for overview mode, count of doublequote queries */
|
ali@0
|
375 |
long cnt_squot; /* for overview mode, count of singlequote queries */
|
ali@0
|
376 |
long cnt_brack; /* for overview mode, count of brackets queries */
|
ali@0
|
377 |
long cnt_bin; /* for overview mode, count of non-ASCII queries */
|
ali@0
|
378 |
long cnt_odd; /* for overview mode, count of odd character queries */
|
ali@0
|
379 |
long cnt_long; /* for overview mode, count of long line errors */
|
ali@0
|
380 |
long cnt_short; /* for overview mode, count of short line queries */
|
ali@0
|
381 |
long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
|
ali@0
|
382 |
long cnt_dash; /* for overview mode, count of dash-related queries */
|
ali@0
|
383 |
long cnt_word; /* for overview mode, count of word queries */
|
ali@0
|
384 |
long cnt_html; /* for overview mode, count of html queries */
|
ali@0
|
385 |
long cnt_lineend; /* for overview mode, count of line-end queries */
|
ali@0
|
386 |
long cnt_spacend; /* count of lines with space at end V .21 */
|
ali@0
|
387 |
long linecnt; /* count of total lines in the file */
|
ali@0
|
388 |
long checked_linecnt; /* count of lines actually gutchecked V .26 */
|
ali@0
|
389 |
|
ali@0
|
390 |
void proghelp(void);
|
ali@0
|
391 |
void procfile(char *);
|
ali@0
|
392 |
|
ali@0
|
393 |
#define LOW_THRESHOLD 0
|
ali@0
|
394 |
#define HIGH_THRESHOLD 1
|
ali@0
|
395 |
|
ali@0
|
396 |
#define START 0
|
ali@0
|
397 |
#define END 1
|
ali@0
|
398 |
#define PREV 0
|
ali@0
|
399 |
#define NEXT 1
|
ali@0
|
400 |
#define FIRST_OF_PAIR 0
|
ali@0
|
401 |
#define SECOND_OF_PAIR 1
|
ali@0
|
402 |
|
ali@0
|
403 |
#define MAX_WORDPAIR 1000
|
ali@0
|
404 |
|
ali@0
|
405 |
char running_from[MAX_PATH];
|
ali@0
|
406 |
|
ali@0
|
407 |
int mixdigit(char *);
|
ali@0
|
408 |
char *getaword(char *, char *);
|
ali@0
|
409 |
int matchword(char *, char *);
|
ali@0
|
410 |
char *flgets(char *, int, FILE *, long);
|
ali@0
|
411 |
void lowerit(char *);
|
ali@0
|
412 |
int gcisalpha(unsigned char);
|
ali@0
|
413 |
int gcisdigit(unsigned char);
|
ali@0
|
414 |
int gcisletter(unsigned char);
|
ali@0
|
415 |
char *gcstrchr(char *s, char c);
|
ali@0
|
416 |
void postprocess_for_HTML(char *);
|
ali@0
|
417 |
char *linehasmarkup(char *);
|
ali@0
|
418 |
char *losemarkup(char *);
|
ali@0
|
419 |
int tagcomp(char *, char *);
|
ali@0
|
420 |
char *loseentities(char *);
|
ali@0
|
421 |
int isroman(char *);
|
ali@0
|
422 |
int usertypo_count;
|
ali@0
|
423 |
void postprocess_for_DP(char *);
|
ali@0
|
424 |
|
ali@0
|
425 |
char wrk[LINEBUFSIZE];
|
ali@0
|
426 |
|
ali@0
|
427 |
/* This is disgustingly lazy, predefining max words & lengths, */
|
ali@0
|
428 |
/* but now I'm out of 16-bit restrictions, what's a couple of K? */
|
ali@0
|
429 |
#define MAX_QWORD 50
|
ali@0
|
430 |
#define MAX_QWORD_LENGTH 40
|
ali@0
|
431 |
char qword[MAX_QWORD][MAX_QWORD_LENGTH];
|
ali@0
|
432 |
char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
|
ali@0
|
433 |
signed int dupcnt[MAX_QWORD];
|
ali@0
|
434 |
|
ali@0
|
435 |
|
ali@0
|
436 |
|
ali@0
|
437 |
|
ali@0
|
438 |
int main(int argc, char **argv)
|
ali@0
|
439 |
{
|
ali@0
|
440 |
char *argsw, *s;
|
ali@0
|
441 |
int i, switno, invarg;
|
ali@0
|
442 |
char usertypo_file[MAX_PATH];
|
ali@0
|
443 |
FILE *usertypofile;
|
ali@0
|
444 |
|
ali@0
|
445 |
|
ali@0
|
446 |
if (strlen(argv[0]) < sizeof(running_from))
|
ali@0
|
447 |
strcpy(running_from, argv[0]); /* save the path to the executable gutcheck */
|
ali@0
|
448 |
|
ali@0
|
449 |
/* find out what directory we're running from */
|
ali@0
|
450 |
for (s = running_from + strlen(running_from); *s != '/' && *s != '\\' && s >= running_from; s--)
|
ali@0
|
451 |
*s = 0;
|
ali@0
|
452 |
|
ali@0
|
453 |
|
ali@0
|
454 |
switno = strlen(SWITCHES);
|
ali@0
|
455 |
for (i = switno ; --i >0 ; )
|
ali@0
|
456 |
pswit[i] = 0; /* initialise switches */
|
ali@0
|
457 |
|
ali@0
|
458 |
/* Standard loop to extract switches. */
|
ali@0
|
459 |
/* When we come out of this loop, the arguments will be */
|
ali@0
|
460 |
/* in argv[0] upwards and the switches used will be */
|
ali@0
|
461 |
/* represented by their equivalent elements in pswit[] */
|
ali@0
|
462 |
while ( --argc > 0 && **++argv == '-')
|
ali@0
|
463 |
for (argsw = argv[0]+1; *argsw !='\0'; argsw++)
|
ali@0
|
464 |
for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; )
|
ali@0
|
465 |
if ((toupper(*argsw)) == SWITCHES[i] ) {
|
ali@0
|
466 |
invarg = 0;
|
ali@0
|
467 |
pswit[i] = 1;
|
ali@0
|
468 |
}
|
ali@0
|
469 |
|
ali@0
|
470 |
pswit[PARANOID_SWITCH] ^= 1; /* Paranoid checking is turned OFF, not on, by its switch */
|
ali@0
|
471 |
|
ali@0
|
472 |
if (pswit[PARANOID_SWITCH]) { /* if running in paranoid mode */
|
ali@0
|
473 |
pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1; /* force typo checks as well */
|
ali@0
|
474 |
} /* v.20 removed s and p switches from paranoid mode */
|
ali@0
|
475 |
|
ali@0
|
476 |
pswit[LINE_END_SWITCH] ^= 1; /* Line-end checking is turned OFF, not on, by its switch */
|
ali@0
|
477 |
pswit[ECHO_SWITCH] ^= 1; /* V.21 Echoing is turned OFF, not on, by its switch */
|
ali@0
|
478 |
|
ali@0
|
479 |
if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */
|
ali@0
|
480 |
pswit[ECHO_SWITCH] = 0;
|
ali@0
|
481 |
|
ali@0
|
482 |
/* Web uploads - for the moment, this is really just a placeholder */
|
ali@0
|
483 |
/* until we decide what processing we really want to do on web uploads */
|
ali@0
|
484 |
if (pswit[WEB_SWITCH]) { /* specific override for web uploads */
|
ali@0
|
485 |
pswit[ECHO_SWITCH] = 1;
|
ali@0
|
486 |
pswit[SQUOTE_SWITCH] = 0;
|
ali@0
|
487 |
pswit[TYPO_SWITCH] = 1;
|
ali@0
|
488 |
pswit[QPARA_SWITCH] = 0;
|
ali@0
|
489 |
pswit[PARANOID_SWITCH] = 1;
|
ali@0
|
490 |
pswit[LINE_END_SWITCH] = 0;
|
ali@0
|
491 |
pswit[OVERVIEW_SWITCH] = 0;
|
ali@0
|
492 |
pswit[STDOUT_SWITCH] = 0;
|
ali@0
|
493 |
pswit[HEADER_SWITCH] = 1;
|
ali@0
|
494 |
pswit[VERBOSE_SWITCH] = 0;
|
ali@0
|
495 |
pswit[MARKUP_SWITCH] = 0;
|
ali@0
|
496 |
pswit[USERTYPO_SWITCH] = 0;
|
ali@0
|
497 |
pswit[DP_SWITCH] = 0;
|
ali@0
|
498 |
}
|
ali@0
|
499 |
|
ali@0
|
500 |
|
ali@0
|
501 |
if (argc < MINARGS || argc > MAXARGS) { /* check number of args */
|
ali@0
|
502 |
proghelp();
|
ali@0
|
503 |
return(1); /* exit */
|
ali@0
|
504 |
}
|
ali@0
|
505 |
|
ali@0
|
506 |
|
ali@0
|
507 |
/* read in the user-defined stealth scanno list */
|
ali@0
|
508 |
|
ali@0
|
509 |
if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */
|
ali@0
|
510 |
if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) { /* not in cwd. try gutcheck directory. */
|
ali@0
|
511 |
strcpy(usertypo_file, running_from);
|
ali@0
|
512 |
strcat(usertypo_file, USERTYPO_FILE);
|
ali@0
|
513 |
if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) { /* we ain't got no user typo file! */
|
ali@0
|
514 |
printf(" --> I couldn't find gutcheck.typ -- proceeding without user typos.\n");
|
ali@0
|
515 |
}
|
ali@0
|
516 |
}
|
ali@0
|
517 |
|
ali@0
|
518 |
usertypo_count = 0;
|
ali@0
|
519 |
if (usertypofile) { /* we managed to open a User Typo File! */
|
ali@0
|
520 |
if (pswit[USERTYPO_SWITCH]) {
|
ali@0
|
521 |
while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) {
|
ali@0
|
522 |
if (strlen(aline) > 1) {
|
ali@0
|
523 |
if ((int)*aline > 33) {
|
ali@0
|
524 |
s = malloc(strlen(aline)+1);
|
ali@0
|
525 |
if (!s) {
|
ali@0
|
526 |
fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n");
|
ali@0
|
527 |
exit(1);
|
ali@0
|
528 |
}
|
ali@0
|
529 |
strcpy(s, aline);
|
ali@0
|
530 |
usertypo[usertypo_count] = s;
|
ali@0
|
531 |
usertypo_count++;
|
ali@0
|
532 |
if (usertypo_count >= MAX_USER_TYPOS) {
|
ali@0
|
533 |
printf(" --> Only %d user-defined typos allowed: ignoring the rest\n");
|
ali@0
|
534 |
break;
|
ali@0
|
535 |
}
|
ali@0
|
536 |
}
|
ali@0
|
537 |
}
|
ali@0
|
538 |
}
|
ali@0
|
539 |
}
|
ali@0
|
540 |
fclose(usertypofile);
|
ali@0
|
541 |
}
|
ali@0
|
542 |
}
|
ali@0
|
543 |
|
ali@0
|
544 |
|
ali@0
|
545 |
|
ali@0
|
546 |
|
ali@0
|
547 |
fprintf(stderr, "gutcheck: Check and report on an e-text\n");
|
ali@0
|
548 |
|
ali@0
|
549 |
cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long =
|
ali@0
|
550 |
cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend =
|
ali@0
|
551 |
cnt_spacend = 0;
|
ali@0
|
552 |
|
ali@0
|
553 |
procfile(argv[0]);
|
ali@0
|
554 |
|
ali@0
|
555 |
if (pswit[OVERVIEW_SWITCH]) {
|
ali@0
|
556 |
printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
|
ali@0
|
557 |
checked_linecnt, linecnt, linecnt - checked_linecnt);
|
ali@0
|
558 |
printf(" --------------- Queries found --------------\n");
|
ali@0
|
559 |
if (cnt_long) printf(" Long lines: %5ld\n",cnt_long);
|
ali@0
|
560 |
if (cnt_short) printf(" Short lines: %5ld\n",cnt_short);
|
ali@0
|
561 |
if (cnt_lineend) printf(" Line-end problems: %5ld\n",cnt_lineend);
|
ali@0
|
562 |
if (cnt_word) printf(" Common typos: %5ld\n",cnt_word);
|
ali@0
|
563 |
if (cnt_dquot) printf(" Unmatched quotes: %5ld\n",cnt_dquot);
|
ali@0
|
564 |
if (cnt_squot) printf(" Unmatched SingleQuotes: %5ld\n",cnt_squot);
|
ali@0
|
565 |
if (cnt_brack) printf(" Unmatched brackets: %5ld\n",cnt_brack);
|
ali@0
|
566 |
if (cnt_bin) printf(" Non-ASCII characters: %5ld\n",cnt_bin);
|
ali@0
|
567 |
if (cnt_odd) printf(" Proofing characters: %5ld\n",cnt_odd);
|
ali@0
|
568 |
if (cnt_punct) printf(" Punctuation & spacing queries: %5ld\n",cnt_punct);
|
ali@0
|
569 |
if (cnt_dash) printf(" Non-standard dashes: %5ld\n",cnt_dash);
|
ali@0
|
570 |
if (cnt_html) printf(" Possible HTML tags: %5ld\n",cnt_html);
|
ali@0
|
571 |
printf("\n");
|
ali@0
|
572 |
printf(" TOTAL QUERIES %5ld\n",
|
ali@0
|
573 |
cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long +
|
ali@0
|
574 |
cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend);
|
ali@0
|
575 |
}
|
ali@0
|
576 |
|
ali@0
|
577 |
return(0);
|
ali@0
|
578 |
}
|
ali@0
|
579 |
|
ali@0
|
580 |
|
ali@0
|
581 |
|
ali@0
|
582 |
/* procfile - process one file */
|
ali@0
|
583 |
|
ali@0
|
584 |
void procfile(char *filename)
|
ali@0
|
585 |
{
|
ali@0
|
586 |
|
ali@0
|
587 |
char *s, *t, *s1, laststart, *wordstart;
|
ali@0
|
588 |
char inword[MAXWORDLEN], testword[MAXWORDLEN];
|
ali@0
|
589 |
char parastart[81]; /* first line of current para */
|
ali@0
|
590 |
FILE *infile;
|
ali@0
|
591 |
long quot, squot, firstline, alphalen, totlen, binlen,
|
ali@0
|
592 |
shortline, longline, verylongline, spacedash, emdash,
|
ali@0
|
593 |
space_emdash, non_PG_space_emdash, PG_space_emdash,
|
ali@0
|
594 |
footerline, dotcomma, start_para_line, astline, fslashline,
|
ali@0
|
595 |
standalone_digit, hyphens, htmcount, endquote_count;
|
ali@0
|
596 |
long spline, nspline;
|
ali@0
|
597 |
signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower,
|
ali@0
|
598 |
eNon_A, eTab, eTilde, eAst, eFSlash, eCarat;
|
ali@0
|
599 |
signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma,
|
ali@0
|
600 |
warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote;
|
ali@0
|
601 |
unsigned int lastlen, lastblen;
|
ali@0
|
602 |
signed int s_brack, c_brack, r_brack, c_unders;
|
ali@0
|
603 |
signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar;
|
ali@0
|
604 |
signed int isnewpara, vowel, consonant;
|
ali@0
|
605 |
char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80],
|
ali@0
|
606 |
unders_err[80];
|
ali@0
|
607 |
signed int qword_index, qperiod_index, isdup;
|
ali@0
|
608 |
signed int enddash;
|
ali@0
|
609 |
signed int Dutchcount, isDutch, Frenchcount, isFrench;
|
ali@0
|
610 |
|
ali@0
|
611 |
|
ali@0
|
612 |
|
ali@0
|
613 |
|
ali@0
|
614 |
|
ali@0
|
615 |
laststart = CHAR_SPACE;
|
ali@0
|
616 |
lastlen = lastblen = 0;
|
ali@0
|
617 |
*dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err =
|
ali@0
|
618 |
*unders_err = *prevline = 0;
|
ali@0
|
619 |
linecnt = firstline = alphalen = totlen = binlen =
|
ali@0
|
620 |
shortline = longline = spacedash = emdash = checked_linecnt =
|
ali@0
|
621 |
space_emdash = non_PG_space_emdash = PG_space_emdash =
|
ali@0
|
622 |
footerline = dotcomma = start_para_line = astline = fslashline =
|
ali@0
|
623 |
standalone_digit = hyphens = htmcount = endquote_count = 0;
|
ali@0
|
624 |
quot = squot = s_brack = c_brack = r_brack = c_unders = 0;
|
ali@0
|
625 |
i = llen = isemptyline = isacro = isellipsis = istypo = 0;
|
ali@0
|
626 |
warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma =
|
ali@0
|
627 |
warn_ast = warn_fslash = warn_digit = warn_endquote = 0;
|
ali@0
|
628 |
isnewpara = vowel = consonant = enddash = 0;
|
ali@0
|
629 |
spline = nspline = 0;
|
ali@0
|
630 |
qword_index = qperiod_index = isdup = 0;
|
ali@0
|
631 |
*inword = *testword = 0;
|
ali@0
|
632 |
open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0;
|
ali@0
|
633 |
Dutchcount = isDutch = Frenchcount = isFrench = 0;
|
ali@0
|
634 |
|
ali@0
|
635 |
|
ali@0
|
636 |
for (j = 0; j < MAX_QWORD; j++) {
|
ali@0
|
637 |
dupcnt[j] = 0;
|
ali@0
|
638 |
for (i = 0; i < MAX_QWORD_LENGTH; i++)
|
ali@0
|
639 |
qword[i][j] = 0;
|
ali@0
|
640 |
qperiod[i][j] = 0;
|
ali@0
|
641 |
}
|
ali@0
|
642 |
|
ali@0
|
643 |
|
ali@0
|
644 |
if ((infile = fopen(filename, "rb")) == NULL) {
|
ali@0
|
645 |
if (pswit[STDOUT_SWITCH])
|
ali@0
|
646 |
fprintf(stdout, "gutcheck: cannot open %s\n", filename);
|
ali@0
|
647 |
else
|
ali@0
|
648 |
fprintf(stderr, "gutcheck: cannot open %s\n", filename);
|
ali@0
|
649 |
exit(1);
|
ali@0
|
650 |
}
|
ali@0
|
651 |
|
ali@0
|
652 |
fprintf(stdout, "\n\nFile: %s\n\n", filename);
|
ali@0
|
653 |
firstline = shortline = longline = verylongline = 0;
|
ali@0
|
654 |
|
ali@0
|
655 |
|
ali@0
|
656 |
/*****************************************************/
|
ali@0
|
657 |
/* */
|
ali@0
|
658 |
/* Run a first pass - verify that it's a valid PG */
|
ali@0
|
659 |
/* file, decide whether to report some things that */
|
ali@0
|
660 |
/* occur many times in the text like long or short */
|
ali@0
|
661 |
/* lines, non-standard dashes, and other good stuff */
|
ali@0
|
662 |
/* I'll doubtless think of later. */
|
ali@0
|
663 |
/* */
|
ali@0
|
664 |
/*****************************************************/
|
ali@0
|
665 |
|
ali@0
|
666 |
/*****************************************************/
|
ali@0
|
667 |
/* V.24 Sigh. Yet Another Header Change */
|
ali@0
|
668 |
/*****************************************************/
|
ali@0
|
669 |
|
ali@0
|
670 |
while (fgets(aline, LINEBUFSIZE-1, infile)) {
|
ali@0
|
671 |
while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0;
|
ali@0
|
672 |
linecnt++;
|
ali@0
|
673 |
if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) {
|
ali@0
|
674 |
if (spline)
|
ali@0
|
675 |
printf(" --> Duplicate header?\n");
|
ali@0
|
676 |
spline = linecnt + 1; /* first line of non-header text, that is */
|
ali@0
|
677 |
}
|
ali@0
|
678 |
if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) {
|
ali@0
|
679 |
if (nspline)
|
ali@0
|
680 |
printf(" --> Duplicate header?\n");
|
ali@0
|
681 |
nspline = linecnt + 1; /* first line of non-header text, that is */
|
ali@0
|
682 |
}
|
ali@0
|
683 |
if (spline || nspline) {
|
ali@0
|
684 |
lowerit(aline);
|
ali@0
|
685 |
if (strstr(aline, "end") && strstr(aline, "project gutenberg")) {
|
ali@0
|
686 |
if (strstr(aline, "end") < strstr(aline, "project gutenberg")) {
|
ali@0
|
687 |
if (footerline) {
|
ali@0
|
688 |
if (!nspline) /* it's an old-form header - we can detect duplicates */
|
ali@0
|
689 |
printf(" --> Duplicate footer?\n");
|
ali@0
|
690 |
else
|
ali@0
|
691 |
;
|
ali@0
|
692 |
}
|
ali@0
|
693 |
else {
|
ali@0
|
694 |
footerline = linecnt;
|
ali@0
|
695 |
}
|
ali@0
|
696 |
}
|
ali@0
|
697 |
}
|
ali@0
|
698 |
}
|
ali@0
|
699 |
if (spline) firstline = spline;
|
ali@0
|
700 |
if (nspline) firstline = nspline; /* override with new */
|
ali@0
|
701 |
|
ali@0
|
702 |
if (footerline) continue; /* 0.99+ don't count the boilerplate in the footer */
|
ali@0
|
703 |
|
ali@0
|
704 |
llen = strlen(aline);
|
ali@0
|
705 |
totlen += llen;
|
ali@0
|
706 |
for (i = 0; i < llen; i++) {
|
ali@0
|
707 |
if ((unsigned char)aline[i] > 127) binlen++;
|
ali@0
|
708 |
if (gcisalpha(aline[i])) alphalen++;
|
ali@0
|
709 |
if (i > 0)
|
ali@0
|
710 |
if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1]))
|
ali@0
|
711 |
endquote_count++;
|
ali@0
|
712 |
}
|
ali@0
|
713 |
if (strlen(aline) > 2
|
ali@0
|
714 |
&& lastlen > 2 && lastlen < SHORTEST_PG_LINE
|
ali@0
|
715 |
&& lastblen > 2 && lastblen > SHORTEST_PG_LINE
|
ali@0
|
716 |
&& laststart != CHAR_SPACE)
|
ali@0
|
717 |
shortline++;
|
ali@0
|
718 |
|
ali@0
|
719 |
if (*aline) /* fixed line below for 0.96 */
|
ali@0
|
720 |
if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++;
|
ali@0
|
721 |
|
ali@0
|
722 |
if (strstr(aline, ".,")) dotcomma++;
|
ali@0
|
723 |
/* 0.98 only count ast lines for ignoring purposes where there is */
|
ali@0
|
724 |
/* locase text on the line */
|
ali@0
|
725 |
if (strstr(aline, "*")) {
|
ali@0
|
726 |
for (s = aline; *s; s++)
|
ali@0
|
727 |
if (*s >='a' && *s <= 'z')
|
ali@0
|
728 |
break;
|
ali@0
|
729 |
if (*s) astline++;
|
ali@0
|
730 |
}
|
ali@0
|
731 |
if (strstr(aline, "/"))
|
ali@0
|
732 |
fslashline++;
|
ali@0
|
733 |
for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
|
ali@0
|
734 |
if (aline[i] == '-' && aline[i-1] != '-') hyphens++;
|
ali@0
|
735 |
|
ali@0
|
736 |
if (llen > LONGEST_PG_LINE) longline++;
|
ali@0
|
737 |
if (llen > WAY_TOO_LONG) verylongline++;
|
ali@0
|
738 |
|
ali@0
|
739 |
if (strstr(aline, "<") && strstr(aline, ">")) {
|
ali@0
|
740 |
i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
|
ali@0
|
741 |
if (i > 0)
|
ali@0
|
742 |
htmcount++;
|
ali@0
|
743 |
if (strstr(aline, "<i>")) htmcount +=4; /* bonus marks! */
|
ali@0
|
744 |
}
|
ali@0
|
745 |
|
ali@0
|
746 |
/* Check for spaced em-dashes */
|
ali@0
|
747 |
if (strstr(aline,"--")) {
|
ali@0
|
748 |
emdash++;
|
ali@0
|
749 |
if (*(strstr(aline, "--")-1) == CHAR_SPACE ||
|
ali@0
|
750 |
(*(strstr(aline, "--")+2) == CHAR_SPACE))
|
ali@0
|
751 |
space_emdash++;
|
ali@0
|
752 |
if (*(strstr(aline, "--")-1) == CHAR_SPACE &&
|
ali@0
|
753 |
(*(strstr(aline, "--")+2) == CHAR_SPACE))
|
ali@0
|
754 |
non_PG_space_emdash++; /* count of em-dashes with spaces both sides */
|
ali@0
|
755 |
if (*(strstr(aline, "--")-1) != CHAR_SPACE &&
|
ali@0
|
756 |
(*(strstr(aline, "--")+2) != CHAR_SPACE))
|
ali@0
|
757 |
PG_space_emdash++; /* count of PG-type em-dashes with no spaces */
|
ali@0
|
758 |
}
|
ali@0
|
759 |
|
ali@0
|
760 |
for (s = aline; *s;) {
|
ali@0
|
761 |
s = getaword(s, inword);
|
ali@0
|
762 |
if (!strcmp(inword, "hij") || !strcmp(inword, "niet"))
|
ali@0
|
763 |
Dutchcount++;
|
ali@0
|
764 |
if (!strcmp(inword, "dans") || !strcmp(inword, "avec"))
|
ali@0
|
765 |
Frenchcount++;
|
ali@0
|
766 |
if (!strcmp(inword, "0") || !strcmp(inword, "1"))
|
ali@0
|
767 |
standalone_digit++;
|
ali@0
|
768 |
}
|
ali@0
|
769 |
|
ali@0
|
770 |
/* Check for spaced dashes */
|
ali@0
|
771 |
if (strstr(aline," -"))
|
ali@0
|
772 |
if (*(strstr(aline, " -")+2) != '-')
|
ali@0
|
773 |
spacedash++;
|
ali@0
|
774 |
lastblen = lastlen;
|
ali@0
|
775 |
lastlen = strlen(aline);
|
ali@0
|
776 |
laststart = aline[0];
|
ali@0
|
777 |
|
ali@0
|
778 |
}
|
ali@0
|
779 |
fclose(infile);
|
ali@0
|
780 |
|
ali@0
|
781 |
|
ali@0
|
782 |
/* now, based on this quick view, make some snap decisions */
|
ali@0
|
783 |
if (cnt_spacend > 0) {
|
ali@0
|
784 |
printf(" --> %ld lines in this file have white space at end\n", cnt_spacend);
|
ali@0
|
785 |
}
|
ali@0
|
786 |
|
ali@0
|
787 |
warn_dotcomma = 1;
|
ali@0
|
788 |
if (dotcomma > 5) {
|
ali@0
|
789 |
warn_dotcomma = 0;
|
ali@0
|
790 |
printf(" --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma);
|
ali@0
|
791 |
}
|
ali@0
|
792 |
|
ali@0
|
793 |
/* if more than 50 lines, or one-tenth, are short, don't bother reporting them */
|
ali@0
|
794 |
warn_short = 1;
|
ali@0
|
795 |
if (shortline > 50 || shortline * 10 > linecnt) {
|
ali@0
|
796 |
warn_short = 0;
|
ali@0
|
797 |
printf(" --> %ld lines in this file are short. Not reporting short lines.\n", shortline);
|
ali@0
|
798 |
}
|
ali@0
|
799 |
|
ali@0
|
800 |
/* if more than 50 lines, or one-tenth, are long, don't bother reporting them */
|
ali@0
|
801 |
warn_long = 1;
|
ali@0
|
802 |
if (longline > 50 || longline * 10 > linecnt) {
|
ali@0
|
803 |
warn_long = 0;
|
ali@0
|
804 |
printf(" --> %ld lines in this file are long. Not reporting long lines.\n", longline);
|
ali@0
|
805 |
}
|
ali@0
|
806 |
|
ali@0
|
807 |
/* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */
|
ali@0
|
808 |
warn_ast = 1;
|
ali@0
|
809 |
if (astline > 10 ) {
|
ali@0
|
810 |
warn_ast = 0;
|
ali@0
|
811 |
printf(" --> %ld lines in this file contain asterisks. Not reporting them.\n", astline);
|
ali@0
|
812 |
}
|
ali@0
|
813 |
|
ali@0
|
814 |
/* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */
|
ali@0
|
815 |
warn_fslash = 1;
|
ali@0
|
816 |
if (fslashline > 10 ) {
|
ali@0
|
817 |
warn_fslash = 0;
|
ali@0
|
818 |
printf(" --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline);
|
ali@0
|
819 |
}
|
ali@0
|
820 |
|
ali@0
|
821 |
/* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */
|
ali@0
|
822 |
warn_endquote = 1;
|
ali@0
|
823 |
if (endquote_count > 20 ) {
|
ali@0
|
824 |
warn_endquote = 0;
|
ali@0
|
825 |
printf(" --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count);
|
ali@0
|
826 |
}
|
ali@0
|
827 |
|
ali@0
|
828 |
/* if more than 10 standalone digits (0s and 1s) are found, don't bother reporting them V.0.97 */
|
ali@0
|
829 |
warn_digit = 1;
|
ali@0
|
830 |
if (standalone_digit > 10 ) {
|
ali@0
|
831 |
warn_digit = 0;
|
ali@0
|
832 |
printf(" --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit);
|
ali@0
|
833 |
}
|
ali@0
|
834 |
|
ali@0
|
835 |
/* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */
|
ali@0
|
836 |
warn_hyphen = 1;
|
ali@0
|
837 |
if (hyphens > 20 ) {
|
ali@0
|
838 |
warn_hyphen = 0;
|
ali@0
|
839 |
printf(" --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens);
|
ali@0
|
840 |
}
|
ali@0
|
841 |
|
ali@0
|
842 |
if (htmcount > 20 && !pswit[MARKUP_SWITCH]) {
|
ali@0
|
843 |
printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
|
ali@0
|
844 |
pswit[MARKUP_SWITCH] = 1;
|
ali@0
|
845 |
}
|
ali@0
|
846 |
|
ali@0
|
847 |
if (verylongline > 0) {
|
ali@0
|
848 |
printf(" --> %ld lines in this file are VERY long!\n", verylongline);
|
ali@0
|
849 |
}
|
ali@0
|
850 |
|
ali@0
|
851 |
/* If there are more non-PG spaced dashes than PG em-dashes, */
|
ali@0
|
852 |
/* assume it's deliberate */
|
ali@0
|
853 |
/* Current PG guidelines say don't use them, but older texts do,*/
|
ali@0
|
854 |
/* and some people insist on them whatever the guidelines say. */
|
ali@0
|
855 |
/* V.20 removed requirement that PG_space_emdash be greater than*/
|
ali@0
|
856 |
/* ten before turning off warnings about spaced dashes. */
|
ali@0
|
857 |
warn_dash = 1;
|
ali@0
|
858 |
if (spacedash + non_PG_space_emdash > PG_space_emdash) {
|
ali@0
|
859 |
warn_dash = 0;
|
ali@0
|
860 |
printf(" --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash);
|
ali@0
|
861 |
}
|
ali@0
|
862 |
|
ali@0
|
863 |
/* if more than a quarter of characters are hi-bit, bug out */
|
ali@0
|
864 |
warn_bin = 1;
|
ali@0
|
865 |
if (binlen * 4 > totlen) {
|
ali@0
|
866 |
printf(" --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n");
|
ali@0
|
867 |
exit(1);
|
ali@0
|
868 |
}
|
ali@0
|
869 |
if (alphalen * 4 < totlen) {
|
ali@0
|
870 |
printf(" --> This file does not appear to be text. Terminating. Best of luck with it!\n");
|
ali@0
|
871 |
exit(1);
|
ali@0
|
872 |
}
|
ali@0
|
873 |
if ((binlen * 100 > totlen) || (binlen > 100)) {
|
ali@0
|
874 |
printf(" --> There are a lot of foreign letters here. Not reporting them.\n");
|
ali@0
|
875 |
warn_bin = 0;
|
ali@0
|
876 |
}
|
ali@0
|
877 |
|
ali@0
|
878 |
/* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */
|
ali@0
|
879 |
isDutch = 0;
|
ali@0
|
880 |
if (Dutchcount > 50) {
|
ali@0
|
881 |
isDutch = 1;
|
ali@0
|
882 |
printf(" --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n");
|
ali@0
|
883 |
}
|
ali@0
|
884 |
|
ali@0
|
885 |
isFrench = 0;
|
ali@0
|
886 |
if (Frenchcount > 50) {
|
ali@0
|
887 |
isFrench = 1;
|
ali@0
|
888 |
printf(" --> This looks like French - switching off some doublepunct.\n");
|
ali@0
|
889 |
}
|
ali@0
|
890 |
|
ali@0
|
891 |
if (firstline && footerline)
|
ali@0
|
892 |
printf(" The PG header and footer appear to be already on.\n");
|
ali@0
|
893 |
else {
|
ali@0
|
894 |
if (firstline)
|
ali@0
|
895 |
printf(" The PG header is on - no footer.\n");
|
ali@0
|
896 |
if (footerline)
|
ali@0
|
897 |
printf(" The PG footer is on - no header.\n");
|
ali@0
|
898 |
}
|
ali@0
|
899 |
printf("\n");
|
ali@0
|
900 |
|
ali@0
|
901 |
/* V.22 George Davis asked for an override switch to force it to list everything */
|
ali@0
|
902 |
if (pswit[VERBOSE_SWITCH]) {
|
ali@0
|
903 |
warn_bin = 1;
|
ali@0
|
904 |
warn_short = 1;
|
ali@0
|
905 |
warn_dotcomma = 1;
|
ali@0
|
906 |
warn_long = 1;
|
ali@0
|
907 |
warn_dash = 1;
|
ali@0
|
908 |
warn_digit = 1;
|
ali@0
|
909 |
warn_ast = 1;
|
ali@0
|
910 |
warn_fslash = 1;
|
ali@0
|
911 |
warn_hyphen = 1;
|
ali@0
|
912 |
warn_endquote = 1;
|
ali@0
|
913 |
printf(" *** Verbose output is ON -- you asked for it! ***\n");
|
ali@0
|
914 |
}
|
ali@0
|
915 |
|
ali@0
|
916 |
if (isDutch)
|
ali@0
|
917 |
warn_dash = 0; /* Frank suggested turning it REALLY off for Dutch */
|
ali@0
|
918 |
|
ali@0
|
919 |
if ((infile = fopen(filename, "rb")) == NULL) {
|
ali@0
|
920 |
if (pswit[STDOUT_SWITCH])
|
ali@0
|
921 |
fprintf(stdout, "gutcheck: cannot open %s\n", filename);
|
ali@0
|
922 |
else
|
ali@0
|
923 |
fprintf(stderr, "gutcheck: cannot open %s\n", filename);
|
ali@0
|
924 |
exit(1);
|
ali@0
|
925 |
}
|
ali@0
|
926 |
|
ali@0
|
927 |
if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */
|
ali@0
|
928 |
printf(" --> I don't really know where this text starts. \n");
|
ali@0
|
929 |
printf(" There are no reference points.\n");
|
ali@0
|
930 |
printf(" I'm going to have to report the header and footer as well.\n");
|
ali@0
|
931 |
firstline=0;
|
ali@0
|
932 |
}
|
ali@0
|
933 |
|
ali@0
|
934 |
|
ali@0
|
935 |
|
ali@0
|
936 |
/*****************************************************/
|
ali@0
|
937 |
/* */
|
ali@0
|
938 |
/* Here we go with the main pass. Hold onto yer hat! */
|
ali@0
|
939 |
/* */
|
ali@0
|
940 |
/*****************************************************/
|
ali@0
|
941 |
|
ali@0
|
942 |
/* Re-init some variables we've dirtied */
|
ali@0
|
943 |
quot = squot = linecnt = 0;
|
ali@0
|
944 |
laststart = CHAR_SPACE;
|
ali@0
|
945 |
lastlen = lastblen = 0;
|
ali@0
|
946 |
|
ali@0
|
947 |
while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) {
|
ali@0
|
948 |
linecnt++;
|
ali@0
|
949 |
if (linecnt == 1) isnewpara = 1;
|
ali@0
|
950 |
if (pswit[DP_SWITCH])
|
ali@0
|
951 |
if (!strncmp(aline, "-----File: ", 11))
|
ali@0
|
952 |
continue; // skip DP page separators completely
|
ali@0
|
953 |
if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) {
|
ali@0
|
954 |
if (pswit[HEADER_SWITCH]) {
|
ali@0
|
955 |
if (!strncmp(aline, "Title:", 6))
|
ali@0
|
956 |
printf(" %s\n", aline);
|
ali@0
|
957 |
if (!strncmp (aline, "Author:", 7))
|
ali@0
|
958 |
printf(" %s\n", aline);
|
ali@0
|
959 |
if (!strncmp(aline, "Release Date:", 13))
|
ali@0
|
960 |
printf(" %s\n", aline);
|
ali@0
|
961 |
if (!strncmp(aline, "Edition:", 8))
|
ali@0
|
962 |
printf(" %s\n\n", aline);
|
ali@0
|
963 |
}
|
ali@0
|
964 |
continue; /* skip through the header */
|
ali@0
|
965 |
}
|
ali@0
|
966 |
checked_linecnt++;
|
ali@0
|
967 |
s = aline;
|
ali@0
|
968 |
isemptyline = 1; /* assume the line is empty until proven otherwise */
|
ali@0
|
969 |
|
ali@0
|
970 |
/* If we are in a state of unbalanced quotes, and this line */
|
ali@0
|
971 |
/* doesn't begin with a quote, output the stored error message */
|
ali@0
|
972 |
/* If the -P switch was used, print the warning even if the */
|
ali@0
|
973 |
/* new para starts with quotes */
|
ali@0
|
974 |
/* Version .20 - if the new paragraph does start with a quote, */
|
ali@0
|
975 |
/* but is indented, I was giving a spurious error. Need to */
|
ali@0
|
976 |
/* check the first _non-space_ character on the line rather */
|
ali@0
|
977 |
/* than the first character when deciding whether the para */
|
ali@0
|
978 |
/* starts with a quote. Using *t for this. */
|
ali@0
|
979 |
t = s;
|
ali@0
|
980 |
while (*t == ' ') t++;
|
ali@0
|
981 |
if (*dquote_err)
|
ali@0
|
982 |
if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) {
|
ali@0
|
983 |
if (!pswit[OVERVIEW_SWITCH]) {
|
ali@0
|
984 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
|
ali@0
|
985 |
printf(dquote_err);
|
ali@0
|
986 |
}
|
ali@0
|
987 |
else
|
ali@0
|
988 |
cnt_dquot++;
|
ali@0
|
989 |
}
|
ali@0
|
990 |
if (*squote_err) {
|
ali@0
|
991 |
if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) {
|
ali@0
|
992 |
if (!pswit[OVERVIEW_SWITCH]) {
|
ali@0
|
993 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
|
ali@0
|
994 |
printf(squote_err);
|
ali@0
|
995 |
}
|
ali@0
|
996 |
else
|
ali@0
|
997 |
cnt_squot++;
|
ali@0
|
998 |
}
|
ali@0
|
999 |
squot = 0;
|
ali@0
|
1000 |
}
|
ali@0
|
1001 |
if (*rbrack_err) {
|
ali@0
|
1002 |
if (!pswit[OVERVIEW_SWITCH]) {
|
ali@0
|
1003 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
|
ali@0
|
1004 |
printf(rbrack_err);
|
ali@0
|
1005 |
}
|
ali@0
|
1006 |
else
|
ali@0
|
1007 |
cnt_brack++;
|
ali@0
|
1008 |
}
|
ali@0
|
1009 |
if (*sbrack_err) {
|
ali@0
|
1010 |
if (!pswit[OVERVIEW_SWITCH]) {
|
ali@0
|
1011 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
|
ali@0
|
1012 |
printf(sbrack_err);
|
ali@0
|
1013 |
}
|
ali@0
|
1014 |
else
|
ali@0
|
1015 |
cnt_brack++;
|
ali@0
|
1016 |
}
|
ali@0
|
1017 |
if (*cbrack_err) {
|
ali@0
|
1018 |
if (!pswit[OVERVIEW_SWITCH]) {
|
ali@0
|
1019 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
|
ali@0
|
1020 |
printf(cbrack_err);
|
ali@0
|
1021 |
}
|
ali@0
|
1022 |
else
|
ali@0
|
1023 |
cnt_brack++;
|
ali@0
|
1024 |
}
|
ali@0
|
1025 |
if (*unders_err) {
|
ali@0
|
1026 |
if (!pswit[OVERVIEW_SWITCH]) {
|
ali@0
|
1027 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
|
ali@0
|
1028 |
printf(unders_err);
|
ali@0
|
1029 |
}
|
ali@0
|
1030 |
else
|
ali@0
|
1031 |
cnt_brack++;
|
ali@0
|
1032 |
}
|
ali@0
|
1033 |
|
ali@0
|
1034 |
*dquote_err = *squote_err = *rbrack_err = *cbrack_err =
|
ali@0
|
1035 |
*sbrack_err = *unders_err = 0;
|
ali@0
|
1036 |
|
ali@0
|
1037 |
|
ali@0
|
1038 |
/* look along the line, accumulate the count of quotes, and see */
|
ali@0
|
1039 |
/* if this is an empty line - i.e. a line with nothing on it */
|
ali@0
|
1040 |
/* but spaces. */
|
ali@0
|
1041 |
/* V .12 also if line has just spaces, * and/or - on it, don't */
|
ali@0
|
1042 |
/* count it, since empty lines with asterisks or dashes to */
|
ali@0
|
1043 |
/* separate sections are common. */
|
ali@0
|
1044 |
/* V .15 new single-quote checking - has to be better than the */
|
ali@0
|
1045 |
/* previous version, but how much better? fingers crossed! */
|
ali@0
|
1046 |
/* V .20 add period to * and - as characters on a separator line*/
|
ali@0
|
1047 |
s = aline;
|
ali@0
|
1048 |
while (*s) {
|
ali@0
|
1049 |
if (*s == CHAR_DQUOTE) quot++;
|
ali@0
|
1050 |
if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
|
ali@0
|
1051 |
if (s == aline) { /* at start of line, it can only be an openquote */
|
ali@0
|
1052 |
if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */
|
ali@0
|
1053 |
open_single_quote++;
|
ali@0
|
1054 |
}
|
ali@0
|
1055 |
else
|
ali@0
|
1056 |
if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
|
ali@0
|
1057 |
; /* do nothing! - it's definitely an apostrophe, not a quote */
|
ali@0
|
1058 |
else /* it's outside a word - let's check it out */
|
ali@0
|
1059 |
if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */
|
ali@0
|
1060 |
if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */
|
ali@0
|
1061 |
open_single_quote++;
|
ali@0
|
1062 |
}
|
ali@0
|
1063 |
else { /* now - is it a closequote? */
|
ali@0
|
1064 |
guessquote = 0; /* accumulate clues */
|
ali@0
|
1065 |
if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */
|
ali@0
|
1066 |
guessquote += 1;
|
ali@0
|
1067 |
if (*(s-1) == 's') { /* looks like a plural apostrophe */
|
ali@0
|
1068 |
guessquote -= 3;
|
ali@0
|
1069 |
if (*(s+1) == CHAR_SPACE) /* bonus marks! */
|
ali@0
|
1070 |
guessquote -= 2;
|
ali@0
|
1071 |
}
|
ali@0
|
1072 |
}
|
ali@0
|
1073 |
else /* it doesn't have a letter either side */
|
ali@0
|
1074 |
if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1))))
|
ali@0
|
1075 |
guessquote += 8; /* looks like a closequote */
|
ali@0
|
1076 |
else
|
ali@0
|
1077 |
guessquote += 1;
|
ali@0
|
1078 |
if (open_single_quote > close_single_quote)
|
ali@0
|
1079 |
guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */
|
ali@0
|
1080 |
else
|
ali@0
|
1081 |
guessquote -= 1;
|
ali@0
|
1082 |
if (guessquote >= 0)
|
ali@0
|
1083 |
close_single_quote++;
|
ali@0
|
1084 |
}
|
ali@0
|
1085 |
|
ali@0
|
1086 |
if (*s != CHAR_SPACE
|
ali@0
|
1087 |
&& *s != '-'
|
ali@0
|
1088 |
&& *s != '.'
|
ali@0
|
1089 |
&& *s != CHAR_ASTERISK
|
ali@0
|
1090 |
&& *s != 13
|
ali@0
|
1091 |
&& *s != 10) isemptyline = 0; /* ignore lines like * * * as spacers */
|
ali@0
|
1092 |
if (*s == CHAR_UNDERSCORE) c_unders++;
|
ali@0
|
1093 |
if (*s == CHAR_OPEN_CBRACK) c_brack++;
|
ali@0
|
1094 |
if (*s == CHAR_CLOSE_CBRACK) c_brack--;
|
ali@0
|
1095 |
if (*s == CHAR_OPEN_RBRACK) r_brack++;
|
ali@0
|
1096 |
if (*s == CHAR_CLOSE_RBRACK) r_brack--;
|
ali@0
|
1097 |
if (*s == CHAR_OPEN_SBRACK) s_brack++;
|
ali@0
|
1098 |
if (*s == CHAR_CLOSE_SBRACK) s_brack--;
|
ali@0
|
1099 |
s++;
|
ali@0
|
1100 |
}
|
ali@0
|
1101 |
|
ali@0
|
1102 |
if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph */
|
ali@0
|
1103 |
start_para_line = linecnt;
|
ali@0
|
1104 |
strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */
|
ali@0
|
1105 |
parastart[79] = 0;
|
ali@0
|
1106 |
dquotepar = squotepar = 0; /* restart the quote count 0.98 */
|
ali@0
|
1107 |
s = aline;
|
ali@0
|
1108 |
while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++; /* V.97 fixed bug - overran line and gave false warning - rare */
|
ali@0
|
1109 |
if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */
|
ali@0
|
1110 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1111 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1112 |
printf(" Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1);
|
ali@0
|
1113 |
else
|
ali@0
|
1114 |
cnt_punct++;
|
ali@0
|
1115 |
}
|
ali@0
|
1116 |
isnewpara = 0; /* Signal the end of new para processing */
|
ali@0
|
1117 |
}
|
ali@0
|
1118 |
|
ali@0
|
1119 |
/* Check for an em-dash broken at line end */
|
ali@0
|
1120 |
if (enddash && *aline == '-') {
|
ali@0
|
1121 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1122 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1123 |
printf(" Line %ld column 1 - Broken em-dash?\n", linecnt);
|
ali@0
|
1124 |
else
|
ali@0
|
1125 |
cnt_punct++;
|
ali@0
|
1126 |
}
|
ali@0
|
1127 |
enddash = 0;
|
ali@0
|
1128 |
for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--);
|
ali@0
|
1129 |
if (s >= aline && *s == '-')
|
ali@0
|
1130 |
enddash = 1;
|
ali@0
|
1131 |
|
ali@0
|
1132 |
|
ali@0
|
1133 |
/* Check for invalid or questionable characters in the line */
|
ali@0
|
1134 |
/* Anything above 127 is invalid for plain ASCII, and */
|
ali@0
|
1135 |
/* non-printable control characters should also be flagged. */
|
ali@0
|
1136 |
/* Tabs should generally not be there. */
|
ali@0
|
1137 |
/* Jan 06, in 0.99: Hm. For some strange reason, I either */
|
ali@0
|
1138 |
/* never created or deleted the check for unprintable */
|
ali@0
|
1139 |
/* control characters. They should be reported even if */
|
ali@0
|
1140 |
/* warn_bin is off, I think, and in full. */
|
ali@0
|
1141 |
|
ali@0
|
1142 |
for (s = aline; *s; s++) {
|
ali@0
|
1143 |
i = (unsigned char) *s;
|
ali@0
|
1144 |
if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) {
|
ali@0
|
1145 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1146 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1147 |
printf(" Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i);
|
ali@0
|
1148 |
else
|
ali@0
|
1149 |
cnt_bin++;
|
ali@0
|
1150 |
}
|
ali@0
|
1151 |
}
|
ali@0
|
1152 |
|
ali@0
|
1153 |
if (warn_bin) {
|
ali@0
|
1154 |
eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0; /* don't repeat multiple warnings on one line */
|
ali@0
|
1155 |
for (s = aline; *s; s++) {
|
ali@0
|
1156 |
if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) {
|
ali@0
|
1157 |
i = *s; /* annoying kludge for signed chars */
|
ali@0
|
1158 |
if (i < 0) i += 256;
|
ali@0
|
1159 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1160 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1161 |
if (i > 127 && i < 160)
|
ali@0
|
1162 |
printf(" Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i);
|
ali@0
|
1163 |
else
|
ali@0
|
1164 |
printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i);
|
ali@0
|
1165 |
else
|
ali@0
|
1166 |
cnt_bin++;
|
ali@0
|
1167 |
eNon_A = 1;
|
ali@0
|
1168 |
}
|
ali@0
|
1169 |
if (!eTab && *s == CHAR_TAB) {
|
ali@0
|
1170 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1171 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1172 |
printf(" Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1);
|
ali@0
|
1173 |
else
|
ali@0
|
1174 |
cnt_odd++;
|
ali@0
|
1175 |
eTab = 1;
|
ali@0
|
1176 |
}
|
ali@0
|
1177 |
if (!eTilde && *s == CHAR_TILDE) { /* often used by OCR software to indicate an unrecognizable character */
|
ali@0
|
1178 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1179 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1180 |
printf(" Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1);
|
ali@0
|
1181 |
else
|
ali@0
|
1182 |
cnt_odd++;
|
ali@0
|
1183 |
eTilde = 1;
|
ali@0
|
1184 |
}
|
ali@0
|
1185 |
if (!eCarat && *s == CHAR_CARAT) {
|
ali@0
|
1186 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1187 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1188 |
printf(" Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1);
|
ali@0
|
1189 |
else
|
ali@0
|
1190 |
cnt_odd++;
|
ali@0
|
1191 |
eCarat = 1;
|
ali@0
|
1192 |
}
|
ali@0
|
1193 |
if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) {
|
ali@0
|
1194 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1195 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1196 |
printf(" Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1);
|
ali@0
|
1197 |
else
|
ali@0
|
1198 |
cnt_odd++;
|
ali@0
|
1199 |
eFSlash = 1;
|
ali@0
|
1200 |
}
|
ali@0
|
1201 |
/* report asterisks only in paranoid mode, since they're often deliberate */
|
ali@0
|
1202 |
if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) {
|
ali@0
|
1203 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1204 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1205 |
printf(" Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1);
|
ali@0
|
1206 |
else
|
ali@0
|
1207 |
cnt_odd++;
|
ali@0
|
1208 |
eAst = 1;
|
ali@0
|
1209 |
}
|
ali@0
|
1210 |
}
|
ali@0
|
1211 |
}
|
ali@0
|
1212 |
|
ali@0
|
1213 |
/* Check for line too long */
|
ali@0
|
1214 |
if (warn_long) {
|
ali@0
|
1215 |
if (strlen(aline) > LONGEST_PG_LINE) {
|
ali@0
|
1216 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1217 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1218 |
printf(" Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline));
|
ali@0
|
1219 |
else
|
ali@0
|
1220 |
cnt_long++;
|
ali@0
|
1221 |
}
|
ali@0
|
1222 |
}
|
ali@0
|
1223 |
|
ali@0
|
1224 |
/* Check for line too short. */
|
ali@0
|
1225 |
/* This one is a bit trickier to implement: we don't want to */
|
ali@0
|
1226 |
/* flag the last line of a paragraph for being short, so we */
|
ali@0
|
1227 |
/* have to wait until we know that our current line is a */
|
ali@0
|
1228 |
/* "normal" line, then report the _previous_ line if it was too */
|
ali@0
|
1229 |
/* short. We also don't want to report indented lines like */
|
ali@0
|
1230 |
/* chapter heads or formatted quotations. We therefore keep */
|
ali@0
|
1231 |
/* lastlen as the length of the last line examined, and */
|
ali@0
|
1232 |
/* lastblen as the length of the last but one, and try to */
|
ali@0
|
1233 |
/* suppress unnecessary warnings by checking that both were of */
|
ali@0
|
1234 |
/* "normal" length. We keep the first character of the last */
|
ali@0
|
1235 |
/* line in laststart, and if it was a space, we assume that the */
|
ali@0
|
1236 |
/* formatting is deliberate. I can't figure out a way to */
|
ali@0
|
1237 |
/* distinguish something like a quoted verse left-aligned or */
|
ali@0
|
1238 |
/* the header or footer of a letter from a paragraph of short */
|
ali@0
|
1239 |
/* lines - maybe if I examined the whole paragraph, and if the */
|
ali@0
|
1240 |
/* para has less than, say, 8 lines and if all lines are short, */
|
ali@0
|
1241 |
/* then just assume it's OK? Need to look at some texts to see */
|
ali@0
|
1242 |
/* how often a formula like this would get the right result. */
|
ali@0
|
1243 |
/* V0.99 changed the tolerance for length to ignore from 2 to 1 */
|
ali@0
|
1244 |
if (warn_short) {
|
ali@0
|
1245 |
if (strlen(aline) > 1
|
ali@0
|
1246 |
&& lastlen > 1 && lastlen < SHORTEST_PG_LINE
|
ali@0
|
1247 |
&& lastblen > 1 && lastblen > SHORTEST_PG_LINE
|
ali@0
|
1248 |
&& laststart != CHAR_SPACE) {
|
ali@0
|
1249 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
|
ali@0
|
1250 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1251 |
printf(" Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline));
|
ali@0
|
1252 |
else
|
ali@0
|
1253 |
cnt_short++;
|
ali@0
|
1254 |
}
|
ali@0
|
1255 |
}
|
ali@0
|
1256 |
lastblen = lastlen;
|
ali@0
|
1257 |
lastlen = strlen(aline);
|
ali@0
|
1258 |
laststart = aline[0];
|
ali@0
|
1259 |
|
ali@0
|
1260 |
/* look for punctuation at start of line */
|
ali@0
|
1261 |
if (*aline && strchr(".?!,;:", aline[0])) { /* if it's punctuation */
|
ali@0
|
1262 |
if (strncmp(". . .", aline, 5)) { /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */
|
ali@0
|
1263 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1264 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1265 |
printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt);
|
ali@0
|
1266 |
else
|
ali@0
|
1267 |
cnt_punct++;
|
ali@0
|
1268 |
}
|
ali@0
|
1269 |
}
|
ali@0
|
1270 |
|
ali@0
|
1271 |
/* Check for spaced em-dashes */
|
ali@0
|
1272 |
/* V.20 must check _all_ occurrences of "--" on the line */
|
ali@0
|
1273 |
/* hence the loop - even if the first double-dash is OK */
|
ali@0
|
1274 |
/* there may be another that's wrong later on. */
|
ali@0
|
1275 |
if (warn_dash) {
|
ali@0
|
1276 |
s = aline;
|
ali@0
|
1277 |
while (strstr(s,"--")) {
|
ali@0
|
1278 |
if (*(strstr(s, "--")-1) == CHAR_SPACE ||
|
ali@0
|
1279 |
(*(strstr(s, "--")+2) == CHAR_SPACE)) {
|
ali@0
|
1280 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1281 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1282 |
printf(" Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1);
|
ali@0
|
1283 |
else
|
ali@0
|
1284 |
cnt_dash++;
|
ali@0
|
1285 |
}
|
ali@0
|
1286 |
s = strstr(s,"--") + 2;
|
ali@0
|
1287 |
}
|
ali@0
|
1288 |
}
|
ali@0
|
1289 |
|
ali@0
|
1290 |
/* Check for spaced dashes */
|
ali@0
|
1291 |
if (warn_dash)
|
ali@0
|
1292 |
if (strstr(aline," -")) {
|
ali@0
|
1293 |
if (*(strstr(aline, " -")+2) != '-') {
|
ali@0
|
1294 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1295 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1296 |
printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1);
|
ali@0
|
1297 |
else
|
ali@0
|
1298 |
cnt_dash++;
|
ali@0
|
1299 |
}
|
ali@0
|
1300 |
}
|
ali@0
|
1301 |
else
|
ali@0
|
1302 |
if (strstr(aline,"- ")) {
|
ali@0
|
1303 |
if (*(strstr(aline, "- ")-1) != '-') {
|
ali@0
|
1304 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1305 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1306 |
printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1);
|
ali@0
|
1307 |
else
|
ali@0
|
1308 |
cnt_dash++;
|
ali@0
|
1309 |
}
|
ali@0
|
1310 |
}
|
ali@0
|
1311 |
|
ali@0
|
1312 |
/* v 0.99 */
|
ali@0
|
1313 |
/* Check for unmarked paragraphs indicated by separate speakers */
|
ali@0
|
1314 |
/* May well be false positive: */
|
ali@0
|
1315 |
/* "Bravo!" "Wonderful!" called the crowd. */
|
ali@0
|
1316 |
/* but useful all the same. */
|
ali@0
|
1317 |
s = wrk;
|
ali@0
|
1318 |
*s = 0;
|
ali@0
|
1319 |
if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
|
ali@0
|
1320 |
if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
|
ali@0
|
1321 |
if (*s) {
|
ali@0
|
1322 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1323 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1324 |
printf(" Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1);
|
ali@0
|
1325 |
else
|
ali@0
|
1326 |
cnt_punct++;
|
ali@0
|
1327 |
}
|
ali@0
|
1328 |
|
ali@0
|
1329 |
|
ali@0
|
1330 |
|
ali@0
|
1331 |
/* Check for "to he" and other easy he/be errors */
|
ali@0
|
1332 |
/* This is a very inadequate effort on the he/be problem, */
|
ali@0
|
1333 |
/* but the phrase "to he" is always an error, whereas "to */
|
ali@0
|
1334 |
/* be" is quite common. I chuckle when it does catch one! */
|
ali@0
|
1335 |
/* Similarly, '"Quiet!", be said.' is a non-be error */
|
ali@0
|
1336 |
/* V .18 - "to he" is _not_ always an error!: */
|
ali@0
|
1337 |
/* "Where they went to he couldn't say." */
|
ali@0
|
1338 |
/* but I'm leaving it in anyway. */
|
ali@0
|
1339 |
/* V .20 Another false positive: */
|
ali@0
|
1340 |
/* What would "Cinderella" be without the . . . */
|
ali@0
|
1341 |
/* and another "If he wants to he can see for himself." */
|
ali@0
|
1342 |
/* V .21 Added " is be " and " be is " and " be was " */
|
ali@0
|
1343 |
/* V .99 Added jeebies code -- removed again. */
|
ali@0
|
1344 |
/* Is jeebies code worth adding? Rare to see he/be */
|
ali@0
|
1345 |
/* errors with modern OCR. Separate program? Yes! */
|
ali@0
|
1346 |
/* jeebies does the job without cluttering up this. */
|
ali@0
|
1347 |
/* We do get a few more queryable pairs from the */
|
ali@0
|
1348 |
/* project though -- they're cheap to implement. */
|
ali@0
|
1349 |
/* Also added a column number for guiguts. */
|
ali@0
|
1350 |
|
ali@0
|
1351 |
s = wrk;
|
ali@0
|
1352 |
*s = 0;
|
ali@0
|
1353 |
if (strstr(aline," to he ")) s = strstr(aline," to he ");
|
ali@0
|
1354 |
if (strstr(aline,"\" be ")) s = strstr(aline,"\" be ");
|
ali@0
|
1355 |
if (strstr(aline,"\", be ")) s = strstr(aline,"\", be ");
|
ali@0
|
1356 |
if (strstr(aline," is be ")) s = strstr(aline," is be ");
|
ali@0
|
1357 |
if (strstr(aline," be is ")) s = strstr(aline," be is ");
|
ali@0
|
1358 |
if (strstr(aline," was be ")) s = strstr(aline," was be ");
|
ali@0
|
1359 |
if (strstr(aline," be would ")) s = strstr(aline," be would ");
|
ali@0
|
1360 |
if (strstr(aline," be could ")) s = strstr(aline," be could ");
|
ali@0
|
1361 |
if (*s) {
|
ali@0
|
1362 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1363 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1364 |
printf(" Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1);
|
ali@0
|
1365 |
else
|
ali@0
|
1366 |
cnt_word++;
|
ali@0
|
1367 |
}
|
ali@0
|
1368 |
|
ali@0
|
1369 |
s = wrk;
|
ali@0
|
1370 |
*s = 0;
|
ali@0
|
1371 |
if (strstr(aline," i bad ")) s = strstr(aline," i bad ");
|
ali@0
|
1372 |
if (strstr(aline," you bad ")) s = strstr(aline," you bad ");
|
ali@0
|
1373 |
if (strstr(aline," he bad ")) s = strstr(aline," he bad ");
|
ali@0
|
1374 |
if (strstr(aline," she bad ")) s = strstr(aline," she bad ");
|
ali@0
|
1375 |
if (strstr(aline," they bad ")) s = strstr(aline," they bad ");
|
ali@0
|
1376 |
if (strstr(aline," a had ")) s = strstr(aline," a had ");
|
ali@0
|
1377 |
if (strstr(aline," the had ")) s = strstr(aline," the had ");
|
ali@0
|
1378 |
if (*s) {
|
ali@0
|
1379 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1380 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1381 |
printf(" Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1);
|
ali@0
|
1382 |
else
|
ali@0
|
1383 |
cnt_word++;
|
ali@0
|
1384 |
}
|
ali@0
|
1385 |
|
ali@0
|
1386 |
|
ali@0
|
1387 |
/* V .97 Added ", hut " Not too common, hut pretty certain */
|
ali@0
|
1388 |
/* V.99 changed to add a column number for guiguts */
|
ali@0
|
1389 |
s = wrk;
|
ali@0
|
1390 |
*s = 0;
|
ali@0
|
1391 |
if (strstr(aline,", hut ")) s = strstr(aline,", hut ");
|
ali@0
|
1392 |
if (strstr(aline,"; hut ")) s = strstr(aline,"; hut ");
|
ali@0
|
1393 |
if (*s) {
|
ali@0
|
1394 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1395 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1396 |
printf(" Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1);
|
ali@0
|
1397 |
else
|
ali@0
|
1398 |
cnt_word++;
|
ali@0
|
1399 |
}
|
ali@0
|
1400 |
|
ali@0
|
1401 |
/* Special case - angled bracket in front of "From" placed there by an MTA */
|
ali@0
|
1402 |
/* when sending an e-mail. V .21 */
|
ali@0
|
1403 |
if (strstr(aline, ">From")) {
|
ali@0
|
1404 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1405 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1406 |
printf(" Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1);
|
ali@0
|
1407 |
else
|
ali@0
|
1408 |
cnt_punct++;
|
ali@0
|
1409 |
}
|
ali@0
|
1410 |
|
ali@0
|
1411 |
/* V 0.98 Check for a single character line - often an overflow from bad wrapping. */
|
ali@0
|
1412 |
if (*aline && !*(aline+1)) {
|
ali@0
|
1413 |
if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline))
|
ali@0
|
1414 |
; /* nothing - ignore numerals alone on a line. */
|
ali@0
|
1415 |
else {
|
ali@0
|
1416 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1417 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1418 |
printf(" Line %ld column 1 - Query single character line\n", linecnt);
|
ali@0
|
1419 |
else
|
ali@0
|
1420 |
cnt_punct++;
|
ali@0
|
1421 |
}
|
ali@0
|
1422 |
}
|
ali@0
|
1423 |
|
ali@0
|
1424 |
/* V 0.98 Check for I" - often should be ! */
|
ali@0
|
1425 |
if (strstr(aline, " I\"")) {
|
ali@0
|
1426 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1427 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1428 |
printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline);
|
ali@0
|
1429 |
else
|
ali@0
|
1430 |
cnt_punct++;
|
ali@0
|
1431 |
}
|
ali@0
|
1432 |
|
ali@0
|
1433 |
/* V 0.98 Check for period without a capital letter. Cut-down from gutspell */
|
ali@0
|
1434 |
/* Only works when it happens on a single line. */
|
ali@0
|
1435 |
|
ali@0
|
1436 |
if (pswit[PARANOID_SWITCH])
|
ali@0
|
1437 |
for (t = s = aline; strstr(t,". ");) {
|
ali@0
|
1438 |
t = strstr(t, ". ");
|
ali@0
|
1439 |
if (t == s) {
|
ali@0
|
1440 |
t++;
|
ali@0
|
1441 |
continue; /* start of line punctuation is handled elsewhere */
|
ali@0
|
1442 |
}
|
ali@0
|
1443 |
if (!gcisalpha(*(t-1))) {
|
ali@0
|
1444 |
t++;
|
ali@0
|
1445 |
continue;
|
ali@0
|
1446 |
}
|
ali@0
|
1447 |
if (isDutch) { /* For Frank & Jeroen -- 's Middags case */
|
ali@0
|
1448 |
if (*(t+2) == CHAR_SQUOTE &&
|
ali@0
|
1449 |
*(t+3)>='a' && *(t+3)<='z' &&
|
ali@0
|
1450 |
*(t+4) == CHAR_SPACE &&
|
ali@0
|
1451 |
*(t+5)>='A' && *(t+5)<='Z') {
|
ali@0
|
1452 |
t++;
|
ali@0
|
1453 |
continue;
|
ali@0
|
1454 |
}
|
ali@0
|
1455 |
}
|
ali@0
|
1456 |
s1 = t+2;
|
ali@0
|
1457 |
while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
|
ali@0
|
1458 |
s1++;
|
ali@0
|
1459 |
if (*s1 >= 'a' && *s1 <= 'z') { /* we have something to investigate */
|
ali@0
|
1460 |
istypo = 1;
|
ali@0
|
1461 |
for (s1 = t - 1; s1 >= s &&
|
ali@0
|
1462 |
(gcisalpha(*s1) || gcisdigit(*s1) ||
|
ali@0
|
1463 |
(*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */
|
ali@0
|
1464 |
s1++;
|
ali@0
|
1465 |
for (i = 0; *s1 && *s1 != '.'; s1++, i++)
|
ali@0
|
1466 |
testword[i] = *s1;
|
ali@0
|
1467 |
testword[i] = 0;
|
ali@0
|
1468 |
for (i = 0; *abbrev[i]; i++)
|
ali@0
|
1469 |
if (!strcmp(testword, abbrev[i]))
|
ali@0
|
1470 |
istypo = 0;
|
ali@0
|
1471 |
// if (*testword >= 'A' && *testword <= 'Z')
|
ali@0
|
1472 |
// istypo = 0;
|
ali@0
|
1473 |
if (gcisdigit(*testword)) istypo = 0;
|
ali@0
|
1474 |
if (!*(testword+1)) istypo = 0;
|
ali@0
|
1475 |
if (isroman(testword)) istypo = 0;
|
ali@0
|
1476 |
if (istypo) {
|
ali@0
|
1477 |
istypo = 0;
|
ali@0
|
1478 |
for (i = 0; testword[i]; i++)
|
ali@0
|
1479 |
if (strchr(vowels, testword[i]))
|
ali@0
|
1480 |
istypo = 1;
|
ali@0
|
1481 |
}
|
ali@0
|
1482 |
if (istypo) {
|
ali@0
|
1483 |
isdup = 0;
|
ali@0
|
1484 |
if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
|
ali@0
|
1485 |
for (i = 0; i < qperiod_index; i++)
|
ali@0
|
1486 |
if (!strcmp(testword, qperiod[i])) {
|
ali@0
|
1487 |
isdup = 1;
|
ali@0
|
1488 |
}
|
ali@0
|
1489 |
if (!isdup) {
|
ali@0
|
1490 |
if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
|
ali@0
|
1491 |
strcpy(qperiod[qperiod_index], testword);
|
ali@0
|
1492 |
qperiod_index++;
|
ali@0
|
1493 |
}
|
ali@0
|
1494 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1495 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1496 |
printf(" Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1);
|
ali@0
|
1497 |
else
|
ali@0
|
1498 |
cnt_punct++;
|
ali@0
|
1499 |
}
|
ali@0
|
1500 |
}
|
ali@0
|
1501 |
}
|
ali@0
|
1502 |
t++;
|
ali@0
|
1503 |
}
|
ali@0
|
1504 |
|
ali@0
|
1505 |
|
ali@0
|
1506 |
if (pswit[TYPO_SWITCH]) { /* Should have put this condition in at the start of 0.99. Duh! */
|
ali@0
|
1507 |
/* Check for words usually not followed by punctuation 0.99 */
|
ali@0
|
1508 |
for (s = aline; *s;) {
|
ali@0
|
1509 |
wordstart = s;
|
ali@0
|
1510 |
s = getaword(s, inword);
|
ali@0
|
1511 |
if (!*inword) continue;
|
ali@0
|
1512 |
lowerit(inword);
|
ali@0
|
1513 |
for (i = 0; *nocomma[i]; i++)
|
ali@0
|
1514 |
if (!strcmp(inword, nocomma[i])) {
|
ali@0
|
1515 |
if (*s == ',' || *s == ';' || *s == ':') {
|
ali@0
|
1516 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1517 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1518 |
printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
|
ali@0
|
1519 |
else
|
ali@0
|
1520 |
cnt_punct++;
|
ali@0
|
1521 |
}
|
ali@0
|
1522 |
}
|
ali@0
|
1523 |
for (i = 0; *noperiod[i]; i++)
|
ali@0
|
1524 |
if (!strcmp(inword, noperiod[i])) {
|
ali@0
|
1525 |
if (*s == '.' || *s == '!') {
|
ali@0
|
1526 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1527 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1528 |
printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
|
ali@0
|
1529 |
else
|
ali@0
|
1530 |
cnt_punct++;
|
ali@0
|
1531 |
}
|
ali@0
|
1532 |
}
|
ali@0
|
1533 |
}
|
ali@0
|
1534 |
}
|
ali@0
|
1535 |
|
ali@0
|
1536 |
|
ali@0
|
1537 |
|
ali@0
|
1538 |
/* Check for commonly mistyped words, and digits like 0 for O in a word */
|
ali@0
|
1539 |
for (s = aline; *s;) {
|
ali@0
|
1540 |
wordstart = s;
|
ali@0
|
1541 |
s = getaword(s, inword);
|
ali@0
|
1542 |
if (!*inword) continue; /* don't bother with empty lines */
|
ali@0
|
1543 |
if (mixdigit(inword)) {
|
ali@0
|
1544 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1545 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1546 |
printf(" Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword);
|
ali@0
|
1547 |
else
|
ali@0
|
1548 |
cnt_word++;
|
ali@0
|
1549 |
}
|
ali@0
|
1550 |
|
ali@0
|
1551 |
/* put the word through a series of tests for likely typos and OCR errors */
|
ali@0
|
1552 |
/* V.21 I had allowed lots of typo-checking even with the typo switch */
|
ali@0
|
1553 |
/* turned off, but I really should disallow reporting of them when */
|
ali@0
|
1554 |
/* the switch is off. Hence the "if" below. */
|
ali@0
|
1555 |
if (pswit[TYPO_SWITCH]) {
|
ali@0
|
1556 |
istypo = 0;
|
ali@0
|
1557 |
strcpy(testword, inword);
|
ali@0
|
1558 |
alower = 0;
|
ali@0
|
1559 |
for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */
|
ali@0
|
1560 |
if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1;
|
ali@0
|
1561 |
if (alower && testword[i] >= 'A' && testword[i] <= 'Z') {
|
ali@0
|
1562 |
/* we have an uppercase mid-word. However, there are common cases: */
|
ali@0
|
1563 |
/* Mac and Mc like McGill */
|
ali@0
|
1564 |
/* French contractions like l'Abbe */
|
ali@0
|
1565 |
if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') ||
|
ali@0
|
1566 |
(i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') ||
|
ali@0
|
1567 |
(i > 0 && testword[i-1] == CHAR_SQUOTE))
|
ali@0
|
1568 |
; /* do nothing! */
|
ali@0
|
1569 |
|
ali@0
|
1570 |
else { /* V.97 - remove separate case of uppercase within word so that */
|
ali@0
|
1571 |
/* names like VanAllen fall into qword_index and get reported only once */
|
ali@0
|
1572 |
istypo = 1;
|
ali@0
|
1573 |
}
|
ali@0
|
1574 |
}
|
ali@0
|
1575 |
testword[i] = (char)tolower(testword[i]);
|
ali@0
|
1576 |
}
|
ali@0
|
1577 |
|
ali@0
|
1578 |
/* check for certain unlikely two-letter combinations at word start and end */
|
ali@0
|
1579 |
/* V.0.97 - this replaces individual hardcoded checks in previous versions */
|
ali@0
|
1580 |
if (strlen(testword) > 1) {
|
ali@0
|
1581 |
for (i = 0; *nostart[i]; i++)
|
ali@0
|
1582 |
if (!strncmp(testword, nostart[i], 2))
|
ali@0
|
1583 |
istypo = 1;
|
ali@0
|
1584 |
for (i = 0; *noend[i]; i++)
|
ali@0
|
1585 |
if (!strncmp(testword + strlen(testword) -2, noend[i], 2))
|
ali@0
|
1586 |
istypo = 1;
|
ali@0
|
1587 |
}
|
ali@0
|
1588 |
|
ali@0
|
1589 |
|
ali@0
|
1590 |
/* ght is common, gbt never. Like that. */
|
ali@0
|
1591 |
if (strstr(testword, "cb")) istypo = 1;
|
ali@0
|
1592 |
if (strstr(testword, "gbt")) istypo = 1;
|
ali@0
|
1593 |
if (strstr(testword, "pbt")) istypo = 1;
|
ali@0
|
1594 |
if (strstr(testword, "tbs")) istypo = 1;
|
ali@0
|
1595 |
if (strstr(testword, "mrn")) istypo = 1;
|
ali@0
|
1596 |
if (strstr(testword, "ahle")) istypo = 1;
|
ali@0
|
1597 |
if (strstr(testword, "ihle")) istypo = 1;
|
ali@0
|
1598 |
|
ali@0
|
1599 |
/* "TBE" does happen - like HEARTBEAT - but uncommon. */
|
ali@0
|
1600 |
/* Also "TBI" - frostbite, outbid - but uncommon. */
|
ali@0
|
1601 |
/* Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals, */
|
ali@0
|
1602 |
/* but these are covered in V.20. "ii" is a common scanno. */
|
ali@0
|
1603 |
if (strstr(testword, "tbi")) istypo = 1;
|
ali@0
|
1604 |
if (strstr(testword, "tbe")) istypo = 1;
|
ali@0
|
1605 |
if (strstr(testword, "ii")) istypo = 1;
|
ali@0
|
1606 |
|
ali@0
|
1607 |
/* check for no vowels or no consonants. */
|
ali@0
|
1608 |
/* If none, flag a typo */
|
ali@0
|
1609 |
if (!istypo && strlen(testword)>1) {
|
ali@0
|
1610 |
vowel = consonant = 0;
|
ali@0
|
1611 |
for (i = 0; testword[i]; i++)
|
ali@0
|
1612 |
if (testword[i] == 'y' || gcisdigit(testword[i])) { /* Yah, this is loose. */
|
ali@0
|
1613 |
vowel++;
|
ali@0
|
1614 |
consonant++;
|
ali@0
|
1615 |
}
|
ali@0
|
1616 |
else
|
ali@0
|
1617 |
if (strchr(vowels, testword[i])) vowel++;
|
ali@0
|
1618 |
else consonant++;
|
ali@0
|
1619 |
if (!vowel || !consonant) {
|
ali@0
|
1620 |
istypo = 1;
|
ali@0
|
1621 |
}
|
ali@0
|
1622 |
}
|
ali@0
|
1623 |
|
ali@0
|
1624 |
/* now exclude the word from being reported if it's in */
|
ali@0
|
1625 |
/* the okword list */
|
ali@0
|
1626 |
for (i = 0; *okword[i]; i++)
|
ali@0
|
1627 |
if (!strcmp(testword, okword[i]))
|
ali@0
|
1628 |
istypo = 0;
|
ali@0
|
1629 |
|
ali@0
|
1630 |
/* what looks like a typo may be a Roman numeral. Exclude these */
|
ali@0
|
1631 |
if (istypo)
|
ali@0
|
1632 |
if (isroman(testword))
|
ali@0
|
1633 |
istypo = 0;
|
ali@0
|
1634 |
|
ali@0
|
1635 |
/* check the manual list of typos */
|
ali@0
|
1636 |
if (!istypo)
|
ali@0
|
1637 |
for (i = 0; *typo[i]; i++)
|
ali@0
|
1638 |
if (!strcmp(testword, typo[i]))
|
ali@0
|
1639 |
istypo = 1;
|
ali@0
|
1640 |
|
ali@0
|
1641 |
|
ali@0
|
1642 |
/* V.21 - check lowercase s and l - special cases */
|
ali@0
|
1643 |
/* V.98 - added "i" and "m" */
|
ali@0
|
1644 |
/* V.99 - added "j" often a semi-colon gone wrong */
|
ali@0
|
1645 |
/* - and "d" for a missing apostrophe - he d */
|
ali@0
|
1646 |
/* - and "n" for "in" */
|
ali@0
|
1647 |
if (!istypo && strlen(testword) == 1)
|
ali@0
|
1648 |
if (strchr("slmijdn", *inword))
|
ali@0
|
1649 |
istypo = 1;
|
ali@0
|
1650 |
|
ali@0
|
1651 |
|
ali@0
|
1652 |
if (istypo) {
|
ali@0
|
1653 |
isdup = 0;
|
ali@0
|
1654 |
if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
|
ali@0
|
1655 |
for (i = 0; i < qword_index; i++)
|
ali@0
|
1656 |
if (!strcmp(testword, qword[i])) {
|
ali@0
|
1657 |
isdup = 1;
|
ali@0
|
1658 |
++dupcnt[i];
|
ali@0
|
1659 |
}
|
ali@0
|
1660 |
if (!isdup) {
|
ali@0
|
1661 |
if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
|
ali@0
|
1662 |
strcpy(qword[qword_index], testword);
|
ali@0
|
1663 |
qword_index++;
|
ali@0
|
1664 |
}
|
ali@0
|
1665 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1666 |
if (!pswit[OVERVIEW_SWITCH]) {
|
ali@0
|
1667 |
printf(" Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword);
|
ali@0
|
1668 |
if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
|
ali@0
|
1669 |
printf(" - not reporting duplicates");
|
ali@0
|
1670 |
printf("\n");
|
ali@0
|
1671 |
}
|
ali@0
|
1672 |
else
|
ali@0
|
1673 |
cnt_word++;
|
ali@0
|
1674 |
}
|
ali@0
|
1675 |
}
|
ali@0
|
1676 |
} /* end of typo-checking */
|
ali@0
|
1677 |
|
ali@0
|
1678 |
/* check the user's list of typos */
|
ali@0
|
1679 |
if (!istypo)
|
ali@0
|
1680 |
if (usertypo_count)
|
ali@0
|
1681 |
for (i = 0; i < usertypo_count; i++)
|
ali@0
|
1682 |
if (!strcmp(testword, usertypo[i])) {
|
ali@0
|
1683 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1684 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1685 |
printf(" Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
|
ali@0
|
1686 |
}
|
ali@0
|
1687 |
|
ali@0
|
1688 |
|
ali@0
|
1689 |
|
ali@0
|
1690 |
if (pswit[PARANOID_SWITCH] && warn_digit) { /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/
|
ali@0
|
1691 |
if (!strcmp(inword, "0") || !strcmp(inword, "1")) {
|
ali@0
|
1692 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1693 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1694 |
printf(" Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
|
ali@0
|
1695 |
else
|
ali@0
|
1696 |
cnt_word++;
|
ali@0
|
1697 |
}
|
ali@0
|
1698 |
}
|
ali@0
|
1699 |
}
|
ali@0
|
1700 |
|
ali@0
|
1701 |
/* look for added or missing spaces around punctuation and quotes */
|
ali@0
|
1702 |
/* If there is a punctuation character like ! with no space on */
|
ali@0
|
1703 |
/* either side, suspect a missing!space. If there are spaces on */
|
ali@0
|
1704 |
/* both sides , assume a typo. If we see a double quote with no */
|
ali@0
|
1705 |
/* space or punctuation on either side of it, assume unspaced */
|
ali@0
|
1706 |
/* quotes "like"this. */
|
ali@0
|
1707 |
llen = strlen(aline);
|
ali@0
|
1708 |
for (i = 1; i < llen; i++) { /* for each character in the line after the first */
|
ali@0
|
1709 |
if (strchr(".?!,;:_", aline[i])) { /* if it's punctuation */
|
ali@0
|
1710 |
isacro = 0; /* we need to suppress warnings for acronyms like M.D. */
|
ali@0
|
1711 |
isellipsis = 0; /* we need to suppress warnings for ellipsis . . . */
|
ali@0
|
1712 |
if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) || /* if there are letters on both sides of it or ... */
|
ali@0
|
1713 |
(gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */
|
ali@0
|
1714 |
if (aline[i] == '.') {
|
ali@0
|
1715 |
if (i > 2)
|
ali@0
|
1716 |
if (aline[i-2] == '.') isacro = 1;
|
ali@0
|
1717 |
if (i + 2 < llen)
|
ali@0
|
1718 |
if (aline[i+2] == '.') isacro = 1;
|
ali@0
|
1719 |
}
|
ali@0
|
1720 |
if (!isacro) {
|
ali@0
|
1721 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1722 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1723 |
printf(" Line %ld column %d - Missing space?\n", linecnt, i+1);
|
ali@0
|
1724 |
else
|
ali@0
|
1725 |
cnt_punct++;
|
ali@0
|
1726 |
}
|
ali@0
|
1727 |
}
|
ali@0
|
1728 |
if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */
|
ali@0
|
1729 |
if (aline[i] == '.') {
|
ali@0
|
1730 |
if (i > 2)
|
ali@0
|
1731 |
if (aline[i-2] == '.') isellipsis = 1;
|
ali@0
|
1732 |
if (i + 2 < llen)
|
ali@0
|
1733 |
if (aline[i+2] == '.') isellipsis = 1;
|
ali@0
|
1734 |
}
|
ali@0
|
1735 |
if (!isemptyline && !isellipsis) {
|
ali@0
|
1736 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1737 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1738 |
printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
|
ali@0
|
1739 |
else
|
ali@0
|
1740 |
cnt_punct++;
|
ali@0
|
1741 |
}
|
ali@0
|
1742 |
}
|
ali@0
|
1743 |
}
|
ali@0
|
1744 |
}
|
ali@0
|
1745 |
|
ali@0
|
1746 |
/* 0.98 -- split out the characters that CANNOT be preceded by space */
|
ali@0
|
1747 |
llen = strlen(aline);
|
ali@0
|
1748 |
for (i = 1; i < llen; i++) { /* for each character in the line after the first */
|
ali@0
|
1749 |
if (strchr("?!,;:", aline[i])) { /* if it's punctuation that _cannot_ have a space before it */
|
ali@0
|
1750 |
if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1] DOES == space, it was already reported just above */
|
ali@0
|
1751 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1752 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1753 |
printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
|
ali@0
|
1754 |
else
|
ali@0
|
1755 |
cnt_punct++;
|
ali@0
|
1756 |
}
|
ali@0
|
1757 |
}
|
ali@0
|
1758 |
}
|
ali@0
|
1759 |
|
ali@0
|
1760 |
|
ali@0
|
1761 |
/* 0.99 -- special case " .X" where X is any alpha. */
|
ali@0
|
1762 |
/* This plugs a hole in the acronym code above. Inelegant, but maintainable. */
|
ali@0
|
1763 |
llen = strlen(aline);
|
ali@0
|
1764 |
for (i = 1; i < llen; i++) { /* for each character in the line after the first */
|
ali@0
|
1765 |
if (aline[i] == '.') { /* if it's a period */
|
ali@0
|
1766 |
if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */
|
ali@0
|
1767 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1768 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1769 |
printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
|
ali@0
|
1770 |
else
|
ali@0
|
1771 |
cnt_punct++;
|
ali@0
|
1772 |
}
|
ali@0
|
1773 |
}
|
ali@0
|
1774 |
}
|
ali@0
|
1775 |
|
ali@0
|
1776 |
|
ali@0
|
1777 |
|
ali@0
|
1778 |
|
ali@0
|
1779 |
/* v.21 breaking out the search for unspaced doublequotes */
|
ali@0
|
1780 |
/* This is not as efficient, but it's more maintainable */
|
ali@0
|
1781 |
/* V.97 added underscore to the list of characters not to query, */
|
ali@0
|
1782 |
/* since underscores are commonly used as italics indicators. */
|
ali@0
|
1783 |
/* V.98 Added slash as well, same reason. */
|
ali@0
|
1784 |
for (i = 1; i < llen; i++) { /* for each character in the line after the first */
|
ali@0
|
1785 |
if (aline[i] == CHAR_DQUOTE) {
|
ali@0
|
1786 |
if ((!strchr(" _-.'`,;:!/([{?}])", aline[i-1]) &&
|
ali@0
|
1787 |
!strchr(" _-.'`,;:!/([{?}])", aline[i+1]) &&
|
ali@0
|
1788 |
aline[i+1] != 0
|
ali@0
|
1789 |
|| (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) {
|
ali@0
|
1790 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1791 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1792 |
printf(" Line %ld column %d - Unspaced quotes?\n", linecnt, i+1);
|
ali@0
|
1793 |
else
|
ali@0
|
1794 |
cnt_punct++;
|
ali@0
|
1795 |
}
|
ali@0
|
1796 |
}
|
ali@0
|
1797 |
}
|
ali@0
|
1798 |
|
ali@0
|
1799 |
|
ali@0
|
1800 |
/* v.98 check parity of quotes */
|
ali@0
|
1801 |
/* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". */
|
ali@0
|
1802 |
for (s = aline; *s; s++) {
|
ali@0
|
1803 |
if (*s == CHAR_DQUOTE) {
|
ali@0
|
1804 |
if (!(dquotepar = !dquotepar)) { /* parity even */
|
ali@0
|
1805 |
if (!strchr("_-.'`/,;:!?)]} ", *(s+1))) {
|
ali@0
|
1806 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1807 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1808 |
printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
|
ali@0
|
1809 |
else
|
ali@0
|
1810 |
cnt_punct++;
|
ali@0
|
1811 |
}
|
ali@0
|
1812 |
}
|
ali@0
|
1813 |
else { /* parity odd */
|
ali@0
|
1814 |
if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$", *(s+1)) || !*(s+1)) {
|
ali@0
|
1815 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1816 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1817 |
printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
|
ali@0
|
1818 |
else
|
ali@0
|
1819 |
cnt_punct++;
|
ali@0
|
1820 |
}
|
ali@0
|
1821 |
}
|
ali@0
|
1822 |
}
|
ali@0
|
1823 |
}
|
ali@0
|
1824 |
|
ali@0
|
1825 |
if (*aline == CHAR_DQUOTE) {
|
ali@0
|
1826 |
if (strchr(",;:!?)]} ", aline[1])) {
|
ali@0
|
1827 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1828 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1829 |
printf(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
|
ali@0
|
1830 |
else
|
ali@0
|
1831 |
cnt_punct++;
|
ali@0
|
1832 |
}
|
ali@0
|
1833 |
}
|
ali@0
|
1834 |
|
ali@0
|
1835 |
if (pswit[SQUOTE_SWITCH])
|
ali@0
|
1836 |
for (s = aline; *s; s++) {
|
ali@0
|
1837 |
if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
|
ali@0
|
1838 |
&& ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) {
|
ali@0
|
1839 |
if (!(squotepar = !squotepar)) { /* parity even */
|
ali@0
|
1840 |
if (!strchr("_-.'`/\",;:!?)]} ", *(s+1))) {
|
ali@0
|
1841 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1842 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1843 |
printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
|
ali@0
|
1844 |
else
|
ali@0
|
1845 |
cnt_punct++;
|
ali@0
|
1846 |
}
|
ali@0
|
1847 |
}
|
ali@0
|
1848 |
else { /* parity odd */
|
ali@0
|
1849 |
if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`", *(s+1)) || !*(s+1)) {
|
ali@0
|
1850 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1851 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1852 |
printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
|
ali@0
|
1853 |
else
|
ali@0
|
1854 |
cnt_punct++;
|
ali@0
|
1855 |
}
|
ali@0
|
1856 |
}
|
ali@0
|
1857 |
}
|
ali@0
|
1858 |
}
|
ali@0
|
1859 |
|
ali@0
|
1860 |
|
ali@0
|
1861 |
/* v.20 also look for double punctuation like ,. or ,, */
|
ali@0
|
1862 |
/* Thanks to DW for the suggestion! */
|
ali@0
|
1863 |
/* I'm putting this in a separate loop for clarity */
|
ali@0
|
1864 |
/* In books with references, ".," and ".;" are common */
|
ali@0
|
1865 |
/* e.g. "etc., etc.," and vol. 1.; vol 3.; */
|
ali@0
|
1866 |
/* OTOH, from my initial tests, there are also fairly */
|
ali@0
|
1867 |
/* common errors. What to do? Make these cases paranoid? */
|
ali@0
|
1868 |
/* V.21 ".," is the most common, so invented warn_dotcomma */
|
ali@0
|
1869 |
/* to suppress detailed reporting if it occurs often */
|
ali@0
|
1870 |
llen = strlen(aline);
|
ali@0
|
1871 |
for (i = 0; i < llen; i++) /* for each character in the line */
|
ali@0
|
1872 |
if (strchr(".?!,;:", aline[i]) /* if it's punctuation */
|
ali@0
|
1873 |
&& (strchr(".?!,;:", aline[i+1]))
|
ali@0
|
1874 |
&& aline[i] && aline[i+1]) /* followed by punctuation, it's a query, unless . . . */
|
ali@0
|
1875 |
if (
|
ali@0
|
1876 |
(aline[i] == aline[i+1]
|
ali@0
|
1877 |
&& (aline[i] == '.' || aline[i] == '?' || aline[i] == '!'))
|
ali@0
|
1878 |
|| (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',')
|
ali@0
|
1879 |
|| (isFrench && !strncmp(aline+i, ",...", 4))
|
ali@0
|
1880 |
|| (isFrench && !strncmp(aline+i, "...,", 4))
|
ali@0
|
1881 |
|| (isFrench && !strncmp(aline+i, ";...", 4))
|
ali@0
|
1882 |
|| (isFrench && !strncmp(aline+i, "...;", 4))
|
ali@0
|
1883 |
|| (isFrench && !strncmp(aline+i, ":...", 4))
|
ali@0
|
1884 |
|| (isFrench && !strncmp(aline+i, "...:", 4))
|
ali@0
|
1885 |
|| (isFrench && !strncmp(aline+i, "!...", 4))
|
ali@0
|
1886 |
|| (isFrench && !strncmp(aline+i, "...!", 4))
|
ali@0
|
1887 |
|| (isFrench && !strncmp(aline+i, "?...", 4))
|
ali@0
|
1888 |
|| (isFrench && !strncmp(aline+i, "...?", 4))
|
ali@0
|
1889 |
) {
|
ali@0
|
1890 |
if ((isFrench && !strncmp(aline+i, ",...", 4)) /* could this BE any more awkward? */
|
ali@0
|
1891 |
|| (isFrench && !strncmp(aline+i, "...,", 4))
|
ali@0
|
1892 |
|| (isFrench && !strncmp(aline+i, ";...", 4))
|
ali@0
|
1893 |
|| (isFrench && !strncmp(aline+i, "...;", 4))
|
ali@0
|
1894 |
|| (isFrench && !strncmp(aline+i, ":...", 4))
|
ali@0
|
1895 |
|| (isFrench && !strncmp(aline+i, "...:", 4))
|
ali@0
|
1896 |
|| (isFrench && !strncmp(aline+i, "!...", 4))
|
ali@0
|
1897 |
|| (isFrench && !strncmp(aline+i, "...!", 4))
|
ali@0
|
1898 |
|| (isFrench && !strncmp(aline+i, "?...", 4))
|
ali@0
|
1899 |
|| (isFrench && !strncmp(aline+i, "...?", 4)))
|
ali@0
|
1900 |
i +=4;
|
ali@0
|
1901 |
; /* do nothing for .. !! and ?? which can be legit */
|
ali@0
|
1902 |
}
|
ali@0
|
1903 |
else {
|
ali@0
|
1904 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1905 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1906 |
printf(" Line %ld column %d - Double punctuation?\n", linecnt, i+1);
|
ali@0
|
1907 |
else
|
ali@0
|
1908 |
cnt_punct++;
|
ali@0
|
1909 |
}
|
ali@0
|
1910 |
|
ali@0
|
1911 |
/* v.21 breaking out the search for spaced doublequotes */
|
ali@0
|
1912 |
/* This is not as efficient, but it's more maintainable */
|
ali@0
|
1913 |
s = aline;
|
ali@0
|
1914 |
while (strstr(s," \" ")) {
|
ali@0
|
1915 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1916 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1917 |
printf(" Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1));
|
ali@0
|
1918 |
else
|
ali@0
|
1919 |
cnt_punct++;
|
ali@0
|
1920 |
s = strstr(s," \" ") + 2;
|
ali@0
|
1921 |
}
|
ali@0
|
1922 |
|
ali@0
|
1923 |
/* v.20 also look for spaced singlequotes ' and ` */
|
ali@0
|
1924 |
s = aline;
|
ali@0
|
1925 |
while (strstr(s," ' ")) {
|
ali@0
|
1926 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1927 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1928 |
printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1));
|
ali@0
|
1929 |
else
|
ali@0
|
1930 |
cnt_punct++;
|
ali@0
|
1931 |
s = strstr(s," ' ") + 2;
|
ali@0
|
1932 |
}
|
ali@0
|
1933 |
|
ali@0
|
1934 |
s = aline;
|
ali@0
|
1935 |
while (strstr(s," ` ")) {
|
ali@0
|
1936 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1937 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1938 |
printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1));
|
ali@0
|
1939 |
else
|
ali@0
|
1940 |
cnt_punct++;
|
ali@0
|
1941 |
s = strstr(s," ` ") + 2;
|
ali@0
|
1942 |
}
|
ali@0
|
1943 |
|
ali@0
|
1944 |
/* v.99 check special case of 'S instead of 's at end of word */
|
ali@0
|
1945 |
s = aline + 1;
|
ali@0
|
1946 |
while (*s) {
|
ali@0
|
1947 |
if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z') {
|
ali@0
|
1948 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1949 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1950 |
printf(" Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2));
|
ali@0
|
1951 |
else
|
ali@0
|
1952 |
cnt_punct++;
|
ali@0
|
1953 |
}
|
ali@0
|
1954 |
s++;
|
ali@0
|
1955 |
}
|
ali@0
|
1956 |
|
ali@0
|
1957 |
|
ali@0
|
1958 |
/* v.21 Now check special cases - start and end of line - */
|
ali@0
|
1959 |
/* for single and double quotes. Start is sometimes [sic] */
|
ali@0
|
1960 |
/* but better to query it anyway. */
|
ali@0
|
1961 |
/* While I'm here, check for dash at end of line */
|
ali@0
|
1962 |
llen = strlen(aline);
|
ali@0
|
1963 |
if (llen > 1) {
|
ali@0
|
1964 |
if (aline[llen-1] == CHAR_DQUOTE ||
|
ali@0
|
1965 |
aline[llen-1] == CHAR_SQUOTE ||
|
ali@0
|
1966 |
aline[llen-1] == CHAR_OPEN_SQUOTE)
|
ali@0
|
1967 |
if (aline[llen-2] == CHAR_SPACE) {
|
ali@0
|
1968 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1969 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1970 |
printf(" Line %ld column %d - Spaced quote?\n", linecnt, llen);
|
ali@0
|
1971 |
else
|
ali@0
|
1972 |
cnt_punct++;
|
ali@0
|
1973 |
}
|
ali@0
|
1974 |
|
ali@0
|
1975 |
/* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */
|
ali@0
|
1976 |
/* Wrongspaced quotes test also catches it for " */
|
ali@0
|
1977 |
if (aline[0] == CHAR_SQUOTE ||
|
ali@0
|
1978 |
aline[0] == CHAR_OPEN_SQUOTE)
|
ali@0
|
1979 |
if (aline[1] == CHAR_SPACE) {
|
ali@0
|
1980 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1981 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1982 |
printf(" Line %ld column 1 - Spaced quote?\n", linecnt);
|
ali@0
|
1983 |
else
|
ali@0
|
1984 |
cnt_punct++;
|
ali@0
|
1985 |
}
|
ali@0
|
1986 |
/* dash at end of line may well be legit - paranoid mode only */
|
ali@0
|
1987 |
/* and don't report em-dash at line-end */
|
ali@0
|
1988 |
if (pswit[PARANOID_SWITCH] && warn_hyphen) {
|
ali@0
|
1989 |
for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
|
ali@0
|
1990 |
if (aline[i] == '-' && aline[i-1] != '-') {
|
ali@0
|
1991 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
1992 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
1993 |
printf(" Line %ld column %d - Hyphen at end of line?\n", linecnt, i);
|
ali@0
|
1994 |
}
|
ali@0
|
1995 |
}
|
ali@0
|
1996 |
}
|
ali@0
|
1997 |
|
ali@0
|
1998 |
/* v.21 also look for brackets surrounded by alpha */
|
ali@0
|
1999 |
/* Brackets are often unspaced, but shouldn't be surrounded by alpha. */
|
ali@0
|
2000 |
/* If so, suspect a scanno like "a]most" */
|
ali@0
|
2001 |
llen = strlen(aline);
|
ali@0
|
2002 |
for (i = 1; i < llen-1; i++) { /* for each character in the line except 1st & last*/
|
ali@0
|
2003 |
if (strchr("{[()]}", aline[i]) /* if it's a bracket */
|
ali@0
|
2004 |
&& gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) {
|
ali@0
|
2005 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
2006 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
2007 |
printf(" Line %ld column %d - Unspaced bracket?\n", linecnt, i);
|
ali@0
|
2008 |
else
|
ali@0
|
2009 |
cnt_punct++;
|
ali@0
|
2010 |
}
|
ali@0
|
2011 |
}
|
ali@0
|
2012 |
/* The "Cinderella" case, back in again! :-S Give it another shot */
|
ali@0
|
2013 |
if (warn_endquote) {
|
ali@0
|
2014 |
llen = strlen(aline);
|
ali@0
|
2015 |
for (i = 1; i < llen; i++) { /* for each character in the line except 1st */
|
ali@0
|
2016 |
if (aline[i] == CHAR_DQUOTE)
|
ali@0
|
2017 |
if (isalpha(aline[i-1])) {
|
ali@0
|
2018 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
2019 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
2020 |
printf(" Line %ld column %d - endquote missing punctuation?\n", linecnt, i);
|
ali@0
|
2021 |
else
|
ali@0
|
2022 |
cnt_punct++;
|
ali@0
|
2023 |
}
|
ali@0
|
2024 |
}
|
ali@0
|
2025 |
}
|
ali@0
|
2026 |
|
ali@0
|
2027 |
llen = strlen(aline);
|
ali@0
|
2028 |
|
ali@0
|
2029 |
/* Check for <HTML TAG> */
|
ali@0
|
2030 |
/* If there is a < in the line, followed at some point */
|
ali@0
|
2031 |
/* by a > then we suspect HTML */
|
ali@0
|
2032 |
if (strstr(aline, "<") && strstr(aline, ">")) {
|
ali@0
|
2033 |
i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
|
ali@0
|
2034 |
if (i > 0) {
|
ali@0
|
2035 |
strncpy(wrk, strstr(aline, "<"), i);
|
ali@0
|
2036 |
wrk[i] = 0;
|
ali@0
|
2037 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
2038 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
2039 |
printf(" Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk);
|
ali@0
|
2040 |
else
|
ali@0
|
2041 |
cnt_html++;
|
ali@0
|
2042 |
}
|
ali@0
|
2043 |
}
|
ali@0
|
2044 |
|
ali@0
|
2045 |
/* Check for &symbol; HTML */
|
ali@0
|
2046 |
/* If there is a & in the line, followed at */
|
ali@0
|
2047 |
/* some point by a ; then we suspect HTML */
|
ali@0
|
2048 |
if (strstr(aline, "&") && strstr(aline, ";")) {
|
ali@0
|
2049 |
i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1);
|
ali@0
|
2050 |
for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++)
|
ali@0
|
2051 |
if (*s == CHAR_SPACE) i = 0; /* 0.99 don't report "Jones & Son;" */
|
ali@0
|
2052 |
if (i > 0) {
|
ali@0
|
2053 |
strncpy(wrk, strstr(aline,"&"), i);
|
ali@0
|
2054 |
wrk[i] = 0;
|
ali@0
|
2055 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
|
ali@0
|
2056 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
2057 |
printf(" Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk);
|
ali@0
|
2058 |
else
|
ali@0
|
2059 |
cnt_html++;
|
ali@0
|
2060 |
}
|
ali@0
|
2061 |
}
|
ali@0
|
2062 |
|
ali@0
|
2063 |
/* At end of paragraph, check for mismatched quotes. */
|
ali@0
|
2064 |
/* We don't want to report an error immediately, since it is a */
|
ali@0
|
2065 |
/* common convention to omit the quotes at end of paragraph if */
|
ali@0
|
2066 |
/* the next paragraph is a continuation of the same speaker. */
|
ali@0
|
2067 |
/* Where this is the case, the next para should begin with a */
|
ali@0
|
2068 |
/* quote, so we store the warning message and only display it */
|
ali@0
|
2069 |
/* at the top of the next iteration if the new para doesn't */
|
ali@0
|
2070 |
/* start with a quote. */
|
ali@0
|
2071 |
/* The -p switch overrides this default, and warns of unclosed */
|
ali@0
|
2072 |
/* quotes on _every_ paragraph, whether the next begins with a */
|
ali@0
|
2073 |
/* quote or not. */
|
ali@0
|
2074 |
/* Version .16 - only report mismatched single quotes if */
|
ali@0
|
2075 |
/* an open_single_quotes was found. */
|
ali@0
|
2076 |
|
ali@0
|
2077 |
if (isemptyline) { /* end of para - add up the totals */
|
ali@0
|
2078 |
if (quot % 2)
|
ali@0
|
2079 |
sprintf(dquote_err, " Line %ld - Mismatched quotes\n", linecnt);
|
ali@0
|
2080 |
if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) )
|
ali@0
|
2081 |
sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt);
|
ali@0
|
2082 |
if (pswit[SQUOTE_SWITCH] && open_single_quote
|
ali@0
|
2083 |
&& (open_single_quote != close_single_quote)
|
ali@0
|
2084 |
&& (open_single_quote != close_single_quote +1) )
|
ali@0
|
2085 |
squot = 1; /* flag it to be noted regardless of the first char of the next para */
|
ali@0
|
2086 |
if (r_brack)
|
ali@0
|
2087 |
sprintf(rbrack_err, " Line %ld - Mismatched round brackets?\n", linecnt);
|
ali@0
|
2088 |
if (s_brack)
|
ali@0
|
2089 |
sprintf(sbrack_err, " Line %ld - Mismatched square brackets?\n", linecnt);
|
ali@0
|
2090 |
if (c_brack)
|
ali@0
|
2091 |
sprintf(cbrack_err, " Line %ld - Mismatched curly brackets?\n", linecnt);
|
ali@0
|
2092 |
if (c_unders % 2)
|
ali@0
|
2093 |
sprintf(unders_err, " Line %ld - Mismatched underscores?\n", linecnt);
|
ali@0
|
2094 |
quot = s_brack = c_brack = r_brack = c_unders =
|
ali@0
|
2095 |
open_single_quote = close_single_quote = 0;
|
ali@0
|
2096 |
isnewpara = 1; /* let the next iteration know that it's starting a new para */
|
ali@0
|
2097 |
}
|
ali@0
|
2098 |
|
ali@0
|
2099 |
/* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */
|
ali@0
|
2100 |
/* by working back through prevline. DW. */
|
ali@0
|
2101 |
/* Hmmm. Need to check this only for "normal" paras. */
|
ali@0
|
2102 |
/* So what is a "normal" para? ouch! */
|
ali@0
|
2103 |
/* Not normal if one-liner (chapter headings, etc.) */
|
ali@0
|
2104 |
/* Not normal if doesn't contain at least one locase letter */
|
ali@0
|
2105 |
/* Not normal if starts with space */
|
ali@0
|
2106 |
|
ali@0
|
2107 |
/* 0.99 tighten up on para end checks. Disallow comma and */
|
ali@0
|
2108 |
/* semi-colon. Check for legit para end before quotes. */
|
ali@0
|
2109 |
if (isemptyline) { /* end of para */
|
ali@0
|
2110 |
for (s = prevline, i = 0; *s && !i; s++)
|
ali@0
|
2111 |
if (gcisletter(*s))
|
ali@0
|
2112 |
i = 1; /* use i to indicate the presence of a letter on the line */
|
ali@0
|
2113 |
/* This next "if" is a problem. */
|
ali@0
|
2114 |
/* If I say "start_para_line <= linecnt - 1", that includes one-line */
|
ali@0
|
2115 |
/* "paragraphs" like chapter heads. Lotsa false positives. */
|
ali@0
|
2116 |
/* If I say "start_para_line < linecnt - 1" it doesn't, but then it */
|
ali@0
|
2117 |
/* misses genuine one-line paragraphs. */
|
ali@0
|
2118 |
/* So what do I do? */
|
ali@0
|
2119 |
if (i
|
ali@0
|
2120 |
&& lastblen > 2
|
ali@0
|
2121 |
&& start_para_line < linecnt - 1
|
ali@0
|
2122 |
&& *prevline > CHAR_SPACE
|
ali@0
|
2123 |
) {
|
ali@0
|
2124 |
for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--);
|
ali@0
|
2125 |
for ( ; i > 0; i--) {
|
ali@0
|
2126 |
if (gcisalpha(prevline[i])) {
|
ali@0
|
2127 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
|
ali@0
|
2128 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
2129 |
printf(" Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline));
|
ali@0
|
2130 |
else
|
ali@0
|
2131 |
cnt_punct++;
|
ali@0
|
2132 |
break;
|
ali@0
|
2133 |
}
|
ali@0
|
2134 |
if (strchr("-.:!([{?}])", prevline[i]))
|
ali@0
|
2135 |
break;
|
ali@0
|
2136 |
}
|
ali@0
|
2137 |
}
|
ali@0
|
2138 |
}
|
ali@0
|
2139 |
strcpy(prevline, aline);
|
ali@0
|
2140 |
}
|
ali@0
|
2141 |
fclose (infile);
|
ali@0
|
2142 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
2143 |
for (i = 0; i < MAX_QWORD; i++)
|
ali@0
|
2144 |
if (dupcnt[i])
|
ali@0
|
2145 |
printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s");
|
ali@0
|
2146 |
}
|
ali@0
|
2147 |
|
ali@0
|
2148 |
|
ali@0
|
2149 |
|
ali@0
|
2150 |
/* flgets - get one line from the input stream, checking for */
|
ali@0
|
2151 |
/* the existence of exactly one CR/LF line-end per line. */
|
ali@0
|
2152 |
/* Returns a pointer to the line. */
|
ali@0
|
2153 |
|
ali@0
|
2154 |
char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt)
|
ali@0
|
2155 |
{
|
ali@0
|
2156 |
char c;
|
ali@0
|
2157 |
int len, isCR, cint;
|
ali@0
|
2158 |
|
ali@0
|
2159 |
*theline = 0;
|
ali@0
|
2160 |
len = isCR = 0;
|
ali@0
|
2161 |
c = cint = fgetc(thefile);
|
ali@0
|
2162 |
do {
|
ali@0
|
2163 |
if (cint == EOF)
|
ali@0
|
2164 |
return (NULL);
|
ali@0
|
2165 |
if (c == 10) /* either way, it's end of line */
|
ali@0
|
2166 |
if (isCR)
|
ali@0
|
2167 |
break;
|
ali@0
|
2168 |
else { /* Error - a LF without a preceding CR */
|
ali@0
|
2169 |
if (pswit[LINE_END_SWITCH]) {
|
ali@0
|
2170 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
|
ali@0
|
2171 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
2172 |
printf(" Line %ld - No CR?\n", lcnt);
|
ali@0
|
2173 |
else
|
ali@0
|
2174 |
cnt_lineend++;
|
ali@0
|
2175 |
}
|
ali@0
|
2176 |
break;
|
ali@0
|
2177 |
}
|
ali@0
|
2178 |
if (c == 13) {
|
ali@0
|
2179 |
if (isCR) { /* Error - two successive CRs */
|
ali@0
|
2180 |
if (pswit[LINE_END_SWITCH]) {
|
ali@0
|
2181 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
|
ali@0
|
2182 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
2183 |
printf(" Line %ld - Two successive CRs?\n", lcnt);
|
ali@0
|
2184 |
else
|
ali@0
|
2185 |
cnt_lineend++;
|
ali@0
|
2186 |
}
|
ali@0
|
2187 |
}
|
ali@0
|
2188 |
isCR = 1;
|
ali@0
|
2189 |
}
|
ali@0
|
2190 |
else {
|
ali@0
|
2191 |
if (pswit[LINE_END_SWITCH] && isCR) {
|
ali@0
|
2192 |
if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
|
ali@0
|
2193 |
if (!pswit[OVERVIEW_SWITCH])
|
ali@0
|
2194 |
printf(" Line %ld column %d - CR without LF?\n", lcnt, len+1);
|
ali@0
|
2195 |
else
|
ali@0
|
2196 |
cnt_lineend++;
|
ali@0
|
2197 |
}
|
ali@0
|
2198 |
theline[len] = c;
|
ali@0
|
2199 |
len++;
|
ali@0
|
2200 |
theline[len] = 0;
|
ali@0
|
2201 |
isCR = 0;
|
ali@0
|
2202 |
}
|
ali@0
|
2203 |
c = cint = fgetc(thefile);
|
ali@0
|
2204 |
} while(len < maxlen);
|
ali@0
|
2205 |
if (pswit[MARKUP_SWITCH])
|
ali@0
|
2206 |
postprocess_for_HTML(theline);
|
ali@0
|
2207 |
if (pswit[DP_SWITCH])
|
ali@0
|
2208 |
postprocess_for_DP(theline);
|
ali@0
|
2209 |
return(theline);
|
ali@0
|
2210 |
}
|
ali@0
|
2211 |
|
ali@0
|
2212 |
|
ali@0
|
2213 |
|
ali@0
|
2214 |
|
ali@0
|
2215 |
/* mixdigit - takes a "word" as a parameter, and checks whether it   */
/* contains a mixture of alpha and digits. Generally, this is an     */
/* error, but may not be for cases like 4th or L5 12s. 3d.           */
/*                                                                   */
/* Accepted (not queried) forms:                                     */
/*   digits + st/rd/nd/th         e.g. 21st                          */
/*   digits + sts/rds/nds/ths     e.g. 4ths                          */
/*   digits + stly/rdly/ndly/thly e.g. 1stly                         */
/*   digits + l, L, s or d        e.g. 12l. 3s. 11d.                 */
/*   L + digits                   e.g. L500                          */
/* The suffix comparisons are case-insensitive (matchword).          */
/*                                                                   */
/* Returns 0 if no error found, 1 if error.                          */

int mixdigit(char *checkword)   /* check for digits like 1 or 0 in words */
{
    int wehaveadigit, wehavealetter, firstdigits, query, wl;
    char *s;

    wehaveadigit = wehavealetter = query = 0;
    for (s = checkword; *s; s++) {
        if (gcisalpha(*s))
            wehavealetter = 1;
        else if (gcisdigit(*s))
            wehaveadigit = 1;
    }

    if (wehaveadigit && wehavealetter) {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query = 1;
        wl = (int)strlen(checkword);

        /* firstdigits = length of the leading run of digits */
        for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++)
            ;

        /* digits, ending in st, rd, nd, th of either case */
        /* matchword returns non-zero on a match (unlike strcmp) */
        if (firstdigits + 2 == wl &&
              (matchword(checkword + wl - 2, "st")
            || matchword(checkword + wl - 2, "rd")
            || matchword(checkword + wl - 2, "nd")
            || matchword(checkword + wl - 2, "th")))
            query = 0;

        if (firstdigits + 3 == wl &&
              (matchword(checkword + wl - 3, "sts")
            || matchword(checkword + wl - 3, "rds")
            || matchword(checkword + wl - 3, "nds")
            || matchword(checkword + wl - 3, "ths")))
            query = 0;

        /* Four-letter suffix, so the digits must end four from the end. */
        /* (Testing "+ 3" here could never match, and stepped before the */
        /* start of the word for three-character words.)                 */
        if (firstdigits + 4 == wl &&
              (matchword(checkword + wl - 4, "stly")
            || matchword(checkword + wl - 4, "rdly")
            || matchword(checkword + wl - 4, "ndly")
            || matchword(checkword + wl - 4, "thly")))
            query = 0;

        /* digits, ending in l, L, s or d */
        if (firstdigits + 1 == wl &&
              (checkword[wl-1] == 'l'
            || checkword[wl-1] == 'L'
            || checkword[wl-1] == 's'
            || checkword[wl-1] == 'd'))
            query = 0;

        /* L at the start of a number, representing British pounds, like L500 */
        /* We know the current word is mixed. If the first letter is L, there */
        /* must be at least one digit following. If both digits and letters   */
        /* follow, we have a genuine error, else we have a capital L followed */
        /* by digits, and we accept that as a non-error.                      */
        if (checkword[0] == 'L')
            if (!mixdigit(checkword+1))
                query = 0;
    }
    return (query);
}
|
ali@0
|
2282 |
|
ali@0
|
2283 |
|
ali@0
|
2284 |
|
ali@0
|
2285 |
|
ali@0
|
2286 |
/* getaword - extracts the first/next "word" from the line, and puts */
|
ali@0
|
2287 |
/* it into "thisword". A word is defined as one English word unit */
|
ali@0
|
2288 |
/* -- or at least that's what I'm trying for. */
|
ali@0
|
2289 |
/* Returns a pointer to the position in the line where we will start */
|
ali@0
|
2290 |
/* looking for the next word. */
|
ali@0
|
2291 |
|
ali@0
|
2292 |
char *getaword(char *fromline, char *thisword)
|
ali@0
|
2293 |
{
|
ali@0
|
2294 |
int i, wordlen;
|
ali@0
|
2295 |
char *s;
|
ali@0
|
2296 |
|
ali@0
|
2297 |
wordlen = 0;
|
ali@0
|
2298 |
for ( ; !gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline ; fromline++ );
|
ali@0
|
2299 |
|
ali@0
|
2300 |
/* V .20 */
|
ali@0
|
2301 |
/* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35. */
|
ali@0
|
2302 |
/* Especially yucky is the case of L1,000 */
|
ali@0
|
2303 |
/* I hate this, and I see other ways, but I don't see that any is _better_.*/
|
ali@0
|
2304 |
/* This section looks for a pattern of characters including a digit */
|
ali@0
|
2305 |
/* followed by a comma or period followed by one or more digits. */
|
ali@0
|
2306 |
/* If found, it returns this whole pattern as a word; otherwise we discard */
|
ali@0
|
2307 |
/* the results and resume our normal programming. */
|
ali@0
|
2308 |
s = fromline;
|
ali@0
|
2309 |
for ( ; (gcisdigit(*s) || gcisalpha(*s) || *s == ',' || *s == '.') && wordlen < MAXWORDLEN ; s++ ) {
|
ali@0
|
2310 |
thisword[wordlen] = *s;
|
ali@0
|
2311 |
wordlen++;
|
ali@0
|
2312 |
}
|
ali@0
|
2313 |
thisword[wordlen] = 0;
|
ali@0
|
2314 |
for (i = 1; i < wordlen -1; i++) {
|
ali@0
|
2315 |
if (thisword[i] == '.' || thisword[i] == ',') {
|
ali@0
|
2316 |
if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1])) { /* we have one of the damned things */
|
ali@0
|
2317 |
fromline = s;
|
ali@0
|
2318 |
return(fromline);
|
ali@0
|
2319 |
}
|
ali@0
|
2320 |
}
|
ali@0
|
2321 |
}
|
ali@0
|
2322 |
|
ali@0
|
2323 |
/* we didn't find a punctuated number - do the regular getword thing */
|
ali@0
|
2324 |
wordlen = 0;
|
ali@0
|
2325 |
for ( ; (gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
|
ali@0
|
2326 |
thisword[wordlen] = *fromline;
|
ali@0
|
2327 |
wordlen++;
|
ali@0
|
2328 |
}
|
ali@0
|
2329 |
thisword[wordlen] = 0;
|
ali@0
|
2330 |
return(fromline);
|
ali@0
|
2331 |
}
|
ali@0
|
2332 |
|
ali@0
|
2333 |
|
ali@0
|
2334 |
|
ali@0
|
2335 |
|
ali@0
|
2336 |
|
ali@0
|
2337 |
/* matchword - just a case-insensitive string matcher.           */
/*                                                               */
/* Returns 1 if the two strings have the same length and are     */
/* equal ignoring case, 0 otherwise. Note that this is the       */
/* opposite sense to strcmp().                                   */

int matchword(char *checkfor, char *thisword)
{
    size_t i, len;

    len = strlen(checkfor);
    if (len != strlen(thisword)) return (0);

    for (i = 0; i < len; i++) {
        /* <ctype.h> functions need an unsigned char value; passing a */
        /* negative plain char (8-bit text) is undefined behaviour.   */
        if (toupper((unsigned char)checkfor[i]) != toupper((unsigned char)thisword[i]))
            return (0);
    }
    return (1);
}
|
ali@0
|
2353 |
|
ali@0
|
2354 |
|
ali@0
|
2355 |
|
ali@0
|
2356 |
|
ali@0
|
2357 |
|
ali@0
|
2358 |
/* lowerit - lowercase the line in place.                         */
/* Only the unaccented letters 'A'..'Z' are changed; every other  */
/* byte, including hi-bit characters, is left exactly as it was.  */
/* This is hand-rolled rather than using strlwr/tolower so that   */
/* the result does not depend on the platform's library.          */

void lowerit(char *theline)
{
    char *cursor = theline;

    while (*cursor != '\0') {
        if ('A' <= *cursor && *cursor <= 'Z')
            *cursor += 32;      /* distance from 'A' to 'a' */
        cursor++;
    }
}
|
ali@0
|
2369 |
|
ali@0
|
2370 |
|
ali@0
|
2371 |
/* isroman - does this (lower-case) word LOOK like a Roman numeral?  */
/*                                                                   */
/* This is a shape check, not a validator: it will happily pass      */
/* mxxxxxxxxxx. The accepted shape, in order, is:                    */
/*   any number of m, an optional d, an optional cm, an optional cd, */
/*   any number of c, an optional xl, an optional xc, an optional l, */
/*   any number of x, an optional ix, an optional iv, an optional v, */
/*   any number of i.                                                */
/* Only lower-case letters are recognized.                           */
/*                                                                   */
/* Returns 1 if the whole word fits that shape, 0 if it does not or  */
/* if t is NULL or empty.                                            */

int isroman(char *t)
{
    const char *p = t;

    if (p == NULL || *p == '\0')
        return 0;

    /* thousands and hundreds */
    while (*p == 'm') p++;
    if (*p == 'd') p++;
    if (p[0] == 'c' && p[1] == 'm') p += 2;
    if (p[0] == 'c' && p[1] == 'd') p += 2;
    while (*p == 'c') p++;

    /* tens */
    if (p[0] == 'x' && p[1] == 'l') p += 2;
    if (p[0] == 'x' && p[1] == 'c') p += 2;
    if (*p == 'l') p++;
    while (*p == 'x') p++;

    /* units */
    if (p[0] == 'i' && p[1] == 'x') p += 2;
    if (p[0] == 'i' && p[1] == 'v') p += 2;
    if (*p == 'v') p++;
    while (*p == 'i') p++;

    /* anything left over means it wasn't a numeral */
    return (*p == '\0') ? 1 : 0;
}
|
ali@0
|
2408 |
|
ali@0
|
2409 |
|
ali@0
|
2410 |
|
ali@0
|
2411 |
|
ali@0
|
2412 |
/* gcisalpha is a special version that is somewhat lenient on 8-bit texts.    */
/* With the standard isalpha(), 8-bit accented characters break words, so     */
/* that a word with accented letters appears to be several words separated by */
/* 8-bit characters, which causes over-reporting of errors. gcisalpha()       */
/* therefore also accepts accented letters from the CP1252 (Windows) and      */
/* ISO-8859-1 character sets, which are the most common PG 8-bit types.       */
/*                                                                            */
/* Returns 1 for A-Z, a-z, the codes 140, 142, 156, 158, 159, and every code  */
/* from 192 up except 208, 215, 222, 240, 247 and 254; returns 0 otherwise.   */

int gcisalpha(unsigned char c)
{
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
        return 1;

    switch (c) {
    /* letters that live below 192 */
    case 140: case 142: case 156: case 158: case 159:
        return 1;
    /* codes at or above 192 that are deliberately not treated as letters */
    case 208: case 215: case 222: case 240: case 247: case 254:
        return 0;
    default:
        return (c >= 192) ? 1 : 0;
    }
}
|
ali@0
|
2428 |
|
ali@0
|
2429 |
/* gcisdigit is a special version that doesn't get confused in 8-bit texts. */
/* Returns 1 only for the ASCII digits '0' through '9', 0 for anything else. */
int gcisdigit(unsigned char c)
{
    return ('0' <= c && c <= '9') ? 1 : 0;
}
|
ali@0
|
2435 |
|
ali@0
|
2436 |
/* gcisletter is a special version that doesn't get confused in 8-bit texts. */
/* Yeah, we're ISO-8859-1-specific. So sue me.                               */
/* Returns 1 for A-Z, a-z and every code from 192 upwards, 0 otherwise.      */
int gcisletter(unsigned char c)
{
    int is_upper = (c >= 'A' && c <= 'Z');
    int is_lower = (c >= 'a' && c <= 'z');
    int is_high  = (c >= 192);

    return (is_upper || is_lower || is_high) ? 1 : 0;
}
|
ali@0
|
2443 |
|
ali@0
|
2444 |
|
ali@0
|
2445 |
|
ali@0
|
2446 |
|
ali@0
|
2447 |
/* gcstrchr wraps strchr to return NULL if the character being searched for */
/* is zero. (Plain strchr would instead return a pointer to the string's    */
/* terminator.) For any other character the result is strchr's.             */

char *gcstrchr(char *s, char c)
{
    return (c != 0) ? strchr(s, c) : NULL;
}
|
ali@0
|
2454 |
|
ali@0
|
2455 |
/* postprocess_for_DP is derived from postprocess_for_HTML */
|
ali@0
|
2456 |
/* It is invoked with the -d switch from flgets(). */
|
ali@0
|
2457 |
/* It simply "removes" from the line a hard-coded set of common */
|
ali@0
|
2458 |
/* DP-specific tags, so that the line passed to the main routine has*/
|
ali@0
|
2459 |
/* been pre-cleaned of DP markup. */
|
ali@0
|
2460 |
|
ali@0
|
2461 |
void postprocess_for_DP(char *theline)
|
ali@0
|
2462 |
{
|
ali@0
|
2463 |
|
ali@0
|
2464 |
char *s, *t;
|
ali@0
|
2465 |
int i;
|
ali@0
|
2466 |
|
ali@0
|
2467 |
if (!*theline)
|
ali@0
|
2468 |
return;
|
ali@0
|
2469 |
|
ali@0
|
2470 |
for (i = 0; *DPmarkup[i]; i++) {
|
ali@0
|
2471 |
s = strstr(theline, DPmarkup[i]);
|
ali@0
|
2472 |
while (s) {
|
ali@0
|
2473 |
t = s + strlen(DPmarkup[i]);
|
ali@0
|
2474 |
while (*t) {
|
ali@0
|
2475 |
*s = *t;
|
ali@0
|
2476 |
t++; s++;
|
ali@0
|
2477 |
}
|
ali@0
|
2478 |
*s = 0;
|
ali@0
|
2479 |
s = strstr(theline, DPmarkup[i]);
|
ali@0
|
2480 |
}
|
ali@0
|
2481 |
}
|
ali@0
|
2482 |
|
ali@0
|
2483 |
}
|
ali@0
|
2484 |
|
ali@0
|
2485 |
|
ali@0
|
2486 |
/* postprocess_for_HTML is, at the moment (0.97), a very nasty     */
/* short-term fix. It is invoked with the -m switch from flgets(). */
/* If the line contains both a '<' and a '>', it calls             */
/* losemarkup() repeatedly until that returns NULL; it then calls  */
/* loseentities() repeatedly until that returns NULL. The effect   */
/* is that the line passed to the main routine has been            */
/* pre-cleaned of the tags and entities those routines know about. */
/* This is _so_ not the right way to deal with HTML, but what is   */
/* needed now is not HTML handling proper: just ignoring <i> tags  */
/* and some others. To be revisited in future releases!            */

void postprocess_for_HTML(char *theline)
{
    if (strchr(theline, '<') != NULL && strchr(theline, '>') != NULL) {
        while (losemarkup(theline) != NULL)
            continue;
    }
    while (loseentities(theline) != NULL)
        continue;
}
|
ali@0
|
2506 |
|
ali@0
|
2507 |
char *losemarkup(char *theline)
|
ali@0
|
2508 |
{
|
ali@0
|
2509 |
char *s, *t;
|
ali@0
|
2510 |
int i;
|
ali@0
|
2511 |
|
ali@0
|
2512 |
if (!*theline)
|
ali@0
|
2513 |
return(NULL);
|
ali@0
|
2514 |
|
ali@0
|
2515 |
s = strstr(theline, "<");
|
ali@0
|
2516 |
t = strstr(theline, ">");
|
ali@0
|
2517 |
if (!s || !t) return(NULL);
|
ali@0
|
2518 |
for (i = 0; *markup[i]; i++)
|
ali@0
|
2519 |
if (!tagcomp(s+1, markup[i])) {
|
ali@0
|
2520 |
if (!*(t+1)) {
|
ali@0
|
2521 |
*s = 0;
|
ali@0
|
2522 |
return(s);
|
ali@0
|
2523 |
}
|
ali@0
|
2524 |
else
|
ali@0
|
2525 |
if (t > s) {
|
ali@0
|
2526 |
strcpy(s, t+1);
|
ali@0
|
2527 |
return(s);
|
ali@0
|
2528 |
}
|
ali@0
|
2529 |
}
|
ali@0
|
2530 |
/* it's an unrecognized <xxx> */
|
ali@0
|
2531 |
return(NULL);
|
ali@0
|
2532 |
}
|
ali@0
|
2533 |
|
ali@0
|
2534 |
char *loseentities(char *theline)
|
ali@0
|
2535 |
{
|
ali@0
|
2536 |
int i;
|
ali@0
|
2537 |
char *s, *t;
|
ali@0
|
2538 |
|
ali@0
|
2539 |
if (!*theline)
|
ali@0
|
2540 |
return(NULL);
|
ali@0
|
2541 |
|
ali@0
|
2542 |
for (i = 0; *entities[i].htmlent; i++) {
|
ali@0
|
2543 |
s = strstr(theline, entities[i].htmlent);
|
ali@0
|
2544 |
if (s) {
|
ali@0
|
2545 |
t = malloc((size_t)strlen(s));
|
ali@0
|
2546 |
if (!t) return(NULL);
|
ali@0
|
2547 |
strcpy(t, s + strlen(entities[i].htmlent));
|
ali@0
|
2548 |
strcpy(s, entities[i].textent);
|
ali@0
|
2549 |
strcat(s, t);
|
ali@0
|
2550 |
free(t);
|
ali@0
|
2551 |
return(theline);
|
ali@0
|
2552 |
}
|
ali@0
|
2553 |
}
|
ali@0
|
2554 |
|
ali@0
|
2555 |
/* V0.97 Duh. Forgot to check the htmlnum member */
|
ali@0
|
2556 |
for (i = 0; *entities[i].htmlnum; i++) {
|
ali@0
|
2557 |
s = strstr(theline, entities[i].htmlnum);
|
ali@0
|
2558 |
if (s) {
|
ali@0
|
2559 |
t = malloc((size_t)strlen(s));
|
ali@0
|
2560 |
if (!t) return(NULL);
|
ali@0
|
2561 |
strcpy(t, s + strlen(entities[i].htmlnum));
|
ali@0
|
2562 |
strcpy(s, entities[i].textent);
|
ali@0
|
2563 |
strcat(s, t);
|
ali@0
|
2564 |
free(t);
|
ali@0
|
2565 |
return(theline);
|
ali@0
|
2566 |
}
|
ali@0
|
2567 |
}
|
ali@0
|
2568 |
return(NULL);
|
ali@0
|
2569 |
}
|
ali@0
|
2570 |
|
ali@0
|
2571 |
|
ali@0
|
2572 |
/* tagcomp - does the text following a '<' start with this tag name? */
/*                                                                   */
/* strin   - text immediately after the '<'; one leading '/' is      */
/*           skipped, so closing tags match too.                     */
/* basetag - tag name to compare against.                            */
/*                                                                   */
/* The comparison ignores case and stops at the end of whichever     */
/* string is shorter, so it is a prefix test only: "table border"    */
/* matches "table", and an empty strin matches anything.             */
/*                                                                   */
/* Returns 0 on a match and 1 on a mismatch (strcmp-style sense).    */

int tagcomp(char *strin, char *basetag)
{
    char *s, *t;

    s = basetag;
    t = strin;
    if (*t == '/') t++; /* ignore a slash */
    while (*s && *t) {
        /* <ctype.h> functions need an unsigned char value; passing a */
        /* negative plain char (8-bit text) is undefined behaviour.   */
        if (tolower((unsigned char)*s) != tolower((unsigned char)*t)) return (1);
        s++; t++;
    }
    /* OK, we have < followed by a valid tag start.              */
    /* Nothing is done about length: an <i> tag is short, but a  */
    /* <table> could go on for miles, and handling that would    */
    /* mean really parsing the tags. Marked as 'pending'.        */
    return (0);
}
|
ali@0
|
2592 |
|
ali@0
|
2593 |
/* proghelp - explain program usage.                                      */
/*                                                                        */
/* Writes the version/copyright banner, the usage summary and a short     */
/* description of what gutcheck looks for to stderr. Takes no arguments   */
/* and returns nothing.                                                   */
void proghelp(void)
{
    /* banner and licence notice */
    fputs("V. 0.991. Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
    fputs("Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n", stderr);
    fputs("This is Free Software; you may redistribute it under certain conditions (GPL);\n", stderr);
    fputs("read the file COPYING for details.\n\n", stderr);

    /* command-line switches */
    fputs("Usage is: gutcheck [-setpxloyhud] filename\n",stderr);
    fputs(" where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",stderr);
    fputs(" -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",stderr);
    fputs(" -o just displays overview without detail, -h echoes header fields\n",stderr);
    fputs(" -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",stderr);
    fputs(" -d ignores DP-specific markup,\n",stderr);
    fputs(" -u uses a file gutcheck.typ to query user-defined possible typos\n",stderr);
    fputs("Sample usage: gutcheck warpeace.txt \n",stderr);
    fputs("\n",stderr);

    /* what the program checks for */
    fputs("Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n", stderr);
    fputs("Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",stderr);
    fputs("characters like accented letters, lines longer than 75 or shorter than 55,\n",stderr);
    fputs("unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",stderr);
    fputs("HTML tags, some likely typos. It is NOT a substitute for human judgement.\n",stderr);
    fputs("\n",stderr);
}
|
ali@0
|
2615 |
|
ali@0
|
2616 |
|
ali@0
|
2617 |
|
ali@0
|
2618 |
/*********************************************************************
|
ali@0
|
2619 |
Revision History:
|
ali@0
|
2620 |
|
ali@0
|
2621 |
04/22/01 Cleaned up some stuff and released .10
|
ali@0
|
2622 |
|
ali@0
|
2623 |
---------------
|
ali@0
|
2624 |
|
ali@0
|
2625 |
05/09/01 Added the typo list, added two extra cases of he/be error,
|
ali@0
|
2626 |
added -p switch, OPEN_SINGLE QUOTE char as .11
|
ali@0
|
2627 |
|
ali@0
|
2628 |
---------------
|
ali@0
|
2629 |
|
ali@0
|
2630 |
05/20/01 Increased the typo list,
|
ali@0
|
2631 |
added paranoid mode,
|
ali@0
|
2632 |
ANSIfied the code and added some casts
|
ali@0
|
2633 |
so the compiler wouldn't keep asking if I knew what I was doing,
|
ali@0
|
2634 |
fixed bug in l.s.d. condition (thanks, Dave!),
|
ali@0
|
2635 |
standardized spacing when echoing,
|
ali@0
|
2636 |
added letter-combo checking code to typo section,
|
ali@0
|
2637 |
added more h/b words to typo array.
|
ali@0
|
2638 |
Not too sure about putting letter combos outside of the TYPO conditions -
|
ali@0
|
2639 |
someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see.
|
ali@0
|
2640 |
Released as .12
|
ali@0
|
2641 |
|
ali@0
|
2642 |
---------------
|
ali@0
|
2643 |
|
ali@0
|
2644 |
06/01/01 Removed duplicate reporting of Tildes, asterisks, etc.
|
ali@0
|
2645 |
06/10/01 Added flgets routine to help with platform-independent
|
ali@0
|
2646 |
detection of invalid line-ends. All PG text files should
|
ali@0
|
2647 |
have CR/LF (13/10) at end of line, regardless of system.
|
ali@0
|
2648 |
Gutcheck now validates this by default. (Thanks, Charles!)
|
ali@0
|
2649 |
Released as .13
|
ali@0
|
2650 |
|
ali@0
|
2651 |
---------------
|
ali@0
|
2652 |
|
ali@0
|
2653 |
06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.)
|
ali@0
|
2654 |
Released as .14
|
ali@0
|
2655 |
|
ali@0
|
2656 |
---------------
|
ali@0
|
2657 |
|
ali@0
|
2658 |
06/23/01 Fixed: 'No',he said. not being flagged.
|
ali@0
|
2659 |
|
ali@0
|
2660 |
Improved: better single-quotes checking:
|
ali@0
|
2661 |
|
ali@0
|
2662 |
Ignore singlequotes surrounded by alpha, like didn't. (was OK)
|
ali@0
|
2663 |
|
ali@0
|
2664 |
If a singlequote is at the END of a word AND the word ends in "s":
|
ali@0
|
2665 |
The dogs' tails wagged.
|
ali@0
|
2666 |
it's probably an apostrophe, but less commonly may be a closequote:
|
ali@0
|
2667 |
"These 'pack dogs' of yours look more like wolves."
|
ali@0
|
2668 |
|
ali@0
|
2669 |
If it's got punctuation before it and is followed by a space
|
ali@0
|
2670 |
or punctuation:
|
ali@0
|
2671 |
. . . was a problem,' he said
|
ali@0
|
2672 |
. . . was a problem,'"
|
ali@0
|
2673 |
it is probably (certainly?) a closequote.
|
ali@0
|
2674 |
|
ali@0
|
2675 |
If it's at start of paragraph, it's probably an openquote.
|
ali@0
|
2676 |
(but watch dialect)
|
ali@0
|
2677 |
|
ali@0
|
2678 |
Words with ' at beginning and end are probably quoted:
|
ali@0
|
2679 |
"You have the word 'chivalry' frequently on your lips."
|
ali@0
|
2680 |
(Not specifically implemented)
|
ali@0
|
2681 |
V.18 I'm glad I didn't implement this, 'cos it jest ain't so
|
ali@0
|
2682 |
where the convention is to punctuate outside the quotes.
|
ali@0
|
2683 |
'Come', he said, 'and join the party'.
|
ali@0
|
2684 |
|
ali@0
|
2685 |
If it is followed by an alpha, and especially a capital:
|
ali@0
|
2686 |
'Hello,' called he.
|
ali@0
|
2687 |
it is either an openquote or dialect.
|
ali@0
|
2688 |
|
ali@0
|
2689 |
Dialect breaks ALL the rules:
|
ali@0
|
2690 |
A man's a man for a' that.
|
ali@0
|
2691 |
"Aye, but 'tis all in the pas' now."
|
ali@0
|
2692 |
"'Tis often the way," he said.
|
ali@0
|
2693 |
'Ave a drink on me.
|
ali@0
|
2694 |
|
ali@0
|
2695 |
This version looks to be an improvement, and produces
|
ali@0
|
2696 |
fewer false positives, but is still not perfect. The
|
ali@0
|
2697 |
'pack dogs' case still fools it, and dialect is still
|
ali@0
|
2698 |
a problem. Oh, well, it's an improvement, and I have
|
ali@0
|
2699 |
a weighted structure in place for refining guesses at
|
ali@0
|
2700 |
closequotes. Maybe next time, I'll add a bit of logic
|
ali@0
|
2701 |
where if there is an open quote and one that was guessed
|
ali@0
|
2702 |
to be a possessive apostrophe after s, I'll re-guess it
|
ali@0
|
2703 |
to be a closequote. Let's see how this one flies, first.
|
ali@0
|
2704 |
|
ali@0
|
2705 |
(Afterview: it's still crap. Needs much work, and a deeper insight.)
|
ali@0
|
2706 |
|
ali@0
|
2707 |
Released as .15
|
ali@0
|
2708 |
|
ali@0
|
2709 |
TODO: More he/be checks. Can't be perfect - counterexamples:
|
ali@0
|
2710 |
I gave my son good advice: be married regardless of the world's opinion.
|
ali@0
|
2711 |
I gave my son good advice: he married regardless of the world's opinion.
|
ali@0
|
2712 |
|
ali@0
|
2713 |
If by "primitive" be meant "crude", we can understand the sentence.
|
ali@0
|
2714 |
If by "primitive" he meant "crude", we can understand the sentence.
|
ali@0
|
2715 |
|
ali@0
|
2716 |
No matter what be said, I must go on.
|
ali@0
|
2717 |
No matter what he said, I must go on.
|
ali@0
|
2718 |
|
ali@0
|
2719 |
No value, however great, can be set upon them.
|
ali@0
|
2720 |
No value, however great, can he set upon them.
|
ali@0
|
2721 |
|
ali@0
|
2722 |
Real-Life one from a DP International Weekly Miscellany:
|
ali@0
|
2723 |
He wandered through the forest without fear, sleeping
|
ali@0
|
2724 |
much, for in sleep be had companionship--the Great
|
ali@0
|
2725 |
Spirit teaching him what he should know in dreams.
|
ali@0
|
2726 |
That one found by jeebies, and it turned out to be "he".
|
ali@0
|
2727 |
|
ali@0
|
2728 |
|
ali@0
|
2729 |
---------------
|
ali@0
|
2730 |
|
ali@0
|
2731 |
07/01/01 Added -O option.
|
ali@0
|
2732 |
Improved singlequotes by reporting mismatched single quotes
|
ali@0
|
2733 |
only if an open_single_quotes was found.
|
ali@0
|
2734 |
|
ali@0
|
2735 |
Released as .16
|
ali@0
|
2736 |
|
ali@0
|
2737 |
---------------
|
ali@0
|
2738 |
|
ali@0
|
2739 |
08/27/01 Added -Y switch for Robert Rowe to allow his app to
|
ali@0
|
2740 |
catch the error output.
|
ali@0
|
2741 |
|
ali@0
|
2742 |
Released as .17
|
ali@0
|
2743 |
|
ali@0
|
2744 |
---------------
|
ali@0
|
2745 |
|
ali@0
|
2746 |
09/08/01 Added checking Capitals at start of paragraph, but not
|
ali@0
|
2747 |
checking them at start of sentence.
|
ali@0
|
2748 |
|
ali@0
|
2749 |
TODO: Parse sentences out so can check reliably for start of
|
ali@0
|
2750 |
sentence. Need a whole different approach for that.
|
ali@0
|
2751 |
(Can't just rely on periods, since they are also
|
ali@0
|
2752 |
used for abbreviations, etc.)
|
ali@0
|
2753 |
|
ali@0
|
2754 |
Added checking for all vowels or all consonants in a word.
|
ali@0
|
2755 |
|
ali@0
|
2756 |
While I was in, I added "ii" checking and "tl" at start of word.
|
ali@0
|
2757 |
|
ali@0
|
2758 |
Added echoing of first line of paragraph when reporting
|
ali@0
|
2759 |
mismatched quotes or brackets (thanks to David Widger for the
|
ali@0
|
2760 |
suggestion)
|
ali@0
|
2761 |
|
ali@0
|
2762 |
Not querying L at start of a number (used for British pounds).
|
ali@0
|
2763 |
|
ali@0
|
2764 |
The spelling changes are sort of half-done but released anyway
|
ali@0
|
2765 |
Skipped .18 because I had given out a couple of test versions
|
ali@0
|
2766 |
with that number.
|
ali@0
|
2767 |
|
ali@0
|
2768 |
09/25/01 Released as .19
|
ali@0
|
2769 |
|
ali@0
|
2770 |
---------------
|
ali@0
|
2771 |
|
ali@0
|
2772 |
TODO:
|
ali@0
|
2773 |
Use the logic from my new version of safewrap to stop querying
|
ali@0
|
2774 |
short lines like poems and TOCs.
|
ali@0
|
2775 |
Ignore non-standard ellipses like . . . or ...
|
ali@0
|
2776 |
|
ali@0
|
2777 |
|
ali@0
|
2778 |
---------------
|
ali@0
|
2779 |
10/01/01 Made any line over 80 a VERY long line (was 85).
|
ali@0
|
2780 |
Recognized openquotes on indented paragraphs as continuations
|
ali@0
|
2781 |
of the same speech.
|
ali@0
|
2782 |
Added "cf" to the okword list (how did I forget _that_?) and a few others.
|
ali@0
|
2783 |
Moved abbrev to okword and made it more general.
|
ali@0
|
2784 |
Removed requirement that PG_space_emdash be greater than
|
ali@0
|
2785 |
ten before turning off warnings about spaced dashes.
|
ali@0
|
2786 |
Added period to list of characters that might constitute a separator line.
|
ali@0
|
2787 |
Now checking for double punctuation (Thanks, David!)
|
ali@0
|
2788 |
Now if two spaced em-dashes on a line, reports both. (DW)
|
ali@0
|
2789 |
Bug: Wasn't catching spaced punctuation at line-end since I
|
ali@0
|
2790 |
added flgets in version .13 - fixed.
|
ali@0
|
2791 |
Bug: Wasn't catching spaced singlequotes - fixed
|
ali@0
|
2792 |
Now reads punctuated numbers like 1,000 as a single word.
|
ali@0
|
2793 |
(Used to give "standalone 1" type queries)
|
ali@0
|
2794 |
Changed paranoid mode - not including s and p options. -ex is now quite usable.
|
ali@0
|
2795 |
Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed
|
ali@0
|
2796 |
Bug: Sometimes gave _next_ line number for queried word at end of line - fixed
|
ali@0
|
2797 |
|
ali@0
|
2798 |
10/22/01 Released as .20
|
ali@0
|
2799 |
|
ali@0
|
2800 |
---------------
|
ali@0
|
2801 |
|
ali@0
|
2802 |
Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!)
|
ali@0
|
2803 |
Reduced the number of hi-bit letters needed to stop reporting them
|
ali@0
|
2804 |
from 1/20 to 1/100 or 200 in total.
|
ali@0
|
2805 |
Added PG footer check.
|
ali@0
|
2806 |
Added the -h switch.
|
ali@0
|
2807 |
Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10
|
ali@0
|
2808 |
Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23"
|
ali@0
|
2809 |
Added unspaced brackets check when surrounded by alpha.
|
ali@0
|
2810 |
Removed all typo reporting unless the typo switch is on.
|
ali@0
|
2811 |
Added gcisalpha to ease over-reporting of 8-bit queries.
|
ali@0
|
2812 |
ECHO_SWITCH is now ON by default!
|
ali@0
|
2813 |
PARANOID_SWITCH is now ON by default!
|
ali@0
|
2814 |
Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg)
|
ali@0
|
2815 |
Checking for standalone lowercase "l"
|
ali@0
|
2816 |
Checking for standalone lowercase "s"
|
ali@0
|
2817 |
Considering "is be" and "be is" "be was" "was be" as he/be errors
|
ali@0
|
2818 |
Looking at punct at end of para
|
ali@0
|
2819 |
|
ali@0
|
2820 |
01/20/02 Released as .21
|
ali@0
|
2821 |
|
ali@0
|
2822 |
---------------
|
ali@0
|
2823 |
|
ali@0
|
2824 |
Added VERBOSE_SWITCH to make it list everything. (George Davis)
|
ali@0
|
2825 |
|
ali@0
|
2826 |
---------------
|
ali@0
|
2827 |
|
ali@0
|
2828 |
02/17/02 Added cint in flgets to try to fix an EOF failure on a compiler I don't have.
|
ali@0
|
2829 |
after which
|
ali@0
|
2830 |
This line caused a coredump on Solaris - fixed.
|
ali@0
|
2831 |
Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe
|
ali@0
|
2832 |
03/09/02 Changed header recognition for another header change
|
ali@0
|
2833 |
Called it .24
|
ali@0
|
2834 |
03/29/02 Added qword[][] so I can suppress massive overreporting
|
ali@0
|
2835 |
of queried "words" like "FN", "Wm.", "th'", people's
|
ali@0
|
2836 |
initials, chemical formulae and suchlike in some texts.
|
ali@0
|
2837 |
Called it .25
|
ali@0
|
2838 |
04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed.
|
ali@0
|
2839 |
Added linecounts in overview mode.
|
ali@0
|
2840 |
Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done.
|
ali@0
|
2841 |
"m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that?
|
ali@0
|
2842 |
07/07/02 Added GPL.
|
ali@0
|
2843 |
Added checking for broken em-dash at line-end (enddash)
|
ali@0
|
2844 |
Released as 0.95
|
ali@0
|
2845 |
08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo.
|
ali@0
|
2846 |
Released as 0.96
|
ali@0
|
2847 |
10/10/02 Suppressing some annoying multiple reports by default:
|
ali@0
|
2848 |
Standalone Ones, Asterisks, Square Brackets.
|
ali@0
|
2849 |
Digit 1 occurs often in many scientific texts.
|
ali@0
|
2850 |
Asterisk occurs often in multi-footnoted texts.
|
ali@0
|
2851 |
Mismatch Square Brackets occurs often in multi-para footnotes.
|
ali@0
|
2852 |
Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil.
|
ali@0
|
2853 |
. . . but it does more or less work for the main cases.
|
ali@0
|
2854 |
Removed uppercase within a word as a separate category so
|
ali@0
|
2855 |
that names like VanAllen get reported only once, like other
|
ali@0
|
2856 |
suspected typos.
|
ali@0
|
2857 |
11/24/02 Fixed - -m switch wasn't looking at htmlnum in
|
ali@0
|
2858 |
loseentities (Thanks, Brett!)
|
ali@0
|
2859 |
Fixed bug which occasionally gave false warning of
|
ali@0
|
2860 |
paragraph starting with lowercase.
|
ali@0
|
2861 |
Added underscore as character not to query around doublequotes.
|
ali@0
|
2862 |
Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859"
|
ali@0
|
2863 |
. . . this is to help detect things like CP1252 characters.
|
ali@0
|
2864 |
Released as 0.97
|
ali@0
|
2865 |
|
ali@0
|
2866 |
12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell,
|
ali@0
|
2867 |
for doublequotes only. Replaces "Spaced quote", since it also covers that
|
ali@0
|
2868 |
case.
|
ali@0
|
2869 |
Added "warn_hyphen" to ease over-reporting of hyphens.
|
ali@0
|
2870 |
|
ali@0
|
2871 |
12/20/02 Added "extra period" checks.
|
ali@0
|
2872 |
Added single character line check
|
ali@0
|
2873 |
Added I" check - is usually an exclam
|
ali@0
|
2874 |
Released as 0.98
|
ali@0
|
2875 |
|
ali@0
|
2876 |
1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile()
|
ali@0
|
2877 |
from when I was looking at ways to identify markup. Refuses to
|
ali@0
|
2878 |
open files for *nix users with upcase in the filenames. Removed.
|
ali@0
|
2879 |
Fixed quickly and released as 0.981
|
ali@0
|
2880 |
|
ali@0
|
2881 |
1/8/03 Added "arid" to the list of typos, slightly against my better
|
ali@0
|
2882 |
judgement, but the DP gang are all excited about it. :-)
|
ali@0
|
2883 |
Added a check for comma followed by capital letter, where
|
ali@0
|
2884 |
a period has OCRed into a comma. (DW). Not sure about this
|
ali@0
|
2885 |
either; we'll see.
|
ali@0
|
2886 |
Compiling for Win32 to allow longfilenames.
|
ali@0
|
2887 |
|
ali@0
|
2888 |
6/1/04 A messy test release for DW to include the "gutcheck.typ"
|
ali@0
|
2889 |
process. And the gutcheck.jee trials. Removed "arid" --
|
ali@0
|
2890 |
it can go in gutcheck.typ
|
ali@0
|
2891 |
|
ali@0
|
2892 |
Added checks for carets ^ and slants / but disabling slant
|
ali@0
|
2893 |
queries if more than 20 of them, because some people use them
|
ali@0
|
2894 |
for /italics/. Slants are commonly mistaken italic "I"s.
|
ali@0
|
2895 |
|
ali@0
|
2896 |
Later: removed gutcheck.jee -- wrote jeebies instead.
|
ali@0
|
2897 |
|
ali@0
|
2898 |
Random TODO:
|
ali@0
|
2899 |
Check brackets more closely, like quotes, so that it becomes
|
ali@0
|
2900 |
easy to find the error in long paragraphs full of brackets.
|
ali@0
|
2901 |
|
ali@0
|
2902 |
|
ali@0
|
2903 |
11/4/04 Assorted cleanup. Fixed case where text started with an
|
ali@0
|
2904 |
unbalanced paragraph.
|
ali@0
|
2905 |
|
ali@0
|
2906 |
1/2/05 Has it really been that long? Added "nocomma", "noperiod" check.
|
ali@0
|
2907 |
Bits and pieces: improved isroman(). Added isletter().
|
ali@0
|
2908 |
Other stuff I never noted before this.
|
ali@0
|
2909 |
|
ali@0
|
2910 |
7/3/05 Stuck in a quick start on DP-markup ignoring
|
ali@0
|
2911 |
at BillFlis's suggestion.
|
ali@0
|
2912 |
|
ali@0
|
2913 |
1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in?
|
ali@0
|
2914 |
Don't count footer for dotcomma etc.
|
ali@0
|
2915 |
|
ali@0
|
2916 |
|
ali@0
|
2917 |
1 I
|
ali@0
|
2918 |
ail all
|
ali@0
|
2919 |
arc are
|
ali@0
|
2920 |
arid and
|
ali@0
|
2921 |
bad had
|
ali@0
|
2922 |
ball hall
|
ali@0
|
2923 |
band hand
|
ali@0
|
2924 |
bar her
|
ali@0
|
2925 |
bat but
|
ali@0
|
2926 |
be he
|
ali@0
|
2927 |
bead head
|
ali@0
|
2928 |
beads heads
|
ali@0
|
2929 |
bear hear
|
ali@0
|
2930 |
bit hit
|
ali@0
|
2931 |
bo be
|
ali@0
|
2932 |
boon been
|
ali@0
|
2933 |
borne home
|
ali@0
|
2934 |
bow how
|
ali@0
|
2935 |
bumbled humbled
|
ali@0
|
2936 |
car ear
|
ali@0
|
2937 |
carnage carriage
|
ali@0
|
2938 |
carne came
|
ali@0
|
2939 |
cast east
|
ali@0
|
2940 |
cat cut
|
ali@0
|
2941 |
cat eat
|
ali@0
|
2942 |
cheek check
|
ali@0
|
2943 |
clay day
|
ali@0
|
2944 |
coining coming
|
ali@0
|
2945 |
comer corner
|
ali@0
|
2946 |
die she
|
ali@0
|
2947 |
docs does
|
ali@0
|
2948 |
ease case
|
ali@0
|
2949 |
fail fall
|
ali@0
|
2950 |
fee he
|
ali@0
|
2951 |
haying having
|
ali@0
|
2952 |
ho he
|
ali@0
|
2953 |
ho who
|
ali@0
|
2954 |
hut but
|
ali@0
|
2955 |
is as
|
ali@0
|
2956 |
lie he
|
ali@0
|
2957 |
lime time
|
ali@0
|
2958 |
loth 10th
|
ali@0
|
2959 |
m in
|
ali@0
|
2960 |
modem modern
|
ali@0
|
2961 |
Ms his
|
ali@0
|
2962 |
ray away
|
ali@0
|
2963 |
ray my
|
ali@0
|
2964 |
ringer finger
|
ali@0
|
2965 |
ringers fingers
|
ali@0
|
2966 |
rioted noted
|
ali@0
|
2967 |
tho the
|
ali@0
|
2968 |
tie he
|
ali@0
|
2969 |
tie the
|
ali@0
|
2970 |
tier her
|
ali@0
|
2971 |
tight right
|
ali@0
|
2972 |
tile the
|
ali@0
|
2973 |
tiling thing
|
ali@0
|
2974 |
tip up
|
ali@0
|
2975 |
tram train
|
ali@0
|
2976 |
tune time
|
ali@0
|
2977 |
u "
|
ali@0
|
2978 |
wen well
|
ali@0
|
2979 |
yon you
|
ali@0
|
2980 |
|
ali@0
|
2981 |
*********************************************************************/
|
ali@0
|
2982 |
|