bookloupe: gutcheck/gutcheck.c@707d51fedbe0 (annotated)

ali@0	1	/*************************************************************************/
ali@0	2	/* gutcheck - check for assorted weirdnesses in a PG candidate text file */
ali@0	3	/* */
ali@0	4	/* Version 0.991 */
ali@0	5	/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
ali@0	6	/* */
ali@0	7	/* This program is free software; you can redistribute it and/or modify */
ali@0	8	/* it under the terms of the GNU General Public License as published by */
ali@0	9	/* the Free Software Foundation; either version 2 of the License, or */
ali@0	10	/* (at your option) any later version. */
ali@0	11	/* */
ali@0	12	/* This program is distributed in the hope that it will be useful, */
ali@0	13	/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
ali@0	14	/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
ali@0	15	/* GNU General Public License for more details. */
ali@0	16	/* */
ali@0	17	/* You should have received a copy of the GNU General Public License */
ali@0	18	/* along with this program; if not, write to the */
ali@0	19	/* Free Software Foundation, Inc., */
ali@0	20	/* 59 Temple Place, */
ali@0	21	/* Suite 330, */
ali@0	22	/* Boston, MA 02111-1307 USA */
ali@0	23	/* */
ali@0	24	/* */
ali@0	25	/* */
ali@0	26	/* Overview comments: */
ali@0	27	/* */
ali@0	28	/* If you're reading this, you're either interested in how to detect */
ali@0	29	/* formatting errors, or very very bored. */
ali@0	30	/* */
ali@0	31	/* Gutcheck is a homebrew formatting checker specifically for */
ali@0	32	/* spotting common formatting problems in a PG e-text. I typically */
ali@0	33	/* run it once or twice on a file I'm about to submit; it usually */
ali@0	34	/* finds a few formatting problems. It also usually finds lots of */
ali@0	35	/* queries that aren't problems at all; it _really_ doesn't like */
ali@0	36	/* the standard PG header, for example. It's optimized for straight */
ali@0	37	/* prose; poetry and non-fiction involving tables tend to trigger */
ali@0	38	/* false alarms. */
ali@0	39	/* */
ali@0	40	/* The code of gutcheck is not very interesting, but the experience */
ali@0	41	/* of what constitutes a possible error may be, and the best way to */
ali@0	42	/* illustrate that is by example. */
ali@0	43	/* */
ali@0	44	/* */
ali@0	45	/* Here are some common typos found in PG texts that gutcheck */
ali@0	46	/* will flag as errors: */
ali@0	47	/* */
ali@0	48	/* "Look!John , over there!" */
ali@0	49	/* <this is a HTML tag> */
ali@0	50	/* &so is this; */
ali@0	51	/* Margaret said: " Now you should start for school." */
ali@0	52	/* Margaret said: "Now you should start for school. (if end of para) */
ali@0	53	/* The horse is said to he worth a lot. */
ali@0	54	/* 0K - this'11 make you look close1y. */
ali@0	55	/* "If you do. you'll regret it!" */
ali@0	56	/* */
ali@0	57	/* There are some complications . The extra space left around that */
ali@0	58	/* period was an error . . . but that ellipsis wasn't. */
ali@0	59	/* */
ali@0	60	/* The last line of a paragraph */
ali@0	61	/* is usually short. */
ali@0	62	/* */
ali@0	63	/* This period is an error.But the periods in a.m. aren't. */
ali@0	64	/* */
ali@0	65	/* Checks that are do-able but not (well) implemented are: */
ali@0	66	/* Single-quote checking. */
ali@0	67	/* Despite 3 attempts at it, singlequote checking is still */
ali@0	68	/* crap in gutcheck. It may not be possible without analysis */
ali@0	69	/* of the whole paragraph. */
ali@0	70	/* */
ali@0	71	/*************************************************************************/
ali@0	72
ali@0	73
ali@0	74	#include <stdio.h>
ali@0	75	#include <stdlib.h>
ali@0	76	#include <string.h>
ali@0	77	#include <ctype.h>
ali@0	78
ali@0	79	#define MAXWORDLEN 80 /* max length of one word */
ali@0	80	#define LINEBUFSIZE 2048 /* buffer size for an input line */
ali@0	81
ali@0	82	#define MAX_USER_TYPOS 1000
ali@0	83	#define USERTYPO_FILE "gutcheck.typ"
ali@0	84
ali@0	85	#ifndef MAX_PATH
ali@0	86	#define MAX_PATH 16384
ali@0	87	#endif
ali@0	88
ali@0	89	char aline[LINEBUFSIZE];
ali@0	90	char prevline[LINEBUFSIZE];
ali@0	91
ali@0	92	/* Common typos. */
ali@0	93	char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad",
ali@0	94	"te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om",
ali@0	95	"btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
ali@0	96	"hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign",
ali@0	97	"gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere",
ali@0	98	"htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev",
ali@0	99	"sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk",
ali@0	100	"owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
ali@0	101	"porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry",
ali@0	102	"stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
ali@0	103	"thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae",
ali@0	104	"tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
ali@0	105	"whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
ali@0	106	"woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera",
ali@0	107	"yeras", "yersa", "yoiu", "youve", "ytou", "yuor",
ali@0	108	/* added h/b words for version 12 - removed a few with "tbe" v.25 */
ali@0	109	"abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind",
ali@0	110	"beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates",
ali@0	111	"deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing",
ali@0	112	"helieve", "henefit", "hetter", "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh",
ali@0	113	"meanwbile", "memher", "memhers", "numher", "numhers",
ali@0	114	"perbaps", "prohlem", "puhlic", "witbout",
ali@0	115	/* and a few more for .18 */
ali@0	116	"arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo",
ali@0	117	"heside", "chapteb", "chaptee", "se",
ali@0	118	""};
ali@0	119
ali@0	120	char *usertypo[MAX_USER_TYPOS];
ali@0	121
ali@0	122	/* Common abbreviations and other OK words not to query as typos. */
ali@0	123	/* 0.99 last-minute - removed "ms" */
ali@0	124	char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br",
ali@0	125	"pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian",
ali@0	126	"hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten",
ali@0	127	""};
ali@0	128
ali@0	129	/* Common abbreviations that cause otherwise unexplained periods. */
ali@0	130	char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit",
ali@0	131	"deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
ali@0	132	""};
ali@0	133	/* Two-Letter combinations that rarely if ever start words, */
ali@0	134	/* but are common scannos or otherwise common letter */
ali@0	135	/* combinations. */
ali@0	136	char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl",
ali@0	137	"tn", "rn", "lt", "tj",
ali@0	138	"" };
ali@0	139
ali@0	140	/* Two-Letter combinations that rarely if ever end words */
ali@0	141	/* but are common scannos or otherwise common letter */
ali@0	142	/* combinations */
ali@0	143	char *noend[] = { "cb", "gb", "pb", "sb", "tb",
ali@0	144	"wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl",
ali@0	145	"iy",
ali@0	146	""};
ali@0	147
ali@0	148	char *markup[] = { "a", "b", "big", "blockquote", "body", "br", "center",
ali@0	149	"col", "div", "em", "font", "h1", "h2", "h3", "h4",
ali@0	150	"h5", "h6", "head", "hr", "html", "i", "img", "li",
ali@0	151	"meta", "ol", "p", "pre", "small", "span", "strong",
ali@0	152	"sub", "sup", "table", "td", "tfoot", "thead", "title",
ali@0	153	"tr", "tt", "u", "ul",
ali@0	154	""};
ali@0	155
/* Distributed Proofreaders markup tokens, terminated by an empty string. */
/* Besides the small-caps and thought-break tags, the list holds the      */
/* opening and closing markers built from a slash plus an asterisk, hash  */
/* or dollar sign.  <tb> added .991                                       */
char *DPmarkup[] = { "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>",
                     "" };
ali@0	158
ali@0	159	char *nocomma[] = { "the", "it's", "their", "an", "mrs", "a", "our", "that's",
ali@0	160	"its", "whose", "every", "i'll", "your", "my",
ali@0	161	"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd",
ali@0	162	"pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
ali@0	163	"i'm", "during", "let", "toward", "among",
ali@0	164	""};
ali@0	165
ali@0	166
ali@0	167	char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
ali@0	168	"and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
ali@0	169	"i'll", "whose", "who", "because", "when", "let", "till", "very",
ali@0	170	"an", "among", "those", "into", "whom", "having", "thence",
ali@0	171	""};
ali@0	172
ali@0	173
ali@0	174	char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* Carlo's old suggestion, updated .991 */
ali@0	175
/* HTML character entities and their plain-text replacements.            */
/*   htmlent - the named form of the entity                               */
/*   htmlnum - the decimal numeric character reference for the same char  */
/*   textent - the text substituted for either form                       */
/* The list is terminated by an entry whose htmlent is the empty string   */
/* (its textent is left as a null pointer).                               */
struct {
    char *htmlent;
    char *htmlnum;
    char *textent;
} entities[] = {
    { "&amp;",    "&#38;",   "&" },
    { "&lt;",     "&#60;",   "<" },
    { "&gt;",     "&#62;",   ">" },
    { "&deg;",    "&#176;",  " degrees" },
    { "&pound;",  "&#163;",  "L" },
    { "&quot;",   "&#34;",   "\"" },       /* quotation mark = APL quote */
    { "&OElig;",  "&#338;",  "OE" },       /* latin capital ligature OE */
    { "&oelig;",  "&#339;",  "oe" },       /* latin small ligature oe, U+0153 */
    { "&Scaron;", "&#352;",  "S" },        /* latin capital letter S with caron */
    { "&scaron;", "&#353;",  "s" },        /* latin small letter s with caron */
    { "&Yuml;",   "&#376;",  "Y" },        /* latin capital letter Y with diaeresis */
    { "&circ;",   "&#710;",  "" },         /* modifier letter circumflex accent */
    { "&tilde;",  "&#732;",  "~" },        /* small tilde, U+02DC */
    { "&ensp;",   "&#8194;", " " },        /* en space, U+2002 */
    { "&emsp;",   "&#8195;", " " },        /* em space, U+2003 */
    { "&thinsp;", "&#8201;", " " },        /* thin space, U+2009 */
    { "&ndash;",  "&#8211;", "-" },        /* en dash, U+2013 */
    { "&mdash;",  "&#8212;", "--" },       /* em dash, U+2014 */
    { "&lsquo;",  "&#8216;", "'" },        /* left single quotation mark */
    { "&rsquo;",  "&#8217;", "'" },        /* right single quotation mark */
    { "&sbquo;",  "&#8218;", "'" },        /* single low-9 quotation mark, U+201A */
    { "&ldquo;",  "&#8220;", "\"" },       /* left double quotation mark */
    { "&rdquo;",  "&#8221;", "\"" },       /* right double quotation mark */
    { "&bdquo;",  "&#8222;", "\"" },       /* double low-9 quotation mark, U+201E */
    { "&lsaquo;", "&#8249;", "\"" },       /* single left-pointing angle quotation mark */
    { "&rsaquo;", "&#8250;", "\"" },       /* single right-pointing angle quotation mark */
    { "&nbsp;",   "&#160;",  " " },        /* no-break space = non-breaking space */
    { "&iexcl;",  "&#161;",  "!" },        /* inverted exclamation mark, U+00A1 */
    { "&cent;",   "&#162;",  "c" },        /* cent sign, U+00A2 */
    { "&pound;",  "&#163;",  "L" },        /* pound sign, U+00A3 */
    { "&curren;", "&#164;",  "$" },        /* currency sign, U+00A4 */
    { "&yen;",    "&#165;",  "Y" },        /* yen sign = yuan sign, U+00A5 */
    { "&sect;",   "&#167;",  "--" },       /* section sign, U+00A7 */
    { "&uml;",    "&#168;",  " " },        /* diaeresis = spacing diaeresis */
    { "&copy;",   "&#169;",  "(C) " },     /* copyright sign, U+00A9 */
    { "&ordf;",   "&#170;",  " " },        /* feminine ordinal indicator, U+00AA */
    { "&laquo;",  "&#171;",  "\"" },       /* left-pointing double angle quotation mark */
    { "&shy;",    "&#173;",  "-" },        /* soft hyphen = discretionary hyphen */
    { "&reg;",    "&#174;",  "(R) " },     /* registered sign = registered trade mark sign */
    { "&macr;",   "&#175;",  " " },        /* macron = spacing macron = overline */
    { "&deg;",    "&#176;",  " degrees" }, /* degree sign, U+00B0 */
    { "&plusmn;", "&#177;",  "+-" },       /* plus-minus sign = plus-or-minus sign */
    { "&sup2;",   "&#178;",  "2" },        /* superscript two = superscript digit two */
    { "&sup3;",   "&#179;",  "3" },        /* superscript three = superscript digit three */
    { "&acute;",  "&#180;",  " " },        /* acute accent = spacing acute */
    { "&micro;",  "&#181;",  "m" },        /* micro sign, U+00B5 */
    { "&para;",   "&#182;",  "--" },       /* pilcrow sign = paragraph sign */
    { "&cedil;",  "&#184;",  " " },        /* cedilla = spacing cedilla, U+00B8 */
    { "&sup1;",   "&#185;",  "1" },        /* superscript one = superscript digit one */
    { "&ordm;",   "&#186;",  " " },        /* masculine ordinal indicator */
    { "&raquo;",  "&#187;",  "\"" },       /* right-pointing double angle quotation mark */
    { "&frac14;", "&#188;",  "1/4" },      /* vulgar fraction one quarter */
    { "&frac12;", "&#189;",  "1/2" },      /* vulgar fraction one half */
    { "&frac34;", "&#190;",  "3/4" },      /* vulgar fraction three quarters */
    { "&iquest;", "&#191;",  "?" },        /* inverted question mark */
    { "&Agrave;", "&#192;",  "A" },        /* latin capital letter A with grave */
    { "&Aacute;", "&#193;",  "A" },        /* latin capital letter A with acute */
    { "&Acirc;",  "&#194;",  "A" },        /* latin capital letter A with circumflex */
    { "&Atilde;", "&#195;",  "A" },        /* latin capital letter A with tilde */
    { "&Auml;",   "&#196;",  "A" },        /* latin capital letter A with diaeresis */
    { "&Aring;",  "&#197;",  "A" },        /* latin capital letter A with ring above */
    { "&AElig;",  "&#198;",  "AE" },       /* latin capital letter AE */
    { "&Ccedil;", "&#199;",  "C" },        /* latin capital letter C with cedilla */
    { "&Egrave;", "&#200;",  "E" },        /* latin capital letter E with grave */
    { "&Eacute;", "&#201;",  "E" },        /* latin capital letter E with acute */
    { "&Ecirc;",  "&#202;",  "E" },        /* latin capital letter E with circumflex */
    { "&Euml;",   "&#203;",  "E" },        /* latin capital letter E with diaeresis */
    { "&Igrave;", "&#204;",  "I" },        /* latin capital letter I with grave */
    { "&Iacute;", "&#205;",  "I" },        /* latin capital letter I with acute */
    { "&Icirc;",  "&#206;",  "I" },        /* latin capital letter I with circumflex */
    { "&Iuml;",   "&#207;",  "I" },        /* latin capital letter I with diaeresis */
    { "&ETH;",    "&#208;",  "E" },        /* latin capital letter ETH, U+00D0 */
    { "&Ntilde;", "&#209;",  "N" },        /* latin capital letter N with tilde */
    { "&Ograve;", "&#210;",  "O" },        /* latin capital letter O with grave */
    { "&Oacute;", "&#211;",  "O" },        /* latin capital letter O with acute */
    { "&Ocirc;",  "&#212;",  "O" },        /* latin capital letter O with circumflex */
    { "&Otilde;", "&#213;",  "O" },        /* latin capital letter O with tilde */
    { "&Ouml;",   "&#214;",  "O" },        /* latin capital letter O with diaeresis */
    { "&times;",  "&#215;",  "*" },        /* multiplication sign, U+00D7 */
    { "&Oslash;", "&#216;",  "O" },        /* latin capital letter O with stroke */
    { "&Ugrave;", "&#217;",  "U" },        /* latin capital letter U with grave */
    { "&Uacute;", "&#218;",  "U" },        /* latin capital letter U with acute */
    { "&Ucirc;",  "&#219;",  "U" },        /* latin capital letter U with circumflex */
    { "&Uuml;",   "&#220;",  "U" },        /* latin capital letter U with diaeresis */
    { "&Yacute;", "&#221;",  "Y" },        /* latin capital letter Y with acute */
    { "&THORN;",  "&#222;",  "TH" },       /* latin capital letter THORN */
    { "&szlig;",  "&#223;",  "sz" },       /* latin small letter sharp s = ess-zed */
    { "&agrave;", "&#224;",  "a" },        /* latin small letter a with grave */
    { "&aacute;", "&#225;",  "a" },        /* latin small letter a with acute */
    { "&acirc;",  "&#226;",  "a" },        /* latin small letter a with circumflex */
    { "&atilde;", "&#227;",  "a" },        /* latin small letter a with tilde */
    { "&auml;",   "&#228;",  "a" },        /* latin small letter a with diaeresis */
    { "&aring;",  "&#229;",  "a" },        /* latin small letter a with ring above */
    { "&aelig;",  "&#230;",  "ae" },       /* latin small letter ae */
    { "&ccedil;", "&#231;",  "c" },        /* latin small letter c with cedilla */
    { "&egrave;", "&#232;",  "e" },        /* latin small letter e with grave */
    { "&eacute;", "&#233;",  "e" },        /* latin small letter e with acute */
    { "&ecirc;",  "&#234;",  "e" },        /* latin small letter e with circumflex */
    { "&euml;",   "&#235;",  "e" },        /* latin small letter e with diaeresis */
    { "&igrave;", "&#236;",  "i" },        /* latin small letter i with grave */
    { "&iacute;", "&#237;",  "i" },        /* latin small letter i with acute */
    { "&icirc;",  "&#238;",  "i" },        /* latin small letter i with circumflex */
    { "&iuml;",   "&#239;",  "i" },        /* latin small letter i with diaeresis */
    { "&eth;",    "&#240;",  "eth" },      /* latin small letter eth, U+00F0 */
    { "&ntilde;", "&#241;",  "n" },        /* latin small letter n with tilde */
    { "&ograve;", "&#242;",  "o" },        /* latin small letter o with grave */
    { "&oacute;", "&#243;",  "o" },        /* latin small letter o with acute */
    { "&ocirc;",  "&#244;",  "o" },        /* latin small letter o with circumflex */
    { "&otilde;", "&#245;",  "o" },        /* latin small letter o with tilde */
    { "&ouml;",   "&#246;",  "o" },        /* latin small letter o with diaeresis */
    { "&divide;", "&#247;",  "/" },        /* division sign, U+00F7 */
    { "&oslash;", "&#248;",  "o" },        /* latin small letter o with stroke */
    { "&ugrave;", "&#249;",  "u" },        /* latin small letter u with grave */
    { "&uacute;", "&#250;",  "u" },        /* latin small letter u with acute */
    { "&ucirc;",  "&#251;",  "u" },        /* latin small letter u with circumflex */
    { "&uuml;",   "&#252;",  "u" },        /* latin small letter u with diaeresis */
    { "&yacute;", "&#253;",  "y" },        /* latin small letter y with acute */
    { "&thorn;",  "&#254;",  "th" },       /* latin small letter thorn */
    { "&yuml;",   "&#255;",  "y" },        /* latin small letter y with diaeresis */
    { "",         "" }                     /* end-of-list marker */
};
ali@0	300
ali@0	301	/* ---- list of special characters ---- */
ali@0	302	#define CHAR_SPACE 32
ali@0	303	#define CHAR_TAB 9
ali@0	304	#define CHAR_LF 10
ali@0	305	#define CHAR_CR 13
ali@0	306	#define CHAR_DQUOTE 34
ali@0	307	#define CHAR_SQUOTE 39
ali@0	308	#define CHAR_OPEN_SQUOTE 96
ali@0	309	#define CHAR_TILDE 126
ali@0	310	#define CHAR_ASTERISK 42
ali@0	311	#define CHAR_FORESLASH 47
ali@0	312	#define CHAR_CARAT 94
ali@0	313
ali@0	314	#define CHAR_UNDERSCORE '_'
ali@0	315	#define CHAR_OPEN_CBRACK '{'
ali@0	316	#define CHAR_CLOSE_CBRACK '}'
ali@0	317	#define CHAR_OPEN_RBRACK '('
ali@0	318	#define CHAR_CLOSE_RBRACK ')'
ali@0	319	#define CHAR_OPEN_SBRACK '['
ali@0	320	#define CHAR_CLOSE_SBRACK ']'
ali@0	321
ali@0	322
ali@0	323
ali@0	324
ali@0	325
ali@0	326	/* ---- longest and shortest normal PG line lengths ----*/
ali@0	327	#define LONGEST_PG_LINE 75
ali@0	328	#define WAY_TOO_LONG 80
ali@0	329	#define SHORTEST_PG_LINE 55
ali@0	330
ali@0	331	#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
ali@0	332	/* D - ignore DP-specific markup */
ali@0	333	/* E - echo queried line */
ali@0	334	/* S - check single quotes */
ali@0	335	/* T - check common typos */
ali@0	336	/* P - require closure of quotes on */
ali@0	337	/* every paragraph */
ali@0	338	/* X - "Trust no one" :-) Paranoid! */
ali@0	339	/* Queries everything */
ali@0	340	/* L - line end checking defaults on */
ali@0	341	/* -L turns it off */
ali@0	342	/* O - overview. Just shows counts. */
ali@0	343	/* Y - puts errors to stdout */
ali@0	344	/* instead of stderr */
ali@0	345	/* H - Echoes header fields */
ali@0	346	/* M - Ignore markup in < > */
ali@0	347	/* U - Use file of User-defined Typos*/
ali@0	348	/* W - Defaults for use on Web upload*/
ali@0	349	/* V - Verbose - list EVERYTHING! */
ali@0	350	#define SWITNO 14 /* max number of switch parms */
ali@0	351	/* - used for defining array-size */
ali@0	352	#define MINARGS 1 /* minimum no of args excl switches */
ali@0	353	#define MAXARGS 1 /* maximum no of args excl switches */
ali@0	354
ali@0	355	int pswit[SWITNO]; /* program switches set by SWITCHES */
ali@0	356
ali@0	357	#define ECHO_SWITCH 0
ali@0	358	#define SQUOTE_SWITCH 1
ali@0	359	#define TYPO_SWITCH 2
ali@0	360	#define QPARA_SWITCH 3
ali@0	361	#define PARANOID_SWITCH 4
ali@0	362	#define LINE_END_SWITCH 5
ali@0	363	#define OVERVIEW_SWITCH 6
ali@0	364	#define STDOUT_SWITCH 7
ali@0	365	#define HEADER_SWITCH 8
ali@0	366	#define WEB_SWITCH 9
ali@0	367	#define VERBOSE_SWITCH 10
ali@0	368	#define MARKUP_SWITCH 11
ali@0	369	#define USERTYPO_SWITCH 12
ali@0	370	#define DP_SWITCH 13
ali@0	371
ali@0	372
ali@0	373
ali@0	374	long cnt_dquot; /* for overview mode, count of doublequote queries */
ali@0	375	long cnt_squot; /* for overview mode, count of singlequote queries */
ali@0	376	long cnt_brack; /* for overview mode, count of brackets queries */
ali@0	377	long cnt_bin; /* for overview mode, count of non-ASCII queries */
ali@0	378	long cnt_odd; /* for overview mode, count of odd character queries */
ali@0	379	long cnt_long; /* for overview mode, count of long line errors */
ali@0	380	long cnt_short; /* for overview mode, count of short line queries */
ali@0	381	long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
ali@0	382	long cnt_dash; /* for overview mode, count of dash-related queries */
ali@0	383	long cnt_word; /* for overview mode, count of word queries */
ali@0	384	long cnt_html; /* for overview mode, count of html queries */
ali@0	385	long cnt_lineend; /* for overview mode, count of line-end queries */
ali@0	386	long cnt_spacend; /* count of lines with space at end V .21 */
ali@0	387	long linecnt; /* count of total lines in the file */
ali@0	388	long checked_linecnt; /* count of lines actually gutchecked V .26 */
ali@0	389
ali@0	390	void proghelp(void);
ali@0	391	void procfile(char *);
ali@0	392
ali@0	393	#define LOW_THRESHOLD 0
ali@0	394	#define HIGH_THRESHOLD 1
ali@0	395
ali@0	396	#define START 0
ali@0	397	#define END 1
ali@0	398	#define PREV 0
ali@0	399	#define NEXT 1
ali@0	400	#define FIRST_OF_PAIR 0
ali@0	401	#define SECOND_OF_PAIR 1
ali@0	402
ali@0	403	#define MAX_WORDPAIR 1000
ali@0	404
ali@0	405	char running_from[MAX_PATH];
ali@0	406
/* Forward declarations for helpers defined later in the file, plus the  */
/* count of entries loaded into usertypo[].                               */
/* NOTE(review): the pointer declarators below were missing from this     */
/* listing and have been restored; main() passes the line buffer to       */
/* flgets() and tests its result, which fixes that one.  The others       */
/* follow the same pattern -- confirm against the function definitions.   */
int mixdigit(char *);
char *getaword(char *, char *);
int matchword(char *, char *);
char *flgets(char *, int, FILE *, long);
void lowerit(char *);
int gcisalpha(unsigned char);
int gcisdigit(unsigned char);
int gcisletter(unsigned char);
char *gcstrchr(char *s, char c);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
int tagcomp(char *, char *);
char *loseentities(char *);
int isroman(char *);
int usertypo_count;
void postprocess_for_DP(char *);
ali@0	424
ali@0	425	char wrk[LINEBUFSIZE];
ali@0	426
ali@0	427	/* This is disgustingly lazy, predefining max words & lengths, */
ali@0	428	/* but now I'm out of 16-bit restrictions, what's a couple of K? */
ali@0	429	#define MAX_QWORD 50
ali@0	430	#define MAX_QWORD_LENGTH 40
ali@0	431	char qword[MAX_QWORD][MAX_QWORD_LENGTH];
ali@0	432	char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
ali@0	433	signed int dupcnt[MAX_QWORD];
ali@0	434
ali@0	435
ali@0	436
ali@0	437
ali@0	438	int main(int argc, char **argv)
ali@0	439	{
ali@0	440	char argsw, s;
ali@0	441	int i, switno, invarg;
ali@0	442	char usertypo_file[MAX_PATH];
ali@0	443	FILE *usertypofile;
ali@0	444
ali@0	445
ali@0	446	if (strlen(argv[0]) < sizeof(running_from))
ali@0	447	strcpy(running_from, argv[0]); /* save the path to the executable gutcheck */
ali@0	448
ali@0	449	/* find out what directory we're running from */
ali@0	450	for (s = running_from + strlen(running_from); s != '/' && s != '\\' && s >= running_from; s--)
ali@0	451	*s = 0;
ali@0	452
ali@0	453
ali@0	454	switno = strlen(SWITCHES);
ali@0	455	for (i = switno ; --i >0 ; )
ali@0	456	pswit[i] = 0; /* initialise switches */
ali@0	457
ali@0	458	/* Standard loop to extract switches. */
ali@0	459	/* When we come out of this loop, the arguments will be */
ali@0	460	/* in argv[0] upwards and the switches used will be */
ali@0	461	/* represented by their equivalent elements in pswit[] */
ali@0	462	while ( --argc > 0 && **++argv == '-')
ali@0	463	for (argsw = argv[0]+1; *argsw !='\0'; argsw++)
ali@0	464	for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; )
ali@0	465	if ((toupper(*argsw)) == SWITCHES[i] ) {
ali@0	466	invarg = 0;
ali@0	467	pswit[i] = 1;
ali@0	468	}
ali@0	469
ali@0	470	pswit[PARANOID_SWITCH] ^= 1; /* Paranoid checking is turned OFF, not on, by its switch */
ali@0	471
ali@0	472	if (pswit[PARANOID_SWITCH]) { /* if running in paranoid mode */
ali@0	473	pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1; /* force typo checks as well */
ali@0	474	} /* v.20 removed s and p switches from paranoid mode */
ali@0	475
ali@0	476	pswit[LINE_END_SWITCH] ^= 1; /* Line-end checking is turned OFF, not on, by its switch */
ali@0	477	pswit[ECHO_SWITCH] ^= 1; /* V.21 Echoing is turned OFF, not on, by its switch */
ali@0	478
ali@0	479	if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */
ali@0	480	pswit[ECHO_SWITCH] = 0;
ali@0	481
ali@0	482	/* Web uploads - for the moment, this is really just a placeholder */
ali@0	483	/* until we decide what processing we really want to do on web uploads */
ali@0	484	if (pswit[WEB_SWITCH]) { /* specific override for web uploads */
ali@0	485	pswit[ECHO_SWITCH] = 1;
ali@0	486	pswit[SQUOTE_SWITCH] = 0;
ali@0	487	pswit[TYPO_SWITCH] = 1;
ali@0	488	pswit[QPARA_SWITCH] = 0;
ali@0	489	pswit[PARANOID_SWITCH] = 1;
ali@0	490	pswit[LINE_END_SWITCH] = 0;
ali@0	491	pswit[OVERVIEW_SWITCH] = 0;
ali@0	492	pswit[STDOUT_SWITCH] = 0;
ali@0	493	pswit[HEADER_SWITCH] = 1;
ali@0	494	pswit[VERBOSE_SWITCH] = 0;
ali@0	495	pswit[MARKUP_SWITCH] = 0;
ali@0	496	pswit[USERTYPO_SWITCH] = 0;
ali@0	497	pswit[DP_SWITCH] = 0;
ali@0	498	}
ali@0	499
ali@0	500
ali@0	501	if (argc < MINARGS \|\| argc > MAXARGS) { /* check number of args */
ali@0	502	proghelp();
ali@0	503	return(1); /* exit */
ali@0	504	}
ali@0	505
ali@0	506
ali@0	507	/* read in the user-defined stealth scanno list */
ali@0	508
ali@0	509	if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */
ali@0	510	if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) { /* not in cwd. try gutcheck directory. */
ali@0	511	strcpy(usertypo_file, running_from);
ali@0	512	strcat(usertypo_file, USERTYPO_FILE);
ali@0	513	if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) { /* we ain't got no user typo file! */
ali@0	514	printf(" --> I couldn't find gutcheck.typ -- proceeding without user typos.\n");
ali@0	515	}
ali@0	516	}
ali@0	517
ali@0	518	usertypo_count = 0;
ali@0	519	if (usertypofile) { /* we managed to open a User Typo File! */
ali@0	520	if (pswit[USERTYPO_SWITCH]) {
ali@0	521	while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) {
ali@0	522	if (strlen(aline) > 1) {
ali@0	523	if ((int)*aline > 33) {
ali@0	524	s = malloc(strlen(aline)+1);
ali@0	525	if (!s) {
ali@0	526	fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n");
ali@0	527	exit(1);
ali@0	528	}
ali@0	529	strcpy(s, aline);
ali@0	530	usertypo[usertypo_count] = s;
ali@0	531	usertypo_count++;
ali@0	532	if (usertypo_count >= MAX_USER_TYPOS) {
ali@0	533	printf(" --> Only %d user-defined typos allowed: ignoring the rest\n");
ali@0	534	break;
ali@0	535	}
ali@0	536	}
ali@0	537	}
ali@0	538	}
ali@0	539	}
ali@0	540	fclose(usertypofile);
ali@0	541	}
ali@0	542	}
ali@0	543
ali@0	544
ali@0	545
ali@0	546
ali@0	547	fprintf(stderr, "gutcheck: Check and report on an e-text\n");
ali@0	548
ali@0	549	cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long =
ali@0	550	cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend =
ali@0	551	cnt_spacend = 0;
ali@0	552
ali@0	553	procfile(argv[0]);
ali@0	554
ali@0	555	if (pswit[OVERVIEW_SWITCH]) {
ali@0	556	printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@0	557	checked_linecnt, linecnt, linecnt - checked_linecnt);
ali@0	558	printf(" --------------- Queries found --------------\n");
ali@0	559	if (cnt_long) printf(" Long lines: %5ld\n",cnt_long);
ali@0	560	if (cnt_short) printf(" Short lines: %5ld\n",cnt_short);
ali@0	561	if (cnt_lineend) printf(" Line-end problems: %5ld\n",cnt_lineend);
ali@0	562	if (cnt_word) printf(" Common typos: %5ld\n",cnt_word);
ali@0	563	if (cnt_dquot) printf(" Unmatched quotes: %5ld\n",cnt_dquot);
ali@0	564	if (cnt_squot) printf(" Unmatched SingleQuotes: %5ld\n",cnt_squot);
ali@0	565	if (cnt_brack) printf(" Unmatched brackets: %5ld\n",cnt_brack);
ali@0	566	if (cnt_bin) printf(" Non-ASCII characters: %5ld\n",cnt_bin);
ali@0	567	if (cnt_odd) printf(" Proofing characters: %5ld\n",cnt_odd);
ali@0	568	if (cnt_punct) printf(" Punctuation & spacing queries: %5ld\n",cnt_punct);
ali@0	569	if (cnt_dash) printf(" Non-standard dashes: %5ld\n",cnt_dash);
ali@0	570	if (cnt_html) printf(" Possible HTML tags: %5ld\n",cnt_html);
ali@0	571	printf("\n");
ali@0	572	printf(" TOTAL QUERIES %5ld\n",
ali@0	573	cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long +
ali@0	574	cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend);
ali@0	575	}
ali@0	576
ali@0	577	return(0);
ali@0	578	}
ali@0	579
ali@0	580
ali@0	581
ali@0	582	/* procfile - process one file */
ali@0	583
ali@0	584	void procfile(char *filename)
ali@0	585	{
ali@0	586
ali@0	587	char s, t, s1, laststart, wordstart;
ali@0	588	char inword[MAXWORDLEN], testword[MAXWORDLEN];
ali@0	589	char parastart[81]; /* first line of current para */
ali@0	590	FILE *infile;
ali@0	591	long quot, squot, firstline, alphalen, totlen, binlen,
ali@0	592	shortline, longline, verylongline, spacedash, emdash,
ali@0	593	space_emdash, non_PG_space_emdash, PG_space_emdash,
ali@0	594	footerline, dotcomma, start_para_line, astline, fslashline,
ali@0	595	standalone_digit, hyphens, htmcount, endquote_count;
ali@0	596	long spline, nspline;
ali@0	597	signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower,
ali@0	598	eNon_A, eTab, eTilde, eAst, eFSlash, eCarat;
ali@0	599	signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma,
ali@0	600	warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote;
ali@0	601	unsigned int lastlen, lastblen;
ali@0	602	signed int s_brack, c_brack, r_brack, c_unders;
ali@0	603	signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar;
ali@0	604	signed int isnewpara, vowel, consonant;
ali@0	605	char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80],
ali@0	606	unders_err[80];
ali@0	607	signed int qword_index, qperiod_index, isdup;
ali@0	608	signed int enddash;
ali@0	609	signed int Dutchcount, isDutch, Frenchcount, isFrench;
ali@0	610
ali@0	611
ali@0	612
ali@0	613
ali@0	614
ali@0	615	laststart = CHAR_SPACE;
ali@0	616	lastlen = lastblen = 0;
ali@0	617	dquote_err = squote_err = rbrack_err = cbrack_err = *sbrack_err =
ali@0	618	unders_err = prevline = 0;
ali@0	619	linecnt = firstline = alphalen = totlen = binlen =
ali@0	620	shortline = longline = spacedash = emdash = checked_linecnt =
ali@0	621	space_emdash = non_PG_space_emdash = PG_space_emdash =
ali@0	622	footerline = dotcomma = start_para_line = astline = fslashline =
ali@0	623	standalone_digit = hyphens = htmcount = endquote_count = 0;
ali@0	624	quot = squot = s_brack = c_brack = r_brack = c_unders = 0;
ali@0	625	i = llen = isemptyline = isacro = isellipsis = istypo = 0;
ali@0	626	warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma =
ali@0	627	warn_ast = warn_fslash = warn_digit = warn_endquote = 0;
ali@0	628	isnewpara = vowel = consonant = enddash = 0;
ali@0	629	spline = nspline = 0;
ali@0	630	qword_index = qperiod_index = isdup = 0;
ali@0	631	inword = testword = 0;
ali@0	632	open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0;
ali@0	633	Dutchcount = isDutch = Frenchcount = isFrench = 0;
ali@0	634
ali@0	635
ali@0	636	for (j = 0; j < MAX_QWORD; j++) {
ali@0	637	dupcnt[j] = 0;
ali@0	638	for (i = 0; i < MAX_QWORD_LENGTH; i++)
ali@0	639	qword[i][j] = 0;
ali@0	640	qperiod[i][j] = 0;
ali@0	641	}
ali@0	642
ali@0	643
ali@0	644	if ((infile = fopen(filename, "rb")) == NULL) {
ali@0	645	if (pswit[STDOUT_SWITCH])
ali@0	646	fprintf(stdout, "gutcheck: cannot open %s\n", filename);
ali@0	647	else
ali@0	648	fprintf(stderr, "gutcheck: cannot open %s\n", filename);
ali@0	649	exit(1);
ali@0	650	}
ali@0	651
ali@0	652	fprintf(stdout, "\n\nFile: %s\n\n", filename);
ali@0	653	firstline = shortline = longline = verylongline = 0;
ali@0	654
ali@0	655
ali@0	656	/*****************************************************/
ali@0	657	/* */
ali@0	658	/* Run a first pass - verify that it's a valid PG */
ali@0	659	/* file, decide whether to report some things that */
ali@0	660	/* occur many times in the text like long or short */
ali@0	661	/* lines, non-standard dashes, and other good stuff */
ali@0	662	/* I'll doubtless think of later. */
ali@0	663	/* */
ali@0	664	/*****************************************************/
ali@0	665
ali@0	666	/*****************************************************/
ali@0	667	/* V.24 Sigh. Yet Another Header Change */
ali@0	668	/*****************************************************/
ali@0	669
ali@0	670	while (fgets(aline, LINEBUFSIZE-1, infile)) {
ali@0	671	while (aline[strlen(aline)-1] == 10 \|\| aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0;
ali@0	672	linecnt++;
ali@0	673	if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") \|\| strstr(aline, "COPYRIGHT"))) {
ali@0	674	if (spline)
ali@0	675	printf(" --> Duplicate header?\n");
ali@0	676	spline = linecnt + 1; /* first line of non-header text, that is */
ali@0	677	}
ali@0	678	if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) {
ali@0	679	if (nspline)
ali@0	680	printf(" --> Duplicate header?\n");
ali@0	681	nspline = linecnt + 1; /* first line of non-header text, that is */
ali@0	682	}
ali@0	683	if (spline \|\| nspline) {
ali@0	684	lowerit(aline);
ali@0	685	if (strstr(aline, "end") && strstr(aline, "project gutenberg")) {
ali@0	686	if (strstr(aline, "end") < strstr(aline, "project gutenberg")) {
ali@0	687	if (footerline) {
ali@0	688	if (!nspline) /* it's an old-form header - we can detect duplicates */
ali@0	689	printf(" --> Duplicate footer?\n");
ali@0	690	else
ali@0	691	;
ali@0	692	}
ali@0	693	else {
ali@0	694	footerline = linecnt;
ali@0	695	}
ali@0	696	}
ali@0	697	}
ali@0	698	}
ali@0	699	if (spline) firstline = spline;
ali@0	700	if (nspline) firstline = nspline; /* override with new */
ali@0	701
ali@0	702	if (footerline) continue; /* 0.99+ don't count the boilerplate in the footer */
ali@0	703
ali@0	704	llen = strlen(aline);
ali@0	705	totlen += llen;
ali@0	706	for (i = 0; i < llen; i++) {
ali@0	707	if ((unsigned char)aline[i] > 127) binlen++;
ali@0	708	if (gcisalpha(aline[i])) alphalen++;
ali@0	709	if (i > 0)
ali@0	710	if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1]))
ali@0	711	endquote_count++;
ali@0	712	}
ali@0	713	if (strlen(aline) > 2
ali@0	714	&& lastlen > 2 && lastlen < SHORTEST_PG_LINE
ali@0	715	&& lastblen > 2 && lastblen > SHORTEST_PG_LINE
ali@0	716	&& laststart != CHAR_SPACE)
ali@0	717	shortline++;
ali@0	718
ali@0	719	if (aline) / fixed line below for 0.96 */
ali@0	720	if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++;
ali@0	721
ali@0	722	if (strstr(aline, ".,")) dotcomma++;
ali@0	723	/* 0.98 only count ast lines for ignoring purposes where there is */
ali@0	724	/* locase text on the line */
ali@0	725	if (strstr(aline, "*")) {
ali@0	726	for (s = aline; *s; s++)
ali@0	727	if (s >='a' && s <= 'z')
ali@0	728	break;
ali@0	729	if (*s) astline++;
ali@0	730	}
ali@0	731	if (strstr(aline, "/"))
ali@0	732	fslashline++;
ali@0	733	for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
ali@0	734	if (aline[i] == '-' && aline[i-1] != '-') hyphens++;
ali@0	735
ali@0	736	if (llen > LONGEST_PG_LINE) longline++;
ali@0	737	if (llen > WAY_TOO_LONG) verylongline++;
ali@0	738
ali@0	739	if (strstr(aline, "<") && strstr(aline, ">")) {
ali@0	740	i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
ali@0	741	if (i > 0)
ali@0	742	htmcount++;
ali@0	743	if (strstr(aline, "<i>")) htmcount +=4; /* bonus marks! */
ali@0	744	}
ali@0	745
ali@0	746	/* Check for spaced em-dashes */
ali@0	747	if (strstr(aline,"--")) {
ali@0	748	emdash++;
ali@0	749	if (*(strstr(aline, "--")-1) == CHAR_SPACE \|\|
ali@0	750	(*(strstr(aline, "--")+2) == CHAR_SPACE))
ali@0	751	space_emdash++;
ali@0	752	if (*(strstr(aline, "--")-1) == CHAR_SPACE &&
ali@0	753	(*(strstr(aline, "--")+2) == CHAR_SPACE))
ali@0	754	non_PG_space_emdash++; /* count of em-dashes with spaces both sides */
ali@0	755	if (*(strstr(aline, "--")-1) != CHAR_SPACE &&
ali@0	756	(*(strstr(aline, "--")+2) != CHAR_SPACE))
ali@0	757	PG_space_emdash++; /* count of PG-type em-dashes with no spaces */
ali@0	758	}
ali@0	759
ali@0	760	for (s = aline; *s;) {
ali@0	761	s = getaword(s, inword);
ali@0	762	if (!strcmp(inword, "hij") \|\| !strcmp(inword, "niet"))
ali@0	763	Dutchcount++;
ali@0	764	if (!strcmp(inword, "dans") \|\| !strcmp(inword, "avec"))
ali@0	765	Frenchcount++;
ali@0	766	if (!strcmp(inword, "0") \|\| !strcmp(inword, "1"))
ali@0	767	standalone_digit++;
ali@0	768	}
ali@0	769
ali@0	770	/* Check for spaced dashes */
ali@0	771	if (strstr(aline," -"))
ali@0	772	if (*(strstr(aline, " -")+2) != '-')
ali@0	773	spacedash++;
ali@0	774	lastblen = lastlen;
ali@0	775	lastlen = strlen(aline);
ali@0	776	laststart = aline[0];
ali@0	777
ali@0	778	}
ali@0	779	fclose(infile);
ali@0	780
ali@0	781
ali@0	782	/* now, based on this quick view, make some snap decisions */
ali@0	783	if (cnt_spacend > 0) {
ali@0	784	printf(" --> %ld lines in this file have white space at end\n", cnt_spacend);
ali@0	785	}
ali@0	786
ali@0	787	warn_dotcomma = 1;
ali@0	788	if (dotcomma > 5) {
ali@0	789	warn_dotcomma = 0;
ali@0	790	printf(" --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma);
ali@0	791	}
ali@0	792
ali@0	793	/* if more than 50 lines, or one-tenth, are short, don't bother reporting them */
ali@0	794	warn_short = 1;
ali@0	795	if (shortline > 50 \|\| shortline * 10 > linecnt) {
ali@0	796	warn_short = 0;
ali@0	797	printf(" --> %ld lines in this file are short. Not reporting short lines.\n", shortline);
ali@0	798	}
ali@0	799
ali@0	800	/* if more than 50 lines, or one-tenth, are long, don't bother reporting them */
ali@0	801	warn_long = 1;
ali@0	802	if (longline > 50 \|\| longline * 10 > linecnt) {
ali@0	803	warn_long = 0;
ali@0	804	printf(" --> %ld lines in this file are long. Not reporting long lines.\n", longline);
ali@0	805	}
ali@0	806
ali@0	807	/* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */
ali@0	808	warn_ast = 1;
ali@0	809	if (astline > 10 ) {
ali@0	810	warn_ast = 0;
ali@0	811	printf(" --> %ld lines in this file contain asterisks. Not reporting them.\n", astline);
ali@0	812	}
ali@0	813
ali@0	814	/* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */
ali@0	815	warn_fslash = 1;
ali@0	816	if (fslashline > 10 ) {
ali@0	817	warn_fslash = 0;
ali@0	818	printf(" --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline);
ali@0	819	}
ali@0	820
ali@0	821	/* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */
ali@0	822	warn_endquote = 1;
ali@0	823	if (endquote_count > 20 ) {
ali@0	824	warn_endquote = 0;
ali@0	825	printf(" --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count);
ali@0	826	}
ali@0	827
ali@0	828	/* if more than 10 lines contain standalone digits, don't bother reporting them V.0.97 */
ali@0	829	warn_digit = 1;
ali@0	830	if (standalone_digit > 10 ) {
ali@0	831	warn_digit = 0;
ali@0	832	printf(" --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit);
ali@0	833	}
ali@0	834
ali@0	835	/* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */
ali@0	836	warn_hyphen = 1;
ali@0	837	if (hyphens > 20 ) {
ali@0	838	warn_hyphen = 0;
ali@0	839	printf(" --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens);
ali@0	840	}
ali@0	841
ali@0	842	if (htmcount > 20 && !pswit[MARKUP_SWITCH]) {
ali@0	843	printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@0	844	pswit[MARKUP_SWITCH] = 1;
ali@0	845	}
ali@0	846
ali@0	847	if (verylongline > 0) {
ali@0	848	printf(" --> %ld lines in this file are VERY long!\n", verylongline);
ali@0	849	}
ali@0	850
ali@0	851	/* If there are more non-PG spaced dashes than PG em-dashes, */
ali@0	852	/* assume it's deliberate */
ali@0	853	/* Current PG guidelines say don't use them, but older texts do,*/
ali@0	854	/* and some people insist on them whatever the guidelines say. */
ali@0	855	/* V.20 removed requirement that PG_space_emdash be greater than*/
ali@0	856	/* ten before turning off warnings about spaced dashes. */
ali@0	857	warn_dash = 1;
ali@0	858	if (spacedash + non_PG_space_emdash > PG_space_emdash) {
ali@0	859	warn_dash = 0;
ali@0	860	printf(" --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash);
ali@0	861	}
ali@0	862
ali@0	863	/* if more than a quarter of characters are hi-bit, bug out */
ali@0	864	warn_bin = 1;
ali@0	865	if (binlen * 4 > totlen) {
ali@0	866	printf(" --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n");
ali@0	867	exit(1);
ali@0	868	}
ali@0	869	if (alphalen * 4 < totlen) {
ali@0	870	printf(" --> This file does not appear to be text. Terminating. Best of luck with it!\n");
ali@0	871	exit(1);
ali@0	872	}
ali@0	873	if ((binlen * 100 > totlen) \|\| (binlen > 100)) {
ali@0	874	printf(" --> There are a lot of foreign letters here. Not reporting them.\n");
ali@0	875	warn_bin = 0;
ali@0	876	}
ali@0	877
ali@0	878	/* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */
ali@0	879	isDutch = 0;
ali@0	880	if (Dutchcount > 50) {
ali@0	881	isDutch = 1;
ali@0	882	printf(" --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n");
ali@0	883	}
ali@0	884
ali@0	885	isFrench = 0;
ali@0	886	if (Frenchcount > 50) {
ali@0	887	isFrench = 1;
ali@0	888	printf(" --> This looks like French - switching off some doublepunct.\n");
ali@0	889	}
ali@0	890
ali@0	891	if (firstline && footerline)
ali@0	892	printf(" The PG header and footer appear to be already on.\n");
ali@0	893	else {
ali@0	894	if (firstline)
ali@0	895	printf(" The PG header is on - no footer.\n");
ali@0	896	if (footerline)
ali@0	897	printf(" The PG footer is on - no header.\n");
ali@0	898	}
ali@0	899	printf("\n");
ali@0	900
ali@0	901	/* V.22 George Davis asked for an override switch to force it to list everything */
ali@0	902	if (pswit[VERBOSE_SWITCH]) {
ali@0	903	warn_bin = 1;
ali@0	904	warn_short = 1;
ali@0	905	warn_dotcomma = 1;
ali@0	906	warn_long = 1;
ali@0	907	warn_dash = 1;
ali@0	908	warn_digit = 1;
ali@0	909	warn_ast = 1;
ali@0	910	warn_fslash = 1;
ali@0	911	warn_hyphen = 1;
ali@0	912	warn_endquote = 1;
ali@0	913	printf(" * Verbose output is ON -- you asked for it! *\n");
ali@0	914	}
ali@0	915
ali@0	916	if (isDutch)
ali@0	917	warn_dash = 0; /* Frank suggested turning it REALLY off for Dutch */
ali@0	918
ali@0	919	if ((infile = fopen(filename, "rb")) == NULL) {
ali@0	920	if (pswit[STDOUT_SWITCH])
ali@0	921	fprintf(stdout, "gutcheck: cannot open %s\n", filename);
ali@0	922	else
ali@0	923	fprintf(stderr, "gutcheck: cannot open %s\n", filename);
ali@0	924	exit(1);
ali@0	925	}
ali@0	926
ali@0	927	if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */
ali@0	928	printf(" --> I don't really know where this text starts. \n");
ali@0	929	printf(" There are no reference points.\n");
ali@0	930	printf(" I'm going to have to report the header and footer as well.\n");
ali@0	931	firstline=0;
ali@0	932	}
ali@0	933
ali@0	934
ali@0	935
ali@0	936	/*****************************************************/
ali@0	937	/* */
ali@0	938	/* Here we go with the main pass. Hold onto yer hat! */
ali@0	939	/* */
ali@0	940	/*****************************************************/
ali@0	941
ali@0	942	/* Re-init some variables we've dirtied */
ali@0	943	quot = squot = linecnt = 0;
ali@0	944	laststart = CHAR_SPACE;
ali@0	945	lastlen = lastblen = 0;
ali@0	946
ali@0	947	while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) {
ali@0	948	linecnt++;
ali@0	949	if (linecnt == 1) isnewpara = 1;
ali@0	950	if (pswit[DP_SWITCH])
ali@0	951	if (!strncmp(aline, "-----File: ", 11))
ali@0	952	continue; // skip DP page separators completely
ali@0	953	if (linecnt < firstline \|\| (footerline > 0 && linecnt > footerline)) {
ali@0	954	if (pswit[HEADER_SWITCH]) {
ali@0	955	if (!strncmp(aline, "Title:", 6))
ali@0	956	printf(" %s\n", aline);
ali@0	957	if (!strncmp (aline, "Author:", 7))
ali@0	958	printf(" %s\n", aline);
ali@0	959	if (!strncmp(aline, "Release Date:", 13))
ali@0	960	printf(" %s\n", aline);
ali@0	961	if (!strncmp(aline, "Edition:", 8))
ali@0	962	printf(" %s\n\n", aline);
ali@0	963	}
ali@0	964	continue; /* skip through the header */
ali@0	965	}
ali@0	966	checked_linecnt++;
ali@0	967	s = aline;
ali@0	968	isemptyline = 1; /* assume the line is empty until proven otherwise */
ali@0	969
ali@0	970	/* If we are in a state of unbalanced quotes, and this line */
ali@0	971	/* doesn't begin with a quote, output the stored error message */
ali@0	972	/* If the -P switch was used, print the warning even if the */
ali@0	973	/* new para starts with quotes */
ali@0	974	/* Version .20 - if the new paragraph does start with a quote, */
ali@0	975	/* but is indented, I was giving a spurious error. Need to */
ali@0	976	/* check the first _non-space_ character on the line rather */
ali@0	977	/* than the first character when deciding whether the para */
ali@0	978	/* starts with a quote. Using t for this. */
ali@0	979	t = s;
ali@0	980	while (*t == ' ') t++;
ali@0	981	if (*dquote_err)
ali@0	982	if (*t != CHAR_DQUOTE \|\| pswit[QPARA_SWITCH]) {
ali@0	983	if (!pswit[OVERVIEW_SWITCH]) {
ali@0	984	if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0	985	printf(dquote_err);
ali@0	986	}
ali@0	987	else
ali@0	988	cnt_dquot++;
ali@0	989	}
ali@0	990	if (*squote_err) {
ali@0	991	if (t != CHAR_SQUOTE && t != CHAR_OPEN_SQUOTE \|\| pswit[QPARA_SWITCH] \|\| squot) {
ali@0	992	if (!pswit[OVERVIEW_SWITCH]) {
ali@0	993	if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0	994	printf(squote_err);
ali@0	995	}
ali@0	996	else
ali@0	997	cnt_squot++;
ali@0	998	}
ali@0	999	squot = 0;
ali@0	1000	}
ali@0	1001	if (*rbrack_err) {
ali@0	1002	if (!pswit[OVERVIEW_SWITCH]) {
ali@0	1003	if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0	1004	printf(rbrack_err);
ali@0	1005	}
ali@0	1006	else
ali@0	1007	cnt_brack++;
ali@0	1008	}
ali@0	1009	if (*sbrack_err) {
ali@0	1010	if (!pswit[OVERVIEW_SWITCH]) {
ali@0	1011	if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0	1012	printf(sbrack_err);
ali@0	1013	}
ali@0	1014	else
ali@0	1015	cnt_brack++;
ali@0	1016	}
ali@0	1017	if (*cbrack_err) {
ali@0	1018	if (!pswit[OVERVIEW_SWITCH]) {
ali@0	1019	if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0	1020	printf(cbrack_err);
ali@0	1021	}
ali@0	1022	else
ali@0	1023	cnt_brack++;
ali@0	1024	}
ali@0	1025	if (*unders_err) {
ali@0	1026	if (!pswit[OVERVIEW_SWITCH]) {
ali@0	1027	if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
ali@0	1028	printf(unders_err);
ali@0	1029	}
ali@0	1030	else
ali@0	1031	cnt_brack++;
ali@0	1032	}
ali@0	1033
ali@0	1034	dquote_err = squote_err = rbrack_err = cbrack_err =
ali@0	1035	sbrack_err = unders_err = 0;
ali@0	1036
ali@0	1037
ali@0	1038	/* look along the line, accumulate the count of quotes, and see */
ali@0	1039	/* if this is an empty line - i.e. a line with nothing on it */
ali@0	1040	/* but spaces. */
ali@0	1041	/* V .12 also if line has just spaces, * and/or - on it, don't */
ali@0	1042	/* count it, since empty lines with asterisks or dashes to */
ali@0	1043	/* separate sections are common. */
ali@0	1044	/* V .15 new single-quote checking - has to be better than the */
ali@0	1045	/* previous version, but how much better? fingers crossed! */
ali@0	1046	/* V .20 add period to * and - as characters on a separator line*/
ali@0	1047	s = aline;
ali@0	1048	while (*s) {
ali@0	1049	if (*s == CHAR_DQUOTE) quot++;
ali@0	1050	if (s == CHAR_SQUOTE \|\| s == CHAR_OPEN_SQUOTE)
ali@0	1051	if (s == aline) { /* at start of line, it can only be an openquote */
ali@0	1052	if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */
ali@0	1053	open_single_quote++;
ali@0	1054	}
ali@0	1055	else
ali@0	1056	if (gcisalpha((s-1)) && gcisalpha((s+1)))
ali@0	1057	; /* do nothing! - it's definitely an apostrophe, not a quote */
ali@0	1058	else /* it's outside a word - let's check it out */
ali@0	1059	if (s == CHAR_OPEN_SQUOTE \|\| gcisalpha((s+1))) { /* it damwell better BE an openquote */
ali@0	1060	if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */
ali@0	1061	open_single_quote++;
ali@0	1062	}
ali@0	1063	else { /* now - is it a closequote? */
ali@0	1064	guessquote = 0; /* accumulate clues */
ali@0	1065	if (gcisalpha((s-1))) { / it follows a letter - could be either */
ali@0	1066	guessquote += 1;
ali@0	1067	if ((s-1) == 's') { / looks like a plural apostrophe */
ali@0	1068	guessquote -= 3;
ali@0	1069	if ((s+1) == CHAR_SPACE) / bonus marks! */
ali@0	1070	guessquote -= 2;
ali@0	1071	}
ali@0	1072	}
ali@0	1073	else /* it doesn't have a letter either side */
ali@0	1074	if (strchr(".?!,;:", (s-1)) && (strchr(".?!,;: ", (s+1))))
ali@0	1075	guessquote += 8; /* looks like a closequote */
ali@0	1076	else
ali@0	1077	guessquote += 1;
ali@0	1078	if (open_single_quote > close_single_quote)
ali@0	1079	guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */
ali@0	1080	else
ali@0	1081	guessquote -= 1;
ali@0	1082	if (guessquote >= 0)
ali@0	1083	close_single_quote++;
ali@0	1084	}
ali@0	1085
ali@0	1086	if (*s != CHAR_SPACE
ali@0	1087	&& *s != '-'
ali@0	1088	&& *s != '.'
ali@0	1089	&& *s != CHAR_ASTERISK
ali@0	1090	&& *s != 13
ali@0	1091	&& s != 10) isemptyline = 0; / ignore lines like * * * as spacers */
ali@0	1092	if (*s == CHAR_UNDERSCORE) c_unders++;
ali@0	1093	if (*s == CHAR_OPEN_CBRACK) c_brack++;
ali@0	1094	if (*s == CHAR_CLOSE_CBRACK) c_brack--;
ali@0	1095	if (*s == CHAR_OPEN_RBRACK) r_brack++;
ali@0	1096	if (*s == CHAR_CLOSE_RBRACK) r_brack--;
ali@0	1097	if (*s == CHAR_OPEN_SBRACK) s_brack++;
ali@0	1098	if (*s == CHAR_CLOSE_SBRACK) s_brack--;
ali@0	1099	s++;
ali@0	1100	}
ali@0	1101
ali@0	1102	if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph */
ali@0	1103	start_para_line = linecnt;
ali@0	1104	strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */
ali@0	1105	parastart[79] = 0;
ali@0	1106	dquotepar = squotepar = 0; /* restart the quote count 0.98 */
ali@0	1107	s = aline;
ali@0	1108	while (!gcisalpha(s) && !gcisdigit(s) && s) s++; / V.97 fixed bug - overran line and gave false warning - rare */
ali@0	1109	if (s >= 'a' && s <='z') { /* and its first letter is lowercase */
ali@0	1110	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1111	if (!pswit[OVERVIEW_SWITCH])
ali@0	1112	printf(" Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1);
ali@0	1113	else
ali@0	1114	cnt_punct++;
ali@0	1115	}
ali@0	1116	isnewpara = 0; /* Signal the end of new para processing */
ali@0	1117	}
ali@0	1118
ali@0	1119	/* Check for an em-dash broken at line end */
ali@0	1120	if (enddash && *aline == '-') {
ali@0	1121	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1122	if (!pswit[OVERVIEW_SWITCH])
ali@0	1123	printf(" Line %ld column 1 - Broken em-dash?\n", linecnt);
ali@0	1124	else
ali@0	1125	cnt_punct++;
ali@0	1126	}
ali@0	1127	enddash = 0;
ali@0	1128	for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--);
ali@0	1129	if (s >= aline && *s == '-')
ali@0	1130	enddash = 1;
ali@0	1131
ali@0	1132
ali@0	1133	/* Check for invalid or questionable characters in the line */
ali@0	1134	/* Anything above 127 is invalid for plain ASCII, and */
ali@0	1135	/* non-printable control characters should also be flagged. */
ali@0	1136	/* Tabs should generally not be there. */
ali@0	1137	/* Jan 06, in 0.99: Hm. For some strange reason, I either */
ali@0	1138	/* never created or deleted the check for unprintable */
ali@0	1139	/* control characters. They should be reported even if */
ali@0	1140	/* warn_bin is on, I think, and in full. */
ali@0	1141
ali@0	1142	for (s = aline; *s; s++) {
ali@0	1143	i = (unsigned char) *s;
ali@0	1144	if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) {
ali@0	1145	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1146	if (!pswit[OVERVIEW_SWITCH])
ali@0	1147	printf(" Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i);
ali@0	1148	else
ali@0	1149	cnt_bin++;
ali@0	1150	}
ali@0	1151	}
ali@0	1152
ali@0	1153	if (warn_bin) {
ali@0	1154	eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0; /* don't repeat multiple warnings on one line */
ali@0	1155	for (s = aline; *s; s++) {
ali@0	1156	if (!eNon_A && ((s < CHAR_SPACE && s != 9 && s != '\n') \|\| (unsigned char)s > 127)) {
ali@0	1157	i = s; / annoying kludge for signed chars */
ali@0	1158	if (i < 0) i += 256;
ali@0	1159	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1160	if (!pswit[OVERVIEW_SWITCH])
ali@0	1161	if (i > 127 && i < 160)
ali@0	1162	printf(" Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i);
ali@0	1163	else
ali@0	1164	printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i);
ali@0	1165	else
ali@0	1166	cnt_bin++;
ali@0	1167	eNon_A = 1;
ali@0	1168	}
ali@0	1169	if (!eTab && *s == CHAR_TAB) {
ali@0	1170	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1171	if (!pswit[OVERVIEW_SWITCH])
ali@0	1172	printf(" Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1);
ali@0	1173	else
ali@0	1174	cnt_odd++;
ali@0	1175	eTab = 1;
ali@0	1176	}
ali@0	1177	if (!eTilde && s == CHAR_TILDE) { / often used by OCR software to indicate an unrecognizable character */
ali@0	1178	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1179	if (!pswit[OVERVIEW_SWITCH])
ali@0	1180	printf(" Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1);
ali@0	1181	else
ali@0	1182	cnt_odd++;
ali@0	1183	eTilde = 1;
ali@0	1184	}
ali@0	1185	if (!eCarat && *s == CHAR_CARAT) {
ali@0	1186	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1187	if (!pswit[OVERVIEW_SWITCH])
ali@0	1188	printf(" Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1);
ali@0	1189	else
ali@0	1190	cnt_odd++;
ali@0	1191	eCarat = 1;
ali@0	1192	}
ali@0	1193	if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) {
ali@0	1194	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1195	if (!pswit[OVERVIEW_SWITCH])
ali@0	1196	printf(" Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1);
ali@0	1197	else
ali@0	1198	cnt_odd++;
ali@0	1199	eFSlash = 1;
ali@0	1200	}
ali@0	1201	/* report asterisks only in paranoid mode, since they're often deliberate */
ali@0	1202	if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) {
ali@0	1203	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1204	if (!pswit[OVERVIEW_SWITCH])
ali@0	1205	printf(" Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1);
ali@0	1206	else
ali@0	1207	cnt_odd++;
ali@0	1208	eAst = 1;
ali@0	1209	}
ali@0	1210	}
ali@0	1211	}
ali@0	1212
ali@0	1213	/* Check for line too long */
ali@0	1214	if (warn_long) {
ali@0	1215	if (strlen(aline) > LONGEST_PG_LINE) {
ali@0	1216	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1217	if (!pswit[OVERVIEW_SWITCH])
ali@0	1218	printf(" Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline));
ali@0	1219	else
ali@0	1220	cnt_long++;
ali@0	1221	}
ali@0	1222	}
ali@0	1223
ali@0	1224	/* Check for line too short. */
ali@0	1225	/* This one is a bit trickier to implement: we don't want to */
ali@0	1226	/* flag the last line of a paragraph for being short, so we */
ali@0	1227	/* have to wait until we know that our current line is a */
ali@0	1228	/* "normal" line, then report the _previous_ line if it was too */
ali@0	1229	/* short. We also don't want to report indented lines like */
ali@0	1230	/* chapter heads or formatted quotations. We therefore keep */
ali@0	1231	/* lastlen as the length of the last line examined, and */
ali@0	1232	/* lastblen as the length of the last but one, and try to */
ali@0	1233	/* suppress unnecessary warnings by checking that both were of */
ali@0	1234	/* "normal" length. We keep the first character of the last */
ali@0	1235	/* line in laststart, and if it was a space, we assume that the */
ali@0	1236	/* formatting is deliberate. I can't figure out a way to */
ali@0	1237	/* distinguish something like a quoted verse left-aligned or */
ali@0	1238	/* the header or footer of a letter from a paragraph of short */
ali@0	1239	/* lines - maybe if I examined the whole paragraph, and if the */
ali@0	1240	/* para has less than, say, 8 lines and if all lines are short, */
ali@0	1241	/* then just assume it's OK? Need to look at some texts to see */
ali@0	1242	/* how often a formula like this would get the right result. */
ali@0	1243	/* V0.99 changed the tolerance for length to ignore from 2 to 1 */
ali@0	1244	if (warn_short) {
ali@0	1245	if (strlen(aline) > 1
ali@0	1246	&& lastlen > 1 && lastlen < SHORTEST_PG_LINE
ali@0	1247	&& lastblen > 1 && lastblen > SHORTEST_PG_LINE
ali@0	1248	&& laststart != CHAR_SPACE) {
ali@0	1249	if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
ali@0	1250	if (!pswit[OVERVIEW_SWITCH])
ali@0	1251	printf(" Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline));
ali@0	1252	else
ali@0	1253	cnt_short++;
ali@0	1254	}
ali@0	1255	}
ali@0	1256	lastblen = lastlen;
ali@0	1257	lastlen = strlen(aline);
ali@0	1258	laststart = aline[0];
ali@0	1259
ali@0	1260	/* look for punctuation at start of line */
ali@0	1261	if (aline && strchr(".?!,;:", aline[0])) { / if it's punctuation */
ali@0	1262	if (strncmp(". . .", aline, 5)) { /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */
ali@0	1263	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1264	if (!pswit[OVERVIEW_SWITCH])
ali@0	1265	printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt);
ali@0	1266	else
ali@0	1267	cnt_punct++;
ali@0	1268	}
ali@0	1269	}
ali@0	1270
ali@0	1271	/* Check for spaced em-dashes */
ali@0	1272	/* V.20 must check _all_ occurrences of "--" on the line */
ali@0	1273	/* hence the loop - even if the first double-dash is OK */
ali@0	1274	/* there may be another that's wrong later on. */
ali@0	1275	if (warn_dash) {
ali@0	1276	s = aline;
ali@0	1277	while (strstr(s,"--")) {
ali@0	1278	if (*(strstr(s, "--")-1) == CHAR_SPACE \|\|
ali@0	1279	(*(strstr(s, "--")+2) == CHAR_SPACE)) {
ali@0	1280	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1281	if (!pswit[OVERVIEW_SWITCH])
ali@0	1282	printf(" Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1);
ali@0	1283	else
ali@0	1284	cnt_dash++;
ali@0	1285	}
ali@0	1286	s = strstr(s,"--") + 2;
ali@0	1287	}
ali@0	1288	}
ali@0	1289
ali@0	1290	/* Check for spaced dashes */
ali@0	1291	if (warn_dash)
ali@0	1292	if (strstr(aline," -")) {
ali@0	1293	if (*(strstr(aline, " -")+2) != '-') {
ali@0	1294	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1295	if (!pswit[OVERVIEW_SWITCH])
ali@0	1296	printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1);
ali@0	1297	else
ali@0	1298	cnt_dash++;
ali@0	1299	}
ali@0	1300	}
ali@0	1301	else
ali@0	1302	if (strstr(aline,"- ")) {
ali@0	1303	if (*(strstr(aline, "- ")-1) != '-') {
ali@0	1304	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1305	if (!pswit[OVERVIEW_SWITCH])
ali@0	1306	printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1);
ali@0	1307	else
ali@0	1308	cnt_dash++;
ali@0	1309	}
ali@0	1310	}
ali@0	1311
ali@0	1312	/* v 0.99 */
ali@0	1313	/* Check for unmarked paragraphs indicated by separate speakers */
ali@0	1314	/* May well be false positive: */
ali@0	1315	/* "Bravo!" "Wonderful!" called the crowd. */
ali@0	1316	/* but useful all the same. */
ali@0	1317	s = wrk;
ali@0	1318	*s = 0;
ali@0	1319	if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
ali@0	1320	if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
ali@0	1321	if (*s) {
ali@0	1322	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1323	if (!pswit[OVERVIEW_SWITCH])
ali@0	1324	printf(" Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1);
ali@0	1325	else
ali@0	1326	cnt_punct++;
ali@0	1327	}
ali@0	1328
ali@0	1329
ali@0	1330
ali@0	1331	/* Check for "to he" and other easy he/be errors */
ali@0	1332	/* This is a very inadequate effort on the he/be problem, */
ali@0	1333	/* but the phrase "to he" is always an error, whereas "to */
ali@0	1334	/* be" is quite common. I chuckle when it does catch one! */
ali@0	1335	/* Similarly, '"Quiet!", be said.' is a non-be error */
ali@0	1336	/* V .18 - "to he" is _not_ always an error!: */
ali@0	1337	/* "Where they went to he couldn't say." */
ali@0	1338	/* but I'm leaving it in anyway. */
ali@0	1339	/* V .20 Another false positive: */
ali@0	1340	/* What would "Cinderella" be without the . . . */
ali@0	1341	/* and another "If he wants to he can see for himself." */
ali@0	1342	/* V .21 Added " is be " and " be is " and " be was " */
ali@0	1343	/* V .99 Added jeebies code -- removed again. */
ali@0	1344	/* Is jeebies code worth adding? Rare to see he/be */
ali@0	1345	/* errors with modern OCR. Separate program? Yes! */
ali@0	1346	/* jeebies does the job without cluttering up this. */
ali@0	1347	/* We do get a few more queryable pairs from the */
ali@0	1348	/* project though -- they're cheap to implement. */
ali@0	1349	/* Also added a column number for guiguts. */
ali@0	1350
ali@0	1351	s = wrk;
ali@0	1352	*s = 0;
ali@0	1353	if (strstr(aline," to he ")) s = strstr(aline," to he ");
ali@0	1354	if (strstr(aline,"\" be ")) s = strstr(aline,"\" be ");
ali@0	1355	if (strstr(aline,"\", be ")) s = strstr(aline,"\", be ");
ali@0	1356	if (strstr(aline," is be ")) s = strstr(aline," is be ");
ali@0	1357	if (strstr(aline," be is ")) s = strstr(aline," be is ");
ali@0	1358	if (strstr(aline," was be ")) s = strstr(aline," was be ");
ali@0	1359	if (strstr(aline," be would ")) s = strstr(aline," be would ");
ali@0	1360	if (strstr(aline," be could ")) s = strstr(aline," be could ");
ali@0	1361	if (*s) {
ali@0	1362	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1363	if (!pswit[OVERVIEW_SWITCH])
ali@0	1364	printf(" Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1);
ali@0	1365	else
ali@0	1366	cnt_word++;
ali@0	1367	}
ali@0	1368
ali@0	1369	s = wrk;
ali@0	1370	*s = 0;
ali@0	1371	if (strstr(aline," i bad ")) s = strstr(aline," i bad ");
ali@0	1372	if (strstr(aline," you bad ")) s = strstr(aline," you bad ");
ali@0	1373	if (strstr(aline," he bad ")) s = strstr(aline," he bad ");
ali@0	1374	if (strstr(aline," she bad ")) s = strstr(aline," she bad ");
ali@0	1375	if (strstr(aline," they bad ")) s = strstr(aline," they bad ");
ali@0	1376	if (strstr(aline," a had ")) s = strstr(aline," a had ");
ali@0	1377	if (strstr(aline," the had ")) s = strstr(aline," the had ");
ali@0	1378	if (*s) {
ali@0	1379	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1380	if (!pswit[OVERVIEW_SWITCH])
ali@0	1381	printf(" Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1);
ali@0	1382	else
ali@0	1383	cnt_word++;
ali@0	1384	}
ali@0	1385
ali@0	1386
ali@0	1387	/* V .97 Added ", hut " Not too common, hut pretty certain */
ali@0	1388	/* V.99 changed to add a column number for guiguts */
ali@0	1389	s = wrk;
ali@0	1390	*s = 0;
ali@0	1391	if (strstr(aline,", hut ")) s = strstr(aline,", hut ");
ali@0	1392	if (strstr(aline,"; hut ")) s = strstr(aline,"; hut ");
ali@0	1393	if (*s) {
ali@0	1394	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1395	if (!pswit[OVERVIEW_SWITCH])
ali@0	1396	printf(" Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1);
ali@0	1397	else
ali@0	1398	cnt_word++;
ali@0	1399	}
ali@0	1400
ali@0	1401	/* Special case - angled bracket in front of "From" placed there by an MTA */
ali@0	1402	/* when sending an e-mail. V .21 */
ali@0	1403	if (strstr(aline, ">From")) {
ali@0	1404	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1405	if (!pswit[OVERVIEW_SWITCH])
ali@0	1406	printf(" Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1);
ali@0	1407	else
ali@0	1408	cnt_punct++;
ali@0	1409	}
ali@0	1410
ali@0	1411	/* V 0.98 Check for a single character line - often an overflow from bad wrapping. */
ali@0	1412	if (aline && !(aline+1)) {
ali@0	1413	if (aline == 'I' \|\| aline == 'V' \|\| aline == 'X' \|\| aline == 'L' \|\| gcisdigit(*aline))
ali@0	1414	; /* nothing - ignore numerals alone on a line. */
ali@0	1415	else {
ali@0	1416	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1417	if (!pswit[OVERVIEW_SWITCH])
ali@0	1418	printf(" Line %ld column 1 - Query single character line\n", linecnt);
ali@0	1419	else
ali@0	1420	cnt_punct++;
ali@0	1421	}
ali@0	1422	}
ali@0	1423
ali@0	1424	/* V 0.98 Check for I" - often should be ! */
ali@0	1425	if (strstr(aline, " I\"")) {
ali@0	1426	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1427	if (!pswit[OVERVIEW_SWITCH])
ali@0	1428	printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline);
ali@0	1429	else
ali@0	1430	cnt_punct++;
ali@0	1431	}
ali@0	1432
ali@0	1433	/* V 0.98 Check for period without a capital letter. Cut-down from gutspell */
ali@0	1434	/* Only works when it happens on a single line. */
ali@0	1435
ali@0	1436	if (pswit[PARANOID_SWITCH])
ali@0	1437	for (t = s = aline; strstr(t,". ");) {
ali@0	1438	t = strstr(t, ". ");
ali@0	1439	if (t == s) {
ali@0	1440	t++;
ali@0	1441	continue; /* start of line punctuation is handled elsewhere */
ali@0	1442	}
ali@0	1443	if (!gcisalpha(*(t-1))) {
ali@0	1444	t++;
ali@0	1445	continue;
ali@0	1446	}
ali@0	1447	if (isDutch) { /* For Frank & Jeroen -- 's Middags case */
ali@0	1448	if (*(t+2) == CHAR_SQUOTE &&
ali@0	1449	(t+3)>='a' && (t+3)<='z' &&
ali@0	1450	*(t+4) == CHAR_SPACE &&
ali@0	1451	(t+5)>='A' && (t+5)<='Z') {
ali@0	1452	t++;
ali@0	1453	continue;
ali@0	1454	}
ali@0	1455	}
ali@0	1456	s1 = t+2;
ali@0	1457	while (s1 && !gcisalpha(s1) && !isdigit(*s1))
ali@0	1458	s1++;
ali@0	1459	if (s1 >= 'a' && s1 <= 'z') { /* we have something to investigate */
ali@0	1460	istypo = 1;
ali@0	1461	for (s1 = t - 1; s1 >= s &&
ali@0	1462	(gcisalpha(s1) \|\| gcisdigit(s1) \|\|
ali@0	1463	(s1 == CHAR_SQUOTE && gcisalpha((s1+1)) && gcisalpha((s1-1)))); s1--); / so let's go back and find out */
ali@0	1464	s1++;
ali@0	1465	for (i = 0; s1 && s1 != '.'; s1++, i++)
ali@0	1466	testword[i] = *s1;
ali@0	1467	testword[i] = 0;
ali@0	1468	for (i = 0; *abbrev[i]; i++)
ali@0	1469	if (!strcmp(testword, abbrev[i]))
ali@0	1470	istypo = 0;
ali@0	1471	// if (testword >= 'A' && testword <= 'Z')
ali@0	1472	// istypo = 0;
ali@0	1473	if (gcisdigit(*testword)) istypo = 0;
ali@0	1474	if (!*(testword+1)) istypo = 0;
ali@0	1475	if (isroman(testword)) istypo = 0;
ali@0	1476	if (istypo) {
ali@0	1477	istypo = 0;
ali@0	1478	for (i = 0; testword[i]; i++)
ali@0	1479	if (strchr(vowels, testword[i]))
ali@0	1480	istypo = 1;
ali@0	1481	}
ali@0	1482	if (istypo) {
ali@0	1483	isdup = 0;
ali@0	1484	if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
ali@0	1485	for (i = 0; i < qperiod_index; i++)
ali@0	1486	if (!strcmp(testword, qperiod[i])) {
ali@0	1487	isdup = 1;
ali@0	1488	}
ali@0	1489	if (!isdup) {
ali@0	1490	if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
ali@0	1491	strcpy(qperiod[qperiod_index], testword);
ali@0	1492	qperiod_index++;
ali@0	1493	}
ali@0	1494	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1495	if (!pswit[OVERVIEW_SWITCH])
ali@0	1496	printf(" Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1);
ali@0	1497	else
ali@0	1498	cnt_punct++;
ali@0	1499	}
ali@0	1500	}
ali@0	1501	}
ali@0	1502	t++;
ali@0	1503	}
ali@0	1504
ali@0	1505
ali@0	1506	if (pswit[TYPO_SWITCH]) { /* Should have put this condition in at the start of 0.99. Duh! */
ali@0	1507	/* Check for words usually not followed by punctuation 0.99 */
ali@0	1508	for (s = aline; *s;) {
ali@0	1509	wordstart = s;
ali@0	1510	s = getaword(s, inword);
ali@0	1511	if (!*inword) continue;
ali@0	1512	lowerit(inword);
ali@0	1513	for (i = 0; *nocomma[i]; i++)
ali@0	1514	if (!strcmp(inword, nocomma[i])) {
ali@0	1515	if (s == ',' \|\| s == ';' \|\| *s == ':') {
ali@0	1516	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1517	if (!pswit[OVERVIEW_SWITCH])
ali@0	1518	printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
ali@0	1519	else
ali@0	1520	cnt_punct++;
ali@0	1521	}
ali@0	1522	}
ali@0	1523	for (i = 0; *noperiod[i]; i++)
ali@0	1524	if (!strcmp(inword, noperiod[i])) {
ali@0	1525	if (s == '.' \|\| s == '!') {
ali@0	1526	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1527	if (!pswit[OVERVIEW_SWITCH])
ali@0	1528	printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
ali@0	1529	else
ali@0	1530	cnt_punct++;
ali@0	1531	}
ali@0	1532	}
ali@0	1533	}
ali@0	1534	}
ali@0	1535
ali@0	1536
ali@0	1537
ali@0	1538	/* Check for commonly mistyped words, and digits like 0 for O in a word */
ali@0	1539	for (s = aline; *s;) {
ali@0	1540	wordstart = s;
ali@0	1541	s = getaword(s, inword);
ali@0	1542	if (!inword) continue; / don't bother with empty lines */
ali@0	1543	if (mixdigit(inword)) {
ali@0	1544	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1545	if (!pswit[OVERVIEW_SWITCH])
ali@0	1546	printf(" Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword);
ali@0	1547	else
ali@0	1548	cnt_word++;
ali@0	1549	}
ali@0	1550
ali@0	1551	/* put the word through a series of tests for likely typos and OCR errors */
ali@0	1552	/* V.21 I had allowed lots of typo-checking even with the typo switch */
ali@0	1553	/* turned off, but I really should disallow reporting of them when */
ali@0	1554	/* the switch is off. Hence the "if" below. */
ali@0	1555	if (pswit[TYPO_SWITCH]) {
ali@0	1556	istypo = 0;
ali@0	1557	strcpy(testword, inword);
ali@0	1558	alower = 0;
ali@0	1559	for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */
ali@0	1560	if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1;
ali@0	1561	if (alower && testword[i] >= 'A' && testword[i] <= 'Z') {
ali@0	1562	/* we have an uppercase mid-word. However, there are common cases: */
ali@0	1563	/* Mac and Mc like McGill */
ali@0	1564	/* French contractions like l'Abbe */
ali@0	1565	if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') \|\|
ali@0	1566	(i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') \|\|
ali@0	1567	(i > 0 && testword[i-1] == CHAR_SQUOTE))
ali@0	1568	; /* do nothing! */
ali@0	1569
ali@0	1570	else { /* V.97 - remove separate case of uppercase within word so that */
ali@0	1571	/* names like VanAllen fall into qword_index and get reported only once */
ali@0	1572	istypo = 1;
ali@0	1573	}
ali@0	1574	}
ali@0	1575	testword[i] = (char)tolower(testword[i]);
ali@0	1576	}
ali@0	1577
ali@0	1578	/* check for certain unlikely two-letter combinations at word start and end */
ali@0	1579	/* V.0.97 - this replaces individual hardcoded checks in previous versions */
ali@0	1580	if (strlen(testword) > 1) {
ali@0	1581	for (i = 0; *nostart[i]; i++)
ali@0	1582	if (!strncmp(testword, nostart[i], 2))
ali@0	1583	istypo = 1;
ali@0	1584	for (i = 0; *noend[i]; i++)
ali@0	1585	if (!strncmp(testword + strlen(testword) -2, noend[i], 2))
ali@0	1586	istypo = 1;
ali@0	1587	}
ali@0	1588
ali@0	1589
ali@0	1590	/* ght is common, gbt never. Like that. */
ali@0	1591	if (strstr(testword, "cb")) istypo = 1;
ali@0	1592	if (strstr(testword, "gbt")) istypo = 1;
ali@0	1593	if (strstr(testword, "pbt")) istypo = 1;
ali@0	1594	if (strstr(testword, "tbs")) istypo = 1;
ali@0	1595	if (strstr(testword, "mrn")) istypo = 1;
ali@0	1596	if (strstr(testword, "ahle")) istypo = 1;
ali@0	1597	if (strstr(testword, "ihle")) istypo = 1;
ali@0	1598
ali@0	1599	/* "TBE" does happen - like HEARTBEAT - but uncommon. */
ali@0	1600	/* Also "TBI" - frostbite, outbid - but uncommon. */
ali@0	1601	/* Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals, */
ali@0	1602	/* but these are covered in V.20. "ii" is a common scanno. */
ali@0	1603	if (strstr(testword, "tbi")) istypo = 1;
ali@0	1604	if (strstr(testword, "tbe")) istypo = 1;
ali@0	1605	if (strstr(testword, "ii")) istypo = 1;
ali@0	1606
ali@0	1607	/* check for no vowels or no consonants. */
ali@0	1608	/* If none, flag a typo */
ali@0	1609	if (!istypo && strlen(testword)>1) {
ali@0	1610	vowel = consonant = 0;
ali@0	1611	for (i = 0; testword[i]; i++)
ali@0	1612	if (testword[i] == 'y' \|\| gcisdigit(testword[i])) { /* Yah, this is loose. */
ali@0	1613	vowel++;
ali@0	1614	consonant++;
ali@0	1615	}
ali@0	1616	else
ali@0	1617	if (strchr(vowels, testword[i])) vowel++;
ali@0	1618	else consonant++;
ali@0	1619	if (!vowel \|\| !consonant) {
ali@0	1620	istypo = 1;
ali@0	1621	}
ali@0	1622	}
ali@0	1623
ali@0	1624	/* now exclude the word from being reported if it's in */
ali@0	1625	/* the okword list */
ali@0	1626	for (i = 0; *okword[i]; i++)
ali@0	1627	if (!strcmp(testword, okword[i]))
ali@0	1628	istypo = 0;
ali@0	1629
ali@0	1630	/* what looks like a typo may be a Roman numeral. Exclude these */
ali@0	1631	if (istypo)
ali@0	1632	if (isroman(testword))
ali@0	1633	istypo = 0;
ali@0	1634
ali@0	1635	/* check the manual list of typos */
ali@0	1636	if (!istypo)
ali@0	1637	for (i = 0; *typo[i]; i++)
ali@0	1638	if (!strcmp(testword, typo[i]))
ali@0	1639	istypo = 1;
ali@0	1640
ali@0	1641
ali@0	1642	/* V.21 - check lowercase s and l - special cases */
ali@0	1643	/* V.98 - added "i" and "m" */
ali@0	1644	/* V.99 - added "j" often a semi-colon gone wrong */
ali@0	1645	/* - and "d" for a missing apostrophe - he d */
ali@0	1646	/* - and "n" for "in" */
ali@0	1647	if (!istypo && strlen(testword) == 1)
ali@0	1648	if (strchr("slmijdn", *inword))
ali@0	1649	istypo = 1;
ali@0	1650
ali@0	1651
ali@0	1652	if (istypo) {
ali@0	1653	isdup = 0;
ali@0	1654	if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
ali@0	1655	for (i = 0; i < qword_index; i++)
ali@0	1656	if (!strcmp(testword, qword[i])) {
ali@0	1657	isdup = 1;
ali@0	1658	++dupcnt[i];
ali@0	1659	}
ali@0	1660	if (!isdup) {
ali@0	1661	if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
ali@0	1662	strcpy(qword[qword_index], testword);
ali@0	1663	qword_index++;
ali@0	1664	}
ali@0	1665	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1666	if (!pswit[OVERVIEW_SWITCH]) {
ali@0	1667	printf(" Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword);
ali@0	1668	if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
ali@0	1669	printf(" - not reporting duplicates");
ali@0	1670	printf("\n");
ali@0	1671	}
ali@0	1672	else
ali@0	1673	cnt_word++;
ali@0	1674	}
ali@0	1675	}
ali@0	1676	} /* end of typo-checking */
ali@0	1677
ali@0	1678	/* check the user's list of typos */
ali@0	1679	if (!istypo)
ali@0	1680	if (usertypo_count)
ali@0	1681	for (i = 0; i < usertypo_count; i++)
ali@0	1682	if (!strcmp(testword, usertypo[i])) {
ali@0	1683	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1684	if (!pswit[OVERVIEW_SWITCH])
ali@0	1685	printf(" Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
ali@0	1686	}
ali@0	1687
ali@0	1688
ali@0	1689
ali@0	1690	if (pswit[PARANOID_SWITCH] && warn_digit) { /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/
ali@0	1691	if (!strcmp(inword, "0") \|\| !strcmp(inword, "1")) {
ali@0	1692	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1693	if (!pswit[OVERVIEW_SWITCH])
ali@0	1694	printf(" Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
ali@0	1695	else
ali@0	1696	cnt_word++;
ali@0	1697	}
ali@0	1698	}
ali@0	1699	}
ali@0	1700
ali@0	1701	/* look for added or missing spaces around punctuation and quotes */
ali@0	1702	/* If there is a punctuation character like ! with no space on */
ali@0	1703	/* either side, suspect a missing!space. If there are spaces on */
ali@0	1704	/* both sides , assume a typo. If we see a double quote with no */
ali@0	1705	/* space or punctuation on either side of it, assume unspaced */
ali@0	1706	/* quotes "like"this. */
ali@0	1707	llen = strlen(aline);
ali@0	1708	for (i = 1; i < llen; i++) { /* for each character in the line after the first */
ali@0	1709	if (strchr(".?!,;:_", aline[i])) { /* if it's punctuation */
ali@0	1710	isacro = 0; /* we need to suppress warnings for acronyms like M.D. */
ali@0	1711	isellipsis = 0; /* we need to suppress warnings for ellipsis . . . */
ali@0	1712	if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) \|\| /* if there are letters on both sides of it or ... */
ali@0	1713	(gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */
ali@0	1714	if (aline[i] == '.') {
ali@0	1715	if (i > 2)
ali@0	1716	if (aline[i-2] == '.') isacro = 1;
ali@0	1717	if (i + 2 < llen)
ali@0	1718	if (aline[i+2] == '.') isacro = 1;
ali@0	1719	}
ali@0	1720	if (!isacro) {
ali@0	1721	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1722	if (!pswit[OVERVIEW_SWITCH])
ali@0	1723	printf(" Line %ld column %d - Missing space?\n", linecnt, i+1);
ali@0	1724	else
ali@0	1725	cnt_punct++;
ali@0	1726	}
ali@0	1727	}
ali@0	1728	if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE \|\| aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */
ali@0	1729	if (aline[i] == '.') {
ali@0	1730	if (i > 2)
ali@0	1731	if (aline[i-2] == '.') isellipsis = 1;
ali@0	1732	if (i + 2 < llen)
ali@0	1733	if (aline[i+2] == '.') isellipsis = 1;
ali@0	1734	}
ali@0	1735	if (!isemptyline && !isellipsis) {
ali@0	1736	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1737	if (!pswit[OVERVIEW_SWITCH])
ali@0	1738	printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
ali@0	1739	else
ali@0	1740	cnt_punct++;
ali@0	1741	}
ali@0	1742	}
ali@0	1743	}
ali@0	1744	}
ali@0	1745
ali@0	1746	/* 0.98 -- split out the characters that CANNOT be preceded by space */
ali@0	1747	llen = strlen(aline);
ali@0	1748	for (i = 1; i < llen; i++) { /* for each character in the line after the first */
ali@0	1749	if (strchr("?!,;:", aline[i])) { /* if it's punctuation that _cannot_ have a space before it */
ali@0	1750	if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */
ali@0	1751	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1752	if (!pswit[OVERVIEW_SWITCH])
ali@0	1753	printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
ali@0	1754	else
ali@0	1755	cnt_punct++;
ali@0	1756	}
ali@0	1757	}
ali@0	1758	}
ali@0	1759
ali@0	1760
ali@0	1761	/* 0.99 -- special case " .X" where X is any alpha. */
ali@0	1762	/* This plugs a hole in the acronym code above. Inelegant, but maintainable. */
ali@0	1763	llen = strlen(aline);
ali@0	1764	for (i = 1; i < llen; i++) { /* for each character in the line after the first */
ali@0	1765	if (aline[i] == '.') { /* if it's a period */
ali@0	1766	if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */
ali@0	1767	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1768	if (!pswit[OVERVIEW_SWITCH])
ali@0	1769	printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
ali@0	1770	else
ali@0	1771	cnt_punct++;
ali@0	1772	}
ali@0	1773	}
ali@0	1774	}
ali@0	1775
ali@0	1776
ali@0	1777
ali@0	1778
ali@0	1779	/* v.21 breaking out the search for unspaced doublequotes */
ali@0	1780	/* This is not as efficient, but it's more maintainable */
ali@0	1781	/* V.97 added underscore to the list of characters not to query, */
ali@0	1782	/* since underscores are commonly used as italics indicators. */
ali@0	1783	/* V.98 Added slash as well, same reason. */
ali@0	1784	for (i = 1; i < llen; i++) { /* for each character in the line after the first */
ali@0	1785	if (aline[i] == CHAR_DQUOTE) {
ali@0	1786	if ((!strchr(" _-.'`,;:!/([{?}])", aline[i-1]) &&
ali@0	1787	!strchr(" _-.'`,;:!/([{?}])", aline[i+1]) &&
ali@0	1788	aline[i+1] != 0
ali@0	1789	\|\| (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) {
ali@0	1790	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1791	if (!pswit[OVERVIEW_SWITCH])
ali@0	1792	printf(" Line %ld column %d - Unspaced quotes?\n", linecnt, i+1);
ali@0	1793	else
ali@0	1794	cnt_punct++;
ali@0	1795	}
ali@0	1796	}
ali@0	1797	}
ali@0	1798
ali@0	1799
ali@0	1800	/* v.98 check parity of quotes */
ali@0	1801	/* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". */
ali@0	1802	for (s = aline; *s; s++) {
ali@0	1803	if (*s == CHAR_DQUOTE) {
ali@0	1804	if (!(dquotepar = !dquotepar)) { /* parity even */
ali@0	1805	if (!strchr("_-.'`/,;:!?)]} ", *(s+1))) {
ali@0	1806	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1807	if (!pswit[OVERVIEW_SWITCH])
ali@0	1808	printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
ali@0	1809	else
ali@0	1810	cnt_punct++;
ali@0	1811	}
ali@0	1812	}
ali@0	1813	else { /* parity odd */
ali@0	1814	if (!gcisalpha((s+1)) && !isdigit((s+1)) && !strchr("_-/.'`([{$", (s+1)) \|\| !(s+1)) {
ali@0	1815	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1816	if (!pswit[OVERVIEW_SWITCH])
ali@0	1817	printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
ali@0	1818	else
ali@0	1819	cnt_punct++;
ali@0	1820	}
ali@0	1821	}
ali@0	1822	}
ali@0	1823	}
ali@0	1824
ali@0	1825	if (*aline == CHAR_DQUOTE) {
ali@0	1826	if (strchr(",;:!?)]} ", aline[1])) {
ali@0	1827	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1828	if (!pswit[OVERVIEW_SWITCH])
ali@0	1829	printf(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
ali@0	1830	else
ali@0	1831	cnt_punct++;
ali@0	1832	}
ali@0	1833	}
ali@0	1834
ali@0	1835	if (pswit[SQUOTE_SWITCH])
ali@0	1836	for (s = aline; *s; s++) {
ali@0	1837	if ((s == CHAR_SQUOTE \|\| s == CHAR_OPEN_SQUOTE)
ali@0	1838	&& ( s == aline \|\| (s > aline && !gcisalpha((s-1))) \|\| !gcisalpha((s+1)))) {
ali@0	1839	if (!(squotepar = !squotepar)) { /* parity even */
ali@0	1840	if (!strchr("_-.'`/\",;:!?)]} ", *(s+1))) {
ali@0	1841	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1842	if (!pswit[OVERVIEW_SWITCH])
ali@0	1843	printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
ali@0	1844	else
ali@0	1845	cnt_punct++;
ali@0	1846	}
ali@0	1847	}
ali@0	1848	else { /* parity odd */
ali@0	1849	if (!gcisalpha((s+1)) && !isdigit((s+1)) && !strchr("_-/\".'`", (s+1)) \|\| !(s+1)) {
ali@0	1850	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1851	if (!pswit[OVERVIEW_SWITCH])
ali@0	1852	printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
ali@0	1853	else
ali@0	1854	cnt_punct++;
ali@0	1855	}
ali@0	1856	}
ali@0	1857	}
ali@0	1858	}
ali@0	1859
ali@0	1860
ali@0	1861	/* v.20 also look for double punctuation like ,. or ,, */
ali@0	1862	/* Thanks to DW for the suggestion! */
ali@0	1863	/* I'm putting this in a separate loop for clarity */
ali@0	1864	/* In books with references, ".," and ".;" are common */
ali@0	1865	/* e.g. "etc., etc.," and vol. 1.; vol 3.; */
ali@0	1866	/* OTOH, from my initial tests, there are also fairly */
ali@0	1867	/* common errors. What to do? Make these cases paranoid? */
ali@0	1868	/* V.21 ".," is the most common, so invented warn_dotcomma */
ali@0	1869	/* to suppress detailed reporting if it occurs often */
ali@0	1870	llen = strlen(aline);
ali@0	1871	for (i = 0; i < llen; i++) /* for each character in the line */
ali@0	1872	if (strchr(".?!,;:", aline[i]) /* if it's punctuation */
ali@0	1873	&& (strchr(".?!,;:", aline[i+1]))
ali@0	1874	&& aline[i] && aline[i+1]) /* followed by punctuation, it's a query, unless . . . */
ali@0	1875	if (
ali@0	1876	(aline[i] == aline[i+1]
ali@0	1877	&& (aline[i] == '.' \|\| aline[i] == '?' \|\| aline[i] == '!'))
ali@0	1878	\|\| (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',')
ali@0	1879	\|\| (isFrench && !strncmp(aline+i, ",...", 4))
ali@0	1880	\|\| (isFrench && !strncmp(aline+i, "...,", 4))
ali@0	1881	\|\| (isFrench && !strncmp(aline+i, ";...", 4))
ali@0	1882	\|\| (isFrench && !strncmp(aline+i, "...;", 4))
ali@0	1883	\|\| (isFrench && !strncmp(aline+i, ":...", 4))
ali@0	1884	\|\| (isFrench && !strncmp(aline+i, "...:", 4))
ali@0	1885	\|\| (isFrench && !strncmp(aline+i, "!...", 4))
ali@0	1886	\|\| (isFrench && !strncmp(aline+i, "...!", 4))
ali@0	1887	\|\| (isFrench && !strncmp(aline+i, "?...", 4))
ali@0	1888	\|\| (isFrench && !strncmp(aline+i, "...?", 4))
ali@0	1889	) {
ali@0	1890	if ((isFrench && !strncmp(aline+i, ",...", 4)) /* could this BE any more awkward? */
ali@0	1891	\|\| (isFrench && !strncmp(aline+i, "...,", 4))
ali@0	1892	\|\| (isFrench && !strncmp(aline+i, ";...", 4))
ali@0	1893	\|\| (isFrench && !strncmp(aline+i, "...;", 4))
ali@0	1894	\|\| (isFrench && !strncmp(aline+i, ":...", 4))
ali@0	1895	\|\| (isFrench && !strncmp(aline+i, "...:", 4))
ali@0	1896	\|\| (isFrench && !strncmp(aline+i, "!...", 4))
ali@0	1897	\|\| (isFrench && !strncmp(aline+i, "...!", 4))
ali@0	1898	\|\| (isFrench && !strncmp(aline+i, "?...", 4))
ali@0	1899	\|\| (isFrench && !strncmp(aline+i, "...?", 4)))
ali@0	1900	i +=4;
ali@0	1901	; /* do nothing for .. !! and ?? which can be legit */
ali@0	1902	}
ali@0	1903	else {
ali@0	1904	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1905	if (!pswit[OVERVIEW_SWITCH])
ali@0	1906	printf(" Line %ld column %d - Double punctuation?\n", linecnt, i+1);
ali@0	1907	else
ali@0	1908	cnt_punct++;
ali@0	1909	}
ali@0	1910
ali@0	1911	/* v.21 breaking out the search for spaced doublequotes */
ali@0	1912	/* This is not as efficient, but it's more maintainable */
ali@0	1913	s = aline;
ali@0	1914	while (strstr(s," \" ")) {
ali@0	1915	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1916	if (!pswit[OVERVIEW_SWITCH])
ali@0	1917	printf(" Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1));
ali@0	1918	else
ali@0	1919	cnt_punct++;
ali@0	1920	s = strstr(s," \" ") + 2;
ali@0	1921	}
ali@0	1922
ali@0	1923	/* v.20 also look for spaced singlequotes ' and ` */
ali@0	1924	s = aline;
ali@0	1925	while (strstr(s," ' ")) {
ali@0	1926	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1927	if (!pswit[OVERVIEW_SWITCH])
ali@0	1928	printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1));
ali@0	1929	else
ali@0	1930	cnt_punct++;
ali@0	1931	s = strstr(s," ' ") + 2;
ali@0	1932	}
ali@0	1933
ali@0	1934	s = aline;
ali@0	1935	while (strstr(s," ` ")) {
ali@0	1936	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1937	if (!pswit[OVERVIEW_SWITCH])
ali@0	1938	printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1));
ali@0	1939	else
ali@0	1940	cnt_punct++;
ali@0	1941	s = strstr(s," ` ") + 2;
ali@0	1942	}
ali@0	1943
ali@0	1944	/* v.99 check special case of 'S instead of 's at end of word */
ali@0	1945	s = aline + 1;
ali@0	1946	while (*s) {
ali@0	1947	if (s == CHAR_SQUOTE && (s+1) == 'S' && (s-1)>='a' && (s-1)<='z') {
ali@0	1948	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1949	if (!pswit[OVERVIEW_SWITCH])
ali@0	1950	printf(" Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2));
ali@0	1951	else
ali@0	1952	cnt_punct++;
ali@0	1953	}
ali@0	1954	s++;
ali@0	1955	}
ali@0	1956
ali@0	1957
ali@0	1958	/* v.21 Now check special cases - start and end of line - */
ali@0	1959	/* for single and double quotes. Start is sometimes [sic] */
ali@0	1960	/* but better to query it anyway. */
ali@0	1961	/* While I'm here, check for dash at end of line */
ali@0	1962	llen = strlen(aline);
ali@0	1963	if (llen > 1) {
ali@0	1964	if (aline[llen-1] == CHAR_DQUOTE \|\|
ali@0	1965	aline[llen-1] == CHAR_SQUOTE \|\|
ali@0	1966	aline[llen-1] == CHAR_OPEN_SQUOTE)
ali@0	1967	if (aline[llen-2] == CHAR_SPACE) {
ali@0	1968	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1969	if (!pswit[OVERVIEW_SWITCH])
ali@0	1970	printf(" Line %ld column %d - Spaced quote?\n", linecnt, llen);
ali@0	1971	else
ali@0	1972	cnt_punct++;
ali@0	1973	}
ali@0	1974
ali@0	1975	/* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */
ali@0	1976	/* Wrongspaced quotes test also catches it for " */
ali@0	1977	if (aline[0] == CHAR_SQUOTE \|\|
ali@0	1978	aline[0] == CHAR_OPEN_SQUOTE)
ali@0	1979	if (aline[1] == CHAR_SPACE) {
ali@0	1980	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1981	if (!pswit[OVERVIEW_SWITCH])
ali@0	1982	printf(" Line %ld column 1 - Spaced quote?\n", linecnt);
ali@0	1983	else
ali@0	1984	cnt_punct++;
ali@0	1985	}
ali@0	1986	/* dash at end of line may well be legit - paranoid mode only */
ali@0	1987	/* and don't report em-dash at line-end */
ali@0	1988	if (pswit[PARANOID_SWITCH] && warn_hyphen) {
ali@0	1989	for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
ali@0	1990	if (aline[i] == '-' && aline[i-1] != '-') {
ali@0	1991	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	1992	if (!pswit[OVERVIEW_SWITCH])
ali@0	1993	printf(" Line %ld column %d - Hyphen at end of line?\n", linecnt, i);
ali@0	1994	}
ali@0	1995	}
ali@0	1996	}
ali@0	1997
ali@0	1998	/* v.21 also look for brackets surrounded by alpha */
ali@0	1999	/* Brackets are often unspaced, but shouldn't be surrounded by alpha. */
ali@0	2000	/* If so, suspect a scanno like "a]most" */
ali@0	2001	llen = strlen(aline);
ali@0	2002	for (i = 1; i < llen-1; i++) { /* for each character in the line except 1st & last*/
ali@0	2003	if (strchr("{[()]}", aline[i]) /* if it's a bracket */
ali@0	2004	&& gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) {
ali@0	2005	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	2006	if (!pswit[OVERVIEW_SWITCH])
ali@0	2007	printf(" Line %ld column %d - Unspaced bracket?\n", linecnt, i);
ali@0	2008	else
ali@0	2009	cnt_punct++;
ali@0	2010	}
ali@0	2011	}
ali@0	2012	/* The "Cinderella" case, back in again! :-S Give it another shot */
ali@0	2013	if (warn_endquote) {
ali@0	2014	llen = strlen(aline);
ali@0	2015	for (i = 1; i < llen; i++) { /* for each character in the line except 1st */
ali@0	2016	if (aline[i] == CHAR_DQUOTE)
ali@0	2017	if (isalpha(aline[i-1])) {
ali@0	2018	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	2019	if (!pswit[OVERVIEW_SWITCH])
ali@0	2020	printf(" Line %ld column %d - endquote missing punctuation?\n", linecnt, i);
ali@0	2021	else
ali@0	2022	cnt_punct++;
ali@0	2023	}
ali@0	2024	}
ali@0	2025	}
ali@0	2026
ali@0	2027	llen = strlen(aline);
ali@0	2028
ali@0	2029	/* Check for <HTML TAG> */
ali@0	2030	/* If there is a < in the line, followed at some point */
ali@0	2031	/* by a > then we suspect HTML */
ali@0	2032	if (strstr(aline, "<") && strstr(aline, ">")) {
ali@0	2033	i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
ali@0	2034	if (i > 0) {
ali@0	2035	strncpy(wrk, strstr(aline, "<"), i);
ali@0	2036	wrk[i] = 0;
ali@0	2037	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	2038	if (!pswit[OVERVIEW_SWITCH])
ali@0	2039	printf(" Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk);
ali@0	2040	else
ali@0	2041	cnt_html++;
ali@0	2042	}
ali@0	2043	}
ali@0	2044
ali@0	2045	/* Check for &symbol; HTML */
ali@0	2046	/* If there is a & in the line, followed at */
ali@0	2047	/* some point by a ; then we suspect HTML */
ali@0	2048	if (strstr(aline, "&") && strstr(aline, ";")) {
ali@0	2049	i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1);
ali@0	2050	for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++)
ali@0	2051	if (s == CHAR_SPACE) i = 0; / 0.99 don't report "Jones & Son;" */
ali@0	2052	if (i > 0) {
ali@0	2053	strncpy(wrk, strstr(aline,"&"), i);
ali@0	2054	wrk[i] = 0;
ali@0	2055	if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
ali@0	2056	if (!pswit[OVERVIEW_SWITCH])
ali@0	2057	printf(" Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk);
ali@0	2058	else
ali@0	2059	cnt_html++;
ali@0	2060	}
ali@0	2061	}
ali@0	2062
ali@0	2063	/* At end of paragraph, check for mismatched quotes. */
ali@0	2064	/* We don't want to report an error immediately, since it is a */
ali@0	2065	/* common convention to omit the quotes at end of paragraph if */
ali@0	2066	/* the next paragraph is a continuation of the same speaker. */
ali@0	2067	/* Where this is the case, the next para should begin with a */
ali@0	2068	/* quote, so we store the warning message and only display it */
ali@0	2069	/* at the top of the next iteration if the new para doesn't */
ali@0	2070	/* start with a quote. */
ali@0	2071	/* The -p switch overrides this default, and warns of unclosed */
ali@0	2072	/* quotes on _every_ paragraph, whether the next begins with a */
ali@0	2073	/* quote or not. */
ali@0	2074	/* Version .16 - only report mismatched single quotes if */
ali@0	2075	/* an open_single_quotes was found. */
ali@0	2076
ali@0	2077	if (isemptyline) { /* end of para - add up the totals */
ali@0	2078	if (quot % 2)
ali@0	2079	sprintf(dquote_err, " Line %ld - Mismatched quotes\n", linecnt);
ali@0	2080	if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) )
ali@0	2081	sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt);
ali@0	2082	if (pswit[SQUOTE_SWITCH] && open_single_quote
ali@0	2083	&& (open_single_quote != close_single_quote)
ali@0	2084	&& (open_single_quote != close_single_quote +1) )
ali@0	2085	squot = 1; /* flag it to be noted regardless of the first char of the next para */
ali@0	2086	if (r_brack)
ali@0	2087	sprintf(rbrack_err, " Line %ld - Mismatched round brackets?\n", linecnt);
ali@0	2088	if (s_brack)
ali@0	2089	sprintf(sbrack_err, " Line %ld - Mismatched square brackets?\n", linecnt);
ali@0	2090	if (c_brack)
ali@0	2091	sprintf(cbrack_err, " Line %ld - Mismatched curly brackets?\n", linecnt);
ali@0	2092	if (c_unders % 2)
ali@0	2093	sprintf(unders_err, " Line %ld - Mismatched underscores?\n", linecnt);
ali@0	2094	quot = s_brack = c_brack = r_brack = c_unders =
ali@0	2095	open_single_quote = close_single_quote = 0;
ali@0	2096	isnewpara = 1; /* let the next iteration know that it's starting a new para */
ali@0	2097	}
ali@0	2098
ali@0	2099	/* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */
ali@0	2100	/* by working back through prevline. DW. */
ali@0	2101	/* Hmmm. Need to check this only for "normal" paras. */
ali@0	2102	/* So what is a "normal" para? ouch! */
ali@0	2103	/* Not normal if one-liner (chapter headings, etc.) */
ali@0	2104	/* Not normal if doesn't contain at least one locase letter */
ali@0	2105	/* Not normal if starts with space */
ali@0	2106
ali@0	2107	/* 0.99 tighten up on para end checks. Disallow comma and */
ali@0	2108	/* semi-colon. Check for legit para end before quotes. */
ali@0	2109	if (isemptyline) { /* end of para */
ali@0	2110	for (s = prevline, i = 0; *s && !i; s++)
ali@0	2111	if (gcisletter(*s))
ali@0	2112	i = 1; /* use i to indicate the presence of a letter on the line */
ali@0	2113	/* This next "if" is a problem. */
ali@0	2114	/* If I say "start_para_line <= linecnt - 1", that includes one-line */
ali@0	2115	/* "paragraphs" like chapter heads. Lotsa false positives. */
ali@0	2116	/* If I say "start_para_line < linecnt - 1" it doesn't, but then it */
ali@0	2117	/* misses genuine one-line paragraphs. */
ali@0	2118	/* So what do I do? */
ali@0	2119	if (i
ali@0	2120	&& lastblen > 2
ali@0	2121	&& start_para_line < linecnt - 1
ali@0	2122	&& *prevline > CHAR_SPACE
ali@0	2123	) {
ali@0	2124	for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE \|\| prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--);
ali@0	2125	for ( ; i > 0; i--) {
ali@0	2126	if (gcisalpha(prevline[i])) {
ali@0	2127	if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
ali@0	2128	if (!pswit[OVERVIEW_SWITCH])
ali@0	2129	printf(" Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline));
ali@0	2130	else
ali@0	2131	cnt_punct++;
ali@0	2132	break;
ali@0	2133	}
ali@0	2134	if (strchr("-.:!([{?}])", prevline[i]))
ali@0	2135	break;
ali@0	2136	}
ali@0	2137	}
ali@0	2138	}
ali@0	2139	strcpy(prevline, aline);
ali@0	2140	}
ali@0	2141	fclose (infile);
ali@0	2142	if (!pswit[OVERVIEW_SWITCH])
ali@0	2143	for (i = 0; i < MAX_QWORD; i++)
ali@0	2144	if (dupcnt[i])
ali@0	2145	printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s");
ali@0	2146	}
ali@0	2147
ali@0	2148
ali@0	2149
ali@0	2150	/* flgets - get one line from the input stream, checking for */
ali@0	2151	/* the existence of exactly one CR/LF line-end per line. */
ali@0	2152	/* Returns a pointer to the line. */
ali@0	2153
ali@0	2154	char flgets(char theline, int maxlen, FILE *thefile, long lcnt)
ali@0	2155	{
ali@0	2156	char c;
ali@0	2157	int len, isCR, cint;
ali@0	2158
ali@0	2159	*theline = 0;
ali@0	2160	len = isCR = 0;
ali@0	2161	c = cint = fgetc(thefile);
ali@0	2162	do {
ali@0	2163	if (cint == EOF)
ali@0	2164	return (NULL);
ali@0	2165	if (c == 10) /* either way, it's end of line */
ali@0	2166	if (isCR)
ali@0	2167	break;
ali@0	2168	else { /* Error - a LF without a preceding CR */
ali@0	2169	if (pswit[LINE_END_SWITCH]) {
ali@0	2170	if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
ali@0	2171	if (!pswit[OVERVIEW_SWITCH])
ali@0	2172	printf(" Line %ld - No CR?\n", lcnt);
ali@0	2173	else
ali@0	2174	cnt_lineend++;
ali@0	2175	}
ali@0	2176	break;
ali@0	2177	}
ali@0	2178	if (c == 13) {
ali@0	2179	if (isCR) { /* Error - two successive CRs */
ali@0	2180	if (pswit[LINE_END_SWITCH]) {
ali@0	2181	if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
ali@0	2182	if (!pswit[OVERVIEW_SWITCH])
ali@0	2183	printf(" Line %ld - Two successive CRs?\n", lcnt);
ali@0	2184	else
ali@0	2185	cnt_lineend++;
ali@0	2186	}
ali@0	2187	}
ali@0	2188	isCR = 1;
ali@0	2189	}
ali@0	2190	else {
ali@0	2191	if (pswit[LINE_END_SWITCH] && isCR) {
ali@0	2192	if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
ali@0	2193	if (!pswit[OVERVIEW_SWITCH])
ali@0	2194	printf(" Line %ld column %d - CR without LF?\n", lcnt, len+1);
ali@0	2195	else
ali@0	2196	cnt_lineend++;
ali@0	2197	}
ali@0	2198	theline[len] = c;
ali@0	2199	len++;
ali@0	2200	theline[len] = 0;
ali@0	2201	isCR = 0;
ali@0	2202	}
ali@0	2203	c = cint = fgetc(thefile);
ali@0	2204	} while(len < maxlen);
ali@0	2205	if (pswit[MARKUP_SWITCH])
ali@0	2206	postprocess_for_HTML(theline);
ali@0	2207	if (pswit[DP_SWITCH])
ali@0	2208	postprocess_for_DP(theline);
ali@0	2209	return(theline);
ali@0	2210	}
ali@0	2211
ali@0	2212
ali@0	2213
ali@0	2214
/* mixdigit - takes a "word" as a parameter, and checks whether it      */
/* contains a mixture of letters and digits. Generally, this is an      */
/* error, but the following forms are accepted as legitimate:           */
/*   - digits followed by st, rd, nd or th (any case), e.g. 21st        */
/*   - digits followed by sts, rds, nds or ths                          */
/*   - digits followed by stly, rdly, ndly or thly                      */
/*   - digits followed by a single l, L, s or d, e.g. 12l. 3s. 11d.     */
/*   - a capital L followed only by digits (British pounds), e.g. L500  */
/* Returns 0 if no error found, 1 if error.                             */

int mixdigit(char *checkword)   /* check for digits like 1 or 0 in words */
{
    int wehaveadigit, wehavealetter, firstdigits, query, wl;
    char *s;

    wehaveadigit = wehavealetter = query = 0;
    for (s = checkword; *s; s++) {
        if (gcisalpha(*s))
            wehavealetter = 1;
        else if (gcisdigit(*s))
            wehaveadigit = 1;
    }

    if (wehaveadigit && wehavealetter) {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query = 1;
        wl = (int)strlen(checkword);

        /* count the digits at the start of the word */
        for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++)
            ;

        /* digits, ending in st, rd, nd, th of either case.           */
        /* matchword returns non-zero on a match (unlike strcmp).     */
        if (firstdigits + 2 == wl &&
              (matchword(checkword + wl - 2, "st")
            || matchword(checkword + wl - 2, "rd")
            || matchword(checkword + wl - 2, "nd")
            || matchword(checkword + wl - 2, "th"))
            )
            query = 0;

        if (firstdigits + 3 == wl &&
              (matchword(checkword + wl - 3, "sts")
            || matchword(checkword + wl - 3, "rds")
            || matchword(checkword + wl - 3, "nds")
            || matchword(checkword + wl - 3, "ths"))
            )
            query = 0;

        /* The suffix here is four characters long, so the word must be */
        /* the leading digits plus four. (Testing for "plus three" could */
        /* never match, and indexed before the start of a 3-letter word.) */
        if (firstdigits + 4 == wl &&
              (matchword(checkword + wl - 4, "stly")
            || matchword(checkword + wl - 4, "rdly")
            || matchword(checkword + wl - 4, "ndly")
            || matchword(checkword + wl - 4, "thly"))
            )
            query = 0;

        /* digits, ending in l, L, s or d */
        if (firstdigits + 1 == wl &&
              (checkword[wl-1] == 'l'
            || checkword[wl-1] == 'L'
            || checkword[wl-1] == 's'
            || checkword[wl-1] == 'd'))
            query = 0;

        /* L at the start of a number, representing British pounds, like L500. */
        /* We know the current word is mixed. If the first letter is L, there  */
        /* must be at least one digit following. If both digits and letters    */
        /* follow, we have a genuine error, else we have a capital L followed  */
        /* by digits, and we accept that as a non-error.                       */
        if (checkword[0] == 'L')
            if (!mixdigit(checkword+1))
                query = 0;
    }
    return (query);
}
ali@0	2282
ali@0	2283
ali@0	2284
ali@0	2285
ali@0	2286	/* getaword - extracts the first/next "word" from the line, and puts */
ali@0	2287	/* it into "thisword". A word is defined as one English word unit */
ali@0	2288	/* -- or at least that's what I'm trying for. */
ali@0	2289	/* Returns a pointer to the position in the line where we will start */
ali@0	2290	/* looking for the next word. */
ali@0	2291
ali@0	2292	char getaword(char fromline, char *thisword)
ali@0	2293	{
ali@0	2294	int i, wordlen;
ali@0	2295	char *s;
ali@0	2296
ali@0	2297	wordlen = 0;
ali@0	2298	for ( ; !gcisdigit(fromline) && !gcisalpha(fromline) && *fromline ; fromline++ );
ali@0	2299
ali@0	2300	/* V .20 */
ali@0	2301	/* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35. */
ali@0	2302	/* Especially yucky is the case of L1,000 */
ali@0	2303	/* I hate this, and I see other ways, but I don't see that any is _better_.*/
ali@0	2304	/* This section looks for a pattern of characters including a digit */
ali@0	2305	/* followed by a comma or period followed by one or more digits. */
ali@0	2306	/* If found, it returns this whole pattern as a word; otherwise we discard */
ali@0	2307	/* the results and resume our normal programming. */
ali@0	2308	s = fromline;
ali@0	2309	for ( ; (gcisdigit(s) \|\| gcisalpha(s) \|\| s == ',' \|\| s == '.') && wordlen < MAXWORDLEN ; s++ ) {
ali@0	2310	thisword[wordlen] = *s;
ali@0	2311	wordlen++;
ali@0	2312	}
ali@0	2313	thisword[wordlen] = 0;
ali@0	2314	for (i = 1; i < wordlen -1; i++) {
ali@0	2315	if (thisword[i] == '.' \|\| thisword[i] == ',') {
ali@0	2316	if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1])) { /* we have one of the damned things */
ali@0	2317	fromline = s;
ali@0	2318	return(fromline);
ali@0	2319	}
ali@0	2320	}
ali@0	2321	}
ali@0	2322
ali@0	2323	/* we didn't find a punctuated number - do the regular getword thing */
ali@0	2324	wordlen = 0;
ali@0	2325	for ( ; (gcisdigit(fromline) \|\| gcisalpha(fromline) \|\| *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
ali@0	2326	thisword[wordlen] = *fromline;
ali@0	2327	wordlen++;
ali@0	2328	}
ali@0	2329	thisword[wordlen] = 0;
ali@0	2330	return(fromline);
ali@0	2331	}
ali@0	2332
ali@0	2333
ali@0	2334
ali@0	2335
ali@0	2336
/* matchword - just a case-insensitive string matcher.                  */
/* Returns 1 if the two strings have the same length and are equal      */
/* when compared character by character through toupper(), else 0.      */
/* Note that the sense of the result is the opposite of strcmp().       */

int matchword(char *checkfor, char *thisword)
{
    size_t i, len;

    len = strlen(checkfor);
    if (len != strlen(thisword)) return (0);

    for (i = 0; i < len; i++) {
        /* toupper() is only defined for unsigned char values (or EOF), */
        /* and 8-bit texts can hold characters that are negative as     */
        /* plain char.                                                  */
        if (toupper((unsigned char)checkfor[i]) != toupper((unsigned char)thisword[i]))
            return (0);
    }
    return (1);
}
ali@0	2353
ali@0	2354
ali@0	2355
ali@0	2356
ali@0	2357
/* lowerit - lowercase the line in place. Yes, strlwr does the same     */
/* job, but not on all platforms, and tolower() is avoided out of       */
/* caution about what some implementations might do to hi-bit           */
/* characters. Only the letters 'A' to 'Z' are changed; every other     */
/* character is left exactly as it was.                                 */

void lowerit(char *theline)
{
    for ( ; *theline; theline++)
        if (*theline >= 'A' && *theline <= 'Z')
            *theline += 'a' - 'A';
}
ali@0	2369
ali@0	2370
/* isroman - does this word LOOK like a Roman numeral?                  */
/* It doesn't actually validate that the number is a valid Roman        */
/* numeral -- for example it will pass mxxxxxxxxxx -- but that's not    */
/* what we're here to do.                                               */
/* Accepted, in this order: any number of m, an optional d, an optional */
/* cm, an optional cd, any number of c, an optional xl, an optional xc, */
/* an optional l, any number of x, an optional ix, an optional iv, an   */
/* optional v and any number of i. Only lowercase letters are           */
/* recognized.                                                          */
/* Returns 1 if the whole word is consumed by that pattern, 0 if not,   */
/* and 0 for a NULL or empty word.                                      */

int isroman(char *t)
{
    if (!t || !*t) return (0);

    while (*t == 'm') t++;
    if (*t == 'd') t++;
    if (*t == 'c' && *(t+1) == 'm') t += 2;
    if (*t == 'c' && *(t+1) == 'd') t += 2;
    while (*t == 'c') t++;
    if (*t == 'x' && *(t+1) == 'l') t += 2;
    if (*t == 'x' && *(t+1) == 'c') t += 2;
    if (*t == 'l') t++;
    while (*t == 'x') t++;
    if (*t == 'i' && *(t+1) == 'x') t += 2;
    if (*t == 'i' && *(t+1) == 'v') t += 2;
    if (*t == 'v') t++;
    while (*t == 'i') t++;

    /* a match only if nothing is left over */
    if (!*t) return (1);

    return (0);
}
ali@0	2408
ali@0	2409
ali@0	2410
ali@0	2411
/* gcisalpha - an isalpha() that is somewhat lenient on 8-bit texts.    */
/* With the standard isalpha(), 8-bit accented characters break words,  */
/* so a word with accented letters in it appears to be several short    */
/* words with 8-bit characters between them, which causes               */
/* over-reporting of errors. This version also accepts accented         */
/* letters from the CP1252 (Windows) and ISO-8859-1 character sets,     */
/* which are the most common PG 8-bit types.                            */
/* Returns 1 for a letter, 0 for anything else.                         */

int gcisalpha(unsigned char c)
{
    /* plain ASCII letters */
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
        return 1;

    /* 192 and up counts as a letter, apart from six excluded codes */
    if (c >= 192) {
        switch (c) {
        case 208:
        case 215:
        case 222:
        case 240:
        case 247:
        case 254:
            return 0;
        default:
            return 1;
        }
    }

    /* below 192, only these five codes are taken to be letters */
    switch (c) {
    case 140:
    case 142:
    case 156:
    case 158:
    case 159:
        return 1;
    default:
        return 0;
    }
}
ali@0	2428
/* gcisdigit - an isdigit() that doesn't get confused in 8-bit texts:   */
/* returns 1 only for the ASCII characters '0' to '9', else 0.          */
int gcisdigit(unsigned char c)
{
    return ('0' <= c && c <= '9') ? 1 : 0;
}
ali@0	2435
/* gcisletter - a letter test that doesn't get confused in 8-bit texts. */
/* Yeah, we're ISO-8859-1-specific. So sue me.                          */
/* Returns 1 for A-Z, a-z and for every code from 192 upwards (no       */
/* exceptions are made in that range), else 0.                          */
int gcisletter(unsigned char c)
{
    if (c >= 192)
        return 1;
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return 0;
}
ali@0	2443
ali@0	2444
ali@0	2445
ali@0	2446
/* gcstrchr - wraps strchr to return NULL if the character being        */
/* searched for is zero. (Plain strchr would instead return a pointer   */
/* to the string's terminator.) Otherwise returns a pointer to the      */
/* first occurrence of c in s, or NULL if there is none.                */

char *gcstrchr(char *s, char c)
{
    if (c == 0) return (NULL);
    return (strchr(s, c));
}
ali@0	2454
ali@0	2455	/* postprocess_for_DP is derived from postprocess_for_HTML */
ali@0	2456	/* It is invoked with the -d switch from flgets(). */
ali@0	2457	/* It simply "removes" from the line a hard-coded set of common */
ali@0	2458	/* DP-specific tags, so that the line passed to the main routine has*/
ali@0	2459	/* been pre-cleaned of DP markup. */
ali@0	2460
ali@0	2461	void postprocess_for_DP(char *theline)
ali@0	2462	{
ali@0	2463
ali@0	2464	char s, t;
ali@0	2465	int i;
ali@0	2466
ali@0	2467	if (!*theline)
ali@0	2468	return;
ali@0	2469
ali@0	2470	for (i = 0; *DPmarkup[i]; i++) {
ali@0	2471	s = strstr(theline, DPmarkup[i]);
ali@0	2472	while (s) {
ali@0	2473	t = s + strlen(DPmarkup[i]);
ali@0	2474	while (*t) {
ali@0	2475	s = t;
ali@0	2476	t++; s++;
ali@0	2477	}
ali@0	2478	*s = 0;
ali@0	2479	s = strstr(theline, DPmarkup[i]);
ali@0	2480	}
ali@0	2481	}
ali@0	2482
ali@0	2483	}
ali@0	2484
ali@0	2485
/* postprocess_for_HTML is a short-term, admittedly nasty, fix.         */
/* It is invoked with the -m switch from flgets().                      */
/* It pre-cleans the line in place before the main routine sees it:     */
/*   - if the line contains both a "<" and a ">", losemarkup() is       */
/*     called repeatedly until it returns NULL;                         */
/*   - then, in every case, loseentities() is called repeatedly until   */
/*     it returns NULL.                                                 */
/* This is not proper HTML handling: it only ignores tags such as <i>   */
/* and a fixed set of entities. To be revisited in future releases.     */

void postprocess_for_HTML(char *theline)
{
    int has_tag_chars;

    has_tag_chars = (strstr(theline, "<") != NULL) && (strstr(theline, ">") != NULL);

    if (has_tag_chars) {
        while (losemarkup(theline) != NULL)
            continue;
    }

    while (loseentities(theline) != NULL)
        continue;
}
ali@0	2506
ali@0	2507	char losemarkup(char theline)
ali@0	2508	{
ali@0	2509	char s, t;
ali@0	2510	int i;
ali@0	2511
ali@0	2512	if (!*theline)
ali@0	2513	return(NULL);
ali@0	2514
ali@0	2515	s = strstr(theline, "<");
ali@0	2516	t = strstr(theline, ">");
ali@0	2517	if (!s \|\| !t) return(NULL);
ali@0	2518	for (i = 0; *markup[i]; i++)
ali@0	2519	if (!tagcomp(s+1, markup[i])) {
ali@0	2520	if (!*(t+1)) {
ali@0	2521	*s = 0;
ali@0	2522	return(s);
ali@0	2523	}
ali@0	2524	else
ali@0	2525	if (t > s) {
ali@0	2526	strcpy(s, t+1);
ali@0	2527	return(s);
ali@0	2528	}
ali@0	2529	}
ali@0	2530	/* it's an unrecognized <xxx> */
ali@0	2531	return(NULL);
ali@0	2532	}
ali@0	2533
ali@0	2534	char loseentities(char theline)
ali@0	2535	{
ali@0	2536	int i;
ali@0	2537	char s, t;
ali@0	2538
ali@0	2539	if (!*theline)
ali@0	2540	return(NULL);
ali@0	2541
ali@0	2542	for (i = 0; *entities[i].htmlent; i++) {
ali@0	2543	s = strstr(theline, entities[i].htmlent);
ali@0	2544	if (s) {
ali@0	2545	t = malloc((size_t)strlen(s));
ali@0	2546	if (!t) return(NULL);
ali@0	2547	strcpy(t, s + strlen(entities[i].htmlent));
ali@0	2548	strcpy(s, entities[i].textent);
ali@0	2549	strcat(s, t);
ali@0	2550	free(t);
ali@0	2551	return(theline);
ali@0	2552	}
ali@0	2553	}
ali@0	2554
ali@0	2555	/* V0.97 Duh. Forgot to check the htmlnum member */
ali@0	2556	for (i = 0; *entities[i].htmlnum; i++) {
ali@0	2557	s = strstr(theline, entities[i].htmlnum);
ali@0	2558	if (s) {
ali@0	2559	t = malloc((size_t)strlen(s));
ali@0	2560	if (!t) return(NULL);
ali@0	2561	strcpy(t, s + strlen(entities[i].htmlnum));
ali@0	2562	strcpy(s, entities[i].textent);
ali@0	2563	strcat(s, t);
ali@0	2564	free(t);
ali@0	2565	return(theline);
ali@0	2566	}
ali@0	2567	}
ali@0	2568	return(NULL);
ali@0	2569	}
ali@0	2570
ali@0	2571
/* tagcomp - does the text following a "<" start with the given tag?    */
/*   strin    the text just after the "<"; one leading "/" is ignored   */
/*            so that closing tags match too.                           */
/*   basetag  the tag name to look for.                                 */
/* The comparison is case-insensitive and covers only the length of     */
/* basetag, so what follows the name in strin is not examined: "i"      */
/* also matches the start of "img". Handling that properly would mean   */
/* parsing the tags, which is still pending.                            */
/* Returns 0 if strin starts with basetag, 1 if it does not (note: the  */
/* same sense as strcmp, the opposite of matchword).                    */

int tagcomp(char *strin, char *basetag)
{
    char *s, *t;

    s = basetag;
    t = strin;
    if (*t == '/') t++; /* ignore a slash */
    while (*s && *t) {
        /* tolower() is only defined for unsigned char values (or EOF) */
        if (tolower((unsigned char)*s) != tolower((unsigned char)*t)) return (1);
        s++; t++;
    }
    /* strin ran out before the whole tag name was seen: not a match */
    if (*s) return (1);

    /* OK, we have < followed by a valid tag start */
    return (0);
}
ali@0	2592
/* proghelp - explain program usage: writes the version and copyright   */
/* notice, the usage line, a summary of the switches and a short        */
/* description of what gutcheck looks for, all to stderr.               */
void proghelp(void)
{
    fputs("V. 0.991. Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
    fputs("Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n", stderr);
    fputs("This is Free Software; you may redistribute it under certain conditions (GPL);\n", stderr);
    fputs("read the file COPYING for details.\n\n", stderr);
    fputs("Usage is: gutcheck [-setpxloyhud] filename\n",stderr);
    fputs(" where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",stderr);
    fputs(" -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",stderr);
    fputs(" -o just displays overview without detail, -h echoes header fields\n",stderr);
    fputs(" -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",stderr);
    fputs(" -d ignores DP-specific markup,\n",stderr);
    fputs(" -u uses a file gutcheck.typ to query user-defined possible typos\n",stderr);
    fputs("Sample usage: gutcheck warpeace.txt \n",stderr);
    fputs("\n",stderr);
    fputs("Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n", stderr);
    fputs("Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",stderr);
    fputs("characters like accented letters, lines longer than 75 or shorter than 55,\n",stderr);
    fputs("unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",stderr);
    fputs("HTML tags, some likely typos. It is NOT a substitute for human judgement.\n",stderr);
    fputs("\n",stderr);
}
ali@0	2615
ali@0	2616
ali@0	2617
ali@0	2618	/*********************************************************************
ali@0	2619	Revision History:
ali@0	2620
ali@0	2621	04/22/01 Cleaned up some stuff and released .10
ali@0	2622
ali@0	2623	---------------
ali@0	2624
ali@0	2625	05/09/01 Added the typo list, added two extra cases of he/be error,
ali@0	2626	added -p switch, OPEN_SINGLE QUOTE char as .11
ali@0	2627
ali@0	2628	---------------
ali@0	2629
ali@0	2630	05/20/01 Increased the typo list,
ali@0	2631	added paranoid mode,
ali@0	2632	ANSIfied the code and added some casts
ali@0	2633	so the compiler wouldn't keep asking if I knew what I was doing,
ali@0	2634	fixed bug in l.s.d. condition (thanks, Dave!),
ali@0	2635	standardized spacing when echoing,
ali@0	2636	added letter-combo checking code to typo section,
ali@0	2637	added more h/b words to typo array.
ali@0	2638	Not too sure about putting letter combos outside of the TYPO conditions -
ali@0	2639	someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see.
ali@0	2640	Released as .12
ali@0	2641
ali@0	2642	---------------
ali@0	2643
ali@0	2644	06/01/01 Removed duplicate reporting of Tildes, asterisks, etc.
ali@0	2645	06/10/01 Added flgets routine to help with platform-independent
ali@0	2646	detection of invalid line-ends. All PG text files should
ali@0	2647	have CR/LF (13/10) at end of line, regardless of system.
ali@0	2648	Gutcheck now validates this by default. (Thanks, Charles!)
ali@0	2649	Released as .13
ali@0	2650
ali@0	2651	---------------
ali@0	2652
ali@0	2653	06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.)
ali@0	2654	Released as .14
ali@0	2655
ali@0	2656	---------------
ali@0	2657
ali@0	2658	06/23/01 Fixed: 'No',he said. not being flagged.
ali@0	2659
ali@0	2660	Improved: better single-quotes checking:
ali@0	2661
ali@0	2662	Ignore singlequotes surrounded by alpha, like didn't. (was OK)
ali@0	2663
ali@0	2664	If a singlequote is at the END of a word AND the word ends in "s":
ali@0	2665	The dogs' tails wagged.
ali@0	2666	it's probably an apostrophe, but less commonly may be a closequote:
ali@0	2667	"These 'pack dogs' of yours look more like wolves."
ali@0	2668
ali@0	2669	If it's got punctuation before it and is followed by a space
ali@0	2670	or punctuation:
ali@0	2671	. . . was a problem,' he said
ali@0	2672	. . . was a problem,'"
ali@0	2673	it is probably (certainly?) a closequote.
ali@0	2674
ali@0	2675	If it's at start of paragraph, it's probably an openquote.
ali@0	2676	(but watch dialect)
ali@0	2677
ali@0	2678	Words with ' at beginning and end are probably quoted:
ali@0	2679	"You have the word 'chivalry' frequently on your lips."
ali@0	2680	(Not specifically implemented)
ali@0	2681	V.18 I'm glad I didn't implement this, 'cos it jest ain't so
ali@0	2682	where the convention is to punctuate outside the quotes.
ali@0	2683	'Come', he said, 'and join the party'.
ali@0	2684
ali@0	2685	If it is followed by an alpha, and especially a capital:
ali@0	2686	'Hello,' called he.
ali@0	2687	it is either an openquote or dialect.
ali@0	2688
ali@0	2689	Dialect breaks ALL the rules:
ali@0	2690	A man's a man for a' that.
ali@0	2691	"Aye, but 'tis all in the pas' now."
ali@0	2692	"'Tis often the way," he said.
ali@0	2693	'Ave a drink on me.
ali@0	2694
ali@0	2695	This version looks to be an improvement, and produces
ali@0	2696	fewer false positives, but is still not perfect. The
ali@0	2697	'pack dogs' case still fools it, and dialect is still
ali@0	2698	a problem. Oh, well, it's an improvement, and I have
ali@0	2699	a weighted structure in place for refining guesses at
ali@0	2700	closequotes. Maybe next time, I'll add a bit of logic
ali@0	2701	where if there is an open quote and one that was guessed
ali@0	2702	to be a possessive apostrophe after s, I'll re-guess it
ali@0	2703	to be a closequote. Let's see how this one flies, first.
ali@0	2704
ali@0	2705	(Afterview: it's still crap. Needs much work, and a deeper insight.)
ali@0	2706
ali@0	2707	Released as .15
ali@0	2708
ali@0	2709	TODO: More he/be checks. Can't be perfect - counterexamples:
ali@0	2710	I gave my son good advice: be married regardless of the world's opinion.
ali@0	2711	I gave my son good advice: he married regardless of the world's opinion.
ali@0	2712
ali@0	2713	If by "primitive" be meant "crude", we can understand the sentence.
ali@0	2714	If by "primitive" he meant "crude", we can understand the sentence.
ali@0	2715
ali@0	2716	No matter what be said, I must go on.
ali@0	2717	No matter what he said, I must go on.
ali@0	2718
ali@0	2719	No value, however great, can be set upon them.
ali@0	2720	No value, however great, can he set upon them.
ali@0	2721
ali@0	2722	Real-Life one from a DP International Weekly Miscellany:
ali@0	2723	He wandered through the forest without fear, sleeping
ali@0	2724	much, for in sleep be had companionship--the Great
ali@0	2725	Spirit teaching him what he should know in dreams.
ali@0	2726	That one found by jeebies, and it turned out to be "he".
ali@0	2727
ali@0	2728
ali@0	2729	---------------
ali@0	2730
ali@0	2731	07/01/01 Added -O option.
ali@0	2732	Improved singlequotes by reporting mismatched single quotes
ali@0	2733	only if an open_single_quotes was found.
ali@0	2734
ali@0	2735	Released as .16
ali@0	2736
ali@0	2737	---------------
ali@0	2738
ali@0	2739	08/27/01 Added -Y switch for Robert Rowe to allow his app to
ali@0	2740	catch the error output.
ali@0	2741
ali@0	2742	Released as .17
ali@0	2743
ali@0	2744	---------------
ali@0	2745
ali@0	2746	09/08/01 Added checking Capitals at start of paragraph, but not
ali@0	2747	checking them at start of sentence.
ali@0	2748
ali@0	2749	TODO: Parse sentences out so can check reliably for start of
ali@0	2750	sentence. Need a whole different approach for that.
ali@0	2751	(Can't just rely on periods, since they are also
ali@0	2752	used for abbreviations, etc.)
ali@0	2753
ali@0	2754	Added checking for all vowels or all consonants in a word.
ali@0	2755
ali@0	2756	While I was in, I added "ii" checking and "tl" at start of word.
ali@0	2757
ali@0	2758	Added echoing of first line of paragraph when reporting
ali@0	2759	mismatched quotes or brackets (thanks to David Widger for the
ali@0	2760	suggestion)
ali@0	2761
ali@0	2762	Not querying L at start of a number (used for British pounds).
ali@0	2763
ali@0	2764	The spelling changes are sort of half-done but released anyway
ali@0	2765	Skipped .18 because I had given out a couple of test versions
ali@0	2766	with that number.
ali@0	2767
ali@0	2768	09/25/01 Released as .19
ali@0	2769
ali@0	2770	---------------
ali@0	2771
ali@0	2772	TODO:
ali@0	2773	Use the logic from my new version of safewrap to stop querying
ali@0	2774	short lines like poems and TOCs.
ali@0	2775	Ignore non-standard ellipses like . . . or ...
ali@0	2776
ali@0	2777
ali@0	2778	---------------
ali@0	2779	10/01/01 Made any line over 80 a VERY long line (was 85).
ali@0	2780	Recognized openquotes on indented paragraphs as continuations
ali@0	2781	of the same speech.
ali@0	2782	Added "cf" to the okword list (how did I forget _that_?) and a few others.
ali@0	2783	Moved abbrev to okword and made it more general.
ali@0	2784	Removed requirement that PG_space_emdash be greater than
ali@0	2785	ten before turning off warnings about spaced dashes.
ali@0	2786	Added period to list of characters that might constitute a separator line.
ali@0	2787	Now checking for double punctuation (Thanks, David!)
ali@0	2788	Now if two spaced em-dashes on a line, reports both. (DW)
ali@0	2789	Bug: Wasn't catching spaced punctuation at line-end since I
ali@0	2790	added flgets in version .13 - fixed.
ali@0	2791	Bug: Wasn't catching spaced singlequotes - fixed
ali@0	2792	Now reads punctuated numbers like 1,000 as a single word.
ali@0	2793	(Used to give "standalone 1" type queries)
ali@0	2794	Changed paranoid mode - not including s and p options. -ex is now quite usable.
ali@0	2795	Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed
ali@0	2796	Bug: Sometimes gave _next_ line number for queried word at end of line - fixed
ali@0	2797
ali@0	2798	10/22/01 Released as .20
ali@0	2799
ali@0	2800	---------------
ali@0	2801
ali@0	2802	Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!)
ali@0	2803	Reduced the number of hi-bit letters needed to stop reporting them
ali@0	2804	from 1/20 to 1/100 or 200 in total.
ali@0	2805	Added PG footer check.
ali@0	2806	Added the -h switch.
ali@0	2807	Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10
ali@0	2808	Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23"
ali@0	2809	Added unspaced brackets check when surrounded by alpha.
ali@0	2810	Removed all typo reporting unless the typo switch is on.
ali@0	2811	Added gcisalpha to ease over-reporting of 8-bit queries.
ali@0	2812	ECHO_SWITCH is now ON by default!
ali@0	2813	PARANOID_SWITCH is now ON by default!
ali@0	2814	Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg)
ali@0	2815	Checking for standalone lowercase "l"
ali@0	2816	Checking for standalone lowercase "s"
ali@0	2817	Considering "is be" and "be is" "be was" "was be" as he/be errors
ali@0	2818	Looking at punct at end of para
ali@0	2819
ali@0	2820	01/20/02 Released as .21
ali@0	2821
ali@0	2822	---------------
ali@0	2823
ali@0	2824	Added VERBOSE_SWITCH to make it list everything. (George Davis)
ali@0	2825
ali@0	2826	---------------
ali@0	2827
ali@0	2828	02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have.
ali@0	2829	after which
ali@0	2830	This line caused a coredump on Solaris - fixed.
ali@0	2831	Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe
ali@0	2832	03/09/02 Changed header recognition for another header change
ali@0	2833	Called it .24
ali@0	2834	03/29/02 Added qword[][] so I can suppress massive overreporting
ali@0	2835	of queried "words" like "FN", "Wm.", "th'", people's
ali@0	2836	initials, chemical formulae and suchlike in some texts.
ali@0	2837	Called it .25
ali@0	2838	04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed.
ali@0	2839	Added linecounts in overview mode.
ali@0	2840	Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done.
ali@0	2841	"m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that?
ali@0	2842	07/07/02 Added GPL.
ali@0	2843	Added checking for broken em-dash at line-end (enddash)
ali@0	2844	Released as 0.95
ali@0	2845	08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo.
ali@0	2846	Released as 0.96
ali@0	2847	10/10/02 Suppressing some annoying multiple reports by default:
ali@0	2848	Standalone Ones, Asterisks, Square Brackets.
ali@0	2849	Digit 1 occurs often in many scientific texts.
ali@0	2850	Asterisk occurs often in multi-footnoted texts.
ali@0	2851	Mismatch Square Brackets occurs often in multi-para footnotes.
ali@0	2852	Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil.
ali@0	2853	. . . but it does more or less work for the main cases.
ali@0	2854	Removed uppercase within a word as a separate category so
ali@0	2855	that names like VanAllen get reported only once, like other
ali@0	2856	suspected typos.
ali@0	2857	11/24/02 Fixed - -m switch wasn't looking at htmlnum in
ali@0	2858	loseentities (Thanks, Brett!)
ali@0	2859	Fixed bug which occasionally gave false warning of
ali@0	2860	paragraph starting with lowercase.
ali@0	2861	Added underscore as character not to query around doublequotes.
ali@0	2862	Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859"
ali@0	2863	. . . this is to help detect things like CP1252 characters.
ali@0	2864	Released as 0.97
ali@0	2865
ali@0	2866	12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell,
ali@0	2867	for doublequotes only. Replaces "Spaced quote", since it also covers that
ali@0	2868	case.
ali@0	2869	Added "warn_hyphen" to ease over-reporting of hyphens.
ali@0	2870
ali@0	2871	12/20/02 Added "extra period" checks.
ali@0	2872	Added single character line check
ali@0	2873	Added I" check - is usually an exclam
ali@0	2874	Released as 0.98
ali@0	2875
ali@0	2876	1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile()
ali@0	2877	from when I was looking at ways to identify markup. Refuses to
ali@0	2878	open files for *nix users with upcase in the filenames. Removed.
ali@0	2879	Fixed quickly and released as 0.981
ali@0	2880
ali@0	2881	1/8/03 Added "arid" to the list of typos, slightly against my better
ali@0	2882	judgement, but the DP gang are all excited about it. :-)
ali@0	2883	Added a check for comma followed by capital letter, where
ali@0	2884	a period has OCRed into a comma. (DW). Not sure about this
ali@0	2885	either; we'll see.
ali@0	2886	Compiling for Win32 to allow longfilenames.
ali@0	2887
ali@0	2888	6/1/04 A messy test release for DW to include the "gutcheck.typ"
ali@0	2889	process. And the gutcheck.jee trials. Removed "arid" --
ali@0	2890	it can go in gutcheck.typ
ali@0	2891
ali@0	2892	Added checks for carets ^ and slants / but disabling slant
ali@0	2893	queries if more than 20 of them, because some people use them
ali@0	2894	for /italics/. Slants are commonly OCR misreadings of italic "I"s.
ali@0	2895
ali@0	2896	Later: removed gutcheck.jee -- wrote jeebies instead.
ali@0	2897
ali@0	2898	Random TODO:
ali@0	2899	Check brackets more closely, like quotes, so that it becomes
ali@0	2900	easy to find the error in long paragraphs full of brackets.
ali@0	2901
ali@0	2902
ali@0	2903	11/4/04 Assorted cleanup. Fixed case where text started with an
ali@0	2904	unbalanced paragraph.
ali@0	2905
ali@0	2906	1/2/05 Has it really been that long? Added "nocomma", "noperiod" check.
ali@0	2907	Bits and pieces: improved isroman(). Added isletter().
ali@0	2908	Other stuff I never noted before this.
ali@0	2909
ali@0	2910	7/3/05 Stuck in a quick start on DP-markup ignoring
ali@0	2911	at BillFlis's suggestion.
ali@0	2912
ali@0	2913	1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in?
ali@0	2914	Don't count footer for dotcomma etc.
ali@0	2915
ali@0	2916
ali@0	2917	1 I
ali@0	2918	ail all
ali@0	2919	arc are
ali@0	2920	arid and
ali@0	2921	bad had
ali@0	2922	ball hall
ali@0	2923	band hand
ali@0	2924	bar her
ali@0	2925	bat but
ali@0	2926	be he
ali@0	2927	bead head
ali@0	2928	beads heads
ali@0	2929	bear hear
ali@0	2930	bit hit
ali@0	2931	bo be
ali@0	2932	boon been
ali@0	2933	borne home
ali@0	2934	bow how
ali@0	2935	bumbled humbled
ali@0	2936	car ear
ali@0	2937	carnage carriage
ali@0	2938	carne came
ali@0	2939	cast east
ali@0	2940	cat cut
ali@0	2941	cat eat
ali@0	2942	cheek check
ali@0	2943	clay day
ali@0	2944	coining coming
ali@0	2945	comer corner
ali@0	2946	die she
ali@0	2947	docs does
ali@0	2948	ease case
ali@0	2949	fail fall
ali@0	2950	fee he
ali@0	2951	haying having
ali@0	2952	ho he
ali@0	2953	ho who
ali@0	2954	hut but
ali@0	2955	is as
ali@0	2956	lie he
ali@0	2957	lime time
ali@0	2958	loth 10th
ali@0	2959	m in
ali@0	2960	modem modern
ali@0	2961	Ms his
ali@0	2962	ray away
ali@0	2963	ray my
ali@0	2964	ringer finger
ali@0	2965	ringers fingers
ali@0	2966	rioted noted
ali@0	2967	tho the
ali@0	2968	tie he
ali@0	2969	tie the
ali@0	2970	tier her
ali@0	2971	tight right
ali@0	2972	tile the
ali@0	2973	tiling thing
ali@0	2974	tip up
ali@0	2975	tram train
ali@0	2976	tune time
ali@0	2977	u "
ali@0	2978	wen well
ali@0	2979	yon you
ali@0	2980
ali@0	2981	*********************************************************************/
ali@0	2982

author	ali <ali@juiblex.co.uk>
	Tue Jan 24 23:57:11 2012 +0000 (2012-01-24)
changeset 1	707d51fedbe0
permissions	-rw-r--r--