/* bookloupe: bookloupe/bookloupe.c@0d08cd5055d5 (annotated) */

/*************************************************************************/
/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
/*                                                                       */
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */
/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */
/*                                                                       */
/* This program is free software; you can redistribute it and/or modify  */
/* it under the terms of the GNU General Public License as published by  */
/* the Free Software Foundation; either version 2 of the License, or     */
/* (at your option) any later version.                                   */
/*                                                                       */
/* This program is distributed in the hope that it will be useful,       */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
/* GNU General Public License for more details.                          */
/*                                                                       */
/* You should have received a copy of the GNU General Public License     */
/* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
/*************************************************************************/
ali@0	20
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
ali@0	25
/* Size limits, the user typo file name and the shared line buffers. */
#define MAXWORDLEN 80		/* max length of one word */
#define LINEBUFSIZE 2048	/* buffer size for an input line */

#define MAX_USER_TYPOS 1000
#define USERTYPO_FILE "gutcheck.typ"

#ifndef MAX_PATH
#define MAX_PATH 16384
#endif

char aline[LINEBUFSIZE];
char prevline[LINEBUFSIZE];
ali@0	38
/* Common typos. The list is terminated by an empty string. */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
ali@0	72
ali@0	73	char *usertypo[MAX_USER_TYPOS];
ali@0	74
/*
 * Common abbreviations and other OK words not to query as typos.
 * The list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii", "hawaii", "hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
ali@0	82
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * The list is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
ali@0	88
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
ali@0	96
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
ali@0	105
/* HTML tag names, without angle brackets; terminated by an empty string. */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
ali@0	112
/*
 * DP-specific markup strings, in opening/closing pairs where applicable;
 * terminated by an empty string.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
ali@0	116
/*
 * Lower-case word list; terminated by an empty string.
 * NOTE(review): judging by the name, presumably words that should not be
 * followed by a comma -- confirm against the code that scans this list.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
ali@0	123
/*
 * Lower-case word list; terminated by an empty string.
 * NOTE(review): judging by the name, presumably words that should not be
 * followed by a period -- confirm against the code that scans this list.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
ali@0	130
/*
 * Plain and accented lower-case vowels.
 * NOTE(review): the character helpers declared in this file take
 * unsigned char, so this literal presumably has to be stored in a
 * single-byte encoding -- confirm the source file's encoding.
 */
char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
ali@0	132
/*
 * HTML character entities: the named form, the numeric form, and the
 * plain-text replacement for each. The table ends with an entry whose
 * htmlent is the empty string (its textent is a null pointer).
 */
struct {
    char *htmlent;
    char *htmlnum;
    char *textent;
} entities[] = {
    { "&amp;",    "&#38;",   "&" },
    { "&lt;",     "&#60;",   "<" },
    { "&gt;",     "&#62;",   ">" },
    { "&deg;",    "&#176;",  " degrees" },
    { "&pound;",  "&#163;",  "L" },
    { "&quot;",   "&#34;",   "\"" },	/* quotation mark = APL quote */
    { "&OElig;",  "&#338;",  "OE" },	/* latin capital ligature OE */
    { "&oelig;",  "&#339;",  "oe" },	/* latin small ligature oe */
    { "&Scaron;", "&#352;",  "S" },	/* latin capital letter S with caron */
    { "&scaron;", "&#353;",  "s" },	/* latin small letter s with caron */
    { "&Yuml;",   "&#376;",  "Y" },	/* latin capital letter Y with diaeresis */
    { "&circ;",   "&#710;",  "" },	/* modifier letter circumflex accent */
    { "&tilde;",  "&#732;",  "~" },	/* small tilde, U+02DC ISOdia */
    { "&ensp;",   "&#8194;", " " },	/* en space, U+2002 ISOpub */
    { "&emsp;",   "&#8195;", " " },	/* em space, U+2003 ISOpub */
    { "&thinsp;", "&#8201;", " " },	/* thin space, U+2009 ISOpub */
    { "&ndash;",  "&#8211;", "-" },	/* en dash, U+2013 ISOpub */
    { "&mdash;",  "&#8212;", "--" },	/* em dash, U+2014 ISOpub */
    { "&rsquo;",  "&#8217;", "'" },	/* right single quotation mark */
    { "&sbquo;",  "&#8218;", "'" },	/* single low-9 quotation mark */
    { "&ldquo;",  "&#8220;", "\"" },	/* left double quotation mark */
    { "&rdquo;",  "&#8221;", "\"" },	/* right double quotation mark */
    { "&bdquo;",  "&#8222;", "\"" },	/* double low-9 quotation mark */
    { "&lsaquo;", "&#8249;", "\"" },	/* single left-pointing angle quotation mark */
    { "&rsaquo;", "&#8250;", "\"" },	/* single right-pointing angle quotation mark */
    { "&nbsp;",   "&#160;",  " " },	/* no-break space = non-breaking space, */
    { "&iexcl;",  "&#161;",  "!" },	/* inverted exclamation mark */
    { "&cent;",   "&#162;",  "c" },	/* cent sign */
    { "&pound;",  "&#163;",  "L" },	/* pound sign */
    { "&curren;", "&#164;",  "$" },	/* currency sign */
    { "&yen;",    "&#165;",  "Y" },	/* yen sign = yuan sign */
    { "&sect;",   "&#167;",  "--" },	/* section sign */
    { "&uml;",    "&#168;",  " " },	/* diaeresis = spacing diaeresis */
    { "&copy;",   "&#169;",  "(C) " },	/* copyright sign */
    { "&ordf;",   "&#170;",  " " },	/* feminine ordinal indicator */
    { "&laquo;",  "&#171;",  "\"" },	/* left-pointing double angle quotation mark */
    { "&shy;",    "&#173;",  "-" },	/* soft hyphen = discretionary hyphen */
    { "&reg;",    "&#174;",  "(R) " },	/* registered sign = registered trade mark sign */
    { "&macr;",   "&#175;",  " " },	/* macron = spacing macron = overline */
    { "&deg;",    "&#176;",  " degrees" },	/* degree sign */
    { "&plusmn;", "&#177;",  "+-" },	/* plus-minus sign = plus-or-minus sign */
    { "&sup2;",   "&#178;",  "2" },	/* superscript two = superscript digit two */
    { "&sup3;",   "&#179;",  "3" },	/* superscript three = superscript digit three */
    { "&acute;",  "&#180;",  " " },	/* acute accent = spacing acute */
    { "&micro;",  "&#181;",  "m" },	/* micro sign */
    { "&para;",   "&#182;",  "--" },	/* pilcrow sign = paragraph sign */
    { "&cedil;",  "&#184;",  " " },	/* cedilla = spacing cedilla */
    { "&sup1;",   "&#185;",  "1" },	/* superscript one = superscript digit one */
    { "&ordm;",   "&#186;",  " " },	/* masculine ordinal indicator */
    { "&raquo;",  "&#187;",  "\"" },	/* right-pointing double angle quotation mark */
    { "&frac14;", "&#188;",  "1/4" },	/* vulgar fraction one quarter */
    { "&frac12;", "&#189;",  "1/2" },	/* vulgar fraction one half */
    { "&frac34;", "&#190;",  "3/4" },	/* vulgar fraction three quarters */
    { "&iquest;", "&#191;",  "?" },	/* inverted question mark */
    { "&Agrave;", "&#192;",  "A" },	/* latin capital letter A with grave */
    { "&Aacute;", "&#193;",  "A" },	/* latin capital letter A with acute */
    { "&Acirc;",  "&#194;",  "A" },	/* latin capital letter A with circumflex */
    { "&Atilde;", "&#195;",  "A" },	/* latin capital letter A with tilde */
    { "&Auml;",   "&#196;",  "A" },	/* latin capital letter A with diaeresis */
    { "&Aring;",  "&#197;",  "A" },	/* latin capital letter A with ring above */
    { "&AElig;",  "&#198;",  "AE" },	/* latin capital letter AE */
    { "&Ccedil;", "&#199;",  "C" },	/* latin capital letter C with cedilla */
    { "&Egrave;", "&#200;",  "E" },	/* latin capital letter E with grave */
    { "&Eacute;", "&#201;",  "E" },	/* latin capital letter E with acute */
    { "&Ecirc;",  "&#202;",  "E" },	/* latin capital letter E with circumflex */
    { "&Euml;",   "&#203;",  "E" },	/* latin capital letter E with diaeresis */
    { "&Igrave;", "&#204;",  "I" },	/* latin capital letter I with grave */
    { "&Iacute;", "&#205;",  "I" },	/* latin capital letter I with acute */
    { "&Icirc;",  "&#206;",  "I" },	/* latin capital letter I with circumflex */
    { "&Iuml;",   "&#207;",  "I" },	/* latin capital letter I with diaeresis */
    { "&ETH;",    "&#208;",  "E" },	/* latin capital letter ETH */
    { "&Ntilde;", "&#209;",  "N" },	/* latin capital letter N with tilde */
    { "&Ograve;", "&#210;",  "O" },	/* latin capital letter O with grave */
    { "&Oacute;", "&#211;",  "O" },	/* latin capital letter O with acute */
    { "&Ocirc;",  "&#212;",  "O" },	/* latin capital letter O with circumflex */
    { "&Otilde;", "&#213;",  "O" },	/* latin capital letter O with tilde */
    { "&Ouml;",   "&#214;",  "O" },	/* latin capital letter O with diaeresis */
    { "&times;",  "&#215;",  "*" },	/* multiplication sign */
    { "&Oslash;", "&#216;",  "O" },	/* latin capital letter O with stroke */
    { "&Ugrave;", "&#217;",  "U" },	/* latin capital letter U with grave */
    { "&Uacute;", "&#218;",  "U" },	/* latin capital letter U with acute */
    { "&Ucirc;",  "&#219;",  "U" },	/* latin capital letter U with circumflex */
    { "&Uuml;",   "&#220;",  "U" },	/* latin capital letter U with diaeresis */
    { "&Yacute;", "&#221;",  "Y" },	/* latin capital letter Y with acute */
    { "&THORN;",  "&#222;",  "TH" },	/* latin capital letter THORN */
    { "&szlig;",  "&#223;",  "sz" },	/* latin small letter sharp s = ess-zed */
    { "&agrave;", "&#224;",  "a" },	/* latin small letter a with grave */
    { "&aacute;", "&#225;",  "a" },	/* latin small letter a with acute */
    { "&acirc;",  "&#226;",  "a" },	/* latin small letter a with circumflex */
    { "&atilde;", "&#227;",  "a" },	/* latin small letter a with tilde */
    { "&auml;",   "&#228;",  "a" },	/* latin small letter a with diaeresis */
    { "&aring;",  "&#229;",  "a" },	/* latin small letter a with ring above */
    { "&aelig;",  "&#230;",  "ae" },	/* latin small letter ae */
    { "&ccedil;", "&#231;",  "c" },	/* latin small letter c with cedilla */
    { "&egrave;", "&#232;",  "e" },	/* latin small letter e with grave */
    { "&eacute;", "&#233;",  "e" },	/* latin small letter e with acute */
    { "&ecirc;",  "&#234;",  "e" },	/* latin small letter e with circumflex */
    { "&euml;",   "&#235;",  "e" },	/* latin small letter e with diaeresis */
    { "&igrave;", "&#236;",  "i" },	/* latin small letter i with grave */
    { "&iacute;", "&#237;",  "i" },	/* latin small letter i with acute */
    { "&icirc;",  "&#238;",  "i" },	/* latin small letter i with circumflex */
    { "&iuml;",   "&#239;",  "i" },	/* latin small letter i with diaeresis */
    { "&eth;",    "&#240;",  "eth" },	/* latin small letter eth */
    { "&ntilde;", "&#241;",  "n" },	/* latin small letter n with tilde */
    { "&ograve;", "&#242;",  "o" },	/* latin small letter o with grave */
    { "&oacute;", "&#243;",  "o" },	/* latin small letter o with acute */
    { "&ocirc;",  "&#244;",  "o" },	/* latin small letter o with circumflex */
    { "&otilde;", "&#245;",  "o" },	/* latin small letter o with tilde */
    { "&ouml;",   "&#246;",  "o" },	/* latin small letter o with diaeresis */
    { "&divide;", "&#247;",  "/" },	/* division sign */
    { "&oslash;", "&#248;",  "o" },	/* latin small letter o with stroke */
    { "&ugrave;", "&#249;",  "u" },	/* latin small letter u with grave */
    { "&uacute;", "&#250;",  "u" },	/* latin small letter u with acute */
    { "&ucirc;",  "&#251;",  "u" },	/* latin small letter u with circumflex */
    { "&uuml;",   "&#252;",  "u" },	/* latin small letter u with diaeresis */
    { "&yacute;", "&#253;",  "y" },	/* latin small letter y with acute */
    { "&thorn;",  "&#254;",  "th" },	/* latin small letter thorn */
    { "&yuml;",   "&#255;",  "y" },	/* latin small letter y with diaeresis */
    { "",         "",        NULL }	/* end-of-table marker */
};
ali@40	258
/* special characters */
#define CHAR_SPACE 32
#define CHAR_TAB 9
#define CHAR_LF 10
#define CHAR_CR 13
#define CHAR_DQUOTE 34
#define CHAR_SQUOTE 39
#define CHAR_OPEN_SQUOTE 96
#define CHAR_TILDE 126
#define CHAR_ASTERISK 42
#define CHAR_FORESLASH 47
#define CHAR_CARAT 94

#define CHAR_UNDERSCORE '_'
#define CHAR_OPEN_CBRACK '{'
#define CHAR_CLOSE_CBRACK '}'
#define CHAR_OPEN_RBRACK '('
#define CHAR_CLOSE_RBRACK ')'
#define CHAR_OPEN_SBRACK '['
#define CHAR_CLOSE_SBRACK ']'

/* longest and shortest normal PG line lengths */
#define LONGEST_PG_LINE 75
#define WAY_TOO_LONG 80
#define SHORTEST_PG_LINE 55
ali@0	284
#define SWITCHES "ESTPXLOYHWVMUD"	/* switches:- */
					/* D - ignore DP-specific markup */
					/* E - echo queried line */
					/* S - check single quotes */
					/* T - check common typos */
					/* P - require closure of quotes on */
					/*     every paragraph */
					/* X - "Trust no one" :-) Paranoid! */
					/*     Queries everything */
					/* L - line end checking defaults on */
					/*     -L turns it off */
					/* O - overview. Just shows counts. */
					/* Y - puts errors to stdout */
					/*     instead of stderr */
					/* H - Echoes header fields */
					/* M - Ignore markup in < > */
					/* U - Use file of User-defined Typos */
					/* W - Defaults for use on Web upload */
					/* V - Verbose - list EVERYTHING! */
#define SWITNO 14			/* max number of switch parms */
					/* - used for defining array-size */
#define MINARGS 1			/* minimum no of args excl switches */
#define MAXARGS 1			/* maximum no of args excl switches */

int pswit[SWITNO];			/* program switches set by SWITCHES */

/* Index of each switch in pswit[]; same order as the letters in SWITCHES. */
#define ECHO_SWITCH 0
#define SQUOTE_SWITCH 1
#define TYPO_SWITCH 2
#define QPARA_SWITCH 3
#define PARANOID_SWITCH 4
#define LINE_END_SWITCH 5
#define OVERVIEW_SWITCH 6
#define STDOUT_SWITCH 7
#define HEADER_SWITCH 8
#define WEB_SWITCH 9
#define VERBOSE_SWITCH 10
#define MARKUP_SWITCH 11
#define USERTYPO_SWITCH 12
#define DP_SWITCH 13
ali@0	325
/* Query counters and line totals reported by main() in overview mode. */
long cnt_dquot;		/* for overview mode, count of doublequote queries */
long cnt_squot;		/* for overview mode, count of singlequote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode, count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
ali@0	341
ali@0	342	void proghelp(void);
ali@0	343	void procfile(char *);
ali@0	344
ali@0	345	#define LOW_THRESHOLD 0
ali@0	346	#define HIGH_THRESHOLD 1
ali@0	347
ali@0	348	#define START 0
ali@0	349	#define END 1
ali@0	350	#define PREV 0
ali@0	351	#define NEXT 1
ali@0	352	#define FIRST_OF_PAIR 0
ali@0	353	#define SECOND_OF_PAIR 1
ali@0	354
ali@0	355	#define MAX_WORDPAIR 1000
ali@0	356
ali@0	357	char running_from[MAX_PATH];
ali@0	358
/* Prototypes for helpers defined later in the file. */
int mixdigit(char *);
char *getaword(char *,char *);
int matchword(char *,char *);
char *flgets(char *,int,FILE *,long);
void lowerit(char *);
int gcisalpha(unsigned char);
int gcisdigit(unsigned char);
int gcisletter(unsigned char);
char *gcstrchr(char *s,char c);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
int tagcomp(char *,char *);
char *loseentities(char *);
int isroman(char *);
int usertypo_count;	/* number of entries main() has stored in usertypo[] */
void postprocess_for_DP(char *);
ali@0	376
ali@0	377	char wrk[LINEBUFSIZE];
ali@0	378
ali@40	379	#define MAX_QWORD 50
ali@40	380	#define MAX_QWORD_LENGTH 40
ali@0	381	char qword[MAX_QWORD][MAX_QWORD_LENGTH];
ali@0	382	char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
ali@0	383	signed int dupcnt[MAX_QWORD];
ali@0	384
ali@40	385	int main(int argc,char **argv)
ali@0	386	{
ali@40	387	char argsw,s;
ali@40	388	int i,switno,invarg;
ali@0	389	char usertypo_file[MAX_PATH];
ali@0	390	FILE *usertypofile;
ali@40	391	if (strlen(argv[0])<sizeof(running_from))
ali@40	392	/* save the path to the executable */
ali@40	393	strcpy(running_from,argv[0]);
ali@0	394	/* find out what directory we're running from */
ali@40	395	s=running_from+strlen(running_from);
ali@40	396	for (;s!='/' && s!='\\' && s>=running_from;s--)
ali@40	397	*s=0;
ali@40	398	switno=strlen(SWITCHES);
ali@40	399	for (i=switno;--i>0;)
ali@40	400	pswit[i]=0; /* initialise switches */
ali@40	401	/*
ali@40	402	* Standard loop to extract switches.
ali@40	403	* When we come out of this loop, the arguments will be
ali@40	404	* in argv[0] upwards and the switches used will be
ali@40	405	* represented by their equivalent elements in pswit[]
ali@40	406	*/
ali@40	407	while (--argc>0 && **++argv=='-')
ali@40	408	for (argsw=argv[0]+1;*argsw!='\0';argsw++)
ali@40	409	for (i=switno,invarg=1;(--i>=0) && invarg==1;)
ali@40	410	if ((toupper(*argsw))==SWITCHES[i])
ali@40	411	{
ali@40	412	invarg=0;
ali@40	413	pswit[i]=1;
ali@40	414	}
ali@40	415	/* Paranoid checking is turned OFF, not on, by its switch */
ali@40	416	pswit[PARANOID_SWITCH]^=1;
ali@40	417	if (pswit[PARANOID_SWITCH])
ali@40	418	/* if running in paranoid mode force typo checks as well */
ali@40	419	pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
ali@40	420	/* Line-end checking is turned OFF, not on, by its switch */
ali@40	421	pswit[LINE_END_SWITCH]^=1;
ali@40	422	/* Echoing is turned OFF, not on, by its switch */
ali@40	423	pswit[ECHO_SWITCH]^=1;
ali@40	424	if (pswit[OVERVIEW_SWITCH])
ali@40	425	/* just print summary; don't echo */
ali@40	426	pswit[ECHO_SWITCH]=0;
ali@40	427	/*
ali@40	428	* Web uploads - for the moment, this is really just a placeholder
ali@40	429	* until we decide what processing we really want to do on web uploads
ali@40	430	*/
ali@40	431	if (pswit[WEB_SWITCH])
ali@40	432	{
ali@40	433	/* specific override for web uploads */
ali@40	434	pswit[ECHO_SWITCH]=1;
ali@40	435	pswit[SQUOTE_SWITCH]=0;
ali@40	436	pswit[TYPO_SWITCH]=1;
ali@40	437	pswit[QPARA_SWITCH]=0;
ali@40	438	pswit[PARANOID_SWITCH]=1;
ali@40	439	pswit[LINE_END_SWITCH]=0;
ali@40	440	pswit[OVERVIEW_SWITCH]=0;
ali@40	441	pswit[STDOUT_SWITCH]=0;
ali@40	442	pswit[HEADER_SWITCH]=1;
ali@40	443	pswit[VERBOSE_SWITCH]=0;
ali@40	444	pswit[MARKUP_SWITCH]=0;
ali@40	445	pswit[USERTYPO_SWITCH]=0;
ali@40	446	pswit[DP_SWITCH]=0;
ali@40	447	}
ali@40	448	if (argc<MINARGS \|\| argc>MAXARGS)
ali@40	449	{
ali@40	450	/* check number of args */
ali@0	451	proghelp();
ali@40	452	return 1;
ali@40	453	}
ali@0	454	/* read in the user-defined stealth scanno list */
ali@40	455	if (pswit[USERTYPO_SWITCH])
ali@40	456	{
ali@40	457	/* ... we were told we had one! */
ali@40	458	usertypofile=fopen(USERTYPO_FILE,"rb");
ali@40	459	if (!usertypofile)
ali@40	460	{
ali@40	461	/* not in cwd. try excuteable directory. */
ali@40	462	strcpy(usertypo_file,running_from);
ali@40	463	strcat(usertypo_file,USERTYPO_FILE);
ali@40	464	usertypofile=fopen(usertypo_file,"rb");
ali@40	465	if (!usertypofile) {
ali@40	466	/* we ain't got no user typo file! */
ali@40	467	printf(" --> I couldn't find gutcheck.typ "
ali@40	468	"-- proceeding without user typos.\n");
ali@40	469	}
ali@40	470	}
ali@40	471	usertypo_count=0;
ali@40	472	if (usertypofile)
ali@40	473	{
ali@40	474	/* we managed to open a User Typo File! */
ali@40	475	if (pswit[USERTYPO_SWITCH])
ali@40	476	{
ali@40	477	while (flgets(aline,LINEBUFSIZE-1,usertypofile,
ali@40	478	(long)usertypo_count))
ali@40	479	{
ali@40	480	if (strlen(aline)>1)
ali@40	481	{
ali@40	482	if ((int)*aline>33)
ali@40	483	{
ali@40	484	s=malloc(strlen(aline)+1);
ali@40	485	if (!s)
ali@40	486	{
ali@40	487	fprintf(stderr,"bookloupe: cannot get enough "
ali@40	488	"memory for user typo file!\n");
ali@0	489	exit(1);
ali@40	490	}
ali@40	491	strcpy(s,aline);
ali@40	492	usertypo[usertypo_count]=s;
ali@0	493	usertypo_count++;
ali@40	494	if (usertypo_count>=MAX_USER_TYPOS)
ali@40	495	{
ali@40	496	printf(" --> Only %d user-defined typos "
ali@42	497	"allowed: ignoring the rest\n",
ali@42	498	MAX_USER_TYPOS);
ali@0	499	break;
ali@40	500	}
ali@40	501	}
ali@40	502	}
ali@40	503	}
ali@40	504	}
ali@0	505	fclose(usertypofile);
ali@40	506	}
ali@40	507	}
ali@40	508	fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@40	509	cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
ali@40	510	cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
ali@40	511	cnt_spacend=0;
ali@0	512	procfile(argv[0]);
ali@40	513	if (pswit[OVERVIEW_SWITCH])
ali@40	514	{
ali@40	515	printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40	516	checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@40	517	printf(" --------------- Queries found --------------\n");
ali@40	518	if (cnt_long)
ali@40	519	printf(" Long lines: %14ld\n",cnt_long);
ali@40	520	if (cnt_short)
ali@40	521	printf(" Short lines: %14ld\n",cnt_short);
ali@40	522	if (cnt_lineend)
ali@40	523	printf(" Line-end problems: %14ld\n",cnt_lineend);
ali@40	524	if (cnt_word)
ali@40	525	printf(" Common typos: %14ld\n",cnt_word);
ali@40	526	if (cnt_dquot)
ali@40	527	printf(" Unmatched quotes: %14ld\n",cnt_dquot);
ali@40	528	if (cnt_squot)
ali@40	529	printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
ali@40	530	if (cnt_brack)
ali@40	531	printf(" Unmatched brackets: %14ld\n",cnt_brack);
ali@40	532	if (cnt_bin)
ali@40	533	printf(" Non-ASCII characters: %14ld\n",cnt_bin);
ali@40	534	if (cnt_odd)
ali@40	535	printf(" Proofing characters: %14ld\n",cnt_odd);
ali@40	536	if (cnt_punct)
ali@40	537	printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@40	538	if (cnt_dash)
ali@40	539	printf(" Non-standard dashes: %14ld\n",cnt_dash);
ali@40	540	if (cnt_html)
ali@40	541	printf(" Possible HTML tags: %14ld\n",cnt_html);
ali@0	542	printf("\n");
ali@40	543	printf(" TOTAL QUERIES %14ld\n",
ali@40	544	cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
ali@40	545	cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40	546	}
ali@40	547	return 0;
ali@0	548	}
ali@0	549
/* Totals gathered by first_pass() while scanning the whole input file. */
struct first_pass_results {
    long firstline,astline;
    long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
    long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
    long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
    signed int Dutchcount,Frenchcount;
};
ali@41	557
ali@40	558	/*
ali@41	559	* first_pass:
ali@40	560	*
ali@41	561	* Run a first pass - verify that it's a valid PG
ali@41	562	* file, decide whether to report some things that
ali@41	563	* occur many times in the text like long or short
ali@41	564	* lines, non-standard dashes, etc.
ali@40	565	*/
ali@41	566	struct first_pass_results first_pass(FILE infile)
ali@0	567	{
ali@41	568	char laststart=CHAR_SPACE,*s;
ali@41	569	signed int i,llen;
ali@41	570	unsigned int lastlen=0,lastblen=0;
ali@41	571	long spline=0,nspline=0;
ali@41	572	static struct first_pass_results results={0};
ali@41	573	char inword[MAXWORDLEN]="";
ali@40	574	while (fgets(aline,LINEBUFSIZE-1,infile))
ali@40	575	{
ali@40	576	while (aline[strlen(aline)-1]==10 \|\| aline[strlen(aline)-1]==13)
ali@40	577	aline[strlen(aline)-1]=0;
ali@0	578	linecnt++;
ali@40	579	if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
ali@40	580	(strstr(aline,"PUBLIC DOMAIN") \|\| strstr(aline,"COPYRIGHT")))
ali@40	581	{
ali@0	582	if (spline)
ali@0	583	printf(" --> Duplicate header?\n");
ali@40	584	spline=linecnt+1; /* first line of non-header text, that is */
ali@40	585	}
ali@40	586	if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
ali@40	587	{
ali@0	588	if (nspline)
ali@0	589	printf(" --> Duplicate header?\n");
ali@40	590	nspline=linecnt+1; /* first line of non-header text, that is */
ali@40	591	}
ali@40	592	if (spline \|\| nspline)
ali@40	593	{
ali@0	594	lowerit(aline);
ali@40	595	if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
ali@40	596	{
ali@40	597	if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
ali@40	598	{
ali@41	599	if (results.footerline)
ali@40	600	{
ali@40	601	/* it's an old-form header - we can detect duplicates */
ali@40	602	if (!nspline)
ali@0	603	printf(" --> Duplicate footer?\n");
ali@40	604	}
ali@40	605	else
ali@41	606	results.footerline=linecnt;
ali@40	607	}
ali@40	608	}
ali@40	609	}
ali@40	610	if (spline)
ali@41	611	results.firstline=spline;
ali@40	612	if (nspline)
ali@41	613	results.firstline=nspline; /* override with new */
ali@41	614	if (results.footerline)
ali@40	615	continue; /* don't count the boilerplate in the footer */
ali@40	616	llen=strlen(aline);
ali@41	617	results.totlen+=llen;
ali@40	618	for (i=0;i<llen;i++)
ali@40	619	{
ali@40	620	if ((unsigned char)aline[i]>127)
ali@41	621	results.binlen++;
ali@40	622	if (gcisalpha(aline[i]))
ali@41	623	results.alphalen++;
ali@40	624	if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@41	625	results.endquote_count++;
ali@40	626	}
ali@40	627	if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
ali@40	628	lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41	629	results.shortline++;
ali@40	630	if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
ali@40	631	cnt_spacend++;
ali@40	632	if (strstr(aline,".,"))
ali@41	633	results.dotcomma++;
ali@40	634	/* only count ast lines for ignoring purposes where there is */
ali@0	635	/* locase text on the line */
ali@40	636	if (strstr(aline,"*"))
ali@40	637	{
ali@40	638	for (s=aline;*s;s++)
ali@40	639	if (s>='a' && s<='z')
ali@0	640	break;
ali@40	641	if (*s)
ali@41	642	results.astline++;
ali@40	643	}
ali@40	644	if (strstr(aline,"/"))
ali@41	645	results.fslashline++;
ali@40	646	for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@40	647	;
ali@40	648	if (aline[i]=='-' && aline[i-1]!='-')
ali@41	649	results.hyphens++;
ali@40	650	if (llen>LONGEST_PG_LINE)
ali@41	651	results.longline++;
ali@40	652	if (llen>WAY_TOO_LONG)
ali@41	653	results.verylongline++;
ali@40	654	if (strstr(aline,"<") && strstr(aline,">"))
ali@40	655	{
ali@40	656	i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
ali@40	657	if (i>0)
ali@41	658	results.htmcount++;
ali@40	659	if (strstr(aline,"<i>"))
ali@41	660	results.htmcount+=4; /* bonus marks! */
ali@40	661	}
ali@0	662	/* Check for spaced em-dashes */
ali@40	663	if (strstr(aline,"--"))
ali@40	664	{
ali@41	665	results.emdash++;
ali@40	666	if (*(strstr(aline,"--")-1)==CHAR_SPACE \|\|
ali@40	667	(*(strstr(aline,"--")+2)==CHAR_SPACE))
ali@41	668	results.space_emdash++;
ali@40	669	if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
ali@40	670	(*(strstr(aline,"--")+2)==CHAR_SPACE))
ali@40	671	/* count of em-dashes with spaces both sides */
ali@41	672	results.non_PG_space_emdash++;
ali@40	673	if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
ali@40	674	(*(strstr(aline,"--")+2)!=CHAR_SPACE))
ali@40	675	/* count of PG-type em-dashes with no spaces */
ali@41	676	results.PG_space_emdash++;
ali@40	677	}
ali@40	678	for (s=aline;*s;)
ali@40	679	{
ali@40	680	s=getaword(s,inword);
ali@40	681	if (!strcmp(inword,"hij") \|\| !strcmp(inword,"niet"))
ali@41	682	results.Dutchcount++;
ali@40	683	if (!strcmp(inword,"dans") \|\| !strcmp(inword,"avec"))
ali@41	684	results.Frenchcount++;
ali@40	685	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@41	686	results.standalone_digit++;
ali@40	687	}
ali@0	688	/* Check for spaced dashes */
ali@40	689	if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
ali@41	690	results.spacedash++;
ali@40	691	lastblen=lastlen;
ali@40	692	lastlen=strlen(aline);
ali@40	693	laststart=aline[0];
ali@40	694	}
ali@41	695	return &results;
ali@41	696	}
ali@41	697
/* Flags set by report_first_pass(); 1 means "report this class", 0 means not. */
struct warnings {
    signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
    signed int endquote,isDutch,isFrench;
};
ali@42	702
ali@42	703	/*
ali@42	704	* report_first_pass:
ali@42	705	*
ali@42	706	* Make some snap decisions based on the first pass results.
ali@42	707	*/
ali@42	708	struct warnings report_first_pass(struct first_pass_results results)
ali@42	709	{
ali@42	710	static struct warnings warnings={0};
ali@42	711	if (cnt_spacend>0)
ali@42	712	printf(" --> %ld lines in this file have white space at end\n",
ali@42	713	cnt_spacend);
ali@42	714	warnings.dotcomma=1;
ali@42	715	if (results->dotcomma>5)
ali@42	716	{
ali@42	717	warnings.dotcomma=0;
ali@42	718	printf(" --> %ld lines in this file contain '.,'. "
ali@42	719	"Not reporting them.\n",results->dotcomma);
ali@42	720	}
ali@42	721	/*
ali@42	722	* If more than 50 lines, or one-tenth, are short,
ali@42	723	* don't bother reporting them.
ali@42	724	*/
ali@42	725	warnings.shortline=1;
ali@42	726	if (results->shortline>50 \|\| results->shortline*10>linecnt)
ali@42	727	{
ali@42	728	warnings.shortline=0;
ali@42	729	printf(" --> %ld lines in this file are short. "
ali@42	730	"Not reporting short lines.\n",results->shortline);
ali@42	731	}
ali@42	732	/*
ali@42	733	* If more than 50 lines, or one-tenth, are long,
ali@42	734	* don't bother reporting them.
ali@42	735	*/
ali@42	736	warnings.longline=1;
ali@42	737	if (results->longline>50 \|\| results->longline*10>linecnt)
ali@42	738	{
ali@42	739	warnings.longline=0;
ali@42	740	printf(" --> %ld lines in this file are long. "
ali@42	741	"Not reporting long lines.\n",results->longline);
ali@42	742	}
ali@42	743	/* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42	744	warnings.ast=1;
ali@42	745	if (results->astline>10)
ali@42	746	{
ali@42	747	warnings.ast=0;
ali@42	748	printf(" --> %ld lines in this file contain asterisks. "
ali@42	749	"Not reporting them.\n",results->astline);
ali@42	750	}
ali@42	751	/*
ali@42	752	* If more than 10 lines contain forward slashes,
ali@42	753	* don't bother reporting them.
ali@42	754	*/
ali@42	755	warnings.fslash=1;
ali@42	756	if (results->fslashline>10)
ali@42	757	{
ali@42	758	warnings.fslash=0;
ali@42	759	printf(" --> %ld lines in this file contain forward slashes. "
ali@42	760	"Not reporting them.\n",results->fslashline);
ali@42	761	}
ali@42	762	/*
ali@42	763	* If more than 20 lines contain unpunctuated endquotes,
ali@42	764	* don't bother reporting them.
ali@42	765	*/
ali@42	766	warnings.endquote=1;
ali@42	767	if (results->endquote_count>20)
ali@42	768	{
ali@42	769	warnings.endquote=0;
ali@42	770	printf(" --> %ld lines in this file contain unpunctuated endquotes. "
ali@42	771	"Not reporting them.\n",results->endquote_count);
ali@42	772	}
ali@42	773	/*
ali@42	774	* If more than 15 lines contain standalone digits,
ali@42	775	* don't bother reporting them.
ali@42	776	*/
ali@42	777	warnings.digit=1;
ali@42	778	if (results->standalone_digit>10)
ali@42	779	{
ali@42	780	warnings.digit=0;
ali@42	781	printf(" --> %ld lines in this file contain standalone 0s and 1s. "
ali@42	782	"Not reporting them.\n",results->standalone_digit);
ali@42	783	}
ali@42	784	/*
ali@42	785	* If more than 20 lines contain hyphens at end,
ali@42	786	* don't bother reporting them.
ali@42	787	*/
ali@42	788	warnings.hyphen=1;
ali@42	789	if (results->hyphens>20)
ali@42	790	{
ali@42	791	warnings.hyphen=0;
ali@42	792	printf(" --> %ld lines in this file have hyphens at end. "
ali@42	793	"Not reporting them.\n",results->hyphens);
ali@42	794	}
ali@42	795	if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42	796	{
ali@42	797	printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@42	798	pswit[MARKUP_SWITCH]=1;
ali@42	799	}
ali@42	800	if (results->verylongline>0)
ali@42	801	printf(" --> %ld lines in this file are VERY long!\n",
ali@42	802	results->verylongline);
ali@42	803	/*
ali@42	804	* If there are more non-PG spaced dashes than PG em-dashes,
ali@42	805	* assume it's deliberate.
ali@42	806	* Current PG guidelines say don't use them, but older texts do,
ali@42	807	* and some people insist on them whatever the guidelines say.
ali@42	808	*/
ali@42	809	warnings.dash=1;
ali@42	810	if (results->spacedash+results->non_PG_space_emdash>
ali@42	811	results->PG_space_emdash)
ali@42	812	{
ali@42	813	warnings.dash=0;
ali@42	814	printf(" --> There are %ld spaced dashes and em-dashes. "
ali@42	815	"Not reporting them.\n",
ali@42	816	results->spacedash+results->non_PG_space_emdash);
ali@42	817	}
ali@42	818	/* If more than a quarter of characters are hi-bit, bug out. */
ali@42	819	warnings.bin=1;
ali@42	820	if (results->binlen*4>results->totlen)
ali@42	821	{
ali@42	822	printf(" --> This file does not appear to be ASCII. "
ali@42	823	"Terminating. Best of luck with it!\n");
ali@42	824	exit(1);
ali@42	825	}
ali@42	826	if (results->alphalen*4<results->totlen)
ali@42	827	{
ali@42	828	printf(" --> This file does not appear to be text. "
ali@42	829	"Terminating. Best of luck with it!\n");
ali@42	830	exit(1);
ali@42	831	}
ali@42	832	if (results->binlen*100>results->totlen \|\| results->binlen>100)
ali@42	833	{
ali@42	834	printf(" --> There are a lot of foreign letters here. "
ali@42	835	"Not reporting them.\n");
ali@42	836	warnings.bin=0;
ali@42	837	}
ali@42	838	warnings.isDutch=0;
ali@42	839	if (results->Dutchcount>50)
ali@42	840	{
ali@42	841	warnings.isDutch=1;
ali@42	842	printf(" --> This looks like Dutch - "
ali@42	843	"switching off dashes and warnings for 's Middags case.\n");
ali@42	844	}
ali@42	845	warnings.isFrench=0;
ali@42	846	if (results->Frenchcount>50)
ali@42	847	{
ali@42	848	warnings.isFrench=1;
ali@42	849	printf(" --> This looks like French - "
ali@42	850	"switching off some doublepunct.\n");
ali@42	851	}
ali@42	852	if (results->firstline && results->footerline)
ali@42	853	printf(" The PG header and footer appear to be already on.\n");
ali@42	854	else
ali@42	855	{
ali@42	856	if (results->firstline)
ali@42	857	printf(" The PG header is on - no footer.\n");
ali@42	858	if (results->footerline)
ali@42	859	printf(" The PG footer is on - no header.\n");
ali@42	860	}
ali@42	861	printf("\n");
ali@42	862	if (pswit[VERBOSE_SWITCH])
ali@42	863	{
ali@42	864	warnings.bin=1;
ali@42	865	warnings.shortline=1;
ali@42	866	warnings.dotcomma=1;
ali@42	867	warnings.longline=1;
ali@42	868	warnings.dash=1;
ali@42	869	warnings.digit=1;
ali@42	870	warnings.ast=1;
ali@42	871	warnings.fslash=1;
ali@42	872	warnings.hyphen=1;
ali@42	873	warnings.endquote=1;
ali@42	874	printf(" * Verbose output is ON -- you asked for it! *\n");
ali@42	875	}
ali@42	876	if (warnings.isDutch)
ali@42	877	warnings.dash=0;
ali@42	878	if (results->footerline>0 && results->firstline>0 &&
ali@42	879	results->footerline>results->firstline &&
ali@42	880	results->footerline-results->firstline<100)
ali@42	881	{
ali@42	882	printf(" --> I don't really know where this text starts. \n");
ali@42	883	printf(" There are no reference points.\n");
ali@42	884	printf(" I'm going to have to report the header and footer "
ali@42	885	"as well.\n");
ali@42	886	results->firstline=0;
ali@42	887	}
ali@42	888	return &warnings;
ali@42	889	}
ali@42	890
/*
 * Running tallies updated by analyse_quotes() for each line it scans.
 */
struct counters {
    long quot;                  /* double quotes seen */
    int c_unders;               /* underscores seen */
    int c_brack;                /* open minus close curly brackets */
    int s_brack;                /* open minus close square brackets */
    int r_brack;                /* open minus close round brackets */
    int open_single_quote;      /* single quotes judged to be opening */
    int close_single_quote;     /* single quotes judged to be closing */
};
ali@43	896
ali@43	897	/*
ali@43	898	* analyse_quotes:
ali@43	899	*
ali@43	900	* Look along the line, accumulate the count of quotes, and see
ali@43	901	* if this is an empty line - i.e. a line with nothing on it
ali@43	902	* but spaces.
ali@43	903	* If line has just spaces, period, * and/or - on it, don't
ali@43	904	* count it, since empty lines with asterisks or dashes to
ali@43	905	* separate sections are common.
ali@43	906	*
ali@43	907	* Returns: Non-zero if the line is empty.
ali@43	908	*/
ali@43	909	int analyse_quotes(const char s,struct counters counters)
ali@43	910	{
ali@43	911	signed int guessquote=0;
ali@43	912	int isemptyline=1; /* assume the line is empty until proven otherwise */
ali@43	913	while (*s)
ali@43	914	{
ali@43	915	if (*s==CHAR_DQUOTE)
ali@43	916	counters->quot++;
ali@43	917	if (s==CHAR_SQUOTE \|\| s==CHAR_OPEN_SQUOTE)
ali@43	918	{
ali@43	919	if (s==aline)
ali@43	920	{
ali@43	921	/*
ali@43	922	* At start of line, it can only be an openquote.
ali@43	923	* Hardcode a very common exception!
ali@43	924	*/
ali@43	925	if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
ali@43	926	counters->open_single_quote++;
ali@43	927	}
ali@43	928	else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
ali@43	929	/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43	930	;
ali@43	931	/* it's outside a word - let's check it out */
ali@43	932	else if (*s==CHAR_OPEN_SQUOTE \|\| gcisalpha(s[1]))
ali@43	933	{
ali@43	934	/* it damwell better BE an openquote */
ali@43	935	if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
ali@43	936	/* hardcode a very common exception! */
ali@43	937	counters->open_single_quote++;
ali@43	938	}
ali@43	939	else
ali@43	940	{
ali@43	941	/* now - is it a closequote? */
ali@43	942	guessquote=0; /* accumulate clues */
ali@43	943	if (gcisalpha(s[-1]))
ali@43	944	{
ali@43	945	/* it follows a letter - could be either */
ali@43	946	guessquote++;
ali@43	947	if (s[-1]=='s')
ali@43	948	{
ali@43	949	/* looks like a plural apostrophe */
ali@43	950	guessquote-=3;
ali@43	951	if (s[1]==CHAR_SPACE) /* bonus marks! */
ali@43	952	guessquote-=2;
ali@43	953	}
ali@43	954	}
ali@43	955	/* it doesn't have a letter either side */
ali@43	956	else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
ali@43	957	guessquote+=8; /* looks like a closequote */
ali@43	958	else
ali@43	959	guessquote++;
ali@43	960	if (counters->open_single_quote>counters->close_single_quote)
ali@43	961	/*
ali@43	962	* Give it the benefit of some doubt,
ali@43	963	* if a squote is already open.
ali@43	964	*/
ali@43	965	guessquote++;
ali@43	966	else
ali@43	967	guessquote--;
ali@43	968	if (guessquote>=0)
ali@43	969	counters->close_single_quote++;
ali@43	970	}
ali@43	971	}
ali@43	972	if (s!=CHAR_SPACE && s!='-' && s!='.' && s!=CHAR_ASTERISK &&
ali@43	973	s!=13 && s!=10)
ali@43	974	isemptyline=0; /* ignore lines like * * * as spacers */
ali@43	975	if (*s==CHAR_UNDERSCORE)
ali@43	976	counters->c_unders++;
ali@43	977	if (*s==CHAR_OPEN_CBRACK)
ali@43	978	counters->c_brack++;
ali@43	979	if (*s==CHAR_CLOSE_CBRACK)
ali@43	980	counters->c_brack--;
ali@43	981	if (*s==CHAR_OPEN_RBRACK)
ali@43	982	counters->r_brack++;
ali@43	983	if (*s==CHAR_CLOSE_RBRACK)
ali@43	984	counters->r_brack--;
ali@43	985	if (*s==CHAR_OPEN_SBRACK)
ali@43	986	counters->s_brack++;
ali@43	987	if (*s==CHAR_CLOSE_SBRACK)
ali@43	988	counters->s_brack--;
ali@43	989	s++;
ali@43	990	}
ali@43	991	return isemptyline;
ali@43	992	}
ali@43	993
ali@41	994	/*
ali@44	995	* check_for_odd_characters:
ali@44	996	*
ali@44	997	* Check for binary and other odd characters.
ali@44	998	*/
ali@44	999	void check_for_odd_characters(const char aline,const struct warnings warnings,
ali@44	1000	int isemptyline)
ali@44	1001	{
ali@44	1002	/* Don't repeat multiple warnings on one line. */
ali@44	1003	signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
ali@44	1004	const char *s;
ali@44	1005	unsigned char c;
ali@44	1006	for (s=aline;*s;s++)
ali@44	1007	{
ali@44	1008	c=(unsigned char )s;
ali@44	1009	if (!eNon_A && (s<CHAR_SPACE && s!=9 && *s!='\n' \|\| c>127))
ali@44	1010	{
ali@44	1011	if (pswit[ECHO_SWITCH])
ali@44	1012	printf("\n%s\n",aline);
ali@44	1013	if (!pswit[OVERVIEW_SWITCH])
ali@44	1014	if (c>127 && c<160)
ali@44	1015	printf(" Line %ld column %d - "
ali@44	1016	"Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
ali@44	1017	else
ali@44	1018	printf(" Line %ld column %d - Non-ASCII character %d\n",
ali@44	1019	linecnt,(int)(s-aline)+1,c);
ali@44	1020	else
ali@44	1021	cnt_bin++;
ali@44	1022	eNon_A=1;
ali@44	1023	}
ali@44	1024	if (!eTab && *s==CHAR_TAB)
ali@44	1025	{
ali@44	1026	if (pswit[ECHO_SWITCH])
ali@44	1027	printf("\n%s\n",aline);
ali@44	1028	if (!pswit[OVERVIEW_SWITCH])
ali@44	1029	printf(" Line %ld column %d - Tab character?\n",
ali@44	1030	linecnt,(int)(s-aline)+1);
ali@44	1031	else
ali@44	1032	cnt_odd++;
ali@44	1033	eTab=1;
ali@44	1034	}
ali@44	1035	if (!eTilde && *s==CHAR_TILDE)
ali@44	1036	{
ali@44	1037	/*
ali@44	1038	* Often used by OCR software to indicate an
ali@44	1039	* unrecognizable character.
ali@44	1040	*/
ali@44	1041	if (pswit[ECHO_SWITCH])
ali@44	1042	printf("\n%s\n",aline);
ali@44	1043	if (!pswit[OVERVIEW_SWITCH])
ali@44	1044	printf(" Line %ld column %d - Tilde character?\n",
ali@44	1045	linecnt,(int)(s-aline)+1);
ali@44	1046	else
ali@44	1047	cnt_odd++;
ali@44	1048	eTilde=1;
ali@44	1049	}
ali@44	1050	if (!eCarat && *s==CHAR_CARAT)
ali@44	1051	{
ali@44	1052	if (pswit[ECHO_SWITCH])
ali@44	1053	printf("\n%s\n",aline);
ali@44	1054	if (!pswit[OVERVIEW_SWITCH])
ali@44	1055	printf(" Line %ld column %d - Carat character?\n",
ali@44	1056	linecnt,(int)(s-aline)+1);
ali@44	1057	else
ali@44	1058	cnt_odd++;
ali@44	1059	eCarat=1;
ali@44	1060	}
ali@44	1061	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
ali@44	1062	{
ali@44	1063	if (pswit[ECHO_SWITCH])
ali@44	1064	printf("\n%s\n",aline);
ali@44	1065	if (!pswit[OVERVIEW_SWITCH])
ali@44	1066	printf(" Line %ld column %d - Forward slash?\n",
ali@44	1067	linecnt,(int)(s-aline)+1);
ali@44	1068	else
ali@44	1069	cnt_odd++;
ali@44	1070	eFSlash=1;
ali@44	1071	}
ali@44	1072	/*
ali@44	1073	* Report asterisks only in paranoid mode,
ali@44	1074	* since they're often deliberate.
ali@44	1075	*/
ali@44	1076	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@44	1077	*s==CHAR_ASTERISK)
ali@44	1078	{
ali@44	1079	if (pswit[ECHO_SWITCH])
ali@44	1080	printf("\n%s\n",aline);
ali@44	1081	if (!pswit[OVERVIEW_SWITCH])
ali@44	1082	printf(" Line %ld column %d - Asterisk?\n",
ali@44	1083	linecnt,(int)(s-aline)+1);
ali@44	1084	else
ali@44	1085	cnt_odd++;
ali@44	1086	eAst=1;
ali@44	1087	}
ali@44	1088	}
ali@44	1089	}
ali@44	1090
ali@44	1091	/*
ali@45	1092	* check_for_long_line:
ali@45	1093	*
ali@45	1094	* Check for line too long.
ali@45	1095	*/
ali@45	1096	void check_for_long_line(const char *aline)
ali@45	1097	{
ali@45	1098	if (strlen(aline)>LONGEST_PG_LINE)
ali@45	1099	{
ali@45	1100	if (pswit[ECHO_SWITCH])
ali@45	1101	printf("\n%s\n",aline);
ali@45	1102	if (!pswit[OVERVIEW_SWITCH])
ali@45	1103	printf(" Line %ld column %d - Long line %d\n",
ali@45	1104	linecnt,strlen(aline),strlen(aline));
ali@45	1105	else
ali@45	1106	cnt_long++;
ali@45	1107	}
ali@45	1108	}
ali@45	1109
/*
 * What check_for_short_line() needs to remember about earlier lines.
 */
struct line_properties {
    unsigned int len;   /* length of the last line examined */
    unsigned int blen;  /* length of the last line but one */
    char start;         /* first character of the last line examined */
};
ali@45	1114
ali@45	1115	/*
ali@45	1116	* check_for_short_line:
ali@45	1117	*
ali@45	1118	* Check for line too short.
ali@45	1119	*
ali@45	1120	* This one is a bit trickier to implement: we don't want to
ali@45	1121	* flag the last line of a paragraph for being short, so we
ali@45	1122	* have to wait until we know that our current line is a
ali@45	1123	* "normal" line, then report the _previous_ line if it was too
ali@45	1124	* short. We also don't want to report indented lines like
ali@45	1125	* chapter heads or formatted quotations. We therefore keep
ali@45	1126	* last->len as the length of the last line examined, and
ali@45	1127	* last->blen as the length of the last but one, and try to
ali@45	1128	* suppress unnecessary warnings by checking that both were of
ali@45	1129	* "normal" length. We keep the first character of the last
ali@45	1130	* line in last->start, and if it was a space, we assume that
ali@45	1131	* the formatting is deliberate. I can't figure out a way to
ali@45	1132	* distinguish something like a quoted verse left-aligned or
ali@45	1133	* the header or footer of a letter from a paragraph of short
ali@45	1134	* lines - maybe if I examined the whole paragraph, and if the
ali@45	1135	* para has less than, say, 8 lines and if all lines are short,
ali@45	1136	* then just assume it's OK? Need to look at some texts to see
ali@45	1137	* how often a formula like this would get the right result.
ali@45	1138	*/
ali@45	1139	void check_for_short_line(const char aline,const struct line_properties last)
ali@45	1140	{
ali@45	1141	if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
ali@45	1142	last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45	1143	{
ali@45	1144	if (pswit[ECHO_SWITCH])
ali@45	1145	printf("\n%s\n",prevline);
ali@45	1146	if (!pswit[OVERVIEW_SWITCH])
ali@45	1147	printf(" Line %ld column %d - Short line %d?\n",
ali@45	1148	linecnt-1,strlen(prevline),strlen(prevline));
ali@45	1149	else
ali@45	1150	cnt_short++;
ali@45	1151	}
ali@45	1152	}
ali@45	1153
ali@45	1154	/*
ali@46	1155	* check_for_starting_punctuation:
ali@46	1156	*
ali@46	1157	* Look for punctuation other than full ellipses at start of line.
ali@46	1158	*/
ali@46	1159	void check_for_starting_punctuation(const char *aline)
ali@46	1160	{
ali@46	1161	if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
ali@46	1162	{
ali@46	1163	if (pswit[ECHO_SWITCH])
ali@46	1164	printf("\n%s\n",aline);
ali@46	1165	if (!pswit[OVERVIEW_SWITCH])
ali@46	1166	printf(" Line %ld column 1 - Begins with punctuation?\n",
ali@46	1167	linecnt);
ali@46	1168	else
ali@46	1169	cnt_punct++;
ali@46	1170	}
ali@46	1171	}
ali@46	1172
ali@46	1173	/*
ali@47	1174	* check_for_spaced_emdash:
ali@47	1175	*
ali@47	1176	* Check for spaced em-dashes.
ali@47	1177	*
ali@47	1178	* We must check _all_ occurrences of "--" on the line
ali@47	1179	* hence the loop - even if the first double-dash is OK
ali@47	1180	* there may be another that's wrong later on.
ali@47	1181	*/
ali@47	1182	void check_for_spaced_emdash(const char *aline)
ali@47	1183	{
ali@47	1184	const char s,t;
ali@47	1185	s=aline;
ali@47	1186	while ((t=strstr(s,"--")))
ali@47	1187	{
ali@47	1188	if (t>aline && t[-1]==CHAR_SPACE \|\| t[2]==CHAR_SPACE)
ali@47	1189	{
ali@47	1190	if (pswit[ECHO_SWITCH])
ali@47	1191	printf("\n%s\n",aline);
ali@47	1192	if (!pswit[OVERVIEW_SWITCH])
ali@47	1193	printf(" Line %ld column %d - Spaced em-dash?\n",
ali@47	1194	linecnt,(int)(t-aline)+1);
ali@47	1195	else
ali@47	1196	cnt_dash++;
ali@47	1197	}
ali@47	1198	s=t+2;
ali@47	1199	}
ali@47	1200	}
ali@47	1201
ali@47	1202	/*
ali@47	1203	* check_for_spaced_dash:
ali@47	1204	*
ali@47	1205	* Check for spaced dashes.
ali@47	1206	*/
ali@47	1207	void check_for_spaced_dash(const char *aline)
ali@47	1208	{
ali@47	1209	const char *s;
ali@47	1210	if ((s=strstr(aline," -")))
ali@47	1211	{
ali@47	1212	if (s[2]!='-')
ali@47	1213	{
ali@47	1214	if (pswit[ECHO_SWITCH])
ali@47	1215	printf("\n%s\n",aline);
ali@47	1216	if (!pswit[OVERVIEW_SWITCH])
ali@47	1217	printf(" Line %ld column %d - Spaced dash?\n",
ali@47	1218	linecnt,(int)(s-aline)+1);
ali@47	1219	else
ali@47	1220	cnt_dash++;
ali@47	1221	}
ali@47	1222	}
ali@47	1223	else if ((s=strstr(aline,"- ")))
ali@47	1224	{
ali@47	1225	if (s==aline \|\| s[-1]!='-')
ali@47	1226	{
ali@47	1227	if (pswit[ECHO_SWITCH])
ali@47	1228	printf("\n%s\n",aline);
ali@47	1229	if (!pswit[OVERVIEW_SWITCH])
ali@47	1230	printf(" Line %ld column %d - Spaced dash?\n",
ali@47	1231	linecnt,(int)(s-aline)+1);
ali@47	1232	else
ali@47	1233	cnt_dash++;
ali@47	1234	}
ali@47	1235	}
ali@47	1236	}
ali@47	1237
ali@47	1238	/*
ali@48	1239	* check_for_unmarked_paragraphs:
ali@48	1240	*
ali@48	1241	* Check for unmarked paragraphs indicated by separate speakers.
ali@48	1242	*
ali@48	1243	* May well be false positive:
ali@48	1244	* "Bravo!" "Wonderful!" called the crowd.
ali@48	1245	* but useful all the same.
ali@48	1246	*/
ali@48	1247	void check_for_unmarked_paragraphs(const char *aline)
ali@48	1248	{
ali@48	1249	const char *s;
ali@48	1250	s=strstr(aline,"\" \"");
ali@48	1251	if (!s)
ali@48	1252	s=strstr(aline,"\" \"");
ali@48	1253	if (s)
ali@48	1254	{
ali@48	1255	if (pswit[ECHO_SWITCH])
ali@48	1256	printf("\n%s\n",aline);
ali@48	1257	if (!pswit[OVERVIEW_SWITCH])
ali@48	1258	printf(" Line %ld column %d - Query missing paragraph break?\n",
ali@48	1259	linecnt,(int)(s-aline)+1);
ali@48	1260	else
ali@48	1261	cnt_punct++;
ali@48	1262	}
ali@48	1263	}
ali@48	1264
ali@48	1265	/*
ali@49	1266	* check_for_jeebies:
ali@49	1267	*
ali@49	1268	* Check for "to he" and other easy h/b errors.
ali@49	1269	*
ali@49	1270	* This is a very inadequate effort on the h/b problem,
ali@49	1271	* but the phrase "to he" is always an error, whereas "to
ali@49	1272	* be" is quite common.
ali@49	1273	* Similarly, '"Quiet!", be said.' is a non-be error
ali@49	1274	* "to he" is _not_ always an error!:
ali@49	1275	* "Where they went to he couldn't say."
ali@49	1276	* Another false positive:
ali@49	1277	* What would "Cinderella" be without the . . .
ali@49	1278	* and another: "If he wants to he can see for himself."
ali@49	1279	*/
ali@49	1280	void check_for_jeebies(const char *aline)
ali@49	1281	{
ali@49	1282	const char *s;
ali@49	1283	s=strstr(aline," be could ");
ali@49	1284	if (!s)
ali@49	1285	s=strstr(aline," be would ");
ali@49	1286	if (!s)
ali@49	1287	s=strstr(aline," was be ");
ali@49	1288	if (!s)
ali@49	1289	s=strstr(aline," be is ");
ali@49	1290	if (!s)
ali@49	1291	s=strstr(aline," is be ");
ali@49	1292	if (!s)
ali@49	1293	s=strstr(aline,"\", be ");
ali@49	1294	if (!s)
ali@49	1295	s=strstr(aline,"\" be ");
ali@49	1296	if (!s)
ali@49	1297	s=strstr(aline,"\" be ");
ali@49	1298	if (!s)
ali@49	1299	s=strstr(aline," to he ");
ali@49	1300	if (s)
ali@49	1301	{
ali@49	1302	if (pswit[ECHO_SWITCH])
ali@49	1303	printf("\n%s\n",aline);
ali@49	1304	if (!pswit[OVERVIEW_SWITCH])
ali@49	1305	printf(" Line %ld column %d - Query he/be error?\n",
ali@49	1306	linecnt,(int)(s-aline)+1);
ali@49	1307	else
ali@49	1308	cnt_word++;
ali@49	1309	}
ali@49	1310	s=strstr(aline," the had ");
ali@49	1311	if (!s)
ali@49	1312	s=strstr(aline," a had ");
ali@49	1313	if (!s)
ali@49	1314	s=strstr(aline," they bad ");
ali@49	1315	if (!s)
ali@49	1316	s=strstr(aline," she bad ");
ali@49	1317	if (!s)
ali@49	1318	s=strstr(aline," he bad ");
ali@49	1319	if (!s)
ali@49	1320	s=strstr(aline," you bad ");
ali@49	1321	if (!s)
ali@49	1322	s=strstr(aline," i bad ");
ali@49	1323	if (s)
ali@49	1324	{
ali@49	1325	if (pswit[ECHO_SWITCH])
ali@49	1326	printf("\n%s\n",aline);
ali@49	1327	if (!pswit[OVERVIEW_SWITCH])
ali@49	1328	printf(" Line %ld column %d - Query had/bad error?\n",
ali@49	1329	linecnt,(int)(s-aline)+1);
ali@49	1330	else
ali@49	1331	cnt_word++;
ali@49	1332	}
ali@49	1333	s=strstr(aline,"; hut ");
ali@49	1334	if (!s)
ali@49	1335	s=strstr(aline,", hut ");
ali@49	1336	if (s)
ali@49	1337	{
ali@49	1338	if (pswit[ECHO_SWITCH])
ali@49	1339	printf("\n%s\n",aline);
ali@49	1340	if (!pswit[OVERVIEW_SWITCH])
ali@49	1341	printf(" Line %ld column %d - Query hut/but error?\n",
ali@49	1342	linecnt,(int)(s-aline)+1);
ali@49	1343	else
ali@49	1344	cnt_word++;
ali@49	1345	}
ali@49	1346	}
ali@49	1347
ali@49	1348	/*
ali@50	1349	* check_for_mta_from:
ali@50	1350	*
ali@50	1351	* Special case - angled bracket in front of "From" placed there by an
ali@50	1352	* MTA when sending an e-mail.
ali@50	1353	*/
ali@50	1354	void check_for_mta_from(const char *aline)
ali@50	1355	{
ali@50	1356	const char *s;
ali@50	1357	s=strstr(aline,">From");
ali@50	1358	if (s)
ali@50	1359	{
ali@50	1360	if (pswit[ECHO_SWITCH])
ali@50	1361	printf("\n%s\n",aline);
ali@50	1362	if (!pswit[OVERVIEW_SWITCH])
ali@50	1363	printf(" Line %ld column %d - Query angled bracket with From\n",
ali@50	1364	linecnt,(int)(s-aline)+1);
ali@50	1365	else
ali@50	1366	cnt_punct++;
ali@50	1367	}
ali@50	1368	}
ali@50	1369
ali@50	1370	/*
ali@51	1371	* check_for_orphan_character:
ali@51	1372	*
ali@51	1373	* Check for a single character line -
ali@51	1374	* often an overflow from bad wrapping.
ali@51	1375	*/
ali@51	1376	void check_for_orphan_character(const char *aline)
ali@51	1377	{
ali@51	1378	if (*aline && !aline[1])
ali@51	1379	{
ali@51	1380	if (aline=='I' \|\| aline=='V' \|\| aline=='X' \|\| aline=='L' \|\|
ali@51	1381	gcisdigit(*aline))
ali@51	1382	; /* Nothing - ignore numerals alone on a line. */
ali@51	1383	else
ali@51	1384	{
ali@51	1385	if (pswit[ECHO_SWITCH])
ali@51	1386	printf("\n%s\n",aline);
ali@51	1387	if (!pswit[OVERVIEW_SWITCH])
ali@51	1388	printf(" Line %ld column 1 - Query single character line\n",
ali@51	1389	linecnt);
ali@51	1390	else
ali@51	1391	cnt_punct++;
ali@51	1392	}
ali@51	1393	}
ali@51	1394	}
ali@51	1395
ali@51	1396	/*
ali@41	1397	* procfile:
ali@41	1398	*
ali@41	1399	* Process one file.
ali@41	1400	*/
ali@41	1401	void procfile(char *filename)
ali@41	1402	{
ali@45	1403	char s,t,s1,wordstart;
ali@41	1404	char inword[MAXWORDLEN],testword[MAXWORDLEN];
ali@41	1405	char parastart[81]; /* first line of current para */
ali@41	1406	FILE *infile;
ali@41	1407	struct first_pass_results *first_pass_results;
ali@42	1408	struct warnings *warnings;
ali@43	1409	struct counters counters={0};
ali@45	1410	struct line_properties last={0};
ali@43	1411	int isemptyline;
ali@43	1412	long squot,start_para_line;
ali@44	1413	signed int i,j,llen,isacro,isellipsis,istypo,alower;
ali@43	1414	signed int dquotepar,squotepar;
ali@41	1415	signed int isnewpara,vowel,consonant;
ali@41	1416	char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
ali@41	1417	cbrack_err[80],unders_err[80];
ali@41	1418	signed int qword_index,qperiod_index,isdup;
ali@41	1419	signed int enddash;
ali@45	1420	last.start=CHAR_SPACE;
ali@41	1421	dquote_err=squote_err=rbrack_err=cbrack_err=*sbrack_err=
ali@41	1422	unders_err=prevline=0;
ali@41	1423	linecnt=checked_linecnt=start_para_line=0;
ali@43	1424	squot=0;
ali@43	1425	i=llen=isacro=isellipsis=istypo=0;
ali@41	1426	isnewpara=vowel=consonant=enddash=0;
ali@41	1427	qword_index=qperiod_index=isdup=0;
ali@41	1428	inword=testword=0;
ali@43	1429	dquotepar=squotepar=0;
ali@41	1430	for (j=0;j<MAX_QWORD;j++)
ali@41	1431	{
ali@41	1432	dupcnt[j]=0;
ali@41	1433	for (i=0;i<MAX_QWORD_LENGTH;i++)
ali@41	1434	{
ali@41	1435	qword[i][j]=0;
ali@41	1436	qperiod[i][j]=0;
ali@41	1437	}
ali@41	1438	}
ali@41	1439	infile=fopen(filename,"rb");
ali@41	1440	if (!infile)
ali@41	1441	{
ali@41	1442	if (pswit[STDOUT_SWITCH])
ali@41	1443	fprintf(stdout,"bookloupe: cannot open %s\n",filename);
ali@41	1444	else
ali@41	1445	fprintf(stderr,"bookloupe: cannot open %s\n",filename);
ali@41	1446	exit(1);
ali@41	1447	}
ali@41	1448	fprintf(stdout,"\n\nFile: %s\n\n",filename);
ali@41	1449	first_pass_results=first_pass(infile);
ali@42	1450	warnings=report_first_pass(first_pass_results);
ali@42	1451	rewind(infile);
ali@40	1452	/*
ali@40	1453	* Here we go with the main pass. Hold onto yer hat!
ali@40	1454	* Re-init some variables we've dirtied.
ali@40	1455	*/
ali@43	1456	squot=linecnt=0;
ali@40	1457	while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
ali@40	1458	{
ali@0	1459	linecnt++;
ali@40	1460	if (linecnt==1)
ali@40	1461	isnewpara=1;
ali@40	1462	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
ali@40	1463	continue; // skip DP page separators completely
ali@41	1464	if (linecnt<first_pass_results->firstline \|\|
ali@41	1465	(first_pass_results->footerline>0 &&
ali@41	1466	linecnt>first_pass_results->footerline))
ali@40	1467	{
ali@40	1468	if (pswit[HEADER_SWITCH])
ali@40	1469	{
ali@40	1470	if (!strncmp(aline,"Title:",6))
ali@40	1471	printf(" %s\n",aline);
ali@40	1472	if (!strncmp(aline,"Author:",7))
ali@40	1473	printf(" %s\n",aline);
ali@40	1474	if (!strncmp(aline,"Release Date:",13))
ali@40	1475	printf(" %s\n",aline);
ali@40	1476	if (!strncmp(aline,"Edition:",8))
ali@40	1477	printf(" %s\n\n",aline);
ali@40	1478	}
ali@0	1479	continue; /* skip through the header */
ali@40	1480	}
ali@0	1481	checked_linecnt++;
ali@40	1482	s=aline;
ali@40	1483	/*
ali@40	1484	* If we are in a state of unbalanced quotes, and this line
ali@40	1485	* doesn't begin with a quote, output the stored error message.
ali@40	1486	* If the -P switch was used, print the warning even if the
ali@40	1487	* new para starts with quotes.
ali@40	1488	*/
ali@40	1489	t=s;
ali@40	1490	while (*t==' ')
ali@40	1491	t++;
ali@0	1492	if (*dquote_err)
ali@40	1493	if (*t!=CHAR_DQUOTE \|\| pswit[QPARA_SWITCH])
ali@40	1494	{
ali@40	1495	if (!pswit[OVERVIEW_SWITCH])
ali@40	1496	{
ali@40	1497	if (pswit[ECHO_SWITCH])
ali@40	1498	printf("\n%s\n",parastart);
ali@0	1499	printf(dquote_err);
ali@40	1500	}
ali@0	1501	else
ali@0	1502	cnt_dquot++;
ali@0	1503	}
ali@40	1504	if (*squote_err)
ali@40	1505	{
ali@40	1506	if (t!=CHAR_SQUOTE && t!=CHAR_OPEN_SQUOTE \|\|
ali@40	1507	pswit[QPARA_SWITCH] \|\| squot)
ali@40	1508	{
ali@40	1509	if (!pswit[OVERVIEW_SWITCH])
ali@40	1510	{
ali@40	1511	if (pswit[ECHO_SWITCH])
ali@40	1512	printf("\n%s\n",parastart);
ali@0	1513	printf(squote_err);
ali@40	1514	}
ali@0	1515	else
ali@0	1516	cnt_squot++;
ali@40	1517	}
ali@40	1518	squot=0;
ali@40	1519	}
ali@40	1520	if (*rbrack_err)
ali@40	1521	{
ali@40	1522	if (!pswit[OVERVIEW_SWITCH])
ali@40	1523	{
ali@40	1524	if (pswit[ECHO_SWITCH])
ali@40	1525	printf("\n%s\n",parastart);
ali@0	1526	printf(rbrack_err);
ali@40	1527	}
ali@0	1528	else
ali@0	1529	cnt_brack++;
ali@40	1530	}
ali@40	1531	if (*sbrack_err)
ali@40	1532	{
ali@40	1533	if (!pswit[OVERVIEW_SWITCH])
ali@40	1534	{
ali@40	1535	if (pswit[ECHO_SWITCH])
ali@40	1536	printf("\n%s\n",parastart);
ali@0	1537	printf(sbrack_err);
ali@40	1538	}
ali@0	1539	else
ali@0	1540	cnt_brack++;
ali@40	1541	}
ali@40	1542	if (*cbrack_err)
ali@40	1543	{
ali@40	1544	if (!pswit[OVERVIEW_SWITCH])
ali@40	1545	{
ali@40	1546	if (pswit[ECHO_SWITCH])
ali@40	1547	printf("\n%s\n",parastart);
ali@0	1548	printf(cbrack_err);
ali@40	1549	}
ali@0	1550	else
ali@0	1551	cnt_brack++;
ali@40	1552	}
ali@40	1553	if (*unders_err)
ali@40	1554	{
ali@40	1555	if (!pswit[OVERVIEW_SWITCH])
ali@40	1556	{
ali@40	1557	if (pswit[ECHO_SWITCH])
ali@40	1558	printf("\n%s\n",parastart);
ali@0	1559	printf(unders_err);
ali@40	1560	}
ali@0	1561	else
ali@0	1562	cnt_brack++;
ali@40	1563	}
ali@40	1564	dquote_err=squote_err=rbrack_err=cbrack_err=
ali@40	1565	sbrack_err=unders_err=0;
ali@43	1566	isemptyline=analyse_quotes(aline,&counters);
ali@40	1567	if (isnewpara && !isemptyline)
ali@40	1568	{
ali@40	1569	/* This line is the start of a new paragraph. */
ali@40	1570	start_para_line=linecnt;
ali@40	1571	/* Capture its first line in case we want to report it later. */
ali@40	1572	strncpy(parastart,aline,80);
ali@40	1573	parastart[79]=0;
ali@40	1574	dquotepar=squotepar=0; /* restart the quote count */
ali@40	1575	s=aline;
ali@40	1576	while (!gcisalpha(s) && !gcisdigit(s) && *s)
ali@40	1577	s++;
ali@40	1578	if (s>='a' && s<='z')
ali@40	1579	{
ali@40	1580	/* and its first letter is lowercase */
ali@40	1581	if (pswit[ECHO_SWITCH])
ali@40	1582	printf("\n%s\n",aline);
ali@0	1583	if (!pswit[OVERVIEW_SWITCH])
ali@40	1584	printf(" Line %ld column %d - "
ali@40	1585	"Paragraph starts with lower-case\n",
ali@40	1586	linecnt,(int)(s-aline)+1);
ali@0	1587	else
ali@0	1588	cnt_punct++;
ali@40	1589	}
ali@40	1590	isnewpara=0; /* Signal the end of new para processing. */
ali@40	1591	}
ali@40	1592	/* Check for an em-dash broken at line end. */
ali@40	1593	if (enddash && *aline=='-')
ali@40	1594	{
ali@40	1595	if (pswit[ECHO_SWITCH])
ali@40	1596	printf("\n%s\n",aline);
ali@0	1597	if (!pswit[OVERVIEW_SWITCH])
ali@40	1598	printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@0	1599	else
ali@0	1600	cnt_punct++;
ali@40	1601	}
ali@40	1602	enddash=0;
ali@40	1603	for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
ali@40	1604	;
ali@40	1605	if (s>=aline && *s=='-')
ali@40	1606	enddash=1;
ali@40	1607	/*
ali@40	1608	* Check for invalid or questionable characters in the line
ali@40	1609	* Anything above 127 is invalid for plain ASCII, and
ali@40	1610	* non-printable control characters should also be flagged.
ali@40	1611	* Tabs should generally not be there.
ali@40	1612	*/
ali@40	1613	for (s=aline;*s;s++)
ali@40	1614	{
ali@40	1615	i=(unsigned char)*s;
ali@40	1616	if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
ali@40	1617	{
ali@40	1618	if (pswit[ECHO_SWITCH])
ali@40	1619	printf("\n%s\n",aline);
ali@0	1620	if (!pswit[OVERVIEW_SWITCH])
ali@40	1621	printf(" Line %ld column %d - Control character %d\n",
ali@40	1622	linecnt,(int)(s-aline)+1,i);
ali@0	1623	else
ali@0	1624	cnt_bin++;
ali@40	1625	}
ali@40	1626	}
ali@42	1627	if (warnings->bin)
ali@44	1628	check_for_odd_characters(aline,warnings,isemptyline);
ali@42	1629	if (warnings->longline)
ali@45	1630	check_for_long_line(aline);
ali@45	1631	if (warnings->shortline)
ali@45	1632	check_for_short_line(aline,&last);
ali@45	1633	last.blen=last.len;
ali@45	1634	last.len=strlen(aline);
ali@45	1635	last.start=aline[0];
ali@46	1636	check_for_starting_punctuation(aline);
ali@42	1637	if (warnings->dash)
ali@40	1638	{
ali@47	1639	check_for_spaced_emdash(aline);
ali@47	1640	check_for_spaced_dash(aline);
ali@40	1641	}
ali@48	1642	check_for_unmarked_paragraphs(aline);
ali@49	1643	check_for_jeebies(aline);
ali@50	1644	check_for_mta_from(aline);
ali@51	1645	check_for_orphan_character(aline);
ali@40	1646	/* Check for I" - often should be ! */
ali@40	1647	if (strstr(aline," I\""))
ali@40	1648	{
ali@40	1649	if (pswit[ECHO_SWITCH])
ali@40	1650	printf("\n%s\n",aline);
ali@0	1651	if (!pswit[OVERVIEW_SWITCH])
ali@40	1652	printf(" Line %ld column %ld - Query I=exclamation mark?\n",
ali@40	1653	linecnt,strstr(aline," I\"")-aline);
ali@0	1654	else
ali@0	1655	cnt_punct++;
ali@40	1656	}
ali@40	1657	/*
ali@40	1658	* Check for period without a capital letter. Cut-down from gutspell.
ali@40	1659	* Only works when it happens on a single line.
ali@40	1660	*/
ali@40	1661	if (pswit[PARANOID_SWITCH])
ali@40	1662	{
ali@40	1663	for (t=s=aline;strstr(t,". ");)
ali@40	1664	{
ali@40	1665	t=strstr(t,". ");
ali@40	1666	if (t==s)
ali@40	1667	{
ali@40	1668	t++;
ali@40	1669	/* start of line punctuation is handled elsewhere */
ali@40	1670	continue;
ali@40	1671	}
ali@40	1672	if (!gcisalpha(t[-1]))
ali@40	1673	{
ali@40	1674	t++;
ali@40	1675	continue;
ali@40	1676	}
ali@42	1677	if (warnings->isDutch)
ali@40	1678	{
ali@40	1679	/* For Frank & Jeroen -- 's Middags case */
ali@40	1680	if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
ali@40	1681	t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
ali@40	1682	{
ali@40	1683	t++;
ali@40	1684	continue;
ali@40	1685	}
ali@40	1686	}
ali@40	1687	s1=t+2;
ali@40	1688	while (s1 && !gcisalpha(s1) && !isdigit(*s1))
ali@40	1689	s1++;
ali@40	1690	if (s1>='a' && s1<='z')
ali@40	1691	{
ali@40	1692	/* we have something to investigate */
ali@40	1693	istypo=1;
ali@40	1694	/* so let's go back and find out */
ali@40	1695	for (s1=t-1;s1>=s &&
ali@40	1696	(gcisalpha(s1) \|\| gcisdigit(s1) \|\| *s1==CHAR_SQUOTE &&
ali@40	1697	gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
ali@40	1698	;
ali@40	1699	s1++;
ali@40	1700	for (i=0;s1 && s1!='.';s1++,i++)
ali@40	1701	testword[i]=*s1;
ali@40	1702	testword[i]=0;
ali@40	1703	for (i=0;*abbrev[i];i++)
ali@40	1704	if (!strcmp(testword,abbrev[i]))
ali@40	1705	istypo=0;
ali@40	1706	if (gcisdigit(*testword))
ali@40	1707	istypo=0;
ali@40	1708	if (!testword[1])
ali@40	1709	istypo=0;
ali@40	1710	if (isroman(testword))
ali@40	1711	istypo=0;
ali@40	1712	if (istypo)
ali@40	1713	{
ali@40	1714	istypo=0;
ali@40	1715	for (i=0;testword[i];i++)
ali@40	1716	if (strchr(vowels,testword[i]))
ali@40	1717	istypo=1;
ali@40	1718	}
ali@40	1719	if (istypo)
ali@40	1720	{
ali@40	1721	isdup=0;
ali@40	1722	if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@40	1723	!pswit[VERBOSE_SWITCH])
ali@40	1724	for (i=0;i<qperiod_index;i++)
ali@40	1725	if (!strcmp(testword,qperiod[i]))
ali@40	1726	isdup=1;
ali@40	1727	if (!isdup)
ali@40	1728	{
ali@40	1729	if (qperiod_index<MAX_QWORD &&
ali@40	1730	strlen(testword)<MAX_QWORD_LENGTH)
ali@40	1731	{
ali@40	1732	strcpy(qperiod[qperiod_index],testword);
ali@40	1733	qperiod_index++;
ali@40	1734	}
ali@40	1735	if (pswit[ECHO_SWITCH])
ali@40	1736	printf("\n%s\n",aline);
ali@40	1737	if (!pswit[OVERVIEW_SWITCH])
ali@40	1738	printf(" Line %ld column %d - "
ali@40	1739	"Extra period?\n",linecnt,(int)(t-aline)+1);
ali@40	1740	else
ali@40	1741	cnt_punct++;
ali@40	1742	}
ali@40	1743	}
ali@40	1744	}
ali@40	1745	t++;
ali@40	1746	}
ali@40	1747	}
ali@40	1748	if (pswit[TYPO_SWITCH])
ali@40	1749	{
ali@40	1750	/* Check for words usually not followed by punctuation. */
ali@40	1751	for (s=aline;*s;)
ali@40	1752	{
ali@40	1753	wordstart=s;
ali@40	1754	s=getaword(s,inword);
ali@40	1755	if (!*inword)
ali@40	1756	continue;
ali@40	1757	lowerit(inword);
ali@40	1758	for (i=0;*nocomma[i];i++)
ali@40	1759	if (!strcmp(inword,nocomma[i]))
ali@40	1760	{
ali@40	1761	if (s==',' \|\| s==';' \|\| *s==':')
ali@40	1762	{
ali@40	1763	if (pswit[ECHO_SWITCH])
ali@40	1764	printf("\n%s\n",aline);
ali@40	1765	if (!pswit[OVERVIEW_SWITCH])
ali@40	1766	printf(" Line %ld column %d - "
ali@40	1767	"Query punctuation after %s?\n",
ali@40	1768	linecnt,(int)(s-aline)+1,inword);
ali@40	1769	else
ali@40	1770	cnt_punct++;
ali@40	1771	}
ali@40	1772	}
ali@40	1773	for (i=0;*noperiod[i];i++)
ali@40	1774	if (!strcmp(inword,noperiod[i]))
ali@40	1775	{
ali@40	1776	if (s=='.' \|\| s=='!')
ali@40	1777	{
ali@40	1778	if (pswit[ECHO_SWITCH])
ali@40	1779	printf("\n%s\n",aline);
ali@40	1780	if (!pswit[OVERVIEW_SWITCH])
ali@40	1781	printf(" Line %ld column %d - "
ali@40	1782	"Query punctuation after %s?\n",
ali@40	1783	linecnt,(int)(s-aline)+1,inword);
ali@40	1784	else
ali@40	1785	cnt_punct++;
ali@40	1786	}
ali@40	1787	}
ali@40	1788	}
ali@40	1789	}
ali@40	1790	/*
ali@40	1791	* Check for commonly mistyped words,
ali@40	1792	* and digits like 0 for O in a word.
ali@40	1793	*/
ali@40	1794	for (s=aline;*s;)
ali@40	1795	{
ali@40	1796	wordstart=s;
ali@40	1797	s=getaword(s,inword);
ali@40	1798	if (!*inword)
ali@40	1799	continue; /* don't bother with empty lines */
ali@40	1800	if (mixdigit(inword))
ali@40	1801	{
ali@40	1802	if (pswit[ECHO_SWITCH])
ali@40	1803	printf("\n%s\n",aline);
ali@40	1804	if (!pswit[OVERVIEW_SWITCH])
ali@42	1805	printf(" Line %ld column %d - Query digit in %s\n",
ali@40	1806	linecnt,(int)(wordstart-aline)+1,inword);
ali@40	1807	else
ali@40	1808	cnt_word++;
ali@40	1809	}
ali@40	1810	/*
ali@40	1811	* Put the word through a series of tests for likely typos and OCR
ali@40	1812	* errors.
ali@40	1813	*/
ali@40	1814	if (pswit[TYPO_SWITCH])
ali@40	1815	{
ali@40	1816	istypo=0;
ali@40	1817	strcpy(testword,inword);
ali@40	1818	alower=0;
ali@40	1819	for (i=0;i<(signed int)strlen(testword);i++)
ali@40	1820	{
ali@40	1821	/* lowercase for testing */
ali@40	1822	if (testword[i]>='a' && testword[i]<='z')
ali@40	1823	alower=1;
ali@40	1824	if (alower && testword[i]>='A' && testword[i]<='Z')
ali@40	1825	{
ali@40	1826	/*
ali@40	1827	* We have an uppercase mid-word. However, there are
ali@40	1828	* common cases:
ali@40	1829	* Mac and Mc like McGill
ali@40	1830	* French contractions like l'Abbe
ali@40	1831	*/
ali@40	1832	if (i==2 && testword[0]=='m' && testword[1]=='c' \|\|
ali@40	1833	i==3 && testword[0]=='m' && testword[1]=='a' &&
ali@40	1834	testword[2]=='c' \|\| i>0 && testword[i-1]==CHAR_SQUOTE)
ali@40	1835	; /* do nothing! */
ali@40	1836	else
ali@40	1837	istypo=1;
ali@40	1838	}
ali@40	1839	testword[i]=(char)tolower(testword[i]);
ali@40	1840	}
ali@40	1841	/*
ali@40	1842	* Check for certain unlikely two-letter combinations at word
ali@40	1843	* start and end.
ali@40	1844	*/
ali@40	1845	if (strlen(testword)>1)
ali@40	1846	{
ali@40	1847	for (i=0;*nostart[i];i++)
ali@40	1848	if (!strncmp(testword,nostart[i],2))
ali@40	1849	istypo=1;
ali@40	1850	for (i=0;*noend[i];i++)
ali@40	1851	if (!strncmp(testword+strlen(testword)-2,noend[i],2))
ali@40	1852	istypo=1;
ali@40	1853	}
ali@40	1854	/* ght is common, gbt never. Like that. */
ali@40	1855	if (strstr(testword,"cb"))
ali@40	1856	istypo=1;
ali@40	1857	if (strstr(testword,"gbt"))
ali@40	1858	istypo=1;
ali@40	1859	if (strstr(testword,"pbt"))
ali@40	1860	istypo=1;
ali@40	1861	if (strstr(testword,"tbs"))
ali@40	1862	istypo=1;
ali@40	1863	if (strstr(testword,"mrn"))
ali@40	1864	istypo=1;
ali@40	1865	if (strstr(testword,"ahle"))
ali@40	1866	istypo=1;
ali@40	1867	if (strstr(testword,"ihle"))
ali@40	1868	istypo=1;
ali@40	1869	/*
ali@40	1870	* "TBE" does happen - like HEARTBEAT - but uncommon.
ali@40	1871	* Also "TBI" - frostbite, outbid - but uncommon.
ali@40	1872	* Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@40	1873	* numerals, but "ii" is a common scanno.
ali@40	1874	*/
ali@40	1875	if (strstr(testword,"tbi"))
ali@40	1876	istypo=1;
ali@40	1877	if (strstr(testword,"tbe"))
ali@40	1878	istypo=1;
ali@40	1879	if (strstr(testword,"ii"))
ali@40	1880	istypo=1;
ali@40	1881	/*
ali@40	1882	* Check for no vowels or no consonants.
ali@40	1883	* If none, flag a typo.
ali@40	1884	*/
ali@40	1885	if (!istypo && strlen(testword)>1)
ali@40	1886	{
ali@40	1887	vowel=consonant=0;
ali@40	1888	for (i=0;testword[i];i++)
ali@40	1889	{
ali@40	1890	if (testword[i]=='y' \|\| gcisdigit(testword[i]))
ali@40	1891	{
ali@40	1892	/* Yah, this is loose. */
ali@40	1893	vowel++;
ali@40	1894	consonant++;
ali@40	1895	}
ali@40	1896	else if (strchr(vowels,testword[i]))
ali@40	1897	vowel++;
ali@40	1898	else
ali@40	1899	consonant++;
ali@40	1900	}
ali@40	1901	if (!vowel \|\| !consonant)
ali@40	1902	istypo=1;
ali@40	1903	}
ali@40	1904	/*
ali@40	1905	* Now exclude the word from being reported if it's in
ali@40	1906	* the okword list.
ali@40	1907	*/
ali@40	1908	for (i=0;*okword[i];i++)
ali@40	1909	if (!strcmp(testword,okword[i]))
ali@40	1910	istypo=0;
ali@40	1911	/*
ali@40	1912	* What looks like a typo may be a Roman numeral.
ali@40	1913	* Exclude these.
ali@40	1914	*/
ali@40	1915	if (istypo && isroman(testword))
ali@40	1916	istypo=0;
ali@40	1917	/* Check the manual list of typos. */
ali@40	1918	if (!istypo)
ali@40	1919	for (i=0;*typo[i];i++)
ali@40	1920	if (!strcmp(testword,typo[i]))
ali@40	1921	istypo=1;
ali@40	1922	/*
ali@40	1923	* Check lowercase s, l, i and m - special cases.
ali@40	1924	* "j" - often a semi-colon gone wrong.
ali@40	1925	* "d" for a missing apostrophe - he d
ali@40	1926	* "n" for "in"
ali@40	1927	*/
ali@40	1928	if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
ali@40	1929	istypo=1;
ali@40	1930	if (istypo)
ali@40	1931	{
ali@40	1932	isdup=0;
ali@40	1933	if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@40	1934	!pswit[VERBOSE_SWITCH])
ali@40	1935	for (i=0;i<qword_index;i++)
ali@40	1936	if (!strcmp(testword,qword[i]))
ali@40	1937	{
ali@40	1938	isdup=1;
ali@40	1939	++dupcnt[i];
ali@40	1940	}
ali@40	1941	if (!isdup)
ali@40	1942	{
ali@40	1943	if (qword_index<MAX_QWORD &&
ali@40	1944	strlen(testword)<MAX_QWORD_LENGTH)
ali@40	1945	{
ali@40	1946	strcpy(qword[qword_index],testword);
ali@40	1947	qword_index++;
ali@40	1948	}
ali@40	1949	if (pswit[ECHO_SWITCH])
ali@40	1950	printf("\n%s\n",aline);
ali@40	1951	if (!pswit[OVERVIEW_SWITCH])
ali@40	1952	{
ali@40	1953	printf(" Line %ld column %d - Query word %s",
ali@40	1954	linecnt,(int)(wordstart-aline)+1,inword);
ali@40	1955	if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@40	1956	!pswit[VERBOSE_SWITCH])
ali@40	1957	printf(" - not reporting duplicates");
ali@40	1958	printf("\n");
ali@40	1959	}
ali@40	1960	else
ali@40	1961	cnt_word++;
ali@40	1962	}
ali@40	1963	}
ali@40	1964	}
ali@40	1965	/* check the user's list of typos */
ali@40	1966	if (!istypo && usertypo_count)
ali@40	1967	for (i=0;i<usertypo_count;i++)
ali@40	1968	if (!strcmp(testword,usertypo[i]))
ali@40	1969	{
ali@40	1970	if (pswit[ECHO_SWITCH])
ali@40	1971	printf("\n%s\n",aline);
ali@40	1972	if (!pswit[OVERVIEW_SWITCH])
ali@40	1973	printf(" Line %ld column %d - "
ali@40	1974	"Query possible scanno %s\n",
ali@40	1975	linecnt,(int)(wordstart-aline)+2,inword);
ali@40	1976	}
ali@42	1977	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@40	1978	{
ali@40	1979	/* In paranoid mode, query all 0 and 1 standing alone. */
ali@40	1980	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@40	1981	{
ali@40	1982	if (pswit[ECHO_SWITCH])
ali@40	1983	printf("\n%s\n",aline);
ali@40	1984	if (!pswit[OVERVIEW_SWITCH])
ali@40	1985	printf(" Line %ld column %d - Query standalone %s\n",
ali@40	1986	linecnt,(int)(wordstart-aline)+2,inword);
ali@40	1987	else
ali@40	1988	cnt_word++;
ali@40	1989	}
ali@40	1990	}
ali@40	1991	}
ali@40	1992	/*
ali@40	1993	* Look for added or missing spaces around punctuation and quotes.
ali@40	1994	* If there is a punctuation character like ! with no space on
ali@40	1995	* either side, suspect a missing!space. If there are spaces on
ali@40	1996	* both sides , assume a typo. If we see a double quote with no
ali@40	1997	* space or punctuation on either side of it, assume unspaced
ali@40	1998	* quotes "like"this.
ali@40	1999	*/
ali@40	2000	llen=strlen(aline);
ali@40	2001	for (i=1;i<llen;i++)
ali@40	2002	{
ali@40	2003	/* For each character in the line after the first. */
ali@40	2004	if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
ali@40	2005	{
ali@40	2006	/* we need to suppress warnings for acronyms like M.D. */
ali@40	2007	isacro=0;
ali@40	2008	/* we need to suppress warnings for ellipsis . . . */
ali@40	2009	isellipsis=0;
ali@40	2010	/* if there are letters on both sides of it or ... */
ali@40	2011	if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) \|\|
ali@40	2012	gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
ali@40	2013	{
ali@40	2014	/* ...if it's strict punctuation followed by an alpha */
ali@40	2015	if (aline[i]=='.')
ali@40	2016	{
ali@40	2017	if (i>2 && aline[i-2]=='.')
ali@40	2018	isacro=1;
ali@40	2019	if (i+2<llen && aline[i+2]=='.')
ali@40	2020	isacro=1;
ali@40	2021	}
ali@40	2022	if (!isacro)
ali@40	2023	{
ali@40	2024	if (pswit[ECHO_SWITCH])
ali@40	2025	printf("\n%s\n",aline);
ali@40	2026	if (!pswit[OVERVIEW_SWITCH])
ali@40	2027	printf(" Line %ld column %d - Missing space?\n",
ali@40	2028	linecnt,i+1);
ali@40	2029	else
ali@40	2030	cnt_punct++;
ali@40	2031	}
ali@40	2032	}
ali@40	2033	if (aline[i-1]==CHAR_SPACE &&
ali@40	2034	(aline[i+1]==CHAR_SPACE \|\| aline[i+1]==0))
ali@40	2035	{
ali@40	2036	/*
ali@40	2037	* If there are spaces on both sides,
ali@40	2038	* or space before and end of line.
ali@40	2039	*/
ali@40	2040	if (aline[i]=='.')
ali@40	2041	{
ali@40	2042	if (i>2 && aline[i-2]=='.')
ali@40	2043	isellipsis=1;
ali@40	2044	if (i+2<llen && aline[i+2]=='.')
ali@40	2045	isellipsis=1;
ali@40	2046	}
ali@40	2047	if (!isemptyline && !isellipsis)
ali@40	2048	{
ali@40	2049	if (pswit[ECHO_SWITCH])
ali@40	2050	printf("\n%s\n",aline);
ali@40	2051	if (!pswit[OVERVIEW_SWITCH])
ali@40	2052	printf(" Line %ld column %d - "
ali@40	2053	"Spaced punctuation?\n",linecnt,i+1);
ali@40	2054	else
ali@40	2055	cnt_punct++;
ali@40	2056	}
ali@40	2057	}
ali@40	2058	}
ali@40	2059	}
ali@40	2060	/* Split out the characters that CANNOT be preceded by space. */
ali@40	2061	llen=strlen(aline);
ali@40	2062	for (i=1;i<llen;i++)
ali@40	2063	{
ali@40	2064	/* for each character in the line after the first */
ali@40	2065	if (strchr("?!,;:",aline[i]))
ali@40	2066	{
ali@40	2067	/* if it's punctuation that _cannot_ have a space before it */
ali@40	2068	if (aline[i-1]==CHAR_SPACE && !isemptyline &&
ali@40	2069	aline[i+1]!=CHAR_SPACE)
ali@40	2070	{
ali@40	2071	/*
ali@40	2072	* If aline[i+1) DOES == space,
ali@40	2073	* it was already reported just above.
ali@40	2074	*/
ali@40	2075	if (pswit[ECHO_SWITCH])
ali@40	2076	printf("\n%s\n",aline);
ali@40	2077	if (!pswit[OVERVIEW_SWITCH])
ali@40	2078	printf(" Line %ld column %d - Spaced punctuation?\n",
ali@40	2079	linecnt,i+1);
ali@40	2080	else
ali@40	2081	cnt_punct++;
ali@40	2082	}
ali@40	2083	}
ali@40	2084	}
ali@40	2085	/*
ali@40	2086	* Special case " .X" where X is any alpha.
ali@40	2087	* This plugs a hole in the acronym code above.
ali@40	2088	* Inelegant, but maintainable.
ali@40	2089	*/
ali@40	2090	llen=strlen(aline);
ali@40	2091	for (i=1;i<llen;i++)
ali@40	2092	{
ali@40	2093	/* for each character in the line after the first */
ali@40	2094	if (aline[i]=='.')
ali@40	2095	{
ali@40	2096	/* if it's a period */
ali@40	2097	if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
ali@40	2098	{
ali@40	2099	/*
ali@40	2100	* If the period follows a space and
ali@40	2101	* is followed by a letter.
ali@40	2102	*/
ali@40	2103	if (pswit[ECHO_SWITCH])
ali@40	2104	printf("\n%s\n",aline);
ali@40	2105	if (!pswit[OVERVIEW_SWITCH])
ali@40	2106	printf(" Line %ld column %d - Spaced punctuation?\n",
ali@40	2107	linecnt,i+1);
ali@40	2108	else
ali@40	2109	cnt_punct++;
ali@40	2110	}
ali@40	2111	}
ali@40	2112	}
ali@40	2113	for (i=1;i<llen;i++)
ali@40	2114	{
ali@40	2115	/* for each character in the line after the first */
ali@40	2116	if (aline[i]==CHAR_DQUOTE)
ali@40	2117	{
ali@40	2118	if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
ali@40	2119	!strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] \|\|
ali@40	2120	!strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
ali@40	2121	{
ali@40	2122	if (pswit[ECHO_SWITCH])
ali@40	2123	printf("\n%s\n",aline);
ali@40	2124	if (!pswit[OVERVIEW_SWITCH])
ali@40	2125	printf(" Line %ld column %d - Unspaced quotes?\n",
ali@40	2126	linecnt,i+1);
ali@40	2127	else
ali@40	2128	cnt_punct++;
ali@40	2129	}
ali@40	2130	}
ali@40	2131	}
ali@40	2132	/* Check parity of quotes. */
ali@40	2133	for (s=aline;*s;s++)
ali@40	2134	{
ali@40	2135	if (*s==CHAR_DQUOTE)
ali@40	2136	{
ali@40	2137	if (!(dquotepar=!dquotepar))
ali@40	2138	{
ali@40	2139	/* parity even */
ali@40	2140	if (!strchr("_-.'`/,;:!?)]} ",s[1]))
ali@40	2141	{
ali@40	2142	if (pswit[ECHO_SWITCH])
ali@40	2143	printf("\n%s\n",aline);
ali@40	2144	if (!pswit[OVERVIEW_SWITCH])
ali@40	2145	printf(" Line %ld column %d - "
ali@40	2146	"Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@40	2147	else
ali@40	2148	cnt_punct++;
ali@40	2149	}
ali@40	2150	}
ali@40	2151	else
ali@40	2152	{
ali@40	2153	/* parity odd */
ali@40	2154	if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@40	2155	!strchr("_-/.'`([{$",s[1]) \|\| !s[1])
ali@40	2156	{
ali@40	2157	if (pswit[ECHO_SWITCH])
ali@40	2158	printf("\n%s\n",aline);
ali@40	2159	if (!pswit[OVERVIEW_SWITCH])
ali@40	2160	printf(" Line %ld column %d - "
ali@40	2161	"Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@40	2162	else
ali@40	2163	cnt_punct++;
ali@40	2164	}
ali@40	2165	}
ali@40	2166	}
ali@40	2167	}
ali@40	2168	if (*aline==CHAR_DQUOTE)
ali@40	2169	{
ali@40	2170	if (strchr(",;:!?)]} ",aline[1]))
ali@40	2171	{
ali@40	2172	if (pswit[ECHO_SWITCH])
ali@40	2173	printf("\n%s\n",aline);
ali@40	2174	if (!pswit[OVERVIEW_SWITCH])
ali@40	2175	printf(" Line %ld column 1 - Wrongspaced quotes?\n",
ali@42	2176	linecnt);
ali@40	2177	else
ali@40	2178	cnt_punct++;
ali@40	2179	}
ali@40	2180	}
ali@40	2181	if (pswit[SQUOTE_SWITCH])
ali@40	2182	{
ali@40	2183	for (s=aline;*s;s++)
ali@40	2184	{
ali@40	2185	if ((s==CHAR_SQUOTE \|\| s==CHAR_OPEN_SQUOTE) &&
ali@40	2186	(s==aline \|\| s>aline && !gcisalpha(s[-1]) \|\|
ali@40	2187	!gcisalpha(s[1])))
ali@40	2188	{
ali@40	2189	if (!(squotepar=!squotepar))
ali@40	2190	{
ali@40	2191	/* parity even */
ali@40	2192	if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
ali@40	2193	{
ali@40	2194	if (pswit[ECHO_SWITCH])
ali@40	2195	printf("\n%s\n",aline);
ali@40	2196	if (!pswit[OVERVIEW_SWITCH])
ali@40	2197	printf(" Line %ld column %d - "
ali@40	2198	"Wrongspaced singlequotes?\n",
ali@40	2199	linecnt,(int)(s-aline)+1);
ali@40	2200	else
ali@40	2201	cnt_punct++;
ali@40	2202	}
ali@40	2203	}
ali@40	2204	else
ali@40	2205	{
ali@40	2206	/* parity odd */
ali@40	2207	if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@40	2208	!strchr("_-/\".'`",s[1]) \|\| !s[1])
ali@40	2209	{
ali@40	2210	if (pswit[ECHO_SWITCH])
ali@40	2211	printf("\n%s\n",aline);
ali@40	2212	if (!pswit[OVERVIEW_SWITCH])
ali@40	2213	printf(" Line %ld column %d - "
ali@40	2214	"Wrongspaced singlequotes?\n",
ali@40	2215	linecnt,(int)(s-aline)+1);
ali@40	2216	else
ali@40	2217	cnt_punct++;
ali@40	2218	}
ali@40	2219	}
ali@40	2220	}
ali@40	2221	}
ali@40	2222	}
ali@40	2223	/*
ali@40	2224	* Look for double punctuation like ,. or ,,
ali@40	2225	* Thanks to DW for the suggestion!
ali@40	2226	* In books with references, ".," and ".;" are common
ali@40	2227	* e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@40	2228	* OTOH, from my initial tests, there are also fairly
ali@40	2229	* common errors. What to do? Make these cases paranoid?
ali@42	2230	* ".," is the most common, so warnings->dotcomma is used
ali@40	2231	* to suppress detailed reporting if it occurs often.
ali@40	2232	*/
ali@40	2233	llen=strlen(aline);
ali@40	2234	for (i=0;i<llen;i++)
ali@40	2235	{
ali@40	2236	/* for each punctuation character in the line */
ali@40	2237	if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&
ali@40	2238	aline[i] && aline[i+1])
ali@40	2239	{
ali@40	2240	/* followed by punctuation, it's a query, unless . . . */
ali@40	2241	if (aline[i]==aline[i+1] && (aline[i]=='.' \|\| aline[i]=='?' \|\|
ali@40	2242	aline[i]=='!') \|\|
ali@42	2243	!warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' \|\|
ali@42	2244	warnings->isFrench && !strncmp(aline+i,",...",4) \|\|
ali@42	2245	warnings->isFrench && !strncmp(aline+i,"...,",4) \|\|
ali@42	2246	warnings->isFrench && !strncmp(aline+i,";...",4) \|\|
ali@42	2247	warnings->isFrench && !strncmp(aline+i,"...;",4) \|\|
ali@42	2248	warnings->isFrench && !strncmp(aline+i,":...",4) \|\|
ali@42	2249	warnings->isFrench && !strncmp(aline+i,"...:",4) \|\|
ali@42	2250	warnings->isFrench && !strncmp(aline+i,"!...",4) \|\|
ali@42	2251	warnings->isFrench && !strncmp(aline+i,"...!",4) \|\|
ali@42	2252	warnings->isFrench && !strncmp(aline+i,"?...",4) \|\|
ali@42	2253	warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@40	2254	{
ali@42	2255	if (warnings->isFrench && !strncmp(aline+i,",...",4) \|\|
ali@42	2256	warnings->isFrench && !strncmp(aline+i,"...,",4) \|\|
ali@42	2257	warnings->isFrench && !strncmp(aline+i,";...",4) \|\|
ali@42	2258	warnings->isFrench && !strncmp(aline+i,"...;",4) \|\|
ali@42	2259	warnings->isFrench && !strncmp(aline+i,":...",4) \|\|
ali@42	2260	warnings->isFrench && !strncmp(aline+i,"...:",4) \|\|
ali@42	2261	warnings->isFrench && !strncmp(aline+i,"!...",4) \|\|
ali@42	2262	warnings->isFrench && !strncmp(aline+i,"...!",4) \|\|
ali@42	2263	warnings->isFrench && !strncmp(aline+i,"?...",4) \|\|
ali@42	2264	warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@40	2265	i+=4;
ali@40	2266	; /* do nothing for .. !! and ?? which can be legit */
ali@40	2267	}
ali@40	2268	else
ali@40	2269	{
ali@40	2270	if (pswit[ECHO_SWITCH])
ali@40	2271	printf("\n%s\n",aline);
ali@40	2272	if (!pswit[OVERVIEW_SWITCH])
ali@40	2273	printf(" Line %ld column %d - Double punctuation?\n",
ali@40	2274	linecnt,i+1);
ali@40	2275	else
ali@40	2276	cnt_punct++;
ali@40	2277	}
ali@40	2278	}
ali@40	2279	}
ali@40	2280	s=aline;
ali@40	2281	while (strstr(s," \" "))
ali@40	2282	{
ali@40	2283	if (pswit[ECHO_SWITCH])
ali@40	2284	printf("\n%s\n",aline);
ali@0	2285	if (!pswit[OVERVIEW_SWITCH])
ali@40	2286	printf(" Line %ld column %d - Spaced doublequote?\n",
ali@40	2287	linecnt,(int)(strstr(s," \" ")-aline+1));
ali@0	2288	else
ali@0	2289	cnt_punct++;
ali@40	2290	s=strstr(s," \" ")+2;
ali@40	2291	}
ali@40	2292	s=aline;
ali@40	2293	while (strstr(s," ' "))
ali@40	2294	{
ali@40	2295	if (pswit[ECHO_SWITCH])
ali@40	2296	printf("\n%s\n",aline);
ali@40	2297	if (!pswit[OVERVIEW_SWITCH])
ali@40	2298	printf(" Line %ld column %d - Spaced singlequote?\n",
ali@40	2299	linecnt,(int)(strstr(s," ' ")-aline+1));
ali@40	2300	else
ali@40	2301	cnt_punct++;
ali@40	2302	s=strstr(s," ' ")+2;
ali@40	2303	}
ali@40	2304	s=aline;
ali@40	2305	while (strstr(s," ` "))
ali@40	2306	{
ali@40	2307	if (pswit[ECHO_SWITCH])
ali@40	2308	printf("\n%s\n",aline);
ali@40	2309	if (!pswit[OVERVIEW_SWITCH])
ali@40	2310	printf(" Line %ld column %d - Spaced singlequote?\n",
ali@40	2311	linecnt,(int)(strstr(s," ` ")-aline+1));
ali@40	2312	else
ali@40	2313	cnt_punct++;
ali@40	2314	s=strstr(s," ` ")+2;
ali@40	2315	}
ali@40	2316	/* check special case of 'S instead of 's at end of word */
ali@40	2317	s=aline+1;
ali@40	2318	while (*s)
ali@40	2319	{
ali@40	2320	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
ali@40	2321	{
ali@40	2322	if (pswit[ECHO_SWITCH])
ali@40	2323	printf("\n%s\n",aline);
ali@0	2324	if (!pswit[OVERVIEW_SWITCH])
ali@40	2325	printf(" Line %ld column %d - Capital \"S\"?\n",
ali@40	2326	linecnt,(int)(s-aline+2));
ali@0	2327	else
ali@0	2328	cnt_punct++;
ali@40	2329	}
ali@40	2330	s++;
ali@40	2331	}
ali@40	2332	/*
ali@40	2333	* Now check special cases - start and end of line -
ali@40	2334	* for single and double quotes. Start is sometimes [sic]
ali@40	2335	* but better to query it anyway.
ali@40	2336	* While we're here, check for dash at end of line.
ali@40	2337	*/
ali@40	2338	llen=strlen(aline);
ali@40	2339	if (llen>1)
ali@40	2340	{
ali@40	2341	if (aline[llen-1]==CHAR_DQUOTE \|\| aline[llen-1]==CHAR_SQUOTE \|\|
ali@40	2342	aline[llen-1]==CHAR_OPEN_SQUOTE)
ali@40	2343	if (aline[llen-2]==CHAR_SPACE)
ali@40	2344	{
ali@40	2345	if (pswit[ECHO_SWITCH])
ali@40	2346	printf("\n%s\n",aline);
ali@0	2347	if (!pswit[OVERVIEW_SWITCH])
ali@40	2348	printf(" Line %ld column %d - Spaced quote?\n",
ali@40	2349	linecnt,llen);
ali@0	2350	else
ali@0	2351	cnt_punct++;
ali@40	2352	}
ali@40	2353	if ((aline[0]==CHAR_SQUOTE \|\| aline[0]==CHAR_OPEN_SQUOTE) &&
ali@40	2354	aline[1]==CHAR_SPACE)
ali@40	2355	{
ali@40	2356	if (pswit[ECHO_SWITCH])
ali@40	2357	printf("\n%s\n",aline);
ali@40	2358	if (!pswit[OVERVIEW_SWITCH])
ali@40	2359	printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
ali@40	2360	else
ali@40	2361	cnt_punct++;
ali@40	2362	}
ali@40	2363	/*
ali@40	2364	* Dash at end of line may well be legit - paranoid mode only
ali@40	2365	* and don't report em-dash at line-end.
ali@40	2366	*/
ali@42	2367	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@40	2368	{
ali@40	2369	for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@40	2370	;
ali@40	2371	if (aline[i]=='-' && aline[i-1]!='-')
ali@40	2372	{
ali@40	2373	if (pswit[ECHO_SWITCH])
ali@40	2374	printf("\n%s\n",aline);
ali@0	2375	if (!pswit[OVERVIEW_SWITCH])
ali@40	2376	printf(" Line %ld column %d - "
ali@40	2377	"Hyphen at end of line?\n",linecnt,i);
ali@40	2378	}
ali@40	2379	}
ali@40	2380	}
ali@40	2381	/*
ali@40	2382	* Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@40	2383	* If so, suspect a scanno like "a]most".
ali@40	2384	*/
ali@40	2385	llen=strlen(aline);
ali@40	2386	for (i=1;i<llen-1;i++)
ali@40	2387	{
ali@40	2388	/* for each bracket character in the line except 1st & last */
ali@40	2389	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
ali@40	2390	gcisalpha(aline[i+1]))
ali@40	2391	{
ali@40	2392	if (pswit[ECHO_SWITCH])
ali@40	2393	printf("\n%s\n",aline);
ali@0	2394	if (!pswit[OVERVIEW_SWITCH])
ali@40	2395	printf(" Line %ld column %d - Unspaced bracket?\n",
ali@40	2396	linecnt,i);
ali@0	2397	else
ali@0	2398	cnt_punct++;
ali@40	2399	}
ali@40	2400	}
ali@40	2401	llen=strlen(aline);
ali@42	2402	if (warnings->endquote)
ali@40	2403	{
ali@40	2404	for (i=1;i<llen;i++)
ali@40	2405	{
ali@40	2406	/* for each character in the line except 1st */
ali@40	2407	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@40	2408	{
ali@40	2409	if (pswit[ECHO_SWITCH])
ali@40	2410	printf("\n%s\n",aline);
ali@40	2411	if (!pswit[OVERVIEW_SWITCH])
ali@40	2412	printf(" Line %ld column %d - "
ali@40	2413	"endquote missing punctuation?\n",linecnt,i);
ali@40	2414	else
ali@40	2415	cnt_punct++;
ali@40	2416	}
ali@40	2417	}
ali@40	2418	}
ali@40	2419	/*
ali@40	2420	* Check for <HTML TAG>.
ali@40	2421	* If there is a < in the line, followed at some point
ali@40	2422	* by a > then we suspect HTML.
ali@40	2423	*/
ali@40	2424	if (strstr(aline,"<") && strstr(aline,">"))
ali@40	2425	{
ali@40	2426	i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
ali@40	2427	if (i>0)
ali@40	2428	{
ali@40	2429	strncpy(wrk,strstr(aline,"<"),i);
ali@40	2430	wrk[i]=0;
ali@40	2431	if (pswit[ECHO_SWITCH])
ali@40	2432	printf("\n%s\n",aline);
ali@0	2433	if (!pswit[OVERVIEW_SWITCH])
ali@40	2434	printf(" Line %ld column %d - HTML Tag? %s \n",
ali@40	2435	linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
ali@0	2436	else
ali@0	2437	cnt_html++;
ali@40	2438	}
ali@40	2439	}
ali@40	2440	/*
ali@40	2441	* Check for &symbol; HTML.
ali@40	2442	* If there is a & in the line, followed at
ali@40	2443	* some point by a ; then we suspect HTML.
ali@40	2444	*/
ali@40	2445	if (strstr(aline,"&") && strstr(aline,";"))
ali@40	2446	{
ali@40	2447	i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
ali@40	2448	for (s=strstr(aline,"&");s<strstr(aline,";");s++)
ali@40	2449	if (*s==CHAR_SPACE)
ali@40	2450	i=0; /* Don't report "Jones & Son;" */
ali@40	2451	if (i>0)
ali@40	2452	{
ali@40	2453	strncpy(wrk,strstr(aline,"&"),i);
ali@40	2454	wrk[i]=0;
ali@40	2455	if (pswit[ECHO_SWITCH])
ali@40	2456	printf("\n%s\n",aline);
ali@0	2457	if (!pswit[OVERVIEW_SWITCH])
ali@40	2458	printf(" Line %ld column %d - HTML symbol? %s \n",
ali@40	2459	linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
ali@0	2460	else
ali@0	2461	cnt_html++;
ali@40	2462	}
ali@40	2463	}
ali@40	2464	/*
ali@40	2465	* At end of paragraph, check for mismatched quotes.
ali@40	2466	* We don't want to report an error immediately, since it is a
ali@40	2467	* common convention to omit the quotes at end of paragraph if
ali@40	2468	* the next paragraph is a continuation of the same speaker.
ali@40	2469	* Where this is the case, the next para should begin with a
ali@40	2470	* quote, so we store the warning message and only display it
ali@40	2471	* at the top of the next iteration if the new para doesn't
ali@40	2472	* start with a quote.
ali@40	2473	* The -p switch overrides this default, and warns of unclosed
ali@40	2474	* quotes on _every_ paragraph, whether the next begins with a
ali@40	2475	* quote or not.
ali@40	2476	*/
ali@40	2477	if (isemptyline)
ali@40	2478	{
ali@40	2479	/* end of para - add up the totals */
ali@43	2480	if (counters.quot%2)
ali@40	2481	sprintf(dquote_err," Line %ld - Mismatched quotes\n",
ali@40	2482	linecnt);
ali@43	2483	if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
ali@43	2484	counters.open_single_quote!=counters.close_single_quote)
ali@40	2485	sprintf(squote_err," Line %ld - Mismatched singlequotes?\n",
ali@40	2486	linecnt);
ali@43	2487	if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
ali@43	2488	counters.open_single_quote!=counters.close_single_quote &&
ali@43	2489	counters.open_single_quote!=counters.close_single_quote+1)
ali@40	2490	/*
ali@40	2491	* Flag it to be noted regardless of the
ali@40	2492	* first char of the next para.
ali@40	2493	*/
ali@40	2494	squot=1;
ali@43	2495	if (counters.r_brack)
ali@40	2496	sprintf(rbrack_err," Line %ld - "
ali@40	2497	"Mismatched round brackets?\n",linecnt);
ali@43	2498	if (counters.s_brack)
ali@40	2499	sprintf(sbrack_err," Line %ld - "
ali@40	2500	"Mismatched square brackets?\n",linecnt);
ali@43	2501	if (counters.c_brack)
ali@40	2502	sprintf(cbrack_err," Line %ld - "
ali@40	2503	"Mismatched curly brackets?\n",linecnt);
ali@43	2504	if (counters.c_unders%2)
ali@40	2505	sprintf(unders_err," Line %ld - Mismatched underscores?\n",
ali@40	2506	linecnt);
ali@43	2507	memset(&counters,0,sizeof(counters));
ali@40	2508	/* let the next iteration know that it's starting a new para */
ali@40	2509	isnewpara=1;
ali@40	2510	}
ali@40	2511	/*
ali@40	2512	* Check for omitted punctuation at end of paragraph by working back
ali@40	2513	* through prevline. DW.
ali@40	2514	* Need to check this only for "normal" paras.
ali@40	2515	* So what is a "normal" para?
ali@40	2516	* Not normal if one-liner (chapter headings, etc.)
ali@40	2517	* Not normal if doesn't contain at least one locase letter
ali@40	2518	* Not normal if starts with space
ali@40	2519	*/
ali@40	2520	if (isemptyline)
ali@40	2521	{
ali@40	2522	/* end of para */
ali@40	2523	for (s=prevline,i=0;*s && !i;s++)
ali@0	2524	if (gcisletter(*s))
ali@40	2525	/* use i to indicate the presence of a letter on the line */
ali@40	2526	i=1;
ali@40	2527	/*
ali@40	2528	* This next "if" is a problem.
ali@40	2529	* If we say "start_para_line <= linecnt - 1", that includes
ali@40	2530	* one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@40	2531	* If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@40	2532	* misses genuine one-line paragraphs.
ali@40	2533	*/
ali@45	2534	if (i && last.blen>2 && start_para_line<linecnt-1 &&
ali@40	2535	*prevline>CHAR_SPACE)
ali@40	2536	{
ali@40	2537	for (i=strlen(prevline)-1;
ali@40	2538	(prevline[i]==CHAR_DQUOTE \|\| prevline[i]==CHAR_SQUOTE) &&
ali@40	2539	prevline[i]>CHAR_SPACE && i>0;
ali@40	2540	i--)
ali@40	2541	;
ali@40	2542	for (;i>0;i--)
ali@40	2543	{
ali@40	2544	if (gcisalpha(prevline[i]))
ali@40	2545	{
ali@40	2546	if (pswit[ECHO_SWITCH])
ali@40	2547	printf("\n%s\n",prevline);
ali@0	2548	if (!pswit[OVERVIEW_SWITCH])
ali@40	2549	printf(" Line %ld column %d - "
ali@40	2550	"No punctuation at para end?\n",
ali@40	2551	linecnt-1,strlen(prevline));
ali@0	2552	else
ali@0	2553	cnt_punct++;
ali@0	2554	break;
ali@40	2555	}
ali@40	2556	if (strchr("-.:!([{?}])",prevline[i]))
ali@0	2557	break;
ali@40	2558	}
ali@40	2559	}
ali@40	2560	}
ali@40	2561	strcpy(prevline,aline);
ali@0	2562	}
ali@40	2563	fclose(infile);
ali@0	2564	if (!pswit[OVERVIEW_SWITCH])
ali@40	2565	for (i=0;i<MAX_QWORD;i++)
ali@0	2566	if (dupcnt[i])
ali@40	2567	printf("\nNote: Queried word %s was duplicated %d time%s\n",
ali@40	2568	qword[i],dupcnt[i],"s");
ali@0	2569	}
ali@0	2570
ali@40	2571	/*
ali@40	2572	* flgets:
ali@40	2573	*
ali@40	2574	* Get one line from the input stream, checking for
ali@40	2575	* the existence of exactly one CR/LF line-end per line.
ali@40	2576	*
ali@40	2577	* Returns: a pointer to the line.
ali@40	2578	*/
ali@40	2579	char flgets(char theline,int maxlen,FILE *thefile,long lcnt)
ali@0	2580	{
ali@0	2581	char c;
ali@40	2582	int len,isCR,cint;
ali@40	2583	*theline=0;
ali@40	2584	len=isCR=0;
ali@40	2585	c=cint=fgetc(thefile);
ali@40	2586	do
ali@40	2587	{
ali@40	2588	if (cint==EOF)
ali@40	2589	return NULL;
ali@40	2590	/* either way, it's end of line */
ali@40	2591	if (c==10)
ali@40	2592	{
ali@0	2593	if (isCR)
ali@0	2594	break;
ali@40	2595	else
ali@40	2596	{
ali@40	2597	/* Error - a LF without a preceding CR */
ali@40	2598	if (pswit[LINE_END_SWITCH])
ali@40	2599	{
ali@40	2600	if (pswit[ECHO_SWITCH])
ali@40	2601	printf("\n%s\n",theline);
ali@0	2602	if (!pswit[OVERVIEW_SWITCH])
ali@40	2603	printf(" Line %ld - No CR?\n",lcnt);
ali@0	2604	else
ali@0	2605	cnt_lineend++;
ali@40	2606	}
ali@0	2607	break;
ali@40	2608	}
ali@40	2609	}
ali@40	2610	if (c==13)
ali@40	2611	{
ali@40	2612	if (isCR)
ali@40	2613	{
ali@40	2614	/* Error - two successive CRs */
ali@40	2615	if (pswit[LINE_END_SWITCH])
ali@40	2616	{
ali@40	2617	if (pswit[ECHO_SWITCH])
ali@40	2618	printf("\n%s\n",theline);
ali@0	2619	if (!pswit[OVERVIEW_SWITCH])
ali@40	2620	printf(" Line %ld - Two successive CRs?\n",lcnt);
ali@0	2621	else
ali@0	2622	cnt_lineend++;
ali@40	2623	}
ali@40	2624	}
ali@40	2625	isCR=1;
ali@40	2626	}
ali@40	2627	else
ali@40	2628	{
ali@40	2629	if (pswit[LINE_END_SWITCH] && isCR)
ali@40	2630	{
ali@40	2631	if (pswit[ECHO_SWITCH])
ali@40	2632	printf("\n%s\n",theline);
ali@0	2633	if (!pswit[OVERVIEW_SWITCH])
ali@40	2634	printf(" Line %ld column %d - CR without LF?\n",
ali@40	2635	lcnt,len+1);
ali@0	2636	else
ali@0	2637	cnt_lineend++;
ali@40	2638	}
ali@40	2639	theline[len]=c;
ali@40	2640	len++;
ali@40	2641	theline[len]=0;
ali@40	2642	isCR=0;
ali@40	2643	}
ali@40	2644	c=cint=fgetc(thefile);
ali@40	2645	} while(len<maxlen);
ali@0	2646	if (pswit[MARKUP_SWITCH])
ali@0	2647	postprocess_for_HTML(theline);
ali@0	2648	if (pswit[DP_SWITCH])
ali@0	2649	postprocess_for_DP(theline);
ali@40	2650	return theline;
ali@0	2651	}
ali@0	2652
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it
 * contains a mixture of alpha and digits. Generally, this is an
 * error, but may not be for cases like 4th or L5 12s. 3d.
 *
 * checkword - NUL-terminated word to examine; it is not modified.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    char *s;
    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
    {
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    }
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        wl=strlen(checkword);
        /* firstdigits = length of the leading run of digits */
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /* digits, ending in st, rd, nd, th of either case */
        if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
          matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
          matchword(checkword+wl-2,"th")))
            query=0;
        if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
          matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
          matchword(checkword+wl-3,"ths")))
            query=0;
        /*
         * A four-letter suffix needs firstdigits+4==wl; this also keeps
         * checkword+wl-4 inside the word.
         */
        if (firstdigits+4==wl && (matchword(checkword+wl-4,"stly") ||
          matchword(checkword+wl-4,"rdly") ||
          matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
            query=0;
        /* digits, ending in l, L, s or d */
        if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
          checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
ali@0	2709
ali@40	2710	/*
ali@40	2711	* getaword:
ali@40	2712	*
ali@40	2713	* Extracts the first/next "word" from the line, and puts
ali@40	2714	* it into "thisword". A word is defined as one English word unit--or
ali@40	2715	* at least that's the aim.
ali@40	2716	*
ali@40	2717	* Returns: a pointer to the position in the line where we will start
ali@40	2718	* looking for the next word.
ali@40	2719	*/
ali@40	2720	char getaword(char fromline,char *thisword)
ali@0	2721	{
ali@40	2722	int i,wordlen;
ali@0	2723	char *s;
ali@40	2724	wordlen=0;
ali@40	2725	for (;!gcisdigit(fromline) && !gcisalpha(fromline) && *fromline;
ali@40	2726	fromline++)
ali@40	2727	;
ali@40	2728	/*
ali@40	2729	* Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40	2730	* Especially yucky is the case of L1,000
ali@40	2731	* This section looks for a pattern of characters including a digit
ali@40	2732	* followed by a comma or period followed by one or more digits.
ali@40	2733	* If found, it returns this whole pattern as a word; otherwise we discard
ali@40	2734	* the results and resume our normal programming.
ali@40	2735	*/
ali@40	2736	s=fromline;
ali@40	2737	for (;(gcisdigit(s) \|\| gcisalpha(s) \|\| s==',' \|\| s=='.') &&
ali@40	2738	wordlen<MAXWORDLEN;s++)
ali@40	2739	{
ali@40	2740	thisword[wordlen]=*s;
ali@0	2741	wordlen++;
ali@40	2742	}
ali@40	2743	thisword[wordlen]=0;
ali@40	2744	for (i=1;i<wordlen-1;i++)
ali@40	2745	{
ali@40	2746	if (thisword[i]=='.' \|\| thisword[i]==',')
ali@40	2747	{
ali@40	2748	if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
ali@40	2749	{
ali@40	2750	fromline=s;
ali@40	2751	return fromline;
ali@40	2752	}
ali@40	2753	}
ali@40	2754	}
ali@0	2755	/* we didn't find a punctuated number - do the regular getword thing */
ali@40	2756	wordlen=0;
ali@40	2757	for (;(gcisdigit(fromline) \|\| gcisalpha(fromline) \|\| *fromline=='\'') &&
ali@40	2758	wordlen<MAXWORDLEN;fromline++)
ali@40	2759	{
ali@40	2760	thisword[wordlen]=*fromline;
ali@0	2761	wordlen++;
ali@40	2762	}
ali@40	2763	thisword[wordlen]=0;
ali@40	2764	return fromline;
ali@0	2765	}
ali@0	2766
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * checkfor, thisword - the two NUL-terminated strings to compare.
 *
 * Returns: 1 if both strings have the same length and are equal when
 * case is ignored, otherwise 0.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len;
    len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /* <ctype.h> functions need an unsigned char value */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0;
    }
    return 1;
}
ali@0	2783
/*
 * lowerit:
 *
 * Lowercase the line.
 *
 * theline - NUL-terminated string, converted in place. Only the letters
 *           'A'..'Z' are changed; every other byte is left alone.
 */
void lowerit(char *theline)
{
    for (;*theline;theline++)
    {
        if (*theline>='A' && *theline<='Z')
            *theline+='a'-'A';
    }
}
ali@0	2796
/*
 * isroman:
 *
 * Is this word a Roman Numeral?
 *
 * It doesn't actually validate that the number is a valid Roman Numeral--for
 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
 * expressions thereof, except when it came to taxes. Allow any number of M,
 * an optional D, an optional CM or CD, any number of optional Cs, an optional
 * XL or an optional XC, an optional L, any number of optional Xs, an optional
 * IX or IV, an optional V and any number of optional Is.
 *
 * t - the word to test. Only lower-case letters are recognized.
 *
 * Returns: 1 if the whole word is consumed by the pattern above,
 * 0 otherwise (including for NULL or an empty string).
 */
int isroman(char *t)
{
    if (!t || !*t)
        return 0;
    while (*t=='m')
        t++;
    if (*t=='d')
        t++;
    if (*t=='c' && t[1]=='m')
        t+=2;
    if (*t=='c' && t[1]=='d')
        t+=2;
    while (*t=='c')
        t++;
    if (*t=='x' && t[1]=='l')
        t+=2;
    if (*t=='x' && t[1]=='c')
        t+=2;
    if (*t=='l')
        t++;
    while (*t=='x')
        t++;
    if (*t=='i' && t[1]=='x')
        t+=2;
    if (*t=='i' && t[1]=='v')
        t+=2;
    if (*t=='v')
        t++;
    while (*t=='i')
        t++;
    /* a Roman numeral leaves nothing unconsumed */
    return !*t;
}
ali@0	2845
/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * If we use the standard function, 8-bit accented characters break
 * words, so that tete with accented characters appears to be two words, "t"
 * and "t", with 8-bit characters between them. This causes over-reporting of
 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
 *
 * Returns: 1 if c is treated as a letter, 0 otherwise.
 */
int gcisalpha(unsigned char c)
{
    if (c>='a' && c<='z')
        return 1;
    if (c>='A' && c<='Z')
        return 1;
    /* nothing else below 140 counts as a letter */
    if (c<140)
        return 0;
    /* 192 and above, apart from the seven codes excluded here */
    if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
        return 1;
    /* the few codes in 140..159 that are accepted */
    if (c==140 || c==142 || c==156 || c==158 || c==159)
        return 1;
    return 0;
}
ali@0	2870
/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts.
 *
 * Returns: 1 if c is one of '0'..'9', 0 otherwise.
 */
int gcisdigit(unsigned char c)
{
    return c>='0' && c<='9';
}
ali@0	2880
/*
 * gcisletter:
 *
 * A version of isletter() that doesn't get confused in 8-bit texts.
 * NB: this is ISO-8859-1-specific.
 *
 * Returns: 1 if c is 'A'..'Z', 'a'..'z' or any code from 192 upwards,
 * 0 otherwise.
 */
int gcisletter(unsigned char c)
{
    return (c>='A' && c<='Z') || (c>='a' && c<='z') || c>=192;
}
ali@0	2891
/*
 * gcstrchr:
 *
 * Wraps strchr to return NULL if the character being searched for is zero.
 * (Plain strchr would instead return a pointer to the terminating NUL.)
 *
 * s - NUL-terminated string to search.
 * c - character to look for.
 *
 * Returns: a pointer to the first occurrence of c in s, or NULL if c is
 * zero or does not occur.
 */
char *gcstrchr(char *s,char c)
{
    if (!c)
        return NULL;
    return strchr(s,c);
}
ali@0	2903
ali@40	2904	/*
ali@40	2905	* postprocess_for_DP:
ali@40	2906	*
ali@40	2907	* Invoked with the -d switch from flgets().
ali@40	2908	* It simply "removes" from the line a hard-coded set of common
ali@40	2909	* DP-specific tags, so that the line passed to the main routine has
ali@40	2910	* been pre-cleaned of DP markup.
ali@40	2911	*/
ali@0	2912	void postprocess_for_DP(char *theline)
ali@0	2913	{
ali@40	2914	char s,t;
ali@0	2915	int i;
ali@0	2916	if (!*theline)
ali@0	2917	return;
ali@40	2918	for (i=0;*DPmarkup[i];i++)
ali@40	2919	{
ali@40	2920	s=strstr(theline,DPmarkup[i]);
ali@40	2921	while (s)
ali@40	2922	{
ali@40	2923	t=s+strlen(DPmarkup[i]);
ali@40	2924	while (*t)
ali@40	2925	{
ali@40	2926	s=t;
ali@40	2927	t++;
ali@40	2928	s++;
ali@40	2929	}
ali@40	2930	*s=0;
ali@40	2931	s=strstr(theline,DPmarkup[i]);
ali@40	2932	}
ali@40	2933	}
ali@0	2934	}
ali@0	2935
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * It simply "removes" from the line a hard-coded set of common
 * HTML tags and "replaces" a hard-coded set of common HTML
 * entities, so that the line passed to the main routine has
 * been pre-cleaned of HTML.
 *
 * theline - NUL-terminated line, edited in place.
 */
void postprocess_for_HTML(char *theline)
{
    /* only look for tags if the line has both a '<' and a '>' */
    if (strchr(theline,'<') && strchr(theline,'>'))
    {
        /* each call handles at most one tag; repeat until it returns NULL */
        while (losemarkup(theline))
        {
        }
    }
    /* likewise one entity per call */
    while (loseentities(theline))
    {
    }
}
ali@0	2953
ali@0	2954	char losemarkup(char theline)
ali@0	2955	{
ali@40	2956	char s,t;
ali@0	2957	int i;
ali@0	2958	if (!*theline)
ali@40	2959	return NULL;
ali@40	2960	s=strstr(theline,"<");
ali@40	2961	t=strstr(theline,">");
ali@40	2962	if (!s \|\| !t)
ali@40	2963	return NULL;
ali@40	2964	for (i=0;*markup[i];i++)
ali@40	2965	if (!tagcomp(s+1,markup[i]))
ali@40	2966	{
ali@40	2967	if (!t[1])
ali@40	2968	{
ali@40	2969	*s=0;
ali@40	2970	return s;
ali@40	2971	}
ali@40	2972	else if (t>s)
ali@40	2973	{
ali@40	2974	strcpy(s,t+1);
ali@40	2975	return s;
ali@40	2976	}
ali@0	2977	}
ali@40	2978	/* It's an unrecognized <xxx>. */
ali@40	2979	return NULL;
ali@0	2980	}
ali@0	2981
ali@0	2982	char loseentities(char theline)
ali@0	2983	{
ali@0	2984	int i;
ali@40	2985	char s,t;
ali@0	2986	if (!*theline)
ali@40	2987	return NULL;
ali@40	2988	for (i=0;*entities[i].htmlent;i++)
ali@40	2989	{
ali@40	2990	s=strstr(theline,entities[i].htmlent);
ali@40	2991	if (s)
ali@40	2992	{
ali@40	2993	t=malloc((size_t)strlen(s));
ali@40	2994	if (!t)
ali@40	2995	return NULL;
ali@40	2996	strcpy(t,s+strlen(entities[i].htmlent));
ali@40	2997	strcpy(s,entities[i].textent);
ali@40	2998	strcat(s,t);
ali@0	2999	free(t);
ali@40	3000	return theline;
ali@40	3001	}
ali@40	3002	}
ali@40	3003	for (i=0;*entities[i].htmlnum;i++)
ali@40	3004	{
ali@40	3005	s=strstr(theline,entities[i].htmlnum);
ali@40	3006	if (s)
ali@40	3007	{
ali@40	3008	t=malloc((size_t)strlen(s));
ali@40	3009	if (!t)
ali@40	3010	return NULL;
ali@40	3011	strcpy(t,s+strlen(entities[i].htmlnum));
ali@40	3012	strcpy(s,entities[i].textent);
ali@40	3013	strcat(s,t);
ali@0	3014	free(t);
ali@40	3015	return theline;
ali@40	3016	}
ali@40	3017	}
ali@40	3018	return NULL;
ali@0	3019	}
ali@0	3020
/*
 * tagcomp:
 *
 * Case-insensitive comparison of the start of a tag with a tag name.
 *
 * strin   - text following a '<'; one leading '/' is skipped.
 * basetag - tag name to compare against.
 *
 * The comparison stops at the end of whichever string is shorter, so
 * only that common length is checked.
 *
 * Returns: 0 if no difference was found, 1 at the first mismatch.
 */
int tagcomp(char *strin,char *basetag)
{
    char *s,*t;
    s=basetag;
    t=strin;
    if (*t=='/')
        t++; /* ignore a slash */
    while (*s && *t)
    {
        /* <ctype.h> functions need an unsigned char value */
        if (tolower((unsigned char)*s)!=tolower((unsigned char)*t))
            return 1;
        s++;
        t++;
    }
    return 0;
}
ali@0	3037
ali@40	3038	void proghelp()
ali@0	3039	{
ali@40	3040	fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40	3041	fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
ali@40	3042	fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
ali@40	3043	fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40	3044	"For details, read the file COPYING.\n",stderr);
ali@40	3045	fputs("This is Free Software; "
ali@40	3046	"you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40	3047	fputs("read the file COPYING for details.\n\n",stderr);
ali@40	3048	fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
ali@40	3049	fputs(" where -s checks single quotes, -e suppresses echoing lines, "
ali@40	3050	"-t checks typos\n",stderr);
ali@40	3051	fputs(" -x (paranoid) switches OFF -t and extra checks, "
ali@40	3052	"-l turns OFF line-end checks\n",stderr);
ali@40	3053	fputs(" -o just displays overview without detail, "
ali@40	3054	"-h echoes header fields\n",stderr);
ali@40	3055	fputs(" -v (verbose) unsuppresses duplicate reporting, "
ali@40	3056	"-m suppresses markup\n",stderr);
ali@0	3057	fputs(" -d ignores DP-specific markup,\n",stderr);
ali@40	3058	fputs(" -u uses a file gutcheck.typ to query user-defined "
ali@40	3059	"possible typos\n",stderr);
ali@40	3060	fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
ali@0	3061	fputs("\n",stderr);
ali@40	3062	fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
ali@40	3063	stderr);
ali@40	3064	fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40	3065	"non-ASCII\n",stderr);
ali@40	3066	fputs("characters like accented letters, "
ali@40	3067	"lines longer than 75 or shorter than 55,\n",stderr);
ali@40	3068	fputs("unbalanced quotes or brackets, "
ali@40	3069	"a variety of badly formatted punctuation, \n",stderr);
ali@40	3070	fputs("HTML tags, some likely typos. "
ali@40	3071	"It is NOT a substitute for human judgement.\n",stderr);
ali@0	3072	fputs("\n",stderr);
ali@0	3073	}

author	ali <ali@juiblex.co.uk>
	Sat May 25 23:51:28 2013 +0100 (2013-05-25)
changeset 51	0d08cd5055d5
parent 50	1b646720d4a7
child 52	a1fd8d3f0940
permissions	-rw-r--r--