bookloupe: bookloupe/bookloupe.c@1e89f47e56df (annotated)

/*************************************************************************/
/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
/* */
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
/* */
/* This program is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either version 2 of the License, or */
/* (at your option) any later version. */
/* */
/* This program is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU General Public License for more details. */
/* */
/* You should have received a copy of the GNU General Public License */
/* along with this program. If not, see <http://www.gnu.org/licenses/>. */
/*************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Sizes of the fixed buffers and tables used throughout the file. */
#define MAXWORDLEN 80 /* max length of one word */
#define LINEBUFSIZE 2048 /* buffer size for an input line */

#define MAX_USER_TYPOS 1000 /* number of slots in usertypo[] */
#define USERTYPO_FILE "gutcheck.typ" /* file the user typos are read from */

/* Size of the path buffers; only defined here if nothing else has. */
#ifndef MAX_PATH
#define MAX_PATH 16384
#endif

char aline[LINEBUFSIZE]; /* the input line currently being read */
/* NOTE(review): presumably the line before aline; not used in this part
 * of the file - confirm against procfile(). */
char prevline[LINEBUFSIZE];

/*
 * Common typos.
 * The list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};

ali@0	73	char *usertypo[MAX_USER_TYPOS];
ali@0	74
/*
 * Common abbreviations and other OK words not to query as typos.
 * The list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 * The list is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};

/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};

/*
 * HTML element names, without angle brackets.
 * The list is terminated by an empty string.
 * NOTE(review): presumably what the markup checks compare tag names
 * against - the users of this table are outside this part of the file.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * DP-specific markup (see the D switch): small-caps tags, the
 * slash-star, slash-hash and slash-dollar block delimiters with their
 * closing forms, and the thought-break tag.
 * The list is terminated by an empty string.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};

/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): going by the name, words that should not be directly
 * followed by a comma - the check that uses this table is outside this
 * part of the file. "mrs" appears twice.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};

/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): going by the name, words that should not be directly
 * followed by a period - the check that uses this table is outside this
 * part of the file.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};

/*
 * The vowels: a, e, i, o, u followed by their accented forms and the
 * ae ligature. The accented letters are spelled as octal escapes of
 * their single-byte (ISO-8859-1) codes so that the table does not
 * depend on the encoding this source file happens to be saved in.
 * NOTE(review): assumes input text is handled one byte per character,
 * as the rest of the visible code does - confirm.
 */
char vowels[] = "aeiou"
    "\340\341\342\343\344\346" /* a: grave acute circumflex tilde diaeresis; ae */
    "\350\351\352\353" /* e: grave acute circumflex diaeresis */
    "\354\355\356\357" /* i: grave acute circumflex diaeresis */
    "\362\363\364\365\366" /* o: grave acute circumflex tilde diaeresis */
    "\371\372\373\374"; /* u: grave acute circumflex diaeresis */

/*
 * HTML character entities and their plain-text stand-ins.
 *
 * htmlent is the named reference, htmlnum the equivalent numeric
 * reference and textent the text to use in its place. The table ends
 * with a row whose htmlent and htmlnum are empty strings (its textent is
 * left NULL).
 */
struct {
    char *htmlent;
    char *htmlnum;
    char *textent;
} entities[] = {
    { "&amp;", "&#38;", "&" },
    { "&lt;", "&#60;", "<" },
    { "&gt;", "&#62;", ">" },
    { "&deg;", "&#176;", " degrees" },
    { "&pound;", "&#163;", "L" },
    { "&quot;", "&#34;", "\"" }, /* quotation mark = APL quote */
    { "&OElig;", "&#338;", "OE" }, /* latin capital ligature OE */
    { "&oelig;", "&#339;", "oe" }, /* latin small ligature oe */
    { "&Scaron;", "&#352;", "S" }, /* latin capital letter S with caron */
    { "&scaron;", "&#353;", "s" }, /* latin small letter s with caron */
    { "&Yuml;", "&#376;", "Y" }, /* latin capital letter Y with diaeresis */
    { "&circ;", "&#710;", "" }, /* modifier letter circumflex accent */
    { "&tilde;", "&#732;", "~" }, /* small tilde, U+02DC ISOdia */
    { "&ensp;", "&#8194;", " " }, /* en space, U+2002 ISOpub */
    { "&emsp;", "&#8195;", " " }, /* em space, U+2003 ISOpub */
    { "&thinsp;", "&#8201;", " " }, /* thin space, U+2009 ISOpub */
    { "&ndash;", "&#8211;", "-" }, /* en dash, U+2013 ISOpub */
    { "&mdash;", "&#8212;", "--" }, /* em dash, U+2014 ISOpub */
    { "&rsquo;", "&#8217;", "'" }, /* right single quotation mark */
    { "&sbquo;", "&#8218;", "'" }, /* single low-9 quotation mark */
    { "&ldquo;", "&#8220;", "\"" }, /* left double quotation mark */
    { "&rdquo;", "&#8221;", "\"" }, /* right double quotation mark */
    { "&bdquo;", "&#8222;", "\"" }, /* double low-9 quotation mark */
    { "&lsaquo;", "&#8249;", "\"" }, /* single left-pointing angle quotation mark */
    { "&rsaquo;", "&#8250;", "\"" }, /* single right-pointing angle quotation mark */
    { "&nbsp;", "&#160;", " " }, /* no-break space = non-breaking space, */
    { "&iexcl;", "&#161;", "!" }, /* inverted exclamation mark */
    { "&cent;", "&#162;", "c" }, /* cent sign */
    { "&pound;", "&#163;", "L" }, /* pound sign */
    { "&curren;", "&#164;", "$" }, /* currency sign */
    { "&yen;", "&#165;", "Y" }, /* yen sign = yuan sign */
    { "&sect;", "&#167;", "--" }, /* section sign */
    { "&uml;", "&#168;", " " }, /* diaeresis = spacing diaeresis */
    { "&copy;", "&#169;", "(C) " }, /* copyright sign */
    { "&ordf;", "&#170;", " " }, /* feminine ordinal indicator */
    { "&laquo;", "&#171;", "\"" }, /* left-pointing double angle quotation mark */
    { "&shy;", "&#173;", "-" }, /* soft hyphen = discretionary hyphen */
    { "&reg;", "&#174;", "(R) " }, /* registered sign = registered trade mark sign */
    { "&macr;", "&#175;", " " }, /* macron = spacing macron = overline */
    { "&deg;", "&#176;", " degrees" }, /* degree sign */
    { "&plusmn;", "&#177;", "+-" }, /* plus-minus sign = plus-or-minus sign */
    { "&sup2;", "&#178;", "2" }, /* superscript two = superscript digit two */
    { "&sup3;", "&#179;", "3" }, /* superscript three = superscript digit three */
    { "&acute;", "&#180;", " " }, /* acute accent = spacing acute */
    { "&micro;", "&#181;", "m" }, /* micro sign */
    { "&para;", "&#182;", "--" }, /* pilcrow sign = paragraph sign */
    { "&cedil;", "&#184;", " " }, /* cedilla = spacing cedilla */
    { "&sup1;", "&#185;", "1" }, /* superscript one = superscript digit one */
    { "&ordm;", "&#186;", " " }, /* masculine ordinal indicator */
    { "&raquo;", "&#187;", "\"" }, /* right-pointing double angle quotation mark */
    { "&frac14;", "&#188;", "1/4" }, /* vulgar fraction one quarter */
    { "&frac12;", "&#189;", "1/2" }, /* vulgar fraction one half */
    { "&frac34;", "&#190;", "3/4" }, /* vulgar fraction three quarters */
    { "&iquest;", "&#191;", "?" }, /* inverted question mark */
    { "&Agrave;", "&#192;", "A" }, /* latin capital letter A with grave */
    { "&Aacute;", "&#193;", "A" }, /* latin capital letter A with acute */
    { "&Acirc;", "&#194;", "A" }, /* latin capital letter A with circumflex */
    { "&Atilde;", "&#195;", "A" }, /* latin capital letter A with tilde */
    { "&Auml;", "&#196;", "A" }, /* latin capital letter A with diaeresis */
    { "&Aring;", "&#197;", "A" }, /* latin capital letter A with ring above */
    { "&AElig;", "&#198;", "AE" }, /* latin capital letter AE */
    { "&Ccedil;", "&#199;", "C" }, /* latin capital letter C with cedilla */
    { "&Egrave;", "&#200;", "E" }, /* latin capital letter E with grave */
    { "&Eacute;", "&#201;", "E" }, /* latin capital letter E with acute */
    { "&Ecirc;", "&#202;", "E" }, /* latin capital letter E with circumflex */
    { "&Euml;", "&#203;", "E" }, /* latin capital letter E with diaeresis */
    { "&Igrave;", "&#204;", "I" }, /* latin capital letter I with grave */
    { "&Iacute;", "&#205;", "I" }, /* latin capital letter I with acute */
    { "&Icirc;", "&#206;", "I" }, /* latin capital letter I with circumflex */
    { "&Iuml;", "&#207;", "I" }, /* latin capital letter I with diaeresis */
    { "&ETH;", "&#208;", "E" }, /* latin capital letter ETH */
    { "&Ntilde;", "&#209;", "N" }, /* latin capital letter N with tilde */
    { "&Ograve;", "&#210;", "O" }, /* latin capital letter O with grave */
    { "&Oacute;", "&#211;", "O" }, /* latin capital letter O with acute */
    { "&Ocirc;", "&#212;", "O" }, /* latin capital letter O with circumflex */
    { "&Otilde;", "&#213;", "O" }, /* latin capital letter O with tilde */
    { "&Ouml;", "&#214;", "O" }, /* latin capital letter O with diaeresis */
    { "&times;", "&#215;", "*" }, /* multiplication sign */
    { "&Oslash;", "&#216;", "O" }, /* latin capital letter O with stroke */
    { "&Ugrave;", "&#217;", "U" }, /* latin capital letter U with grave */
    { "&Uacute;", "&#218;", "U" }, /* latin capital letter U with acute */
    { "&Ucirc;", "&#219;", "U" }, /* latin capital letter U with circumflex */
    { "&Uuml;", "&#220;", "U" }, /* latin capital letter U with diaeresis */
    { "&Yacute;", "&#221;", "Y" }, /* latin capital letter Y with acute */
    { "&THORN;", "&#222;", "TH" }, /* latin capital letter THORN */
    { "&szlig;", "&#223;", "sz" }, /* latin small letter sharp s = ess-zed */
    { "&agrave;", "&#224;", "a" }, /* latin small letter a with grave */
    { "&aacute;", "&#225;", "a" }, /* latin small letter a with acute */
    { "&acirc;", "&#226;", "a" }, /* latin small letter a with circumflex */
    { "&atilde;", "&#227;", "a" }, /* latin small letter a with tilde */
    { "&auml;", "&#228;", "a" }, /* latin small letter a with diaeresis */
    { "&aring;", "&#229;", "a" }, /* latin small letter a with ring above */
    { "&aelig;", "&#230;", "ae" }, /* latin small letter ae */
    { "&ccedil;", "&#231;", "c" }, /* latin small letter c with cedilla */
    { "&egrave;", "&#232;", "e" }, /* latin small letter e with grave */
    { "&eacute;", "&#233;", "e" }, /* latin small letter e with acute */
    { "&ecirc;", "&#234;", "e" }, /* latin small letter e with circumflex */
    { "&euml;", "&#235;", "e" }, /* latin small letter e with diaeresis */
    { "&igrave;", "&#236;", "i" }, /* latin small letter i with grave */
    { "&iacute;", "&#237;", "i" }, /* latin small letter i with acute */
    { "&icirc;", "&#238;", "i" }, /* latin small letter i with circumflex */
    { "&iuml;", "&#239;", "i" }, /* latin small letter i with diaeresis */
    { "&eth;", "&#240;", "eth" }, /* latin small letter eth */
    { "&ntilde;", "&#241;", "n" }, /* latin small letter n with tilde */
    { "&ograve;", "&#242;", "o" }, /* latin small letter o with grave */
    { "&oacute;", "&#243;", "o" }, /* latin small letter o with acute */
    { "&ocirc;", "&#244;", "o" }, /* latin small letter o with circumflex */
    { "&otilde;", "&#245;", "o" }, /* latin small letter o with tilde */
    { "&ouml;", "&#246;", "o" }, /* latin small letter o with diaeresis */
    { "&divide;", "&#247;", "/" }, /* division sign */
    { "&oslash;", "&#248;", "o" }, /* latin small letter o with stroke */
    { "&ugrave;", "&#249;", "u" }, /* latin small letter u with grave */
    { "&uacute;", "&#250;", "u" }, /* latin small letter u with acute */
    { "&ucirc;", "&#251;", "u" }, /* latin small letter u with circumflex */
    { "&uuml;", "&#252;", "u" }, /* latin small letter u with diaeresis */
    { "&yacute;", "&#253;", "y" }, /* latin small letter y with acute */
    { "&thorn;", "&#254;", "th" }, /* latin small letter thorn */
    { "&yuml;", "&#255;", "y" }, /* latin small letter y with diaeresis */
    { "", "" }
};

/* special characters, by character code */
#define CHAR_SPACE 32 /* ' ' */
#define CHAR_TAB 9 /* '\t' */
#define CHAR_LF 10 /* '\n' */
#define CHAR_CR 13 /* '\r' */
#define CHAR_DQUOTE 34 /* '"' */
#define CHAR_SQUOTE 39 /* '\'' */
#define CHAR_OPEN_SQUOTE 96 /* '`' */
#define CHAR_TILDE 126 /* '~' */
#define CHAR_ASTERISK 42 /* '*' */
#define CHAR_FORESLASH 47 /* '/' */
#define CHAR_CARAT 94 /* '^' */

#define CHAR_UNDERSCORE '_'
#define CHAR_OPEN_CBRACK '{'
#define CHAR_CLOSE_CBRACK '}'
#define CHAR_OPEN_RBRACK '('
#define CHAR_CLOSE_RBRACK ')'
#define CHAR_OPEN_SBRACK '['
#define CHAR_CLOSE_SBRACK ']'

/* longest and shortest normal PG line lengths */
#define LONGEST_PG_LINE 75
#define WAY_TOO_LONG 80
#define SHORTEST_PG_LINE 55

/*
 * Command-line switch letters. The position of a letter in SWITCHES is
 * the index of its flag in pswit[]; the *_SWITCH macros below name those
 * positions, so the three must be kept in step.
 */
#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
                                  /* D - ignore DP-specific markup */
                                  /* E - echo queried line */
                                  /* S - check single quotes */
                                  /* T - check common typos */
                                  /* P - require closure of quotes on */
                                  /* every paragraph */
                                  /* X - "Trust no one" :-) Paranoid! */
                                  /* Queries everything */
                                  /* L - line end checking defaults on */
                                  /* -L turns it off */
                                  /* O - overview. Just shows counts. */
                                  /* Y - puts errors to stdout */
                                  /* instead of stderr */
                                  /* H - Echoes header fields */
                                  /* M - Ignore markup in < > */
                                  /* U - Use file of User-defined Typos*/
                                  /* W - Defaults for use on Web upload*/
                                  /* V - Verbose - list EVERYTHING! */
#define SWITNO 14 /* max number of switch parms */
                  /* - used for defining array-size */
#define MINARGS 1 /* minimum no of args excl switches */
#define MAXARGS 1 /* maximum no of args excl switches */

int pswit[SWITNO]; /* program switches set by SWITCHES */

#define ECHO_SWITCH 0
#define SQUOTE_SWITCH 1
#define TYPO_SWITCH 2
#define QPARA_SWITCH 3
#define PARANOID_SWITCH 4
#define LINE_END_SWITCH 5
#define OVERVIEW_SWITCH 6
#define STDOUT_SWITCH 7
#define HEADER_SWITCH 8
#define WEB_SWITCH 9
#define VERBOSE_SWITCH 10
#define MARKUP_SWITCH 11
#define USERTYPO_SWITCH 12
#define DP_SWITCH 13

/*
 * Query counters. main() zeroes the cnt_* totals before calling
 * procfile() and prints them afterwards when running in overview mode.
 */
long cnt_dquot; /* for overview mode, count of doublequote queries */
long cnt_squot; /* for overview mode, count of singlequote queries */
long cnt_brack; /* for overview mode, count of brackets queries */
long cnt_bin; /* for overview mode, count of non-ASCII queries */
long cnt_odd; /* for overview mode, count of odd character queries */
long cnt_long; /* for overview mode, count of long line errors */
long cnt_short; /* for overview mode, count of short line queries */
long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
long cnt_dash; /* for overview mode, count of dash-related queries */
long cnt_word; /* for overview mode, count of word queries */
long cnt_html; /* for overview mode, count of html queries */
long cnt_lineend; /* for overview mode, count of line-end queries */
long cnt_spacend; /* count of lines with space at end */
long linecnt; /* count of total lines in the file */
long checked_linecnt; /* count of lines actually checked */

ali@0	342	void proghelp(void);
ali@0	343	void procfile(char *);
ali@0	344
ali@0	345	#define LOW_THRESHOLD 0
ali@0	346	#define HIGH_THRESHOLD 1
ali@0	347
ali@0	348	#define START 0
ali@0	349	#define END 1
ali@0	350	#define PREV 0
ali@0	351	#define NEXT 1
ali@0	352	#define FIRST_OF_PAIR 0
ali@0	353	#define SECOND_OF_PAIR 1
ali@0	354
ali@0	355	#define MAX_WORDPAIR 1000
ali@0	356
ali@0	357	char running_from[MAX_PATH];
ali@0	358
/*
 * Prototypes for helpers defined later in the file.
 * getaword() copies the next word of its first argument into its second
 * and returns where to resume scanning; flgets() reads one line into a
 * buffer and returns non-NULL while there is input (both as used by
 * first_pass() and main()).
 */
int mixdigit(char *);
const char *getaword(const char *,char *);
int matchword(char *,char *);
char *flgets(char *,int,FILE *,long);
void lowerit(char *);
int gcisalpha(unsigned char);
int gcisdigit(unsigned char);
int gcisletter(unsigned char);
char *gcstrchr(char *s,char c);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
int tagcomp(char *,char *);
char *loseentities(char *);
int isroman(char *);
int usertypo_count; /* number of entries main() has stored in usertypo[] */
void postprocess_for_DP(char *);

ali@0	377	char wrk[LINEBUFSIZE];
ali@0	378
ali@40	379	#define MAX_QWORD 50
ali@40	380	#define MAX_QWORD_LENGTH 40
ali@0	381	char qword[MAX_QWORD][MAX_QWORD_LENGTH];
ali@0	382	signed int dupcnt[MAX_QWORD];
ali@0	383
ali@40	384	int main(int argc,char **argv)
ali@0	385	{
ali@40	386	char argsw,s;
ali@40	387	int i,switno,invarg;
ali@0	388	char usertypo_file[MAX_PATH];
ali@0	389	FILE *usertypofile;
ali@40	390	if (strlen(argv[0])<sizeof(running_from))
ali@40	391	/* save the path to the executable */
ali@40	392	strcpy(running_from,argv[0]);
ali@0	393	/* find out what directory we're running from */
ali@40	394	s=running_from+strlen(running_from);
ali@40	395	for (;s!='/' && s!='\\' && s>=running_from;s--)
ali@40	396	*s=0;
ali@40	397	switno=strlen(SWITCHES);
ali@40	398	for (i=switno;--i>0;)
ali@40	399	pswit[i]=0; /* initialise switches */
ali@40	400	/*
ali@40	401	* Standard loop to extract switches.
ali@40	402	* When we come out of this loop, the arguments will be
ali@40	403	* in argv[0] upwards and the switches used will be
ali@40	404	* represented by their equivalent elements in pswit[]
ali@40	405	*/
ali@40	406	while (--argc>0 && **++argv=='-')
ali@40	407	for (argsw=argv[0]+1;*argsw!='\0';argsw++)
ali@40	408	for (i=switno,invarg=1;(--i>=0) && invarg==1;)
ali@40	409	if ((toupper(*argsw))==SWITCHES[i])
ali@40	410	{
ali@40	411	invarg=0;
ali@40	412	pswit[i]=1;
ali@40	413	}
ali@40	414	/* Paranoid checking is turned OFF, not on, by its switch */
ali@40	415	pswit[PARANOID_SWITCH]^=1;
ali@40	416	if (pswit[PARANOID_SWITCH])
ali@40	417	/* if running in paranoid mode force typo checks as well */
ali@40	418	pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
ali@40	419	/* Line-end checking is turned OFF, not on, by its switch */
ali@40	420	pswit[LINE_END_SWITCH]^=1;
ali@40	421	/* Echoing is turned OFF, not on, by its switch */
ali@40	422	pswit[ECHO_SWITCH]^=1;
ali@40	423	if (pswit[OVERVIEW_SWITCH])
ali@40	424	/* just print summary; don't echo */
ali@40	425	pswit[ECHO_SWITCH]=0;
ali@40	426	/*
ali@40	427	* Web uploads - for the moment, this is really just a placeholder
ali@40	428	* until we decide what processing we really want to do on web uploads
ali@40	429	*/
ali@40	430	if (pswit[WEB_SWITCH])
ali@40	431	{
ali@40	432	/* specific override for web uploads */
ali@40	433	pswit[ECHO_SWITCH]=1;
ali@40	434	pswit[SQUOTE_SWITCH]=0;
ali@40	435	pswit[TYPO_SWITCH]=1;
ali@40	436	pswit[QPARA_SWITCH]=0;
ali@40	437	pswit[PARANOID_SWITCH]=1;
ali@40	438	pswit[LINE_END_SWITCH]=0;
ali@40	439	pswit[OVERVIEW_SWITCH]=0;
ali@40	440	pswit[STDOUT_SWITCH]=0;
ali@40	441	pswit[HEADER_SWITCH]=1;
ali@40	442	pswit[VERBOSE_SWITCH]=0;
ali@40	443	pswit[MARKUP_SWITCH]=0;
ali@40	444	pswit[USERTYPO_SWITCH]=0;
ali@40	445	pswit[DP_SWITCH]=0;
ali@40	446	}
ali@40	447	if (argc<MINARGS \|\| argc>MAXARGS)
ali@40	448	{
ali@40	449	/* check number of args */
ali@0	450	proghelp();
ali@40	451	return 1;
ali@40	452	}
ali@0	453	/* read in the user-defined stealth scanno list */
ali@40	454	if (pswit[USERTYPO_SWITCH])
ali@40	455	{
ali@40	456	/* ... we were told we had one! */
ali@40	457	usertypofile=fopen(USERTYPO_FILE,"rb");
ali@40	458	if (!usertypofile)
ali@40	459	{
ali@40	460	/* not in cwd. try excuteable directory. */
ali@40	461	strcpy(usertypo_file,running_from);
ali@40	462	strcat(usertypo_file,USERTYPO_FILE);
ali@40	463	usertypofile=fopen(usertypo_file,"rb");
ali@40	464	if (!usertypofile) {
ali@40	465	/* we ain't got no user typo file! */
ali@40	466	printf(" --> I couldn't find gutcheck.typ "
ali@40	467	"-- proceeding without user typos.\n");
ali@40	468	}
ali@40	469	}
ali@40	470	usertypo_count=0;
ali@40	471	if (usertypofile)
ali@40	472	{
ali@40	473	/* we managed to open a User Typo File! */
ali@40	474	if (pswit[USERTYPO_SWITCH])
ali@40	475	{
ali@40	476	while (flgets(aline,LINEBUFSIZE-1,usertypofile,
ali@40	477	(long)usertypo_count))
ali@40	478	{
ali@40	479	if (strlen(aline)>1)
ali@40	480	{
ali@40	481	if ((int)*aline>33)
ali@40	482	{
ali@40	483	s=malloc(strlen(aline)+1);
ali@40	484	if (!s)
ali@40	485	{
ali@40	486	fprintf(stderr,"bookloupe: cannot get enough "
ali@40	487	"memory for user typo file!\n");
ali@0	488	exit(1);
ali@40	489	}
ali@40	490	strcpy(s,aline);
ali@40	491	usertypo[usertypo_count]=s;
ali@0	492	usertypo_count++;
ali@40	493	if (usertypo_count>=MAX_USER_TYPOS)
ali@40	494	{
ali@40	495	printf(" --> Only %d user-defined typos "
ali@42	496	"allowed: ignoring the rest\n",
ali@42	497	MAX_USER_TYPOS);
ali@0	498	break;
ali@40	499	}
ali@40	500	}
ali@40	501	}
ali@40	502	}
ali@40	503	}
ali@0	504	fclose(usertypofile);
ali@40	505	}
ali@40	506	}
ali@40	507	fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@40	508	cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
ali@40	509	cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
ali@40	510	cnt_spacend=0;
ali@0	511	procfile(argv[0]);
ali@40	512	if (pswit[OVERVIEW_SWITCH])
ali@40	513	{
ali@40	514	printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40	515	checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@40	516	printf(" --------------- Queries found --------------\n");
ali@40	517	if (cnt_long)
ali@40	518	printf(" Long lines: %14ld\n",cnt_long);
ali@40	519	if (cnt_short)
ali@40	520	printf(" Short lines: %14ld\n",cnt_short);
ali@40	521	if (cnt_lineend)
ali@40	522	printf(" Line-end problems: %14ld\n",cnt_lineend);
ali@40	523	if (cnt_word)
ali@40	524	printf(" Common typos: %14ld\n",cnt_word);
ali@40	525	if (cnt_dquot)
ali@40	526	printf(" Unmatched quotes: %14ld\n",cnt_dquot);
ali@40	527	if (cnt_squot)
ali@40	528	printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
ali@40	529	if (cnt_brack)
ali@40	530	printf(" Unmatched brackets: %14ld\n",cnt_brack);
ali@40	531	if (cnt_bin)
ali@40	532	printf(" Non-ASCII characters: %14ld\n",cnt_bin);
ali@40	533	if (cnt_odd)
ali@40	534	printf(" Proofing characters: %14ld\n",cnt_odd);
ali@40	535	if (cnt_punct)
ali@40	536	printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@40	537	if (cnt_dash)
ali@40	538	printf(" Non-standard dashes: %14ld\n",cnt_dash);
ali@40	539	if (cnt_html)
ali@40	540	printf(" Possible HTML tags: %14ld\n",cnt_html);
ali@0	541	printf("\n");
ali@40	542	printf(" TOTAL QUERIES %14ld\n",
ali@40	543	cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
ali@40	544	cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40	545	}
ali@40	546	return 0;
ali@0	547	}
ali@0	548
/*
 * Totals gathered by first_pass(). Unless noted, each member counts
 * lines of the text before the footer.
 */
struct first_pass_results {
    /*
     * firstline: number of the line after the last header marker seen
     * astline: lines with an asterisk and a lower-case ASCII letter
     */
    long firstline,astline;
    /*
     * footerline: number of the first footer line (0 if none seen)
     * totlen: total characters; binlen: bytes above 127;
     * alphalen: characters for which gcisalpha() is true
     * endquote_count: double quotes directly preceded by a letter
     * shortline: short lines following a long line
     * dotcomma: lines containing ".,"
     */
    long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
    /*
     * fslashline: lines containing '/'
     * hyphens: lines whose last visible character is a single '-'
     * longline / verylongline: lines longer than LONGEST_PG_LINE /
     * WAY_TOO_LONG
     * htmcount: score for lines that look like they contain tags
     * standalone_digit: occurrences of the words "0" and "1"
     */
    long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
    /*
     * spacedash: lines with " -" not followed by another '-'
     * emdash: lines containing "--"; of those, space_emdash have a
     * space on either side of the first "--", non_PG_space_emdash on
     * both sides and PG_space_emdash on neither
     */
    long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
    /* occurrences of the words "hij"/"niet" and "dans"/"avec" */
    signed int Dutchcount,Frenchcount;
};

ali@40	557	/*
ali@41	558	* first_pass:
ali@40	559	*
ali@41	560	* Run a first pass - verify that it's a valid PG
ali@41	561	* file, decide whether to report some things that
ali@41	562	* occur many times in the text like long or short
ali@41	563	* lines, non-standard dashes, etc.
ali@40	564	*/
ali@41	565	struct first_pass_results first_pass(FILE infile)
ali@0	566	{
ali@54	567	char laststart=CHAR_SPACE;
ali@54	568	const char *s;
ali@41	569	signed int i,llen;
ali@41	570	unsigned int lastlen=0,lastblen=0;
ali@41	571	long spline=0,nspline=0;
ali@41	572	static struct first_pass_results results={0};
ali@41	573	char inword[MAXWORDLEN]="";
ali@40	574	while (fgets(aline,LINEBUFSIZE-1,infile))
ali@40	575	{
ali@40	576	while (aline[strlen(aline)-1]==10 \|\| aline[strlen(aline)-1]==13)
ali@40	577	aline[strlen(aline)-1]=0;
ali@0	578	linecnt++;
ali@40	579	if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
ali@40	580	(strstr(aline,"PUBLIC DOMAIN") \|\| strstr(aline,"COPYRIGHT")))
ali@40	581	{
ali@0	582	if (spline)
ali@0	583	printf(" --> Duplicate header?\n");
ali@40	584	spline=linecnt+1; /* first line of non-header text, that is */
ali@40	585	}
ali@40	586	if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
ali@40	587	{
ali@0	588	if (nspline)
ali@0	589	printf(" --> Duplicate header?\n");
ali@40	590	nspline=linecnt+1; /* first line of non-header text, that is */
ali@40	591	}
ali@40	592	if (spline \|\| nspline)
ali@40	593	{
ali@0	594	lowerit(aline);
ali@40	595	if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
ali@40	596	{
ali@40	597	if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
ali@40	598	{
ali@41	599	if (results.footerline)
ali@40	600	{
ali@40	601	/* it's an old-form header - we can detect duplicates */
ali@40	602	if (!nspline)
ali@0	603	printf(" --> Duplicate footer?\n");
ali@40	604	}
ali@40	605	else
ali@41	606	results.footerline=linecnt;
ali@40	607	}
ali@40	608	}
ali@40	609	}
ali@40	610	if (spline)
ali@41	611	results.firstline=spline;
ali@40	612	if (nspline)
ali@41	613	results.firstline=nspline; /* override with new */
ali@41	614	if (results.footerline)
ali@40	615	continue; /* don't count the boilerplate in the footer */
ali@40	616	llen=strlen(aline);
ali@41	617	results.totlen+=llen;
ali@40	618	for (i=0;i<llen;i++)
ali@40	619	{
ali@40	620	if ((unsigned char)aline[i]>127)
ali@41	621	results.binlen++;
ali@40	622	if (gcisalpha(aline[i]))
ali@41	623	results.alphalen++;
ali@40	624	if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@41	625	results.endquote_count++;
ali@40	626	}
ali@40	627	if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
ali@40	628	lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41	629	results.shortline++;
ali@40	630	if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
ali@40	631	cnt_spacend++;
ali@40	632	if (strstr(aline,".,"))
ali@41	633	results.dotcomma++;
ali@40	634	/* only count ast lines for ignoring purposes where there is */
ali@0	635	/* locase text on the line */
ali@40	636	if (strstr(aline,"*"))
ali@40	637	{
ali@40	638	for (s=aline;*s;s++)
ali@40	639	if (s>='a' && s<='z')
ali@0	640	break;
ali@40	641	if (*s)
ali@41	642	results.astline++;
ali@40	643	}
ali@40	644	if (strstr(aline,"/"))
ali@41	645	results.fslashline++;
ali@40	646	for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@40	647	;
ali@40	648	if (aline[i]=='-' && aline[i-1]!='-')
ali@41	649	results.hyphens++;
ali@40	650	if (llen>LONGEST_PG_LINE)
ali@41	651	results.longline++;
ali@40	652	if (llen>WAY_TOO_LONG)
ali@41	653	results.verylongline++;
ali@40	654	if (strstr(aline,"<") && strstr(aline,">"))
ali@40	655	{
ali@40	656	i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
ali@40	657	if (i>0)
ali@41	658	results.htmcount++;
ali@40	659	if (strstr(aline,"<i>"))
ali@41	660	results.htmcount+=4; /* bonus marks! */
ali@40	661	}
ali@0	662	/* Check for spaced em-dashes */
ali@40	663	if (strstr(aline,"--"))
ali@40	664	{
ali@41	665	results.emdash++;
ali@40	666	if (*(strstr(aline,"--")-1)==CHAR_SPACE \|\|
ali@40	667	(*(strstr(aline,"--")+2)==CHAR_SPACE))
ali@41	668	results.space_emdash++;
ali@40	669	if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
ali@40	670	(*(strstr(aline,"--")+2)==CHAR_SPACE))
ali@40	671	/* count of em-dashes with spaces both sides */
ali@41	672	results.non_PG_space_emdash++;
ali@40	673	if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
ali@40	674	(*(strstr(aline,"--")+2)!=CHAR_SPACE))
ali@40	675	/* count of PG-type em-dashes with no spaces */
ali@41	676	results.PG_space_emdash++;
ali@40	677	}
ali@40	678	for (s=aline;*s;)
ali@40	679	{
ali@40	680	s=getaword(s,inword);
ali@40	681	if (!strcmp(inword,"hij") \|\| !strcmp(inword,"niet"))
ali@41	682	results.Dutchcount++;
ali@40	683	if (!strcmp(inword,"dans") \|\| !strcmp(inword,"avec"))
ali@41	684	results.Frenchcount++;
ali@40	685	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@41	686	results.standalone_digit++;
ali@40	687	}
ali@0	688	/* Check for spaced dashes */
ali@40	689	if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
ali@41	690	results.spacedash++;
ali@40	691	lastblen=lastlen;
ali@40	692	lastlen=strlen(aline);
ali@40	693	laststart=aline[0];
ali@40	694	}
ali@41	695	return &results;
ali@41	696	}
ali@41	697
/*
 * Per-category reporting flags filled in by report_first_pass().
 * For shortline, longline, dotcomma, ast, fslash and endquote, 1 means
 * "report these queries" and 0 means they were too frequent to report.
 * NOTE(review): the remaining members are set outside this part of the
 * file; by their names they follow the same convention or flag the
 * detected language - confirm.
 */
struct warnings {
    signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
    signed int endquote,isDutch,isFrench;
};

ali@42	703	/*
ali@42	704	* report_first_pass:
ali@42	705	*
ali@42	706	* Make some snap decisions based on the first pass results.
ali@42	707	*/
ali@42	708	struct warnings report_first_pass(struct first_pass_results results)
ali@42	709	{
ali@42	710	static struct warnings warnings={0};
ali@42	711	if (cnt_spacend>0)
ali@42	712	printf(" --> %ld lines in this file have white space at end\n",
ali@42	713	cnt_spacend);
ali@42	714	warnings.dotcomma=1;
ali@42	715	if (results->dotcomma>5)
ali@42	716	{
ali@42	717	warnings.dotcomma=0;
ali@42	718	printf(" --> %ld lines in this file contain '.,'. "
ali@42	719	"Not reporting them.\n",results->dotcomma);
ali@42	720	}
ali@42	721	/*
ali@42	722	* If more than 50 lines, or one-tenth, are short,
ali@42	723	* don't bother reporting them.
ali@42	724	*/
ali@42	725	warnings.shortline=1;
ali@42	726	if (results->shortline>50 \|\| results->shortline*10>linecnt)
ali@42	727	{
ali@42	728	warnings.shortline=0;
ali@42	729	printf(" --> %ld lines in this file are short. "
ali@42	730	"Not reporting short lines.\n",results->shortline);
ali@42	731	}
ali@42	732	/*
ali@42	733	* If more than 50 lines, or one-tenth, are long,
ali@42	734	* don't bother reporting them.
ali@42	735	*/
ali@42	736	warnings.longline=1;
ali@42	737	if (results->longline>50 \|\| results->longline*10>linecnt)
ali@42	738	{
ali@42	739	warnings.longline=0;
ali@42	740	printf(" --> %ld lines in this file are long. "
ali@42	741	"Not reporting long lines.\n",results->longline);
ali@42	742	}
ali@42	743	/* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42	744	warnings.ast=1;
ali@42	745	if (results->astline>10)
ali@42	746	{
ali@42	747	warnings.ast=0;
ali@42	748	printf(" --> %ld lines in this file contain asterisks. "
ali@42	749	"Not reporting them.\n",results->astline);
ali@42	750	}
ali@42	751	/*
ali@42	752	* If more than 10 lines contain forward slashes,
ali@42	753	* don't bother reporting them.
ali@42	754	*/
ali@42	755	warnings.fslash=1;
ali@42	756	if (results->fslashline>10)
ali@42	757	{
ali@42	758	warnings.fslash=0;
ali@42	759	printf(" --> %ld lines in this file contain forward slashes. "
ali@42	760	"Not reporting them.\n",results->fslashline);
ali@42	761	}
ali@42	762	/*
ali@42	763	* If more than 20 lines contain unpunctuated endquotes,
ali@42	764	* don't bother reporting them.
ali@42	765	*/
ali@42	766	warnings.endquote=1;
ali@42	767	if (results->endquote_count>20)
ali@42	768	{
ali@42	769	warnings.endquote=0;
ali@42	770	printf(" --> %ld lines in this file contain unpunctuated endquotes. "
ali@42	771	"Not reporting them.\n",results->endquote_count);
ali@42	772	}
ali@42	773	/*
ali@42	774	* If more than 15 lines contain standalone digits,
ali@42	775	* don't bother reporting them.
ali@42	776	*/
ali@42	777	warnings.digit=1;
ali@42	778	if (results->standalone_digit>10)
ali@42	779	{
ali@42	780	warnings.digit=0;
ali@42	781	printf(" --> %ld lines in this file contain standalone 0s and 1s. "
ali@42	782	"Not reporting them.\n",results->standalone_digit);
ali@42	783	}
ali@42	784	/*
ali@42	785	* If more than 20 lines contain hyphens at end,
ali@42	786	* don't bother reporting them.
ali@42	787	*/
ali@42	788	warnings.hyphen=1;
ali@42	789	if (results->hyphens>20)
ali@42	790	{
ali@42	791	warnings.hyphen=0;
ali@42	792	printf(" --> %ld lines in this file have hyphens at end. "
ali@42	793	"Not reporting them.\n",results->hyphens);
ali@42	794	}
ali@42	795	if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42	796	{
ali@42	797	printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@42	798	pswit[MARKUP_SWITCH]=1;
ali@42	799	}
ali@42	800	if (results->verylongline>0)
ali@42	801	printf(" --> %ld lines in this file are VERY long!\n",
ali@42	802	results->verylongline);
ali@42	803	/*
ali@42	804	* If there are more non-PG spaced dashes than PG em-dashes,
ali@42	805	* assume it's deliberate.
ali@42	806	* Current PG guidelines say don't use them, but older texts do,
ali@42	807	* and some people insist on them whatever the guidelines say.
ali@42	808	*/
ali@42	809	warnings.dash=1;
ali@42	810	if (results->spacedash+results->non_PG_space_emdash>
ali@42	811	results->PG_space_emdash)
ali@42	812	{
ali@42	813	warnings.dash=0;
ali@42	814	printf(" --> There are %ld spaced dashes and em-dashes. "
ali@42	815	"Not reporting them.\n",
ali@42	816	results->spacedash+results->non_PG_space_emdash);
ali@42	817	}
ali@42	818	/* If more than a quarter of characters are hi-bit, bug out. */
ali@42	819	warnings.bin=1;
ali@42	820	if (results->binlen*4>results->totlen)
ali@42	821	{
ali@42	822	printf(" --> This file does not appear to be ASCII. "
ali@42	823	"Terminating. Best of luck with it!\n");
ali@42	824	exit(1);
ali@42	825	}
ali@42	826	if (results->alphalen*4<results->totlen)
ali@42	827	{
ali@42	828	printf(" --> This file does not appear to be text. "
ali@42	829	"Terminating. Best of luck with it!\n");
ali@42	830	exit(1);
ali@42	831	}
ali@42	832	if (results->binlen*100>results->totlen \|\| results->binlen>100)
ali@42	833	{
ali@42	834	printf(" --> There are a lot of foreign letters here. "
ali@42	835	"Not reporting them.\n");
ali@42	836	warnings.bin=0;
ali@42	837	}
ali@42	838	warnings.isDutch=0;
ali@42	839	if (results->Dutchcount>50)
ali@42	840	{
ali@42	841	warnings.isDutch=1;
ali@42	842	printf(" --> This looks like Dutch - "
ali@42	843	"switching off dashes and warnings for 's Middags case.\n");
ali@42	844	}
ali@42	845	warnings.isFrench=0;
ali@42	846	if (results->Frenchcount>50)
ali@42	847	{
ali@42	848	warnings.isFrench=1;
ali@42	849	printf(" --> This looks like French - "
ali@42	850	"switching off some doublepunct.\n");
ali@42	851	}
ali@42	852	if (results->firstline && results->footerline)
ali@42	853	printf(" The PG header and footer appear to be already on.\n");
ali@42	854	else
ali@42	855	{
ali@42	856	if (results->firstline)
ali@42	857	printf(" The PG header is on - no footer.\n");
ali@42	858	if (results->footerline)
ali@42	859	printf(" The PG footer is on - no header.\n");
ali@42	860	}
ali@42	861	printf("\n");
ali@42	862	if (pswit[VERBOSE_SWITCH])
ali@42	863	{
ali@42	864	warnings.bin=1;
ali@42	865	warnings.shortline=1;
ali@42	866	warnings.dotcomma=1;
ali@42	867	warnings.longline=1;
ali@42	868	warnings.dash=1;
ali@42	869	warnings.digit=1;
ali@42	870	warnings.ast=1;
ali@42	871	warnings.fslash=1;
ali@42	872	warnings.hyphen=1;
ali@42	873	warnings.endquote=1;
ali@42	874	printf(" * Verbose output is ON -- you asked for it! *\n");
ali@42	875	}
ali@42	876	if (warnings.isDutch)
ali@42	877	warnings.dash=0;
ali@42	878	if (results->footerline>0 && results->firstline>0 &&
ali@42	879	results->footerline>results->firstline &&
ali@42	880	results->footerline-results->firstline<100)
ali@42	881	{
ali@42	882	printf(" --> I don't really know where this text starts. \n");
ali@42	883	printf(" There are no reference points.\n");
ali@42	884	printf(" I'm going to have to report the header and footer "
ali@42	885	"as well.\n");
ali@42	886	results->firstline=0;
ali@42	887	}
ali@42	888	return &warnings;
ali@42	889	}
ali@42	890
/*
 * counters:
 *
 * Running totals updated by analyse_quotes() for every line it scans.
 * The bracket members go up on an opening bracket and down on the
 * matching closing one.
 */
struct counters {
    long quot;               /* double quotes seen */
    int c_unders;            /* underscores seen */
    int c_brack;             /* curly brackets: opens minus closes */
    int s_brack;             /* square brackets: opens minus closes */
    int r_brack;             /* round brackets: opens minus closes */
    int open_single_quote;   /* single quotes judged to be opening */
    int close_single_quote;  /* single quotes judged to be closing */
};
ali@43	896
ali@43	897	/*
ali@43	898	* analyse_quotes:
ali@43	899	*
ali@43	900	* Look along the line, accumulate the count of quotes, and see
ali@43	901	* if this is an empty line - i.e. a line with nothing on it
ali@43	902	* but spaces.
ali@43	903	* If line has just spaces, period, * and/or - on it, don't
ali@43	904	* count it, since empty lines with asterisks or dashes to
ali@43	905	* separate sections are common.
ali@43	906	*
ali@43	907	* Returns: Non-zero if the line is empty.
ali@43	908	*/
ali@43	909	int analyse_quotes(const char s,struct counters counters)
ali@43	910	{
ali@43	911	signed int guessquote=0;
ali@43	912	int isemptyline=1; /* assume the line is empty until proven otherwise */
ali@43	913	while (*s)
ali@43	914	{
ali@43	915	if (*s==CHAR_DQUOTE)
ali@43	916	counters->quot++;
ali@43	917	if (s==CHAR_SQUOTE \|\| s==CHAR_OPEN_SQUOTE)
ali@43	918	{
ali@43	919	if (s==aline)
ali@43	920	{
ali@43	921	/*
ali@43	922	* At start of line, it can only be an openquote.
ali@43	923	* Hardcode a very common exception!
ali@43	924	*/
ali@43	925	if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
ali@43	926	counters->open_single_quote++;
ali@43	927	}
ali@43	928	else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
ali@43	929	/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43	930	;
ali@43	931	/* it's outside a word - let's check it out */
ali@43	932	else if (*s==CHAR_OPEN_SQUOTE \|\| gcisalpha(s[1]))
ali@43	933	{
ali@43	934	/* it damwell better BE an openquote */
ali@43	935	if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
ali@43	936	/* hardcode a very common exception! */
ali@43	937	counters->open_single_quote++;
ali@43	938	}
ali@43	939	else
ali@43	940	{
ali@43	941	/* now - is it a closequote? */
ali@43	942	guessquote=0; /* accumulate clues */
ali@43	943	if (gcisalpha(s[-1]))
ali@43	944	{
ali@43	945	/* it follows a letter - could be either */
ali@43	946	guessquote++;
ali@43	947	if (s[-1]=='s')
ali@43	948	{
ali@43	949	/* looks like a plural apostrophe */
ali@43	950	guessquote-=3;
ali@43	951	if (s[1]==CHAR_SPACE) /* bonus marks! */
ali@43	952	guessquote-=2;
ali@43	953	}
ali@43	954	}
ali@43	955	/* it doesn't have a letter either side */
ali@43	956	else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
ali@43	957	guessquote+=8; /* looks like a closequote */
ali@43	958	else
ali@43	959	guessquote++;
ali@43	960	if (counters->open_single_quote>counters->close_single_quote)
ali@43	961	/*
ali@43	962	* Give it the benefit of some doubt,
ali@43	963	* if a squote is already open.
ali@43	964	*/
ali@43	965	guessquote++;
ali@43	966	else
ali@43	967	guessquote--;
ali@43	968	if (guessquote>=0)
ali@43	969	counters->close_single_quote++;
ali@43	970	}
ali@43	971	}
ali@43	972	if (s!=CHAR_SPACE && s!='-' && s!='.' && s!=CHAR_ASTERISK &&
ali@43	973	s!=13 && s!=10)
ali@43	974	isemptyline=0; /* ignore lines like * * * as spacers */
ali@43	975	if (*s==CHAR_UNDERSCORE)
ali@43	976	counters->c_unders++;
ali@43	977	if (*s==CHAR_OPEN_CBRACK)
ali@43	978	counters->c_brack++;
ali@43	979	if (*s==CHAR_CLOSE_CBRACK)
ali@43	980	counters->c_brack--;
ali@43	981	if (*s==CHAR_OPEN_RBRACK)
ali@43	982	counters->r_brack++;
ali@43	983	if (*s==CHAR_CLOSE_RBRACK)
ali@43	984	counters->r_brack--;
ali@43	985	if (*s==CHAR_OPEN_SBRACK)
ali@43	986	counters->s_brack++;
ali@43	987	if (*s==CHAR_CLOSE_SBRACK)
ali@43	988	counters->s_brack--;
ali@43	989	s++;
ali@43	990	}
ali@43	991	return isemptyline;
ali@43	992	}
ali@43	993
ali@41	994	/*
ali@44	995	* check_for_odd_characters:
ali@44	996	*
ali@44	997	* Check for binary and other odd characters.
ali@44	998	*/
ali@44	999	void check_for_odd_characters(const char aline,const struct warnings warnings,
ali@44	1000	int isemptyline)
ali@44	1001	{
ali@44	1002	/* Don't repeat multiple warnings on one line. */
ali@44	1003	signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
ali@44	1004	const char *s;
ali@44	1005	unsigned char c;
ali@44	1006	for (s=aline;*s;s++)
ali@44	1007	{
ali@44	1008	c=(unsigned char )s;
ali@44	1009	if (!eNon_A && (s<CHAR_SPACE && s!=9 && *s!='\n' \|\| c>127))
ali@44	1010	{
ali@44	1011	if (pswit[ECHO_SWITCH])
ali@44	1012	printf("\n%s\n",aline);
ali@44	1013	if (!pswit[OVERVIEW_SWITCH])
ali@44	1014	if (c>127 && c<160)
ali@44	1015	printf(" Line %ld column %d - "
ali@44	1016	"Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
ali@44	1017	else
ali@44	1018	printf(" Line %ld column %d - Non-ASCII character %d\n",
ali@44	1019	linecnt,(int)(s-aline)+1,c);
ali@44	1020	else
ali@44	1021	cnt_bin++;
ali@44	1022	eNon_A=1;
ali@44	1023	}
ali@44	1024	if (!eTab && *s==CHAR_TAB)
ali@44	1025	{
ali@44	1026	if (pswit[ECHO_SWITCH])
ali@44	1027	printf("\n%s\n",aline);
ali@44	1028	if (!pswit[OVERVIEW_SWITCH])
ali@44	1029	printf(" Line %ld column %d - Tab character?\n",
ali@44	1030	linecnt,(int)(s-aline)+1);
ali@44	1031	else
ali@44	1032	cnt_odd++;
ali@44	1033	eTab=1;
ali@44	1034	}
ali@44	1035	if (!eTilde && *s==CHAR_TILDE)
ali@44	1036	{
ali@44	1037	/*
ali@44	1038	* Often used by OCR software to indicate an
ali@44	1039	* unrecognizable character.
ali@44	1040	*/
ali@44	1041	if (pswit[ECHO_SWITCH])
ali@44	1042	printf("\n%s\n",aline);
ali@44	1043	if (!pswit[OVERVIEW_SWITCH])
ali@44	1044	printf(" Line %ld column %d - Tilde character?\n",
ali@44	1045	linecnt,(int)(s-aline)+1);
ali@44	1046	else
ali@44	1047	cnt_odd++;
ali@44	1048	eTilde=1;
ali@44	1049	}
ali@44	1050	if (!eCarat && *s==CHAR_CARAT)
ali@44	1051	{
ali@44	1052	if (pswit[ECHO_SWITCH])
ali@44	1053	printf("\n%s\n",aline);
ali@44	1054	if (!pswit[OVERVIEW_SWITCH])
ali@44	1055	printf(" Line %ld column %d - Carat character?\n",
ali@44	1056	linecnt,(int)(s-aline)+1);
ali@44	1057	else
ali@44	1058	cnt_odd++;
ali@44	1059	eCarat=1;
ali@44	1060	}
ali@44	1061	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
ali@44	1062	{
ali@44	1063	if (pswit[ECHO_SWITCH])
ali@44	1064	printf("\n%s\n",aline);
ali@44	1065	if (!pswit[OVERVIEW_SWITCH])
ali@44	1066	printf(" Line %ld column %d - Forward slash?\n",
ali@44	1067	linecnt,(int)(s-aline)+1);
ali@44	1068	else
ali@44	1069	cnt_odd++;
ali@44	1070	eFSlash=1;
ali@44	1071	}
ali@44	1072	/*
ali@44	1073	* Report asterisks only in paranoid mode,
ali@44	1074	* since they're often deliberate.
ali@44	1075	*/
ali@44	1076	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@44	1077	*s==CHAR_ASTERISK)
ali@44	1078	{
ali@44	1079	if (pswit[ECHO_SWITCH])
ali@44	1080	printf("\n%s\n",aline);
ali@44	1081	if (!pswit[OVERVIEW_SWITCH])
ali@44	1082	printf(" Line %ld column %d - Asterisk?\n",
ali@44	1083	linecnt,(int)(s-aline)+1);
ali@44	1084	else
ali@44	1085	cnt_odd++;
ali@44	1086	eAst=1;
ali@44	1087	}
ali@44	1088	}
ali@44	1089	}
ali@44	1090
ali@44	1091	/*
ali@45	1092	* check_for_long_line:
ali@45	1093	*
ali@45	1094	* Check for line too long.
ali@45	1095	*/
ali@45	1096	void check_for_long_line(const char *aline)
ali@45	1097	{
ali@45	1098	if (strlen(aline)>LONGEST_PG_LINE)
ali@45	1099	{
ali@45	1100	if (pswit[ECHO_SWITCH])
ali@45	1101	printf("\n%s\n",aline);
ali@45	1102	if (!pswit[OVERVIEW_SWITCH])
ali@45	1103	printf(" Line %ld column %d - Long line %d\n",
ali@45	1104	linecnt,strlen(aline),strlen(aline));
ali@45	1105	else
ali@45	1106	cnt_long++;
ali@45	1107	}
ali@45	1108	}
ali@45	1109
/*
 * line_properties:
 *
 * What check_for_short_line() needs to know about the lines that came
 * before the current one.
 */
struct line_properties {
    unsigned int len;   /* length of the last line examined */
    unsigned int blen;  /* length of the last line but one */
    char start;         /* first character of the last line */
};
ali@45	1114
ali@45	1115	/*
ali@45	1116	* check_for_short_line:
ali@45	1117	*
ali@45	1118	* Check for line too short.
ali@45	1119	*
ali@45	1120	* This one is a bit trickier to implement: we don't want to
ali@45	1121	* flag the last line of a paragraph for being short, so we
ali@45	1122	* have to wait until we know that our current line is a
ali@45	1123	* "normal" line, then report the _previous_ line if it was too
ali@45	1124	* short. We also don't want to report indented lines like
ali@45	1125	* chapter heads or formatted quotations. We therefore keep
ali@45	1126	* last->len as the length of the last line examined, and
ali@45	1127	* last->blen as the length of the last but one, and try to
ali@45	1128	* suppress unnecessary warnings by checking that both were of
ali@45	1129	* "normal" length. We keep the first character of the last
ali@45	1130	* line in last->start, and if it was a space, we assume that
ali@45	1131	* the formatting is deliberate. I can't figure out a way to
ali@45	1132	* distinguish something like a quoted verse left-aligned or
ali@45	1133	* the header or footer of a letter from a paragraph of short
ali@45	1134	* lines - maybe if I examined the whole paragraph, and if the
ali@45	1135	* para has less than, say, 8 lines and if all lines are short,
ali@45	1136	* then just assume it's OK? Need to look at some texts to see
ali@45	1137	* how often a formula like this would get the right result.
ali@45	1138	*/
ali@45	1139	void check_for_short_line(const char aline,const struct line_properties last)
ali@45	1140	{
ali@45	1141	if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
ali@45	1142	last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45	1143	{
ali@45	1144	if (pswit[ECHO_SWITCH])
ali@45	1145	printf("\n%s\n",prevline);
ali@45	1146	if (!pswit[OVERVIEW_SWITCH])
ali@45	1147	printf(" Line %ld column %d - Short line %d?\n",
ali@45	1148	linecnt-1,strlen(prevline),strlen(prevline));
ali@45	1149	else
ali@45	1150	cnt_short++;
ali@45	1151	}
ali@45	1152	}
ali@45	1153
ali@45	1154	/*
ali@46	1155	* check_for_starting_punctuation:
ali@46	1156	*
ali@46	1157	* Look for punctuation other than full ellipses at start of line.
ali@46	1158	*/
ali@46	1159	void check_for_starting_punctuation(const char *aline)
ali@46	1160	{
ali@46	1161	if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
ali@46	1162	{
ali@46	1163	if (pswit[ECHO_SWITCH])
ali@46	1164	printf("\n%s\n",aline);
ali@46	1165	if (!pswit[OVERVIEW_SWITCH])
ali@46	1166	printf(" Line %ld column 1 - Begins with punctuation?\n",
ali@46	1167	linecnt);
ali@46	1168	else
ali@46	1169	cnt_punct++;
ali@46	1170	}
ali@46	1171	}
ali@46	1172
ali@46	1173	/*
ali@47	1174	* check_for_spaced_emdash:
ali@47	1175	*
ali@47	1176	* Check for spaced em-dashes.
ali@47	1177	*
ali@47	1178	* We must check _all_ occurrences of "--" on the line
ali@47	1179	* hence the loop - even if the first double-dash is OK
ali@47	1180	* there may be another that's wrong later on.
ali@47	1181	*/
ali@47	1182	void check_for_spaced_emdash(const char *aline)
ali@47	1183	{
ali@47	1184	const char s,t;
ali@47	1185	s=aline;
ali@47	1186	while ((t=strstr(s,"--")))
ali@47	1187	{
ali@47	1188	if (t>aline && t[-1]==CHAR_SPACE \|\| t[2]==CHAR_SPACE)
ali@47	1189	{
ali@47	1190	if (pswit[ECHO_SWITCH])
ali@47	1191	printf("\n%s\n",aline);
ali@47	1192	if (!pswit[OVERVIEW_SWITCH])
ali@47	1193	printf(" Line %ld column %d - Spaced em-dash?\n",
ali@47	1194	linecnt,(int)(t-aline)+1);
ali@47	1195	else
ali@47	1196	cnt_dash++;
ali@47	1197	}
ali@47	1198	s=t+2;
ali@47	1199	}
ali@47	1200	}
ali@47	1201
ali@47	1202	/*
ali@47	1203	* check_for_spaced_dash:
ali@47	1204	*
ali@47	1205	* Check for spaced dashes.
ali@47	1206	*/
ali@47	1207	void check_for_spaced_dash(const char *aline)
ali@47	1208	{
ali@47	1209	const char *s;
ali@47	1210	if ((s=strstr(aline," -")))
ali@47	1211	{
ali@47	1212	if (s[2]!='-')
ali@47	1213	{
ali@47	1214	if (pswit[ECHO_SWITCH])
ali@47	1215	printf("\n%s\n",aline);
ali@47	1216	if (!pswit[OVERVIEW_SWITCH])
ali@47	1217	printf(" Line %ld column %d - Spaced dash?\n",
ali@47	1218	linecnt,(int)(s-aline)+1);
ali@47	1219	else
ali@47	1220	cnt_dash++;
ali@47	1221	}
ali@47	1222	}
ali@47	1223	else if ((s=strstr(aline,"- ")))
ali@47	1224	{
ali@47	1225	if (s==aline \|\| s[-1]!='-')
ali@47	1226	{
ali@47	1227	if (pswit[ECHO_SWITCH])
ali@47	1228	printf("\n%s\n",aline);
ali@47	1229	if (!pswit[OVERVIEW_SWITCH])
ali@47	1230	printf(" Line %ld column %d - Spaced dash?\n",
ali@47	1231	linecnt,(int)(s-aline)+1);
ali@47	1232	else
ali@47	1233	cnt_dash++;
ali@47	1234	}
ali@47	1235	}
ali@47	1236	}
ali@47	1237
ali@47	1238	/*
ali@48	1239	* check_for_unmarked_paragraphs:
ali@48	1240	*
ali@48	1241	* Check for unmarked paragraphs indicated by separate speakers.
ali@48	1242	*
ali@48	1243	* May well be false positive:
ali@48	1244	* "Bravo!" "Wonderful!" called the crowd.
ali@48	1245	* but useful all the same.
ali@48	1246	*/
ali@48	1247	void check_for_unmarked_paragraphs(const char *aline)
ali@48	1248	{
ali@48	1249	const char *s;
ali@48	1250	s=strstr(aline,"\" \"");
ali@48	1251	if (!s)
ali@48	1252	s=strstr(aline,"\" \"");
ali@48	1253	if (s)
ali@48	1254	{
ali@48	1255	if (pswit[ECHO_SWITCH])
ali@48	1256	printf("\n%s\n",aline);
ali@48	1257	if (!pswit[OVERVIEW_SWITCH])
ali@48	1258	printf(" Line %ld column %d - Query missing paragraph break?\n",
ali@48	1259	linecnt,(int)(s-aline)+1);
ali@48	1260	else
ali@48	1261	cnt_punct++;
ali@48	1262	}
ali@48	1263	}
ali@48	1264
ali@48	1265	/*
ali@49	1266	* check_for_jeebies:
ali@49	1267	*
ali@49	1268	* Check for "to he" and other easy h/b errors.
ali@49	1269	*
ali@49	1270	* This is a very inadequate effort on the h/b problem,
ali@49	1271	* but the phrase "to he" is always an error, whereas "to
ali@49	1272	* be" is quite common.
ali@49	1273	* Similarly, '"Quiet!", be said.' is a non-be error
ali@49	1274	* "to he" is _not_ always an error!:
ali@49	1275	* "Where they went to he couldn't say."
ali@49	1276	* Another false positive:
ali@49	1277	* What would "Cinderella" be without the . . .
ali@49	1278	* and another: "If he wants to he can see for himself."
ali@49	1279	*/
ali@49	1280	void check_for_jeebies(const char *aline)
ali@49	1281	{
ali@49	1282	const char *s;
ali@49	1283	s=strstr(aline," be could ");
ali@49	1284	if (!s)
ali@49	1285	s=strstr(aline," be would ");
ali@49	1286	if (!s)
ali@49	1287	s=strstr(aline," was be ");
ali@49	1288	if (!s)
ali@49	1289	s=strstr(aline," be is ");
ali@49	1290	if (!s)
ali@49	1291	s=strstr(aline," is be ");
ali@49	1292	if (!s)
ali@49	1293	s=strstr(aline,"\", be ");
ali@49	1294	if (!s)
ali@49	1295	s=strstr(aline,"\" be ");
ali@49	1296	if (!s)
ali@49	1297	s=strstr(aline,"\" be ");
ali@49	1298	if (!s)
ali@49	1299	s=strstr(aline," to he ");
ali@49	1300	if (s)
ali@49	1301	{
ali@49	1302	if (pswit[ECHO_SWITCH])
ali@49	1303	printf("\n%s\n",aline);
ali@49	1304	if (!pswit[OVERVIEW_SWITCH])
ali@49	1305	printf(" Line %ld column %d - Query he/be error?\n",
ali@49	1306	linecnt,(int)(s-aline)+1);
ali@49	1307	else
ali@49	1308	cnt_word++;
ali@49	1309	}
ali@49	1310	s=strstr(aline," the had ");
ali@49	1311	if (!s)
ali@49	1312	s=strstr(aline," a had ");
ali@49	1313	if (!s)
ali@49	1314	s=strstr(aline," they bad ");
ali@49	1315	if (!s)
ali@49	1316	s=strstr(aline," she bad ");
ali@49	1317	if (!s)
ali@49	1318	s=strstr(aline," he bad ");
ali@49	1319	if (!s)
ali@49	1320	s=strstr(aline," you bad ");
ali@49	1321	if (!s)
ali@49	1322	s=strstr(aline," i bad ");
ali@49	1323	if (s)
ali@49	1324	{
ali@49	1325	if (pswit[ECHO_SWITCH])
ali@49	1326	printf("\n%s\n",aline);
ali@49	1327	if (!pswit[OVERVIEW_SWITCH])
ali@49	1328	printf(" Line %ld column %d - Query had/bad error?\n",
ali@49	1329	linecnt,(int)(s-aline)+1);
ali@49	1330	else
ali@49	1331	cnt_word++;
ali@49	1332	}
ali@49	1333	s=strstr(aline,"; hut ");
ali@49	1334	if (!s)
ali@49	1335	s=strstr(aline,", hut ");
ali@49	1336	if (s)
ali@49	1337	{
ali@49	1338	if (pswit[ECHO_SWITCH])
ali@49	1339	printf("\n%s\n",aline);
ali@49	1340	if (!pswit[OVERVIEW_SWITCH])
ali@49	1341	printf(" Line %ld column %d - Query hut/but error?\n",
ali@49	1342	linecnt,(int)(s-aline)+1);
ali@49	1343	else
ali@49	1344	cnt_word++;
ali@49	1345	}
ali@49	1346	}
ali@49	1347
ali@49	1348	/*
ali@50	1349	* check_for_mta_from:
ali@50	1350	*
ali@50	1351	* Special case - angled bracket in front of "From" placed there by an
ali@50	1352	* MTA when sending an e-mail.
ali@50	1353	*/
ali@50	1354	void check_for_mta_from(const char *aline)
ali@50	1355	{
ali@50	1356	const char *s;
ali@50	1357	s=strstr(aline,">From");
ali@50	1358	if (s)
ali@50	1359	{
ali@50	1360	if (pswit[ECHO_SWITCH])
ali@50	1361	printf("\n%s\n",aline);
ali@50	1362	if (!pswit[OVERVIEW_SWITCH])
ali@50	1363	printf(" Line %ld column %d - Query angled bracket with From\n",
ali@50	1364	linecnt,(int)(s-aline)+1);
ali@50	1365	else
ali@50	1366	cnt_punct++;
ali@50	1367	}
ali@50	1368	}
ali@50	1369
ali@50	1370	/*
ali@51	1371	* check_for_orphan_character:
ali@51	1372	*
ali@51	1373	* Check for a single character line -
ali@51	1374	* often an overflow from bad wrapping.
ali@51	1375	*/
ali@51	1376	void check_for_orphan_character(const char *aline)
ali@51	1377	{
ali@51	1378	if (*aline && !aline[1])
ali@51	1379	{
ali@51	1380	if (aline=='I' \|\| aline=='V' \|\| aline=='X' \|\| aline=='L' \|\|
ali@51	1381	gcisdigit(*aline))
ali@51	1382	; /* Nothing - ignore numerals alone on a line. */
ali@51	1383	else
ali@51	1384	{
ali@51	1385	if (pswit[ECHO_SWITCH])
ali@51	1386	printf("\n%s\n",aline);
ali@51	1387	if (!pswit[OVERVIEW_SWITCH])
ali@51	1388	printf(" Line %ld column 1 - Query single character line\n",
ali@51	1389	linecnt);
ali@51	1390	else
ali@51	1391	cnt_punct++;
ali@51	1392	}
ali@51	1393	}
ali@51	1394	}
ali@51	1395
ali@51	1396	/*
ali@52	1397	* check_for_pling_scanno:
ali@52	1398	*
ali@52	1399	* Check for I" - often should be !
ali@52	1400	*/
ali@52	1401	void check_for_pling_scanno(const char *aline)
ali@52	1402	{
ali@52	1403	const char *s;
ali@52	1404	s=strstr(aline," I\"");
ali@52	1405	if (s)
ali@52	1406	{
ali@52	1407	if (pswit[ECHO_SWITCH])
ali@52	1408	printf("\n%s\n",aline);
ali@52	1409	if (!pswit[OVERVIEW_SWITCH])
ali@52	1410	printf(" Line %ld column %ld - Query I=exclamation mark?\n",
ali@52	1411	linecnt,s-aline);
ali@52	1412	else
ali@52	1413	cnt_punct++;
ali@52	1414	}
ali@52	1415	}
ali@52	1416
ali@52	1417	/*
ali@53	1418	* check_for_extra_period:
ali@53	1419	*
ali@53	1420	* Check for period without a capital letter. Cut-down from gutspell.
ali@53	1421	* Only works when it happens on a single line.
ali@53	1422	*/
ali@53	1423	void check_for_extra_period(const char aline,const struct warnings warnings)
ali@53	1424	{
ali@53	1425	const char s,t,*s1;
ali@53	1426	signed int i,istypo,isdup;
ali@53	1427	static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
ali@53	1428	static int qperiod_index=0;
ali@53	1429	char testword[MAXWORDLEN]="";
ali@53	1430	if (pswit[PARANOID_SWITCH])
ali@53	1431	{
ali@53	1432	for (t=s=aline;strstr(t,". ");)
ali@53	1433	{
ali@53	1434	t=strstr(t,". ");
ali@53	1435	if (t==s)
ali@53	1436	{
ali@53	1437	t++;
ali@53	1438	/* start of line punctuation is handled elsewhere */
ali@53	1439	continue;
ali@53	1440	}
ali@53	1441	if (!gcisalpha(t[-1]))
ali@53	1442	{
ali@53	1443	t++;
ali@53	1444	continue;
ali@53	1445	}
ali@53	1446	if (warnings->isDutch)
ali@53	1447	{
ali@53	1448	/* For Frank & Jeroen -- 's Middags case */
ali@53	1449	if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
ali@53	1450	t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
ali@53	1451	{
ali@53	1452	t++;
ali@53	1453	continue;
ali@53	1454	}
ali@53	1455	}
ali@53	1456	s1=t+2;
ali@53	1457	while (s1 && !gcisalpha(s1) && !isdigit(*s1))
ali@53	1458	s1++;
ali@53	1459	if (s1>='a' && s1<='z')
ali@53	1460	{
ali@53	1461	/* we have something to investigate */
ali@53	1462	istypo=1;
ali@53	1463	/* so let's go back and find out */
ali@53	1464	for (s1=t-1;s1>=s &&
ali@53	1465	(gcisalpha(s1) \|\| gcisdigit(s1) \|\| *s1==CHAR_SQUOTE &&
ali@53	1466	gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
ali@53	1467	;
ali@53	1468	s1++;
ali@53	1469	for (i=0;s1 && s1!='.';s1++,i++)
ali@53	1470	testword[i]=*s1;
ali@53	1471	testword[i]=0;
ali@53	1472	for (i=0;*abbrev[i];i++)
ali@53	1473	if (!strcmp(testword,abbrev[i]))
ali@53	1474	istypo=0;
ali@53	1475	if (gcisdigit(*testword))
ali@53	1476	istypo=0;
ali@53	1477	if (!testword[1])
ali@53	1478	istypo=0;
ali@53	1479	if (isroman(testword))
ali@53	1480	istypo=0;
ali@53	1481	if (istypo)
ali@53	1482	{
ali@53	1483	istypo=0;
ali@53	1484	for (i=0;testword[i];i++)
ali@53	1485	if (strchr(vowels,testword[i]))
ali@53	1486	istypo=1;
ali@53	1487	}
ali@53	1488	if (istypo)
ali@53	1489	{
ali@53	1490	isdup=0;
ali@53	1491	if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@53	1492	!pswit[VERBOSE_SWITCH])
ali@53	1493	for (i=0;i<qperiod_index;i++)
ali@53	1494	if (!strcmp(testword,qperiod[i]))
ali@53	1495	isdup=1;
ali@53	1496	if (!isdup)
ali@53	1497	{
ali@53	1498	if (qperiod_index<MAX_QWORD &&
ali@53	1499	strlen(testword)<MAX_QWORD_LENGTH)
ali@53	1500	{
ali@53	1501	strcpy(qperiod[qperiod_index],testword);
ali@53	1502	qperiod_index++;
ali@53	1503	}
ali@53	1504	if (pswit[ECHO_SWITCH])
ali@53	1505	printf("\n%s\n",aline);
ali@53	1506	if (!pswit[OVERVIEW_SWITCH])
ali@53	1507	printf(" Line %ld column %d - Extra period?\n",
ali@53	1508	linecnt,(int)(t-aline)+1);
ali@53	1509	else
ali@53	1510	cnt_punct++;
ali@53	1511	}
ali@53	1512	}
ali@53	1513	}
ali@53	1514	t++;
ali@53	1515	}
ali@53	1516	}
ali@53	1517	}
ali@53	1518
ali@53	1519	/*
ali@54	1520	* check_for_following_punctuation:
ali@54	1521	*
ali@54	1522	* Check for words usually not followed by punctuation.
ali@54	1523	*/
ali@54	1524	void check_for_following_punctuation(const char *aline)
ali@54	1525	{
ali@54	1526	int i;
ali@54	1527	const char s,wordstart;
ali@54	1528	char inword[MAXWORDLEN];
ali@54	1529	if (pswit[TYPO_SWITCH])
ali@54	1530	{
ali@54	1531	for (s=aline;*s;)
ali@54	1532	{
ali@54	1533	wordstart=s;
ali@54	1534	s=getaword(s,inword);
ali@54	1535	if (!*inword)
ali@54	1536	continue;
ali@54	1537	lowerit(inword);
ali@54	1538	for (i=0;*nocomma[i];i++)
ali@54	1539	if (!strcmp(inword,nocomma[i]))
ali@54	1540	{
ali@54	1541	if (s==',' \|\| s==';' \|\| *s==':')
ali@54	1542	{
ali@54	1543	if (pswit[ECHO_SWITCH])
ali@54	1544	printf("\n%s\n",aline);
ali@54	1545	if (!pswit[OVERVIEW_SWITCH])
ali@54	1546	printf(" Line %ld column %d - "
ali@54	1547	"Query punctuation after %s?\n",
ali@54	1548	linecnt,(int)(s-aline)+1,inword);
ali@54	1549	else
ali@54	1550	cnt_punct++;
ali@54	1551	}
ali@54	1552	}
ali@54	1553	for (i=0;*noperiod[i];i++)
ali@54	1554	if (!strcmp(inword,noperiod[i]))
ali@54	1555	{
ali@54	1556	if (s=='.' \|\| s=='!')
ali@54	1557	{
ali@54	1558	if (pswit[ECHO_SWITCH])
ali@54	1559	printf("\n%s\n",aline);
ali@54	1560	if (!pswit[OVERVIEW_SWITCH])
ali@54	1561	printf(" Line %ld column %d - "
ali@54	1562	"Query punctuation after %s?\n",
ali@54	1563	linecnt,(int)(s-aline)+1,inword);
ali@54	1564	else
ali@54	1565	cnt_punct++;
ali@54	1566	}
ali@54	1567	}
ali@54	1568	}
ali@54	1569	}
ali@54	1570	}
ali@54	1571
ali@54	1572	/*
ali@55	1573	* check_for_typos:
ali@55	1574	*
ali@55	1575	* Check for commonly mistyped words,
ali@55	1576	* and digits like 0 for O in a word.
ali@55	1577	*/
ali@55	1578	void check_for_typos(const char aline,struct warnings warnings)
ali@55	1579	{
ali@55	1580	const char s,wordstart;
ali@55	1581	char inword[MAXWORDLEN],testword[MAXWORDLEN];
ali@55	1582	int i,istypo,isdup,alower,vowel,consonant;
ali@55	1583	static int qword_index=0;
ali@55	1584	for (s=aline;*s;)
ali@55	1585	{
ali@55	1586	wordstart=s;
ali@55	1587	s=getaword(s,inword);
ali@55	1588	if (!*inword)
ali@55	1589	continue; /* don't bother with empty lines */
ali@55	1590	if (mixdigit(inword))
ali@55	1591	{
ali@55	1592	if (pswit[ECHO_SWITCH])
ali@55	1593	printf("\n%s\n",aline);
ali@55	1594	if (!pswit[OVERVIEW_SWITCH])
ali@55	1595	printf(" Line %ld column %d - Query digit in %s\n",
ali@55	1596	linecnt,(int)(wordstart-aline)+1,inword);
ali@55	1597	else
ali@55	1598	cnt_word++;
ali@55	1599	}
ali@55	1600	/*
ali@55	1601	* Put the word through a series of tests for likely typos and OCR
ali@55	1602	* errors.
ali@55	1603	*/
ali@55	1604	if (pswit[TYPO_SWITCH])
ali@55	1605	{
ali@55	1606	istypo=0;
ali@55	1607	strcpy(testword,inword);
ali@55	1608	alower=0;
ali@55	1609	for (i=0;i<(signed int)strlen(testword);i++)
ali@55	1610	{
ali@55	1611	/* lowercase for testing */
ali@55	1612	if (testword[i]>='a' && testword[i]<='z')
ali@55	1613	alower=1;
ali@55	1614	if (alower && testword[i]>='A' && testword[i]<='Z')
ali@55	1615	{
ali@55	1616	/*
ali@55	1617	* We have an uppercase mid-word. However, there are
ali@55	1618	* common cases:
ali@55	1619	* Mac and Mc like McGill
ali@55	1620	* French contractions like l'Abbe
ali@55	1621	*/
ali@55	1622	if (i==2 && testword[0]=='m' && testword[1]=='c' \|\|
ali@55	1623	i==3 && testword[0]=='m' && testword[1]=='a' &&
ali@55	1624	testword[2]=='c' \|\| i>0 && testword[i-1]==CHAR_SQUOTE)
ali@55	1625	; /* do nothing! */
ali@55	1626	else
ali@55	1627	istypo=1;
ali@55	1628	}
ali@55	1629	testword[i]=(char)tolower(testword[i]);
ali@55	1630	}
ali@55	1631	/*
ali@55	1632	* Check for certain unlikely two-letter combinations at word
ali@55	1633	* start and end.
ali@55	1634	*/
ali@55	1635	if (strlen(testword)>1)
ali@55	1636	{
ali@55	1637	for (i=0;*nostart[i];i++)
ali@55	1638	if (!strncmp(testword,nostart[i],2))
ali@55	1639	istypo=1;
ali@55	1640	for (i=0;*noend[i];i++)
ali@55	1641	if (!strncmp(testword+strlen(testword)-2,noend[i],2))
ali@55	1642	istypo=1;
ali@55	1643	}
ali@55	1644	/* ght is common, gbt never. Like that. */
ali@55	1645	if (strstr(testword,"cb"))
ali@55	1646	istypo=1;
ali@55	1647	if (strstr(testword,"gbt"))
ali@55	1648	istypo=1;
ali@55	1649	if (strstr(testword,"pbt"))
ali@55	1650	istypo=1;
ali@55	1651	if (strstr(testword,"tbs"))
ali@55	1652	istypo=1;
ali@55	1653	if (strstr(testword,"mrn"))
ali@55	1654	istypo=1;
ali@55	1655	if (strstr(testword,"ahle"))
ali@55	1656	istypo=1;
ali@55	1657	if (strstr(testword,"ihle"))
ali@55	1658	istypo=1;
ali@55	1659	/*
ali@55	1660	* "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55	1661	* Also "TBI" - frostbite, outbid - but uncommon.
ali@55	1662	* Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55	1663	* numerals, but "ii" is a common scanno.
ali@55	1664	*/
ali@55	1665	if (strstr(testword,"tbi"))
ali@55	1666	istypo=1;
ali@55	1667	if (strstr(testword,"tbe"))
ali@55	1668	istypo=1;
ali@55	1669	if (strstr(testword,"ii"))
ali@55	1670	istypo=1;
ali@55	1671	/*
ali@55	1672	* Check for no vowels or no consonants.
ali@55	1673	* If none, flag a typo.
ali@55	1674	*/
ali@55	1675	if (!istypo && strlen(testword)>1)
ali@55	1676	{
ali@55	1677	vowel=consonant=0;
ali@55	1678	for (i=0;testword[i];i++)
ali@55	1679	{
ali@55	1680	if (testword[i]=='y' \|\| gcisdigit(testword[i]))
ali@55	1681	{
ali@55	1682	/* Yah, this is loose. */
ali@55	1683	vowel++;
ali@55	1684	consonant++;
ali@55	1685	}
ali@55	1686	else if (strchr(vowels,testword[i]))
ali@55	1687	vowel++;
ali@55	1688	else
ali@55	1689	consonant++;
ali@55	1690	}
ali@55	1691	if (!vowel \|\| !consonant)
ali@55	1692	istypo=1;
ali@55	1693	}
ali@55	1694	/*
ali@55	1695	* Now exclude the word from being reported if it's in
ali@55	1696	* the okword list.
ali@55	1697	*/
ali@55	1698	for (i=0;*okword[i];i++)
ali@55	1699	if (!strcmp(testword,okword[i]))
ali@55	1700	istypo=0;
ali@55	1701	/*
ali@55	1702	* What looks like a typo may be a Roman numeral.
ali@55	1703	* Exclude these.
ali@55	1704	*/
ali@55	1705	if (istypo && isroman(testword))
ali@55	1706	istypo=0;
ali@55	1707	/* Check the manual list of typos. */
ali@55	1708	if (!istypo)
ali@55	1709	for (i=0;*typo[i];i++)
ali@55	1710	if (!strcmp(testword,typo[i]))
ali@55	1711	istypo=1;
ali@55	1712	/*
ali@55	1713	* Check lowercase s, l, i and m - special cases.
ali@55	1714	* "j" - often a semi-colon gone wrong.
ali@55	1715	* "d" for a missing apostrophe - he d
ali@55	1716	* "n" for "in"
ali@55	1717	*/
ali@55	1718	if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
ali@55	1719	istypo=1;
ali@55	1720	if (istypo)
ali@55	1721	{
ali@55	1722	isdup=0;
ali@55	1723	if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@55	1724	!pswit[VERBOSE_SWITCH])
ali@55	1725	for (i=0;i<qword_index;i++)
ali@55	1726	if (!strcmp(testword,qword[i]))
ali@55	1727	{
ali@55	1728	isdup=1;
ali@55	1729	++dupcnt[i];
ali@55	1730	}
ali@55	1731	if (!isdup)
ali@55	1732	{
ali@55	1733	if (qword_index<MAX_QWORD &&
ali@55	1734	strlen(testword)<MAX_QWORD_LENGTH)
ali@55	1735	{
ali@55	1736	strcpy(qword[qword_index],testword);
ali@55	1737	qword_index++;
ali@55	1738	}
ali@55	1739	if (pswit[ECHO_SWITCH])
ali@55	1740	printf("\n%s\n",aline);
ali@55	1741	if (!pswit[OVERVIEW_SWITCH])
ali@55	1742	{
ali@55	1743	printf(" Line %ld column %d - Query word %s",
ali@55	1744	linecnt,(int)(wordstart-aline)+1,inword);
ali@55	1745	if (strlen(testword)<MAX_QWORD_LENGTH &&
ali@55	1746	!pswit[VERBOSE_SWITCH])
ali@55	1747	printf(" - not reporting duplicates");
ali@55	1748	printf("\n");
ali@55	1749	}
ali@55	1750	else
ali@55	1751	cnt_word++;
ali@55	1752	}
ali@55	1753	}
ali@55	1754	}
ali@55	1755	/* check the user's list of typos */
ali@55	1756	if (!istypo && usertypo_count)
ali@55	1757	for (i=0;i<usertypo_count;i++)
ali@55	1758	if (!strcmp(testword,usertypo[i]))
ali@55	1759	{
ali@55	1760	if (pswit[ECHO_SWITCH])
ali@55	1761	printf("\n%s\n",aline);
ali@55	1762	if (!pswit[OVERVIEW_SWITCH])
ali@55	1763	printf(" Line %ld column %d - "
ali@55	1764	"Query possible scanno %s\n",
ali@55	1765	linecnt,(int)(wordstart-aline)+2,inword);
ali@55	1766	}
ali@55	1767	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55	1768	{
ali@55	1769	/* In paranoid mode, query all 0 and 1 standing alone. */
ali@55	1770	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@55	1771	{
ali@55	1772	if (pswit[ECHO_SWITCH])
ali@55	1773	printf("\n%s\n",aline);
ali@55	1774	if (!pswit[OVERVIEW_SWITCH])
ali@55	1775	printf(" Line %ld column %d - Query standalone %s\n",
ali@55	1776	linecnt,(int)(wordstart-aline)+2,inword);
ali@55	1777	else
ali@55	1778	cnt_word++;
ali@55	1779	}
ali@55	1780	}
ali@55	1781	}
ali@55	1782	}
ali@55	1783
/*
 * struct parities:
 *
 * Running quote parity for the paragraph being scanned. Each field is
 * flipped between 0 and 1 by check_for_misspaced_punctuation() whenever
 * it counts a quote of that kind, so a value of 1 means an odd number
 * of such quotes has been counted so far. procfile() zeroes the whole
 * structure when it starts a new paragraph.
 */
struct parities {
    int dquote;     /* parity of double quotes counted so far */
    int squote;     /* parity of single quotes counted so far */
};
ali@56	1787
ali@56	1788	/*
ali@56	1789	* check_for_misspaced_punctuation:
ali@56	1790	*
ali@56	1791	* Look for added or missing spaces around punctuation and quotes.
ali@56	1792	* If there is a punctuation character like ! with no space on
ali@56	1793	* either side, suspect a missing!space. If there are spaces on
ali@56	1794	* both sides , assume a typo. If we see a double quote with no
ali@56	1795	* space or punctuation on either side of it, assume unspaced
ali@56	1796	* quotes "like"this.
ali@56	1797	*/
ali@56	1798	void check_for_misspaced_punctuation(const char *aline,
ali@56	1799	struct parities *parities,int isemptyline)
ali@56	1800	{
ali@56	1801	int i,llen,isacro,isellipsis;
ali@56	1802	const char *s;
ali@56	1803	llen=strlen(aline);
ali@56	1804	for (i=1;i<llen;i++)
ali@56	1805	{
ali@56	1806	/* For each character in the line after the first. */
ali@56	1807	if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
ali@56	1808	{
ali@56	1809	/* we need to suppress warnings for acronyms like M.D. */
ali@56	1810	isacro=0;
ali@56	1811	/* we need to suppress warnings for ellipsis . . . */
ali@56	1812	isellipsis=0;
ali@56	1813	/* if there are letters on both sides of it or ... */
ali@56	1814	if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) \|\|
ali@56	1815	gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
ali@56	1816	{
ali@56	1817	/* ...if it's strict punctuation followed by an alpha */
ali@56	1818	if (aline[i]=='.')
ali@56	1819	{
ali@56	1820	if (i>2 && aline[i-2]=='.')
ali@56	1821	isacro=1;
ali@56	1822	if (i+2<llen && aline[i+2]=='.')
ali@56	1823	isacro=1;
ali@56	1824	}
ali@56	1825	if (!isacro)
ali@56	1826	{
ali@56	1827	if (pswit[ECHO_SWITCH])
ali@56	1828	printf("\n%s\n",aline);
ali@56	1829	if (!pswit[OVERVIEW_SWITCH])
ali@56	1830	printf(" Line %ld column %d - Missing space?\n",
ali@56	1831	linecnt,i+1);
ali@56	1832	else
ali@56	1833	cnt_punct++;
ali@56	1834	}
ali@56	1835	}
ali@56	1836	if (aline[i-1]==CHAR_SPACE &&
ali@56	1837	(aline[i+1]==CHAR_SPACE \|\| aline[i+1]==0))
ali@56	1838	{
ali@56	1839	/*
ali@56	1840	* If there are spaces on both sides,
ali@56	1841	* or space before and end of line.
ali@56	1842	*/
ali@56	1843	if (aline[i]=='.')
ali@56	1844	{
ali@56	1845	if (i>2 && aline[i-2]=='.')
ali@56	1846	isellipsis=1;
ali@56	1847	if (i+2<llen && aline[i+2]=='.')
ali@56	1848	isellipsis=1;
ali@56	1849	}
ali@56	1850	if (!isemptyline && !isellipsis)
ali@56	1851	{
ali@56	1852	if (pswit[ECHO_SWITCH])
ali@56	1853	printf("\n%s\n",aline);
ali@56	1854	if (!pswit[OVERVIEW_SWITCH])
ali@56	1855	printf(" Line %ld column %d - "
ali@56	1856	"Spaced punctuation?\n",linecnt,i+1);
ali@56	1857	else
ali@56	1858	cnt_punct++;
ali@56	1859	}
ali@56	1860	}
ali@56	1861	}
ali@56	1862	}
ali@56	1863	/* Split out the characters that CANNOT be preceded by space. */
ali@56	1864	llen=strlen(aline);
ali@56	1865	for (i=1;i<llen;i++)
ali@56	1866	{
ali@56	1867	/* for each character in the line after the first */
ali@56	1868	if (strchr("?!,;:",aline[i]))
ali@56	1869	{
ali@56	1870	/* if it's punctuation that _cannot_ have a space before it */
ali@56	1871	if (aline[i-1]==CHAR_SPACE && !isemptyline &&
ali@56	1872	aline[i+1]!=CHAR_SPACE)
ali@56	1873	{
ali@56	1874	/*
ali@56	1875	* If aline[i+1) DOES == space,
ali@56	1876	* it was already reported just above.
ali@56	1877	*/
ali@56	1878	if (pswit[ECHO_SWITCH])
ali@56	1879	printf("\n%s\n",aline);
ali@56	1880	if (!pswit[OVERVIEW_SWITCH])
ali@56	1881	printf(" Line %ld column %d - Spaced punctuation?\n",
ali@56	1882	linecnt,i+1);
ali@56	1883	else
ali@56	1884	cnt_punct++;
ali@56	1885	}
ali@56	1886	}
ali@56	1887	}
ali@56	1888	/*
ali@56	1889	* Special case " .X" where X is any alpha.
ali@56	1890	* This plugs a hole in the acronym code above.
ali@56	1891	* Inelegant, but maintainable.
ali@56	1892	*/
ali@56	1893	llen=strlen(aline);
ali@56	1894	for (i=1;i<llen;i++)
ali@56	1895	{
ali@56	1896	/* for each character in the line after the first */
ali@56	1897	if (aline[i]=='.')
ali@56	1898	{
ali@56	1899	/* if it's a period */
ali@56	1900	if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
ali@56	1901	{
ali@56	1902	/*
ali@56	1903	* If the period follows a space and
ali@56	1904	* is followed by a letter.
ali@56	1905	*/
ali@56	1906	if (pswit[ECHO_SWITCH])
ali@56	1907	printf("\n%s\n",aline);
ali@56	1908	if (!pswit[OVERVIEW_SWITCH])
ali@56	1909	printf(" Line %ld column %d - Spaced punctuation?\n",
ali@56	1910	linecnt,i+1);
ali@56	1911	else
ali@56	1912	cnt_punct++;
ali@56	1913	}
ali@56	1914	}
ali@56	1915	}
ali@56	1916	for (i=1;i<llen;i++)
ali@56	1917	{
ali@56	1918	/* for each character in the line after the first */
ali@56	1919	if (aline[i]==CHAR_DQUOTE)
ali@56	1920	{
ali@56	1921	if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
ali@56	1922	!strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] \|\|
ali@56	1923	!strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
ali@56	1924	{
ali@56	1925	if (pswit[ECHO_SWITCH])
ali@56	1926	printf("\n%s\n",aline);
ali@56	1927	if (!pswit[OVERVIEW_SWITCH])
ali@56	1928	printf(" Line %ld column %d - Unspaced quotes?\n",
ali@56	1929	linecnt,i+1);
ali@56	1930	else
ali@56	1931	cnt_punct++;
ali@56	1932	}
ali@56	1933	}
ali@56	1934	}
ali@56	1935	/* Check parity of quotes. */
ali@56	1936	for (s=aline;*s;s++)
ali@56	1937	{
ali@56	1938	if (*s==CHAR_DQUOTE)
ali@56	1939	{
ali@56	1940	parities->dquote=!parities->dquote;
ali@56	1941	if (!parities->dquote)
ali@56	1942	{
ali@56	1943	/* parity even */
ali@56	1944	if (!strchr("_-.'`/,;:!?)]} ",s[1]))
ali@56	1945	{
ali@56	1946	if (pswit[ECHO_SWITCH])
ali@56	1947	printf("\n%s\n",aline);
ali@56	1948	if (!pswit[OVERVIEW_SWITCH])
ali@56	1949	printf(" Line %ld column %d - "
ali@56	1950	"Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56	1951	else
ali@56	1952	cnt_punct++;
ali@56	1953	}
ali@56	1954	}
ali@56	1955	else
ali@56	1956	{
ali@56	1957	/* parity odd */
ali@56	1958	if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@56	1959	!strchr("_-/.'`([{$",s[1]) \|\| !s[1])
ali@56	1960	{
ali@56	1961	if (pswit[ECHO_SWITCH])
ali@56	1962	printf("\n%s\n",aline);
ali@56	1963	if (!pswit[OVERVIEW_SWITCH])
ali@56	1964	printf(" Line %ld column %d - "
ali@56	1965	"Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56	1966	else
ali@56	1967	cnt_punct++;
ali@56	1968	}
ali@56	1969	}
ali@56	1970	}
ali@56	1971	}
ali@56	1972	if (*aline==CHAR_DQUOTE)
ali@56	1973	{
ali@56	1974	if (strchr(",;:!?)]} ",aline[1]))
ali@56	1975	{
ali@56	1976	if (pswit[ECHO_SWITCH])
ali@56	1977	printf("\n%s\n",aline);
ali@56	1978	if (!pswit[OVERVIEW_SWITCH])
ali@56	1979	printf(" Line %ld column 1 - Wrongspaced quotes?\n",
ali@56	1980	linecnt);
ali@56	1981	else
ali@56	1982	cnt_punct++;
ali@56	1983	}
ali@56	1984	}
ali@56	1985	if (pswit[SQUOTE_SWITCH])
ali@56	1986	{
ali@56	1987	for (s=aline;*s;s++)
ali@56	1988	{
ali@56	1989	if ((s==CHAR_SQUOTE \|\| s==CHAR_OPEN_SQUOTE) &&
ali@56	1990	(s==aline \|\| s>aline && !gcisalpha(s[-1]) \|\|
ali@56	1991	!gcisalpha(s[1])))
ali@56	1992	{
ali@56	1993	parities->squote=!parities->squote;
ali@56	1994	if (!parities->squote)
ali@56	1995	{
ali@56	1996	/* parity even */
ali@56	1997	if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
ali@56	1998	{
ali@56	1999	if (pswit[ECHO_SWITCH])
ali@56	2000	printf("\n%s\n",aline);
ali@56	2001	if (!pswit[OVERVIEW_SWITCH])
ali@56	2002	printf(" Line %ld column %d - "
ali@56	2003	"Wrongspaced singlequotes?\n",
ali@56	2004	linecnt,(int)(s-aline)+1);
ali@56	2005	else
ali@56	2006	cnt_punct++;
ali@56	2007	}
ali@56	2008	}
ali@56	2009	else
ali@56	2010	{
ali@56	2011	/* parity odd */
ali@56	2012	if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@56	2013	!strchr("_-/\".'`",s[1]) \|\| !s[1])
ali@56	2014	{
ali@56	2015	if (pswit[ECHO_SWITCH])
ali@56	2016	printf("\n%s\n",aline);
ali@56	2017	if (!pswit[OVERVIEW_SWITCH])
ali@56	2018	printf(" Line %ld column %d - "
ali@56	2019	"Wrongspaced singlequotes?\n",
ali@56	2020	linecnt,(int)(s-aline)+1);
ali@56	2021	else
ali@56	2022	cnt_punct++;
ali@56	2023	}
ali@56	2024	}
ali@56	2025	}
ali@56	2026	}
ali@56	2027	}
ali@56	2028	}
ali@56	2029
ali@55	2030	/*
ali@57	2031	* check_for_double_punctuation:
ali@57	2032	*
ali@57	2033	* Look for double punctuation like ,. or ,,
ali@57	2034	* Thanks to DW for the suggestion!
ali@57	2035	* In books with references, ".," and ".;" are common
ali@57	2036	* e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57	2037	* OTOH, from my initial tests, there are also fairly
ali@57	2038	* common errors. What to do? Make these cases paranoid?
ali@57	2039	* ".," is the most common, so warnings->dotcomma is used
ali@57	2040	* to suppress detailed reporting if it occurs often.
ali@57	2041	*/
ali@57	2042	void check_for_double_punctuation(const char aline,struct warnings warnings)
ali@57	2043	{
ali@57	2044	int i,llen;
ali@57	2045	llen=strlen(aline);
ali@57	2046	for (i=0;i<llen;i++)
ali@57	2047	{
ali@57	2048	/* for each punctuation character in the line */
ali@57	2049	if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&
ali@57	2050	aline[i] && aline[i+1])
ali@57	2051	{
ali@57	2052	/* followed by punctuation, it's a query, unless . . . */
ali@57	2053	if (aline[i]==aline[i+1] && (aline[i]=='.' \|\| aline[i]=='?' \|\|
ali@57	2054	aline[i]=='!') \|\|
ali@57	2055	!warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' \|\|
ali@57	2056	warnings->isFrench && !strncmp(aline+i,",...",4) \|\|
ali@57	2057	warnings->isFrench && !strncmp(aline+i,"...,",4) \|\|
ali@57	2058	warnings->isFrench && !strncmp(aline+i,";...",4) \|\|
ali@57	2059	warnings->isFrench && !strncmp(aline+i,"...;",4) \|\|
ali@57	2060	warnings->isFrench && !strncmp(aline+i,":...",4) \|\|
ali@57	2061	warnings->isFrench && !strncmp(aline+i,"...:",4) \|\|
ali@57	2062	warnings->isFrench && !strncmp(aline+i,"!...",4) \|\|
ali@57	2063	warnings->isFrench && !strncmp(aline+i,"...!",4) \|\|
ali@57	2064	warnings->isFrench && !strncmp(aline+i,"?...",4) \|\|
ali@57	2065	warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@57	2066	{
ali@57	2067	if (warnings->isFrench && !strncmp(aline+i,",...",4) \|\|
ali@57	2068	warnings->isFrench && !strncmp(aline+i,"...,",4) \|\|
ali@57	2069	warnings->isFrench && !strncmp(aline+i,";...",4) \|\|
ali@57	2070	warnings->isFrench && !strncmp(aline+i,"...;",4) \|\|
ali@57	2071	warnings->isFrench && !strncmp(aline+i,":...",4) \|\|
ali@57	2072	warnings->isFrench && !strncmp(aline+i,"...:",4) \|\|
ali@57	2073	warnings->isFrench && !strncmp(aline+i,"!...",4) \|\|
ali@57	2074	warnings->isFrench && !strncmp(aline+i,"...!",4) \|\|
ali@57	2075	warnings->isFrench && !strncmp(aline+i,"?...",4) \|\|
ali@57	2076	warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@57	2077	i+=4;
ali@57	2078	; /* do nothing for .. !! and ?? which can be legit */
ali@57	2079	}
ali@57	2080	else
ali@57	2081	{
ali@57	2082	if (pswit[ECHO_SWITCH])
ali@57	2083	printf("\n%s\n",aline);
ali@57	2084	if (!pswit[OVERVIEW_SWITCH])
ali@57	2085	printf(" Line %ld column %d - Double punctuation?\n",
ali@57	2086	linecnt,i+1);
ali@57	2087	else
ali@57	2088	cnt_punct++;
ali@57	2089	}
ali@57	2090	}
ali@57	2091	}
ali@57	2092	}
ali@57	2093
ali@57	2094	/*
ali@41	2095	* procfile:
ali@41	2096	*
ali@41	2097	* Process one file.
ali@41	2098	*/
ali@41	2099	void procfile(char *filename)
ali@41	2100	{
ali@55	2101	const char s,t;
ali@41	2102	char parastart[81]; /* first line of current para */
ali@41	2103	FILE *infile;
ali@41	2104	struct first_pass_results *first_pass_results;
ali@42	2105	struct warnings *warnings;
ali@43	2106	struct counters counters={0};
ali@45	2107	struct line_properties last={0};
ali@56	2108	struct parities parities={0};
ali@43	2109	int isemptyline;
ali@43	2110	long squot,start_para_line;
ali@55	2111	signed int i,llen,isacro,isellipsis;
ali@55	2112	signed int isnewpara;
ali@41	2113	char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
ali@41	2114	cbrack_err[80],unders_err[80];
ali@41	2115	signed int enddash;
ali@45	2116	last.start=CHAR_SPACE;
ali@41	2117	dquote_err=squote_err=rbrack_err=cbrack_err=*sbrack_err=
ali@41	2118	unders_err=prevline=0;
ali@41	2119	linecnt=checked_linecnt=start_para_line=0;
ali@43	2120	squot=0;
ali@53	2121	i=llen=isacro=isellipsis=0;
ali@55	2122	isnewpara=enddash=0;
ali@41	2123	infile=fopen(filename,"rb");
ali@41	2124	if (!infile)
ali@41	2125	{
ali@41	2126	if (pswit[STDOUT_SWITCH])
ali@41	2127	fprintf(stdout,"bookloupe: cannot open %s\n",filename);
ali@41	2128	else
ali@41	2129	fprintf(stderr,"bookloupe: cannot open %s\n",filename);
ali@41	2130	exit(1);
ali@41	2131	}
ali@41	2132	fprintf(stdout,"\n\nFile: %s\n\n",filename);
ali@41	2133	first_pass_results=first_pass(infile);
ali@42	2134	warnings=report_first_pass(first_pass_results);
ali@42	2135	rewind(infile);
ali@40	2136	/*
ali@40	2137	* Here we go with the main pass. Hold onto yer hat!
ali@40	2138	* Re-init some variables we've dirtied.
ali@40	2139	*/
ali@43	2140	squot=linecnt=0;
ali@40	2141	while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
ali@40	2142	{
ali@0	2143	linecnt++;
ali@40	2144	if (linecnt==1)
ali@40	2145	isnewpara=1;
ali@40	2146	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
ali@40	2147	continue; // skip DP page separators completely
ali@41	2148	if (linecnt<first_pass_results->firstline \|\|
ali@41	2149	(first_pass_results->footerline>0 &&
ali@41	2150	linecnt>first_pass_results->footerline))
ali@40	2151	{
ali@40	2152	if (pswit[HEADER_SWITCH])
ali@40	2153	{
ali@40	2154	if (!strncmp(aline,"Title:",6))
ali@40	2155	printf(" %s\n",aline);
ali@40	2156	if (!strncmp(aline,"Author:",7))
ali@40	2157	printf(" %s\n",aline);
ali@40	2158	if (!strncmp(aline,"Release Date:",13))
ali@40	2159	printf(" %s\n",aline);
ali@40	2160	if (!strncmp(aline,"Edition:",8))
ali@40	2161	printf(" %s\n\n",aline);
ali@40	2162	}
ali@0	2163	continue; /* skip through the header */
ali@40	2164	}
ali@0	2165	checked_linecnt++;
ali@40	2166	s=aline;
ali@40	2167	/*
ali@40	2168	* If we are in a state of unbalanced quotes, and this line
ali@40	2169	* doesn't begin with a quote, output the stored error message.
ali@40	2170	* If the -P switch was used, print the warning even if the
ali@40	2171	* new para starts with quotes.
ali@40	2172	*/
ali@40	2173	t=s;
ali@40	2174	while (*t==' ')
ali@40	2175	t++;
ali@0	2176	if (*dquote_err)
ali@40	2177	if (*t!=CHAR_DQUOTE \|\| pswit[QPARA_SWITCH])
ali@40	2178	{
ali@40	2179	if (!pswit[OVERVIEW_SWITCH])
ali@40	2180	{
ali@40	2181	if (pswit[ECHO_SWITCH])
ali@40	2182	printf("\n%s\n",parastart);
ali@0	2183	printf(dquote_err);
ali@40	2184	}
ali@0	2185	else
ali@0	2186	cnt_dquot++;
ali@0	2187	}
ali@40	2188	if (*squote_err)
ali@40	2189	{
ali@40	2190	if (t!=CHAR_SQUOTE && t!=CHAR_OPEN_SQUOTE \|\|
ali@40	2191	pswit[QPARA_SWITCH] \|\| squot)
ali@40	2192	{
ali@40	2193	if (!pswit[OVERVIEW_SWITCH])
ali@40	2194	{
ali@40	2195	if (pswit[ECHO_SWITCH])
ali@40	2196	printf("\n%s\n",parastart);
ali@0	2197	printf(squote_err);
ali@40	2198	}
ali@0	2199	else
ali@0	2200	cnt_squot++;
ali@40	2201	}
ali@40	2202	squot=0;
ali@40	2203	}
ali@40	2204	if (*rbrack_err)
ali@40	2205	{
ali@40	2206	if (!pswit[OVERVIEW_SWITCH])
ali@40	2207	{
ali@40	2208	if (pswit[ECHO_SWITCH])
ali@40	2209	printf("\n%s\n",parastart);
ali@0	2210	printf(rbrack_err);
ali@40	2211	}
ali@0	2212	else
ali@0	2213	cnt_brack++;
ali@40	2214	}
ali@40	2215	if (*sbrack_err)
ali@40	2216	{
ali@40	2217	if (!pswit[OVERVIEW_SWITCH])
ali@40	2218	{
ali@40	2219	if (pswit[ECHO_SWITCH])
ali@40	2220	printf("\n%s\n",parastart);
ali@0	2221	printf(sbrack_err);
ali@40	2222	}
ali@0	2223	else
ali@0	2224	cnt_brack++;
ali@40	2225	}
ali@40	2226	if (*cbrack_err)
ali@40	2227	{
ali@40	2228	if (!pswit[OVERVIEW_SWITCH])
ali@40	2229	{
ali@40	2230	if (pswit[ECHO_SWITCH])
ali@40	2231	printf("\n%s\n",parastart);
ali@0	2232	printf(cbrack_err);
ali@40	2233	}
ali@0	2234	else
ali@0	2235	cnt_brack++;
ali@40	2236	}
ali@40	2237	if (*unders_err)
ali@40	2238	{
ali@40	2239	if (!pswit[OVERVIEW_SWITCH])
ali@40	2240	{
ali@40	2241	if (pswit[ECHO_SWITCH])
ali@40	2242	printf("\n%s\n",parastart);
ali@0	2243	printf(unders_err);
ali@40	2244	}
ali@0	2245	else
ali@0	2246	cnt_brack++;
ali@40	2247	}
ali@40	2248	dquote_err=squote_err=rbrack_err=cbrack_err=
ali@40	2249	sbrack_err=unders_err=0;
ali@43	2250	isemptyline=analyse_quotes(aline,&counters);
ali@40	2251	if (isnewpara && !isemptyline)
ali@40	2252	{
ali@40	2253	/* This line is the start of a new paragraph. */
ali@40	2254	start_para_line=linecnt;
ali@40	2255	/* Capture its first line in case we want to report it later. */
ali@40	2256	strncpy(parastart,aline,80);
ali@40	2257	parastart[79]=0;
ali@56	2258	memset(&parities,0,sizeof(parities)); /* restart the quote count */
ali@40	2259	s=aline;
ali@40	2260	while (!gcisalpha(s) && !gcisdigit(s) && *s)
ali@40	2261	s++;
ali@40	2262	if (s>='a' && s<='z')
ali@40	2263	{
ali@40	2264	/* and its first letter is lowercase */
ali@40	2265	if (pswit[ECHO_SWITCH])
ali@40	2266	printf("\n%s\n",aline);
ali@0	2267	if (!pswit[OVERVIEW_SWITCH])
ali@40	2268	printf(" Line %ld column %d - "
ali@40	2269	"Paragraph starts with lower-case\n",
ali@40	2270	linecnt,(int)(s-aline)+1);
ali@0	2271	else
ali@0	2272	cnt_punct++;
ali@40	2273	}
ali@40	2274	isnewpara=0; /* Signal the end of new para processing. */
ali@40	2275	}
ali@40	2276	/* Check for an em-dash broken at line end. */
ali@40	2277	if (enddash && *aline=='-')
ali@40	2278	{
ali@40	2279	if (pswit[ECHO_SWITCH])
ali@40	2280	printf("\n%s\n",aline);
ali@0	2281	if (!pswit[OVERVIEW_SWITCH])
ali@40	2282	printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@0	2283	else
ali@0	2284	cnt_punct++;
ali@40	2285	}
ali@40	2286	enddash=0;
ali@40	2287	for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
ali@40	2288	;
ali@40	2289	if (s>=aline && *s=='-')
ali@40	2290	enddash=1;
ali@40	2291	/*
ali@40	2292	* Check for invalid or questionable characters in the line
ali@40	2293	* Anything above 127 is invalid for plain ASCII, and
ali@40	2294	* non-printable control characters should also be flagged.
ali@40	2295	* Tabs should generally not be there.
ali@40	2296	*/
ali@40	2297	for (s=aline;*s;s++)
ali@40	2298	{
ali@40	2299	i=(unsigned char)*s;
ali@40	2300	if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
ali@40	2301	{
ali@40	2302	if (pswit[ECHO_SWITCH])
ali@40	2303	printf("\n%s\n",aline);
ali@0	2304	if (!pswit[OVERVIEW_SWITCH])
ali@40	2305	printf(" Line %ld column %d - Control character %d\n",
ali@40	2306	linecnt,(int)(s-aline)+1,i);
ali@0	2307	else
ali@0	2308	cnt_bin++;
ali@40	2309	}
ali@40	2310	}
ali@42	2311	if (warnings->bin)
ali@44	2312	check_for_odd_characters(aline,warnings,isemptyline);
ali@42	2313	if (warnings->longline)
ali@45	2314	check_for_long_line(aline);
ali@45	2315	if (warnings->shortline)
ali@45	2316	check_for_short_line(aline,&last);
ali@45	2317	last.blen=last.len;
ali@45	2318	last.len=strlen(aline);
ali@45	2319	last.start=aline[0];
ali@46	2320	check_for_starting_punctuation(aline);
ali@42	2321	if (warnings->dash)
ali@40	2322	{
ali@47	2323	check_for_spaced_emdash(aline);
ali@47	2324	check_for_spaced_dash(aline);
ali@40	2325	}
ali@48	2326	check_for_unmarked_paragraphs(aline);
ali@49	2327	check_for_jeebies(aline);
ali@50	2328	check_for_mta_from(aline);
ali@51	2329	check_for_orphan_character(aline);
ali@52	2330	check_for_pling_scanno(aline);
ali@53	2331	check_for_extra_period(aline,warnings);
ali@54	2332	check_for_following_punctuation(aline);
ali@55	2333	check_for_typos(aline,warnings);
ali@56	2334	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57	2335	check_for_double_punctuation(aline,warnings);
ali@40	2336	s=aline;
ali@40	2337	while (strstr(s," \" "))
ali@40	2338	{
ali@40	2339	if (pswit[ECHO_SWITCH])
ali@40	2340	printf("\n%s\n",aline);
ali@0	2341	if (!pswit[OVERVIEW_SWITCH])
ali@40	2342	printf(" Line %ld column %d - Spaced doublequote?\n",
ali@40	2343	linecnt,(int)(strstr(s," \" ")-aline+1));
ali@0	2344	else
ali@0	2345	cnt_punct++;
ali@40	2346	s=strstr(s," \" ")+2;
ali@40	2347	}
ali@40	2348	s=aline;
ali@40	2349	while (strstr(s," ' "))
ali@40	2350	{
ali@40	2351	if (pswit[ECHO_SWITCH])
ali@40	2352	printf("\n%s\n",aline);
ali@40	2353	if (!pswit[OVERVIEW_SWITCH])
ali@40	2354	printf(" Line %ld column %d - Spaced singlequote?\n",
ali@40	2355	linecnt,(int)(strstr(s," ' ")-aline+1));
ali@40	2356	else
ali@40	2357	cnt_punct++;
ali@40	2358	s=strstr(s," ' ")+2;
ali@40	2359	}
ali@40	2360	s=aline;
ali@40	2361	while (strstr(s," ` "))
ali@40	2362	{
ali@40	2363	if (pswit[ECHO_SWITCH])
ali@40	2364	printf("\n%s\n",aline);
ali@40	2365	if (!pswit[OVERVIEW_SWITCH])
ali@40	2366	printf(" Line %ld column %d - Spaced singlequote?\n",
ali@40	2367	linecnt,(int)(strstr(s," ` ")-aline+1));
ali@40	2368	else
ali@40	2369	cnt_punct++;
ali@40	2370	s=strstr(s," ` ")+2;
ali@40	2371	}
ali@40	2372	/* check special case of 'S instead of 's at end of word */
ali@40	2373	s=aline+1;
ali@40	2374	while (*s)
ali@40	2375	{
ali@40	2376	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
ali@40	2377	{
ali@40	2378	if (pswit[ECHO_SWITCH])
ali@40	2379	printf("\n%s\n",aline);
ali@0	2380	if (!pswit[OVERVIEW_SWITCH])
ali@40	2381	printf(" Line %ld column %d - Capital \"S\"?\n",
ali@40	2382	linecnt,(int)(s-aline+2));
ali@0	2383	else
ali@0	2384	cnt_punct++;
ali@40	2385	}
ali@40	2386	s++;
ali@40	2387	}
ali@40	2388	/*
ali@40	2389	* Now check special cases - start and end of line -
ali@40	2390	* for single and double quotes. Start is sometimes [sic]
ali@40	2391	* but better to query it anyway.
ali@40	2392	* While we're here, check for dash at end of line.
ali@40	2393	*/
ali@40	2394	llen=strlen(aline);
ali@40	2395	if (llen>1)
ali@40	2396	{
ali@40	2397	if (aline[llen-1]==CHAR_DQUOTE \|\| aline[llen-1]==CHAR_SQUOTE \|\|
ali@40	2398	aline[llen-1]==CHAR_OPEN_SQUOTE)
ali@40	2399	if (aline[llen-2]==CHAR_SPACE)
ali@40	2400	{
ali@40	2401	if (pswit[ECHO_SWITCH])
ali@40	2402	printf("\n%s\n",aline);
ali@0	2403	if (!pswit[OVERVIEW_SWITCH])
ali@40	2404	printf(" Line %ld column %d - Spaced quote?\n",
ali@40	2405	linecnt,llen);
ali@0	2406	else
ali@0	2407	cnt_punct++;
ali@40	2408	}
ali@40	2409	if ((aline[0]==CHAR_SQUOTE \|\| aline[0]==CHAR_OPEN_SQUOTE) &&
ali@40	2410	aline[1]==CHAR_SPACE)
ali@40	2411	{
ali@40	2412	if (pswit[ECHO_SWITCH])
ali@40	2413	printf("\n%s\n",aline);
ali@40	2414	if (!pswit[OVERVIEW_SWITCH])
ali@40	2415	printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
ali@40	2416	else
ali@40	2417	cnt_punct++;
ali@40	2418	}
ali@40	2419	/*
ali@40	2420	* Dash at end of line may well be legit - paranoid mode only
ali@40	2421	* and don't report em-dash at line-end.
ali@40	2422	*/
ali@42	2423	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@40	2424	{
ali@40	2425	for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@40	2426	;
ali@40	2427	if (aline[i]=='-' && aline[i-1]!='-')
ali@40	2428	{
ali@40	2429	if (pswit[ECHO_SWITCH])
ali@40	2430	printf("\n%s\n",aline);
ali@0	2431	if (!pswit[OVERVIEW_SWITCH])
ali@40	2432	printf(" Line %ld column %d - "
ali@40	2433	"Hyphen at end of line?\n",linecnt,i);
ali@40	2434	}
ali@40	2435	}
ali@40	2436	}
ali@40	2437	/*
ali@40	2438	* Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@40	2439	* If so, suspect a scanno like "a]most".
ali@40	2440	*/
ali@40	2441	llen=strlen(aline);
ali@40	2442	for (i=1;i<llen-1;i++)
ali@40	2443	{
ali@40	2444	/* for each bracket character in the line except 1st & last */
ali@40	2445	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
ali@40	2446	gcisalpha(aline[i+1]))
ali@40	2447	{
ali@40	2448	if (pswit[ECHO_SWITCH])
ali@40	2449	printf("\n%s\n",aline);
ali@0	2450	if (!pswit[OVERVIEW_SWITCH])
ali@40	2451	printf(" Line %ld column %d - Unspaced bracket?\n",
ali@40	2452	linecnt,i);
ali@0	2453	else
ali@0	2454	cnt_punct++;
ali@40	2455	}
ali@40	2456	}
ali@40	2457	llen=strlen(aline);
ali@42	2458	if (warnings->endquote)
ali@40	2459	{
ali@40	2460	for (i=1;i<llen;i++)
ali@40	2461	{
ali@40	2462	/* for each character in the line except 1st */
ali@40	2463	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@40	2464	{
ali@40	2465	if (pswit[ECHO_SWITCH])
ali@40	2466	printf("\n%s\n",aline);
ali@40	2467	if (!pswit[OVERVIEW_SWITCH])
ali@40	2468	printf(" Line %ld column %d - "
ali@40	2469	"endquote missing punctuation?\n",linecnt,i);
ali@40	2470	else
ali@40	2471	cnt_punct++;
ali@40	2472	}
ali@40	2473	}
ali@40	2474	}
ali@40	2475	/*
ali@40	2476	* Check for <HTML TAG>.
ali@40	2477	* If there is a < in the line, followed at some point
ali@40	2478	* by a > then we suspect HTML.
ali@40	2479	*/
ali@40	2480	if (strstr(aline,"<") && strstr(aline,">"))
ali@40	2481	{
ali@40	2482	i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
ali@40	2483	if (i>0)
ali@40	2484	{
ali@40	2485	strncpy(wrk,strstr(aline,"<"),i);
ali@40	2486	wrk[i]=0;
ali@40	2487	if (pswit[ECHO_SWITCH])
ali@40	2488	printf("\n%s\n",aline);
ali@0	2489	if (!pswit[OVERVIEW_SWITCH])
ali@40	2490	printf(" Line %ld column %d - HTML Tag? %s \n",
ali@40	2491	linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
ali@0	2492	else
ali@0	2493	cnt_html++;
ali@40	2494	}
ali@40	2495	}
ali@40	2496	/*
ali@40	2497	* Check for &symbol; HTML.
ali@40	2498	* If there is a & in the line, followed at
ali@40	2499	* some point by a ; then we suspect HTML.
ali@40	2500	*/
ali@40	2501	if (strstr(aline,"&") && strstr(aline,";"))
ali@40	2502	{
ali@40	2503	i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
ali@40	2504	for (s=strstr(aline,"&");s<strstr(aline,";");s++)
ali@40	2505	if (*s==CHAR_SPACE)
ali@40	2506	i=0; /* Don't report "Jones & Son;" */
ali@40	2507	if (i>0)
ali@40	2508	{
ali@40	2509	strncpy(wrk,strstr(aline,"&"),i);
ali@40	2510	wrk[i]=0;
ali@40	2511	if (pswit[ECHO_SWITCH])
ali@40	2512	printf("\n%s\n",aline);
ali@0	2513	if (!pswit[OVERVIEW_SWITCH])
ali@40	2514	printf(" Line %ld column %d - HTML symbol? %s \n",
ali@40	2515	linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
ali@0	2516	else
ali@0	2517	cnt_html++;
ali@40	2518	}
ali@40	2519	}
ali@40	2520	/*
ali@40	2521	* At end of paragraph, check for mismatched quotes.
ali@40	2522	* We don't want to report an error immediately, since it is a
ali@40	2523	* common convention to omit the quotes at end of paragraph if
ali@40	2524	* the next paragraph is a continuation of the same speaker.
ali@40	2525	* Where this is the case, the next para should begin with a
ali@40	2526	* quote, so we store the warning message and only display it
ali@40	2527	* at the top of the next iteration if the new para doesn't
ali@40	2528	* start with a quote.
ali@40	2529	* The -p switch overrides this default, and warns of unclosed
ali@40	2530	* quotes on _every_ paragraph, whether the next begins with a
ali@40	2531	* quote or not.
ali@40	2532	*/
ali@40	2533	if (isemptyline)
ali@40	2534	{
ali@40	2535	/* end of para - add up the totals */
ali@43	2536	if (counters.quot%2)
ali@40	2537	sprintf(dquote_err," Line %ld - Mismatched quotes\n",
ali@40	2538	linecnt);
ali@43	2539	if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
ali@43	2540	counters.open_single_quote!=counters.close_single_quote)
ali@40	2541	sprintf(squote_err," Line %ld - Mismatched singlequotes?\n",
ali@40	2542	linecnt);
ali@43	2543	if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
ali@43	2544	counters.open_single_quote!=counters.close_single_quote &&
ali@43	2545	counters.open_single_quote!=counters.close_single_quote+1)
ali@40	2546	/*
ali@40	2547	* Flag it to be noted regardless of the
ali@40	2548	* first char of the next para.
ali@40	2549	*/
ali@40	2550	squot=1;
ali@43	2551	if (counters.r_brack)
ali@40	2552	sprintf(rbrack_err," Line %ld - "
ali@40	2553	"Mismatched round brackets?\n",linecnt);
ali@43	2554	if (counters.s_brack)
ali@40	2555	sprintf(sbrack_err," Line %ld - "
ali@40	2556	"Mismatched square brackets?\n",linecnt);
ali@43	2557	if (counters.c_brack)
ali@40	2558	sprintf(cbrack_err," Line %ld - "
ali@40	2559	"Mismatched curly brackets?\n",linecnt);
ali@43	2560	if (counters.c_unders%2)
ali@40	2561	sprintf(unders_err," Line %ld - Mismatched underscores?\n",
ali@40	2562	linecnt);
ali@43	2563	memset(&counters,0,sizeof(counters));
ali@40	2564	/* let the next iteration know that it's starting a new para */
ali@40	2565	isnewpara=1;
ali@40	2566	}
ali@40	2567	/*
ali@40	2568	* Check for omitted punctuation at end of paragraph by working back
ali@40	2569	* through prevline. DW.
ali@40	2570	* Need to check this only for "normal" paras.
ali@40	2571	* So what is a "normal" para?
ali@40	2572	* Not normal if one-liner (chapter headings, etc.)
ali@40	2573	* Not normal if doesn't contain at least one locase letter
ali@40	2574	* Not normal if starts with space
ali@40	2575	*/
ali@40	2576	if (isemptyline)
ali@40	2577	{
ali@40	2578	/* end of para */
ali@40	2579	for (s=prevline,i=0;*s && !i;s++)
ali@0	2580	if (gcisletter(*s))
ali@40	2581	/* use i to indicate the presence of a letter on the line */
ali@40	2582	i=1;
ali@40	2583	/*
ali@40	2584	* This next "if" is a problem.
ali@40	2585	* If we say "start_para_line <= linecnt - 1", that includes
ali@40	2586	* one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@40	2587	* If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@40	2588	* misses genuine one-line paragraphs.
ali@40	2589	*/
ali@45	2590	if (i && last.blen>2 && start_para_line<linecnt-1 &&
ali@40	2591	*prevline>CHAR_SPACE)
ali@40	2592	{
ali@40	2593	for (i=strlen(prevline)-1;
ali@40	2594	(prevline[i]==CHAR_DQUOTE \|\| prevline[i]==CHAR_SQUOTE) &&
ali@40	2595	prevline[i]>CHAR_SPACE && i>0;
ali@40	2596	i--)
ali@40	2597	;
ali@40	2598	for (;i>0;i--)
ali@40	2599	{
ali@40	2600	if (gcisalpha(prevline[i]))
ali@40	2601	{
ali@40	2602	if (pswit[ECHO_SWITCH])
ali@40	2603	printf("\n%s\n",prevline);
ali@0	2604	if (!pswit[OVERVIEW_SWITCH])
ali@40	2605	printf(" Line %ld column %d - "
ali@40	2606	"No punctuation at para end?\n",
ali@40	2607	linecnt-1,strlen(prevline));
ali@0	2608	else
ali@0	2609	cnt_punct++;
ali@0	2610	break;
ali@40	2611	}
ali@40	2612	if (strchr("-.:!([{?}])",prevline[i]))
ali@0	2613	break;
ali@40	2614	}
ali@40	2615	}
ali@40	2616	}
ali@40	2617	strcpy(prevline,aline);
ali@0	2618	}
ali@40	2619	fclose(infile);
ali@0	2620	if (!pswit[OVERVIEW_SWITCH])
ali@40	2621	for (i=0;i<MAX_QWORD;i++)
ali@0	2622	if (dupcnt[i])
ali@40	2623	printf("\nNote: Queried word %s was duplicated %d time%s\n",
ali@40	2624	qword[i],dupcnt[i],"s");
ali@0	2625	}
ali@0	2626
ali@40	2627	/*
ali@40	2628	* flgets:
ali@40	2629	*
ali@40	2630	* Get one line from the input stream, checking for
ali@40	2631	* the existence of exactly one CR/LF line-end per line.
ali@40	2632	*
ali@40	2633	* Returns: a pointer to the line.
ali@40	2634	*/
ali@40	2635	char flgets(char theline,int maxlen,FILE *thefile,long lcnt)
ali@0	2636	{
ali@0	2637	char c;
ali@40	2638	int len,isCR,cint;
ali@40	2639	*theline=0;
ali@40	2640	len=isCR=0;
ali@40	2641	c=cint=fgetc(thefile);
ali@40	2642	do
ali@40	2643	{
ali@40	2644	if (cint==EOF)
ali@40	2645	return NULL;
ali@40	2646	/* either way, it's end of line */
ali@40	2647	if (c==10)
ali@40	2648	{
ali@0	2649	if (isCR)
ali@0	2650	break;
ali@40	2651	else
ali@40	2652	{
ali@40	2653	/* Error - a LF without a preceding CR */
ali@40	2654	if (pswit[LINE_END_SWITCH])
ali@40	2655	{
ali@40	2656	if (pswit[ECHO_SWITCH])
ali@40	2657	printf("\n%s\n",theline);
ali@0	2658	if (!pswit[OVERVIEW_SWITCH])
ali@40	2659	printf(" Line %ld - No CR?\n",lcnt);
ali@0	2660	else
ali@0	2661	cnt_lineend++;
ali@40	2662	}
ali@0	2663	break;
ali@40	2664	}
ali@40	2665	}
ali@40	2666	if (c==13)
ali@40	2667	{
ali@40	2668	if (isCR)
ali@40	2669	{
ali@40	2670	/* Error - two successive CRs */
ali@40	2671	if (pswit[LINE_END_SWITCH])
ali@40	2672	{
ali@40	2673	if (pswit[ECHO_SWITCH])
ali@40	2674	printf("\n%s\n",theline);
ali@0	2675	if (!pswit[OVERVIEW_SWITCH])
ali@40	2676	printf(" Line %ld - Two successive CRs?\n",lcnt);
ali@0	2677	else
ali@0	2678	cnt_lineend++;
ali@40	2679	}
ali@40	2680	}
ali@40	2681	isCR=1;
ali@40	2682	}
ali@40	2683	else
ali@40	2684	{
ali@40	2685	if (pswit[LINE_END_SWITCH] && isCR)
ali@40	2686	{
ali@40	2687	if (pswit[ECHO_SWITCH])
ali@40	2688	printf("\n%s\n",theline);
ali@0	2689	if (!pswit[OVERVIEW_SWITCH])
ali@40	2690	printf(" Line %ld column %d - CR without LF?\n",
ali@40	2691	lcnt,len+1);
ali@0	2692	else
ali@0	2693	cnt_lineend++;
ali@40	2694	}
ali@40	2695	theline[len]=c;
ali@40	2696	len++;
ali@40	2697	theline[len]=0;
ali@40	2698	isCR=0;
ali@40	2699	}
ali@40	2700	c=cint=fgetc(thefile);
ali@40	2701	} while(len<maxlen);
ali@0	2702	if (pswit[MARKUP_SWITCH])
ali@0	2703	postprocess_for_HTML(theline);
ali@0	2704	if (pswit[DP_SWITCH])
ali@0	2705	postprocess_for_DP(theline);
ali@40	2706	return theline;
ali@0	2707	}
ali@0	2708
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it
 * contains a mixture of alpha and digits. Generally, this is an
 * error, but may not be for cases like 4th or L5 12s. 3d.
 *
 * Accepted (not queried) forms:
 *   - leading digits followed by st, rd, nd, th, optionally with a
 *     trailing "s" or "ly" (21st, 4ths, 1stly), in either case;
 *   - leading digits followed by a single l, L, s or d (12l. 3s. 11d.);
 *   - a capital L followed only by digits (L500).
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    /* Ordinal suffixes that may legitimately follow a run of digits. */
    static char *ordinal[]={
        "st","rd","nd","th",
        "sts","rds","nds","ths",
        "stly","rdly","ndly","thly"
    };
    int wehaveadigit=0,wehavealetter=0,query=0;
    int firstdigits,wl;
    size_t k;
    char *s,*suffix;
    for (s=checkword;*s;s++)
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    if (!wehaveadigit || !wehavealetter)
        return 0;
    /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
    query=1;
    wl=(int)strlen(checkword);
    for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
        ;
    suffix=checkword+firstdigits;
    /*
     * digits, ending in an ordinal suffix of either case. matchword()
     * only succeeds when the lengths agree, so the whole remainder of
     * the word after the leading digits has to be the suffix.
     */
    for (k=0;k<sizeof(ordinal)/sizeof(ordinal[0]);k++)
        if (matchword(suffix,ordinal[k]))
            query=0;
    /* digits, ending in l, L, s or d */
    if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
      checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
        query=0;
    /*
     * L at the start of a number, representing British pounds, like L500.
     * We know the current word is mixed. If the first letter is L, there
     * must be at least one digit following. If both digits and letters
     * follow, we have a genuine error, else we have a capital L followed
     * by digits, and we accept that as a non-error.
     */
    if (checkword[0]=='L' && !mixdigit(checkword+1))
        query=0;
    return query;
}
ali@0	2765
ali@40	2766	/*
ali@40	2767	* getaword:
ali@40	2768	*
ali@40	2769	* Extracts the first/next "word" from the line, and puts
ali@40	2770	* it into "thisword". A word is defined as one English word unit--or
ali@40	2771	* at least that's the aim.
ali@40	2772	*
ali@40	2773	* Returns: a pointer to the position in the line where we will start
ali@40	2774	* looking for the next word.
ali@40	2775	*/
ali@54	2776	const char getaword(const char fromline,char *thisword)
ali@0	2777	{
ali@40	2778	int i,wordlen;
ali@54	2779	const char *s;
ali@40	2780	wordlen=0;
ali@40	2781	for (;!gcisdigit(fromline) && !gcisalpha(fromline) && *fromline;
ali@40	2782	fromline++)
ali@40	2783	;
ali@40	2784	/*
ali@40	2785	* Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40	2786	* Especially yucky is the case of L1,000
ali@40	2787	* This section looks for a pattern of characters including a digit
ali@40	2788	* followed by a comma or period followed by one or more digits.
ali@40	2789	* If found, it returns this whole pattern as a word; otherwise we discard
ali@40	2790	* the results and resume our normal programming.
ali@40	2791	*/
ali@40	2792	s=fromline;
ali@40	2793	for (;(gcisdigit(s) \|\| gcisalpha(s) \|\| s==',' \|\| s=='.') &&
ali@40	2794	wordlen<MAXWORDLEN;s++)
ali@40	2795	{
ali@40	2796	thisword[wordlen]=*s;
ali@0	2797	wordlen++;
ali@40	2798	}
ali@40	2799	thisword[wordlen]=0;
ali@40	2800	for (i=1;i<wordlen-1;i++)
ali@40	2801	{
ali@40	2802	if (thisword[i]=='.' \|\| thisword[i]==',')
ali@40	2803	{
ali@40	2804	if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
ali@40	2805	{
ali@40	2806	fromline=s;
ali@40	2807	return fromline;
ali@40	2808	}
ali@40	2809	}
ali@40	2810	}
ali@0	2811	/* we didn't find a punctuated number - do the regular getword thing */
ali@40	2812	wordlen=0;
ali@40	2813	for (;(gcisdigit(fromline) \|\| gcisalpha(fromline) \|\| *fromline=='\'') &&
ali@40	2814	wordlen<MAXWORDLEN;fromline++)
ali@40	2815	{
ali@40	2816	thisword[wordlen]=*fromline;
ali@0	2817	wordlen++;
ali@40	2818	}
ali@40	2819	thisword[wordlen]=0;
ali@40	2820	return fromline;
ali@0	2821	}
ali@0	2822
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * Returns: 1 if the two strings have the same length and are equal when
 * case is ignored, 0 otherwise.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /* <ctype.h> functions need a value representable as unsigned char */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0;
    }
    return 1;
}
ali@0	2839
/*
 * lowerit:
 *
 * Lowercase the line in place. Only the letters A-Z are changed; every
 * other character, including 8-bit ones, is left alone.
 */
void lowerit(char *theline)
{
    for (;*theline;theline++)
        if (*theline>='A' && *theline<='Z')
            *theline+='a'-'A';
}
ali@0	2852
/*
 * isroman:
 *
 * Is this (lowercase) word a Roman Numeral?
 *
 * It doesn't actually validate that the number is a valid Roman Numeral--for
 * example it will pass mxxxxxxxxxx as a valid Roman Numeral, but that's not
 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
 * Allow any number of m, an optional d, an optional cm or cd, any number of
 * optional c, an optional xl or an optional xc, an optional l, any number
 * of optional x, an optional ix or iv, an optional v and any number of
 * optional i.
 *
 * Returns: 1 if the whole word was consumed by that pattern, 0 otherwise
 * (including for a NULL or empty word).
 */
int isroman(char *t)
{
    if (!t || !*t)
        return 0;
    while (*t=='m')
        t++;
    if (*t=='d')
        t++;
    if (*t=='c' && t[1]=='m')
        t+=2;
    if (*t=='c' && t[1]=='d')
        t+=2;
    while (*t=='c')
        t++;
    if (*t=='x' && t[1]=='l')
        t+=2;
    if (*t=='x' && t[1]=='c')
        t+=2;
    if (*t=='l')
        t++;
    while (*t=='x')
        t++;
    if (*t=='i' && t[1]=='x')
        t+=2;
    if (*t=='i' && t[1]=='v')
        t+=2;
    if (*t=='v')
        t++;
    while (*t=='i')
        t++;
    /* Anything left over means it was not a numeral after all. */
    return !*t;
}
ali@0	2901
/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * The standard function treats 8-bit accented characters as word breaks,
 * so a word containing them is seen as several fragments and errors are
 * over-reported. gcisalpha() also accepts the accented letters of the
 * CP1252 (Windows) and ISO-8859-1 character sets, which are the most
 * common PG 8-bit types.
 *
 * Returns: 1 if c counts as a letter, 0 otherwise.
 */
int gcisalpha(unsigned char c)
{
    if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
        return 1;
    if (c>=192)
    {
        /* Upper range: everything is a letter bar a few code points. */
        switch (c)
        {
        case 208: case 215: case 222:
        case 240: case 247: case 254:
            return 0;
        default:
            return 1;
        }
    }
    /* Below 192 only a handful of 8-bit code points are accepted. */
    switch (c)
    {
    case 140: case 142: case 156: case 158: case 159:
        return 1;
    default:
        return 0;
    }
}
ali@0	2926
/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts.
 *
 * Returns: 1 for the characters '0' to '9', 0 for anything else.
 */
int gcisdigit(unsigned char c)
{
    if (c<'0')
        return 0;
    if (c>'9')
        return 0;
    return 1;
}
ali@0	2936
/*
 * gcisletter:
 *
 * A letter test that doesn't get confused in 8-bit texts: A-Z, a-z and
 * every code from 192 upwards count as letters.
 * NB: this is ISO-8859-1-specific.
 *
 * Returns: 1 if c counts as a letter, 0 otherwise.
 */
int gcisletter(unsigned char c)
{
    if (c>=192)
        return 1;
    if (c>='a')
        return c<='z';
    return c>='A' && c<='Z';
}
ali@0	2947
/*
 * gcstrchr:
 *
 * Wraps strchr to return NULL if the character being searched for is zero
 * (plain strchr would return a pointer to the terminating NUL instead).
 *
 * Returns: a pointer to the first occurrence of c in s, or NULL if c is
 * zero or does not occur.
 */
char *gcstrchr(char *s,char c)
{
    if (!c)
        return NULL;
    return strchr(s,c);
}
ali@0	2959
ali@40	2960	/*
ali@40	2961	* postprocess_for_DP:
ali@40	2962	*
ali@40	2963	* Invoked with the -d switch from flgets().
ali@40	2964	* It simply "removes" from the line a hard-coded set of common
ali@40	2965	* DP-specific tags, so that the line passed to the main routine has
ali@40	2966	* been pre-cleaned of DP markup.
ali@40	2967	*/
ali@0	2968	void postprocess_for_DP(char *theline)
ali@0	2969	{
ali@40	2970	char s,t;
ali@0	2971	int i;
ali@0	2972	if (!*theline)
ali@0	2973	return;
ali@40	2974	for (i=0;*DPmarkup[i];i++)
ali@40	2975	{
ali@40	2976	s=strstr(theline,DPmarkup[i]);
ali@40	2977	while (s)
ali@40	2978	{
ali@40	2979	t=s+strlen(DPmarkup[i]);
ali@40	2980	while (*t)
ali@40	2981	{
ali@40	2982	s=t;
ali@40	2983	t++;
ali@40	2984	s++;
ali@40	2985	}
ali@40	2986	*s=0;
ali@40	2987	s=strstr(theline,DPmarkup[i]);
ali@40	2988	}
ali@40	2989	}
ali@0	2990	}
ali@0	2991
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Cleans the line in place: losemarkup() is called repeatedly, but only
 * when the line contains both a "<" and a ">", and then loseentities()
 * is called repeatedly. Each loop runs until the helper reports that it
 * had nothing more to do.
 */
void postprocess_for_HTML(char *theline)
{
    int has_open=strstr(theline,"<")!=NULL;
    int has_both=has_open && strstr(theline,">")!=NULL;
    if (has_both)
    {
        for (;;)
            if (!losemarkup(theline))
                break;
    }
    for (;;)
        if (!loseentities(theline))
            break;
}
ali@0	3009
ali@0	3010	char losemarkup(char theline)
ali@0	3011	{
ali@40	3012	char s,t;
ali@0	3013	int i;
ali@0	3014	if (!*theline)
ali@40	3015	return NULL;
ali@40	3016	s=strstr(theline,"<");
ali@40	3017	t=strstr(theline,">");
ali@40	3018	if (!s \|\| !t)
ali@40	3019	return NULL;
ali@40	3020	for (i=0;*markup[i];i++)
ali@40	3021	if (!tagcomp(s+1,markup[i]))
ali@40	3022	{
ali@40	3023	if (!t[1])
ali@40	3024	{
ali@40	3025	*s=0;
ali@40	3026	return s;
ali@40	3027	}
ali@40	3028	else if (t>s)
ali@40	3029	{
ali@40	3030	strcpy(s,t+1);
ali@40	3031	return s;
ali@40	3032	}
ali@0	3033	}
ali@40	3034	/* It's an unrecognized <xxx>. */
ali@40	3035	return NULL;
ali@0	3036	}
ali@0	3037
ali@0	3038	char loseentities(char theline)
ali@0	3039	{
ali@0	3040	int i;
ali@40	3041	char s,t;
ali@0	3042	if (!*theline)
ali@40	3043	return NULL;
ali@40	3044	for (i=0;*entities[i].htmlent;i++)
ali@40	3045	{
ali@40	3046	s=strstr(theline,entities[i].htmlent);
ali@40	3047	if (s)
ali@40	3048	{
ali@40	3049	t=malloc((size_t)strlen(s));
ali@40	3050	if (!t)
ali@40	3051	return NULL;
ali@40	3052	strcpy(t,s+strlen(entities[i].htmlent));
ali@40	3053	strcpy(s,entities[i].textent);
ali@40	3054	strcat(s,t);
ali@0	3055	free(t);
ali@40	3056	return theline;
ali@40	3057	}
ali@40	3058	}
ali@40	3059	for (i=0;*entities[i].htmlnum;i++)
ali@40	3060	{
ali@40	3061	s=strstr(theline,entities[i].htmlnum);
ali@40	3062	if (s)
ali@40	3063	{
ali@40	3064	t=malloc((size_t)strlen(s));
ali@40	3065	if (!t)
ali@40	3066	return NULL;
ali@40	3067	strcpy(t,s+strlen(entities[i].htmlnum));
ali@40	3068	strcpy(s,entities[i].textent);
ali@40	3069	strcat(s,t);
ali@0	3070	free(t);
ali@40	3071	return theline;
ali@40	3072	}
ali@40	3073	}
ali@40	3074	return NULL;
ali@0	3075	}
ali@0	3076
/*
 * tagcomp:
 *
 * Case-insensitive comparison of the start of a tag (the text following
 * a "<") against a known tag name. A leading slash on strin is ignored,
 * so closing tags match too. The comparison stops at the end of whichever
 * string is shorter, so only that common prefix has to agree.
 *
 * Returns: 0 if they match, 1 if they differ.
 */
int tagcomp(char *strin,char *basetag)
{
    char *s=basetag,*t=strin;
    if (*t=='/')
        t++; /* ignore a slash */
    while (*s && *t)
    {
        /* <ctype.h> functions need a value representable as unsigned char */
        if (tolower((unsigned char)*s)!=tolower((unsigned char)*t))
            return 1;
        s++;
        t++;
    }
    return 0;
}
ali@0	3093
ali@40	3094	void proghelp()
ali@0	3095	{
ali@40	3096	fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40	3097	fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
ali@40	3098	fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
ali@40	3099	fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40	3100	"For details, read the file COPYING.\n",stderr);
ali@40	3101	fputs("This is Free Software; "
ali@40	3102	"you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40	3103	fputs("read the file COPYING for details.\n\n",stderr);
ali@40	3104	fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
ali@40	3105	fputs(" where -s checks single quotes, -e suppresses echoing lines, "
ali@40	3106	"-t checks typos\n",stderr);
ali@40	3107	fputs(" -x (paranoid) switches OFF -t and extra checks, "
ali@40	3108	"-l turns OFF line-end checks\n",stderr);
ali@40	3109	fputs(" -o just displays overview without detail, "
ali@40	3110	"-h echoes header fields\n",stderr);
ali@40	3111	fputs(" -v (verbose) unsuppresses duplicate reporting, "
ali@40	3112	"-m suppresses markup\n",stderr);
ali@0	3113	fputs(" -d ignores DP-specific markup,\n",stderr);
ali@40	3114	fputs(" -u uses a file gutcheck.typ to query user-defined "
ali@40	3115	"possible typos\n",stderr);
ali@40	3116	fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
ali@0	3117	fputs("\n",stderr);
ali@40	3118	fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
ali@40	3119	stderr);
ali@40	3120	fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40	3121	"non-ASCII\n",stderr);
ali@40	3122	fputs("characters like accented letters, "
ali@40	3123	"lines longer than 75 or shorter than 55,\n",stderr);
ali@40	3124	fputs("unbalanced quotes or brackets, "
ali@40	3125	"a variety of badly formatted punctuation, \n",stderr);
ali@40	3126	fputs("HTML tags, some likely typos. "
ali@40	3127	"It is NOT a substitute for human judgement.\n",stderr);
ali@0	3128	fputs("\n",stderr);
ali@0	3129	}

author	ali <ali@juiblex.co.uk>
	Sun May 26 19:28:31 2013 +0100 (2013-05-26)
changeset 57	1e89f47e56df
parent 56	8ade5460e220
child 58	b3385bfb28ac
permissions	-rw-r--r--