bookloupe: bookloupe/bookloupe.c@1016349e619f (annotated)

ali@0	1	/*************************************************************************/
ali@40	2	/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
ali@68	3	/* */
ali@68	4	/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
ali@68	5	/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
ali@68	6	/* */
ali@0	7	/* This program is free software; you can redistribute it and/or modify */
ali@0	8	/* it under the terms of the GNU General Public License as published by */
ali@0	9	/* the Free Software Foundation; either version 2 of the License, or */
ali@68	10	/* (at your option) any later version. */
ali@68	11	/* */
ali@0	12	/* This program is distributed in the hope that it will be useful, */
ali@68	13	/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
ali@68	14	/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
ali@68	15	/* GNU General Public License for more details. */
ali@68	16	/* */
ali@68	17	/* You should have received a copy of the GNU General Public License */
ali@68	18	/* along with this program. If not, see <http://www.gnu.org/licenses/>. */
ali@0	19	/*************************************************************************/
ali@0	20
ali@0	21	#include <stdio.h>
ali@0	22	#include <stdlib.h>
ali@0	23	#include <string.h>
ali@0	24	#include <ctype.h>
ali@69	25	#include <glib.h>
ali@69	26	#include <bl/bl.h>
ali@0	27
ali@69	28	gchar *prevline;
ali@0	29
ali@40	30	/* Common typos. */
/* The list is terminated by an empty string. */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
ali@0	63
ali@69	64	GTree *usertypo;
ali@0	65
ali@40	66	/* Common abbreviations and other OK words not to query as typos. */
/* The list is terminated by an empty string. */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
ali@0	73
ali@40	74	/* Common abbreviations that cause otherwise unexplained periods. */
/* The list is terminated by an empty string. */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
ali@0	79
ali@40	80	/*
ali@40	81	* Two-Letter combinations that rarely if ever start words,
ali@40	82	* but are common scannos or otherwise common letter combinations.
ali@40	83	*/
/* The list is terminated by an empty string. */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
ali@0	87
ali@40	88	/*
ali@40	89	* Two-Letter combinations that rarely if ever end words,
ali@40	90	* but are common scannos or otherwise common letter combinations.
ali@40	91	*/
/* The list is terminated by an empty string. */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
ali@0	96
/*
 * HTML element names (without angle brackets).
 * The list is terminated by an empty string.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
ali@0	103
/*
 * Markup tokens specific to DP texts (see the "dp" option).
 * The list is terminated by an empty string.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
ali@0	107
/*
 * Going by the name, words that should not be followed by a comma --
 * TODO confirm against the code that consults this list.
 * The list is terminated by an empty string.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
ali@0	114
/*
 * Going by the name, words that should not be followed by a period --
 * TODO confirm against the code that consults this list.
 * The list is terminated by an empty string.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
ali@0	121
/*
 * Vowels, plain and accented. NOTE(review): the bytes stored for the
 * accented letters depend on the encoding this file is saved in -- confirm
 * that it matches the encoding of the text being checked.
 */
char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
ali@0	123
/*
 * HTML character entities and their plain-text replacements.
 *
 * htmlent is the named form, htmlnum the numeric form and textent the
 * text substituted for either. The table is terminated by an entry whose
 * htmlent and htmlnum are empty strings (its textent is left NULL).
 */
struct {
    char *htmlent;
    char *htmlnum;
    char *textent;
} entities[] = {
    { "&amp;",    "&#38;",   "&" },
    { "&lt;",     "&#60;",   "<" },
    { "&gt;",     "&#62;",   ">" },
    { "&deg;",    "&#176;",  " degrees" },
    { "&pound;",  "&#163;",  "L" },
    { "&quot;",   "&#34;",   "\"" },   /* quotation mark = APL quote */
    { "&OElig;",  "&#338;",  "OE" },   /* latin capital ligature OE */
    { "&oelig;",  "&#339;",  "oe" },   /* latin small ligature oe */
    { "&Scaron;", "&#352;",  "S" },    /* latin capital letter S with caron */
    { "&scaron;", "&#353;",  "s" },    /* latin small letter s with caron */
    { "&Yuml;",   "&#376;",  "Y" },    /* latin capital letter Y with diaeresis */
    { "&circ;",   "&#710;",  "" },     /* modifier letter circumflex accent */
    { "&tilde;",  "&#732;",  "~" },    /* small tilde, U+02DC ISOdia */
    { "&ensp;",   "&#8194;", " " },    /* en space, U+2002 ISOpub */
    { "&emsp;",   "&#8195;", " " },    /* em space, U+2003 ISOpub */
    { "&thinsp;", "&#8201;", " " },    /* thin space, U+2009 ISOpub */
    { "&ndash;",  "&#8211;", "-" },    /* en dash, U+2013 ISOpub */
    { "&mdash;",  "&#8212;", "--" },   /* em dash, U+2014 ISOpub */
    { "&rsquo;",  "&#8217;", "'" },    /* right single quotation mark */
    { "&sbquo;",  "&#8218;", "'" },    /* single low-9 quotation mark */
    { "&ldquo;",  "&#8220;", "\"" },   /* left double quotation mark */
    { "&rdquo;",  "&#8221;", "\"" },   /* right double quotation mark */
    { "&bdquo;",  "&#8222;", "\"" },   /* double low-9 quotation mark */
    { "&lsaquo;", "&#8249;", "\"" },   /* single left-pointing angle quotation mark */
    { "&rsaquo;", "&#8250;", "\"" },   /* single right-pointing angle quotation mark */
    { "&nbsp;",   "&#160;",  " " },    /* no-break space = non-breaking space, */
    { "&iexcl;",  "&#161;",  "!" },    /* inverted exclamation mark */
    { "&cent;",   "&#162;",  "c" },    /* cent sign */
    { "&pound;",  "&#163;",  "L" },    /* pound sign */
    { "&curren;", "&#164;",  "$" },    /* currency sign */
    { "&yen;",    "&#165;",  "Y" },    /* yen sign = yuan sign */
    { "&sect;",   "&#167;",  "--" },   /* section sign */
    { "&uml;",    "&#168;",  " " },    /* diaeresis = spacing diaeresis */
    { "&copy;",   "&#169;",  "(C) " }, /* copyright sign */
    { "&ordf;",   "&#170;",  " " },    /* feminine ordinal indicator */
    { "&laquo;",  "&#171;",  "\"" },   /* left-pointing double angle quotation mark */
    { "&shy;",    "&#173;",  "-" },    /* soft hyphen = discretionary hyphen */
    { "&reg;",    "&#174;",  "(R) " }, /* registered sign = registered trade mark sign */
    { "&macr;",   "&#175;",  " " },    /* macron = spacing macron = overline */
    { "&deg;",    "&#176;",  " degrees" }, /* degree sign */
    { "&plusmn;", "&#177;",  "+-" },   /* plus-minus sign = plus-or-minus sign */
    { "&sup2;",   "&#178;",  "2" },    /* superscript two = superscript digit two */
    { "&sup3;",   "&#179;",  "3" },    /* superscript three = superscript digit three */
    { "&acute;",  "&#180;",  " " },    /* acute accent = spacing acute */
    { "&micro;",  "&#181;",  "m" },    /* micro sign */
    { "&para;",   "&#182;",  "--" },   /* pilcrow sign = paragraph sign */
    { "&cedil;",  "&#184;",  " " },    /* cedilla = spacing cedilla */
    { "&sup1;",   "&#185;",  "1" },    /* superscript one = superscript digit one */
    { "&ordm;",   "&#186;",  " " },    /* masculine ordinal indicator */
    { "&raquo;",  "&#187;",  "\"" },   /* right-pointing double angle quotation mark */
    { "&frac14;", "&#188;",  "1/4" },  /* vulgar fraction one quarter */
    { "&frac12;", "&#189;",  "1/2" },  /* vulgar fraction one half */
    { "&frac34;", "&#190;",  "3/4" },  /* vulgar fraction three quarters */
    { "&iquest;", "&#191;",  "?" },    /* inverted question mark */
    { "&Agrave;", "&#192;",  "A" },    /* latin capital letter A with grave */
    { "&Aacute;", "&#193;",  "A" },    /* latin capital letter A with acute */
    { "&Acirc;",  "&#194;",  "A" },    /* latin capital letter A with circumflex */
    { "&Atilde;", "&#195;",  "A" },    /* latin capital letter A with tilde */
    { "&Auml;",   "&#196;",  "A" },    /* latin capital letter A with diaeresis */
    { "&Aring;",  "&#197;",  "A" },    /* latin capital letter A with ring above */
    { "&AElig;",  "&#198;",  "AE" },   /* latin capital letter AE */
    { "&Ccedil;", "&#199;",  "C" },    /* latin capital letter C with cedilla */
    { "&Egrave;", "&#200;",  "E" },    /* latin capital letter E with grave */
    { "&Eacute;", "&#201;",  "E" },    /* latin capital letter E with acute */
    { "&Ecirc;",  "&#202;",  "E" },    /* latin capital letter E with circumflex */
    { "&Euml;",   "&#203;",  "E" },    /* latin capital letter E with diaeresis */
    { "&Igrave;", "&#204;",  "I" },    /* latin capital letter I with grave */
    { "&Iacute;", "&#205;",  "I" },    /* latin capital letter I with acute */
    { "&Icirc;",  "&#206;",  "I" },    /* latin capital letter I with circumflex */
    { "&Iuml;",   "&#207;",  "I" },    /* latin capital letter I with diaeresis */
    { "&ETH;",    "&#208;",  "E" },    /* latin capital letter ETH */
    { "&Ntilde;", "&#209;",  "N" },    /* latin capital letter N with tilde */
    { "&Ograve;", "&#210;",  "O" },    /* latin capital letter O with grave */
    { "&Oacute;", "&#211;",  "O" },    /* latin capital letter O with acute */
    { "&Ocirc;",  "&#212;",  "O" },    /* latin capital letter O with circumflex */
    { "&Otilde;", "&#213;",  "O" },    /* latin capital letter O with tilde */
    { "&Ouml;",   "&#214;",  "O" },    /* latin capital letter O with diaeresis */
    { "&times;",  "&#215;",  "*" },    /* multiplication sign */
    { "&Oslash;", "&#216;",  "O" },    /* latin capital letter O with stroke */
    { "&Ugrave;", "&#217;",  "U" },    /* latin capital letter U with grave */
    { "&Uacute;", "&#218;",  "U" },    /* latin capital letter U with acute */
    { "&Ucirc;",  "&#219;",  "U" },    /* latin capital letter U with circumflex */
    { "&Uuml;",   "&#220;",  "U" },    /* latin capital letter U with diaeresis */
    { "&Yacute;", "&#221;",  "Y" },    /* latin capital letter Y with acute */
    { "&THORN;",  "&#222;",  "TH" },   /* latin capital letter THORN */
    { "&szlig;",  "&#223;",  "sz" },   /* latin small letter sharp s = ess-zed */
    { "&agrave;", "&#224;",  "a" },    /* latin small letter a with grave */
    { "&aacute;", "&#225;",  "a" },    /* latin small letter a with acute */
    { "&acirc;",  "&#226;",  "a" },    /* latin small letter a with circumflex */
    { "&atilde;", "&#227;",  "a" },    /* latin small letter a with tilde */
    { "&auml;",   "&#228;",  "a" },    /* latin small letter a with diaeresis */
    { "&aring;",  "&#229;",  "a" },    /* latin small letter a with ring above */
    { "&aelig;",  "&#230;",  "ae" },   /* latin small letter ae */
    { "&ccedil;", "&#231;",  "c" },    /* latin small letter c with cedilla */
    { "&egrave;", "&#232;",  "e" },    /* latin small letter e with grave */
    { "&eacute;", "&#233;",  "e" },    /* latin small letter e with acute */
    { "&ecirc;",  "&#234;",  "e" },    /* latin small letter e with circumflex */
    { "&euml;",   "&#235;",  "e" },    /* latin small letter e with diaeresis */
    { "&igrave;", "&#236;",  "i" },    /* latin small letter i with grave */
    { "&iacute;", "&#237;",  "i" },    /* latin small letter i with acute */
    { "&icirc;",  "&#238;",  "i" },    /* latin small letter i with circumflex */
    { "&iuml;",   "&#239;",  "i" },    /* latin small letter i with diaeresis */
    { "&eth;",    "&#240;",  "eth" },  /* latin small letter eth */
    { "&ntilde;", "&#241;",  "n" },    /* latin small letter n with tilde */
    { "&ograve;", "&#242;",  "o" },    /* latin small letter o with grave */
    { "&oacute;", "&#243;",  "o" },    /* latin small letter o with acute */
    { "&ocirc;",  "&#244;",  "o" },    /* latin small letter o with circumflex */
    { "&otilde;", "&#245;",  "o" },    /* latin small letter o with tilde */
    { "&ouml;",   "&#246;",  "o" },    /* latin small letter o with diaeresis */
    { "&divide;", "&#247;",  "/" },    /* division sign */
    { "&oslash;", "&#248;",  "o" },    /* latin small letter o with stroke */
    { "&ugrave;", "&#249;",  "u" },    /* latin small letter u with grave */
    { "&uacute;", "&#250;",  "u" },    /* latin small letter u with acute */
    { "&ucirc;",  "&#251;",  "u" },    /* latin small letter u with circumflex */
    { "&uuml;",   "&#252;",  "u" },    /* latin small letter u with diaeresis */
    { "&yacute;", "&#253;",  "y" },    /* latin small letter y with acute */
    { "&thorn;",  "&#254;",  "th" },   /* latin small letter thorn */
    { "&yuml;",   "&#255;",  "y" },    /* latin small letter y with diaeresis */
    { "", "" }
};
ali@40	249
ali@40	250	/* special characters */
/* Numeric values are ASCII codes. */
#define CHAR_SPACE        32    /* ' '  */
#define CHAR_TAB          9     /* '\t' */
#define CHAR_LF           10    /* '\n' */
#define CHAR_CR           13    /* '\r' */
#define CHAR_DQUOTE       34    /* '"'  */
#define CHAR_SQUOTE       39    /* '\'' */
#define CHAR_OPEN_SQUOTE  96    /* '`'  */
#define CHAR_TILDE        126   /* '~'  */
#define CHAR_ASTERISK     42    /* '*'  */
#define CHAR_FORESLASH    47    /* '/'  */
#define CHAR_CARAT        94    /* '^'  */

#define CHAR_UNDERSCORE    '_'
#define CHAR_OPEN_CBRACK   '{'
#define CHAR_CLOSE_CBRACK  '}'
#define CHAR_OPEN_RBRACK   '('
#define CHAR_CLOSE_RBRACK  ')'
#define CHAR_OPEN_SBRACK   '['
#define CHAR_CLOSE_SBRACK  ']'
ali@0	270
ali@40	271	/* longest and shortest normal PG line lengths */
#define LONGEST_PG_LINE   75    /* lines longer than this count as long */
#define WAY_TOO_LONG      80    /* lines longer than this count as very long */
#define SHORTEST_PG_LINE  55    /* threshold used by the short-line check */
ali@0	275
/*
 * Indices into pswit[], one per program switch. SWITNO is not a switch;
 * it is the number of switches and sizes the array.
 */
enum {
    ECHO_SWITCH,
    SQUOTE_SWITCH,
    TYPO_SWITCH,
    QPARA_SWITCH,
    PARANOID_SWITCH,
    LINE_END_SWITCH,
    OVERVIEW_SWITCH,
    STDOUT_SWITCH,
    HEADER_SWITCH,
    WEB_SWITCH,
    VERBOSE_SWITCH,
    MARKUP_SWITCH,
    USERTYPO_SWITCH,
    DP_SWITCH,
    SWITNO
};
ali@0	293
ali@69	294	gboolean pswit[SWITNO]; /* program switches */
ali@0	295
ali@69	296	static GOptionEntry options[]={
ali@69	297	{ "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@69	298	"Ignore DP-specific markup", NULL },
ali@69	299	{ "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@69	300	"Don't echo queried line", NULL },
ali@69	301	{ "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@69	302	"Check single quotes", NULL },
ali@69	303	{ "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@69	304	"Check common typos", NULL },
ali@69	305	{ "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@69	306	"Require closure of quotes on every paragraph", NULL },
ali@69	307	{ "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@69	308	"Disable paranoid querying of everything", NULL },
ali@69	309	{ "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@69	310	"Disable line end checking", NULL },
ali@69	311	{ "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@69	312	"Overview: just show counts", NULL },
ali@69	313	{ "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@69	314	"Output errors to stdout instead of stderr", NULL },
ali@69	315	{ "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@69	316	"Echo header fields", NULL },
ali@69	317	{ "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@69	318	"Ignore markup in < >", NULL },
ali@69	319	{ "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@69	320	"Use file of user-defined typos", NULL },
ali@69	321	{ "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
ali@69	322	"Defaults for use on www upload", NULL },
ali@69	323	{ "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@69	324	"Verbose - list everything", NULL },
ali@69	325	{ NULL }
ali@69	326	};
ali@0	327
long cnt_dquot;		/* for overview mode, count of doublequote queries */
long cnt_squot;		/* for overview mode, count of singlequote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
ali@0	344
ali@69	345	void proghelp(GOptionContext *context);
ali@69	346	void procfile(const char *);
ali@0	347
ali@69	348	gchar *running_from;
ali@0	349
ali@69	350	int mixdigit(const char *);
ali@69	351	gchar getaword(const char *);
ali@69	352	char flgets(char *,long);
ali@69	353	gboolean gcisalpha(unsigned char);
ali@69	354	gboolean gcisdigit(unsigned char);
ali@69	355	gboolean gcisletter(unsigned char);
ali@0	356	void postprocess_for_HTML(char *);
ali@0	357	char linehasmarkup(char );
ali@0	358	char losemarkup(char );
ali@69	359	int tagcomp(const char ,const char );
ali@0	360	char loseentities(char );
ali@69	361	gboolean isroman(const char *);
ali@0	362	void postprocess_for_DP(char *);
ali@0	363
ali@69	364	GTree qword,qperiod;
ali@68	365
/*
 * Statistics gathered by first_pass(). firstline is the line number of
 * the first line after the PG header and footerline the line number of
 * the footer (both 0 if not found); the remaining members are counts of
 * characters, lines or occurrences of the feature each is named after.
 */
struct first_pass_results {
    long firstline,astline;
    long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
    long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
    long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
    int Dutchcount,Frenchcount;
};
ali@68	373
ali@68	374	struct warnings {
ali@68	375	int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
ali@69	376	int endquote;
ali@69	377	gboolean isDutch,isFrench;
ali@68	378	};
ali@68	379
/*
 * Not used in this part of the file; presumably running counts of quotes,
 * underscores and brackets -- TODO confirm.
 */
struct counters {
    long quot;
    int c_unders,c_brack,s_brack,r_brack;
    int open_single_quote,close_single_quote;
};
ali@68	385
/*
 * Not used in this part of the file; presumably the length and first
 * character of a line (compare lastlen, lastblen and laststart in
 * first_pass) -- TODO confirm.
 */
struct line_properties {
    unsigned int len,blen;
    char start;
};
ali@68	390
/* Not used in this part of the file; presumably quote parity state -- TODO confirm. */
struct parities {
    int dquote,squote;
};
ali@68	394
/*
 * Not used in this part of the file; presumably records quotes, brackets
 * and underscores that are still open -- TODO confirm.
 */
struct pending {
    char dquote,squote,rbrack,sbrack,cbrack,unders;
    long squot;
};
ali@0	399
ali@69	400	void parse_options(int argc,char **argv)
ali@0	401	{
ali@69	402	GError *err=NULL;
ali@69	403	GOptionContext *context;
ali@69	404	context=g_option_context_new(
ali@69	405	"file - looks for errors in Project Gutenberg(TM) etexts");
ali@69	406	g_option_context_add_main_entries(context,options,NULL);
ali@69	407	if (!g_option_context_parse(context,argc,argv,&err))
ali@69	408	{
ali@69	409	g_printerr("Bookloupe: %s\n",err->message);
ali@69	410	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
ali@69	411	exit(1);
ali@69	412	}
ali@40	413	/* Paranoid checking is turned OFF, not on, by its switch */
ali@69	414	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
ali@40	415	if (pswit[PARANOID_SWITCH])
ali@69	416	/* if running in paranoid mode, typo checks default to enabled */
ali@69	417	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@40	418	/* Line-end checking is turned OFF, not on, by its switch */
ali@69	419	pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
ali@40	420	/* Echoing is turned OFF, not on, by its switch */
ali@69	421	pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
ali@40	422	if (pswit[OVERVIEW_SWITCH])
ali@40	423	/* just print summary; don't echo */
ali@69	424	pswit[ECHO_SWITCH]=FALSE;
ali@40	425	/*
ali@40	426	* Web uploads - for the moment, this is really just a placeholder
ali@40	427	* until we decide what processing we really want to do on web uploads
ali@40	428	*/
ali@40	429	if (pswit[WEB_SWITCH])
ali@40	430	{
ali@40	431	/* specific override for web uploads */
ali@69	432	pswit[ECHO_SWITCH]=TRUE;
ali@69	433	pswit[SQUOTE_SWITCH]=FALSE;
ali@69	434	pswit[TYPO_SWITCH]=TRUE;
ali@69	435	pswit[QPARA_SWITCH]=FALSE;
ali@69	436	pswit[PARANOID_SWITCH]=TRUE;
ali@69	437	pswit[LINE_END_SWITCH]=FALSE;
ali@69	438	pswit[OVERVIEW_SWITCH]=FALSE;
ali@69	439	pswit[STDOUT_SWITCH]=FALSE;
ali@69	440	pswit[HEADER_SWITCH]=TRUE;
ali@69	441	pswit[VERBOSE_SWITCH]=FALSE;
ali@69	442	pswit[MARKUP_SWITCH]=FALSE;
ali@69	443	pswit[USERTYPO_SWITCH]=FALSE;
ali@69	444	pswit[DP_SWITCH]=FALSE;
ali@40	445	}
ali@69	446	if (*argc<2)
ali@40	447	{
ali@69	448	proghelp(context);
ali@69	449	exit(1);
ali@40	450	}
ali@69	451	g_option_context_free(context);
ali@69	452	}
ali@69	453
ali@69	454	/*
ali@69	455	* read_user_scannos:
ali@69	456	*
ali@69	457	* Read in the user-defined stealth scanno list.
ali@69	458	*/
ali@69	459	void read_user_scannos(void)
ali@69	460	{
ali@69	461	GError *err=NULL;
ali@69	462	gchar *usertypo_file;
ali@69	463	gboolean okay;
ali@69	464	int i;
ali@69	465	gsize len;
ali@69	466	gchar contents,*lines;
ali@69	467	usertypo_file=g_strdup("bookloupe.typ");
ali@69	468	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	469	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	470	{
ali@69	471	g_clear_error(&err);
ali@69	472	g_free(usertypo_file);
ali@69	473	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
ali@69	474	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	475	}
ali@69	476	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	477	{
ali@69	478	g_clear_error(&err);
ali@69	479	g_free(usertypo_file);
ali@69	480	usertypo_file=g_strdup("gutcheck.typ");
ali@69	481	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	482	}
ali@69	483	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	484	{
ali@69	485	g_clear_error(&err);
ali@69	486	g_free(usertypo_file);
ali@69	487	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
ali@69	488	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	489	}
ali@69	490	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	491	{
ali@69	492	g_free(usertypo_file);
ali@69	493	printf(" --> I couldn't find bookloupe.typ "
ali@69	494	"-- proceeding without user typos.\n");
ali@69	495	return;
ali@69	496	}
ali@69	497	else if (!okay)
ali@69	498	{
ali@69	499	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
ali@69	500	g_free(usertypo_file);
ali@69	501	g_clear_error(&err);
ali@69	502	exit(1);
ali@69	503	}
ali@69	504	lines=g_strsplit(contents,"\n",0);
ali@69	505	usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@69	506	for (i=0;lines[i];i++)
ali@69	507	if ((unsigned char )lines[i]>'!')
ali@69	508	g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
ali@69	509	else
ali@69	510	g_free(lines[i]);
ali@69	511	g_free(lines);
ali@69	512	}
ali@69	513
ali@69	514	#if 0
ali@69	515	/*
ali@69	516	* read_etext:
ali@69	517	*
ali@69	518	* Read an etext returning an array of lines. Lines are normally expected
ali@69	519	* to be terminated by CR LF. Solitary LFs delimit lines but are left
ali@69	520	* embedded at the end of the line for further processing. Solitary CRs
ali@69	521	* do not delimit lines.
ali@69	522	*/
ali@69	523	gchar *read_etext(const char filename,GError **err)
ali@69	524	{
ali@69	525	int i;
ali@69	526	const char s,t;
ali@69	527	gchar *contents;
ali@69	528	gchar **raw_lines;
ali@69	529	GPtrArray *lines;
ali@69	530	gsize len;
ali@69	531	if (!g_file_get_contents(filename,&contents,&len,err))
ali@69	532	return NULL;
ali@69	533	raw_lines=g_strsplit(contents,"\r\n",0);
ali@69	534	lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
ali@69	535	for (i=0;raw_lines[i];i++)
ali@69	536	{
ali@69	537	t=strchr(raw_lines[i],'\n');
ali@69	538	if (t)
ali@69	539	{
ali@69	540	s=raw_lines[i];
ali@69	541	while ((t=strchr(s,'\n')))
ali@69	542	{
ali@69	543	g_ptr_array_add(lines,g_strndup(s,t-s+1));
ali@69	544	s=t+1;
ali@69	545	}
ali@69	546	g_ptr_array_add(lines,g_strdup(s));
ali@69	547	g_free(raw_lines[i]);
ali@69	548	}
ali@69	549	else
ali@69	550	g_ptr_array_add(lines,raw_lines[i]);
ali@69	551	}
ali@69	552	g_free(raw_lines);
ali@69	553	g_ptr_array_add(lines,NULL);
ali@69	554	return (gchar **)g_ptr_array_free(lines,FALSE);
ali@69	555	}
ali@69	556	#else
ali@69	557	/*
ali@69	558	* read_etext:
ali@69	559	*
ali@69	560	* Read an etext returning a newly allocated string containing the file
ali@69	561	* contents or NULL on error.
ali@69	562	*/
ali@69	563	gchar read_etext(const char filename,GError **err)
ali@69	564	{
ali@69	565	gchar *contents;
ali@69	566	gsize len;
ali@69	567	if (!g_file_get_contents(filename,&contents,&len,err))
ali@69	568	return NULL;
ali@69	569	return contents;
ali@69	570	}
ali@69	571	#endif
ali@69	572
ali@69	573	int main(int argc,char **argv)
ali@69	574	{
ali@69	575	running_from=g_path_get_dirname(argv[0]);
ali@69	576	parse_options(&argc,&argv);
ali@40	577	if (pswit[USERTYPO_SWITCH])
ali@69	578	read_user_scannos();
ali@40	579	fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@69	580	procfile(argv[1]);
ali@40	581	if (pswit[OVERVIEW_SWITCH])
ali@40	582	{
ali@40	583	printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40	584	checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@68	585	printf(" --------------- Queries found --------------\n");
ali@68	586	if (cnt_long)
ali@68	587	printf(" Long lines: %14ld\n",cnt_long);
ali@68	588	if (cnt_short)
ali@68	589	printf(" Short lines: %14ld\n",cnt_short);
ali@68	590	if (cnt_lineend)
ali@68	591	printf(" Line-end problems: %14ld\n",cnt_lineend);
ali@68	592	if (cnt_word)
ali@68	593	printf(" Common typos: %14ld\n",cnt_word);
ali@68	594	if (cnt_dquot)
ali@68	595	printf(" Unmatched quotes: %14ld\n",cnt_dquot);
ali@68	596	if (cnt_squot)
ali@68	597	printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
ali@68	598	if (cnt_brack)
ali@68	599	printf(" Unmatched brackets: %14ld\n",cnt_brack);
ali@68	600	if (cnt_bin)
ali@68	601	printf(" Non-ASCII characters: %14ld\n",cnt_bin);
ali@68	602	if (cnt_odd)
ali@68	603	printf(" Proofing characters: %14ld\n",cnt_odd);
ali@68	604	if (cnt_punct)
ali@40	605	printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@68	606	if (cnt_dash)
ali@68	607	printf(" Non-standard dashes: %14ld\n",cnt_dash);
ali@68	608	if (cnt_html)
ali@68	609	printf(" Possible HTML tags: %14ld\n",cnt_html);
ali@68	610	printf("\n");
ali@68	611	printf(" TOTAL QUERIES %14ld\n",
ali@68	612	cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
ali@68	613	cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40	614	}
ali@69	615	g_free(running_from);
ali@69	616	if (usertypo)
ali@69	617	g_tree_unref(usertypo);
ali@40	618	return 0;
ali@0	619	}
ali@0	620
ali@40	621	/*
ali@41	622	* first_pass:
ali@40	623	*
ali@41	624	* Run a first pass - verify that it's a valid PG
ali@41	625	* file, decide whether to report some things that
ali@41	626	* occur many times in the text like long or short
ali@41	627	* lines, non-standard dashes, etc.
ali@40	628	*/
ali@69	629	struct first_pass_results first_pass(const char etext)
ali@0	630	{
ali@54	631	char laststart=CHAR_SPACE;
ali@54	632	const char *s;
ali@69	633	gchar *lc_line;
ali@69	634	int i,j,llen;
ali@69	635	gchar **lines;
ali@41	636	unsigned int lastlen=0,lastblen=0;
ali@41	637	long spline=0,nspline=0;
ali@41	638	static struct first_pass_results results={0};
ali@69	639	gchar *inword;
ali@69	640	lines=g_strsplit(etext,"\n",0);
ali@69	641	for (j=0;lines[j];j++)
ali@40	642	{
ali@69	643	llen=strlen(lines[j]);
ali@69	644	while(lines[j][llen-1]=='\r')
ali@69	645	lines[j][llen--]='\0';
ali@68	646	linecnt++;
ali@69	647	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
ali@69	648	(strstr(lines[j],"PUBLIC DOMAIN") \|\| strstr(lines[j],"COPYRIGHT")))
ali@40	649	{
ali@68	650	if (spline)
ali@68	651	printf(" --> Duplicate header?\n");
ali@68	652	spline=linecnt+1; /* first line of non-header text, that is */
ali@40	653	}
ali@69	654	if (!strncmp(lines[j],"*** START",9) &&
ali@69	655	strstr(lines[j],"PROJECT GUTENBERG"))
ali@40	656	{
ali@68	657	if (nspline)
ali@68	658	printf(" --> Duplicate header?\n");
ali@68	659	nspline=linecnt+1; /* first line of non-header text, that is */
ali@40	660	}
ali@68	661	if (spline \|\| nspline)
ali@40	662	{
ali@69	663	lc_line=g_ascii_strdown(lines[j],llen);
ali@69	664	if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
ali@40	665	{
ali@69	666	if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
ali@40	667	{
ali@68	668	if (results.footerline)
ali@40	669	{
ali@40	670	/* it's an old-form header - we can detect duplicates */
ali@68	671	if (!nspline)
ali@68	672	printf(" --> Duplicate footer?\n");
ali@40	673	}
ali@68	674	else
ali@68	675	results.footerline=linecnt;
ali@40	676	}
ali@40	677	}
ali@69	678	g_free(lc_line);
ali@40	679	}
ali@68	680	if (spline)
ali@41	681	results.firstline=spline;
ali@68	682	if (nspline)
ali@41	683	results.firstline=nspline; /* override with new */
ali@68	684	if (results.footerline)
ali@40	685	continue; /* don't count the boilerplate in the footer */
ali@68	686	results.totlen+=llen;
ali@68	687	for (i=0;i<llen;i++)
ali@40	688	{
ali@69	689	if ((unsigned char)lines[j][i]>127)
ali@41	690	results.binlen++;
ali@69	691	if (gcisalpha(lines[j][i]))
ali@41	692	results.alphalen++;
ali@69	693	if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))
ali@41	694	results.endquote_count++;
ali@40	695	}
ali@69	696	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
ali@69	697	lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41	698	results.shortline++;
ali@69	699	if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
ali@40	700	cnt_spacend++;
ali@69	701	if (strstr(lines[j],".,"))
ali@41	702	results.dotcomma++;
ali@68	703	/* only count ast lines for ignoring purposes where there is */
ali@68	704	/* locase text on the line */
ali@69	705	if (strchr(lines[j],'*'))
ali@40	706	{
ali@69	707	for (s=lines[j];*s;s++)
ali@68	708	if (s>='a' && s<='z')
ali@68	709	break;
ali@68	710	if (*s)
ali@41	711	results.astline++;
ali@40	712	}
ali@69	713	if (strchr(lines[j],'/'))
ali@68	714	results.fslashline++;
ali@69	715	for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
ali@40	716	;
ali@69	717	if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')
ali@41	718	results.hyphens++;
ali@68	719	if (llen>LONGEST_PG_LINE)
ali@41	720	results.longline++;
ali@68	721	if (llen>WAY_TOO_LONG)
ali@41	722	results.verylongline++;
ali@69	723	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
ali@40	724	{
ali@69	725	i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
ali@68	726	if (i>0)
ali@68	727	results.htmcount++;
ali@69	728	if (strstr(lines[j],"<i>"))
ali@41	729	results.htmcount+=4; /* bonus marks! */
ali@40	730	}
ali@68	731	/* Check for spaced em-dashes */
ali@69	732	if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
ali@40	733	{
ali@68	734	results.emdash++;
ali@69	735	if (s[-1]==CHAR_SPACE \|\| (s[2]==CHAR_SPACE))
ali@41	736	results.space_emdash++;
ali@69	737	if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
ali@40	738	/* count of em-dashes with spaces both sides */
ali@41	739	results.non_PG_space_emdash++;
ali@69	740	if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
ali@40	741	/* count of PG-type em-dashes with no spaces */
ali@41	742	results.PG_space_emdash++;
ali@40	743	}
ali@69	744	for (s=lines[j];*s;)
ali@40	745	{
ali@69	746	inword=getaword(&s);
ali@68	747	if (!strcmp(inword,"hij") \|\| !strcmp(inword,"niet"))
ali@68	748	results.Dutchcount++;
ali@68	749	if (!strcmp(inword,"dans") \|\| !strcmp(inword,"avec"))
ali@68	750	results.Frenchcount++;
ali@68	751	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@68	752	results.standalone_digit++;
ali@69	753	g_free(inword);
ali@40	754	}
ali@68	755	/* Check for spaced dashes */
ali@69	756	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
ali@41	757	results.spacedash++;
ali@68	758	lastblen=lastlen;
ali@69	759	lastlen=llen;
ali@69	760	laststart=lines[j][0];
ali@40	761	}
ali@69	762	g_strfreev(lines);
ali@41	763	return &results;
ali@41	764	}
ali@41	765
ali@42	766	/*
ali@42	767	* report_first_pass:
ali@42	768	*
ali@42	769	* Make some snap decisions based on the first pass results.
ali@42	770	*/
ali@42	771	struct warnings report_first_pass(struct first_pass_results results)
ali@42	772	{
ali@42	773	static struct warnings warnings={0};
ali@42	774	if (cnt_spacend>0)
ali@68	775	printf(" --> %ld lines in this file have white space at end\n",
ali@42	776	cnt_spacend);
ali@42	777	warnings.dotcomma=1;
ali@42	778	if (results->dotcomma>5)
ali@42	779	{
ali@68	780	warnings.dotcomma=0;
ali@68	781	printf(" --> %ld lines in this file contain '.,'. "
ali@42	782	"Not reporting them.\n",results->dotcomma);
ali@42	783	}
ali@42	784	/*
ali@42	785	* If more than 50 lines, or one-tenth, are short,
ali@42	786	* don't bother reporting them.
ali@42	787	*/
ali@42	788	warnings.shortline=1;
ali@42	789	if (results->shortline>50 \|\| results->shortline*10>linecnt)
ali@42	790	{
ali@68	791	warnings.shortline=0;
ali@68	792	printf(" --> %ld lines in this file are short. "
ali@42	793	"Not reporting short lines.\n",results->shortline);
ali@42	794	}
ali@42	795	/*
ali@42	796	* If more than 50 lines, or one-tenth, are long,
ali@42	797	* don't bother reporting them.
ali@42	798	*/
ali@42	799	warnings.longline=1;
ali@42	800	if (results->longline>50 \|\| results->longline*10>linecnt)
ali@42	801	{
ali@68	802	warnings.longline=0;
ali@68	803	printf(" --> %ld lines in this file are long. "
ali@42	804	"Not reporting long lines.\n",results->longline);
ali@42	805	}
ali@42	806	/* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42	807	warnings.ast=1;
ali@42	808	if (results->astline>10)
ali@42	809	{
ali@68	810	warnings.ast=0;
ali@68	811	printf(" --> %ld lines in this file contain asterisks. "
ali@42	812	"Not reporting them.\n",results->astline);
ali@42	813	}
ali@42	814	/*
ali@42	815	* If more than 10 lines contain forward slashes,
ali@42	816	* don't bother reporting them.
ali@42	817	*/
ali@42	818	warnings.fslash=1;
ali@42	819	if (results->fslashline>10)
ali@42	820	{
ali@68	821	warnings.fslash=0;
ali@68	822	printf(" --> %ld lines in this file contain forward slashes. "
ali@42	823	"Not reporting them.\n",results->fslashline);
ali@42	824	}
ali@42	825	/*
ali@42	826	* If more than 20 lines contain unpunctuated endquotes,
ali@42	827	* don't bother reporting them.
ali@42	828	*/
ali@42	829	warnings.endquote=1;
ali@42	830	if (results->endquote_count>20)
ali@42	831	{
ali@68	832	warnings.endquote=0;
ali@68	833	printf(" --> %ld lines in this file contain unpunctuated endquotes. "
ali@42	834	"Not reporting them.\n",results->endquote_count);
ali@42	835	}
ali@42	836	/*
ali@42	837	* If more than 15 lines contain standalone digits,
ali@42	838	* don't bother reporting them.
ali@42	839	*/
ali@42	840	warnings.digit=1;
ali@42	841	if (results->standalone_digit>10)
ali@42	842	{
ali@68	843	warnings.digit=0;
ali@68	844	printf(" --> %ld lines in this file contain standalone 0s and 1s. "
ali@42	845	"Not reporting them.\n",results->standalone_digit);
ali@42	846	}
ali@42	847	/*
ali@42	848	* If more than 20 lines contain hyphens at end,
ali@42	849	* don't bother reporting them.
ali@42	850	*/
ali@42	851	warnings.hyphen=1;
ali@42	852	if (results->hyphens>20)
ali@42	853	{
ali@68	854	warnings.hyphen=0;
ali@68	855	printf(" --> %ld lines in this file have hyphens at end. "
ali@42	856	"Not reporting them.\n",results->hyphens);
ali@42	857	}
ali@42	858	if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42	859	{
ali@68	860	printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@68	861	pswit[MARKUP_SWITCH]=1;
ali@42	862	}
ali@42	863	if (results->verylongline>0)
ali@68	864	printf(" --> %ld lines in this file are VERY long!\n",
ali@42	865	results->verylongline);
ali@42	866	/*
ali@42	867	* If there are more non-PG spaced dashes than PG em-dashes,
ali@42	868	* assume it's deliberate.
ali@42	869	* Current PG guidelines say don't use them, but older texts do,
ali@42	870	* and some people insist on them whatever the guidelines say.
ali@42	871	*/
ali@42	872	warnings.dash=1;
ali@42	873	if (results->spacedash+results->non_PG_space_emdash>
ali@42	874	results->PG_space_emdash)
ali@42	875	{
ali@68	876	warnings.dash=0;
ali@68	877	printf(" --> There are %ld spaced dashes and em-dashes. "
ali@42	878	"Not reporting them.\n",
ali@42	879	results->spacedash+results->non_PG_space_emdash);
ali@42	880	}
ali@42	881	/* If more than a quarter of characters are hi-bit, bug out. */
ali@42	882	warnings.bin=1;
ali@42	883	if (results->binlen*4>results->totlen)
ali@42	884	{
ali@68	885	printf(" --> This file does not appear to be ASCII. "
ali@42	886	"Terminating. Best of luck with it!\n");
ali@68	887	exit(1);
ali@42	888	}
ali@42	889	if (results->alphalen*4<results->totlen)
ali@42	890	{
ali@68	891	printf(" --> This file does not appear to be text. "
ali@42	892	"Terminating. Best of luck with it!\n");
ali@68	893	exit(1);
ali@42	894	}
ali@42	895	if (results->binlen*100>results->totlen \|\| results->binlen>100)
ali@42	896	{
ali@68	897	printf(" --> There are a lot of foreign letters here. "
ali@42	898	"Not reporting them.\n");
ali@68	899	warnings.bin=0;
ali@42	900	}
ali@69	901	warnings.isDutch=FALSE;
ali@42	902	if (results->Dutchcount>50)
ali@42	903	{
ali@69	904	warnings.isDutch=TRUE;
ali@68	905	printf(" --> This looks like Dutch - "
ali@42	906	"switching off dashes and warnings for 's Middags case.\n");
ali@42	907	}
ali@69	908	warnings.isFrench=FALSE;
ali@42	909	if (results->Frenchcount>50)
ali@42	910	{
ali@69	911	warnings.isFrench=TRUE;
ali@68	912	printf(" --> This looks like French - "
ali@42	913	"switching off some doublepunct.\n");
ali@42	914	}
ali@42	915	if (results->firstline && results->footerline)
ali@68	916	printf(" The PG header and footer appear to be already on.\n");
ali@42	917	else
ali@42	918	{
ali@68	919	if (results->firstline)
ali@68	920	printf(" The PG header is on - no footer.\n");
ali@68	921	if (results->footerline)
ali@68	922	printf(" The PG footer is on - no header.\n");
ali@42	923	}
ali@42	924	printf("\n");
ali@42	925	if (pswit[VERBOSE_SWITCH])
ali@42	926	{
ali@68	927	warnings.bin=1;
ali@68	928	warnings.shortline=1;
ali@68	929	warnings.dotcomma=1;
ali@68	930	warnings.longline=1;
ali@68	931	warnings.dash=1;
ali@68	932	warnings.digit=1;
ali@68	933	warnings.ast=1;
ali@68	934	warnings.fslash=1;
ali@68	935	warnings.hyphen=1;
ali@68	936	warnings.endquote=1;
ali@68	937	printf(" * Verbose output is ON -- you asked for it! *\n");
ali@42	938	}
ali@42	939	if (warnings.isDutch)
ali@68	940	warnings.dash=0;
ali@42	941	if (results->footerline>0 && results->firstline>0 &&
ali@42	942	results->footerline>results->firstline &&
ali@42	943	results->footerline-results->firstline<100)
ali@42	944	{
ali@68	945	printf(" --> I don't really know where this text starts. \n");
ali@68	946	printf(" There are no reference points.\n");
ali@68	947	printf(" I'm going to have to report the header and footer "
ali@42	948	"as well.\n");
ali@68	949	results->firstline=0;
ali@42	950	}
ali@42	951	return &warnings;
ali@42	952	}
ali@42	953
ali@43	954	/*
ali@43	955	* analyse_quotes:
ali@43	956	*
ali@43	957	* Look along the line, accumulate the count of quotes, and see
ali@43	958	* if this is an empty line - i.e. a line with nothing on it
ali@43	959	* but spaces.
ali@43	960	* If line has just spaces, period, * and/or - on it, don't
ali@43	961	* count it, since empty lines with asterisks or dashes to
ali@43	962	* separate sections are common.
ali@43	963	*
ali@69	964	* Returns: TRUE if the line is empty.
ali@43	965	*/
ali@69	966	gboolean analyse_quotes(const char aline,struct counters counters)
ali@43	967	{
ali@68	968	int guessquote=0;
ali@69	969	/* assume the line is empty until proven otherwise */
ali@69	970	gboolean isemptyline=TRUE;
ali@69	971	const char *s=aline;
ali@43	972	while (*s)
ali@43	973	{
ali@43	974	if (*s==CHAR_DQUOTE)
ali@43	975	counters->quot++;
ali@43	976	if (s==CHAR_SQUOTE \|\| s==CHAR_OPEN_SQUOTE)
ali@43	977	{
ali@43	978	if (s==aline)
ali@43	979	{
ali@43	980	/*
ali@43	981	* At start of line, it can only be an openquote.
ali@43	982	* Hardcode a very common exception!
ali@43	983	*/
ali@43	984	if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
ali@43	985	counters->open_single_quote++;
ali@43	986	}
ali@43	987	else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
ali@43	988	/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43	989	;
ali@43	990	/* it's outside a word - let's check it out */
ali@43	991	else if (*s==CHAR_OPEN_SQUOTE \|\| gcisalpha(s[1]))
ali@43	992	{
ali@43	993	/* it damwell better BE an openquote */
ali@43	994	if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
ali@43	995	/* hardcode a very common exception! */
ali@43	996	counters->open_single_quote++;
ali@43	997	}
ali@43	998	else
ali@43	999	{
ali@43	1000	/* now - is it a closequote? */
ali@43	1001	guessquote=0; /* accumulate clues */
ali@43	1002	if (gcisalpha(s[-1]))
ali@43	1003	{
ali@43	1004	/* it follows a letter - could be either */
ali@43	1005	guessquote++;
ali@43	1006	if (s[-1]=='s')
ali@43	1007	{
ali@43	1008	/* looks like a plural apostrophe */
ali@43	1009	guessquote-=3;
ali@43	1010	if (s[1]==CHAR_SPACE) /* bonus marks! */
ali@43	1011	guessquote-=2;
ali@43	1012	}
ali@43	1013	}
ali@43	1014	/* it doesn't have a letter either side */
ali@43	1015	else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
ali@43	1016	guessquote+=8; /* looks like a closequote */
ali@43	1017	else
ali@43	1018	guessquote++;
ali@43	1019	if (counters->open_single_quote>counters->close_single_quote)
ali@43	1020	/*
ali@43	1021	* Give it the benefit of some doubt,
ali@43	1022	* if a squote is already open.
ali@43	1023	*/
ali@43	1024	guessquote++;
ali@43	1025	else
ali@43	1026	guessquote--;
ali@43	1027	if (guessquote>=0)
ali@43	1028	counters->close_single_quote++;
ali@43	1029	}
ali@43	1030	}
ali@43	1031	if (s!=CHAR_SPACE && s!='-' && s!='.' && s!=CHAR_ASTERISK &&
ali@43	1032	s!=13 && s!=10)
ali@69	1033	isemptyline=FALSE; /* ignore lines like * * * as spacers */
ali@43	1034	if (*s==CHAR_UNDERSCORE)
ali@43	1035	counters->c_unders++;
ali@43	1036	if (*s==CHAR_OPEN_CBRACK)
ali@43	1037	counters->c_brack++;
ali@43	1038	if (*s==CHAR_CLOSE_CBRACK)
ali@43	1039	counters->c_brack--;
ali@43	1040	if (*s==CHAR_OPEN_RBRACK)
ali@43	1041	counters->r_brack++;
ali@43	1042	if (*s==CHAR_CLOSE_RBRACK)
ali@43	1043	counters->r_brack--;
ali@43	1044	if (*s==CHAR_OPEN_SBRACK)
ali@43	1045	counters->s_brack++;
ali@43	1046	if (*s==CHAR_CLOSE_SBRACK)
ali@43	1047	counters->s_brack--;
ali@43	1048	s++;
ali@43	1049	}
ali@43	1050	return isemptyline;
ali@43	1051	}
ali@43	1052
ali@41	1053	/*
ali@67	1054	* check_for_control_characters:
ali@67	1055	*
ali@67	1056	* Check for invalid or questionable characters in the line
ali@67	1057	* Anything above 127 is invalid for plain ASCII, and
ali@67	1058	* non-printable control characters should also be flagged.
ali@67	1059	* Tabs should generally not be there.
ali@67	1060	*/
ali@67	1061	void check_for_control_characters(const char *aline)
ali@67	1062	{
ali@67	1063	unsigned char c;
ali@67	1064	const char *s;
ali@67	1065	for (s=aline;*s;s++)
ali@67	1066	{
ali@67	1067	c=(unsigned char )s;
ali@67	1068	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
ali@67	1069	{
ali@67	1070	if (pswit[ECHO_SWITCH])
ali@67	1071	printf("\n%s\n",aline);
ali@67	1072	if (!pswit[OVERVIEW_SWITCH])
ali@67	1073	printf(" Line %ld column %d - Control character %d\n",
ali@67	1074	linecnt,(int)(s-aline)+1,c);
ali@67	1075	else
ali@67	1076	cnt_bin++;
ali@67	1077	}
ali@67	1078	}
ali@67	1079	}
ali@67	1080
ali@67	1081	/*
ali@44	1082	* check_for_odd_characters:
ali@44	1083	*
ali@44	1084	* Check for binary and other odd characters.
ali@44	1085	*/
ali@44	1086	void check_for_odd_characters(const char aline,const struct warnings warnings,
ali@69	1087	gboolean isemptyline)
ali@44	1088	{
ali@44	1089	/* Don't repeat multiple warnings on one line. */
ali@68	1090	int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
ali@44	1091	const char *s;
ali@44	1092	unsigned char c;
ali@44	1093	for (s=aline;*s;s++)
ali@44	1094	{
ali@44	1095	c=(unsigned char )s;
ali@44	1096	if (!eNon_A && (s<CHAR_SPACE && s!=9 && *s!='\n' \|\| c>127))
ali@44	1097	{
ali@44	1098	if (pswit[ECHO_SWITCH])
ali@44	1099	printf("\n%s\n",aline);
ali@44	1100	if (!pswit[OVERVIEW_SWITCH])
ali@44	1101	if (c>127 && c<160)
ali@44	1102	printf(" Line %ld column %d - "
ali@44	1103	"Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
ali@44	1104	else
ali@44	1105	printf(" Line %ld column %d - Non-ASCII character %d\n",
ali@44	1106	linecnt,(int)(s-aline)+1,c);
ali@44	1107	else
ali@44	1108	cnt_bin++;
ali@44	1109	eNon_A=1;
ali@44	1110	}
ali@44	1111	if (!eTab && *s==CHAR_TAB)
ali@44	1112	{
ali@44	1113	if (pswit[ECHO_SWITCH])
ali@44	1114	printf("\n%s\n",aline);
ali@44	1115	if (!pswit[OVERVIEW_SWITCH])
ali@44	1116	printf(" Line %ld column %d - Tab character?\n",
ali@44	1117	linecnt,(int)(s-aline)+1);
ali@44	1118	else
ali@44	1119	cnt_odd++;
ali@44	1120	eTab=1;
ali@44	1121	}
ali@44	1122	if (!eTilde && *s==CHAR_TILDE)
ali@44	1123	{
ali@44	1124	/*
ali@44	1125	* Often used by OCR software to indicate an
ali@44	1126	* unrecognizable character.
ali@44	1127	*/
ali@44	1128	if (pswit[ECHO_SWITCH])
ali@44	1129	printf("\n%s\n",aline);
ali@44	1130	if (!pswit[OVERVIEW_SWITCH])
ali@44	1131	printf(" Line %ld column %d - Tilde character?\n",
ali@44	1132	linecnt,(int)(s-aline)+1);
ali@44	1133	else
ali@44	1134	cnt_odd++;
ali@44	1135	eTilde=1;
ali@44	1136	}
ali@44	1137	if (!eCarat && *s==CHAR_CARAT)
ali@44	1138	{
ali@44	1139	if (pswit[ECHO_SWITCH])
ali@44	1140	printf("\n%s\n",aline);
ali@44	1141	if (!pswit[OVERVIEW_SWITCH])
ali@44	1142	printf(" Line %ld column %d - Carat character?\n",
ali@44	1143	linecnt,(int)(s-aline)+1);
ali@44	1144	else
ali@44	1145	cnt_odd++;
ali@44	1146	eCarat=1;
ali@44	1147	}
ali@44	1148	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
ali@44	1149	{
ali@44	1150	if (pswit[ECHO_SWITCH])
ali@44	1151	printf("\n%s\n",aline);
ali@44	1152	if (!pswit[OVERVIEW_SWITCH])
ali@44	1153	printf(" Line %ld column %d - Forward slash?\n",
ali@44	1154	linecnt,(int)(s-aline)+1);
ali@44	1155	else
ali@44	1156	cnt_odd++;
ali@44	1157	eFSlash=1;
ali@44	1158	}
ali@44	1159	/*
ali@44	1160	* Report asterisks only in paranoid mode,
ali@44	1161	* since they're often deliberate.
ali@44	1162	*/
ali@44	1163	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@44	1164	*s==CHAR_ASTERISK)
ali@44	1165	{
ali@44	1166	if (pswit[ECHO_SWITCH])
ali@44	1167	printf("\n%s\n",aline);
ali@44	1168	if (!pswit[OVERVIEW_SWITCH])
ali@44	1169	printf(" Line %ld column %d - Asterisk?\n",
ali@44	1170	linecnt,(int)(s-aline)+1);
ali@44	1171	else
ali@44	1172	cnt_odd++;
ali@44	1173	eAst=1;
ali@44	1174	}
ali@44	1175	}
ali@44	1176	}
ali@44	1177
ali@44	1178	/*
ali@45	1179	* check_for_long_line:
ali@45	1180	*
ali@45	1181	* Check for line too long.
ali@45	1182	*/
ali@45	1183	void check_for_long_line(const char *aline)
ali@45	1184	{
ali@45	1185	if (strlen(aline)>LONGEST_PG_LINE)
ali@45	1186	{
ali@45	1187	if (pswit[ECHO_SWITCH])
ali@45	1188	printf("\n%s\n",aline);
ali@45	1189	if (!pswit[OVERVIEW_SWITCH])
ali@45	1190	printf(" Line %ld column %d - Long line %d\n",
ali@68	1191	linecnt,(int)strlen(aline),(int)strlen(aline));
ali@45	1192	else
ali@45	1193	cnt_long++;
ali@45	1194	}
ali@45	1195	}
ali@45	1196
ali@45	1197	/*
ali@45	1198	* check_for_short_line:
ali@45	1199	*
ali@45	1200	* Check for line too short.
ali@45	1201	*
ali@45	1202	* This one is a bit trickier to implement: we don't want to
ali@45	1203	* flag the last line of a paragraph for being short, so we
ali@45	1204	* have to wait until we know that our current line is a
ali@45	1205	* "normal" line, then report the _previous_ line if it was too
ali@45	1206	* short. We also don't want to report indented lines like
ali@45	1207	* chapter heads or formatted quotations. We therefore keep
ali@45	1208	* last->len as the length of the last line examined, and
ali@45	1209	* last->blen as the length of the last but one, and try to
ali@45	1210	* suppress unnecessary warnings by checking that both were of
ali@45	1211	* "normal" length. We keep the first character of the last
ali@45	1212	* line in last->start, and if it was a space, we assume that
ali@45	1213	* the formatting is deliberate. I can't figure out a way to
ali@45	1214	* distinguish something like a quoted verse left-aligned or
ali@45	1215	* the header or footer of a letter from a paragraph of short
ali@45	1216	* lines - maybe if I examined the whole paragraph, and if the
ali@45	1217	* para has less than, say, 8 lines and if all lines are short,
ali@45	1218	* then just assume it's OK? Need to look at some texts to see
ali@45	1219	* how often a formula like this would get the right result.
ali@45	1220	*/
ali@45	1221	void check_for_short_line(const char aline,const struct line_properties last)
ali@45	1222	{
ali@45	1223	if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
ali@45	1224	last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45	1225	{
ali@45	1226	if (pswit[ECHO_SWITCH])
ali@45	1227	printf("\n%s\n",prevline);
ali@45	1228	if (!pswit[OVERVIEW_SWITCH])
ali@45	1229	printf(" Line %ld column %d - Short line %d?\n",
ali@68	1230	linecnt-1,(int)strlen(prevline),(int)strlen(prevline));
ali@45	1231	else
ali@45	1232	cnt_short++;
ali@45	1233	}
ali@45	1234	}
ali@45	1235
ali@45	1236	/*
ali@46	1237	* check_for_starting_punctuation:
ali@46	1238	*
ali@46	1239	* Look for punctuation other than full ellipses at start of line.
ali@46	1240	*/
ali@46	1241	void check_for_starting_punctuation(const char *aline)
ali@46	1242	{
ali@46	1243	if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
ali@46	1244	{
ali@46	1245	if (pswit[ECHO_SWITCH])
ali@46	1246	printf("\n%s\n",aline);
ali@46	1247	if (!pswit[OVERVIEW_SWITCH])
ali@46	1248	printf(" Line %ld column 1 - Begins with punctuation?\n",
ali@46	1249	linecnt);
ali@46	1250	else
ali@46	1251	cnt_punct++;
ali@46	1252	}
ali@46	1253	}
ali@46	1254
ali@46	1255	/*
ali@47	1256	* check_for_spaced_emdash:
ali@47	1257	*
ali@47	1258	* Check for spaced em-dashes.
ali@47	1259	*
ali@47	1260	* We must check _all_ occurrences of "--" on the line
ali@47	1261	* hence the loop - even if the first double-dash is OK
ali@47	1262	* there may be another that's wrong later on.
ali@47	1263	*/
ali@47	1264	void check_for_spaced_emdash(const char *aline)
ali@47	1265	{
ali@47	1266	const char s,t;
ali@47	1267	s=aline;
ali@47	1268	while ((t=strstr(s,"--")))
ali@47	1269	{
ali@47	1270	if (t>aline && t[-1]==CHAR_SPACE \|\| t[2]==CHAR_SPACE)
ali@47	1271	{
ali@47	1272	if (pswit[ECHO_SWITCH])
ali@47	1273	printf("\n%s\n",aline);
ali@47	1274	if (!pswit[OVERVIEW_SWITCH])
ali@47	1275	printf(" Line %ld column %d - Spaced em-dash?\n",
ali@47	1276	linecnt,(int)(t-aline)+1);
ali@47	1277	else
ali@47	1278	cnt_dash++;
ali@47	1279	}
ali@47	1280	s=t+2;
ali@47	1281	}
ali@47	1282	}
ali@47	1283
ali@47	1284	/*
ali@47	1285	* check_for_spaced_dash:
ali@47	1286	*
ali@47	1287	* Check for spaced dashes.
ali@47	1288	*/
ali@47	1289	void check_for_spaced_dash(const char *aline)
ali@47	1290	{
ali@47	1291	const char *s;
ali@47	1292	if ((s=strstr(aline," -")))
ali@47	1293	{
ali@47	1294	if (s[2]!='-')
ali@47	1295	{
ali@47	1296	if (pswit[ECHO_SWITCH])
ali@47	1297	printf("\n%s\n",aline);
ali@47	1298	if (!pswit[OVERVIEW_SWITCH])
ali@47	1299	printf(" Line %ld column %d - Spaced dash?\n",
ali@47	1300	linecnt,(int)(s-aline)+1);
ali@47	1301	else
ali@47	1302	cnt_dash++;
ali@47	1303	}
ali@47	1304	}
ali@47	1305	else if ((s=strstr(aline,"- ")))
ali@47	1306	{
ali@47	1307	if (s==aline \|\| s[-1]!='-')
ali@47	1308	{
ali@47	1309	if (pswit[ECHO_SWITCH])
ali@47	1310	printf("\n%s\n",aline);
ali@47	1311	if (!pswit[OVERVIEW_SWITCH])
ali@47	1312	printf(" Line %ld column %d - Spaced dash?\n",
ali@47	1313	linecnt,(int)(s-aline)+1);
ali@47	1314	else
ali@47	1315	cnt_dash++;
ali@47	1316	}
ali@47	1317	}
ali@47	1318	}
ali@47	1319
ali@47	1320	/*
ali@48	1321	* check_for_unmarked_paragraphs:
ali@48	1322	*
ali@48	1323	* Check for unmarked paragraphs indicated by separate speakers.
ali@48	1324	*
ali@48	1325	* May well be false positive:
ali@48	1326	* "Bravo!" "Wonderful!" called the crowd.
ali@48	1327	* but useful all the same.
ali@48	1328	*/
ali@48	1329	void check_for_unmarked_paragraphs(const char *aline)
ali@48	1330	{
ali@48	1331	const char *s;
ali@48	1332	s=strstr(aline,"\" \"");
ali@48	1333	if (!s)
ali@48	1334	s=strstr(aline,"\" \"");
ali@48	1335	if (s)
ali@48	1336	{
ali@48	1337	if (pswit[ECHO_SWITCH])
ali@48	1338	printf("\n%s\n",aline);
ali@48	1339	if (!pswit[OVERVIEW_SWITCH])
ali@48	1340	printf(" Line %ld column %d - Query missing paragraph break?\n",
ali@48	1341	linecnt,(int)(s-aline)+1);
ali@48	1342	else
ali@48	1343	cnt_punct++;
ali@48	1344	}
ali@48	1345	}
ali@48	1346
ali@48	1347	/*
ali@49	1348	* check_for_jeebies:
ali@49	1349	*
ali@49	1350	* Check for "to he" and other easy h/b errors.
ali@49	1351	*
ali@49	1352	* This is a very inadequate effort on the h/b problem,
ali@49	1353	* but the phrase "to he" is always an error, whereas "to
ali@49	1354	* be" is quite common.
ali@49	1355	* Similarly, '"Quiet!", be said.' is a non-be error
ali@49	1356	* "to he" is _not_ always an error!:
ali@49	1357	* "Where they went to he couldn't say."
ali@49	1358	* Another false positive:
ali@49	1359	* What would "Cinderella" be without the . . .
ali@49	1360	* and another: "If he wants to he can see for himself."
ali@49	1361	*/
ali@49	1362	void check_for_jeebies(const char *aline)
ali@49	1363	{
ali@49	1364	const char *s;
ali@49	1365	s=strstr(aline," be could ");
ali@49	1366	if (!s)
ali@49	1367	s=strstr(aline," be would ");
ali@49	1368	if (!s)
ali@49	1369	s=strstr(aline," was be ");
ali@49	1370	if (!s)
ali@49	1371	s=strstr(aline," be is ");
ali@49	1372	if (!s)
ali@49	1373	s=strstr(aline," is be ");
ali@49	1374	if (!s)
ali@49	1375	s=strstr(aline,"\", be ");
ali@49	1376	if (!s)
ali@49	1377	s=strstr(aline,"\" be ");
ali@49	1378	if (!s)
ali@49	1379	s=strstr(aline,"\" be ");
ali@49	1380	if (!s)
ali@49	1381	s=strstr(aline," to he ");
ali@49	1382	if (s)
ali@49	1383	{
ali@49	1384	if (pswit[ECHO_SWITCH])
ali@49	1385	printf("\n%s\n",aline);
ali@49	1386	if (!pswit[OVERVIEW_SWITCH])
ali@49	1387	printf(" Line %ld column %d - Query he/be error?\n",
ali@49	1388	linecnt,(int)(s-aline)+1);
ali@49	1389	else
ali@49	1390	cnt_word++;
ali@49	1391	}
ali@49	1392	s=strstr(aline," the had ");
ali@49	1393	if (!s)
ali@49	1394	s=strstr(aline," a had ");
ali@49	1395	if (!s)
ali@49	1396	s=strstr(aline," they bad ");
ali@49	1397	if (!s)
ali@49	1398	s=strstr(aline," she bad ");
ali@49	1399	if (!s)
ali@49	1400	s=strstr(aline," he bad ");
ali@49	1401	if (!s)
ali@49	1402	s=strstr(aline," you bad ");
ali@49	1403	if (!s)
ali@49	1404	s=strstr(aline," i bad ");
ali@49	1405	if (s)
ali@49	1406	{
ali@49	1407	if (pswit[ECHO_SWITCH])
ali@49	1408	printf("\n%s\n",aline);
ali@49	1409	if (!pswit[OVERVIEW_SWITCH])
ali@49	1410	printf(" Line %ld column %d - Query had/bad error?\n",
ali@49	1411	linecnt,(int)(s-aline)+1);
ali@49	1412	else
ali@49	1413	cnt_word++;
ali@49	1414	}
ali@49	1415	s=strstr(aline,"; hut ");
ali@49	1416	if (!s)
ali@49	1417	s=strstr(aline,", hut ");
ali@49	1418	if (s)
ali@49	1419	{
ali@49	1420	if (pswit[ECHO_SWITCH])
ali@49	1421	printf("\n%s\n",aline);
ali@49	1422	if (!pswit[OVERVIEW_SWITCH])
ali@49	1423	printf(" Line %ld column %d - Query hut/but error?\n",
ali@49	1424	linecnt,(int)(s-aline)+1);
ali@49	1425	else
ali@49	1426	cnt_word++;
ali@49	1427	}
ali@49	1428	}
ali@49	1429
ali@49	1430	/*
ali@50	1431	* check_for_mta_from:
ali@50	1432	*
ali@50	1433	* Special case - angled bracket in front of "From" placed there by an
ali@50	1434	* MTA when sending an e-mail.
ali@50	1435	*/
ali@50	1436	void check_for_mta_from(const char *aline)
ali@50	1437	{
ali@50	1438	const char *s;
ali@50	1439	s=strstr(aline,">From");
ali@50	1440	if (s)
ali@50	1441	{
ali@50	1442	if (pswit[ECHO_SWITCH])
ali@50	1443	printf("\n%s\n",aline);
ali@50	1444	if (!pswit[OVERVIEW_SWITCH])
ali@50	1445	printf(" Line %ld column %d - Query angled bracket with From\n",
ali@50	1446	linecnt,(int)(s-aline)+1);
ali@50	1447	else
ali@50	1448	cnt_punct++;
ali@50	1449	}
ali@50	1450	}
ali@50	1451
ali@50	1452	/*
ali@51	1453	* check_for_orphan_character:
ali@51	1454	*
ali@51	1455	* Check for a single character line -
ali@51	1456	* often an overflow from bad wrapping.
ali@51	1457	*/
ali@51	1458	void check_for_orphan_character(const char *aline)
ali@51	1459	{
ali@51	1460	if (*aline && !aline[1])
ali@51	1461	{
ali@51	1462	if (aline=='I' \|\| aline=='V' \|\| aline=='X' \|\| aline=='L' \|\|
ali@51	1463	gcisdigit(*aline))
ali@51	1464	; /* Nothing - ignore numerals alone on a line. */
ali@51	1465	else
ali@51	1466	{
ali@51	1467	if (pswit[ECHO_SWITCH])
ali@51	1468	printf("\n%s\n",aline);
ali@51	1469	if (!pswit[OVERVIEW_SWITCH])
ali@51	1470	printf(" Line %ld column 1 - Query single character line\n",
ali@51	1471	linecnt);
ali@51	1472	else
ali@51	1473	cnt_punct++;
ali@51	1474	}
ali@51	1475	}
ali@51	1476	}
ali@51	1477
ali@51	1478	/*
ali@52	1479	* check_for_pling_scanno:
ali@52	1480	*
ali@52	1481	* Check for I" - often should be !
ali@52	1482	*/
ali@52	1483	void check_for_pling_scanno(const char *aline)
ali@52	1484	{
ali@52	1485	const char *s;
ali@52	1486	s=strstr(aline," I\"");
ali@52	1487	if (s)
ali@52	1488	{
ali@52	1489	if (pswit[ECHO_SWITCH])
ali@52	1490	printf("\n%s\n",aline);
ali@52	1491	if (!pswit[OVERVIEW_SWITCH])
ali@52	1492	printf(" Line %ld column %ld - Query I=exclamation mark?\n",
ali@52	1493	linecnt,s-aline);
ali@52	1494	else
ali@52	1495	cnt_punct++;
ali@52	1496	}
ali@52	1497	}
ali@52	1498
ali@52	1499	/*
ali@53	1500	* check_for_extra_period:
ali@53	1501	*
ali@53	1502	* Check for period without a capital letter. Cut-down from gutspell.
ali@53	1503	* Only works when it happens on a single line.
ali@53	1504	*/
ali@53	1505	void check_for_extra_period(const char aline,const struct warnings warnings)
ali@53	1506	{
ali@53	1507	const char s,t,*s1;
ali@69	1508	int i;
ali@69	1509	gboolean istypo;
ali@69	1510	gchar *testword;
ali@53	1511	if (pswit[PARANOID_SWITCH])
ali@53	1512	{
ali@69	1513	for (t=aline;strstr(t,". ");)
ali@53	1514	{
ali@53	1515	t=strstr(t,". ");
ali@69	1516	if (t==aline)
ali@53	1517	{
ali@53	1518	t++;
ali@53	1519	/* start of line punctuation is handled elsewhere */
ali@53	1520	continue;
ali@53	1521	}
ali@53	1522	if (!gcisalpha(t[-1]))
ali@53	1523	{
ali@53	1524	t++;
ali@53	1525	continue;
ali@53	1526	}
ali@53	1527	if (warnings->isDutch)
ali@53	1528	{
ali@53	1529	/* For Frank & Jeroen -- 's Middags case */
ali@53	1530	if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
ali@53	1531	t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
ali@53	1532	{
ali@53	1533	t++;
ali@53	1534	continue;
ali@53	1535	}
ali@53	1536	}
ali@53	1537	s1=t+2;
ali@53	1538	while (s1 && !gcisalpha(s1) && !isdigit(*s1))
ali@53	1539	s1++;
ali@53	1540	if (s1>='a' && s1<='z')
ali@53	1541	{
ali@53	1542	/* we have something to investigate */
ali@69	1543	istypo=TRUE;
ali@53	1544	/* so let's go back and find out */
ali@69	1545	for (s1=t-1;s1>=aline &&
ali@53	1546	(gcisalpha(s1) \|\| gcisdigit(s1) \|\| *s1==CHAR_SQUOTE &&
ali@53	1547	gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
ali@53	1548	;
ali@53	1549	s1++;
ali@69	1550	s=strchr(s1,'.');
ali@69	1551	if (s)
ali@69	1552	testword=g_strndup(s1,s-s1);
ali@69	1553	else
ali@69	1554	testword=g_strdup(s1);
ali@53	1555	for (i=0;*abbrev[i];i++)
ali@53	1556	if (!strcmp(testword,abbrev[i]))
ali@69	1557	istypo=FALSE;
ali@53	1558	if (gcisdigit(*testword))
ali@69	1559	istypo=FALSE;
ali@53	1560	if (!testword[1])
ali@69	1561	istypo=FALSE;
ali@53	1562	if (isroman(testword))
ali@69	1563	istypo=FALSE;
ali@53	1564	if (istypo)
ali@53	1565	{
ali@69	1566	istypo=FALSE;
ali@53	1567	for (i=0;testword[i];i++)
ali@53	1568	if (strchr(vowels,testword[i]))
ali@69	1569	istypo=TRUE;
ali@53	1570	}
ali@69	1571	if (istypo &&
ali@69	1572	(pswit[VERBOSE_SWITCH] \|\| !g_tree_lookup(qperiod,testword)))
ali@53	1573	{
ali@69	1574	g_tree_insert(qperiod,g_strdup(testword),
ali@69	1575	GINT_TO_POINTER(1));
ali@69	1576	if (pswit[ECHO_SWITCH])
ali@69	1577	printf("\n%s\n",aline);
ali@69	1578	if (!pswit[OVERVIEW_SWITCH])
ali@69	1579	printf(" Line %ld column %d - Extra period?\n",
ali@69	1580	linecnt,(int)(t-aline)+1);
ali@69	1581	else
ali@69	1582	cnt_punct++;
ali@53	1583	}
ali@69	1584	g_free(testword);
ali@53	1585	}
ali@53	1586	t++;
ali@53	1587	}
ali@53	1588	}
ali@53	1589	}
ali@53	1590
ali@53	1591	/*
ali@54	1592	* check_for_following_punctuation:
ali@54	1593	*
ali@54	1594	* Check for words usually not followed by punctuation.
ali@54	1595	*/
ali@54	1596	void check_for_following_punctuation(const char *aline)
ali@54	1597	{
ali@54	1598	int i;
ali@54	1599	const char s,wordstart;
ali@69	1600	gchar inword,t;
ali@54	1601	if (pswit[TYPO_SWITCH])
ali@54	1602	{
ali@54	1603	for (s=aline;*s;)
ali@54	1604	{
ali@54	1605	wordstart=s;
ali@69	1606	t=getaword(&s);
ali@69	1607	if (!*t)
ali@69	1608	{
ali@69	1609	g_free(t);
ali@54	1610	continue;
ali@69	1611	}
ali@69	1612	inword=g_ascii_strdown(t,-1);
ali@69	1613	g_free(t);
ali@54	1614	for (i=0;*nocomma[i];i++)
ali@54	1615	if (!strcmp(inword,nocomma[i]))
ali@54	1616	{
ali@54	1617	if (s==',' \|\| s==';' \|\| *s==':')
ali@54	1618	{
ali@54	1619	if (pswit[ECHO_SWITCH])
ali@54	1620	printf("\n%s\n",aline);
ali@54	1621	if (!pswit[OVERVIEW_SWITCH])
ali@54	1622	printf(" Line %ld column %d - "
ali@54	1623	"Query punctuation after %s?\n",
ali@54	1624	linecnt,(int)(s-aline)+1,inword);
ali@54	1625	else
ali@54	1626	cnt_punct++;
ali@54	1627	}
ali@54	1628	}
ali@54	1629	for (i=0;*noperiod[i];i++)
ali@54	1630	if (!strcmp(inword,noperiod[i]))
ali@54	1631	{
ali@54	1632	if (s=='.' \|\| s=='!')
ali@54	1633	{
ali@54	1634	if (pswit[ECHO_SWITCH])
ali@54	1635	printf("\n%s\n",aline);
ali@54	1636	if (!pswit[OVERVIEW_SWITCH])
ali@54	1637	printf(" Line %ld column %d - "
ali@54	1638	"Query punctuation after %s?\n",
ali@54	1639	linecnt,(int)(s-aline)+1,inword);
ali@54	1640	else
ali@54	1641	cnt_punct++;
ali@54	1642	}
ali@54	1643	}
ali@69	1644	g_free(inword);
ali@54	1645	}
ali@54	1646	}
ali@54	1647	}
ali@54	1648
ali@54	1649	/*
ali@55	1650	* check_for_typos:
ali@55	1651	*
ali@55	1652	* Check for commonly mistyped words,
ali@55	1653	* and digits like 0 for O in a word.
ali@55	1654	*/
ali@55	1655	void check_for_typos(const char aline,struct warnings warnings)
ali@55	1656	{
ali@55	1657	const char s,wordstart;
ali@69	1658	gchar inword,testword;
ali@69	1659	int i,alower,vowel,consonant,*dupcnt;
ali@69	1660	gboolean isdup,istypo;
ali@55	1661	for (s=aline;*s;)
ali@55	1662	{
ali@55	1663	wordstart=s;
ali@69	1664	inword=getaword(&s);
ali@55	1665	if (!*inword)
ali@69	1666	{
ali@69	1667	g_free(inword);
ali@55	1668	continue; /* don't bother with empty lines */
ali@69	1669	}
ali@55	1670	if (mixdigit(inword))
ali@55	1671	{
ali@55	1672	if (pswit[ECHO_SWITCH])
ali@55	1673	printf("\n%s\n",aline);
ali@55	1674	if (!pswit[OVERVIEW_SWITCH])
ali@55	1675	printf(" Line %ld column %d - Query digit in %s\n",
ali@55	1676	linecnt,(int)(wordstart-aline)+1,inword);
ali@55	1677	else
ali@55	1678	cnt_word++;
ali@55	1679	}
ali@55	1680	/*
ali@55	1681	* Put the word through a series of tests for likely typos and OCR
ali@55	1682	* errors.
ali@55	1683	*/
ali@69	1684	if (pswit[TYPO_SWITCH] \|\| pswit[USERTYPO_SWITCH])
ali@55	1685	{
ali@69	1686	istypo=FALSE;
ali@69	1687	testword=g_strdup(inword);
ali@55	1688	alower=0;
ali@68	1689	for (i=0;i<(int)strlen(testword);i++)
ali@55	1690	{
ali@55	1691	/* lowercase for testing */
ali@55	1692	if (testword[i]>='a' && testword[i]<='z')
ali@55	1693	alower=1;
ali@55	1694	if (alower && testword[i]>='A' && testword[i]<='Z')
ali@55	1695	{
ali@55	1696	/*
ali@55	1697	* We have an uppercase mid-word. However, there are
ali@55	1698	* common cases:
ali@55	1699	* Mac and Mc like McGill
ali@55	1700	* French contractions like l'Abbe
ali@55	1701	*/
ali@55	1702	if (i==2 && testword[0]=='m' && testword[1]=='c' \|\|
ali@55	1703	i==3 && testword[0]=='m' && testword[1]=='a' &&
ali@55	1704	testword[2]=='c' \|\| i>0 && testword[i-1]==CHAR_SQUOTE)
ali@55	1705	; /* do nothing! */
ali@55	1706	else
ali@69	1707	istypo=TRUE;
ali@55	1708	}
ali@55	1709	testword[i]=(char)tolower(testword[i]);
ali@55	1710	}
ali@69	1711	}
ali@69	1712	if (pswit[TYPO_SWITCH])
ali@69	1713	{
ali@55	1714	/*
ali@55	1715	* Check for certain unlikely two-letter combinations at word
ali@55	1716	* start and end.
ali@55	1717	*/
ali@55	1718	if (strlen(testword)>1)
ali@55	1719	{
ali@55	1720	for (i=0;*nostart[i];i++)
ali@55	1721	if (!strncmp(testword,nostart[i],2))
ali@69	1722	istypo=TRUE;
ali@55	1723	for (i=0;*noend[i];i++)
ali@55	1724	if (!strncmp(testword+strlen(testword)-2,noend[i],2))
ali@69	1725	istypo=TRUE;
ali@55	1726	}
ali@55	1727	/* ght is common, gbt never. Like that. */
ali@55	1728	if (strstr(testword,"cb"))
ali@69	1729	istypo=TRUE;
ali@55	1730	if (strstr(testword,"gbt"))
ali@69	1731	istypo=TRUE;
ali@55	1732	if (strstr(testword,"pbt"))
ali@69	1733	istypo=TRUE;
ali@55	1734	if (strstr(testword,"tbs"))
ali@69	1735	istypo=TRUE;
ali@55	1736	if (strstr(testword,"mrn"))
ali@69	1737	istypo=TRUE;
ali@55	1738	if (strstr(testword,"ahle"))
ali@69	1739	istypo=TRUE;
ali@55	1740	if (strstr(testword,"ihle"))
ali@69	1741	istypo=TRUE;
ali@55	1742	/*
ali@55	1743	* "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55	1744	* Also "TBI" - frostbite, outbid - but uncommon.
ali@55	1745	* Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55	1746	* numerals, but "ii" is a common scanno.
ali@55	1747	*/
ali@55	1748	if (strstr(testword,"tbi"))
ali@69	1749	istypo=TRUE;
ali@55	1750	if (strstr(testword,"tbe"))
ali@69	1751	istypo=TRUE;
ali@55	1752	if (strstr(testword,"ii"))
ali@69	1753	istypo=TRUE;
ali@55	1754	/*
ali@55	1755	* Check for no vowels or no consonants.
ali@55	1756	* If none, flag a typo.
ali@55	1757	*/
ali@55	1758	if (!istypo && strlen(testword)>1)
ali@55	1759	{
ali@55	1760	vowel=consonant=0;
ali@55	1761	for (i=0;testword[i];i++)
ali@55	1762	{
ali@55	1763	if (testword[i]=='y' \|\| gcisdigit(testword[i]))
ali@55	1764	{
ali@55	1765	/* Yah, this is loose. */
ali@55	1766	vowel++;
ali@55	1767	consonant++;
ali@55	1768	}
ali@55	1769	else if (strchr(vowels,testword[i]))
ali@55	1770	vowel++;
ali@55	1771	else
ali@55	1772	consonant++;
ali@55	1773	}
ali@55	1774	if (!vowel \|\| !consonant)
ali@69	1775	istypo=TRUE;
ali@55	1776	}
ali@55	1777	/*
ali@55	1778	* Now exclude the word from being reported if it's in
ali@55	1779	* the okword list.
ali@55	1780	*/
ali@55	1781	for (i=0;*okword[i];i++)
ali@55	1782	if (!strcmp(testword,okword[i]))
ali@69	1783	istypo=FALSE;
ali@55	1784	/*
ali@55	1785	* What looks like a typo may be a Roman numeral.
ali@55	1786	* Exclude these.
ali@55	1787	*/
ali@55	1788	if (istypo && isroman(testword))
ali@69	1789	istypo=FALSE;
ali@55	1790	/* Check the manual list of typos. */
ali@55	1791	if (!istypo)
ali@55	1792	for (i=0;*typo[i];i++)
ali@55	1793	if (!strcmp(testword,typo[i]))
ali@69	1794	istypo=TRUE;
ali@55	1795	/*
ali@55	1796	* Check lowercase s, l, i and m - special cases.
ali@55	1797	* "j" - often a semi-colon gone wrong.
ali@55	1798	* "d" for a missing apostrophe - he d
ali@55	1799	* "n" for "in"
ali@55	1800	*/
ali@55	1801	if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
ali@69	1802	istypo=TRUE;
ali@55	1803	if (istypo)
ali@55	1804	{
ali@69	1805	dupcnt=g_tree_lookup(qword,testword);
ali@69	1806	if (dupcnt)
ali@69	1807	{
ali@69	1808	(*dupcnt)++;
ali@69	1809	isdup=!pswit[VERBOSE_SWITCH];
ali@69	1810	}
ali@69	1811	else
ali@69	1812	{
ali@69	1813	dupcnt=g_new0(int,1);
ali@69	1814	g_tree_insert(qword,g_strdup(testword),dupcnt);
ali@69	1815	isdup=FALSE;
ali@69	1816	}
ali@55	1817	if (!isdup)
ali@55	1818	{
ali@55	1819	if (pswit[ECHO_SWITCH])
ali@55	1820	printf("\n%s\n",aline);
ali@55	1821	if (!pswit[OVERVIEW_SWITCH])
ali@55	1822	{
ali@55	1823	printf(" Line %ld column %d - Query word %s",
ali@55	1824	linecnt,(int)(wordstart-aline)+1,inword);
ali@69	1825	if (!pswit[VERBOSE_SWITCH])
ali@55	1826	printf(" - not reporting duplicates");
ali@55	1827	printf("\n");
ali@55	1828	}
ali@55	1829	else
ali@55	1830	cnt_word++;
ali@55	1831	}
ali@55	1832	}
ali@55	1833	}
ali@55	1834	/* check the user's list of typos */
ali@69	1835	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
ali@69	1836	{
ali@69	1837	if (pswit[ECHO_SWITCH])
ali@69	1838	printf("\n%s\n",aline);
ali@69	1839	if (!pswit[OVERVIEW_SWITCH])
ali@69	1840	printf(" Line %ld column %d - Query possible scanno %s\n",
ali@69	1841	linecnt,(int)(wordstart-aline)+2,inword);
ali@69	1842	}
ali@69	1843	if (pswit[TYPO_SWITCH] \|\| pswit[USERTYPO_SWITCH])
ali@69	1844	g_free(testword);
ali@55	1845	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55	1846	{
ali@55	1847	/* In paranoid mode, query all 0 and 1 standing alone. */
ali@55	1848	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@55	1849	{
ali@55	1850	if (pswit[ECHO_SWITCH])
ali@55	1851	printf("\n%s\n",aline);
ali@55	1852	if (!pswit[OVERVIEW_SWITCH])
ali@55	1853	printf(" Line %ld column %d - Query standalone %s\n",
ali@55	1854	linecnt,(int)(wordstart-aline)+2,inword);
ali@55	1855	else
ali@55	1856	cnt_word++;
ali@55	1857	}
ali@55	1858	}
ali@69	1859	g_free(inword);
ali@55	1860	}
ali@55	1861	}
ali@55	1862
ali@56	1863	/*
ali@56	1864	* check_for_misspaced_punctuation:
ali@56	1865	*
ali@56	1866	* Look for added or missing spaces around punctuation and quotes.
ali@56	1867	* If there is a punctuation character like ! with no space on
ali@56	1868	* either side, suspect a missing!space. If there are spaces on
ali@56	1869	* both sides , assume a typo. If we see a double quote with no
ali@56	1870	* space or punctuation on either side of it, assume unspaced
ali@56	1871	* quotes "like"this.
ali@56	1872	*/
ali@56	1873	void check_for_misspaced_punctuation(const char *aline,
ali@69	1874	struct parities *parities,gboolean isemptyline)
ali@56	1875	{
ali@69	1876	int i,llen;
ali@69	1877	gboolean isacro,isellipsis;
ali@56	1878	const char *s;
ali@56	1879	llen=strlen(aline);
ali@56	1880	for (i=1;i<llen;i++)
ali@56	1881	{
ali@56	1882	/* For each character in the line after the first. */
ali@56	1883	if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
ali@56	1884	{
ali@56	1885	/* we need to suppress warnings for acronyms like M.D. */
ali@69	1886	isacro=FALSE;
ali@56	1887	/* we need to suppress warnings for ellipsis . . . */
ali@69	1888	isellipsis=FALSE;
ali@56	1889	/* if there are letters on both sides of it or ... */
ali@56	1890	if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) \|\|
ali@56	1891	gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
ali@56	1892	{
ali@56	1893	/* ...if it's strict punctuation followed by an alpha */
ali@56	1894	if (aline[i]=='.')
ali@56	1895	{
ali@56	1896	if (i>2 && aline[i-2]=='.')
ali@69	1897	isacro=TRUE;
ali@56	1898	if (i+2<llen && aline[i+2]=='.')
ali@69	1899	isacro=TRUE;
ali@56	1900	}
ali@56	1901	if (!isacro)
ali@56	1902	{
ali@56	1903	if (pswit[ECHO_SWITCH])
ali@56	1904	printf("\n%s\n",aline);
ali@56	1905	if (!pswit[OVERVIEW_SWITCH])
ali@56	1906	printf(" Line %ld column %d - Missing space?\n",
ali@56	1907	linecnt,i+1);
ali@56	1908	else
ali@56	1909	cnt_punct++;
ali@56	1910	}
ali@56	1911	}
ali@56	1912	if (aline[i-1]==CHAR_SPACE &&
ali@56	1913	(aline[i+1]==CHAR_SPACE \|\| aline[i+1]==0))
ali@56	1914	{
ali@56	1915	/*
ali@56	1916	* If there are spaces on both sides,
ali@56	1917	* or space before and end of line.
ali@56	1918	*/
ali@56	1919	if (aline[i]=='.')
ali@56	1920	{
ali@56	1921	if (i>2 && aline[i-2]=='.')
ali@69	1922	isellipsis=TRUE;
ali@56	1923	if (i+2<llen && aline[i+2]=='.')
ali@69	1924	isellipsis=TRUE;
ali@56	1925	}
ali@56	1926	if (!isemptyline && !isellipsis)
ali@56	1927	{
ali@56	1928	if (pswit[ECHO_SWITCH])
ali@56	1929	printf("\n%s\n",aline);
ali@56	1930	if (!pswit[OVERVIEW_SWITCH])
ali@56	1931	printf(" Line %ld column %d - "
ali@56	1932	"Spaced punctuation?\n",linecnt,i+1);
ali@56	1933	else
ali@56	1934	cnt_punct++;
ali@56	1935	}
ali@56	1936	}
ali@56	1937	}
ali@56	1938	}
ali@56	1939	/* Split out the characters that CANNOT be preceded by space. */
ali@56	1940	llen=strlen(aline);
ali@56	1941	for (i=1;i<llen;i++)
ali@56	1942	{
ali@56	1943	/* for each character in the line after the first */
ali@56	1944	if (strchr("?!,;:",aline[i]))
ali@56	1945	{
ali@56	1946	/* if it's punctuation that _cannot_ have a space before it */
ali@56	1947	if (aline[i-1]==CHAR_SPACE && !isemptyline &&
ali@56	1948	aline[i+1]!=CHAR_SPACE)
ali@56	1949	{
ali@56	1950	/*
ali@56	1951	* If aline[i+1) DOES == space,
ali@56	1952	* it was already reported just above.
ali@56	1953	*/
ali@56	1954	if (pswit[ECHO_SWITCH])
ali@56	1955	printf("\n%s\n",aline);
ali@56	1956	if (!pswit[OVERVIEW_SWITCH])
ali@56	1957	printf(" Line %ld column %d - Spaced punctuation?\n",
ali@56	1958	linecnt,i+1);
ali@56	1959	else
ali@56	1960	cnt_punct++;
ali@56	1961	}
ali@56	1962	}
ali@56	1963	}
ali@56	1964	/*
ali@56	1965	* Special case " .X" where X is any alpha.
ali@56	1966	* This plugs a hole in the acronym code above.
ali@56	1967	* Inelegant, but maintainable.
ali@56	1968	*/
ali@56	1969	llen=strlen(aline);
ali@56	1970	for (i=1;i<llen;i++)
ali@56	1971	{
ali@56	1972	/* for each character in the line after the first */
ali@56	1973	if (aline[i]=='.')
ali@56	1974	{
ali@56	1975	/* if it's a period */
ali@56	1976	if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
ali@56	1977	{
ali@56	1978	/*
ali@56	1979	* If the period follows a space and
ali@56	1980	* is followed by a letter.
ali@56	1981	*/
ali@56	1982	if (pswit[ECHO_SWITCH])
ali@56	1983	printf("\n%s\n",aline);
ali@56	1984	if (!pswit[OVERVIEW_SWITCH])
ali@56	1985	printf(" Line %ld column %d - Spaced punctuation?\n",
ali@56	1986	linecnt,i+1);
ali@56	1987	else
ali@56	1988	cnt_punct++;
ali@56	1989	}
ali@56	1990	}
ali@56	1991	}
ali@56	1992	for (i=1;i<llen;i++)
ali@56	1993	{
ali@56	1994	/* for each character in the line after the first */
ali@56	1995	if (aline[i]==CHAR_DQUOTE)
ali@56	1996	{
ali@56	1997	if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
ali@56	1998	!strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] \|\|
ali@56	1999	!strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
ali@56	2000	{
ali@56	2001	if (pswit[ECHO_SWITCH])
ali@56	2002	printf("\n%s\n",aline);
ali@56	2003	if (!pswit[OVERVIEW_SWITCH])
ali@56	2004	printf(" Line %ld column %d - Unspaced quotes?\n",
ali@56	2005	linecnt,i+1);
ali@56	2006	else
ali@56	2007	cnt_punct++;
ali@56	2008	}
ali@56	2009	}
ali@56	2010	}
ali@56	2011	/* Check parity of quotes. */
ali@56	2012	for (s=aline;*s;s++)
ali@56	2013	{
ali@56	2014	if (*s==CHAR_DQUOTE)
ali@56	2015	{
ali@56	2016	parities->dquote=!parities->dquote;
ali@56	2017	if (!parities->dquote)
ali@56	2018	{
ali@56	2019	/* parity even */
ali@56	2020	if (!strchr("_-.'`/,;:!?)]} ",s[1]))
ali@56	2021	{
ali@56	2022	if (pswit[ECHO_SWITCH])
ali@56	2023	printf("\n%s\n",aline);
ali@56	2024	if (!pswit[OVERVIEW_SWITCH])
ali@56	2025	printf(" Line %ld column %d - "
ali@56	2026	"Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56	2027	else
ali@56	2028	cnt_punct++;
ali@56	2029	}
ali@56	2030	}
ali@56	2031	else
ali@56	2032	{
ali@56	2033	/* parity odd */
ali@56	2034	if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@56	2035	!strchr("_-/.'`([{$",s[1]) \|\| !s[1])
ali@56	2036	{
ali@56	2037	if (pswit[ECHO_SWITCH])
ali@56	2038	printf("\n%s\n",aline);
ali@56	2039	if (!pswit[OVERVIEW_SWITCH])
ali@56	2040	printf(" Line %ld column %d - "
ali@56	2041	"Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
ali@56	2042	else
ali@56	2043	cnt_punct++;
ali@56	2044	}
ali@56	2045	}
ali@56	2046	}
ali@56	2047	}
ali@56	2048	if (*aline==CHAR_DQUOTE)
ali@56	2049	{
ali@56	2050	if (strchr(",;:!?)]} ",aline[1]))
ali@56	2051	{
ali@56	2052	if (pswit[ECHO_SWITCH])
ali@56	2053	printf("\n%s\n",aline);
ali@56	2054	if (!pswit[OVERVIEW_SWITCH])
ali@56	2055	printf(" Line %ld column 1 - Wrongspaced quotes?\n",
ali@56	2056	linecnt);
ali@56	2057	else
ali@56	2058	cnt_punct++;
ali@56	2059	}
ali@56	2060	}
ali@56	2061	if (pswit[SQUOTE_SWITCH])
ali@56	2062	{
ali@56	2063	for (s=aline;*s;s++)
ali@56	2064	{
ali@56	2065	if ((s==CHAR_SQUOTE \|\| s==CHAR_OPEN_SQUOTE) &&
ali@56	2066	(s==aline \|\| s>aline && !gcisalpha(s[-1]) \|\|
ali@56	2067	!gcisalpha(s[1])))
ali@56	2068	{
ali@56	2069	parities->squote=!parities->squote;
ali@56	2070	if (!parities->squote)
ali@56	2071	{
ali@56	2072	/* parity even */
ali@56	2073	if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
ali@56	2074	{
ali@56	2075	if (pswit[ECHO_SWITCH])
ali@56	2076	printf("\n%s\n",aline);
ali@56	2077	if (!pswit[OVERVIEW_SWITCH])
ali@56	2078	printf(" Line %ld column %d - "
ali@56	2079	"Wrongspaced singlequotes?\n",
ali@56	2080	linecnt,(int)(s-aline)+1);
ali@56	2081	else
ali@56	2082	cnt_punct++;
ali@56	2083	}
ali@56	2084	}
ali@56	2085	else
ali@56	2086	{
ali@56	2087	/* parity odd */
ali@56	2088	if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
ali@56	2089	!strchr("_-/\".'`",s[1]) \|\| !s[1])
ali@56	2090	{
ali@56	2091	if (pswit[ECHO_SWITCH])
ali@56	2092	printf("\n%s\n",aline);
ali@56	2093	if (!pswit[OVERVIEW_SWITCH])
ali@56	2094	printf(" Line %ld column %d - "
ali@56	2095	"Wrongspaced singlequotes?\n",
ali@56	2096	linecnt,(int)(s-aline)+1);
ali@56	2097	else
ali@56	2098	cnt_punct++;
ali@56	2099	}
ali@56	2100	}
ali@56	2101	}
ali@56	2102	}
ali@56	2103	}
ali@56	2104	}
ali@56	2105
ali@55	2106	/*
ali@57	2107	* check_for_double_punctuation:
ali@57	2108	*
ali@57	2109	* Look for double punctuation like ,. or ,,
ali@57	2110	* Thanks to DW for the suggestion!
ali@57	2111	* In books with references, ".," and ".;" are common
ali@57	2112	* e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57	2113	* OTOH, from my initial tests, there are also fairly
ali@57	2114	* common errors. What to do? Make these cases paranoid?
ali@57	2115	* ".," is the most common, so warnings->dotcomma is used
ali@57	2116	* to suppress detailed reporting if it occurs often.
ali@57	2117	*/
ali@57	2118	void check_for_double_punctuation(const char aline,struct warnings warnings)
ali@57	2119	{
ali@57	2120	int i,llen;
ali@57	2121	llen=strlen(aline);
ali@57	2122	for (i=0;i<llen;i++)
ali@57	2123	{
ali@57	2124	/* for each punctuation character in the line */
ali@57	2125	if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&
ali@57	2126	aline[i] && aline[i+1])
ali@57	2127	{
ali@57	2128	/* followed by punctuation, it's a query, unless . . . */
ali@57	2129	if (aline[i]==aline[i+1] && (aline[i]=='.' \|\| aline[i]=='?' \|\|
ali@57	2130	aline[i]=='!') \|\|
ali@57	2131	!warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' \|\|
ali@57	2132	warnings->isFrench && !strncmp(aline+i,",...",4) \|\|
ali@57	2133	warnings->isFrench && !strncmp(aline+i,"...,",4) \|\|
ali@57	2134	warnings->isFrench && !strncmp(aline+i,";...",4) \|\|
ali@57	2135	warnings->isFrench && !strncmp(aline+i,"...;",4) \|\|
ali@57	2136	warnings->isFrench && !strncmp(aline+i,":...",4) \|\|
ali@57	2137	warnings->isFrench && !strncmp(aline+i,"...:",4) \|\|
ali@57	2138	warnings->isFrench && !strncmp(aline+i,"!...",4) \|\|
ali@57	2139	warnings->isFrench && !strncmp(aline+i,"...!",4) \|\|
ali@57	2140	warnings->isFrench && !strncmp(aline+i,"?...",4) \|\|
ali@57	2141	warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@57	2142	{
ali@57	2143	if (warnings->isFrench && !strncmp(aline+i,",...",4) \|\|
ali@57	2144	warnings->isFrench && !strncmp(aline+i,"...,",4) \|\|
ali@57	2145	warnings->isFrench && !strncmp(aline+i,";...",4) \|\|
ali@57	2146	warnings->isFrench && !strncmp(aline+i,"...;",4) \|\|
ali@57	2147	warnings->isFrench && !strncmp(aline+i,":...",4) \|\|
ali@57	2148	warnings->isFrench && !strncmp(aline+i,"...:",4) \|\|
ali@57	2149	warnings->isFrench && !strncmp(aline+i,"!...",4) \|\|
ali@57	2150	warnings->isFrench && !strncmp(aline+i,"...!",4) \|\|
ali@57	2151	warnings->isFrench && !strncmp(aline+i,"?...",4) \|\|
ali@57	2152	warnings->isFrench && !strncmp(aline+i,"...?",4))
ali@57	2153	i+=4;
ali@57	2154	; /* do nothing for .. !! and ?? which can be legit */
ali@57	2155	}
ali@57	2156	else
ali@57	2157	{
ali@57	2158	if (pswit[ECHO_SWITCH])
ali@57	2159	printf("\n%s\n",aline);
ali@57	2160	if (!pswit[OVERVIEW_SWITCH])
ali@57	2161	printf(" Line %ld column %d - Double punctuation?\n",
ali@57	2162	linecnt,i+1);
ali@57	2163	else
ali@57	2164	cnt_punct++;
ali@57	2165	}
ali@57	2166	}
ali@57	2167	}
ali@57	2168	}
ali@57	2169
ali@57	2170	/*
ali@58	2171	* check_for_spaced_quotes:
ali@58	2172	*/
ali@58	2173	void check_for_spaced_quotes(const char *aline)
ali@58	2174	{
ali@58	2175	const char s,t;
ali@58	2176	s=aline;
ali@58	2177	while ((t=strstr(s," \" ")))
ali@58	2178	{
ali@58	2179	if (pswit[ECHO_SWITCH])
ali@58	2180	printf("\n%s\n",aline);
ali@58	2181	if (!pswit[OVERVIEW_SWITCH])
ali@58	2182	printf(" Line %ld column %d - Spaced doublequote?\n",
ali@58	2183	linecnt,(int)(t-aline+1));
ali@58	2184	else
ali@58	2185	cnt_punct++;
ali@58	2186	s=t+2;
ali@58	2187	}
ali@58	2188	s=aline;
ali@58	2189	while ((t=strstr(s," ' ")))
ali@58	2190	{
ali@58	2191	if (pswit[ECHO_SWITCH])
ali@58	2192	printf("\n%s\n",aline);
ali@58	2193	if (!pswit[OVERVIEW_SWITCH])
ali@58	2194	printf(" Line %ld column %d - Spaced singlequote?\n",
ali@58	2195	linecnt,(int)(t-aline+1));
ali@58	2196	else
ali@58	2197	cnt_punct++;
ali@58	2198	s=t+2;
ali@58	2199	}
ali@58	2200	s=aline;
ali@58	2201	while ((t=strstr(s," ` ")))
ali@58	2202	{
ali@58	2203	if (pswit[ECHO_SWITCH])
ali@58	2204	printf("\n%s\n",aline);
ali@58	2205	if (!pswit[OVERVIEW_SWITCH])
ali@58	2206	printf(" Line %ld column %d - Spaced singlequote?\n",
ali@58	2207	linecnt,(int)(t-aline+1));
ali@58	2208	else
ali@58	2209	cnt_punct++;
ali@58	2210	s=t+2;
ali@58	2211	}
ali@58	2212	}
ali@58	2213
ali@58	2214	/*
ali@59	2215	* check_for_miscased_genative:
ali@59	2216	*
ali@59	2217	* Check special case of 'S instead of 's at end of word.
ali@59	2218	*/
ali@59	2219	void check_for_miscased_genative(const char *aline)
ali@59	2220	{
ali@59	2221	const char *s;
ali@69	2222	if (!*aline)
ali@69	2223	return;
ali@59	2224	s=aline+1;
ali@59	2225	while (*s)
ali@59	2226	{
ali@59	2227	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
ali@59	2228	{
ali@59	2229	if (pswit[ECHO_SWITCH])
ali@59	2230	printf("\n%s\n",aline);
ali@59	2231	if (!pswit[OVERVIEW_SWITCH])
ali@59	2232	printf(" Line %ld column %d - Capital \"S\"?\n",
ali@59	2233	linecnt,(int)(s-aline+2));
ali@59	2234	else
ali@59	2235	cnt_punct++;
ali@59	2236	}
ali@59	2237	s++;
ali@59	2238	}
ali@59	2239	}
ali@59	2240
ali@59	2241	/*
ali@60	2242	* check_end_of_line:
ali@60	2243	*
ali@60	2244	* Now check special cases - start and end of line -
ali@60	2245	* for single and double quotes. Start is sometimes [sic]
ali@60	2246	* but better to query it anyway.
ali@60	2247	* While we're here, check for dash at end of line.
ali@60	2248	*/
ali@60	2249	void check_end_of_line(const char aline,struct warnings warnings)
ali@60	2250	{
ali@60	2251	int i,llen;
ali@60	2252	llen=strlen(aline);
ali@60	2253	if (llen>1)
ali@60	2254	{
ali@60	2255	if (aline[llen-1]==CHAR_DQUOTE \|\| aline[llen-1]==CHAR_SQUOTE \|\|
ali@60	2256	aline[llen-1]==CHAR_OPEN_SQUOTE)
ali@60	2257	if (aline[llen-2]==CHAR_SPACE)
ali@60	2258	{
ali@60	2259	if (pswit[ECHO_SWITCH])
ali@60	2260	printf("\n%s\n",aline);
ali@60	2261	if (!pswit[OVERVIEW_SWITCH])
ali@60	2262	printf(" Line %ld column %d - Spaced quote?\n",
ali@60	2263	linecnt,llen);
ali@60	2264	else
ali@60	2265	cnt_punct++;
ali@60	2266	}
ali@60	2267	if ((aline[0]==CHAR_SQUOTE \|\| aline[0]==CHAR_OPEN_SQUOTE) &&
ali@60	2268	aline[1]==CHAR_SPACE)
ali@60	2269	{
ali@60	2270	if (pswit[ECHO_SWITCH])
ali@60	2271	printf("\n%s\n",aline);
ali@60	2272	if (!pswit[OVERVIEW_SWITCH])
ali@60	2273	printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
ali@60	2274	else
ali@60	2275	cnt_punct++;
ali@60	2276	}
ali@60	2277	/*
ali@60	2278	* Dash at end of line may well be legit - paranoid mode only
ali@60	2279	* and don't report em-dash at line-end.
ali@60	2280	*/
ali@60	2281	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@60	2282	{
ali@60	2283	for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
ali@60	2284	;
ali@60	2285	if (aline[i]=='-' && aline[i-1]!='-')
ali@60	2286	{
ali@60	2287	if (pswit[ECHO_SWITCH])
ali@60	2288	printf("\n%s\n",aline);
ali@60	2289	if (!pswit[OVERVIEW_SWITCH])
ali@60	2290	printf(" Line %ld column %d - Hyphen at end of line?\n",
ali@60	2291	linecnt,i);
ali@60	2292	}
ali@60	2293	}
ali@60	2294	}
ali@60	2295	}
ali@60	2296
ali@60	2297	/*
ali@61	2298	* check_for_unspaced_bracket:
ali@61	2299	*
ali@61	2300	* Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@61	2301	* If so, suspect a scanno like "a]most".
ali@61	2302	*/
ali@61	2303	void check_for_unspaced_bracket(const char *aline)
ali@61	2304	{
ali@61	2305	int i,llen;
ali@61	2306	llen=strlen(aline);
ali@61	2307	for (i=1;i<llen-1;i++)
ali@61	2308	{
ali@61	2309	/* for each bracket character in the line except 1st & last */
ali@61	2310	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
ali@61	2311	gcisalpha(aline[i+1]))
ali@61	2312	{
ali@61	2313	if (pswit[ECHO_SWITCH])
ali@61	2314	printf("\n%s\n",aline);
ali@61	2315	if (!pswit[OVERVIEW_SWITCH])
ali@61	2316	printf(" Line %ld column %d - Unspaced bracket?\n",
ali@61	2317	linecnt,i);
ali@61	2318	else
ali@61	2319	cnt_punct++;
ali@61	2320	}
ali@61	2321	}
ali@61	2322	}
ali@61	2323
ali@61	2324	/*
ali@62	2325	* check_for_unpunctuated_endquote:
ali@62	2326	*/
ali@62	2327	void check_for_unpunctuated_endquote(const char *aline)
ali@62	2328	{
ali@62	2329	int i,llen;
ali@62	2330	llen=strlen(aline);
ali@62	2331	for (i=1;i<llen;i++)
ali@62	2332	{
ali@62	2333	/* for each character in the line except 1st */
ali@62	2334	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
ali@62	2335	{
ali@62	2336	if (pswit[ECHO_SWITCH])
ali@62	2337	printf("\n%s\n",aline);
ali@62	2338	if (!pswit[OVERVIEW_SWITCH])
ali@62	2339	printf(" Line %ld column %d - "
ali@62	2340	"endquote missing punctuation?\n",linecnt,i);
ali@62	2341	else
ali@62	2342	cnt_punct++;
ali@62	2343	}
ali@62	2344	}
ali@62	2345	}
ali@62	2346
ali@62	2347	/*
ali@63	2348	* check_for_html_tag:
ali@63	2349	*
ali@63	2350	* Check for <HTML TAG>.
ali@63	2351	*
ali@63	2352	* If there is a < in the line, followed at some point
ali@63	2353	* by a > then we suspect HTML.
ali@63	2354	*/
ali@63	2355	void check_for_html_tag(const char *aline)
ali@63	2356	{
ali@63	2357	int i;
ali@63	2358	const char open,close;
ali@63	2359	open=strstr(aline,"<");
ali@63	2360	if (open)
ali@63	2361	{
ali@63	2362	close=strstr(aline,">");
ali@63	2363	if (close)
ali@63	2364	{
ali@68	2365	i=(int)(close-open+1);
ali@63	2366	if (i>0)
ali@63	2367	{
ali@63	2368	if (pswit[ECHO_SWITCH])
ali@63	2369	printf("\n%s\n",aline);
ali@63	2370	if (!pswit[OVERVIEW_SWITCH])
ali@69	2371	printf(" Line %ld column %d - HTML Tag? %.s \n",
ali@69	2372	linecnt,(int)(open-aline)+1,i,i,open);
ali@63	2373	else
ali@63	2374	cnt_html++;
ali@63	2375	}
ali@63	2376	}
ali@63	2377	}
ali@63	2378	}
ali@63	2379
ali@63	2380	/*
ali@64	2381	* check_for_html_entity:
ali@64	2382	*
ali@64	2383	* Check for &symbol; HTML.
ali@64	2384	*
ali@64	2385	* If there is a & in the line, followed at
ali@64	2386	* some point by a ; then we suspect HTML.
ali@64	2387	*/
ali@64	2388	void check_for_html_entity(const char *aline)
ali@64	2389	{
ali@64	2390	int i;
ali@64	2391	const char s,amp,*scolon;
ali@64	2392	amp=strstr(aline,"&");
ali@64	2393	if (amp)
ali@64	2394	{
ali@64	2395	scolon=strstr(aline,";");
ali@64	2396	if (scolon)
ali@64	2397	{
ali@64	2398	i=(int)(scolon-amp+1);
ali@64	2399	for (s=amp;s<scolon;s++)
ali@64	2400	if (*s==CHAR_SPACE)
ali@68	2401	i=0; /* Don't report "Jones & Son;" */
ali@64	2402	if (i>0)
ali@64	2403	{
ali@64	2404	if (pswit[ECHO_SWITCH])
ali@64	2405	printf("\n%s\n",aline);
ali@64	2406	if (!pswit[OVERVIEW_SWITCH])
ali@69	2407	printf(" Line %ld column %d - HTML symbol? %.s \n",
ali@69	2408	linecnt,(int)(amp-aline)+1,i,i,amp);
ali@64	2409	else
ali@64	2410	cnt_html++;
ali@64	2411	}
ali@64	2412	}
ali@64	2413	}
ali@64	2414	}
ali@64	2415
ali@65	2416	/*
ali@65	2417	* print_pending:
ali@65	2418	*
ali@65	2419	* If we are in a state of unbalanced quotes, and this line
ali@65	2420	* doesn't begin with a quote, output the stored error message.
ali@65	2421	* If the -P switch was used, print the warning even if the
ali@65	2422	* new para starts with quotes.
ali@65	2423	*/
ali@65	2424	void print_pending(const char aline,const char parastart,
ali@65	2425	struct pending *pending)
ali@65	2426	{
ali@65	2427	const char *s;
ali@65	2428	s=aline;
ali@65	2429	while (*s==' ')
ali@65	2430	s++;
ali@69	2431	if (pending->dquote)
ali@69	2432	{
ali@65	2433	if (*s!=CHAR_DQUOTE \|\| pswit[QPARA_SWITCH])
ali@65	2434	{
ali@65	2435	if (!pswit[OVERVIEW_SWITCH])
ali@65	2436	{
ali@65	2437	if (pswit[ECHO_SWITCH])
ali@65	2438	printf("\n%s\n",parastart);
ali@65	2439	puts(pending->dquote);
ali@65	2440	}
ali@65	2441	else
ali@65	2442	cnt_dquot++;
ali@65	2443	}
ali@69	2444	g_free(pending->dquote);
ali@69	2445	pending->dquote=NULL;
ali@69	2446	}
ali@69	2447	if (pending->squote)
ali@65	2448	{
ali@65	2449	if (s!=CHAR_SQUOTE && s!=CHAR_OPEN_SQUOTE \|\| pswit[QPARA_SWITCH] \|\|
ali@65	2450	pending->squot)
ali@65	2451	{
ali@65	2452	if (!pswit[OVERVIEW_SWITCH])
ali@65	2453	{
ali@65	2454	if (pswit[ECHO_SWITCH])
ali@65	2455	printf("\n%s\n",parastart);
ali@65	2456	puts(pending->squote);
ali@65	2457	}
ali@65	2458	else
ali@65	2459	cnt_squot++;
ali@65	2460	}
ali@69	2461	g_free(pending->squote);
ali@69	2462	pending->squote=NULL;
ali@65	2463	}
ali@69	2464	if (pending->rbrack)
ali@65	2465	{
ali@65	2466	if (!pswit[OVERVIEW_SWITCH])
ali@65	2467	{
ali@65	2468	if (pswit[ECHO_SWITCH])
ali@65	2469	printf("\n%s\n",parastart);
ali@65	2470	puts(pending->rbrack);
ali@65	2471	}
ali@65	2472	else
ali@65	2473	cnt_brack++;
ali@69	2474	g_free(pending->rbrack);
ali@69	2475	pending->rbrack=NULL;
ali@65	2476	}
ali@69	2477	if (pending->sbrack)
ali@65	2478	{
ali@65	2479	if (!pswit[OVERVIEW_SWITCH])
ali@65	2480	{
ali@65	2481	if (pswit[ECHO_SWITCH])
ali@65	2482	printf("\n%s\n",parastart);
ali@65	2483	puts(pending->sbrack);
ali@65	2484	}
ali@65	2485	else
ali@65	2486	cnt_brack++;
ali@69	2487	g_free(pending->sbrack);
ali@69	2488	pending->sbrack=NULL;
ali@65	2489	}
ali@69	2490	if (pending->cbrack)
ali@65	2491	{
ali@65	2492	if (!pswit[OVERVIEW_SWITCH])
ali@65	2493	{
ali@65	2494	if (pswit[ECHO_SWITCH])
ali@65	2495	printf("\n%s\n",parastart);
ali@65	2496	puts(pending->cbrack);
ali@65	2497	}
ali@65	2498	else
ali@65	2499	cnt_brack++;
ali@69	2500	g_free(pending->cbrack);
ali@69	2501	pending->cbrack=NULL;
ali@65	2502	}
ali@69	2503	if (pending->unders)
ali@65	2504	{
ali@65	2505	if (!pswit[OVERVIEW_SWITCH])
ali@65	2506	{
ali@65	2507	if (pswit[ECHO_SWITCH])
ali@65	2508	printf("\n%s\n",parastart);
ali@65	2509	puts(pending->unders);
ali@65	2510	}
ali@65	2511	else
ali@65	2512	cnt_brack++;
ali@69	2513	g_free(pending->unders);
ali@69	2514	pending->unders=NULL;
ali@65	2515	}
ali@65	2516	}
ali@65	2517
ali@65	2518	/*
ali@65	2519	* check_for_mismatched_quotes:
ali@65	2520	*
ali@65	2521	* At end of paragraph, check for mismatched quotes.
ali@65	2522	*
ali@65	2523	* We don't want to report an error immediately, since it is a
ali@65	2524	* common convention to omit the quotes at end of paragraph if
ali@65	2525	* the next paragraph is a continuation of the same speaker.
ali@65	2526	* Where this is the case, the next para should begin with a
ali@65	2527	* quote, so we store the warning message and only display it
ali@65	2528	* at the top of the next iteration if the new para doesn't
ali@65	2529	* start with a quote.
ali@65	2530	* The -p switch overrides this default, and warns of unclosed
ali@65	2531	* quotes on _every_ paragraph, whether the next begins with a
ali@65	2532	* quote or not.
ali@65	2533	*/
ali@65	2534	void check_for_mismatched_quotes(const struct counters *counters,
ali@65	2535	struct pending *pending)
ali@65	2536	{
ali@65	2537	if (counters->quot%2)
ali@69	2538	pending->dquote=
ali@69	2539	g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
ali@65	2540	if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
ali@65	2541	counters->open_single_quote!=counters->close_single_quote)
ali@69	2542	pending->squote=
ali@69	2543	g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
ali@65	2544	if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
ali@65	2545	counters->open_single_quote!=counters->close_single_quote &&
ali@65	2546	counters->open_single_quote!=counters->close_single_quote+1)
ali@65	2547	/*
ali@65	2548	* Flag it to be noted regardless of the
ali@65	2549	* first char of the next para.
ali@65	2550	*/
ali@65	2551	pending->squot=1;
ali@65	2552	if (counters->r_brack)
ali@69	2553	pending->rbrack=
ali@69	2554	g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
ali@65	2555	if (counters->s_brack)
ali@69	2556	pending->sbrack=
ali@69	2557	g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
ali@65	2558	if (counters->c_brack)
ali@69	2559	pending->cbrack=
ali@69	2560	g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
ali@65	2561	if (counters->c_unders%2)
ali@69	2562	pending->unders=
ali@69	2563	g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
ali@65	2564	}
ali@65	2565
ali@64	2566	/*
ali@66	2567	* check_for_omitted_punctuation:
ali@66	2568	*
ali@66	2569	* Check for omitted punctuation at end of paragraph by working back
ali@66	2570	* through prevline. DW.
ali@66	2571	* Need to check this only for "normal" paras.
ali@66	2572	* So what is a "normal" para?
ali@66	2573	* Not normal if one-liner (chapter headings, etc.)
ali@66	2574	* Not normal if doesn't contain at least one locase letter
ali@66	2575	* Not normal if starts with space
ali@66	2576	*/
ali@66	2577	void check_for_omitted_punctuation(const char *prevline,
ali@66	2578	struct line_properties *last,int start_para_line)
ali@66	2579	{
ali@66	2580	int i;
ali@66	2581	const char *s;
ali@66	2582	for (s=prevline,i=0;*s && !i;s++)
ali@66	2583	if (gcisletter(*s))
ali@66	2584	/* use i to indicate the presence of a letter on the line */
ali@66	2585	i=1;
ali@66	2586	/*
ali@66	2587	* This next "if" is a problem.
ali@66	2588	* If we say "start_para_line <= linecnt - 1", that includes
ali@66	2589	* one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@66	2590	* If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@66	2591	* misses genuine one-line paragraphs.
ali@66	2592	*/
ali@66	2593	if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)
ali@66	2594	{
ali@66	2595	for (i=strlen(prevline)-1;
ali@66	2596	(prevline[i]==CHAR_DQUOTE \|\| prevline[i]==CHAR_SQUOTE) &&
ali@66	2597	prevline[i]>CHAR_SPACE && i>0;
ali@66	2598	i--)
ali@66	2599	;
ali@66	2600	for (;i>0;i--)
ali@66	2601	{
ali@66	2602	if (gcisalpha(prevline[i]))
ali@66	2603	{
ali@66	2604	if (pswit[ECHO_SWITCH])
ali@66	2605	printf("\n%s\n",prevline);
ali@66	2606	if (!pswit[OVERVIEW_SWITCH])
ali@66	2607	printf(" Line %ld column %d - "
ali@66	2608	"No punctuation at para end?\n",
ali@68	2609	linecnt-1,(int)strlen(prevline));
ali@66	2610	else
ali@66	2611	cnt_punct++;
ali@66	2612	break;
ali@66	2613	}
ali@66	2614	if (strchr("-.:!([{?}])",prevline[i]))
ali@66	2615	break;
ali@66	2616	}
ali@66	2617	}
ali@66	2618	}
ali@66	2619
ali@69	2620	gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
ali@69	2621	{
ali@69	2622	const char *word=key;
ali@69	2623	int *dupcnt=value;
ali@69	2624	if (*dupcnt)
ali@69	2625	printf("\nNote: Queried word %s was duplicated %d times\n",
ali@69	2626	word,*dupcnt);
ali@69	2627	return FALSE;
ali@69	2628	}
ali@69	2629
ali@66	2630	/*
ali@41	2631	* procfile:
ali@41	2632	*
ali@41	2633	* Process one file.
ali@41	2634	*/
ali@69	2635	void procfile(const char *filename)
ali@41	2636	{
ali@65	2637	const char *s;
ali@69	2638	gchar parastart=NULL; / first line of current para */
ali@69	2639	gchar etext,aline;
ali@69	2640	gchar *etext_ptr;
ali@69	2641	GError *err=NULL;
ali@41	2642	struct first_pass_results *first_pass_results;
ali@42	2643	struct warnings *warnings;
ali@43	2644	struct counters counters={0};
ali@45	2645	struct line_properties last={0};
ali@56	2646	struct parities parities={0};
ali@69	2647	struct pending pending={0};
ali@69	2648	gboolean isemptyline;
ali@68	2649	long start_para_line=0;
ali@69	2650	gboolean isnewpara=FALSE,enddash=FALSE;
ali@45	2651	last.start=CHAR_SPACE;
ali@68	2652	linecnt=checked_linecnt=0;
ali@69	2653	etext=read_etext(filename,&err);
ali@69	2654	if (!etext)
ali@41	2655	{
ali@68	2656	if (pswit[STDOUT_SWITCH])
ali@69	2657	fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
ali@68	2658	else
ali@69	2659	fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
ali@41	2660	exit(1);
ali@41	2661	}
ali@41	2662	fprintf(stdout,"\n\nFile: %s\n\n",filename);
ali@69	2663	first_pass_results=first_pass(etext);
ali@42	2664	warnings=report_first_pass(first_pass_results);
ali@69	2665	qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
ali@69	2666	qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@40	2667	/*
ali@40	2668	* Here we go with the main pass. Hold onto yer hat!
ali@40	2669	*/
ali@65	2670	linecnt=0;
ali@69	2671	etext_ptr=etext;
ali@69	2672	while ((aline=flgets(&etext_ptr,linecnt+1)))
ali@40	2673	{
ali@68	2674	linecnt++;
ali@68	2675	if (linecnt==1)
ali@69	2676	isnewpara=TRUE;
ali@68	2677	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
ali@40	2678	continue; // skip DP page separators completely
ali@68	2679	if (linecnt<first_pass_results->firstline \|\|
ali@41	2680	(first_pass_results->footerline>0 &&
ali@41	2681	linecnt>first_pass_results->footerline))
ali@40	2682	{
ali@68	2683	if (pswit[HEADER_SWITCH])
ali@40	2684	{
ali@68	2685	if (!strncmp(aline,"Title:",6))
ali@68	2686	printf(" %s\n",aline);
ali@68	2687	if (!strncmp(aline,"Author:",7))
ali@68	2688	printf(" %s\n",aline);
ali@68	2689	if (!strncmp(aline,"Release Date:",13))
ali@68	2690	printf(" %s\n",aline);
ali@68	2691	if (!strncmp(aline,"Edition:",8))
ali@68	2692	printf(" %s\n\n",aline);
ali@40	2693	}
ali@68	2694	continue; /* skip through the header */
ali@40	2695	}
ali@68	2696	checked_linecnt++;
ali@65	2697	print_pending(aline,parastart,&pending);
ali@65	2698	memset(&pending,0,sizeof(pending));
ali@43	2699	isemptyline=analyse_quotes(aline,&counters);
ali@68	2700	if (isnewpara && !isemptyline)
ali@40	2701	{
ali@40	2702	/* This line is the start of a new paragraph. */
ali@68	2703	start_para_line=linecnt;
ali@40	2704	/* Capture its first line in case we want to report it later. */
ali@69	2705	g_free(parastart);
ali@69	2706	parastart=g_strdup(aline);
ali@56	2707	memset(&parities,0,sizeof(parities)); /* restart the quote count */
ali@68	2708	s=aline;
ali@68	2709	while (!gcisalpha(s) && !gcisdigit(s) && *s)
ali@40	2710	s++;
ali@68	2711	if (s>='a' && s<='z')
ali@40	2712	{
ali@40	2713	/* and its first letter is lowercase */
ali@68	2714	if (pswit[ECHO_SWITCH])
ali@40	2715	printf("\n%s\n",aline);
ali@68	2716	if (!pswit[OVERVIEW_SWITCH])
ali@68	2717	printf(" Line %ld column %d - "
ali@40	2718	"Paragraph starts with lower-case\n",
ali@40	2719	linecnt,(int)(s-aline)+1);
ali@68	2720	else
ali@68	2721	cnt_punct++;
ali@40	2722	}
ali@69	2723	isnewpara=FALSE; /* Signal the end of new para processing. */
ali@40	2724	}
ali@68	2725	/* Check for an em-dash broken at line end. */
ali@68	2726	if (enddash && *aline=='-')
ali@40	2727	{
ali@68	2728	if (pswit[ECHO_SWITCH])
ali@40	2729	printf("\n%s\n",aline);
ali@68	2730	if (!pswit[OVERVIEW_SWITCH])
ali@68	2731	printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@68	2732	else
ali@68	2733	cnt_punct++;
ali@40	2734	}
ali@69	2735	enddash=FALSE;
ali@68	2736	for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
ali@40	2737	;
ali@68	2738	if (s>=aline && *s=='-')
ali@69	2739	enddash=TRUE;
ali@67	2740	check_for_control_characters(aline);
ali@68	2741	if (warnings->bin)
ali@44	2742	check_for_odd_characters(aline,warnings,isemptyline);
ali@68	2743	if (warnings->longline)
ali@45	2744	check_for_long_line(aline);
ali@68	2745	if (warnings->shortline)
ali@45	2746	check_for_short_line(aline,&last);
ali@68	2747	last.blen=last.len;
ali@68	2748	last.len=strlen(aline);
ali@68	2749	last.start=aline[0];
ali@46	2750	check_for_starting_punctuation(aline);
ali@68	2751	if (warnings->dash)
ali@40	2752	{
ali@47	2753	check_for_spaced_emdash(aline);
ali@47	2754	check_for_spaced_dash(aline);
ali@40	2755	}
ali@48	2756	check_for_unmarked_paragraphs(aline);
ali@49	2757	check_for_jeebies(aline);
ali@50	2758	check_for_mta_from(aline);
ali@51	2759	check_for_orphan_character(aline);
ali@52	2760	check_for_pling_scanno(aline);
ali@53	2761	check_for_extra_period(aline,warnings);
ali@54	2762	check_for_following_punctuation(aline);
ali@55	2763	check_for_typos(aline,warnings);
ali@56	2764	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57	2765	check_for_double_punctuation(aline,warnings);
ali@58	2766	check_for_spaced_quotes(aline);
ali@59	2767	check_for_miscased_genative(aline);
ali@60	2768	check_end_of_line(aline,warnings);
ali@61	2769	check_for_unspaced_bracket(aline);
ali@68	2770	if (warnings->endquote)
ali@62	2771	check_for_unpunctuated_endquote(aline);
ali@63	2772	check_for_html_tag(aline);
ali@64	2773	check_for_html_entity(aline);
ali@68	2774	if (isemptyline)
ali@40	2775	{
ali@65	2776	check_for_mismatched_quotes(&counters,&pending);
ali@43	2777	memset(&counters,0,sizeof(counters));
ali@40	2778	/* let the next iteration know that it's starting a new para */
ali@69	2779	isnewpara=TRUE;
ali@69	2780	if (prevline)
ali@69	2781	check_for_omitted_punctuation(prevline,&last,start_para_line);
ali@40	2782	}
ali@69	2783	g_free(prevline);
ali@69	2784	prevline=g_strdup(aline);
ali@0	2785	}
ali@69	2786	if (prevline)
ali@69	2787	{
ali@69	2788	g_free(prevline);
ali@69	2789	prevline=NULL;
ali@69	2790	}
ali@69	2791	g_free(parastart);
ali@69	2792	g_free(prevline);
ali@69	2793	g_free(etext);
ali@0	2794	if (!pswit[OVERVIEW_SWITCH])
ali@69	2795	g_tree_foreach(qword,report_duplicate_queries,NULL);
ali@69	2796	g_tree_unref(qword);
ali@69	2797	g_tree_unref(qperiod);
ali@0	2798	}
ali@0	2799
ali@40	2800	/*
ali@40	2801	* flgets:
ali@40	2802	*
ali@69	2803	* Get one line from the input text, checking for
ali@40	2804	* the existence of exactly one CR/LF line-end per line.
ali@40	2805	*
ali@40	2806	* Returns: a pointer to the line.
ali@40	2807	*/
ali@69	2808	char flgets(char *etext,long lcnt)
ali@0	2809	{
ali@0	2810	char c;
ali@69	2811	int len;
ali@69	2812	gboolean isCR=FALSE;
ali@69	2813	char theline=etext;
ali@69	2814	len=0;
ali@69	2815	for(;;)
ali@40	2816	{
ali@69	2817	c=(etext)++;
ali@69	2818	if (!c)
ali@68	2819	return NULL;
ali@40	2820	/* either way, it's end of line */
ali@69	2821	if (c=='\n')
ali@40	2822	{
ali@68	2823	if (isCR)
ali@68	2824	break;
ali@68	2825	else
ali@40	2826	{
ali@40	2827	/* Error - a LF without a preceding CR */
ali@68	2828	if (pswit[LINE_END_SWITCH])
ali@40	2829	{
ali@68	2830	if (pswit[ECHO_SWITCH])
ali@69	2831	printf("\n%.s\n",len,len,theline);
ali@68	2832	if (!pswit[OVERVIEW_SWITCH])
ali@68	2833	printf(" Line %ld - No CR?\n",lcnt);
ali@68	2834	else
ali@68	2835	cnt_lineend++;
ali@40	2836	}
ali@68	2837	break;
ali@40	2838	}
ali@40	2839	}
ali@69	2840	if (c=='\r')
ali@40	2841	{
ali@68	2842	if (isCR)
ali@40	2843	{
ali@40	2844	/* Error - two successive CRs */
ali@68	2845	if (pswit[LINE_END_SWITCH])
ali@40	2846	{
ali@68	2847	if (pswit[ECHO_SWITCH])
ali@69	2848	printf("\n%.s\n",len,len,theline);
ali@68	2849	if (!pswit[OVERVIEW_SWITCH])
ali@68	2850	printf(" Line %ld - Two successive CRs?\n",lcnt);
ali@68	2851	else
ali@68	2852	cnt_lineend++;
ali@40	2853	}
ali@40	2854	}
ali@69	2855	isCR=TRUE;
ali@40	2856	}
ali@68	2857	else
ali@40	2858	{
ali@68	2859	if (pswit[LINE_END_SWITCH] && isCR)
ali@40	2860	{
ali@68	2861	if (pswit[ECHO_SWITCH])
ali@69	2862	printf("\n%.s\n",len,len,theline);
ali@68	2863	if (!pswit[OVERVIEW_SWITCH])
ali@68	2864	printf(" Line %ld column %d - CR without LF?\n",
ali@40	2865	lcnt,len+1);
ali@68	2866	else
ali@68	2867	cnt_lineend++;
ali@69	2868	theline[len]=' ';
ali@40	2869	}
ali@69	2870	isCR=FALSE;
ali@68	2871	len++;
ali@40	2872	}
ali@69	2873	}
ali@69	2874	theline[len]='\0';
ali@0	2875	if (pswit[MARKUP_SWITCH])
ali@68	2876	postprocess_for_HTML(theline);
ali@0	2877	if (pswit[DP_SWITCH])
ali@68	2878	postprocess_for_DP(theline);
ali@40	2879	return theline;
ali@0	2880	}
ali@0	2881
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it
 * contains a mixture of alpha and digits. Generally, this is an
 * error, but may not be for cases like 4th or L5 12s. 3d.
 *
 * The accepted forms are: leading digits followed by exactly one of the
 * ordinal endings (st, rd, nd, th, their plurals, or their -ly forms, in
 * either case); leading digits followed by a single l, L, s or d; and
 * a capital L followed by something that is not itself a queried mix.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(const char *checkword)
{
    static const char *const ordinal_endings[]={
        "st","rd","nd","th",
        "sts","rds","nds","ths",
        "stly","rdly","ndly","thly",
        NULL
    };
    int wehaveadigit,wehavealetter,firstdigits,query,i;
    const char *s,*tail;
    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
        if (gcisalpha(*s))
            wehavealetter=1;
        else
            if (gcisdigit(*s))
                wehaveadigit=1;
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /*
         * Everything after the leading digits. Comparing the whole tail
         * against each ending checks the length and the text in one go
         * and never looks before the start of the word.
         */
        tail=checkword+firstdigits;
        /* digits, ending in st, rd, nd, th (+s, +ly) of either case */
        for (i=0;ordinal_endings[i];i++)
            if (!g_ascii_strcasecmp(tail,ordinal_endings[i]))
            {
                query=0;
                break;
            }
        /* digits, ending in l, L, s or d */
        if (tail[0] && !tail[1] && strchr("lLsd",tail[0]))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
ali@0	2941
ali@40	2942	/*
ali@40	2943	* getaword:
ali@40	2944	*
ali@69	2945	* Extracts the first/next "word" from the line, and returns it.
ali@69	2946	* A word is defined as one English word unit--or at least that's the aim.
ali@69	2947	* "ptr" is advanced to the position in the line where we will start
ali@69	2948	* looking for the next word.
ali@40	2949	*
ali@69	2950	* Returns: A newly-allocated string.
ali@40	2951	*/
ali@69	2952	gchar getaword(const char *ptr)
ali@0	2953	{
ali@69	2954	int i;
ali@54	2955	const char *s;
ali@69	2956	GString *word;
ali@69	2957	word=g_string_new(NULL);
ali@69	2958	for (;!gcisdigit(ptr) && !gcisalpha(ptr) && *ptr;(ptr)++)
ali@40	2959	;
ali@40	2960	/*
ali@40	2961	* Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40	2962	* Especially yucky is the case of L1,000
ali@40	2963	* This section looks for a pattern of characters including a digit
ali@40	2964	* followed by a comma or period followed by one or more digits.
ali@40	2965	* If found, it returns this whole pattern as a word; otherwise we discard
ali@40	2966	* the results and resume our normal programming.
ali@40	2967	*/
ali@69	2968	s=*ptr;
ali@69	2969	for (;gcisdigit(s) \|\| gcisalpha(s) \|\| s==',' \|\| s=='.';s++)
ali@69	2970	g_string_append_c(word,*s);
ali@69	2971	for (i=1;i+1<word->len;i++)
ali@40	2972	{
ali@69	2973	if (word->str[i]=='.' \|\| word->str[i]==',')
ali@40	2974	{
ali@69	2975	if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1]))
ali@40	2976	{
ali@69	2977	*ptr=s;
ali@69	2978	return g_string_free(word,FALSE);
ali@40	2979	}
ali@40	2980	}
ali@40	2981	}
ali@0	2982	/* we didn't find a punctuated number - do the regular getword thing */
ali@69	2983	g_string_truncate(word,0);
ali@69	2984	for (;gcisdigit(ptr) \|\| gcisalpha(ptr) \|\| *ptr=='\'';(ptr)++)
ali@69	2985	g_string_append_c(word,**ptr);
ali@69	2986	return g_string_free(word,FALSE);
ali@0	2987	}
ali@0	2988
ali@40	2989	/*
ali@40	2990	* isroman:
ali@40	2991	*
ali@40	2992	* Is this word a Roman Numeral?
ali@40	2993	*
ali@40	2994	* It doesn't actually validate that the number is a valid Roman Numeral--for
ali@40	2995	* example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
ali@40	2996	* what we're here to do. If it passes this, it LOOKS like a Roman numeral.
ali@40	2997	* Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
ali@40	2998	* expressions thereof, except when it came to taxes. Allow any number of M,
ali@40	2999	* an optional D, an optional CM or CD, any number of optional Cs, an optional
ali@40	3000	* XL or an optional XC, an optional IX or IV, an optional V and any number
ali@40	3001	* of optional Is.
ali@40	3002	*/
ali@69	3003	gboolean isroman(const char *t)
ali@0	3004	{
ali@69	3005	const char *s;
ali@40	3006	if (!t \|\| !*t)
ali@69	3007	return FALSE;
ali@40	3008	s=t;
ali@40	3009	while (t=='m' && t)
ali@40	3010	t++;
ali@40	3011	if (*t=='d')
ali@40	3012	t++;
ali@40	3013	if (*t=='c' && t[1]=='m')
ali@40	3014	t+=2;
ali@40	3015	if (*t=='c' && t[1]=='d')
ali@40	3016	t+=2;
ali@40	3017	while (t=='c' && t)
ali@40	3018	t++;
ali@40	3019	if (*t=='x' && t[1]=='l')
ali@40	3020	t+=2;
ali@40	3021	if (*t=='x' && t[1]=='c')
ali@40	3022	t+=2;
ali@40	3023	if (*t=='l')
ali@40	3024	t++;
ali@40	3025	while (t=='x' && t)
ali@40	3026	t++;
ali@40	3027	if (*t=='i' && t[1]=='x')
ali@40	3028	t+=2;
ali@40	3029	if (*t=='i' && t[1]=='v')
ali@40	3030	t+=2;
ali@40	3031	if (*t=='v')
ali@40	3032	t++;
ali@40	3033	while (t=='i' && t)
ali@40	3034	t++;
ali@40	3035	return !*t;
ali@0	3036	}
ali@0	3037
ali@40	3038	/*
ali@40	3039	* gcisalpha:
ali@40	3040	*
ali@40	3041	* A version of isalpha() that is somewhat lenient on 8-bit texts.
ali@40	3042	* If we use the standard function, 8-bit accented characters break
ali@40	3043	* words, so that tete with accented characters appears to be two words, "t"
ali@40	3044	* and "t", with 8-bit characters between them. This causes over-reporting of
ali@40	3045	* errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
ali@40	3046	* and ISO-8859-1 character sets, which are the most common PG 8-bit types.
ali@40	3047	*/
ali@69	3048	gboolean gcisalpha(unsigned char c)
ali@0	3049	{
ali@40	3050	if (c>='a' && c<='z')
ali@69	3051	return TRUE;
ali@40	3052	if (c>='A' && c<='Z')
ali@69	3053	return TRUE;
ali@40	3054	if (c<140)
ali@69	3055	return FALSE;
ali@40	3056	if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
ali@69	3057	return TRUE;
ali@40	3058	if (c==140 \|\| c==142 \|\| c==156 \|\| c==158 \|\| c==159)
ali@69	3059	return TRUE;
ali@69	3060	return FALSE;
ali@0	3061	}
ali@0	3062
ali@40	3063	/*
ali@40	3064	* gcisdigit:
ali@40	3065	*
ali@40	3066	* A version of isdigit() that doesn't get confused in 8-bit texts.
ali@40	3067	*/
ali@69	3068	gboolean gcisdigit(unsigned char c)
ali@0	3069	{
ali@40	3070	return c>='0' && c<='9';
ali@0	3071	}
ali@0	3072
ali@40	3073	/*
ali@40	3074	* gcisletter:
ali@40	3075	*
ali@40	3076	* A version of isletter() that doesn't get confused in 8-bit texts.
ali@40	3077	* NB: this is ISO-8891-1-specific.
ali@40	3078	*/
ali@69	3079	gboolean gcisletter(unsigned char c)
ali@0	3080	{
ali@40	3081	return c>='A' && c<='Z' \|\| c>='a' && c<='z' \|\| c>=192;
ali@0	3082	}
ali@0	3083
ali@40	3084	/*
ali@40	3085	* postprocess_for_DP:
ali@40	3086	*
ali@40	3087	* Invoked with the -d switch from flgets().
ali@40	3088	* It simply "removes" from the line a hard-coded set of common
ali@40	3089	* DP-specific tags, so that the line passed to the main routine has
ali@40	3090	* been pre-cleaned of DP markup.
ali@40	3091	*/
ali@0	3092	void postprocess_for_DP(char *theline)
ali@0	3093	{
ali@40	3094	char s,t;
ali@0	3095	int i;
ali@0	3096	if (!*theline)
ali@68	3097	return;
ali@40	3098	for (i=0;*DPmarkup[i];i++)
ali@40	3099	{
ali@68	3100	s=strstr(theline,DPmarkup[i]);
ali@68	3101	while (s)
ali@40	3102	{
ali@68	3103	t=s+strlen(DPmarkup[i]);
ali@68	3104	while (*t)
ali@40	3105	{
ali@68	3106	s=t;
ali@68	3107	t++;
ali@40	3108	s++;
ali@40	3109	}
ali@68	3110	*s=0;
ali@68	3111	s=strstr(theline,DPmarkup[i]);
ali@40	3112	}
ali@40	3113	}
ali@0	3114	}
ali@0	3115
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Pre-cleans the line of HTML before it reaches the main routine:
 * losemarkup() is called repeatedly (only if the line contains both
 * a '<' and a '>') until it has nothing more to remove, and then
 * loseentities() is called repeatedly until it has nothing more to
 * replace.
 */
void postprocess_for_HTML(char *theline)
{
    const char *changed;
    if (strchr(theline,'<')!=NULL && strchr(theline,'>')!=NULL)
    {
        do
            changed=losemarkup(theline);
        while (changed!=NULL);
    }
    do
        changed=loseentities(theline);
    while (changed!=NULL);
}
ali@0	3133
ali@0	3134	char losemarkup(char theline)
ali@0	3135	{
ali@40	3136	char s,t;
ali@0	3137	int i;
ali@0	3138	if (!*theline)
ali@68	3139	return NULL;
ali@40	3140	s=strstr(theline,"<");
ali@40	3141	t=strstr(theline,">");
ali@40	3142	if (!s \|\| !t)
ali@40	3143	return NULL;
ali@40	3144	for (i=0;*markup[i];i++)
ali@68	3145	if (!tagcomp(s+1,markup[i]))
ali@40	3146	{
ali@68	3147	if (!t[1])
ali@40	3148	{
ali@68	3149	*s=0;
ali@68	3150	return s;
ali@40	3151	}
ali@68	3152	else if (t>s)
ali@40	3153	{
ali@40	3154	strcpy(s,t+1);
ali@40	3155	return s;
ali@40	3156	}
ali@68	3157	}
ali@40	3158	/* It's an unrecognized <xxx>. */
ali@40	3159	return NULL;
ali@0	3160	}
ali@0	3161
ali@0	3162	char loseentities(char theline)
ali@0	3163	{
ali@0	3164	int i;
ali@40	3165	char s,t;
ali@0	3166	if (!*theline)
ali@68	3167	return NULL;
ali@40	3168	for (i=0;*entities[i].htmlent;i++)
ali@40	3169	{
ali@68	3170	s=strstr(theline,entities[i].htmlent);
ali@68	3171	if (s)
ali@40	3172	{
ali@68	3173	t=malloc((size_t)strlen(s));
ali@68	3174	if (!t)
ali@40	3175	return NULL;
ali@68	3176	strcpy(t,s+strlen(entities[i].htmlent));
ali@68	3177	strcpy(s,entities[i].textent);
ali@68	3178	strcat(s,t);
ali@68	3179	free(t);
ali@68	3180	return theline;
ali@40	3181	}
ali@40	3182	}
ali@40	3183	for (i=0;*entities[i].htmlnum;i++)
ali@40	3184	{
ali@68	3185	s=strstr(theline,entities[i].htmlnum);
ali@68	3186	if (s)
ali@40	3187	{
ali@68	3188	t=malloc((size_t)strlen(s));
ali@68	3189	if (!t)
ali@40	3190	return NULL;
ali@68	3191	strcpy(t,s+strlen(entities[i].htmlnum));
ali@68	3192	strcpy(s,entities[i].textent);
ali@68	3193	strcat(s,t);
ali@68	3194	free(t);
ali@68	3195	return theline;
ali@40	3196	}
ali@40	3197	}
ali@40	3198	return NULL;
ali@0	3199	}
ali@0	3200
/*
 * tagcomp:
 *
 * Compare the text following a '<' against a tag name, ignoring case
 * and ignoring one leading slash in strin. The comparison stops at the
 * end of whichever string is shorter, so "b>" and "b class=x>" both
 * match the tag "b".
 *
 * Characters are converted to unsigned char before being handed to
 * tolower(), which is required for 8-bit text where plain char may be
 * negative.
 *
 * Returns: 0 if the strings agree over their common length, 1 if not.
 */
int tagcomp(const char *strin,const char *basetag)
{
    const char *s,*t;
    s=basetag;
    t=strin;
    if (*t=='/')
        t++; /* ignore a slash */
    while (*s && *t)
    {
        if (tolower((unsigned char)*s)!=tolower((unsigned char)*t))
            return 1;
        s++;
        t++;
    }
    return 0;
}
ali@0	3217
ali@69	3218	void proghelp(GOptionContext *context)
ali@0	3219	{
ali@69	3220	gchar *help;
ali@40	3221	fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40	3222	fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
ali@40	3223	fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
ali@40	3224	fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40	3225	"For details, read the file COPYING.\n",stderr);
ali@40	3226	fputs("This is Free Software; "
ali@40	3227	"you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40	3228	fputs("read the file COPYING for details.\n\n",stderr);
ali@69	3229	help=g_option_context_get_help(context,TRUE,NULL);
ali@69	3230	fputs(help,stderr);
ali@69	3231	g_free(help);
ali@69	3232	fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
ali@40	3233	fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40	3234	"non-ASCII\n",stderr);
ali@40	3235	fputs("characters like accented letters, "
ali@40	3236	"lines longer than 75 or shorter than 55,\n",stderr);
ali@40	3237	fputs("unbalanced quotes or brackets, "
ali@40	3238	"a variety of badly formatted punctuation, \n",stderr);
ali@40	3239	fputs("HTML tags, some likely typos. "
ali@40	3240	"It is NOT a substitute for human judgement.\n",stderr);
ali@0	3241	fputs("\n",stderr);
ali@0	3242	}

author	ali <ali@juiblex.co.uk>
	Tue May 28 15:17:19 2013 +0100 (2013-05-28)
changeset 69	1016349e619f
parent 68	adb087007d08
child 70	aa916da2e452
permissions	-rw-r--r--