bookloupe-testing: bookloupe/bookloupe.c@aa916da2e452 (annotated)

ali@0	1	/*************************************************************************/
ali@40	2	/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
ali@68	3	/* */
ali@68	4	/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
ali@68	5	/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
ali@68	6	/* */
ali@0	7	/* This program is free software; you can redistribute it and/or modify */
ali@0	8	/* it under the terms of the GNU General Public License as published by */
ali@0	9	/* the Free Software Foundation; either version 2 of the License, or */
ali@68	10	/* (at your option) any later version. */
ali@68	11	/* */
ali@0	12	/* This program is distributed in the hope that it will be useful, */
ali@68	13	/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
ali@68	14	/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
ali@68	15	/* GNU General Public License for more details. */
ali@68	16	/* */
ali@68	17	/* You should have received a copy of the GNU General Public License */
ali@68	18	/* along with this program. If not, see <http://www.gnu.org/licenses/>. */
ali@0	19	/*************************************************************************/
ali@0	20
ali@0	21	#include <stdio.h>
ali@0	22	#include <stdlib.h>
ali@0	23	#include <string.h>
ali@0	24	#include <ctype.h>
ali@69	25	#include <glib.h>
ali@69	26	#include <bl/bl.h>
ali@0	27
/* NOTE(review): not referenced in the visible part of this file;
 * presumably the previously processed line of text -- confirm. */
ali@69	28	gchar *prevline;
ali@0	29
ali@40	30	/* Common typos. */
/*
 * Every entry is lower case. Alongside keyboard slips ("teh", "adn") the
 * list holds strings that look like OCR misreads, e.g. "b" for "h"
 * ("bave") and "rn" for "m" ("tirne").
 * NOTE(review): the trailing "" looks like the end-of-list sentinel; the
 * lookup code is outside this view -- confirm.
 */
ali@40	31	char *typo[] = {
ali@40	32	"teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
ali@40	33	"nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
ali@40	34	"bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
ali@40	35	"couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
ali@40	36	"esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
ali@40	37	"gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
ali@40	38	"herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
ali@40	39	"hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
ali@40	40	"loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
ali@40	41	"omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
ali@40	42	"peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
ali@40	43	"porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
ali@40	44	"sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
ali@40	45	"tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
ali@40	46	"thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
ali@40	47	"tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
ali@40	48	"waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
ali@40	49	"wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
ali@40	50	"woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
ali@40	51	"wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
ali@40	52	"ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
ali@40	53	"bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
ali@40	54	"ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
ali@40	55	"dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
ali@40	56	"hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
ali@40	57	"hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
ali@40	58	"memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
ali@40	59	"witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
ali@40	60	"prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
ali@40	61	"se", ""
ali@40	62	};
ali@0	63
/*
 * User-supplied typo list. Left NULL until read_user_scannos() builds it
 * as a tree keyed by one string per line of the .typ file; main() unrefs
 * it on exit when it is non-NULL.
 */
ali@69	64	GTree *usertypo;
ali@0	65
ali@40	66	/* Common abbreviations and other OK words not to query as typos. */
/*
 * Entries are lower case.
 * NOTE(review): the trailing "" looks like the end-of-list sentinel; the
 * lookup code is outside this view -- confirm.
 */
ali@40	67	char *okword[] = {
ali@40	68	"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
ali@40	69	"rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
ali@40	70	"pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
ali@40	71	"outbid", "outbids", "frostbite", "frostbitten", ""
ali@40	72	};
ali@0	73
ali@40	74	/* Common abbreviations that cause otherwise unexplained periods. */
/*
 * Entries are lower case and written without the period itself.
 * NOTE(review): the trailing "" looks like the end-of-list sentinel --
 * confirm against the lookup code, which is outside this view.
 */
ali@40	75	char *abbrev[] = {
ali@40	76	"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
ali@40	77	"cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
ali@40	78	};
ali@0	79
ali@40	80	/*
ali@40	81	* Two-Letter combinations that rarely if ever start words,
ali@40	82	* but are common scannos or otherwise common letter combinations.
ali@40	83	*/
/* NOTE(review): the trailing "" looks like the end-of-list sentinel -- confirm. */
ali@40	84	char *nostart[] = {
ali@40	85	"hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
ali@40	86	};
ali@0	87
ali@40	88	/*
ali@40	89	* Two-Letter combinations that rarely if ever end words,
ali@40	90	* but are common scannos or otherwise common letter combinations.
ali@40	91	*/
/* NOTE(review): the trailing "" looks like the end-of-list sentinel -- confirm. */
ali@40	92	char *noend[] = {
ali@40	93	"cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
ali@40	94	"sw", "gr", "sl", "cl", "iy", ""
ali@40	95	};
ali@0	96
/*
 * HTML element names, lower case and without angle brackets.
 * NOTE(review): presumably used to recognise HTML tags in the text, with
 * the trailing "" as end-of-list sentinel; the users of this table are
 * outside this view -- confirm.
 */
ali@40	97	char *markup[] = {
ali@40	98	"a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
ali@40	99	"font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
ali@40	100	"img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
ali@40	101	"sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
ali@40	102	};
ali@0	103
/*
 * Markup tokens specific to Distributed Proofreaders (DP) texts: the
 * small-caps tags, the three paired block delimiters (slash-star,
 * slash-hash and slash-dollar, each with its mirrored closer) and the
 * thought-break tag.
 *
 * The slash-star pair had lost its asterisks and showed up as two
 * identical "/" entries; it is restored here so the three delimiter
 * pairs are again parallel.
 *
 * NOTE(review): the trailing "" looks like the end-of-list sentinel; the
 * code that walks this table is outside this view -- confirm.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
ali@0	107
/*
 * NOTE(review): judging by the name, lower-case words after which a comma
 * is unexpected; the check itself is outside this view -- confirm. "mrs"
 * appears twice in the list, and the trailing "" looks like the
 * end-of-list sentinel.
 */
ali@40	108	char *nocomma[] = {
ali@40	109	"the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
ali@40	110	"every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
ali@40	111	"st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
ali@40	112	"during", "let", "toward", "among", ""
ali@40	113	};
ali@0	114
/*
 * NOTE(review): judging by the name, lower-case words after which a
 * period is unexpected; the check itself is outside this view -- confirm.
 * The trailing "" looks like the end-of-list sentinel.
 */
ali@40	116	char *noperiod[] = {
ali@40	117	"every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
ali@40	118	"and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
ali@40	119	"i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
ali@40	120	"among", "those", "into", "whom", "having", "thence", ""
ali@40	121	};
ali@0	121
/*
 * HTML character entities and their plain-text stand-ins.
 *
 * Each row holds the named entity (htmlent), the matching decimal
 * character reference (htmlnum) and the ASCII text to substitute
 * (textent). The first two columns are spelled out as entity source text:
 * they had been decoded into the characters they stand for, which left
 * both columns identical and, in the quotation-mark row, unbalanced string
 * quotes. The multiplication-sign row also regains its "*" replacement
 * and its comment opener.
 *
 * The final row has an empty htmlent and no textent (so textent is NULL).
 * NOTE(review): that row looks like the end-of-table sentinel; the code
 * that walks the table is outside this view -- confirm.
 */
struct {
    char *htmlent;
    char *htmlnum;
    char *textent;
} entities[] = {
    { "&amp;",    "&#38;",   "&" },
    { "&lt;",     "&#60;",   "<" },
    { "&gt;",     "&#62;",   ">" },
    { "&deg;",    "&#176;",  " degrees" },
    { "&pound;",  "&#163;",  "L" },
    { "&quot;",   "&#34;",   "\"" },   /* quotation mark = APL quote */
    { "&OElig;",  "&#338;",  "OE" },   /* latin capital ligature OE */
    { "&oelig;",  "&#339;",  "oe" },   /* latin small ligature oe */
    { "&Scaron;", "&#352;",  "S" },    /* latin capital letter S with caron */
    { "&scaron;", "&#353;",  "s" },    /* latin small letter s with caron */
    { "&Yuml;",   "&#376;",  "Y" },    /* latin capital letter Y with diaeresis */
    { "&circ;",   "&#710;",  "" },     /* modifier letter circumflex accent */
    { "&tilde;",  "&#732;",  "~" },    /* small tilde, U+02DC ISOdia */
    { "&ensp;",   "&#8194;", " " },    /* en space, U+2002 ISOpub */
    { "&emsp;",   "&#8195;", " " },    /* em space, U+2003 ISOpub */
    { "&thinsp;", "&#8201;", " " },    /* thin space, U+2009 ISOpub */
    { "&ndash;",  "&#8211;", "-" },    /* en dash, U+2013 ISOpub */
    { "&mdash;",  "&#8212;", "--" },   /* em dash, U+2014 ISOpub */
    { "&rsquo;",  "&#8217;", "'" },    /* right single quotation mark */
    { "&sbquo;",  "&#8218;", "'" },    /* single low-9 quotation mark */
    { "&ldquo;",  "&#8220;", "\"" },   /* left double quotation mark */
    { "&rdquo;",  "&#8221;", "\"" },   /* right double quotation mark */
    { "&bdquo;",  "&#8222;", "\"" },   /* double low-9 quotation mark */
    { "&lsaquo;", "&#8249;", "\"" },   /* single left-pointing angle quotation mark */
    { "&rsaquo;", "&#8250;", "\"" },   /* single right-pointing angle quotation mark */
    { "&nbsp;",   "&#160;",  " " },    /* no-break space = non-breaking space */
    { "&iexcl;",  "&#161;",  "!" },    /* inverted exclamation mark */
    { "&cent;",   "&#162;",  "c" },    /* cent sign */
    { "&pound;",  "&#163;",  "L" },    /* pound sign */
    { "&curren;", "&#164;",  "$" },    /* currency sign */
    { "&yen;",    "&#165;",  "Y" },    /* yen sign = yuan sign */
    { "&sect;",   "&#167;",  "--" },   /* section sign */
    { "&uml;",    "&#168;",  " " },    /* diaeresis = spacing diaeresis */
    { "&copy;",   "&#169;",  "(C) " }, /* copyright sign */
    { "&ordf;",   "&#170;",  " " },    /* feminine ordinal indicator */
    { "&laquo;",  "&#171;",  "\"" },   /* left-pointing double angle quotation mark */
    { "&shy;",    "&#173;",  "-" },    /* soft hyphen = discretionary hyphen */
    { "&reg;",    "&#174;",  "(R) " }, /* registered sign = registered trade mark sign */
    { "&macr;",   "&#175;",  " " },    /* macron = spacing macron = overline */
    { "&deg;",    "&#176;",  " degrees" }, /* degree sign */
    { "&plusmn;", "&#177;",  "+-" },   /* plus-minus sign = plus-or-minus sign */
    { "&sup2;",   "&#178;",  "2" },    /* superscript two = superscript digit two */
    { "&sup3;",   "&#179;",  "3" },    /* superscript three = superscript digit three */
    { "&acute;",  "&#180;",  " " },    /* acute accent = spacing acute */
    { "&micro;",  "&#181;",  "m" },    /* micro sign */
    { "&para;",   "&#182;",  "--" },   /* pilcrow sign = paragraph sign */
    { "&cedil;",  "&#184;",  " " },    /* cedilla = spacing cedilla */
    { "&sup1;",   "&#185;",  "1" },    /* superscript one = superscript digit one */
    { "&ordm;",   "&#186;",  " " },    /* masculine ordinal indicator */
    { "&raquo;",  "&#187;",  "\"" },   /* right-pointing double angle quotation mark */
    { "&frac14;", "&#188;",  "1/4" },  /* vulgar fraction one quarter */
    { "&frac12;", "&#189;",  "1/2" },  /* vulgar fraction one half */
    { "&frac34;", "&#190;",  "3/4" },  /* vulgar fraction three quarters */
    { "&iquest;", "&#191;",  "?" },    /* inverted question mark */
    { "&Agrave;", "&#192;",  "A" },    /* latin capital letter A with grave */
    { "&Aacute;", "&#193;",  "A" },    /* latin capital letter A with acute */
    { "&Acirc;",  "&#194;",  "A" },    /* latin capital letter A with circumflex */
    { "&Atilde;", "&#195;",  "A" },    /* latin capital letter A with tilde */
    { "&Auml;",   "&#196;",  "A" },    /* latin capital letter A with diaeresis */
    { "&Aring;",  "&#197;",  "A" },    /* latin capital letter A with ring above */
    { "&AElig;",  "&#198;",  "AE" },   /* latin capital letter AE */
    { "&Ccedil;", "&#199;",  "C" },    /* latin capital letter C with cedilla */
    { "&Egrave;", "&#200;",  "E" },    /* latin capital letter E with grave */
    { "&Eacute;", "&#201;",  "E" },    /* latin capital letter E with acute */
    { "&Ecirc;",  "&#202;",  "E" },    /* latin capital letter E with circumflex */
    { "&Euml;",   "&#203;",  "E" },    /* latin capital letter E with diaeresis */
    { "&Igrave;", "&#204;",  "I" },    /* latin capital letter I with grave */
    { "&Iacute;", "&#205;",  "I" },    /* latin capital letter I with acute */
    { "&Icirc;",  "&#206;",  "I" },    /* latin capital letter I with circumflex */
    { "&Iuml;",   "&#207;",  "I" },    /* latin capital letter I with diaeresis */
    { "&ETH;",    "&#208;",  "E" },    /* latin capital letter ETH */
    { "&Ntilde;", "&#209;",  "N" },    /* latin capital letter N with tilde */
    { "&Ograve;", "&#210;",  "O" },    /* latin capital letter O with grave */
    { "&Oacute;", "&#211;",  "O" },    /* latin capital letter O with acute */
    { "&Ocirc;",  "&#212;",  "O" },    /* latin capital letter O with circumflex */
    { "&Otilde;", "&#213;",  "O" },    /* latin capital letter O with tilde */
    { "&Ouml;",   "&#214;",  "O" },    /* latin capital letter O with diaeresis */
    { "&times;",  "&#215;",  "*" },    /* multiplication sign */
    { "&Oslash;", "&#216;",  "O" },    /* latin capital letter O with stroke */
    { "&Ugrave;", "&#217;",  "U" },    /* latin capital letter U with grave */
    { "&Uacute;", "&#218;",  "U" },    /* latin capital letter U with acute */
    { "&Ucirc;",  "&#219;",  "U" },    /* latin capital letter U with circumflex */
    { "&Uuml;",   "&#220;",  "U" },    /* latin capital letter U with diaeresis */
    { "&Yacute;", "&#221;",  "Y" },    /* latin capital letter Y with acute */
    { "&THORN;",  "&#222;",  "TH" },   /* latin capital letter THORN */
    { "&szlig;",  "&#223;",  "sz" },   /* latin small letter sharp s = ess-zed */
    { "&agrave;", "&#224;",  "a" },    /* latin small letter a with grave */
    { "&aacute;", "&#225;",  "a" },    /* latin small letter a with acute */
    { "&acirc;",  "&#226;",  "a" },    /* latin small letter a with circumflex */
    { "&atilde;", "&#227;",  "a" },    /* latin small letter a with tilde */
    { "&auml;",   "&#228;",  "a" },    /* latin small letter a with diaeresis */
    { "&aring;",  "&#229;",  "a" },    /* latin small letter a with ring above */
    { "&aelig;",  "&#230;",  "ae" },   /* latin small letter ae */
    { "&ccedil;", "&#231;",  "c" },    /* latin small letter c with cedilla */
    { "&egrave;", "&#232;",  "e" },    /* latin small letter e with grave */
    { "&eacute;", "&#233;",  "e" },    /* latin small letter e with acute */
    { "&ecirc;",  "&#234;",  "e" },    /* latin small letter e with circumflex */
    { "&euml;",   "&#235;",  "e" },    /* latin small letter e with diaeresis */
    { "&igrave;", "&#236;",  "i" },    /* latin small letter i with grave */
    { "&iacute;", "&#237;",  "i" },    /* latin small letter i with acute */
    { "&icirc;",  "&#238;",  "i" },    /* latin small letter i with circumflex */
    { "&iuml;",   "&#239;",  "i" },    /* latin small letter i with diaeresis */
    { "&eth;",    "&#240;",  "eth" },  /* latin small letter eth */
    { "&ntilde;", "&#241;",  "n" },    /* latin small letter n with tilde */
    { "&ograve;", "&#242;",  "o" },    /* latin small letter o with grave */
    { "&oacute;", "&#243;",  "o" },    /* latin small letter o with acute */
    { "&ocirc;",  "&#244;",  "o" },    /* latin small letter o with circumflex */
    { "&otilde;", "&#245;",  "o" },    /* latin small letter o with tilde */
    { "&ouml;",   "&#246;",  "o" },    /* latin small letter o with diaeresis */
    { "&divide;", "&#247;",  "/" },    /* division sign */
    { "&oslash;", "&#248;",  "o" },    /* latin small letter o with stroke */
    { "&ugrave;", "&#249;",  "u" },    /* latin small letter u with grave */
    { "&uacute;", "&#250;",  "u" },    /* latin small letter u with acute */
    { "&ucirc;",  "&#251;",  "u" },    /* latin small letter u with circumflex */
    { "&uuml;",   "&#252;",  "u" },    /* latin small letter u with diaeresis */
    { "&yacute;", "&#253;",  "y" },    /* latin small letter y with acute */
    { "&thorn;",  "&#254;",  "th" },   /* latin small letter thorn */
    { "&yuml;",   "&#255;",  "y" },    /* latin small letter y with diaeresis */
    { "",         "" }
};
ali@40	247
ali@40	248	/* special characters */
/*
 * The numeric values below are ASCII codes: 32 ' ', 9 tab, 10 line feed,
 * 13 carriage return, 34 '"', 39 '\'', 96 '`', 126 '~', 42 '*', 47 '/',
 * 94 '^'. The second group is spelled with character literals instead.
 */
ali@68	249	#define CHAR_SPACE 32
ali@68	250	#define CHAR_TAB 9
ali@68	251	#define CHAR_LF 10
ali@68	252	#define CHAR_CR 13
ali@68	253	#define CHAR_DQUOTE 34
ali@68	254	#define CHAR_SQUOTE 39
ali@0	255	#define CHAR_OPEN_SQUOTE 96
ali@68	256	#define CHAR_TILDE 126
ali@68	257	#define CHAR_ASTERISK 42
ali@68	258	#define CHAR_FORESLASH 47
ali@68	259	#define CHAR_CARAT 94
ali@0	260
ali@0	261	#define CHAR_UNDERSCORE '_'
ali@0	262	#define CHAR_OPEN_CBRACK '{'
ali@0	263	#define CHAR_CLOSE_CBRACK '}'
ali@0	264	#define CHAR_OPEN_RBRACK '('
ali@0	265	#define CHAR_CLOSE_RBRACK ')'
ali@0	266	#define CHAR_OPEN_SBRACK '['
ali@0	267	#define CHAR_CLOSE_SBRACK ']'
ali@0	268
ali@40	269	/* longest and shortest normal PG line lengths */
/*
 * first_pass() counts a line as long when its length in characters
 * exceeds LONGEST_PG_LINE and as very long when it exceeds WAY_TOO_LONG;
 * SHORTEST_PG_LINE is the threshold in its short-line test.
 */
ali@0	270	#define LONGEST_PG_LINE 75
ali@0	271	#define WAY_TOO_LONG 80
ali@0	272	#define SHORTEST_PG_LINE 55
ali@0	272	#define SHORTEST_PG_LINE 55
ali@0	273
/*
 * Indices into pswit[], one per program switch. SWITNO is not a switch:
 * it is the number of switches and sizes the array.
 */
ali@69	274	enum {
ali@69	275	ECHO_SWITCH,
ali@69	276	SQUOTE_SWITCH,
ali@69	277	TYPO_SWITCH,
ali@69	278	QPARA_SWITCH,
ali@69	279	PARANOID_SWITCH,
ali@69	280	LINE_END_SWITCH,
ali@69	281	OVERVIEW_SWITCH,
ali@69	282	STDOUT_SWITCH,
ali@69	283	HEADER_SWITCH,
ali@69	284	WEB_SWITCH,
ali@69	285	VERBOSE_SWITCH,
ali@69	286	MARKUP_SWITCH,
ali@69	287	USERTYPO_SWITCH,
ali@69	288	DP_SWITCH,
ali@69	289	SWITNO
ali@69	290	};
ali@0	291
/*
 * Filled in by the option parser through the pointers stored in
 * options[], then adjusted by parse_options(), which inverts the
 * paranoid, line-end and echo entries so that TRUE means "enabled".
 */
ali@69	292	gboolean pswit[SWITNO]; /* program switches */
ali@0	293
/*
 * Command-line option table handed to g_option_context_add_main_entries()
 * by parse_options(). Every entry is a plain flag (G_OPTION_ARG_NONE)
 * whose presence stores TRUE in the matching pswit[] slot. Three of them
 * ("noecho", "relaxed", "line-end") switch a check OFF; parse_options()
 * inverts those slots after parsing. The all-NULL entry ends the table.
 */
ali@69	294	static GOptionEntry options[]={
ali@69	295	{ "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@69	296	"Ignore DP-specific markup", NULL },
ali@69	297	{ "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@69	298	"Don't echo queried line", NULL },
ali@69	299	{ "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@69	300	"Check single quotes", NULL },
ali@69	301	{ "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@69	302	"Check common typos", NULL },
ali@69	303	{ "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@69	304	"Require closure of quotes on every paragraph", NULL },
ali@69	305	{ "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@69	306	"Disable paranoid querying of everything", NULL },
ali@69	307	{ "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@69	308	"Disable line end checking", NULL },
ali@69	309	{ "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@69	310	"Overview: just show counts", NULL },
ali@69	311	{ "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@69	312	"Output errors to stdout instead of stderr", NULL },
ali@69	313	{ "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@69	314	"Echo header fields", NULL },
ali@69	315	{ "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@69	316	"Ignore markup in < >", NULL },
ali@69	317	{ "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@69	318	"Use file of user-defined typos", NULL },
ali@69	319	{ "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
ali@69	320	"Defaults for use on www upload", NULL },
ali@69	321	{ "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@69	322	"Verbose - list everything", NULL },
ali@69	323	{ NULL }
ali@69	324	};
ali@0	325
/*
 * File-scope tallies. main() prints the cnt_* query counters, and their
 * sum, when the overview switch is set; linecnt and cnt_spacend are
 * advanced by first_pass().
 */
ali@68	326	long cnt_dquot; /* for overview mode, count of doublequote queries */
ali@68	327	long cnt_squot; /* for overview mode, count of singlequote queries */
ali@68	328	long cnt_brack; /* for overview mode, count of brackets queries */
ali@68	329	long cnt_bin; /* for overview mode, count of non-ASCII queries */
ali@68	330	long cnt_odd; /* for overview mode, count of odd character queries */
ali@68	331	long cnt_long; /* for overview mode, count of long line errors */
ali@68	332	long cnt_short; /* for overview mode, count of short line queries */
ali@68	333	long cnt_punct; /* for overview mode,
ali@68	334	count of punctuation and spacing queries */
ali@68	335	long cnt_dash; /* for overview mode, count of dash-related queries */
ali@68	336	long cnt_word; /* for overview mode, count of word queries */
ali@68	337	long cnt_html; /* for overview mode, count of html queries */
ali@68	338	long cnt_lineend; /* for overview mode, count of line-end queries */
ali@68	339	long cnt_spacend; /* count of lines with space at end */
ali@68	340	long linecnt; /* count of total lines in the file */
ali@68	341	long checked_linecnt; /* count of lines actually checked */
ali@0	342
ali@69	343	void proghelp(GOptionContext *context);
ali@69	344	void procfile(const char *);
ali@0	345
ali@69	346	gchar *running_from;
ali@0	347
ali@70	348	gboolean mixdigit(const char *);
ali@69	349	gchar getaword(const char *);
ali@69	350	char flgets(char *,long);
ali@0	351	void postprocess_for_HTML(char *);
ali@0	352	char linehasmarkup(char );
ali@0	353	char losemarkup(char );
ali@70	354	gboolean tagcomp(const char ,const char );
ali@0	355	char loseentities(char );
ali@69	356	gboolean isroman(const char *);
ali@0	357	void postprocess_for_DP(char *);
ali@0	358
ali@69	359	GTree qword,qperiod;
ali@68	360
/*
 * Statistics gathered by first_pass(). As far as that function shows:
 *   firstline         line number just after the header marker
 *   footerline        line number of the footer marker (0 = none seen)
 *   totlen            characters in all body lines
 *   binlen, alphalen  characters above 127 / alphabetic characters
 *   endquote_count    double quotes directly preceded by a letter
 *   astline           lines containing '*' and a lower-case letter
 *   fslashline        lines containing '/'
 *   dotcomma          lines containing ".,"
 *   hyphens           lines ending in a single '-'
 *   shortline, longline, verylongline   line-length tallies
 *   htmcount          score for lines that look like HTML markup
 *   standalone_digit  words that are exactly "0" or "1"
 *   spacedash, emdash, space_emdash, non_PG_space_emdash,
 *   PG_space_emdash   dash-spacing tallies
 *   Dutchcount, Frenchcount   hits on two marker words per language
 */
ali@68	361	struct first_pass_results {
ali@68	362	long firstline,astline;
ali@68	363	long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
ali@68	364	long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
ali@68	365	long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
ali@68	366	int Dutchcount,Frenchcount;
ali@68	367	};
ali@68	368
/*
 * Per-category switches filled in by report_first_pass(). The visible
 * part of that function sets dotcomma and shortline to 1 and clears them
 * to 0 when the first pass found too many such lines to be worth
 * reporting.
 * NOTE(review): the other members presumably follow the same 1 = report,
 * 0 = suppress convention -- confirm.
 */
ali@68	369	struct warnings {
ali@68	370	int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
ali@69	371	int endquote;
ali@69	372	gboolean isDutch,isFrench;
ali@68	373	};
ali@68	374
/*
 * NOTE(review): not used in the visible part of the file. Judging by the
 * member names these are running tallies of quotes, underscores and the
 * three bracket kinds (c = curly, s = square, r = round) -- confirm.
 */
ali@68	375	struct counters {
ali@68	376	long quot;
ali@68	377	int c_unders,c_brack,s_brack,r_brack;
ali@68	378	int open_single_quote,close_single_quote;
ali@68	379	};
ali@68	380
/*
 * NOTE(review): not used in the visible part of the file. Presumably
 * mirrors the lastlen/lastblen/laststart locals of first_pass(): a line's
 * length, the length of the line before it, and its first character --
 * confirm.
 */
ali@68	381	struct line_properties {
ali@68	382	unsigned int len,blen;
ali@70	383	gunichar start;
ali@68	384	};
ali@68	385
/*
 * NOTE(review): not used in the visible part of the file; presumably the
 * open/closed state of double and single quotes -- confirm.
 */
ali@68	386	struct parities {
ali@68	387	int dquote,squote;
ali@68	388	};
ali@68	389
/*
 * NOTE(review): not used in the visible part of the file; presumably
 * holds queries about unmatched quotes, brackets and underscores that are
 * deferred until later -- confirm.
 */
ali@68	390	struct pending {
ali@69	391	char dquote,squote,rbrack,sbrack,cbrack,unders;
ali@68	392	long squot;
ali@68	393	};
ali@0	394
ali@69	395	void parse_options(int argc,char **argv)
ali@0	396	{
ali@69	397	GError *err=NULL;
ali@69	398	GOptionContext *context;
ali@69	399	context=g_option_context_new(
ali@69	400	"file - looks for errors in Project Gutenberg(TM) etexts");
ali@69	401	g_option_context_add_main_entries(context,options,NULL);
ali@69	402	if (!g_option_context_parse(context,argc,argv,&err))
ali@69	403	{
ali@69	404	g_printerr("Bookloupe: %s\n",err->message);
ali@69	405	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
ali@69	406	exit(1);
ali@69	407	}
ali@40	408	/* Paranoid checking is turned OFF, not on, by its switch */
ali@69	409	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
ali@40	410	if (pswit[PARANOID_SWITCH])
ali@69	411	/* if running in paranoid mode, typo checks default to enabled */
ali@69	412	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@40	413	/* Line-end checking is turned OFF, not on, by its switch */
ali@69	414	pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
ali@40	415	/* Echoing is turned OFF, not on, by its switch */
ali@69	416	pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
ali@40	417	if (pswit[OVERVIEW_SWITCH])
ali@40	418	/* just print summary; don't echo */
ali@69	419	pswit[ECHO_SWITCH]=FALSE;
ali@40	420	/*
ali@40	421	* Web uploads - for the moment, this is really just a placeholder
ali@40	422	* until we decide what processing we really want to do on web uploads
ali@40	423	*/
ali@40	424	if (pswit[WEB_SWITCH])
ali@40	425	{
ali@40	426	/* specific override for web uploads */
ali@69	427	pswit[ECHO_SWITCH]=TRUE;
ali@69	428	pswit[SQUOTE_SWITCH]=FALSE;
ali@69	429	pswit[TYPO_SWITCH]=TRUE;
ali@69	430	pswit[QPARA_SWITCH]=FALSE;
ali@69	431	pswit[PARANOID_SWITCH]=TRUE;
ali@69	432	pswit[LINE_END_SWITCH]=FALSE;
ali@69	433	pswit[OVERVIEW_SWITCH]=FALSE;
ali@69	434	pswit[STDOUT_SWITCH]=FALSE;
ali@69	435	pswit[HEADER_SWITCH]=TRUE;
ali@69	436	pswit[VERBOSE_SWITCH]=FALSE;
ali@69	437	pswit[MARKUP_SWITCH]=FALSE;
ali@69	438	pswit[USERTYPO_SWITCH]=FALSE;
ali@69	439	pswit[DP_SWITCH]=FALSE;
ali@40	440	}
ali@69	441	if (*argc<2)
ali@40	442	{
ali@69	443	proghelp(context);
ali@69	444	exit(1);
ali@40	445	}
ali@69	446	g_option_context_free(context);
ali@69	447	}
ali@69	448
ali@69	449	/*
ali@69	450	* read_user_scannos:
ali@69	451	*
ali@69	452	* Read in the user-defined stealth scanno list.
ali@69	453	*/
ali@69	454	void read_user_scannos(void)
ali@69	455	{
ali@69	456	GError *err=NULL;
ali@69	457	gchar *usertypo_file;
ali@69	458	gboolean okay;
ali@69	459	int i;
ali@70	460	gsize len,nb;
ali@70	461	gchar contents,utf8,**lines;
ali@69	462	usertypo_file=g_strdup("bookloupe.typ");
ali@69	463	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	464	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	465	{
ali@69	466	g_clear_error(&err);
ali@69	467	g_free(usertypo_file);
ali@69	468	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
ali@69	469	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	470	}
ali@69	471	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	472	{
ali@69	473	g_clear_error(&err);
ali@69	474	g_free(usertypo_file);
ali@69	475	usertypo_file=g_strdup("gutcheck.typ");
ali@69	476	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	477	}
ali@69	478	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	479	{
ali@69	480	g_clear_error(&err);
ali@69	481	g_free(usertypo_file);
ali@69	482	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
ali@69	483	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	484	}
ali@69	485	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	486	{
ali@69	487	g_free(usertypo_file);
ali@70	488	g_print(" --> I couldn't find bookloupe.typ "
ali@69	489	"-- proceeding without user typos.\n");
ali@69	490	return;
ali@69	491	}
ali@69	492	else if (!okay)
ali@69	493	{
ali@69	494	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
ali@69	495	g_free(usertypo_file);
ali@69	496	g_clear_error(&err);
ali@69	497	exit(1);
ali@69	498	}
ali@70	499	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
ali@70	500	g_free(contents);
ali@70	501	lines=g_strsplit_set(utf8,"\r\n",0);
ali@70	502	g_free(utf8);
ali@69	503	usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@69	504	for (i=0;lines[i];i++)
ali@69	505	if ((unsigned char )lines[i]>'!')
ali@69	506	g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
ali@69	507	else
ali@69	508	g_free(lines[i]);
ali@69	509	g_free(lines);
ali@69	510	}
ali@69	511
ali@69	512	/*
ali@69	513	* read_etext:
ali@69	514	*
ali@69	515	* Read an etext returning a newly allocated string containing the file
ali@69	516	* contents or NULL on error.
ali@69	517	*/
ali@69	518	gchar read_etext(const char filename,GError **err)
ali@69	519	{
ali@70	520	gchar contents,utf8;
ali@70	521	gsize len,nb;
ali@69	522	if (!g_file_get_contents(filename,&contents,&len,err))
ali@69	523	return NULL;
ali@70	524	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
ali@70	525	g_free(contents);
ali@70	526	return utf8;
ali@69	527	}
ali@69	528
ali@69	529	int main(int argc,char **argv)
ali@69	530	{
ali@69	531	running_from=g_path_get_dirname(argv[0]);
ali@69	532	parse_options(&argc,&argv);
ali@40	533	if (pswit[USERTYPO_SWITCH])
ali@69	534	read_user_scannos();
ali@40	535	fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@69	536	procfile(argv[1]);
ali@40	537	if (pswit[OVERVIEW_SWITCH])
ali@40	538	{
ali@70	539	g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40	540	checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@70	541	g_print(" --------------- Queries found --------------\n");
ali@68	542	if (cnt_long)
ali@70	543	g_print(" Long lines: %14ld\n",cnt_long);
ali@68	544	if (cnt_short)
ali@70	545	g_print(" Short lines: %14ld\n",cnt_short);
ali@68	546	if (cnt_lineend)
ali@70	547	g_print(" Line-end problems: %14ld\n",cnt_lineend);
ali@68	548	if (cnt_word)
ali@70	549	g_print(" Common typos: %14ld\n",cnt_word);
ali@68	550	if (cnt_dquot)
ali@70	551	g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
ali@68	552	if (cnt_squot)
ali@70	553	g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
ali@68	554	if (cnt_brack)
ali@70	555	g_print(" Unmatched brackets: %14ld\n",cnt_brack);
ali@68	556	if (cnt_bin)
ali@70	557	g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
ali@68	558	if (cnt_odd)
ali@70	559	g_print(" Proofing characters: %14ld\n",cnt_odd);
ali@68	560	if (cnt_punct)
ali@70	561	g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@68	562	if (cnt_dash)
ali@70	563	g_print(" Non-standard dashes: %14ld\n",cnt_dash);
ali@68	564	if (cnt_html)
ali@70	565	g_print(" Possible HTML tags: %14ld\n",cnt_html);
ali@70	566	g_print("\n");
ali@70	567	g_print(" TOTAL QUERIES %14ld\n",
ali@68	568	cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
ali@68	569	cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40	570	}
ali@69	571	g_free(running_from);
ali@69	572	if (usertypo)
ali@69	573	g_tree_unref(usertypo);
ali@40	574	return 0;
ali@0	575	}
ali@0	576
ali@40	577	/*
ali@41	578	* first_pass:
ali@40	579	*
ali@41	580	* Run a first pass - verify that it's a valid PG
ali@41	581	* file, decide whether to report some things that
ali@41	582	* occur many times in the text like long or short
ali@41	583	* lines, non-standard dashes, etc.
ali@40	584	*/
ali@69	585	struct first_pass_results first_pass(const char etext)
ali@0	586	{
ali@70	587	gunichar laststart=CHAR_SPACE;
ali@54	588	const char *s;
ali@69	589	gchar *lc_line;
ali@70	590	int i,j,lbytes,llen;
ali@69	591	gchar **lines;
ali@41	592	unsigned int lastlen=0,lastblen=0;
ali@41	593	long spline=0,nspline=0;
ali@41	594	static struct first_pass_results results={0};
ali@69	595	gchar *inword;
ali@69	596	lines=g_strsplit(etext,"\n",0);
ali@69	597	for (j=0;lines[j];j++)
ali@40	598	{
ali@70	599	lbytes=strlen(lines[j]);
ali@70	600	while (lines[j][lbytes-1]=='\r')
ali@70	601	lines[j][--lbytes]='\0';
ali@70	602	llen=g_utf8_strlen(lines[j],lbytes);
ali@68	603	linecnt++;
ali@69	604	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
ali@69	605	(strstr(lines[j],"PUBLIC DOMAIN") \|\| strstr(lines[j],"COPYRIGHT")))
ali@40	606	{
ali@68	607	if (spline)
ali@70	608	g_print(" --> Duplicate header?\n");
ali@68	609	spline=linecnt+1; /* first line of non-header text, that is */
ali@40	610	}
ali@69	611	if (!strncmp(lines[j],"*** START",9) &&
ali@69	612	strstr(lines[j],"PROJECT GUTENBERG"))
ali@40	613	{
ali@68	614	if (nspline)
ali@70	615	g_print(" --> Duplicate header?\n");
ali@68	616	nspline=linecnt+1; /* first line of non-header text, that is */
ali@40	617	}
ali@68	618	if (spline \|\| nspline)
ali@40	619	{
ali@70	620	lc_line=g_utf8_strdown(lines[j],lbytes);
ali@69	621	if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
ali@40	622	{
ali@69	623	if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
ali@40	624	{
ali@68	625	if (results.footerline)
ali@40	626	{
ali@40	627	/* it's an old-form header - we can detect duplicates */
ali@68	628	if (!nspline)
ali@70	629	g_print(" --> Duplicate footer?\n");
ali@40	630	}
ali@68	631	else
ali@68	632	results.footerline=linecnt;
ali@40	633	}
ali@40	634	}
ali@69	635	g_free(lc_line);
ali@40	636	}
ali@68	637	if (spline)
ali@41	638	results.firstline=spline;
ali@68	639	if (nspline)
ali@41	640	results.firstline=nspline; /* override with new */
ali@68	641	if (results.footerline)
ali@40	642	continue; /* don't count the boilerplate in the footer */
ali@68	643	results.totlen+=llen;
ali@70	644	for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@40	645	{
ali@70	646	if (g_utf8_get_char(s)>127)
ali@41	647	results.binlen++;
ali@70	648	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@41	649	results.alphalen++;
ali@70	650	if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
ali@70	651	isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
ali@41	652	results.endquote_count++;
ali@40	653	}
ali@69	654	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
ali@69	655	lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41	656	results.shortline++;
ali@70	657	if (lbytes>0 &&
ali@70	658	g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
ali@40	659	cnt_spacend++;
ali@69	660	if (strstr(lines[j],".,"))
ali@41	661	results.dotcomma++;
ali@68	662	/* only count ast lines for ignoring purposes where there is */
ali@68	663	/* locase text on the line */
ali@69	664	if (strchr(lines[j],'*'))
ali@40	665	{
ali@70	666	for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@70	667	if (g_unichar_islower(g_utf8_get_char(s)))
ali@68	668	break;
ali@70	669	if (*s)
ali@41	670	results.astline++;
ali@40	671	}
ali@69	672	if (strchr(lines[j],'/'))
ali@68	673	results.fslashline++;
ali@70	674	for (s=g_utf8_prev_char(lines[j]+lbytes);
ali@70	675	s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
ali@40	676	;
ali@70	677	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
ali@70	678	g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@41	679	results.hyphens++;
ali@68	680	if (llen>LONGEST_PG_LINE)
ali@41	681	results.longline++;
ali@68	682	if (llen>WAY_TOO_LONG)
ali@41	683	results.verylongline++;
ali@69	684	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
ali@40	685	{
ali@69	686	i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
ali@68	687	if (i>0)
ali@68	688	results.htmcount++;
ali@69	689	if (strstr(lines[j],"<i>"))
ali@41	690	results.htmcount+=4; /* bonus marks! */
ali@40	691	}
ali@68	692	/* Check for spaced em-dashes */
ali@70	693	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
ali@40	694	{
ali@68	695	results.emdash++;
ali@70	696	if (s[-1]==CHAR_SPACE \|\| s[2]==CHAR_SPACE)
ali@41	697	results.space_emdash++;
ali@70	698	if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
ali@40	699	/* count of em-dashes with spaces both sides */
ali@41	700	results.non_PG_space_emdash++;
ali@70	701	if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
ali@40	702	/* count of PG-type em-dashes with no spaces */
ali@41	703	results.PG_space_emdash++;
ali@40	704	}
ali@69	705	for (s=lines[j];*s;)
ali@40	706	{
ali@69	707	inword=getaword(&s);
ali@68	708	if (!strcmp(inword,"hij") \|\| !strcmp(inword,"niet"))
ali@68	709	results.Dutchcount++;
ali@68	710	if (!strcmp(inword,"dans") \|\| !strcmp(inword,"avec"))
ali@68	711	results.Frenchcount++;
ali@68	712	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@68	713	results.standalone_digit++;
ali@69	714	g_free(inword);
ali@40	715	}
ali@68	716	/* Check for spaced dashes */
ali@69	717	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
ali@41	718	results.spacedash++;
ali@68	719	lastblen=lastlen;
ali@69	720	lastlen=llen;
ali@69	721	laststart=lines[j][0];
ali@40	722	}
ali@69	723	g_strfreev(lines);
ali@41	724	return &results;
ali@41	725	}
ali@41	726
ali@42	727	/*
ali@42	728	* report_first_pass:
ali@42	729	*
ali@42	730	* Make some snap decisions based on the first pass results.
ali@42	731	*/
ali@42	732	struct warnings report_first_pass(struct first_pass_results results)
ali@42	733	{
ali@42	734	static struct warnings warnings={0};
ali@42	735	if (cnt_spacend>0)
ali@70	736	g_print(" --> %ld lines in this file have white space at end\n",
ali@42	737	cnt_spacend);
ali@42	738	warnings.dotcomma=1;
ali@42	739	if (results->dotcomma>5)
ali@42	740	{
ali@68	741	warnings.dotcomma=0;
ali@70	742	g_print(" --> %ld lines in this file contain '.,'. "
ali@42	743	"Not reporting them.\n",results->dotcomma);
ali@42	744	}
ali@42	745	/*
ali@42	746	* If more than 50 lines, or one-tenth, are short,
ali@42	747	* don't bother reporting them.
ali@42	748	*/
ali@42	749	warnings.shortline=1;
ali@42	750	if (results->shortline>50 \|\| results->shortline*10>linecnt)
ali@42	751	{
ali@68	752	warnings.shortline=0;
ali@70	753	g_print(" --> %ld lines in this file are short. "
ali@42	754	"Not reporting short lines.\n",results->shortline);
ali@42	755	}
ali@42	756	/*
ali@42	757	* If more than 50 lines, or one-tenth, are long,
ali@42	758	* don't bother reporting them.
ali@42	759	*/
ali@42	760	warnings.longline=1;
ali@42	761	if (results->longline>50 \|\| results->longline*10>linecnt)
ali@42	762	{
ali@68	763	warnings.longline=0;
ali@70	764	g_print(" --> %ld lines in this file are long. "
ali@42	765	"Not reporting long lines.\n",results->longline);
ali@42	766	}
ali@42	767	/* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42	768	warnings.ast=1;
ali@42	769	if (results->astline>10)
ali@42	770	{
ali@68	771	warnings.ast=0;
ali@70	772	g_print(" --> %ld lines in this file contain asterisks. "
ali@42	773	"Not reporting them.\n",results->astline);
ali@42	774	}
ali@42	775	/*
ali@42	776	* If more than 10 lines contain forward slashes,
ali@42	777	* don't bother reporting them.
ali@42	778	*/
ali@42	779	warnings.fslash=1;
ali@42	780	if (results->fslashline>10)
ali@42	781	{
ali@68	782	warnings.fslash=0;
ali@70	783	g_print(" --> %ld lines in this file contain forward slashes. "
ali@42	784	"Not reporting them.\n",results->fslashline);
ali@42	785	}
ali@42	786	/*
ali@42	787	* If more than 20 lines contain unpunctuated endquotes,
ali@42	788	* don't bother reporting them.
ali@42	789	*/
ali@42	790	warnings.endquote=1;
ali@42	791	if (results->endquote_count>20)
ali@42	792	{
ali@68	793	warnings.endquote=0;
ali@70	794	g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
ali@42	795	"Not reporting them.\n",results->endquote_count);
ali@42	796	}
ali@42	797	/*
ali@42	798	* If more than 15 lines contain standalone digits,
ali@42	799	* don't bother reporting them.
ali@42	800	*/
ali@42	801	warnings.digit=1;
ali@42	802	if (results->standalone_digit>10)
ali@42	803	{
ali@68	804	warnings.digit=0;
ali@70	805	g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
ali@42	806	"Not reporting them.\n",results->standalone_digit);
ali@42	807	}
ali@42	808	/*
ali@42	809	* If more than 20 lines contain hyphens at end,
ali@42	810	* don't bother reporting them.
ali@42	811	*/
ali@42	812	warnings.hyphen=1;
ali@42	813	if (results->hyphens>20)
ali@42	814	{
ali@68	815	warnings.hyphen=0;
ali@70	816	g_print(" --> %ld lines in this file have hyphens at end. "
ali@42	817	"Not reporting them.\n",results->hyphens);
ali@42	818	}
ali@42	819	if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42	820	{
ali@70	821	g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@68	822	pswit[MARKUP_SWITCH]=1;
ali@42	823	}
ali@42	824	if (results->verylongline>0)
ali@70	825	g_print(" --> %ld lines in this file are VERY long!\n",
ali@42	826	results->verylongline);
ali@42	827	/*
ali@42	828	* If there are more non-PG spaced dashes than PG em-dashes,
ali@42	829	* assume it's deliberate.
ali@42	830	* Current PG guidelines say don't use them, but older texts do,
ali@42	831	* and some people insist on them whatever the guidelines say.
ali@42	832	*/
ali@42	833	warnings.dash=1;
ali@42	834	if (results->spacedash+results->non_PG_space_emdash>
ali@42	835	results->PG_space_emdash)
ali@42	836	{
ali@68	837	warnings.dash=0;
ali@70	838	g_print(" --> There are %ld spaced dashes and em-dashes. "
ali@42	839	"Not reporting them.\n",
ali@42	840	results->spacedash+results->non_PG_space_emdash);
ali@42	841	}
ali@42	842	/* If more than a quarter of characters are hi-bit, bug out. */
ali@42	843	warnings.bin=1;
ali@42	844	if (results->binlen*4>results->totlen)
ali@42	845	{
ali@70	846	g_print(" --> This file does not appear to be ASCII. "
ali@42	847	"Terminating. Best of luck with it!\n");
ali@68	848	exit(1);
ali@42	849	}
ali@42	850	if (results->alphalen*4<results->totlen)
ali@42	851	{
ali@70	852	g_print(" --> This file does not appear to be text. "
ali@42	853	"Terminating. Best of luck with it!\n");
ali@68	854	exit(1);
ali@42	855	}
ali@42	856	if (results->binlen*100>results->totlen \|\| results->binlen>100)
ali@42	857	{
ali@70	858	g_print(" --> There are a lot of foreign letters here. "
ali@42	859	"Not reporting them.\n");
ali@68	860	warnings.bin=0;
ali@42	861	}
ali@69	862	warnings.isDutch=FALSE;
ali@42	863	if (results->Dutchcount>50)
ali@42	864	{
ali@69	865	warnings.isDutch=TRUE;
ali@70	866	g_print(" --> This looks like Dutch - "
ali@42	867	"switching off dashes and warnings for 's Middags case.\n");
ali@42	868	}
ali@69	869	warnings.isFrench=FALSE;
ali@42	870	if (results->Frenchcount>50)
ali@42	871	{
ali@69	872	warnings.isFrench=TRUE;
ali@70	873	g_print(" --> This looks like French - "
ali@42	874	"switching off some doublepunct.\n");
ali@42	875	}
ali@42	876	if (results->firstline && results->footerline)
ali@70	877	g_print(" The PG header and footer appear to be already on.\n");
ali@42	878	else
ali@42	879	{
ali@68	880	if (results->firstline)
ali@70	881	g_print(" The PG header is on - no footer.\n");
ali@68	882	if (results->footerline)
ali@70	883	g_print(" The PG footer is on - no header.\n");
ali@42	884	}
ali@70	885	g_print("\n");
ali@42	886	if (pswit[VERBOSE_SWITCH])
ali@42	887	{
ali@68	888	warnings.bin=1;
ali@68	889	warnings.shortline=1;
ali@68	890	warnings.dotcomma=1;
ali@68	891	warnings.longline=1;
ali@68	892	warnings.dash=1;
ali@68	893	warnings.digit=1;
ali@68	894	warnings.ast=1;
ali@68	895	warnings.fslash=1;
ali@68	896	warnings.hyphen=1;
ali@68	897	warnings.endquote=1;
ali@70	898	g_print(" * Verbose output is ON -- you asked for it! *\n");
ali@42	899	}
ali@42	900	if (warnings.isDutch)
ali@68	901	warnings.dash=0;
ali@42	902	if (results->footerline>0 && results->firstline>0 &&
ali@42	903	results->footerline>results->firstline &&
ali@42	904	results->footerline-results->firstline<100)
ali@42	905	{
ali@70	906	g_print(" --> I don't really know where this text starts. \n");
ali@70	907	g_print(" There are no reference points.\n");
ali@70	908	g_print(" I'm going to have to report the header and footer "
ali@42	909	"as well.\n");
ali@68	910	results->firstline=0;
ali@42	911	}
ali@42	912	return &warnings;
ali@42	913	}
ali@42	914
ali@43	915	/*
ali@43	916	* analyse_quotes:
ali@43	917	*
ali@43	918	* Look along the line, accumulate the count of quotes, and see
ali@43	919	* if this is an empty line - i.e. a line with nothing on it
ali@43	920	* but spaces.
ali@43	921	* If line has just spaces, period, * and/or - on it, don't
ali@43	922	* count it, since empty lines with asterisks or dashes to
ali@43	923	* separate sections are common.
ali@43	924	*
ali@69	925	* Returns: TRUE if the line is empty.
ali@43	926	*/
ali@69	927	gboolean analyse_quotes(const char aline,struct counters counters)
ali@43	928	{
ali@68	929	int guessquote=0;
ali@69	930	/* assume the line is empty until proven otherwise */
ali@69	931	gboolean isemptyline=TRUE;
ali@70	932	const char s=aline,sprev,*snext;
ali@70	933	gunichar c;
ali@70	934	sprev=NULL;
ali@43	935	while (*s)
ali@43	936	{
ali@70	937	snext=g_utf8_next_char(s);
ali@70	938	c=g_utf8_get_char(s);
ali@70	939	if (c==CHAR_DQUOTE)
ali@43	940	counters->quot++;
ali@70	941	if (c==CHAR_SQUOTE \|\| c==CHAR_OPEN_SQUOTE)
ali@43	942	{
ali@43	943	if (s==aline)
ali@43	944	{
ali@43	945	/*
ali@43	946	* At start of line, it can only be an openquote.
ali@43	947	* Hardcode a very common exception!
ali@43	948	*/
ali@70	949	if (!g_str_has_prefix(snext,"tis") &&
ali@70	950	!g_str_has_prefix(snext,"Tis"))
ali@43	951	counters->open_single_quote++;
ali@43	952	}
ali@70	953	else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
ali@70	954	g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43	955	/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43	956	;
ali@43	957	/* it's outside a word - let's check it out */
ali@70	958	else if (c==CHAR_OPEN_SQUOTE \|\|
ali@70	959	g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43	960	{
ali@43	961	/* it damwell better BE an openquote */
ali@70	962	if (!g_str_has_prefix(snext,"tis") &&
ali@70	963	!g_str_has_prefix(snext,"Tis"))
ali@43	964	/* hardcode a very common exception! */
ali@43	965	counters->open_single_quote++;
ali@43	966	}
ali@43	967	else
ali@43	968	{
ali@43	969	/* now - is it a closequote? */
ali@43	970	guessquote=0; /* accumulate clues */
ali@70	971	if (g_unichar_isalpha(g_utf8_get_char(sprev)))
ali@43	972	{
ali@43	973	/* it follows a letter - could be either */
ali@43	974	guessquote++;
ali@70	975	if (g_utf8_get_char(sprev)=='s')
ali@43	976	{
ali@43	977	/* looks like a plural apostrophe */
ali@43	978	guessquote-=3;
ali@70	979	if (g_utf8_get_char(snext)==CHAR_SPACE)
ali@70	980	/* bonus marks! */
ali@43	981	guessquote-=2;
ali@43	982	}
ali@43	983	}
ali@43	984	/* it doesn't have a letter either side */
ali@70	985	else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
ali@70	986	strchr(".?!,;: ",g_utf8_get_char(snext)))
ali@43	987	guessquote+=8; /* looks like a closequote */
ali@43	988	else
ali@43	989	guessquote++;
ali@43	990	if (counters->open_single_quote>counters->close_single_quote)
ali@43	991	/*
ali@43	992	* Give it the benefit of some doubt,
ali@43	993	* if a squote is already open.
ali@43	994	*/
ali@43	995	guessquote++;
ali@43	996	else
ali@43	997	guessquote--;
ali@43	998	if (guessquote>=0)
ali@43	999	counters->close_single_quote++;
ali@43	1000	}
ali@43	1001	}
ali@70	1002	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
ali@70	1003	c!='\r' && c!='\n')
ali@69	1004	isemptyline=FALSE; /* ignore lines like * * * as spacers */
ali@70	1005	if (c==CHAR_UNDERSCORE)
ali@43	1006	counters->c_unders++;
ali@70	1007	if (c==CHAR_OPEN_CBRACK)
ali@43	1008	counters->c_brack++;
ali@70	1009	if (c==CHAR_CLOSE_CBRACK)
ali@43	1010	counters->c_brack--;
ali@70	1011	if (c==CHAR_OPEN_RBRACK)
ali@43	1012	counters->r_brack++;
ali@70	1013	if (c==CHAR_CLOSE_RBRACK)
ali@43	1014	counters->r_brack--;
ali@70	1015	if (c==CHAR_OPEN_SBRACK)
ali@43	1016	counters->s_brack++;
ali@70	1017	if (c==CHAR_CLOSE_SBRACK)
ali@43	1018	counters->s_brack--;
ali@70	1019	sprev=s;
ali@70	1020	s=snext;
ali@43	1021	}
ali@43	1022	return isemptyline;
ali@43	1023	}
ali@43	1024
ali@41	1025	/*
ali@67	1026	* check_for_control_characters:
ali@67	1027	*
ali@67	1028	* Check for invalid or questionable characters in the line
ali@67	1029	* Anything above 127 is invalid for plain ASCII, and
ali@67	1030	* non-printable control characters should also be flagged.
ali@67	1031	* Tabs should generally not be there.
ali@67	1032	*/
ali@67	1033	void check_for_control_characters(const char *aline)
ali@67	1034	{
ali@70	1035	gunichar c;
ali@67	1036	const char *s;
ali@70	1037	for (s=aline;*s;s=g_utf8_next_char(s))
ali@67	1038	{
ali@70	1039	c=g_utf8_get_char(s);
ali@67	1040	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
ali@67	1041	{
ali@67	1042	if (pswit[ECHO_SWITCH])
ali@70	1043	g_print("\n%s\n",aline);
ali@67	1044	if (!pswit[OVERVIEW_SWITCH])
ali@70	1045	g_print(" Line %ld column %ld - Control character %u\n",
ali@70	1046	linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
ali@67	1047	else
ali@67	1048	cnt_bin++;
ali@67	1049	}
ali@67	1050	}
ali@67	1051	}
ali@67	1052
ali@67	1053	/*
ali@44	1054	* check_for_odd_characters:
ali@44	1055	*
ali@44	1056	* Check for binary and other odd characters.
ali@44	1057	*/
ali@44	1058	void check_for_odd_characters(const char aline,const struct warnings warnings,
ali@69	1059	gboolean isemptyline)
ali@44	1060	{
ali@44	1061	/* Don't repeat multiple warnings on one line. */
ali@70	1062	gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
ali@70	1063	gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
ali@44	1064	const char *s;
ali@70	1065	gunichar c;
ali@70	1066	for (s=aline;*s;s=g_utf8_next_char(s))
ali@44	1067	{
ali@70	1068	c=g_utf8_get_char(s);
ali@70	1069	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' \|\| c>127))
ali@44	1070	{
ali@44	1071	if (pswit[ECHO_SWITCH])
ali@70	1072	g_print("\n%s\n",aline);
ali@44	1073	if (!pswit[OVERVIEW_SWITCH])
ali@70	1074	if (c>127 && c<160 \|\| c>255)
ali@70	1075	g_print(" Line %ld column %ld - "
ali@70	1076	"Non-ISO-8859 character %u\n",
ali@70	1077	linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44	1078	else
ali@70	1079	g_print(" Line %ld column %ld - "
ali@70	1080	"Non-ASCII character %u\n",
ali@70	1081	linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44	1082	else
ali@44	1083	cnt_bin++;
ali@70	1084	eNon_A=TRUE;
ali@44	1085	}
ali@70	1086	if (!eTab && c==CHAR_TAB)
ali@44	1087	{
ali@44	1088	if (pswit[ECHO_SWITCH])
ali@70	1089	g_print("\n%s\n",aline);
ali@44	1090	if (!pswit[OVERVIEW_SWITCH])
ali@70	1091	g_print(" Line %ld column %ld - Tab character?\n",
ali@70	1092	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1093	else
ali@44	1094	cnt_odd++;
ali@70	1095	eTab=TRUE;
ali@44	1096	}
ali@70	1097	if (!eTilde && c==CHAR_TILDE)
ali@44	1098	{
ali@44	1099	/*
ali@44	1100	* Often used by OCR software to indicate an
ali@44	1101	* unrecognizable character.
ali@44	1102	*/
ali@44	1103	if (pswit[ECHO_SWITCH])
ali@70	1104	g_print("\n%s\n",aline);
ali@44	1105	if (!pswit[OVERVIEW_SWITCH])
ali@70	1106	g_print(" Line %ld column %ld - Tilde character?\n",
ali@70	1107	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1108	else
ali@44	1109	cnt_odd++;
ali@70	1110	eTilde=TRUE;
ali@44	1111	}
ali@70	1112	if (!eCarat && c==CHAR_CARAT)
ali@44	1113	{
ali@44	1114	if (pswit[ECHO_SWITCH])
ali@70	1115	g_print("\n%s\n",aline);
ali@44	1116	if (!pswit[OVERVIEW_SWITCH])
ali@70	1117	g_print(" Line %ld column %ld - Carat character?\n",
ali@70	1118	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1119	else
ali@44	1120	cnt_odd++;
ali@70	1121	eCarat=TRUE;
ali@44	1122	}
ali@70	1123	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
ali@44	1124	{
ali@44	1125	if (pswit[ECHO_SWITCH])
ali@70	1126	g_print("\n%s\n",aline);
ali@44	1127	if (!pswit[OVERVIEW_SWITCH])
ali@70	1128	g_print(" Line %ld column %ld - Forward slash?\n",
ali@70	1129	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1130	else
ali@44	1131	cnt_odd++;
ali@70	1132	eFSlash=TRUE;
ali@44	1133	}
ali@44	1134	/*
ali@44	1135	* Report asterisks only in paranoid mode,
ali@44	1136	* since they're often deliberate.
ali@44	1137	*/
ali@44	1138	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@70	1139	c==CHAR_ASTERISK)
ali@44	1140	{
ali@44	1141	if (pswit[ECHO_SWITCH])
ali@70	1142	g_print("\n%s\n",aline);
ali@44	1143	if (!pswit[OVERVIEW_SWITCH])
ali@70	1144	g_print(" Line %ld column %ld - Asterisk?\n",
ali@70	1145	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1146	else
ali@44	1147	cnt_odd++;
ali@70	1148	eAst=TRUE;
ali@44	1149	}
ali@44	1150	}
ali@44	1151	}
ali@44	1152
ali@44	1153	/*
ali@45	1154	* check_for_long_line:
ali@45	1155	*
ali@45	1156	* Check for line too long.
ali@45	1157	*/
ali@45	1158	void check_for_long_line(const char *aline)
ali@45	1159	{
ali@70	1160	if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
ali@45	1161	{
ali@45	1162	if (pswit[ECHO_SWITCH])
ali@70	1163	g_print("\n%s\n",aline);
ali@45	1164	if (!pswit[OVERVIEW_SWITCH])
ali@70	1165	g_print(" Line %ld column %ld - Long line %ld\n",
ali@70	1166	linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
ali@45	1167	else
ali@45	1168	cnt_long++;
ali@45	1169	}
ali@45	1170	}
ali@45	1171
ali@45	1172	/*
ali@45	1173	* check_for_short_line:
ali@45	1174	*
ali@45	1175	* Check for line too short.
ali@45	1176	*
ali@45	1177	* This one is a bit trickier to implement: we don't want to
ali@45	1178	* flag the last line of a paragraph for being short, so we
ali@45	1179	* have to wait until we know that our current line is a
ali@45	1180	* "normal" line, then report the _previous_ line if it was too
ali@45	1181	* short. We also don't want to report indented lines like
ali@45	1182	* chapter heads or formatted quotations. We therefore keep
ali@45	1183	* last->len as the length of the last line examined, and
ali@45	1184	* last->blen as the length of the last but one, and try to
ali@45	1185	* suppress unnecessary warnings by checking that both were of
ali@45	1186	* "normal" length. We keep the first character of the last
ali@45	1187	* line in last->start, and if it was a space, we assume that
ali@45	1188	* the formatting is deliberate. I can't figure out a way to
ali@45	1189	* distinguish something like a quoted verse left-aligned or
ali@45	1190	* the header or footer of a letter from a paragraph of short
ali@45	1191	* lines - maybe if I examined the whole paragraph, and if the
ali@45	1192	* para has less than, say, 8 lines and if all lines are short,
ali@45	1193	* then just assume it's OK? Need to look at some texts to see
ali@45	1194	* how often a formula like this would get the right result.
ali@45	1195	*/
ali@45	1196	void check_for_short_line(const char aline,const struct line_properties last)
ali@45	1197	{
ali@70	1198	if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
ali@70	1199	last->len<SHORTEST_PG_LINE && last->blen>1 &&
ali@70	1200	last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45	1201	{
ali@45	1202	if (pswit[ECHO_SWITCH])
ali@70	1203	g_print("\n%s\n",prevline);
ali@45	1204	if (!pswit[OVERVIEW_SWITCH])
ali@70	1205	g_print(" Line %ld column %ld - Short line %ld?\n",
ali@70	1206	linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
ali@45	1207	else
ali@45	1208	cnt_short++;
ali@45	1209	}
ali@45	1210	}
ali@45	1211
ali@45	1212	/*
ali@46	1213	* check_for_starting_punctuation:
ali@46	1214	*
ali@46	1215	* Look for punctuation other than full ellipses at start of line.
ali@46	1216	*/
ali@46	1217	void check_for_starting_punctuation(const char *aline)
ali@46	1218	{
ali@70	1219	if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
ali@70	1220	!g_str_has_prefix(aline,". . ."))
ali@46	1221	{
ali@46	1222	if (pswit[ECHO_SWITCH])
ali@70	1223	g_print("\n%s\n",aline);
ali@46	1224	if (!pswit[OVERVIEW_SWITCH])
ali@70	1225	g_print(" Line %ld column 1 - Begins with punctuation?\n",
ali@46	1226	linecnt);
ali@46	1227	else
ali@46	1228	cnt_punct++;
ali@46	1229	}
ali@46	1230	}
ali@46	1231
ali@46	1232	/*
ali@47	1233	* check_for_spaced_emdash:
ali@47	1234	*
ali@47	1235	* Check for spaced em-dashes.
ali@47	1236	*
ali@47	1237	* We must check _all_ occurrences of "--" on the line
ali@47	1238	* hence the loop - even if the first double-dash is OK
ali@47	1239	* there may be another that's wrong later on.
ali@47	1240	*/
ali@47	1241	void check_for_spaced_emdash(const char *aline)
ali@47	1242	{
ali@70	1243	const char s,t,*next;
ali@70	1244	for (s=aline;t=strstr(s,"--");s=next)
ali@47	1245	{
ali@70	1246	next=g_utf8_next_char(g_utf8_next_char(t));
ali@70	1247	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE \|\|
ali@70	1248	g_utf8_get_char(next)==CHAR_SPACE)
ali@47	1249	{
ali@47	1250	if (pswit[ECHO_SWITCH])
ali@70	1251	g_print("\n%s\n",aline);
ali@47	1252	if (!pswit[OVERVIEW_SWITCH])
ali@70	1253	g_print(" Line %ld column %ld - Spaced em-dash?\n",
ali@70	1254	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@47	1255	else
ali@47	1256	cnt_dash++;
ali@47	1257	}
ali@47	1258	}
ali@47	1259	}
ali@47	1260
ali@47	1261	/*
ali@47	1262	* check_for_spaced_dash:
ali@47	1263	*
ali@47	1264	* Check for spaced dashes.
ali@47	1265	*/
ali@47	1266	void check_for_spaced_dash(const char *aline)
ali@47	1267	{
ali@47	1268	const char *s;
ali@47	1269	if ((s=strstr(aline," -")))
ali@47	1270	{
ali@70	1271	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
ali@47	1272	{
ali@47	1273	if (pswit[ECHO_SWITCH])
ali@70	1274	g_print("\n%s\n",aline);
ali@47	1275	if (!pswit[OVERVIEW_SWITCH])
ali@70	1276	g_print(" Line %ld column %ld - Spaced dash?\n",
ali@70	1277	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47	1278	else
ali@47	1279	cnt_dash++;
ali@47	1280	}
ali@47	1281	}
ali@47	1282	else if ((s=strstr(aline,"- ")))
ali@47	1283	{
ali@70	1284	if (s==aline \|\| g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@47	1285	{
ali@47	1286	if (pswit[ECHO_SWITCH])
ali@70	1287	g_print("\n%s\n",aline);
ali@47	1288	if (!pswit[OVERVIEW_SWITCH])
ali@70	1289	g_print(" Line %ld column %ld - Spaced dash?\n",
ali@70	1290	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47	1291	else
ali@47	1292	cnt_dash++;
ali@47	1293	}
ali@47	1294	}
ali@47	1295	}
ali@47	1296
ali@47	1297	/*
ali@48	1298	* check_for_unmarked_paragraphs:
ali@48	1299	*
ali@48	1300	* Check for unmarked paragraphs indicated by separate speakers.
ali@48	1301	*
ali@48	1302	* May well be false positive:
ali@48	1303	* "Bravo!" "Wonderful!" called the crowd.
ali@48	1304	* but useful all the same.
ali@48	1305	*/
ali@48	1306	void check_for_unmarked_paragraphs(const char *aline)
ali@48	1307	{
ali@48	1308	const char *s;
ali@48	1309	s=strstr(aline,"\" \"");
ali@48	1310	if (!s)
ali@48	1311	s=strstr(aline,"\" \"");
ali@48	1312	if (s)
ali@48	1313	{
ali@48	1314	if (pswit[ECHO_SWITCH])
ali@70	1315	g_print("\n%s\n",aline);
ali@48	1316	if (!pswit[OVERVIEW_SWITCH])
ali@70	1317	g_print(" Line %ld column %ld - "
ali@70	1318	"Query missing paragraph break?\n",
ali@70	1319	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@48	1320	else
ali@48	1321	cnt_punct++;
ali@48	1322	}
ali@48	1323	}
ali@48	1324
ali@48	1325	/*
ali@49	1326	* check_for_jeebies:
ali@49	1327	*
ali@49	1328	* Check for "to he" and other easy h/b errors.
ali@49	1329	*
ali@49	1330	* This is a very inadequate effort on the h/b problem,
ali@49	1331	* but the phrase "to he" is always an error, whereas "to
ali@49	1332	* be" is quite common.
ali@49	1333	* Similarly, '"Quiet!", be said.' is a non-be error
ali@49	1334	* "to he" is _not_ always an error!:
ali@49	1335	* "Where they went to he couldn't say."
ali@49	1336	* Another false positive:
ali@49	1337	* What would "Cinderella" be without the . . .
ali@49	1338	* and another: "If he wants to he can see for himself."
ali@49	1339	*/
ali@49	1340	void check_for_jeebies(const char *aline)
ali@49	1341	{
ali@49	1342	const char *s;
ali@49	1343	s=strstr(aline," be could ");
ali@49	1344	if (!s)
ali@49	1345	s=strstr(aline," be would ");
ali@49	1346	if (!s)
ali@49	1347	s=strstr(aline," was be ");
ali@49	1348	if (!s)
ali@49	1349	s=strstr(aline," be is ");
ali@49	1350	if (!s)
ali@49	1351	s=strstr(aline," is be ");
ali@49	1352	if (!s)
ali@49	1353	s=strstr(aline,"\", be ");
ali@49	1354	if (!s)
ali@49	1355	s=strstr(aline,"\" be ");
ali@49	1356	if (!s)
ali@49	1357	s=strstr(aline,"\" be ");
ali@49	1358	if (!s)
ali@49	1359	s=strstr(aline," to he ");
ali@49	1360	if (s)
ali@49	1361	{
ali@49	1362	if (pswit[ECHO_SWITCH])
ali@70	1363	g_print("\n%s\n",aline);
ali@49	1364	if (!pswit[OVERVIEW_SWITCH])
ali@70	1365	g_print(" Line %ld column %ld - Query he/be error?\n",
ali@70	1366	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49	1367	else
ali@49	1368	cnt_word++;
ali@49	1369	}
ali@49	1370	s=strstr(aline," the had ");
ali@49	1371	if (!s)
ali@49	1372	s=strstr(aline," a had ");
ali@49	1373	if (!s)
ali@49	1374	s=strstr(aline," they bad ");
ali@49	1375	if (!s)
ali@49	1376	s=strstr(aline," she bad ");
ali@49	1377	if (!s)
ali@49	1378	s=strstr(aline," he bad ");
ali@49	1379	if (!s)
ali@49	1380	s=strstr(aline," you bad ");
ali@49	1381	if (!s)
ali@49	1382	s=strstr(aline," i bad ");
ali@49	1383	if (s)
ali@49	1384	{
ali@49	1385	if (pswit[ECHO_SWITCH])
ali@70	1386	g_print("\n%s\n",aline);
ali@49	1387	if (!pswit[OVERVIEW_SWITCH])
ali@70	1388	g_print(" Line %ld column %ld - Query had/bad error?\n",
ali@70	1389	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49	1390	else
ali@49	1391	cnt_word++;
ali@49	1392	}
ali@49	1393	s=strstr(aline,"; hut ");
ali@49	1394	if (!s)
ali@49	1395	s=strstr(aline,", hut ");
ali@49	1396	if (s)
ali@49	1397	{
ali@49	1398	if (pswit[ECHO_SWITCH])
ali@70	1399	g_print("\n%s\n",aline);
ali@49	1400	if (!pswit[OVERVIEW_SWITCH])
ali@70	1401	g_print(" Line %ld column %ld - Query hut/but error?\n",
ali@70	1402	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49	1403	else
ali@49	1404	cnt_word++;
ali@49	1405	}
ali@49	1406	}
ali@49	1407
ali@49	1408	/*
ali@50	1409	* check_for_mta_from:
ali@50	1410	*
ali@50	1411	* Special case - angled bracket in front of "From" placed there by an
ali@50	1412	* MTA when sending an e-mail.
ali@50	1413	*/
ali@50	1414	void check_for_mta_from(const char *aline)
ali@50	1415	{
ali@50	1416	const char *s;
ali@50	1417	s=strstr(aline,">From");
ali@50	1418	if (s)
ali@50	1419	{
ali@50	1420	if (pswit[ECHO_SWITCH])
ali@70	1421	g_print("\n%s\n",aline);
ali@50	1422	if (!pswit[OVERVIEW_SWITCH])
ali@70	1423	g_print(" Line %ld column %ld - "
ali@70	1424	"Query angled bracket with From\n",
ali@70	1425	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@50	1426	else
ali@50	1427	cnt_punct++;
ali@50	1428	}
ali@50	1429	}
ali@50	1430
ali@50	1431	/*
ali@51	1432	* check_for_orphan_character:
ali@51	1433	*
ali@51	1434	* Check for a single character line -
ali@51	1435	* often an overflow from bad wrapping.
ali@51	1436	*/
ali@51	1437	void check_for_orphan_character(const char *aline)
ali@51	1438	{
ali@70	1439	gunichar c;
ali@70	1440	c=g_utf8_get_char(aline);
ali@70	1441	if (c && !*g_utf8_next_char(aline))
ali@51	1442	{
ali@70	1443	if (c=='I' \|\| c=='V' \|\| c=='X' \|\| c=='L' \|\| g_unichar_isdigit(c))
ali@51	1444	; /* Nothing - ignore numerals alone on a line. */
ali@51	1445	else
ali@51	1446	{
ali@51	1447	if (pswit[ECHO_SWITCH])
ali@70	1448	g_print("\n%s\n",aline);
ali@51	1449	if (!pswit[OVERVIEW_SWITCH])
ali@70	1450	g_print(" Line %ld column 1 - Query single character line\n",
ali@51	1451	linecnt);
ali@51	1452	else
ali@51	1453	cnt_punct++;
ali@51	1454	}
ali@51	1455	}
ali@51	1456	}
ali@51	1457
ali@51	1458	/*
ali@52	1459	* check_for_pling_scanno:
ali@52	1460	*
ali@52	1461	* Check for I" - often should be !
ali@52	1462	*/
ali@52	1463	void check_for_pling_scanno(const char *aline)
ali@52	1464	{
ali@52	1465	const char *s;
ali@52	1466	s=strstr(aline," I\"");
ali@52	1467	if (s)
ali@52	1468	{
ali@52	1469	if (pswit[ECHO_SWITCH])
ali@70	1470	g_print("\n%s\n",aline);
ali@52	1471	if (!pswit[OVERVIEW_SWITCH])
ali@70	1472	g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
ali@70	1473	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@52	1474	else
ali@52	1475	cnt_punct++;
ali@52	1476	}
ali@52	1477	}
ali@52	1478
ali@52	1479	/*
ali@53	1480	* check_for_extra_period:
ali@53	1481	*
ali@53	1482	* Check for period without a capital letter. Cut-down from gutspell.
ali@53	1483	* Only works when it happens on a single line.
ali@53	1484	*/
ali@53	1485	void check_for_extra_period(const char aline,const struct warnings warnings)
ali@53	1486	{
ali@53	1487	const char s,t,*s1;
ali@69	1488	int i;
ali@70	1489	gsize len;
ali@69	1490	gboolean istypo;
ali@69	1491	gchar *testword;
ali@70	1492	gunichar *decomposition;
ali@53	1493	if (pswit[PARANOID_SWITCH])
ali@53	1494	{
ali@70	1495	for (t=aline;t=strstr(t,". ");)
ali@53	1496	{
ali@69	1497	if (t==aline)
ali@53	1498	{
ali@70	1499	t=g_utf8_next_char(t);
ali@53	1500	/* start of line punctuation is handled elsewhere */
ali@53	1501	continue;
ali@53	1502	}
ali@70	1503	if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
ali@53	1504	{
ali@70	1505	t=g_utf8_next_char(t);
ali@53	1506	continue;
ali@53	1507	}
ali@53	1508	if (warnings->isDutch)
ali@53	1509	{
ali@53	1510	/* For Frank & Jeroen -- 's Middags case */
ali@70	1511	gunichar c2,c3,c4,c5;
ali@70	1512	c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
ali@70	1513	c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
ali@70	1514	c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
ali@70	1515	c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
ali@70	1516	if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
ali@70	1517	c4==CHAR_SPACE && g_unichar_isupper(c5))
ali@53	1518	{
ali@70	1519	t=g_utf8_next_char(t);
ali@53	1520	continue;
ali@53	1521	}
ali@53	1522	}
ali@70	1523	s1=g_utf8_next_char(g_utf8_next_char(t));
ali@70	1524	while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
ali@70	1525	!isdigit(g_utf8_get_char(s1)))
ali@70	1526	s1=g_utf8_next_char(s1);
ali@70	1527	if (g_unichar_islower(g_utf8_get_char(s1)))
ali@53	1528	{
ali@53	1529	/* we have something to investigate */
ali@69	1530	istypo=TRUE;
ali@53	1531	/* so let's go back and find out */
ali@70	1532	for (s1=g_utf8_prev_char(t);s1>=aline &&
ali@70	1533	(g_unichar_isalpha(g_utf8_get_char(s1)) \|\|
ali@70	1534	g_unichar_isdigit(g_utf8_get_char(s1)) \|\|
ali@70	1535	g_utf8_get_char(s1)==CHAR_SQUOTE &&
ali@70	1536	g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
ali@70	1537	g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
ali@70	1538	s1=g_utf8_prev_char(s1))
ali@53	1539	;
ali@70	1540	s1=g_utf8_next_char(s1);
ali@69	1541	s=strchr(s1,'.');
ali@69	1542	if (s)
ali@69	1543	testword=g_strndup(s1,s-s1);
ali@69	1544	else
ali@69	1545	testword=g_strdup(s1);
ali@53	1546	for (i=0;*abbrev[i];i++)
ali@53	1547	if (!strcmp(testword,abbrev[i]))
ali@69	1548	istypo=FALSE;
ali@70	1549	if (g_unichar_isdigit(g_utf8_get_char(testword)))
ali@69	1550	istypo=FALSE;
ali@70	1551	if (!*g_utf8_next_char(testword))
ali@69	1552	istypo=FALSE;
ali@53	1553	if (isroman(testword))
ali@69	1554	istypo=FALSE;
ali@53	1555	if (istypo)
ali@53	1556	{
ali@69	1557	istypo=FALSE;
ali@70	1558	for (s=testword;*s;s=g_utf8_next_char(s))
ali@70	1559	{
ali@70	1560	decomposition=g_unicode_canonical_decomposition(
ali@70	1561	g_utf8_get_char(s),&len);
ali@70	1562	if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@69	1563	istypo=TRUE;
ali@70	1564	g_free(decomposition);
ali@70	1565	}
ali@53	1566	}
ali@69	1567	if (istypo &&
ali@69	1568	(pswit[VERBOSE_SWITCH] \|\| !g_tree_lookup(qperiod,testword)))
ali@53	1569	{
ali@69	1570	g_tree_insert(qperiod,g_strdup(testword),
ali@69	1571	GINT_TO_POINTER(1));
ali@69	1572	if (pswit[ECHO_SWITCH])
ali@70	1573	g_print("\n%s\n",aline);
ali@69	1574	if (!pswit[OVERVIEW_SWITCH])
ali@70	1575	g_print(" Line %ld column %ld - Extra period?\n",
ali@70	1576	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@69	1577	else
ali@69	1578	cnt_punct++;
ali@53	1579	}
ali@69	1580	g_free(testword);
ali@53	1581	}
ali@70	1582	t=g_utf8_next_char(t);
ali@53	1583	}
ali@53	1584	}
ali@53	1585	}
ali@53	1586
ali@53	1587	/*
ali@54	1588	* check_for_following_punctuation:
ali@54	1589	*
ali@54	1590	* Check for words usually not followed by punctuation.
ali@54	1591	*/
ali@54	1592	void check_for_following_punctuation(const char *aline)
ali@54	1593	{
ali@54	1594	int i;
ali@54	1595	const char s,wordstart;
ali@70	1596	gunichar c;
ali@69	1597	gchar inword,t;
ali@54	1598	if (pswit[TYPO_SWITCH])
ali@54	1599	{
ali@54	1600	for (s=aline;*s;)
ali@54	1601	{
ali@54	1602	wordstart=s;
ali@69	1603	t=getaword(&s);
ali@69	1604	if (!*t)
ali@69	1605	{
ali@69	1606	g_free(t);
ali@54	1607	continue;
ali@69	1608	}
ali@70	1609	inword=g_utf8_strdown(t,-1);
ali@69	1610	g_free(t);
ali@54	1611	for (i=0;*nocomma[i];i++)
ali@54	1612	if (!strcmp(inword,nocomma[i]))
ali@54	1613	{
ali@70	1614	c=g_utf8_get_char(s);
ali@70	1615	if (c==',' \|\| c==';' \|\| c==':')
ali@54	1616	{
ali@54	1617	if (pswit[ECHO_SWITCH])
ali@70	1618	g_print("\n%s\n",aline);
ali@54	1619	if (!pswit[OVERVIEW_SWITCH])
ali@70	1620	g_print(" Line %ld column %ld - "
ali@54	1621	"Query punctuation after %s?\n",
ali@70	1622	linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70	1623	inword);
ali@54	1624	else
ali@54	1625	cnt_punct++;
ali@54	1626	}
ali@54	1627	}
ali@54	1628	for (i=0;*noperiod[i];i++)
ali@54	1629	if (!strcmp(inword,noperiod[i]))
ali@54	1630	{
ali@70	1631	c=g_utf8_get_char(s);
ali@70	1632	if (c=='.' \|\| c=='!')
ali@54	1633	{
ali@54	1634	if (pswit[ECHO_SWITCH])
ali@70	1635	g_print("\n%s\n",aline);
ali@54	1636	if (!pswit[OVERVIEW_SWITCH])
ali@70	1637	g_print(" Line %ld column %ld - "
ali@54	1638	"Query punctuation after %s?\n",
ali@70	1639	linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70	1640	inword);
ali@54	1641	else
ali@54	1642	cnt_punct++;
ali@54	1643	}
ali@54	1644	}
ali@69	1645	g_free(inword);
ali@54	1646	}
ali@54	1647	}
ali@54	1648	}
ali@54	1649
ali@54	1650	/*
ali@55	1651	* check_for_typos:
ali@55	1652	*
ali@55	1653	* Check for commonly mistyped words,
ali@55	1654	* and digits like 0 for O in a word.
ali@55	1655	*/
ali@55	1656	void check_for_typos(const char aline,struct warnings warnings)
ali@55	1657	{
ali@70	1658	const char s,t,nt,wordstart;
ali@70	1659	gchar *inword;
ali@70	1660	gunichar *decomposition;
ali@70	1661	gchar *testword;
ali@70	1662	int i,vowel,consonant,*dupcnt;
ali@70	1663	gboolean isdup,istypo,alower;
ali@70	1664	gunichar c;
ali@70	1665	long offset,len;
ali@70	1666	gsize decomposition_len;
ali@55	1667	for (s=aline;*s;)
ali@55	1668	{
ali@55	1669	wordstart=s;
ali@69	1670	inword=getaword(&s);
ali@55	1671	if (!*inword)
ali@69	1672	{
ali@69	1673	g_free(inword);
ali@55	1674	continue; /* don't bother with empty lines */
ali@69	1675	}
ali@55	1676	if (mixdigit(inword))
ali@55	1677	{
ali@55	1678	if (pswit[ECHO_SWITCH])
ali@70	1679	g_print("\n%s\n",aline);
ali@55	1680	if (!pswit[OVERVIEW_SWITCH])
ali@70	1681	g_print(" Line %ld column %ld - Query digit in %s\n",
ali@70	1682	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
ali@55	1683	else
ali@55	1684	cnt_word++;
ali@55	1685	}
ali@55	1686	/*
ali@55	1687	* Put the word through a series of tests for likely typos and OCR
ali@55	1688	* errors.
ali@55	1689	*/
ali@69	1690	if (pswit[TYPO_SWITCH] \|\| pswit[USERTYPO_SWITCH])
ali@55	1691	{
ali@69	1692	istypo=FALSE;
ali@70	1693	alower=FALSE;
ali@70	1694	for (t=inword;*t;t=g_utf8_next_char(t))
ali@55	1695	{
ali@70	1696	c=g_utf8_get_char(t);
ali@70	1697	nt=g_utf8_next_char(t);
ali@55	1698	/* lowercase for testing */
ali@70	1699	if (g_unichar_islower(c))
ali@70	1700	alower=TRUE;
ali@70	1701	if (alower && (g_unichar_isupper(c) \|\| g_unichar_istitle(c)))
ali@55	1702	{
ali@55	1703	/*
ali@55	1704	* We have an uppercase mid-word. However, there are
ali@55	1705	* common cases:
ali@55	1706	* Mac and Mc like McGill
ali@55	1707	* French contractions like l'Abbe
ali@55	1708	*/
ali@70	1709	offset=g_utf8_pointer_to_offset(inword,t);
ali@70	1710	if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' \|\|
ali@70	1711	offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
ali@70	1712	g_utf8_get_char(g_utf8_next_char(nt))=='c' \|\|
ali@70	1713	offset>0 &&
ali@70	1714	g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
ali@55	1715	; /* do nothing! */
ali@55	1716	else
ali@69	1717	istypo=TRUE;
ali@55	1718	}
ali@55	1719	}
ali@70	1720	testword=g_utf8_casefold(inword,-1);
ali@69	1721	}
ali@69	1722	if (pswit[TYPO_SWITCH])
ali@69	1723	{
ali@55	1724	/*
ali@55	1725	* Check for certain unlikely two-letter combinations at word
ali@55	1726	* start and end.
ali@55	1727	*/
ali@70	1728	len=g_utf8_strlen(testword,-1);
ali@70	1729	if (len>1)
ali@55	1730	{
ali@55	1731	for (i=0;*nostart[i];i++)
ali@70	1732	if (g_str_has_prefix(testword,nostart[i]))
ali@69	1733	istypo=TRUE;
ali@55	1734	for (i=0;*noend[i];i++)
ali@70	1735	if (g_str_has_suffix(testword,noend[i]))
ali@69	1736	istypo=TRUE;
ali@55	1737	}
ali@55	1738	/* ght is common, gbt never. Like that. */
ali@55	1739	if (strstr(testword,"cb"))
ali@69	1740	istypo=TRUE;
ali@55	1741	if (strstr(testword,"gbt"))
ali@69	1742	istypo=TRUE;
ali@55	1743	if (strstr(testword,"pbt"))
ali@69	1744	istypo=TRUE;
ali@55	1745	if (strstr(testword,"tbs"))
ali@69	1746	istypo=TRUE;
ali@55	1747	if (strstr(testword,"mrn"))
ali@69	1748	istypo=TRUE;
ali@55	1749	if (strstr(testword,"ahle"))
ali@69	1750	istypo=TRUE;
ali@55	1751	if (strstr(testword,"ihle"))
ali@69	1752	istypo=TRUE;
ali@55	1753	/*
ali@55	1754	* "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55	1755	* Also "TBI" - frostbite, outbid - but uncommon.
ali@55	1756	* Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55	1757	* numerals, but "ii" is a common scanno.
ali@55	1758	*/
ali@55	1759	if (strstr(testword,"tbi"))
ali@69	1760	istypo=TRUE;
ali@55	1761	if (strstr(testword,"tbe"))
ali@69	1762	istypo=TRUE;
ali@55	1763	if (strstr(testword,"ii"))
ali@69	1764	istypo=TRUE;
ali@55	1765	/*
ali@55	1766	* Check for no vowels or no consonants.
ali@55	1767	* If none, flag a typo.
ali@55	1768	*/
ali@70	1769	if (!istypo && len>1)
ali@55	1770	{
ali@55	1771	vowel=consonant=0;
ali@70	1772	for (t=testword;*t;t=g_utf8_next_char(t))
ali@55	1773	{
ali@70	1774	c=g_utf8_get_char(t);
ali@70	1775	decomposition=
ali@70	1776	g_unicode_canonical_decomposition(c,&decomposition_len);
ali@70	1777	if (c=='y' \|\| g_unichar_isdigit(c))
ali@55	1778	{
ali@55	1779	/* Yah, this is loose. */
ali@55	1780	vowel++;
ali@55	1781	consonant++;
ali@55	1782	}
ali@70	1783	else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@55	1784	vowel++;
ali@55	1785	else
ali@55	1786	consonant++;
ali@70	1787	g_free(decomposition);
ali@55	1788	}
ali@55	1789	if (!vowel \|\| !consonant)
ali@69	1790	istypo=TRUE;
ali@55	1791	}
ali@55	1792	/*
ali@55	1793	* Now exclude the word from being reported if it's in
ali@55	1794	* the okword list.
ali@55	1795	*/
ali@55	1796	for (i=0;*okword[i];i++)
ali@55	1797	if (!strcmp(testword,okword[i]))
ali@69	1798	istypo=FALSE;
ali@55	1799	/*
ali@55	1800	* What looks like a typo may be a Roman numeral.
ali@55	1801	* Exclude these.
ali@55	1802	*/
ali@55	1803	if (istypo && isroman(testword))
ali@69	1804	istypo=FALSE;
ali@55	1805	/* Check the manual list of typos. */
ali@55	1806	if (!istypo)
ali@55	1807	for (i=0;*typo[i];i++)
ali@55	1808	if (!strcmp(testword,typo[i]))
ali@69	1809	istypo=TRUE;
ali@55	1810	/*
ali@55	1811	* Check lowercase s, l, i and m - special cases.
ali@55	1812	* "j" - often a semi-colon gone wrong.
ali@55	1813	* "d" for a missing apostrophe - he d
ali@55	1814	* "n" for "in"
ali@55	1815	*/
ali@70	1816	if (!istypo && len==1 &&
ali@70	1817	g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
ali@69	1818	istypo=TRUE;
ali@55	1819	if (istypo)
ali@55	1820	{
ali@69	1821	dupcnt=g_tree_lookup(qword,testword);
ali@69	1822	if (dupcnt)
ali@69	1823	{
ali@69	1824	(*dupcnt)++;
ali@69	1825	isdup=!pswit[VERBOSE_SWITCH];
ali@69	1826	}
ali@69	1827	else
ali@69	1828	{
ali@69	1829	dupcnt=g_new0(int,1);
ali@69	1830	g_tree_insert(qword,g_strdup(testword),dupcnt);
ali@69	1831	isdup=FALSE;
ali@69	1832	}
ali@55	1833	if (!isdup)
ali@55	1834	{
ali@55	1835	if (pswit[ECHO_SWITCH])
ali@70	1836	g_print("\n%s\n",aline);
ali@55	1837	if (!pswit[OVERVIEW_SWITCH])
ali@55	1838	{
ali@70	1839	g_print(" Line %ld column %ld - Query word %s",
ali@70	1840	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
ali@70	1841	inword);
ali@69	1842	if (!pswit[VERBOSE_SWITCH])
ali@70	1843	g_print(" - not reporting duplicates");
ali@70	1844	g_print("\n");
ali@55	1845	}
ali@55	1846	else
ali@55	1847	cnt_word++;
ali@55	1848	}
ali@55	1849	}
ali@55	1850	}
ali@55	1851	/* check the user's list of typos */
ali@69	1852	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
ali@69	1853	{
ali@69	1854	if (pswit[ECHO_SWITCH])
ali@70	1855	g_print("\n%s\n",aline);
ali@69	1856	if (!pswit[OVERVIEW_SWITCH])
ali@70	1857	g_print(" Line %ld column %ld - Query possible scanno %s\n",
ali@70	1858	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
ali@69	1859	}
ali@69	1860	if (pswit[TYPO_SWITCH] \|\| pswit[USERTYPO_SWITCH])
ali@69	1861	g_free(testword);
ali@55	1862	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55	1863	{
ali@55	1864	/* In paranoid mode, query all 0 and 1 standing alone. */
ali@55	1865	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@55	1866	{
ali@55	1867	if (pswit[ECHO_SWITCH])
ali@70	1868	g_print("\n%s\n",aline);
ali@55	1869	if (!pswit[OVERVIEW_SWITCH])
ali@70	1870	g_print(" Line %ld column %ld - Query standalone %s\n",
ali@70	1871	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
ali@70	1872	inword);
ali@55	1873	else
ali@55	1874	cnt_word++;
ali@55	1875	}
ali@55	1876	}
ali@69	1877	g_free(inword);
ali@55	1878	}
ali@55	1879	}
ali@55	1880
ali@56	1881	/*
ali@56	1882	* check_for_misspaced_punctuation:
ali@56	1883	*
ali@56	1884	* Look for added or missing spaces around punctuation and quotes.
ali@56	1885	* If there is a punctuation character like ! with no space on
ali@56	1886	* either side, suspect a missing!space. If there are spaces on
ali@56	1887	* both sides , assume a typo. If we see a double quote with no
ali@56	1888	* space or punctuation on either side of it, assume unspaced
ali@56	1889	* quotes "like"this.
ali@56	1890	*/
ali@56	1891	void check_for_misspaced_punctuation(const char *aline,
ali@69	1892	struct parities *parities,gboolean isemptyline)
ali@56	1893	{
ali@69	1894	gboolean isacro,isellipsis;
ali@56	1895	const char *s;
ali@70	1896	gunichar c,nc,pc,n2c;
ali@70	1897	c=g_utf8_get_char(aline);
ali@70	1898	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	1899	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	1900	{
ali@70	1901	pc=c;
ali@70	1902	c=nc;
ali@70	1903	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	1904	/* For each character in the line after the first. */
ali@70	1905	if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
ali@56	1906	{
ali@56	1907	/* we need to suppress warnings for acronyms like M.D. */
ali@69	1908	isacro=FALSE;
ali@56	1909	/* we need to suppress warnings for ellipsis . . . */
ali@69	1910	isellipsis=FALSE;
ali@70	1911	/*
ali@70	1912	* If there are letters on both sides of it or
ali@70	1913	* if it's strict punctuation followed by an alpha.
ali@70	1914	*/
ali@70	1915	if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) \|\|
ali@70	1916	g_utf8_strchr("?!,;:",-1,c)))
ali@56	1917	{
ali@70	1918	if (c=='.')
ali@56	1919	{
ali@70	1920	if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70	1921	g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69	1922	isacro=TRUE;
ali@70	1923	n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70	1924	if (nc && n2c=='.')
ali@69	1925	isacro=TRUE;
ali@56	1926	}
ali@56	1927	if (!isacro)
ali@56	1928	{
ali@56	1929	if (pswit[ECHO_SWITCH])
ali@70	1930	g_print("\n%s\n",aline);
ali@56	1931	if (!pswit[OVERVIEW_SWITCH])
ali@70	1932	g_print(" Line %ld column %ld - Missing space?\n",
ali@70	1933	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1934	else
ali@56	1935	cnt_punct++;
ali@56	1936	}
ali@56	1937	}
ali@70	1938	if (pc==CHAR_SPACE && (nc==CHAR_SPACE \|\| !nc))
ali@56	1939	{
ali@56	1940	/*
ali@56	1941	* If there are spaces on both sides,
ali@56	1942	* or space before and end of line.
ali@56	1943	*/
ali@70	1944	if (c=='.')
ali@56	1945	{
ali@70	1946	if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70	1947	g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69	1948	isellipsis=TRUE;
ali@70	1949	n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70	1950	if (nc && n2c=='.')
ali@69	1951	isellipsis=TRUE;
ali@56	1952	}
ali@56	1953	if (!isemptyline && !isellipsis)
ali@56	1954	{
ali@56	1955	if (pswit[ECHO_SWITCH])
ali@70	1956	g_print("\n%s\n",aline);
ali@56	1957	if (!pswit[OVERVIEW_SWITCH])
ali@70	1958	g_print(" Line %ld column %ld - "
ali@70	1959	"Spaced punctuation?\n",linecnt,
ali@70	1960	g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1961	else
ali@56	1962	cnt_punct++;
ali@56	1963	}
ali@56	1964	}
ali@56	1965	}
ali@56	1966	}
ali@56	1967	/* Split out the characters that CANNOT be preceded by space. */
ali@70	1968	c=g_utf8_get_char(aline);
ali@70	1969	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	1970	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	1971	{
ali@70	1972	pc=c;
ali@70	1973	c=nc;
ali@70	1974	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	1975	/* for each character in the line after the first */
ali@70	1976	if (g_utf8_strchr("?!,;:",-1,c))
ali@56	1977	{
ali@56	1978	/* if it's punctuation that _cannot_ have a space before it */
ali@70	1979	if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
ali@56	1980	{
ali@56	1981	/*
ali@70	1982	* If nc DOES == space,
ali@56	1983	* it was already reported just above.
ali@56	1984	*/
ali@56	1985	if (pswit[ECHO_SWITCH])
ali@70	1986	g_print("\n%s\n",aline);
ali@56	1987	if (!pswit[OVERVIEW_SWITCH])
ali@70	1988	g_print(" Line %ld column %ld - Spaced punctuation?\n",
ali@70	1989	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1990	else
ali@56	1991	cnt_punct++;
ali@56	1992	}
ali@56	1993	}
ali@56	1994	}
ali@56	1995	/*
ali@56	1996	* Special case " .X" where X is any alpha.
ali@56	1997	* This plugs a hole in the acronym code above.
ali@56	1998	* Inelegant, but maintainable.
ali@56	1999	*/
ali@70	2000	c=g_utf8_get_char(aline);
ali@70	2001	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2002	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	2003	{
ali@70	2004	pc=c;
ali@70	2005	c=nc;
ali@70	2006	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	2007	/* for each character in the line after the first */
ali@70	2008	if (c=='.')
ali@56	2009	{
ali@56	2010	/* if it's a period */
ali@70	2011	if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
ali@56	2012	{
ali@56	2013	/*
ali@56	2014	* If the period follows a space and
ali@56	2015	* is followed by a letter.
ali@56	2016	*/
ali@56	2017	if (pswit[ECHO_SWITCH])
ali@70	2018	g_print("\n%s\n",aline);
ali@56	2019	if (!pswit[OVERVIEW_SWITCH])
ali@70	2020	g_print(" Line %ld column %ld - Spaced punctuation?\n",
ali@70	2021	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2022	else
ali@56	2023	cnt_punct++;
ali@56	2024	}
ali@56	2025	}
ali@56	2026	}
ali@70	2027	c=g_utf8_get_char(aline);
ali@70	2028	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2029	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	2030	{
ali@70	2031	pc=c;
ali@70	2032	c=nc;
ali@70	2033	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	2034	/* for each character in the line after the first */
ali@70	2035	if (c==CHAR_DQUOTE)
ali@56	2036	{
ali@70	2037	if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
ali@70	2038	!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc \|\|
ali@70	2039	!g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
ali@56	2040	{
ali@56	2041	if (pswit[ECHO_SWITCH])
ali@70	2042	g_print("\n%s\n",aline);
ali@56	2043	if (!pswit[OVERVIEW_SWITCH])
ali@70	2044	g_print(" Line %ld column %ld - Unspaced quotes?\n",
ali@70	2045	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2046	else
ali@56	2047	cnt_punct++;
ali@56	2048	}
ali@56	2049	}
ali@56	2050	}
ali@56	2051	/* Check parity of quotes. */
ali@70	2052	nc=g_utf8_get_char(aline);
ali@70	2053	for (s=aline;*s;s=g_utf8_next_char(s))
ali@56	2054	{
ali@70	2055	c=nc;
ali@70	2056	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70	2057	if (c==CHAR_DQUOTE)
ali@56	2058	{
ali@56	2059	parities->dquote=!parities->dquote;
ali@56	2060	if (!parities->dquote)
ali@56	2061	{
ali@56	2062	/* parity even */
ali@70	2063	if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
ali@56	2064	{
ali@56	2065	if (pswit[ECHO_SWITCH])
ali@70	2066	g_print("\n%s\n",aline);
ali@56	2067	if (!pswit[OVERVIEW_SWITCH])
ali@70	2068	g_print(" Line %ld column %ld - "
ali@70	2069	"Wrongspaced quotes?\n",
ali@70	2070	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2071	else
ali@56	2072	cnt_punct++;
ali@56	2073	}
ali@56	2074	}
ali@56	2075	else
ali@56	2076	{
ali@56	2077	/* parity odd */
ali@70	2078	if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
ali@70	2079	!g_utf8_strchr("_-/.'`([{$",-1,nc) \|\| !nc)
ali@56	2080	{
ali@56	2081	if (pswit[ECHO_SWITCH])
ali@70	2082	g_print("\n%s\n",aline);
ali@56	2083	if (!pswit[OVERVIEW_SWITCH])
ali@70	2084	g_print(" Line %ld column %ld - "
ali@70	2085	"Wrongspaced quotes?\n",
ali@70	2086	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2087	else
ali@56	2088	cnt_punct++;
ali@56	2089	}
ali@56	2090	}
ali@56	2091	}
ali@56	2092	}
ali@70	2093	if (g_utf8_get_char(aline)==CHAR_DQUOTE)
ali@56	2094	{
ali@70	2095	if (g_utf8_strchr(",;:!?)]} ",-1,
ali@70	2096	g_utf8_get_char(g_utf8_next_char(aline))))
ali@56	2097	{
ali@56	2098	if (pswit[ECHO_SWITCH])
ali@70	2099	g_print("\n%s\n",aline);
ali@56	2100	if (!pswit[OVERVIEW_SWITCH])
ali@70	2101	g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
ali@56	2102	linecnt);
ali@56	2103	else
ali@56	2104	cnt_punct++;
ali@56	2105	}
ali@56	2106	}
ali@56	2107	if (pswit[SQUOTE_SWITCH])
ali@56	2108	{
ali@70	2109	nc=g_utf8_get_char(aline);
ali@70	2110	for (s=aline;*s;s=g_utf8_next_char(s))
ali@56	2111	{
ali@70	2112	c=nc;
ali@70	2113	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70	2114	if ((c==CHAR_SQUOTE \|\| c==CHAR_OPEN_SQUOTE) && (s==aline \|\|
ali@70	2115	s>aline &&
ali@70	2116	!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) \|\|
ali@70	2117	!g_unichar_isalpha(nc)))
ali@56	2118	{
ali@56	2119	parities->squote=!parities->squote;
ali@56	2120	if (!parities->squote)
ali@56	2121	{
ali@56	2122	/* parity even */
ali@70	2123	if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
ali@56	2124	{
ali@56	2125	if (pswit[ECHO_SWITCH])
ali@70	2126	g_print("\n%s\n",aline);
ali@56	2127	if (!pswit[OVERVIEW_SWITCH])
ali@70	2128	g_print(" Line %ld column %ld - "
ali@56	2129	"Wrongspaced singlequotes?\n",
ali@70	2130	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2131	else
ali@56	2132	cnt_punct++;
ali@56	2133	}
ali@56	2134	}
ali@56	2135	else
ali@56	2136	{
ali@56	2137	/* parity odd */
ali@70	2138	if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
ali@70	2139	!g_utf8_strchr("_-/\".'`",-1,nc) \|\| !nc)
ali@56	2140	{
ali@56	2141	if (pswit[ECHO_SWITCH])
ali@70	2142	g_print("\n%s\n",aline);
ali@56	2143	if (!pswit[OVERVIEW_SWITCH])
ali@70	2144	g_print(" Line %ld column %ld - "
ali@56	2145	"Wrongspaced singlequotes?\n",
ali@70	2146	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2147	else
ali@56	2148	cnt_punct++;
ali@56	2149	}
ali@56	2150	}
ali@56	2151	}
ali@56	2152	}
ali@56	2153	}
ali@56	2154	}
ali@56	2155
ali@55	2156	/*
ali@57	2157	* check_for_double_punctuation:
ali@57	2158	*
ali@57	2159	* Look for double punctuation like ,. or ,,
ali@57	2160	* Thanks to DW for the suggestion!
ali@57	2161	* In books with references, ".," and ".;" are common
ali@57	2162	* e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57	2163	* OTOH, from my initial tests, there are also fairly
ali@57	2164	* common errors. What to do? Make these cases paranoid?
ali@57	2165	* ".," is the most common, so warnings->dotcomma is used
ali@57	2166	* to suppress detailed reporting if it occurs often.
ali@57	2167	*/
ali@57	2168	void check_for_double_punctuation(const char aline,struct warnings warnings)
ali@57	2169	{
ali@70	2170	const char *s;
ali@70	2171	gunichar c,nc;
ali@70	2172	nc=g_utf8_get_char(aline);
ali@70	2173	for (s=aline;*s;s=g_utf8_next_char(s))
ali@57	2174	{
ali@70	2175	c=nc;
ali@70	2176	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@57	2177	/* for each punctuation character in the line */
ali@70	2178	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
ali@70	2179	g_utf8_strchr(".?!,;:",-1,nc))
ali@57	2180	{
ali@57	2181	/* followed by punctuation, it's a query, unless . . . */
ali@70	2182	if (c==nc && (c=='.' \|\| c=='?' \|\| c=='!') \|\|
ali@70	2183	!warnings->dotcomma && c=='.' && nc==',' \|\|
ali@70	2184	warnings->isFrench && g_str_has_prefix(s,",...") \|\|
ali@70	2185	warnings->isFrench && g_str_has_prefix(s,"...,") \|\|
ali@70	2186	warnings->isFrench && g_str_has_prefix(s,";...") \|\|
ali@70	2187	warnings->isFrench && g_str_has_prefix(s,"...;") \|\|
ali@70	2188	warnings->isFrench && g_str_has_prefix(s,":...") \|\|
ali@70	2189	warnings->isFrench && g_str_has_prefix(s,"...:") \|\|
ali@70	2190	warnings->isFrench && g_str_has_prefix(s,"!...") \|\|
ali@70	2191	warnings->isFrench && g_str_has_prefix(s,"...!") \|\|
ali@70	2192	warnings->isFrench && g_str_has_prefix(s,"?...") \|\|
ali@70	2193	warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@57	2194	{
ali@70	2195	if (warnings->isFrench && g_str_has_prefix(s,",...") \|\|
ali@70	2196	warnings->isFrench && g_str_has_prefix(s,"...,") \|\|
ali@70	2197	warnings->isFrench && g_str_has_prefix(s,";...") \|\|
ali@70	2198	warnings->isFrench && g_str_has_prefix(s,"...;") \|\|
ali@70	2199	warnings->isFrench && g_str_has_prefix(s,":...") \|\|
ali@70	2200	warnings->isFrench && g_str_has_prefix(s,"...:") \|\|
ali@70	2201	warnings->isFrench && g_str_has_prefix(s,"!...") \|\|
ali@70	2202	warnings->isFrench && g_str_has_prefix(s,"...!") \|\|
ali@70	2203	warnings->isFrench && g_str_has_prefix(s,"?...") \|\|
ali@70	2204	warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@70	2205	{
ali@70	2206	s+=4;
ali@70	2207	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70	2208	}
ali@57	2209	; /* do nothing for .. !! and ?? which can be legit */
ali@57	2210	}
ali@57	2211	else
ali@57	2212	{
ali@57	2213	if (pswit[ECHO_SWITCH])
ali@70	2214	g_print("\n%s\n",aline);
ali@57	2215	if (!pswit[OVERVIEW_SWITCH])
ali@70	2216	g_print(" Line %ld column %ld - Double punctuation?\n",
ali@70	2217	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@57	2218	else
ali@57	2219	cnt_punct++;
ali@57	2220	}
ali@57	2221	}
ali@57	2222	}
ali@57	2223	}
ali@57	2224
ali@57	2225	/*
ali@58	2226	* check_for_spaced_quotes:
ali@58	2227	*/
ali@58	2228	void check_for_spaced_quotes(const char *aline)
ali@58	2229	{
ali@58	2230	const char s,t;
ali@58	2231	s=aline;
ali@58	2232	while ((t=strstr(s," \" ")))
ali@58	2233	{
ali@58	2234	if (pswit[ECHO_SWITCH])
ali@70	2235	g_print("\n%s\n",aline);
ali@58	2236	if (!pswit[OVERVIEW_SWITCH])
ali@70	2237	g_print(" Line %ld column %ld - Spaced doublequote?\n",
ali@70	2238	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58	2239	else
ali@58	2240	cnt_punct++;
ali@70	2241	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58	2242	}
ali@58	2243	s=aline;
ali@58	2244	while ((t=strstr(s," ' ")))
ali@58	2245	{
ali@58	2246	if (pswit[ECHO_SWITCH])
ali@70	2247	g_print("\n%s\n",aline);
ali@58	2248	if (!pswit[OVERVIEW_SWITCH])
ali@70	2249	g_print(" Line %ld column %ld - Spaced singlequote?\n",
ali@70	2250	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58	2251	else
ali@58	2252	cnt_punct++;
ali@70	2253	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58	2254	}
ali@58	2255	s=aline;
ali@58	2256	while ((t=strstr(s," ` ")))
ali@58	2257	{
ali@58	2258	if (pswit[ECHO_SWITCH])
ali@70	2259	g_print("\n%s\n",aline);
ali@58	2260	if (!pswit[OVERVIEW_SWITCH])
ali@70	2261	g_print(" Line %ld column %ld - Spaced singlequote?\n",
ali@70	2262	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58	2263	else
ali@58	2264	cnt_punct++;
ali@70	2265	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58	2266	}
ali@58	2267	}
ali@58	2268
ali@58	2269	/*
ali@59	2270	* check_for_miscased_genative:
ali@59	2271	*
ali@59	2272	* Check special case of 'S instead of 's at end of word.
ali@59	2273	*/
ali@59	2274	void check_for_miscased_genative(const char *aline)
ali@59	2275	{
ali@59	2276	const char *s;
ali@70	2277	gunichar c,nc,pc;
ali@69	2278	if (!*aline)
ali@69	2279	return;
ali@70	2280	c=g_utf8_get_char(aline);
ali@70	2281	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2282	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@59	2283	{
ali@70	2284	pc=c;
ali@70	2285	c=nc;
ali@70	2286	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70	2287	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
ali@59	2288	{
ali@59	2289	if (pswit[ECHO_SWITCH])
ali@70	2290	g_print("\n%s\n",aline);
ali@59	2291	if (!pswit[OVERVIEW_SWITCH])
ali@70	2292	g_print(" Line %ld column %ld - Capital \"S\"?\n",
ali@70	2293	linecnt,g_utf8_pointer_to_offset(aline,s)+2);
ali@59	2294	else
ali@59	2295	cnt_punct++;
ali@59	2296	}
ali@59	2297	}
ali@59	2298	}
ali@59	2299
ali@59	2300	/*
ali@60	2301	* check_end_of_line:
ali@60	2302	*
ali@60	2303	* Now check special cases - start and end of line -
ali@60	2304	* for single and double quotes. Start is sometimes [sic]
ali@60	2305	* but better to query it anyway.
ali@60	2306	* While we're here, check for dash at end of line.
ali@60	2307	*/
ali@60	2308	void check_end_of_line(const char aline,struct warnings warnings)
ali@60	2309	{
ali@70	2310	int lbytes;
ali@70	2311	const char *s;
ali@70	2312	gunichar c1,c2;
ali@70	2313	lbytes=strlen(aline);
ali@70	2314	if (g_utf8_strlen(aline,lbytes)>1)
ali@60	2315	{
ali@70	2316	s=g_utf8_prev_char(aline+lbytes);
ali@70	2317	c1=g_utf8_get_char(s);
ali@70	2318	c2=g_utf8_get_char(g_utf8_prev_char(s));
ali@70	2319	if ((c1==CHAR_DQUOTE \|\| c1==CHAR_SQUOTE \|\| c1==CHAR_OPEN_SQUOTE) &&
ali@70	2320	c2==CHAR_SPACE)
ali@60	2321	{
ali@60	2322	if (pswit[ECHO_SWITCH])
ali@70	2323	g_print("\n%s\n",aline);
ali@60	2324	if (!pswit[OVERVIEW_SWITCH])
ali@70	2325	g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
ali@70	2326	g_utf8_strlen(aline,lbytes));
ali@70	2327	else
ali@70	2328	cnt_punct++;
ali@70	2329	}
ali@70	2330	c1=g_utf8_get_char(aline);
ali@70	2331	c2=g_utf8_get_char(g_utf8_next_char(aline));
ali@70	2332	if ((c1==CHAR_SQUOTE \|\| c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
ali@70	2333	{
ali@70	2334	if (pswit[ECHO_SWITCH])
ali@70	2335	g_print("\n%s\n",aline);
ali@70	2336	if (!pswit[OVERVIEW_SWITCH])
ali@70	2337	g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
ali@60	2338	else
ali@60	2339	cnt_punct++;
ali@60	2340	}
ali@60	2341	/*
ali@60	2342	* Dash at end of line may well be legit - paranoid mode only
ali@60	2343	* and don't report em-dash at line-end.
ali@60	2344	*/
ali@60	2345	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@60	2346	{
ali@70	2347	for (s=g_utf8_prev_char(aline+lbytes);
ali@70	2348	s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
ali@60	2349	;
ali@70	2350	if (g_utf8_get_char(s)=='-' &&
ali@70	2351	g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@60	2352	{
ali@60	2353	if (pswit[ECHO_SWITCH])
ali@70	2354	g_print("\n%s\n",aline);
ali@60	2355	if (!pswit[OVERVIEW_SWITCH])
ali@70	2356	g_print(" Line %ld column %ld - "
ali@70	2357	"Hyphen at end of line?\n",
ali@70	2358	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@60	2359	}
ali@60	2360	}
ali@60	2361	}
ali@60	2362	}
ali@60	2363
ali@60	2364	/*
ali@61	2365	* check_for_unspaced_bracket:
ali@61	2366	*
ali@61	2367	* Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@61	2368	* If so, suspect a scanno like "a]most".
ali@61	2369	*/
ali@61	2370	void check_for_unspaced_bracket(const char *aline)
ali@61	2371	{
ali@70	2372	const char *s;
ali@70	2373	gunichar c,nc,pc;
ali@70	2374	c=g_utf8_get_char(aline);
ali@70	2375	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2376	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@61	2377	{
ali@70	2378	pc=c;
ali@70	2379	c=nc;
ali@70	2380	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70	2381	if (!nc)
ali@70	2382	break;
ali@61	2383	/* for each bracket character in the line except 1st & last */
ali@70	2384	if (g_utf8_strchr("{[()]}",-1,c) &&
ali@70	2385	g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
ali@61	2386	{
ali@61	2387	if (pswit[ECHO_SWITCH])
ali@70	2388	g_print("\n%s\n",aline);
ali@61	2389	if (!pswit[OVERVIEW_SWITCH])
ali@70	2390	g_print(" Line %ld column %ld - Unspaced bracket?\n",
ali@70	2391	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@61	2392	else
ali@61	2393	cnt_punct++;
ali@61	2394	}
ali@61	2395	}
ali@61	2396	}
ali@61	2397
ali@61	2398	/*
ali@62	2399	* check_for_unpunctuated_endquote:
ali@62	2400	*/
ali@62	2401	void check_for_unpunctuated_endquote(const char *aline)
ali@62	2402	{
ali@70	2403	const char *s;
ali@70	2404	gunichar c,nc,pc;
ali@70	2405	c=g_utf8_get_char(aline);
ali@70	2406	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2407	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@62	2408	{
ali@70	2409	pc=c;
ali@70	2410	c=nc;
ali@70	2411	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@62	2412	/* for each character in the line except 1st */
ali@70	2413	if (c==CHAR_DQUOTE && isalpha(pc))
ali@62	2414	{
ali@62	2415	if (pswit[ECHO_SWITCH])
ali@70	2416	g_print("\n%s\n",aline);
ali@62	2417	if (!pswit[OVERVIEW_SWITCH])
ali@70	2418	g_print(" Line %ld column %ld - "
ali@70	2419	"endquote missing punctuation?\n",
ali@70	2420	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@62	2421	else
ali@62	2422	cnt_punct++;
ali@62	2423	}
ali@62	2424	}
ali@62	2425	}
ali@62	2426
ali@62	2427	/*
ali@63	2428	* check_for_html_tag:
ali@63	2429	*
ali@63	2430	* Check for <HTML TAG>.
ali@63	2431	*
ali@63	2432	* If there is a < in the line, followed at some point
ali@63	2433	* by a > then we suspect HTML.
ali@63	2434	*/
ali@63	2435	void check_for_html_tag(const char *aline)
ali@63	2436	{
ali@63	2437	const char open,close;
ali@70	2438	gchar *tag;
ali@70	2439	open=strchr(aline,'<');
ali@63	2440	if (open)
ali@63	2441	{
ali@70	2442	close=strchr(g_utf8_next_char(open),'>');
ali@63	2443	if (close)
ali@63	2444	{
ali@70	2445	if (pswit[ECHO_SWITCH])
ali@70	2446	g_print("\n%s\n",aline);
ali@70	2447	if (!pswit[OVERVIEW_SWITCH])
ali@63	2448	{
ali@70	2449	tag=g_strndup(open,close-open+1);
ali@70	2450	g_print(" Line %ld column %ld - HTML Tag? %s \n",
ali@70	2451	linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
ali@70	2452	g_free(tag);
ali@63	2453	}
ali@70	2454	else
ali@70	2455	cnt_html++;
ali@63	2456	}
ali@63	2457	}
ali@63	2458	}
ali@63	2459
ali@63	2460	/*
ali@64	2461	* check_for_html_entity:
ali@64	2462	*
ali@64	2463	* Check for &symbol; HTML.
ali@64	2464	*
ali@64	2465	* If there is a & in the line, followed at
ali@64	2466	* some point by a ; then we suspect HTML.
ali@64	2467	*/
ali@64	2468	void check_for_html_entity(const char *aline)
ali@64	2469	{
ali@64	2470	const char s,amp,*scolon;
ali@70	2471	gchar *entity;
ali@70	2472	amp=strchr(aline,'&');
ali@64	2473	if (amp)
ali@64	2474	{
ali@70	2475	scolon=strchr(amp,';');
ali@64	2476	if (scolon)
ali@64	2477	{
ali@70	2478	for (s=amp;s<scolon;s=g_utf8_next_char(s))
ali@70	2479	if (g_utf8_get_char(s)==CHAR_SPACE)
ali@70	2480	break; /* Don't report "Jones & Son;" */
ali@70	2481	if (s>=scolon)
ali@64	2482	{
ali@64	2483	if (pswit[ECHO_SWITCH])
ali@70	2484	g_print("\n%s\n",aline);
ali@64	2485	if (!pswit[OVERVIEW_SWITCH])
ali@70	2486	{
ali@70	2487	entity=g_strndup(amp,scolon-amp+1);
ali@70	2488	g_print(" Line %ld column %d - HTML symbol? %s \n",
ali@70	2489	linecnt,(int)(amp-aline)+1,entity);
ali@70	2490	g_free(entity);
ali@70	2491	}
ali@64	2492	else
ali@64	2493	cnt_html++;
ali@64	2494	}
ali@64	2495	}
ali@64	2496	}
ali@64	2497	}
ali@64	2498
ali@65	2499	/*
ali@65	2500	* print_pending:
ali@65	2501	*
ali@65	2502	* If we are in a state of unbalanced quotes, and this line
ali@65	2503	* doesn't begin with a quote, output the stored error message.
ali@65	2504	* If the -P switch was used, print the warning even if the
ali@65	2505	* new para starts with quotes.
ali@65	2506	*/
ali@65	2507	void print_pending(const char aline,const char parastart,
ali@65	2508	struct pending *pending)
ali@65	2509	{
ali@65	2510	const char *s;
ali@70	2511	gunichar c;
ali@65	2512	s=aline;
ali@65	2513	while (*s==' ')
ali@65	2514	s++;
ali@70	2515	c=g_utf8_get_char(s);
ali@69	2516	if (pending->dquote)
ali@69	2517	{
ali@70	2518	if (c!=CHAR_DQUOTE \|\| pswit[QPARA_SWITCH])
ali@65	2519	{
ali@65	2520	if (!pswit[OVERVIEW_SWITCH])
ali@65	2521	{
ali@65	2522	if (pswit[ECHO_SWITCH])
ali@70	2523	g_print("\n%s\n",parastart);
ali@70	2524	g_print("%s\n",pending->dquote);
ali@65	2525	}
ali@65	2526	else
ali@65	2527	cnt_dquot++;
ali@65	2528	}
ali@69	2529	g_free(pending->dquote);
ali@69	2530	pending->dquote=NULL;
ali@69	2531	}
ali@69	2532	if (pending->squote)
ali@65	2533	{
ali@70	2534	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE \|\| pswit[QPARA_SWITCH] \|\|
ali@65	2535	pending->squot)
ali@65	2536	{
ali@65	2537	if (!pswit[OVERVIEW_SWITCH])
ali@65	2538	{
ali@65	2539	if (pswit[ECHO_SWITCH])
ali@70	2540	g_print("\n%s\n",parastart);
ali@70	2541	g_print("%s\n",pending->squote);
ali@65	2542	}
ali@65	2543	else
ali@65	2544	cnt_squot++;
ali@65	2545	}
ali@69	2546	g_free(pending->squote);
ali@69	2547	pending->squote=NULL;
ali@65	2548	}
ali@69	2549	if (pending->rbrack)
ali@65	2550	{
ali@65	2551	if (!pswit[OVERVIEW_SWITCH])
ali@65	2552	{
ali@65	2553	if (pswit[ECHO_SWITCH])
ali@70	2554	g_print("\n%s\n",parastart);
ali@70	2555	g_print("%s\n",pending->rbrack);
ali@65	2556	}
ali@65	2557	else
ali@65	2558	cnt_brack++;
ali@69	2559	g_free(pending->rbrack);
ali@69	2560	pending->rbrack=NULL;
ali@65	2561	}
ali@69	2562	if (pending->sbrack)
ali@65	2563	{
ali@65	2564	if (!pswit[OVERVIEW_SWITCH])
ali@65	2565	{
ali@65	2566	if (pswit[ECHO_SWITCH])
ali@70	2567	g_print("\n%s\n",parastart);
ali@70	2568	g_print("%s\n",pending->sbrack);
ali@65	2569	}
ali@65	2570	else
ali@65	2571	cnt_brack++;
ali@69	2572	g_free(pending->sbrack);
ali@69	2573	pending->sbrack=NULL;
ali@65	2574	}
ali@69	2575	if (pending->cbrack)
ali@65	2576	{
ali@65	2577	if (!pswit[OVERVIEW_SWITCH])
ali@65	2578	{
ali@65	2579	if (pswit[ECHO_SWITCH])
ali@70	2580	g_print("\n%s\n",parastart);
ali@70	2581	g_print("%s\n",pending->cbrack);
ali@65	2582	}
ali@65	2583	else
ali@65	2584	cnt_brack++;
ali@69	2585	g_free(pending->cbrack);
ali@69	2586	pending->cbrack=NULL;
ali@65	2587	}
ali@69	2588	if (pending->unders)
ali@65	2589	{
ali@65	2590	if (!pswit[OVERVIEW_SWITCH])
ali@65	2591	{
ali@65	2592	if (pswit[ECHO_SWITCH])
ali@70	2593	g_print("\n%s\n",parastart);
ali@70	2594	g_print("%s\n",pending->unders);
ali@65	2595	}
ali@65	2596	else
ali@65	2597	cnt_brack++;
ali@69	2598	g_free(pending->unders);
ali@69	2599	pending->unders=NULL;
ali@65	2600	}
ali@65	2601	}
ali@65	2602
ali@65	2603	/*
ali@65	2604	* check_for_mismatched_quotes:
ali@65	2605	*
ali@65	2606	* At end of paragraph, check for mismatched quotes.
ali@65	2607	*
ali@65	2608	* We don't want to report an error immediately, since it is a
ali@65	2609	* common convention to omit the quotes at end of paragraph if
ali@65	2610	* the next paragraph is a continuation of the same speaker.
ali@65	2611	* Where this is the case, the next para should begin with a
ali@65	2612	* quote, so we store the warning message and only display it
ali@65	2613	* at the top of the next iteration if the new para doesn't
ali@65	2614	* start with a quote.
ali@65	2615	* The -p switch overrides this default, and warns of unclosed
ali@65	2616	* quotes on _every_ paragraph, whether the next begins with a
ali@65	2617	* quote or not.
ali@65	2618	*/
ali@65	2619	void check_for_mismatched_quotes(const struct counters *counters,
ali@65	2620	struct pending *pending)
ali@65	2621	{
ali@65	2622	if (counters->quot%2)
ali@69	2623	pending->dquote=
ali@69	2624	g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
ali@65	2625	if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
ali@65	2626	counters->open_single_quote!=counters->close_single_quote)
ali@69	2627	pending->squote=
ali@69	2628	g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
ali@65	2629	if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
ali@65	2630	counters->open_single_quote!=counters->close_single_quote &&
ali@65	2631	counters->open_single_quote!=counters->close_single_quote+1)
ali@65	2632	/*
ali@65	2633	* Flag it to be noted regardless of the
ali@65	2634	* first char of the next para.
ali@65	2635	*/
ali@65	2636	pending->squot=1;
ali@65	2637	if (counters->r_brack)
ali@69	2638	pending->rbrack=
ali@69	2639	g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
ali@65	2640	if (counters->s_brack)
ali@69	2641	pending->sbrack=
ali@69	2642	g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
ali@65	2643	if (counters->c_brack)
ali@69	2644	pending->cbrack=
ali@69	2645	g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
ali@65	2646	if (counters->c_unders%2)
ali@69	2647	pending->unders=
ali@69	2648	g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
ali@65	2649	}
ali@65	2650
ali@64	2651	/*
ali@66	2652	* check_for_omitted_punctuation:
ali@66	2653	*
ali@66	2654	* Check for omitted punctuation at end of paragraph by working back
ali@66	2655	* through prevline. DW.
ali@66	2656	* Need to check this only for "normal" paras.
ali@66	2657	* So what is a "normal" para?
ali@66	2658	* Not normal if one-liner (chapter headings, etc.)
ali@66	2659	* Not normal if doesn't contain at least one locase letter
ali@66	2660	* Not normal if starts with space
ali@66	2661	*/
ali@66	2662	void check_for_omitted_punctuation(const char *prevline,
ali@66	2663	struct line_properties *last,int start_para_line)
ali@66	2664	{
ali@70	2665	gboolean letter_on_line=FALSE;
ali@66	2666	const char *s;
ali@70	2667	for (s=prevline;*s;s=g_utf8_next_char(s))
ali@70	2668	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70	2669	{
ali@70	2670	letter_on_line=TRUE;
ali@70	2671	break;
ali@70	2672	}
ali@66	2673	/*
ali@66	2674	* This next "if" is a problem.
ali@66	2675	* If we say "start_para_line <= linecnt - 1", that includes
ali@66	2676	* one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@66	2677	* If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@66	2678	* misses genuine one-line paragraphs.
ali@66	2679	*/
ali@70	2680	if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
ali@70	2681	g_utf8_get_char(prevline)>CHAR_SPACE)
ali@66	2682	{
ali@70	2683	for (s=g_utf8_prev_char(prevline+strlen(prevline));
ali@70	2684	(g_utf8_get_char(s)==CHAR_DQUOTE \|\|
ali@70	2685	g_utf8_get_char(s)==CHAR_SQUOTE) &&
ali@70	2686	g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
ali@70	2687	s=g_utf8_prev_char(s))
ali@66	2688	;
ali@70	2689	for (;s>prevline;s=g_utf8_prev_char(s))
ali@66	2690	{
ali@70	2691	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@66	2692	{
ali@66	2693	if (pswit[ECHO_SWITCH])
ali@70	2694	g_print("\n%s\n",prevline);
ali@66	2695	if (!pswit[OVERVIEW_SWITCH])
ali@70	2696	g_print(" Line %ld column %ld - "
ali@66	2697	"No punctuation at para end?\n",
ali@70	2698	linecnt-1,g_utf8_strlen(prevline,-1));
ali@66	2699	else
ali@66	2700	cnt_punct++;
ali@66	2701	break;
ali@66	2702	}
ali@70	2703	if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
ali@66	2704	break;
ali@66	2705	}
ali@66	2706	}
ali@66	2707	}
ali@66	2708
ali@69	2709	gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
ali@69	2710	{
ali@69	2711	const char *word=key;
ali@69	2712	int *dupcnt=value;
ali@69	2713	if (*dupcnt)
ali@70	2714	g_print("\nNote: Queried word %s was duplicated %d times\n",
ali@69	2715	word,*dupcnt);
ali@69	2716	return FALSE;
ali@69	2717	}
ali@69	2718
ali@70	2719	void print_as_windows_1252(const char *string)
ali@70	2720	{
ali@70	2721	gsize inbytes,outbytes;
ali@70	2722	gchar buf,bp;
ali@70	2723	GIConv converter=(GIConv)-1;
ali@70	2724	if (!string)
ali@70	2725	{
ali@70	2726	if (converter!=(GIConv)-1)
ali@70	2727	g_iconv_close(converter);
ali@70	2728	converter=(GIConv)-1;
ali@70	2729	return;
ali@70	2730	}
ali@70	2731	if (converter=(GIConv)-1)
ali@70	2732	converter=g_iconv_open("WINDOWS-1252","UTF-8");
ali@70	2733	if (converter!=(GIConv)-1)
ali@70	2734	{
ali@70	2735	inbytes=outbytes=strlen(string);
ali@70	2736	bp=buf=g_malloc(outbytes+1);
ali@70	2737	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
ali@70	2738	*bp='\0';
ali@70	2739	fputs(buf,stdout);
ali@70	2740	g_free(buf);
ali@70	2741	}
ali@70	2742	else
ali@70	2743	fputs(string,stdout);
ali@70	2744	}
ali@70	2745
ali@66	2746	/*
ali@41	2747	* procfile:
ali@41	2748	*
ali@41	2749	* Process one file.
ali@41	2750	*/
ali@69	2751	void procfile(const char *filename)
ali@41	2752	{
ali@65	2753	const char *s;
ali@69	2754	gchar parastart=NULL; / first line of current para */
ali@69	2755	gchar etext,aline;
ali@69	2756	gchar *etext_ptr;
ali@69	2757	GError *err=NULL;
ali@41	2758	struct first_pass_results *first_pass_results;
ali@42	2759	struct warnings *warnings;
ali@43	2760	struct counters counters={0};
ali@45	2761	struct line_properties last={0};
ali@56	2762	struct parities parities={0};
ali@69	2763	struct pending pending={0};
ali@69	2764	gboolean isemptyline;
ali@68	2765	long start_para_line=0;
ali@69	2766	gboolean isnewpara=FALSE,enddash=FALSE;
ali@45	2767	last.start=CHAR_SPACE;
ali@68	2768	linecnt=checked_linecnt=0;
ali@69	2769	etext=read_etext(filename,&err);
ali@69	2770	if (!etext)
ali@41	2771	{
ali@68	2772	if (pswit[STDOUT_SWITCH])
ali@69	2773	fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
ali@68	2774	else
ali@69	2775	fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
ali@41	2776	exit(1);
ali@41	2777	}
ali@70	2778	g_set_print_handler(print_as_windows_1252);
ali@70	2779	g_print("\n\nFile: %s\n\n",filename);
ali@69	2780	first_pass_results=first_pass(etext);
ali@42	2781	warnings=report_first_pass(first_pass_results);
ali@69	2782	qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
ali@69	2783	qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@40	2784	/*
ali@40	2785	* Here we go with the main pass. Hold onto yer hat!
ali@40	2786	*/
ali@65	2787	linecnt=0;
ali@69	2788	etext_ptr=etext;
ali@69	2789	while ((aline=flgets(&etext_ptr,linecnt+1)))
ali@40	2790	{
ali@68	2791	linecnt++;
ali@68	2792	if (linecnt==1)
ali@69	2793	isnewpara=TRUE;
ali@70	2794	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
ali@40	2795	continue; // skip DP page separators completely
ali@68	2796	if (linecnt<first_pass_results->firstline \|\|
ali@41	2797	(first_pass_results->footerline>0 &&
ali@41	2798	linecnt>first_pass_results->footerline))
ali@40	2799	{
ali@68	2800	if (pswit[HEADER_SWITCH])
ali@40	2801	{
ali@70	2802	if (g_str_has_prefix(aline,"Title:"))
ali@70	2803	g_print(" %s\n",aline);
ali@70	2804	if (g_str_has_prefix(aline,"Author:"))
ali@70	2805	g_print(" %s\n",aline);
ali@70	2806	if (g_str_has_prefix(aline,"Release Date:"))
ali@70	2807	g_print(" %s\n",aline);
ali@70	2808	if (g_str_has_prefix(aline,"Edition:"))
ali@70	2809	g_print(" %s\n\n",aline);
ali@40	2810	}
ali@68	2811	continue; /* skip through the header */
ali@40	2812	}
ali@68	2813	checked_linecnt++;
ali@65	2814	print_pending(aline,parastart,&pending);
ali@65	2815	memset(&pending,0,sizeof(pending));
ali@43	2816	isemptyline=analyse_quotes(aline,&counters);
ali@68	2817	if (isnewpara && !isemptyline)
ali@40	2818	{
ali@40	2819	/* This line is the start of a new paragraph. */
ali@68	2820	start_para_line=linecnt;
ali@40	2821	/* Capture its first line in case we want to report it later. */
ali@69	2822	g_free(parastart);
ali@69	2823	parastart=g_strdup(aline);
ali@56	2824	memset(&parities,0,sizeof(parities)); /* restart the quote count */
ali@68	2825	s=aline;
ali@70	2826	while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
ali@70	2827	!g_unichar_isdigit(g_utf8_get_char(s)))
ali@70	2828	s=g_utf8_next_char(s);
ali@70	2829	if (g_unichar_islower(g_utf8_get_char(s)))
ali@40	2830	{
ali@40	2831	/* and its first letter is lowercase */
ali@68	2832	if (pswit[ECHO_SWITCH])
ali@70	2833	g_print("\n%s\n",aline);
ali@68	2834	if (!pswit[OVERVIEW_SWITCH])
ali@70	2835	g_print(" Line %ld column %ld - "
ali@40	2836	"Paragraph starts with lower-case\n",
ali@70	2837	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@68	2838	else
ali@68	2839	cnt_punct++;
ali@40	2840	}
ali@69	2841	isnewpara=FALSE; /* Signal the end of new para processing. */
ali@40	2842	}
ali@68	2843	/* Check for an em-dash broken at line end. */
ali@70	2844	if (enddash && g_utf8_get_char(aline)=='-')
ali@40	2845	{
ali@68	2846	if (pswit[ECHO_SWITCH])
ali@70	2847	g_print("\n%s\n",aline);
ali@68	2848	if (!pswit[OVERVIEW_SWITCH])
ali@70	2849	g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@68	2850	else
ali@68	2851	cnt_punct++;
ali@40	2852	}
ali@69	2853	enddash=FALSE;
ali@70	2854	for (s=g_utf8_prev_char(aline+strlen(aline));
ali@70	2855	g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
ali@40	2856	;
ali@70	2857	if (s>=aline && g_utf8_get_char(s)=='-')
ali@69	2858	enddash=TRUE;
ali@67	2859	check_for_control_characters(aline);
ali@68	2860	if (warnings->bin)
ali@44	2861	check_for_odd_characters(aline,warnings,isemptyline);
ali@68	2862	if (warnings->longline)
ali@45	2863	check_for_long_line(aline);
ali@68	2864	if (warnings->shortline)
ali@45	2865	check_for_short_line(aline,&last);
ali@68	2866	last.blen=last.len;
ali@70	2867	last.len=g_utf8_strlen(aline,-1);
ali@70	2868	last.start=g_utf8_get_char(aline);
ali@46	2869	check_for_starting_punctuation(aline);
ali@68	2870	if (warnings->dash)
ali@40	2871	{
ali@47	2872	check_for_spaced_emdash(aline);
ali@47	2873	check_for_spaced_dash(aline);
ali@40	2874	}
ali@48	2875	check_for_unmarked_paragraphs(aline);
ali@49	2876	check_for_jeebies(aline);
ali@50	2877	check_for_mta_from(aline);
ali@51	2878	check_for_orphan_character(aline);
ali@52	2879	check_for_pling_scanno(aline);
ali@53	2880	check_for_extra_period(aline,warnings);
ali@54	2881	check_for_following_punctuation(aline);
ali@55	2882	check_for_typos(aline,warnings);
ali@56	2883	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57	2884	check_for_double_punctuation(aline,warnings);
ali@58	2885	check_for_spaced_quotes(aline);
ali@59	2886	check_for_miscased_genative(aline);
ali@60	2887	check_end_of_line(aline,warnings);
ali@61	2888	check_for_unspaced_bracket(aline);
ali@68	2889	if (warnings->endquote)
ali@62	2890	check_for_unpunctuated_endquote(aline);
ali@63	2891	check_for_html_tag(aline);
ali@64	2892	check_for_html_entity(aline);
ali@68	2893	if (isemptyline)
ali@40	2894	{
ali@65	2895	check_for_mismatched_quotes(&counters,&pending);
ali@43	2896	memset(&counters,0,sizeof(counters));
ali@40	2897	/* let the next iteration know that it's starting a new para */
ali@69	2898	isnewpara=TRUE;
ali@69	2899	if (prevline)
ali@69	2900	check_for_omitted_punctuation(prevline,&last,start_para_line);
ali@40	2901	}
ali@69	2902	g_free(prevline);
ali@69	2903	prevline=g_strdup(aline);
ali@0	2904	}
ali@69	2905	if (prevline)
ali@69	2906	{
ali@69	2907	g_free(prevline);
ali@69	2908	prevline=NULL;
ali@69	2909	}
ali@69	2910	g_free(parastart);
ali@69	2911	g_free(prevline);
ali@69	2912	g_free(etext);
ali@0	2913	if (!pswit[OVERVIEW_SWITCH])
ali@69	2914	g_tree_foreach(qword,report_duplicate_queries,NULL);
ali@69	2915	g_tree_unref(qword);
ali@69	2916	g_tree_unref(qperiod);
ali@70	2917	g_set_print_handler(NULL);
ali@70	2918	print_as_windows_1252(NULL);
ali@0	2919	}
ali@0	2920
ali@40	2921	/*
ali@40	2922	* flgets:
ali@40	2923	*
ali@69	2924	* Get one line from the input text, checking for
ali@40	2925	* the existence of exactly one CR/LF line-end per line.
ali@40	2926	*
ali@40	2927	* Returns: a pointer to the line.
ali@40	2928	*/
ali@69	2929	char flgets(char *etext,long lcnt)
ali@0	2930	{
ali@70	2931	gunichar c;
ali@69	2932	gboolean isCR=FALSE;
ali@69	2933	char theline=etext;
ali@70	2934	char *eos=theline;
ali@70	2935	gchar *s;
ali@70	2936	for (;;)
ali@40	2937	{
ali@70	2938	c=g_utf8_get_char(*etext);
ali@70	2939	etext=g_utf8_next_char(etext);
ali@69	2940	if (!c)
ali@68	2941	return NULL;
ali@40	2942	/* either way, it's end of line */
ali@69	2943	if (c=='\n')
ali@40	2944	{
ali@68	2945	if (isCR)
ali@68	2946	break;
ali@68	2947	else
ali@40	2948	{
ali@40	2949	/* Error - a LF without a preceding CR */
ali@68	2950	if (pswit[LINE_END_SWITCH])
ali@40	2951	{
ali@68	2952	if (pswit[ECHO_SWITCH])
ali@70	2953	{
ali@70	2954	s=g_strndup(theline,eos-theline);
ali@70	2955	g_print("\n%s\n",s);
ali@70	2956	g_free(s);
ali@70	2957	}
ali@68	2958	if (!pswit[OVERVIEW_SWITCH])
ali@70	2959	g_print(" Line %ld - No CR?\n",lcnt);
ali@68	2960	else
ali@68	2961	cnt_lineend++;
ali@40	2962	}
ali@68	2963	break;
ali@40	2964	}
ali@40	2965	}
ali@69	2966	if (c=='\r')
ali@40	2967	{
ali@68	2968	if (isCR)
ali@40	2969	{
ali@40	2970	/* Error - two successive CRs */
ali@68	2971	if (pswit[LINE_END_SWITCH])
ali@40	2972	{
ali@68	2973	if (pswit[ECHO_SWITCH])
ali@70	2974	{
ali@70	2975	s=g_strndup(theline,eos-theline);
ali@70	2976	g_print("\n%s\n",s);
ali@70	2977	g_free(s);
ali@70	2978	}
ali@68	2979	if (!pswit[OVERVIEW_SWITCH])
ali@70	2980	g_print(" Line %ld - Two successive CRs?\n",lcnt);
ali@68	2981	else
ali@68	2982	cnt_lineend++;
ali@40	2983	}
ali@40	2984	}
ali@69	2985	isCR=TRUE;
ali@40	2986	}
ali@68	2987	else
ali@40	2988	{
ali@68	2989	if (pswit[LINE_END_SWITCH] && isCR)
ali@40	2990	{
ali@68	2991	if (pswit[ECHO_SWITCH])
ali@70	2992	{
ali@70	2993	s=g_strndup(theline,eos-theline);
ali@70	2994	g_print("\n%s\n",s);
ali@70	2995	g_free(s);
ali@70	2996	}
ali@68	2997	if (!pswit[OVERVIEW_SWITCH])
ali@70	2998	g_print(" Line %ld column %ld - CR without LF?\n",
ali@70	2999	lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
ali@68	3000	else
ali@68	3001	cnt_lineend++;
ali@70	3002	*eos=' ';
ali@40	3003	}
ali@69	3004	isCR=FALSE;
ali@70	3005	eos=g_utf8_next_char(eos);
ali@40	3006	}
ali@69	3007	}
ali@70	3008	*eos='\0';
ali@0	3009	if (pswit[MARKUP_SWITCH])
ali@68	3010	postprocess_for_HTML(theline);
ali@0	3011	if (pswit[DP_SWITCH])
ali@68	3012	postprocess_for_DP(theline);
ali@40	3013	return theline;
ali@0	3014	}
ali@0	3015
ali@40	3016	/*
ali@40	3017	* mixdigit:
ali@40	3018	*
ali@40	3019	* Takes a "word" as a parameter, and checks whether it
ali@40	3020	* contains a mixture of alpha and digits. Generally, this is an
ali@40	3021	* error, but may not be for cases like 4th or L5 12s. 3d.
ali@40	3022	*
ali@70	3023	* Returns: TRUE iff an is error found.
ali@40	3024	*/
ali@70	3025	gboolean mixdigit(const char *checkword)
ali@0	3026	{
ali@70	3027	gboolean wehaveadigit,wehavealetter,query;
ali@70	3028	const char s,nondigit;
ali@70	3029	wehaveadigit=wehavealetter=query=FALSE;
ali@70	3030	for (s=checkword;*s;s=g_utf8_next_char(s))
ali@70	3031	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70	3032	wehavealetter=TRUE;
ali@70	3033	else if (g_unichar_isdigit(g_utf8_get_char(s)))
ali@70	3034	wehaveadigit=TRUE;
ali@40	3035	if (wehaveadigit && wehavealetter)
ali@40	3036	{
ali@40	3037	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
ali@70	3038	query=TRUE;
ali@70	3039	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
ali@70	3040	nondigit=g_utf8_next_char(nondigit))
ali@68	3041	;
ali@68	3042	/* digits, ending in st, rd, nd, th of either case */
ali@70	3043	if (!g_ascii_strcasecmp(nondigit,"st") \|\|
ali@70	3044	!g_ascii_strcasecmp(nondigit,"rd") \|\|
ali@70	3045	!g_ascii_strcasecmp(nondigit,"nd") \|\|
ali@70	3046	!g_ascii_strcasecmp(nondigit,"th"))
ali@70	3047	query=FALSE;
ali@70	3048	if (!g_ascii_strcasecmp(nondigit,"sts") \|\|
ali@70	3049	!g_ascii_strcasecmp(nondigit,"rds") \|\|
ali@70	3050	!g_ascii_strcasecmp(nondigit,"nds") \|\|
ali@70	3051	!g_ascii_strcasecmp(nondigit,"ths"))
ali@70	3052	query=FALSE;
ali@70	3053	if (!g_ascii_strcasecmp(nondigit,"stly") \|\|
ali@70	3054	!g_ascii_strcasecmp(nondigit,"rdly") \|\|
ali@70	3055	!g_ascii_strcasecmp(nondigit,"ndly") \|\|
ali@70	3056	!g_ascii_strcasecmp(nondigit,"thly"))
ali@70	3057	query=FALSE;
ali@68	3058	/* digits, ending in l, L, s or d */
ali@70	3059	if (!g_ascii_strcasecmp(nondigit,"l") \|\| !strcmp(nondigit,"s") \|\|
ali@70	3060	!strcmp(nondigit,"d"))
ali@70	3061	query=FALSE;
ali@68	3062	/*
ali@40	3063	* L at the start of a number, representing Britsh pounds, like L500.
ali@70	3064	* This is cute. We know the current word is mixed digit. If the first
ali@68	3065	* letter is L, there must be at least one digit following. If both
ali@68	3066	* digits and letters follow, we have a genuine error, else we have a
ali@68	3067	* capital L followed by digits, and we accept that as a non-error.
ali@40	3068	*/
ali@70	3069	if (g_utf8_get_char(checkword)=='L' &&
ali@70	3070	!mixdigit(g_utf8_next_char(checkword)))
ali@70	3071	query=FALSE;
ali@40	3072	}
ali@40	3073	return query;
ali@0	3074	}
ali@0	3075
ali@40	3076	/*
ali@40	3077	* getaword:
ali@40	3078	*
ali@69	3079	* Extracts the first/next "word" from the line, and returns it.
ali@69	3080	* A word is defined as one English word unit--or at least that's the aim.
ali@69	3081	* "ptr" is advanced to the position in the line where we will start
ali@69	3082	* looking for the next word.
ali@40	3083	*
ali@69	3084	* Returns: A newly-allocated string.
ali@40	3085	*/
ali@69	3086	gchar getaword(const char *ptr)
ali@0	3087	{
ali@70	3088	const char s,t;
ali@69	3089	GString *word;
ali@70	3090	gunichar c,pc;
ali@69	3091	word=g_string_new(NULL);
ali@70	3092	for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
ali@70	3093	!g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
ali@70	3094	*ptr;ptr=g_utf8_next_char(*ptr))
ali@40	3095	;
ali@40	3096	/*
ali@40	3097	* Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40	3098	* Especially yucky is the case of L1,000
ali@40	3099	* This section looks for a pattern of characters including a digit
ali@40	3100	* followed by a comma or period followed by one or more digits.
ali@40	3101	* If found, it returns this whole pattern as a word; otherwise we discard
ali@40	3102	* the results and resume our normal programming.
ali@40	3103	*/
ali@69	3104	s=*ptr;
ali@70	3105	for (;g_unichar_isdigit(g_utf8_get_char(s)) \|\|
ali@70	3106	g_unichar_isalpha(g_utf8_get_char(s)) \|\|
ali@70	3107	g_utf8_get_char(s)==',' \|\| g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
ali@70	3108	g_string_append_unichar(word,g_utf8_get_char(s));
ali@70	3109	for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
ali@70	3110	t=g_utf8_next_char(t))
ali@40	3111	{
ali@70	3112	c=g_utf8_get_char(t);
ali@70	3113	pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@70	3114	if ((c=='.' \|\| c==',') && g_unichar_isdigit(pc))
ali@40	3115	{
ali@70	3116	*ptr=s;
ali@70	3117	return g_string_free(word,FALSE);
ali@40	3118	}
ali@40	3119	}
ali@0	3120	/* we didn't find a punctuated number - do the regular getword thing */
ali@69	3121	g_string_truncate(word,0);
ali@70	3122	for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) \|\|
ali@70	3123	g_unichar_isalpha(g_utf8_get_char(*ptr)) \|\|
ali@70	3124	g_utf8_get_char(ptr)=='\'';ptr=g_utf8_next_char(*ptr))
ali@70	3125	g_string_append_unichar(word,g_utf8_get_char(*ptr));
ali@69	3126	return g_string_free(word,FALSE);
ali@0	3127	}
ali@0	3128
ali@40	3129	/*
ali@40	3130	* isroman:
ali@40	3131	*
ali@40	3132	* Is this word a Roman Numeral?
ali@40	3133	*
ali@40	3134	* It doesn't actually validate that the number is a valid Roman Numeral--for
ali@40	3135	* example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
ali@40	3136	* what we're here to do. If it passes this, it LOOKS like a Roman numeral.
ali@40	3137	* Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
ali@40	3138	* expressions thereof, except when it came to taxes. Allow any number of M,
ali@40	3139	* an optional D, an optional CM or CD, any number of optional Cs, an optional
ali@40	3140	* XL or an optional XC, an optional IX or IV, an optional V and any number
ali@40	3141	* of optional Is.
ali@40	3142	*/
ali@69	3143	gboolean isroman(const char *t)
ali@0	3144	{
ali@69	3145	const char *s;
ali@40	3146	if (!t \|\| !*t)
ali@69	3147	return FALSE;
ali@40	3148	s=t;
ali@70	3149	while (g_utf8_get_char(t)=='m' && *t)
ali@40	3150	t++;
ali@70	3151	if (g_utf8_get_char(t)=='d')
ali@40	3152	t++;
ali@70	3153	if (g_str_has_prefix(t,"cm"))
ali@40	3154	t+=2;
ali@70	3155	if (g_str_has_prefix(t,"cd"))
ali@40	3156	t+=2;
ali@70	3157	while (g_utf8_get_char(t)=='c' && *t)
ali@40	3158	t++;
ali@70	3159	if (g_str_has_prefix(t,"xl"))
ali@40	3160	t+=2;
ali@70	3161	if (g_str_has_prefix(t,"xc"))
ali@40	3162	t+=2;
ali@70	3163	if (g_utf8_get_char(t)=='l')
ali@40	3164	t++;
ali@70	3165	while (g_utf8_get_char(t)=='x' && *t)
ali@40	3166	t++;
ali@70	3167	if (g_str_has_prefix(t,"ix"))
ali@40	3168	t+=2;
ali@70	3169	if (g_str_has_prefix(t,"iv"))
ali@40	3170	t+=2;
ali@70	3171	if (g_utf8_get_char(t)=='v')
ali@40	3172	t++;
ali@70	3173	while (g_utf8_get_char(t)=='i' && *t)
ali@40	3174	t++;
ali@40	3175	return !*t;
ali@0	3176	}
ali@0	3177
ali@40	3178	/*
ali@40	3179	* postprocess_for_DP:
ali@40	3180	*
ali@40	3181	* Invoked with the -d switch from flgets().
ali@40	3182	* It simply "removes" from the line a hard-coded set of common
ali@40	3183	* DP-specific tags, so that the line passed to the main routine has
ali@40	3184	* been pre-cleaned of DP markup.
ali@40	3185	*/
ali@0	3186	void postprocess_for_DP(char *theline)
ali@0	3187	{
ali@40	3188	char s,t;
ali@0	3189	int i;
ali@0	3190	if (!*theline)
ali@68	3191	return;
ali@40	3192	for (i=0;*DPmarkup[i];i++)
ali@70	3193	while ((s=strstr(theline,DPmarkup[i])))
ali@40	3194	{
ali@68	3195	t=s+strlen(DPmarkup[i]);
ali@70	3196	memmove(s,t,strlen(t)+1);
ali@40	3197	}
ali@0	3198	}
ali@0	3199
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Pre-cleans the line of HTML, in place: losemarkup() is called until it
 * has nothing more to remove, and then loseentities() is called until it
 * has nothing more to replace.
 */
void postprocess_for_HTML(char *theline)
{
    char *changed;
    /* Each call deals with at most one tag. */
    do
        changed=losemarkup(theline);
    while (changed);
    /* Each call deals with at most one entity. */
    do
        changed=loseentities(theline);
    while (changed);
}
ali@0	3216
ali@0	3217	char losemarkup(char theline)
ali@0	3218	{
ali@40	3219	char s,t;
ali@0	3220	int i;
ali@70	3221	s=strchr(theline,'<');
ali@70	3222	t=s?strchr(s,'>'):NULL;
ali@40	3223	if (!s \|\| !t)
ali@40	3224	return NULL;
ali@40	3225	for (i=0;*markup[i];i++)
ali@70	3226	if (tagcomp(g_utf8_next_char(s),markup[i]))
ali@40	3227	{
ali@70	3228	t=g_utf8_next_char(t);
ali@70	3229	memmove(s,t,strlen(t)+1);
ali@70	3230	return s;
ali@68	3231	}
ali@40	3232	/* It's an unrecognized <xxx>. */
ali@40	3233	return NULL;
ali@0	3234	}
ali@0	3235
ali@0	3236	char loseentities(char theline)
ali@0	3237	{
ali@0	3238	int i;
ali@40	3239	char s,t;
ali@0	3240	if (!*theline)
ali@68	3241	return NULL;
ali@40	3242	for (i=0;*entities[i].htmlent;i++)
ali@40	3243	{
ali@68	3244	s=strstr(theline,entities[i].htmlent);
ali@68	3245	if (s)
ali@40	3246	{
ali@70	3247	t=g_strdup(s+strlen(entities[i].htmlent));
ali@68	3248	strcpy(s,entities[i].textent);
ali@68	3249	strcat(s,t);
ali@70	3250	g_free(t);
ali@68	3251	return theline;
ali@40	3252	}
ali@40	3253	}
ali@40	3254	for (i=0;*entities[i].htmlnum;i++)
ali@40	3255	{
ali@68	3256	s=strstr(theline,entities[i].htmlnum);
ali@68	3257	if (s)
ali@40	3258	{
ali@70	3259	t=g_strdup(s+strlen(entities[i].htmlnum));
ali@68	3260	strcpy(s,entities[i].textent);
ali@68	3261	strcat(s,t);
ali@70	3262	g_free(t);
ali@68	3263	return theline;
ali@40	3264	}
ali@40	3265	}
ali@40	3266	return NULL;
ali@0	3267	}
ali@0	3268
ali@70	3269	gboolean tagcomp(const char strin,const char basetag)
ali@0	3270	{
ali@70	3271	gboolean retval;
ali@70	3272	gchar s,t;
ali@70	3273	if (g_utf8_get_char(strin)=='/')
ali@70	3274	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
ali@70	3275	else
ali@70	3276	t=g_utf8_casefold(strin,-1);
ali@70	3277	s=g_utf8_casefold(basetag,-1);
ali@70	3278	retval=g_str_has_prefix(t,s);
ali@70	3279	g_free(s);
ali@70	3280	g_free(t);
ali@70	3281	return retval;
ali@0	3282	}
ali@0	3283
ali@69	3284	void proghelp(GOptionContext *context)
ali@0	3285	{
ali@69	3286	gchar *help;
ali@40	3287	fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40	3288	fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
ali@40	3289	fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
ali@40	3290	fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40	3291	"For details, read the file COPYING.\n",stderr);
ali@40	3292	fputs("This is Free Software; "
ali@40	3293	"you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40	3294	fputs("read the file COPYING for details.\n\n",stderr);
ali@69	3295	help=g_option_context_get_help(context,TRUE,NULL);
ali@69	3296	fputs(help,stderr);
ali@69	3297	g_free(help);
ali@69	3298	fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
ali@40	3299	fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40	3300	"non-ASCII\n",stderr);
ali@40	3301	fputs("characters like accented letters, "
ali@40	3302	"lines longer than 75 or shorter than 55,\n",stderr);
ali@40	3303	fputs("unbalanced quotes or brackets, "
ali@40	3304	"a variety of badly formatted punctuation, \n",stderr);
ali@40	3305	fputs("HTML tags, some likely typos. "
ali@40	3306	"It is NOT a substitute for human judgement.\n",stderr);
ali@0	3307	fputs("\n",stderr);
ali@0	3308	}

author	ali <ali@juiblex.co.uk>
	Thu May 30 07:31:24 2013 +0100 (2013-05-30)
changeset 70	aa916da2e452
parent 69	1016349e619f
child 71	82d3cc398b54
permissions	-rw-r--r--