bookloupe: bookloupe/bookloupe.c@f44c530f80da (annotated)

/*************************************************************************/
/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
/*                                                                       */
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */
/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */
/*                                                                       */
/* This program is free software; you can redistribute it and/or modify  */
/* it under the terms of the GNU General Public License as published by  */
/* the Free Software Foundation; either version 2 of the License, or     */
/* (at your option) any later version.                                   */
/*                                                                       */
/* This program is distributed in the hope that it will be useful,       */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
/* GNU General Public License for more details.                          */
/*                                                                       */
/* You should have received a copy of the GNU General Public License     */
/* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
/*************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#ifdef __WIN32__
#include <windows.h>
#endif
#include <glib.h>
#include <bl/bl.h>
#include "bookloupe.h"
#include "counters.h"
#include "pending.h"
#include "HTMLentities.h"

ali@69	35	gchar *prevline;
ali@0	36
/* Common typos (all lower case). The list is terminated by an empty string. */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};

ali@69	71	GTree *usertypo;
ali@0	72
/*
 * Common abbreviations and other OK words not to query as typos.
 * The list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 * The list is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};

/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};

/*
 * Tag names, lower case and without the enclosing angle brackets.
 * The list is terminated by an empty string.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * DP-specific markup strings, listed as opening/closing pairs where a
 * closing form exists. The list is terminated by an empty string.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};

/*
 * Lower-case words used by the comma check (presumably words that should
 * not be directly followed by a comma -- confirm against the checking code).
 * The list is terminated by an empty string.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};

/*
 * Lower-case words used by the period check (presumably words that should
 * not be directly followed by a period -- confirm against the checking code).
 * The list is terminated by an empty string.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};

ali@69	129	gboolean pswit[SWITNO]; /* program switches */
ali@0	130
ali@69	131	static GOptionEntry options[]={
ali@69	132	{ "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@69	133	"Ignore DP-specific markup", NULL },
ali@69	134	{ "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@69	135	"Don't echo queried line", NULL },
ali@69	136	{ "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@69	137	"Check single quotes", NULL },
ali@69	138	{ "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@69	139	"Check common typos", NULL },
ali@69	140	{ "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@69	141	"Require closure of quotes on every paragraph", NULL },
ali@69	142	{ "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@69	143	"Disable paranoid querying of everything", NULL },
ali@69	144	{ "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@69	145	"Disable line end checking", NULL },
ali@69	146	{ "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@69	147	"Overview: just show counts", NULL },
ali@69	148	{ "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@69	149	"Output errors to stdout instead of stderr", NULL },
ali@69	150	{ "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@69	151	"Echo header fields", NULL },
ali@69	152	{ "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@69	153	"Ignore markup in < >", NULL },
ali@69	154	{ "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@69	155	"Use file of user-defined typos", NULL },
ali@69	156	{ "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
ali@69	157	"Defaults for use on www upload", NULL },
ali@69	158	{ "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@69	159	"Verbose - list everything", NULL },
ali@69	160	{ NULL }
ali@69	161	};
ali@0	162
long cnt_quote;        /* for overview mode, count of quote queries */
long cnt_brack;        /* for overview mode, count of brackets queries */
long cnt_bin;          /* for overview mode, count of non-ASCII queries */
long cnt_odd;          /* for overview mode, count of odd character queries */
long cnt_long;         /* for overview mode, count of long line errors */
long cnt_short;        /* for overview mode, count of short line queries */
long cnt_punct;        /* for overview mode,
                          count of punctuation and spacing queries */
long cnt_dash;         /* for overview mode, count of dash-related queries */
long cnt_word;         /* for overview mode, count of word queries */
long cnt_html;         /* for overview mode, count of html queries */
long cnt_lineend;      /* for overview mode, count of line-end queries */
long cnt_spacend;      /* count of lines with space at end */
long linecnt;          /* count of total lines in the file */
long checked_linecnt;  /* count of lines actually checked */

ali@69	179	void proghelp(GOptionContext *context);
ali@69	180	void procfile(const char *);
ali@0	181
ali@69	182	gchar *running_from;
ali@0	183
ali@70	184	gboolean mixdigit(const char *);
ali@69	185	gchar getaword(const char *);
ali@101	186	char flgets(char *,long,int);
ali@0	187	void postprocess_for_HTML(char *);
ali@0	188	char linehasmarkup(char );
ali@0	189	char losemarkup(char );
ali@70	190	gboolean tagcomp(const char ,const char );
ali@71	191	void loseentities(char *);
ali@69	192	gboolean isroman(const char *);
ali@0	193	void postprocess_for_DP(char *);
ali@72	194	void print_as_windows_1252(const char *string);
ali@72	195	void print_as_utf_8(const char *string);
ali@0	196
ali@69	197	GTree qword,qperiod;
ali@68	198
#ifdef __WIN32__
/* Console output code page captured in main(), restored by cleanup_on_exit(). */
UINT saved_cp;
#endif

ali@69	203	void parse_options(int argc,char **argv)
ali@0	204	{
ali@69	205	GError *err=NULL;
ali@69	206	GOptionContext *context;
ali@69	207	context=g_option_context_new(
ali@69	208	"file - looks for errors in Project Gutenberg(TM) etexts");
ali@69	209	g_option_context_add_main_entries(context,options,NULL);
ali@69	210	if (!g_option_context_parse(context,argc,argv,&err))
ali@69	211	{
ali@69	212	g_printerr("Bookloupe: %s\n",err->message);
ali@69	213	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
ali@69	214	exit(1);
ali@69	215	}
ali@40	216	/* Paranoid checking is turned OFF, not on, by its switch */
ali@69	217	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
ali@40	218	if (pswit[PARANOID_SWITCH])
ali@69	219	/* if running in paranoid mode, typo checks default to enabled */
ali@69	220	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@40	221	/* Line-end checking is turned OFF, not on, by its switch */
ali@69	222	pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
ali@40	223	/* Echoing is turned OFF, not on, by its switch */
ali@69	224	pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
ali@40	225	if (pswit[OVERVIEW_SWITCH])
ali@40	226	/* just print summary; don't echo */
ali@69	227	pswit[ECHO_SWITCH]=FALSE;
ali@40	228	/*
ali@40	229	* Web uploads - for the moment, this is really just a placeholder
ali@40	230	* until we decide what processing we really want to do on web uploads
ali@40	231	*/
ali@40	232	if (pswit[WEB_SWITCH])
ali@40	233	{
ali@40	234	/* specific override for web uploads */
ali@69	235	pswit[ECHO_SWITCH]=TRUE;
ali@69	236	pswit[SQUOTE_SWITCH]=FALSE;
ali@69	237	pswit[TYPO_SWITCH]=TRUE;
ali@69	238	pswit[QPARA_SWITCH]=FALSE;
ali@69	239	pswit[PARANOID_SWITCH]=TRUE;
ali@69	240	pswit[LINE_END_SWITCH]=FALSE;
ali@69	241	pswit[OVERVIEW_SWITCH]=FALSE;
ali@69	242	pswit[STDOUT_SWITCH]=FALSE;
ali@69	243	pswit[HEADER_SWITCH]=TRUE;
ali@69	244	pswit[VERBOSE_SWITCH]=FALSE;
ali@69	245	pswit[MARKUP_SWITCH]=FALSE;
ali@69	246	pswit[USERTYPO_SWITCH]=FALSE;
ali@69	247	pswit[DP_SWITCH]=FALSE;
ali@40	248	}
ali@69	249	if (*argc<2)
ali@40	250	{
ali@69	251	proghelp(context);
ali@69	252	exit(1);
ali@40	253	}
ali@69	254	g_option_context_free(context);
ali@69	255	}
ali@69	256
ali@69	257	/*
ali@69	258	* read_user_scannos:
ali@69	259	*
ali@69	260	* Read in the user-defined stealth scanno list.
ali@69	261	*/
ali@69	262	void read_user_scannos(void)
ali@69	263	{
ali@69	264	GError *err=NULL;
ali@69	265	gchar *usertypo_file;
ali@69	266	gboolean okay;
ali@69	267	int i;
ali@70	268	gsize len,nb;
ali@70	269	gchar contents,utf8,**lines;
ali@69	270	usertypo_file=g_strdup("bookloupe.typ");
ali@69	271	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	272	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	273	{
ali@69	274	g_clear_error(&err);
ali@69	275	g_free(usertypo_file);
ali@69	276	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
ali@69	277	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	278	}
ali@69	279	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	280	{
ali@69	281	g_clear_error(&err);
ali@69	282	g_free(usertypo_file);
ali@69	283	usertypo_file=g_strdup("gutcheck.typ");
ali@69	284	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	285	}
ali@69	286	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	287	{
ali@69	288	g_clear_error(&err);
ali@69	289	g_free(usertypo_file);
ali@69	290	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
ali@69	291	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	292	}
ali@69	293	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	294	{
ali@69	295	g_free(usertypo_file);
ali@70	296	g_print(" --> I couldn't find bookloupe.typ "
ali@69	297	"-- proceeding without user typos.\n");
ali@69	298	return;
ali@69	299	}
ali@69	300	else if (!okay)
ali@69	301	{
ali@69	302	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
ali@69	303	g_free(usertypo_file);
ali@69	304	g_clear_error(&err);
ali@69	305	exit(1);
ali@69	306	}
ali@72	307	if (g_utf8_validate(contents,len,NULL))
ali@72	308	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@72	309	else
ali@72	310	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
ali@70	311	g_free(contents);
ali@70	312	lines=g_strsplit_set(utf8,"\r\n",0);
ali@70	313	g_free(utf8);
ali@69	314	usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@69	315	for (i=0;lines[i];i++)
ali@69	316	if ((unsigned char )lines[i]>'!')
ali@69	317	g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
ali@69	318	else
ali@69	319	g_free(lines[i]);
ali@69	320	g_free(lines);
ali@69	321	}
ali@69	322
ali@69	323	/*
ali@69	324	* read_etext:
ali@69	325	*
ali@69	326	* Read an etext returning a newly allocated string containing the file
ali@69	327	* contents or NULL on error.
ali@69	328	*/
ali@69	329	gchar read_etext(const char filename,GError **err)
ali@69	330	{
ali@76	331	GError *tmp_err=NULL;
ali@70	332	gchar contents,utf8;
ali@76	333	gsize len,bytes_read,bytes_written;
ali@76	334	int i,line,col;
ali@69	335	if (!g_file_get_contents(filename,&contents,&len,err))
ali@69	336	return NULL;
ali@72	337	if (g_utf8_validate(contents,len,NULL))
ali@72	338	{
ali@72	339	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@72	340	g_set_print_handler(print_as_utf_8);
ali@73	341	#ifdef __WIN32__
ali@73	342	SetConsoleOutputCP(CP_UTF8);
ali@73	343	#endif
ali@72	344	}
ali@72	345	else
ali@72	346	{
ali@76	347	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
ali@76	348	&bytes_written,&tmp_err);
ali@76	349	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
ali@76	350	G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
ali@76	351	{
ali@76	352	line=col=1;
ali@76	353	for(i=0;i<bytes_read;i++)
ali@76	354	if (contents[i]=='\n')
ali@76	355	{
ali@76	356	line++;
ali@76	357	col=1;
ali@76	358	}
ali@76	359	else if (contents[i]!='\r')
ali@76	360	col++;
ali@76	361	g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
ali@76	362	"Input conversion failed. Byte %d at line %d, column %d is not a "
ali@76	363	"valid Windows-1252 character",
ali@76	364	((unsigned char *)contents)[bytes_read],line,col);
ali@76	365	}
ali@76	366	else if (tmp_err)
ali@76	367	g_propagate_error(err,tmp_err);
ali@72	368	g_set_print_handler(print_as_windows_1252);
ali@73	369	#ifdef __WIN32__
ali@73	370	SetConsoleOutputCP(1252);
ali@73	371	#endif
ali@72	372	}
ali@70	373	g_free(contents);
ali@70	374	return utf8;
ali@69	375	}
ali@69	376
/*
 * cleanup_on_exit:
 *
 * Registered with atexit() by main() on Windows; puts back the console
 * output code page saved in saved_cp. Does nothing on other platforms.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}

ali@69	384	int main(int argc,char **argv)
ali@69	385	{
ali@73	386	#ifdef __WIN32__
ali@73	387	atexit(cleanup_on_exit);
ali@73	388	saved_cp=GetConsoleOutputCP();
ali@73	389	#endif
ali@69	390	running_from=g_path_get_dirname(argv[0]);
ali@69	391	parse_options(&argc,&argv);
ali@40	392	if (pswit[USERTYPO_SWITCH])
ali@69	393	read_user_scannos();
ali@40	394	fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@69	395	procfile(argv[1]);
ali@40	396	if (pswit[OVERVIEW_SWITCH])
ali@40	397	{
ali@70	398	g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40	399	checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@70	400	g_print(" --------------- Queries found --------------\n");
ali@68	401	if (cnt_long)
ali@70	402	g_print(" Long lines: %14ld\n",cnt_long);
ali@68	403	if (cnt_short)
ali@70	404	g_print(" Short lines: %14ld\n",cnt_short);
ali@68	405	if (cnt_lineend)
ali@70	406	g_print(" Line-end problems: %14ld\n",cnt_lineend);
ali@68	407	if (cnt_word)
ali@70	408	g_print(" Common typos: %14ld\n",cnt_word);
ali@94	409	if (cnt_quote)
ali@94	410	g_print(" Unmatched quotes: %14ld\n",cnt_quote);
ali@68	411	if (cnt_brack)
ali@70	412	g_print(" Unmatched brackets: %14ld\n",cnt_brack);
ali@68	413	if (cnt_bin)
ali@70	414	g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
ali@68	415	if (cnt_odd)
ali@70	416	g_print(" Proofing characters: %14ld\n",cnt_odd);
ali@68	417	if (cnt_punct)
ali@70	418	g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@68	419	if (cnt_dash)
ali@70	420	g_print(" Non-standard dashes: %14ld\n",cnt_dash);
ali@68	421	if (cnt_html)
ali@70	422	g_print(" Possible HTML tags: %14ld\n",cnt_html);
ali@70	423	g_print("\n");
ali@70	424	g_print(" TOTAL QUERIES %14ld\n",
ali@94	425	cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
ali@94	426	cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40	427	}
ali@69	428	g_free(running_from);
ali@69	429	if (usertypo)
ali@69	430	g_tree_unref(usertypo);
ali@40	431	return 0;
ali@0	432	}
ali@0	433
ali@97	434	void count_dashes(const char line,const char dash,
ali@97	435	struct dash_results *results)
ali@97	436	{
ali@97	437	int i;
ali@97	438	gchar **tokens;
ali@97	439	gunichar pc,nc;
ali@97	440	gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
ali@97	441	if (!*line)
ali@97	442	return;
ali@97	443	tokens=g_strsplit(line,dash,0);
ali@97	444	if (tokens[1])
ali@97	445	results->base++;
ali@97	446	for(i=1;tokens[i];i++)
ali@97	447	{
ali@97	448	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
ali@97	449	nc=g_utf8_get_char(tokens[i]);
ali@97	450	if (g_unichar_isspace(pc) \|\| g_unichar_isspace(nc))
ali@97	451	spaced=TRUE;
ali@97	452	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
ali@97	453	spaced2=TRUE;
ali@97	454	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
ali@97	455	unspaced=TRUE;
ali@97	456	}
ali@97	457	if (spaced)
ali@97	458	results->space++;
ali@97	459	if (spaced2)
ali@97	460	/* count of lines with em-dashes with spaces both sides */
ali@97	461	results->non_PG_space++;
ali@97	462	if (unspaced)
ali@97	463	/* count of lines with PG-type em-dashes with no spaces */
ali@97	464	results->PG_space++;
ali@97	465	g_strfreev(tokens);
ali@97	466	}
ali@97	467
ali@40	468	/*
ali@41	469	* first_pass:
ali@40	470	*
ali@41	471	* Run a first pass - verify that it's a valid PG
ali@41	472	* file, decide whether to report some things that
ali@41	473	* occur many times in the text like long or short
ali@41	474	* lines, non-standard dashes, etc.
ali@40	475	*/
ali@69	476	struct first_pass_results first_pass(const char etext)
ali@0	477	{
ali@70	478	gunichar laststart=CHAR_SPACE;
ali@54	479	const char *s;
ali@69	480	gchar *lc_line;
ali@70	481	int i,j,lbytes,llen;
ali@69	482	gchar **lines;
ali@41	483	unsigned int lastlen=0,lastblen=0;
ali@41	484	long spline=0,nspline=0;
ali@41	485	static struct first_pass_results results={0};
ali@97	486	struct dash_results tmp_dash_results;
ali@69	487	gchar *inword;
ali@94	488	QuoteClass qc;
ali@69	489	lines=g_strsplit(etext,"\n",0);
ali@101	490	if (!lines[0])
ali@101	491	{
ali@101	492	/* An empty etext has no terminators */
ali@101	493	results.newlines=DOS_NEWLINES;
ali@101	494	}
ali@101	495	else if (!lines[1])
ali@101	496	{
ali@101	497	/*
ali@101	498	* If there are no LFs, we don't have UNIX-style
ali@101	499	* terminators, but we might have OS9-style ones.
ali@101	500	*/
ali@101	501	results.newlines=OS9_NEWLINES;
ali@101	502	g_strfreev(lines);
ali@101	503	lines=g_strsplit(etext,"\r",0);
ali@101	504	if (!lines[0] \|\| !lines[1])
ali@101	505	/* Looks like we don't have any terminators at all */
ali@101	506	results.newlines=DOS_NEWLINES;
ali@101	507	}
ali@101	508	else
ali@101	509	{
ali@101	510	/* We might have UNIX-style terminators */
ali@101	511	results.newlines=UNIX_NEWLINES;
ali@101	512	}
ali@69	513	for (j=0;lines[j];j++)
ali@40	514	{
ali@70	515	lbytes=strlen(lines[j]);
ali@101	516	if (lbytes>0 && lines[j][lbytes-1]=='\r')
ali@101	517	{
ali@101	518	results.newlines=DOS_NEWLINES;
ali@101	519	do
ali@101	520	{
ali@101	521	lines[j][--lbytes]='\0';
ali@101	522	} while (lbytes>0 && lines[j][lbytes-1]=='\r');
ali@101	523	}
ali@70	524	llen=g_utf8_strlen(lines[j],lbytes);
ali@68	525	linecnt++;
ali@69	526	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
ali@69	527	(strstr(lines[j],"PUBLIC DOMAIN") \|\| strstr(lines[j],"COPYRIGHT")))
ali@40	528	{
ali@68	529	if (spline)
ali@70	530	g_print(" --> Duplicate header?\n");
ali@68	531	spline=linecnt+1; /* first line of non-header text, that is */
ali@40	532	}
ali@69	533	if (!strncmp(lines[j],"*** START",9) &&
ali@69	534	strstr(lines[j],"PROJECT GUTENBERG"))
ali@40	535	{
ali@68	536	if (nspline)
ali@70	537	g_print(" --> Duplicate header?\n");
ali@68	538	nspline=linecnt+1; /* first line of non-header text, that is */
ali@40	539	}
ali@68	540	if (spline \|\| nspline)
ali@40	541	{
ali@70	542	lc_line=g_utf8_strdown(lines[j],lbytes);
ali@69	543	if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
ali@40	544	{
ali@69	545	if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
ali@40	546	{
ali@68	547	if (results.footerline)
ali@40	548	{
ali@40	549	/* it's an old-form header - we can detect duplicates */
ali@68	550	if (!nspline)
ali@70	551	g_print(" --> Duplicate footer?\n");
ali@40	552	}
ali@68	553	else
ali@68	554	results.footerline=linecnt;
ali@40	555	}
ali@40	556	}
ali@69	557	g_free(lc_line);
ali@40	558	}
ali@68	559	if (spline)
ali@41	560	results.firstline=spline;
ali@68	561	if (nspline)
ali@41	562	results.firstline=nspline; /* override with new */
ali@68	563	if (results.footerline)
ali@40	564	continue; /* don't count the boilerplate in the footer */
ali@68	565	results.totlen+=llen;
ali@70	566	for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@40	567	{
ali@70	568	if (g_utf8_get_char(s)>127)
ali@41	569	results.binlen++;
ali@70	570	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@41	571	results.alphalen++;
ali@94	572	if (s>lines[j])
ali@94	573	{
ali@94	574	if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
ali@94	575	qc=QUOTE_CLASS(g_utf8_get_char(s));
ali@94	576	else
ali@94	577	qc=INVALID_QUOTE;
ali@94	578	if ((qc==CLOSING_QUOTE \|\| qc==NEUTRAL_QUOTE) &&
ali@97	579	g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
ali@94	580	results.endquote_count++;
ali@94	581	}
ali@40	582	}
ali@69	583	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
ali@69	584	lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41	585	results.shortline++;
ali@70	586	if (lbytes>0 &&
ali@70	587	g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
ali@40	588	cnt_spacend++;
ali@69	589	if (strstr(lines[j],".,"))
ali@41	590	results.dotcomma++;
ali@68	591	/* only count ast lines for ignoring purposes where there is */
ali@68	592	/* locase text on the line */
ali@69	593	if (strchr(lines[j],'*'))
ali@40	594	{
ali@70	595	for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@70	596	if (g_unichar_islower(g_utf8_get_char(s)))
ali@68	597	break;
ali@70	598	if (*s)
ali@41	599	results.astline++;
ali@40	600	}
ali@69	601	if (strchr(lines[j],'/'))
ali@68	602	results.fslashline++;
ali@82	603	if (lbytes>0)
ali@82	604	{
ali@82	605	for (s=g_utf8_prev_char(lines[j]+lbytes);
ali@82	606	s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
ali@82	607	s=g_utf8_prev_char(s))
ali@82	608	;
ali@82	609	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
ali@82	610	g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@82	611	results.hyphens++;
ali@82	612	}
ali@68	613	if (llen>LONGEST_PG_LINE)
ali@41	614	results.longline++;
ali@68	615	if (llen>WAY_TOO_LONG)
ali@41	616	results.verylongline++;
ali@69	617	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
ali@40	618	{
ali@69	619	i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
ali@68	620	if (i>0)
ali@68	621	results.htmcount++;
ali@69	622	if (strstr(lines[j],"<i>"))
ali@41	623	results.htmcount+=4; /* bonus marks! */
ali@40	624	}
ali@68	625	/* Check for spaced em-dashes */
ali@97	626	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
ali@97	627	count_dashes(lines[j],"--",&tmp_dash_results);
ali@97	628	count_dashes(lines[j],"—",&tmp_dash_results);
ali@97	629	if (tmp_dash_results.base)
ali@97	630	results.emdash.base++;
ali@97	631	if (tmp_dash_results.non_PG_space)
ali@97	632	results.emdash.non_PG_space++;
ali@97	633	if (tmp_dash_results.PG_space)
ali@97	634	results.emdash.PG_space++;
ali@69	635	for (s=lines[j];*s;)
ali@40	636	{
ali@69	637	inword=getaword(&s);
ali@68	638	if (!strcmp(inword,"hij") \|\| !strcmp(inword,"niet"))
ali@68	639	results.Dutchcount++;
ali@68	640	if (!strcmp(inword,"dans") \|\| !strcmp(inword,"avec"))
ali@68	641	results.Frenchcount++;
ali@68	642	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@68	643	results.standalone_digit++;
ali@69	644	g_free(inword);
ali@40	645	}
ali@68	646	/* Check for spaced dashes */
ali@69	647	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
ali@41	648	results.spacedash++;
ali@68	649	lastblen=lastlen;
ali@69	650	lastlen=llen;
ali@69	651	laststart=lines[j][0];
ali@40	652	}
ali@69	653	g_strfreev(lines);
ali@41	654	return &results;
ali@41	655	}
ali@41	656
ali@42	657	/*
ali@42	658	* report_first_pass:
ali@42	659	*
ali@42	660	* Make some snap decisions based on the first pass results.
ali@42	661	*/
ali@42	662	struct warnings report_first_pass(struct first_pass_results results)
ali@42	663	{
ali@42	664	static struct warnings warnings={0};
ali@101	665	warnings.newlines=results->newlines;
ali@101	666	if (warnings.newlines==UNIX_NEWLINES)
ali@101	667	g_print(" --> No lines in this file have a CR. Not reporting them. "
ali@101	668	"Project Gutenberg requires that all lineends be CR-LF.\n");
ali@101	669	else if (warnings.newlines==OS9_NEWLINES)
ali@101	670	g_print(" --> No lines in this file have a LF. Not reporting them. "
ali@101	671	"Project Gutenberg requires that all lineends be CR-LF.\n");
ali@42	672	if (cnt_spacend>0)
ali@70	673	g_print(" --> %ld lines in this file have white space at end\n",
ali@42	674	cnt_spacend);
ali@42	675	warnings.dotcomma=1;
ali@42	676	if (results->dotcomma>5)
ali@42	677	{
ali@68	678	warnings.dotcomma=0;
ali@70	679	g_print(" --> %ld lines in this file contain '.,'. "
ali@42	680	"Not reporting them.\n",results->dotcomma);
ali@42	681	}
ali@42	682	/*
ali@42	683	* If more than 50 lines, or one-tenth, are short,
ali@42	684	* don't bother reporting them.
ali@42	685	*/
ali@42	686	warnings.shortline=1;
ali@42	687	if (results->shortline>50 \|\| results->shortline*10>linecnt)
ali@42	688	{
ali@68	689	warnings.shortline=0;
ali@70	690	g_print(" --> %ld lines in this file are short. "
ali@42	691	"Not reporting short lines.\n",results->shortline);
ali@42	692	}
ali@42	693	/*
ali@42	694	* If more than 50 lines, or one-tenth, are long,
ali@42	695	* don't bother reporting them.
ali@42	696	*/
ali@42	697	warnings.longline=1;
ali@42	698	if (results->longline>50 \|\| results->longline*10>linecnt)
ali@42	699	{
ali@68	700	warnings.longline=0;
ali@70	701	g_print(" --> %ld lines in this file are long. "
ali@42	702	"Not reporting long lines.\n",results->longline);
ali@42	703	}
ali@42	704	/* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42	705	warnings.ast=1;
ali@42	706	if (results->astline>10)
ali@42	707	{
ali@68	708	warnings.ast=0;
ali@70	709	g_print(" --> %ld lines in this file contain asterisks. "
ali@42	710	"Not reporting them.\n",results->astline);
ali@42	711	}
ali@42	712	/*
ali@42	713	* If more than 10 lines contain forward slashes,
ali@42	714	* don't bother reporting them.
ali@42	715	*/
ali@42	716	warnings.fslash=1;
ali@42	717	if (results->fslashline>10)
ali@42	718	{
ali@68	719	warnings.fslash=0;
ali@70	720	g_print(" --> %ld lines in this file contain forward slashes. "
ali@42	721	"Not reporting them.\n",results->fslashline);
ali@42	722	}
ali@42	723	/*
ali@42	724	* If more than 20 lines contain unpunctuated endquotes,
ali@42	725	* don't bother reporting them.
ali@42	726	*/
ali@42	727	warnings.endquote=1;
ali@42	728	if (results->endquote_count>20)
ali@42	729	{
ali@68	730	warnings.endquote=0;
ali@70	731	g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
ali@42	732	"Not reporting them.\n",results->endquote_count);
ali@42	733	}
ali@42	734	/*
ali@42	735	* If more than 15 lines contain standalone digits,
ali@42	736	* don't bother reporting them.
ali@42	737	*/
ali@42	738	warnings.digit=1;
ali@42	739	if (results->standalone_digit>10)
ali@42	740	{
ali@68	741	warnings.digit=0;
ali@70	742	g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
ali@42	743	"Not reporting them.\n",results->standalone_digit);
ali@42	744	}
ali@42	745	/*
ali@42	746	* If more than 20 lines contain hyphens at end,
ali@42	747	* don't bother reporting them.
ali@42	748	*/
ali@42	749	warnings.hyphen=1;
ali@42	750	if (results->hyphens>20)
ali@42	751	{
ali@68	752	warnings.hyphen=0;
ali@70	753	g_print(" --> %ld lines in this file have hyphens at end. "
ali@42	754	"Not reporting them.\n",results->hyphens);
ali@42	755	}
ali@42	756	if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42	757	{
ali@70	758	g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@68	759	pswit[MARKUP_SWITCH]=1;
ali@42	760	}
ali@42	761	if (results->verylongline>0)
ali@70	762	g_print(" --> %ld lines in this file are VERY long!\n",
ali@42	763	results->verylongline);
ali@42	764	/*
ali@42	765	* If there are more non-PG spaced dashes than PG em-dashes,
ali@42	766	* assume it's deliberate.
ali@42	767	* Current PG guidelines say don't use them, but older texts do,
ali@42	768	* and some people insist on them whatever the guidelines say.
ali@42	769	*/
ali@42	770	warnings.dash=1;
ali@97	771	if (results->spacedash+results->emdash.non_PG_space>
ali@97	772	results->emdash.PG_space)
ali@42	773	{
ali@68	774	warnings.dash=0;
ali@70	775	g_print(" --> There are %ld spaced dashes and em-dashes. "
ali@42	776	"Not reporting them.\n",
ali@97	777	results->spacedash+results->emdash.non_PG_space);
ali@42	778	}
ali@42	779	/* If more than a quarter of characters are hi-bit, bug out. */
ali@42	780	warnings.bin=1;
ali@42	781	if (results->binlen*4>results->totlen)
ali@42	782	{
ali@70	783	g_print(" --> This file does not appear to be ASCII. "
ali@42	784	"Terminating. Best of luck with it!\n");
ali@68	785	exit(1);
ali@42	786	}
ali@42	787	if (results->alphalen*4<results->totlen)
ali@42	788	{
ali@70	789	g_print(" --> This file does not appear to be text. "
ali@42	790	"Terminating. Best of luck with it!\n");
ali@68	791	exit(1);
ali@42	792	}
ali@42	793	if (results->binlen*100>results->totlen \|\| results->binlen>100)
ali@42	794	{
ali@70	795	g_print(" --> There are a lot of foreign letters here. "
ali@42	796	"Not reporting them.\n");
ali@68	797	warnings.bin=0;
ali@42	798	}
ali@69	799	warnings.isDutch=FALSE;
ali@42	800	if (results->Dutchcount>50)
ali@42	801	{
ali@69	802	warnings.isDutch=TRUE;
ali@70	803	g_print(" --> This looks like Dutch - "
ali@42	804	"switching off dashes and warnings for 's Middags case.\n");
ali@42	805	}
ali@69	806	warnings.isFrench=FALSE;
ali@42	807	if (results->Frenchcount>50)
ali@42	808	{
ali@69	809	warnings.isFrench=TRUE;
ali@70	810	g_print(" --> This looks like French - "
ali@42	811	"switching off some doublepunct.\n");
ali@42	812	}
ali@42	813	if (results->firstline && results->footerline)
ali@70	814	g_print(" The PG header and footer appear to be already on.\n");
ali@42	815	else
ali@42	816	{
ali@68	817	if (results->firstline)
ali@70	818	g_print(" The PG header is on - no footer.\n");
ali@68	819	if (results->footerline)
ali@70	820	g_print(" The PG footer is on - no header.\n");
ali@42	821	}
ali@70	822	g_print("\n");
ali@42	823	if (pswit[VERBOSE_SWITCH])
ali@42	824	{
ali@68	825	warnings.bin=1;
ali@68	826	warnings.shortline=1;
ali@68	827	warnings.dotcomma=1;
ali@68	828	warnings.longline=1;
ali@68	829	warnings.dash=1;
ali@68	830	warnings.digit=1;
ali@68	831	warnings.ast=1;
ali@68	832	warnings.fslash=1;
ali@68	833	warnings.hyphen=1;
ali@68	834	warnings.endquote=1;
ali@70	835	g_print(" * Verbose output is ON -- you asked for it! *\n");
ali@42	836	}
ali@42	837	if (warnings.isDutch)
ali@68	838	warnings.dash=0;
ali@42	839	if (results->footerline>0 && results->firstline>0 &&
ali@42	840	results->footerline>results->firstline &&
ali@42	841	results->footerline-results->firstline<100)
ali@42	842	{
ali@70	843	g_print(" --> I don't really know where this text starts. \n");
ali@70	844	g_print(" There are no reference points.\n");
ali@70	845	g_print(" I'm going to have to report the header and footer "
ali@42	846	"as well.\n");
ali@68	847	results->firstline=0;
ali@42	848	}
ali@42	849	return &warnings;
ali@42	850	}
ali@42	851
ali@43	852	/*
ali@43	853	* analyse_quotes:
ali@43	854	*
ali@43	855	* Look along the line, accumulate the count of quotes, and see
ali@43	856	* if this is an empty line - i.e. a line with nothing on it
ali@43	857	* but spaces.
ali@43	858	* If line has just spaces, period, * and/or - on it, don't
ali@43	859	* count it, since empty lines with asterisks or dashes to
ali@43	860	* separate sections are common.
ali@43	861	*
ali@69	862	* Returns: TRUE if the line is empty.
ali@43	863	*/
ali@98	864	gboolean analyse_quotes(const char aline,struct counters counters)
ali@43	865	{
ali@68	866	int guessquote=0;
ali@69	867	/* assume the line is empty until proven otherwise */
ali@69	868	gboolean isemptyline=TRUE;
ali@70	869	const char s=aline,sprev,*snext;
ali@70	870	gunichar c;
ali@70	871	sprev=NULL;
ali@94	872	GError *tmp_err=NULL;
ali@43	873	while (*s)
ali@43	874	{
ali@70	875	snext=g_utf8_next_char(s);
ali@70	876	c=g_utf8_get_char(s);
ali@94	877	if (CHAR_IS_DQUOTE(c))
ali@94	878	(void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
ali@94	879	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
ali@43	880	{
ali@43	881	if (s==aline)
ali@43	882	{
ali@43	883	/*
ali@94	884	* At start of line, it can only be a quotation mark.
ali@43	885	* Hardcode a very common exception!
ali@43	886	*/
ali@70	887	if (!g_str_has_prefix(snext,"tis") &&
ali@70	888	!g_str_has_prefix(snext,"Tis"))
ali@94	889	(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@43	890	}
ali@70	891	else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
ali@70	892	g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43	893	/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43	894	;
ali@43	895	/* it's outside a word - let's check it out */
ali@92	896	else if (c==CHAR_OPEN_SQUOTE \|\| c==CHAR_LS_QUOTE \|\|
ali@70	897	g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43	898	{
ali@94	899	/* certainly looks like a quotation mark */
ali@70	900	if (!g_str_has_prefix(snext,"tis") &&
ali@70	901	!g_str_has_prefix(snext,"Tis"))
ali@43	902	/* hardcode a very common exception! */
ali@94	903	{
ali@94	904	if (strchr(".?!,;:",g_utf8_get_char(sprev)))
ali@94	905	(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@94	906	else
ali@94	907	(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
ali@94	908	}
ali@43	909	}
ali@43	910	else
ali@43	911	{
ali@94	912	/* now - is it a quotation mark? */
ali@43	913	guessquote=0; /* accumulate clues */
ali@70	914	if (g_unichar_isalpha(g_utf8_get_char(sprev)))
ali@43	915	{
ali@43	916	/* it follows a letter - could be either */
ali@43	917	guessquote++;
ali@70	918	if (g_utf8_get_char(sprev)=='s')
ali@43	919	{
ali@43	920	/* looks like a plural apostrophe */
ali@43	921	guessquote-=3;
ali@70	922	if (g_utf8_get_char(snext)==CHAR_SPACE)
ali@70	923	/* bonus marks! */
ali@43	924	guessquote-=2;
ali@43	925	}
ali@94	926	if (innermost_quote_matches(counters,c))
ali@94	927	/*
ali@94	928	* Give it the benefit of some doubt,
ali@94	929	* if a squote is already open.
ali@94	930	*/
ali@94	931	guessquote++;
ali@94	932	else
ali@94	933	guessquote--;
ali@94	934	if (guessquote>=0)
ali@94	935	(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
ali@43	936	}
ali@43	937	else
ali@94	938	/* no adjacent letter - it must be a quote of some kind */
ali@94	939	(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@43	940	}
ali@43	941	}
ali@94	942	if (tmp_err)
ali@94	943	{
ali@94	944	if (pswit[ECHO_SWITCH])
ali@94	945	g_print("\n%s\n",aline);
ali@94	946	if (!pswit[OVERVIEW_SWITCH])
ali@94	947	g_print(" Line %ld column %ld - %s\n",
ali@94	948	linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
ali@94	949	g_clear_error(&tmp_err);
ali@94	950	}
ali@70	951	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
ali@70	952	c!='\r' && c!='\n')
ali@69	953	isemptyline=FALSE; /* ignore lines like * * * as spacers */
ali@70	954	if (c==CHAR_UNDERSCORE)
ali@43	955	counters->c_unders++;
ali@93	956	if (c==CHAR_OPEN_SBRACK)
ali@93	957	{
ali@93	958	if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
ali@93	959	!matching_difference(counters,c) && s==aline &&
ali@93	960	g_str_has_prefix(s,"[Illustration:"))
ali@93	961	increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
ali@93	962	else
ali@93	963	increment_matching(counters,c,TRUE);
ali@93	964	}
ali@93	965	else if (c==CHAR_OPEN_CBRACK \|\| c==CHAR_OPEN_RBRACK)
ali@92	966	increment_matching(counters,c,TRUE);
ali@93	967	if (c==CHAR_CLOSE_SBRACK)
ali@93	968	{
ali@93	969	if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
ali@93	970	!matching_difference(counters,c) && !*snext)
ali@93	971	increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
ali@93	972	else
ali@93	973	increment_matching(counters,c,FALSE);
ali@93	974	}
ali@93	975	else if (c==CHAR_CLOSE_CBRACK \|\| c==CHAR_CLOSE_RBRACK)
ali@92	976	increment_matching(counters,c,FALSE);
ali@70	977	sprev=s;
ali@70	978	s=snext;
ali@43	979	}
ali@43	980	return isemptyline;
ali@43	981	}
ali@43	982
ali@41	983	/*
ali@67	984	* check_for_control_characters:
ali@67	985	*
ali@67	986	* Check for invalid or questionable characters in the line
ali@67	987	* Anything above 127 is invalid for plain ASCII, and
ali@67	988	* non-printable control characters should also be flagged.
ali@67	989	* Tabs should generally not be there.
ali@67	990	*/
ali@67	991	void check_for_control_characters(const char *aline)
ali@67	992	{
ali@70	993	gunichar c;
ali@67	994	const char *s;
ali@70	995	for (s=aline;*s;s=g_utf8_next_char(s))
ali@67	996	{
ali@70	997	c=g_utf8_get_char(s);
ali@67	998	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
ali@67	999	{
ali@67	1000	if (pswit[ECHO_SWITCH])
ali@70	1001	g_print("\n%s\n",aline);
ali@67	1002	if (!pswit[OVERVIEW_SWITCH])
ali@70	1003	g_print(" Line %ld column %ld - Control character %u\n",
ali@70	1004	linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
ali@67	1005	else
ali@67	1006	cnt_bin++;
ali@67	1007	}
ali@67	1008	}
ali@67	1009	}
ali@67	1010
ali@67	1011	/*
ali@44	1012	* check_for_odd_characters:
ali@44	1013	*
ali@44	1014	* Check for binary and other odd characters.
ali@44	1015	*/
ali@44	1016	void check_for_odd_characters(const char aline,const struct warnings warnings,
ali@69	1017	gboolean isemptyline)
ali@44	1018	{
ali@44	1019	/* Don't repeat multiple warnings on one line. */
ali@70	1020	gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
ali@70	1021	gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
ali@44	1022	const char *s;
ali@70	1023	gunichar c;
ali@70	1024	for (s=aline;*s;s=g_utf8_next_char(s))
ali@44	1025	{
ali@70	1026	c=g_utf8_get_char(s);
ali@70	1027	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' \|\| c>127))
ali@44	1028	{
ali@44	1029	if (pswit[ECHO_SWITCH])
ali@70	1030	g_print("\n%s\n",aline);
ali@44	1031	if (!pswit[OVERVIEW_SWITCH])
ali@70	1032	if (c>127 && c<160 \|\| c>255)
ali@70	1033	g_print(" Line %ld column %ld - "
ali@70	1034	"Non-ISO-8859 character %u\n",
ali@70	1035	linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44	1036	else
ali@70	1037	g_print(" Line %ld column %ld - "
ali@70	1038	"Non-ASCII character %u\n",
ali@70	1039	linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44	1040	else
ali@44	1041	cnt_bin++;
ali@70	1042	eNon_A=TRUE;
ali@44	1043	}
ali@70	1044	if (!eTab && c==CHAR_TAB)
ali@44	1045	{
ali@44	1046	if (pswit[ECHO_SWITCH])
ali@70	1047	g_print("\n%s\n",aline);
ali@44	1048	if (!pswit[OVERVIEW_SWITCH])
ali@70	1049	g_print(" Line %ld column %ld - Tab character?\n",
ali@70	1050	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1051	else
ali@44	1052	cnt_odd++;
ali@70	1053	eTab=TRUE;
ali@44	1054	}
ali@70	1055	if (!eTilde && c==CHAR_TILDE)
ali@44	1056	{
ali@44	1057	/*
ali@44	1058	* Often used by OCR software to indicate an
ali@44	1059	* unrecognizable character.
ali@44	1060	*/
ali@44	1061	if (pswit[ECHO_SWITCH])
ali@70	1062	g_print("\n%s\n",aline);
ali@44	1063	if (!pswit[OVERVIEW_SWITCH])
ali@70	1064	g_print(" Line %ld column %ld - Tilde character?\n",
ali@70	1065	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1066	else
ali@44	1067	cnt_odd++;
ali@70	1068	eTilde=TRUE;
ali@44	1069	}
ali@70	1070	if (!eCarat && c==CHAR_CARAT)
ali@44	1071	{
ali@44	1072	if (pswit[ECHO_SWITCH])
ali@70	1073	g_print("\n%s\n",aline);
ali@44	1074	if (!pswit[OVERVIEW_SWITCH])
ali@70	1075	g_print(" Line %ld column %ld - Carat character?\n",
ali@70	1076	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1077	else
ali@44	1078	cnt_odd++;
ali@70	1079	eCarat=TRUE;
ali@44	1080	}
ali@70	1081	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
ali@44	1082	{
ali@44	1083	if (pswit[ECHO_SWITCH])
ali@70	1084	g_print("\n%s\n",aline);
ali@44	1085	if (!pswit[OVERVIEW_SWITCH])
ali@70	1086	g_print(" Line %ld column %ld - Forward slash?\n",
ali@70	1087	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1088	else
ali@44	1089	cnt_odd++;
ali@70	1090	eFSlash=TRUE;
ali@44	1091	}
ali@44	1092	/*
ali@44	1093	* Report asterisks only in paranoid mode,
ali@44	1094	* since they're often deliberate.
ali@44	1095	*/
ali@44	1096	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@70	1097	c==CHAR_ASTERISK)
ali@44	1098	{
ali@44	1099	if (pswit[ECHO_SWITCH])
ali@70	1100	g_print("\n%s\n",aline);
ali@44	1101	if (!pswit[OVERVIEW_SWITCH])
ali@70	1102	g_print(" Line %ld column %ld - Asterisk?\n",
ali@70	1103	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1104	else
ali@44	1105	cnt_odd++;
ali@70	1106	eAst=TRUE;
ali@44	1107	}
ali@44	1108	}
ali@44	1109	}
ali@44	1110
ali@44	1111	/*
ali@45	1112	* check_for_long_line:
ali@45	1113	*
ali@45	1114	* Check for line too long.
ali@45	1115	*/
ali@45	1116	void check_for_long_line(const char *aline)
ali@45	1117	{
ali@70	1118	if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
ali@45	1119	{
ali@45	1120	if (pswit[ECHO_SWITCH])
ali@70	1121	g_print("\n%s\n",aline);
ali@45	1122	if (!pswit[OVERVIEW_SWITCH])
ali@70	1123	g_print(" Line %ld column %ld - Long line %ld\n",
ali@70	1124	linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
ali@45	1125	else
ali@45	1126	cnt_long++;
ali@45	1127	}
ali@45	1128	}
ali@45	1129
ali@45	1130	/*
ali@45	1131	* check_for_short_line:
ali@45	1132	*
ali@45	1133	* Check for line too short.
ali@45	1134	*
ali@45	1135	* This one is a bit trickier to implement: we don't want to
ali@45	1136	* flag the last line of a paragraph for being short, so we
ali@45	1137	* have to wait until we know that our current line is a
ali@45	1138	* "normal" line, then report the _previous_ line if it was too
ali@45	1139	* short. We also don't want to report indented lines like
ali@45	1140	* chapter heads or formatted quotations. We therefore keep
ali@45	1141	* last->len as the length of the last line examined, and
ali@45	1142	* last->blen as the length of the last but one, and try to
ali@45	1143	* suppress unnecessary warnings by checking that both were of
ali@45	1144	* "normal" length. We keep the first character of the last
ali@45	1145	* line in last->start, and if it was a space, we assume that
ali@45	1146	* the formatting is deliberate. I can't figure out a way to
ali@45	1147	* distinguish something like a quoted verse left-aligned or
ali@45	1148	* the header or footer of a letter from a paragraph of short
ali@45	1149	* lines - maybe if I examined the whole paragraph, and if the
ali@45	1150	* para has less than, say, 8 lines and if all lines are short,
ali@45	1151	* then just assume it's OK? Need to look at some texts to see
ali@45	1152	* how often a formula like this would get the right result.
ali@45	1153	*/
ali@45	1154	void check_for_short_line(const char aline,const struct line_properties last)
ali@45	1155	{
ali@70	1156	if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
ali@70	1157	last->len<SHORTEST_PG_LINE && last->blen>1 &&
ali@70	1158	last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45	1159	{
ali@45	1160	if (pswit[ECHO_SWITCH])
ali@70	1161	g_print("\n%s\n",prevline);
ali@45	1162	if (!pswit[OVERVIEW_SWITCH])
ali@70	1163	g_print(" Line %ld column %ld - Short line %ld?\n",
ali@70	1164	linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
ali@45	1165	else
ali@45	1166	cnt_short++;
ali@45	1167	}
ali@45	1168	}
ali@45	1169
ali@45	1170	/*
ali@46	1171	* check_for_starting_punctuation:
ali@46	1172	*
ali@46	1173	* Look for punctuation other than full ellipses at start of line.
ali@46	1174	*/
ali@46	1175	void check_for_starting_punctuation(const char *aline)
ali@46	1176	{
ali@70	1177	if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
ali@70	1178	!g_str_has_prefix(aline,". . ."))
ali@46	1179	{
ali@46	1180	if (pswit[ECHO_SWITCH])
ali@70	1181	g_print("\n%s\n",aline);
ali@46	1182	if (!pswit[OVERVIEW_SWITCH])
ali@70	1183	g_print(" Line %ld column 1 - Begins with punctuation?\n",
ali@46	1184	linecnt);
ali@46	1185	else
ali@46	1186	cnt_punct++;
ali@46	1187	}
ali@46	1188	}
ali@46	1189
/*
 * str_emdash:
 *
 * Find the first em-dash in <s>, written either as two hyphens ("--")
 * or as the em-dash character itself, whichever comes first.
 *
 * Returns: a pointer to the dash with <next> set to the character
 * following it, or NULL (leaving <next> untouched) if there is none.
 */
char *str_emdash(const char *s,const char **next)
{
    const char *hyphens=strstr(s,"--");
    const char *dash=strstr(s,"—");
    if (hyphens && (!dash || hyphens<dash))
    {
        /* Step over both hyphens. */
        *next=g_utf8_next_char(g_utf8_next_char(hyphens));
        return (char *)hyphens;
    }
    if (dash)
        *next=g_utf8_next_char(dash);
    return (char *)dash;
}
ali@97	1223
ali@97	1224	/*
ali@47	1225	* check_for_spaced_emdash:
ali@47	1226	*
ali@47	1227	* Check for spaced em-dashes.
ali@47	1228	*
ali@97	1229	* We must check _all_ occurrences of em-dashes on the line
ali@97	1230	* hence the loop - even if the first dash is OK
ali@47	1231	* there may be another that's wrong later on.
ali@47	1232	*/
ali@47	1233	void check_for_spaced_emdash(const char *aline)
ali@47	1234	{
ali@70	1235	const char s,t,*next;
ali@97	1236	for (s=aline;t=str_emdash(s,&next);s=next)
ali@47	1237	{
ali@70	1238	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE \|\|
ali@70	1239	g_utf8_get_char(next)==CHAR_SPACE)
ali@47	1240	{
ali@47	1241	if (pswit[ECHO_SWITCH])
ali@70	1242	g_print("\n%s\n",aline);
ali@47	1243	if (!pswit[OVERVIEW_SWITCH])
ali@70	1244	g_print(" Line %ld column %ld - Spaced em-dash?\n",
ali@70	1245	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@47	1246	else
ali@47	1247	cnt_dash++;
ali@47	1248	}
ali@47	1249	}
ali@47	1250	}
ali@47	1251
ali@47	1252	/*
ali@47	1253	* check_for_spaced_dash:
ali@47	1254	*
ali@47	1255	* Check for spaced dashes.
ali@47	1256	*/
ali@47	1257	void check_for_spaced_dash(const char *aline)
ali@47	1258	{
ali@47	1259	const char *s;
ali@47	1260	if ((s=strstr(aline," -")))
ali@47	1261	{
ali@70	1262	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
ali@47	1263	{
ali@47	1264	if (pswit[ECHO_SWITCH])
ali@70	1265	g_print("\n%s\n",aline);
ali@47	1266	if (!pswit[OVERVIEW_SWITCH])
ali@70	1267	g_print(" Line %ld column %ld - Spaced dash?\n",
ali@70	1268	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47	1269	else
ali@47	1270	cnt_dash++;
ali@47	1271	}
ali@47	1272	}
ali@47	1273	else if ((s=strstr(aline,"- ")))
ali@47	1274	{
ali@70	1275	if (s==aline \|\| g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@47	1276	{
ali@47	1277	if (pswit[ECHO_SWITCH])
ali@70	1278	g_print("\n%s\n",aline);
ali@47	1279	if (!pswit[OVERVIEW_SWITCH])
ali@70	1280	g_print(" Line %ld column %ld - Spaced dash?\n",
ali@70	1281	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47	1282	else
ali@47	1283	cnt_dash++;
ali@47	1284	}
ali@47	1285	}
ali@47	1286	}
ali@47	1287
ali@47	1288	/*
ali@48	1289	* check_for_unmarked_paragraphs:
ali@48	1290	*
ali@48	1291	* Check for unmarked paragraphs indicated by separate speakers.
ali@48	1292	*
ali@48	1293	* May well be false positive:
ali@48	1294	* "Bravo!" "Wonderful!" called the crowd.
ali@48	1295	* but useful all the same.
ali@48	1296	*/
ali@48	1297	void check_for_unmarked_paragraphs(const char *aline)
ali@48	1298	{
ali@48	1299	const char *s;
ali@48	1300	s=strstr(aline,"\" \"");
ali@48	1301	if (!s)
ali@48	1302	s=strstr(aline,"\" \"");
ali@48	1303	if (s)
ali@48	1304	{
ali@48	1305	if (pswit[ECHO_SWITCH])
ali@70	1306	g_print("\n%s\n",aline);
ali@48	1307	if (!pswit[OVERVIEW_SWITCH])
ali@70	1308	g_print(" Line %ld column %ld - "
ali@70	1309	"Query missing paragraph break?\n",
ali@70	1310	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@48	1311	else
ali@48	1312	cnt_punct++;
ali@48	1313	}
ali@48	1314	}
ali@48	1315
ali@48	1316	/*
ali@49	1317	* check_for_jeebies:
ali@49	1318	*
ali@49	1319	* Check for "to he" and other easy h/b errors.
ali@49	1320	*
ali@49	1321	* This is a very inadequate effort on the h/b problem,
ali@49	1322	* but the phrase "to he" is always an error, whereas "to
ali@49	1323	* be" is quite common.
ali@49	1324	* Similarly, '"Quiet!", be said.' is a non-be error
ali@49	1325	* "to he" is _not_ always an error!:
ali@49	1326	* "Where they went to he couldn't say."
ali@49	1327	* Another false positive:
ali@49	1328	* What would "Cinderella" be without the . . .
ali@49	1329	* and another: "If he wants to he can see for himself."
ali@49	1330	*/
ali@49	1331	void check_for_jeebies(const char *aline)
ali@49	1332	{
ali@49	1333	const char *s;
ali@49	1334	s=strstr(aline," be could ");
ali@49	1335	if (!s)
ali@49	1336	s=strstr(aline," be would ");
ali@49	1337	if (!s)
ali@49	1338	s=strstr(aline," was be ");
ali@49	1339	if (!s)
ali@49	1340	s=strstr(aline," be is ");
ali@49	1341	if (!s)
ali@49	1342	s=strstr(aline," is be ");
ali@49	1343	if (!s)
ali@49	1344	s=strstr(aline,"\", be ");
ali@49	1345	if (!s)
ali@49	1346	s=strstr(aline,"\" be ");
ali@49	1347	if (!s)
ali@49	1348	s=strstr(aline,"\" be ");
ali@49	1349	if (!s)
ali@49	1350	s=strstr(aline," to he ");
ali@49	1351	if (s)
ali@49	1352	{
ali@49	1353	if (pswit[ECHO_SWITCH])
ali@70	1354	g_print("\n%s\n",aline);
ali@49	1355	if (!pswit[OVERVIEW_SWITCH])
ali@70	1356	g_print(" Line %ld column %ld - Query he/be error?\n",
ali@70	1357	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49	1358	else
ali@49	1359	cnt_word++;
ali@49	1360	}
ali@49	1361	s=strstr(aline," the had ");
ali@49	1362	if (!s)
ali@49	1363	s=strstr(aline," a had ");
ali@49	1364	if (!s)
ali@49	1365	s=strstr(aline," they bad ");
ali@49	1366	if (!s)
ali@49	1367	s=strstr(aline," she bad ");
ali@49	1368	if (!s)
ali@49	1369	s=strstr(aline," he bad ");
ali@49	1370	if (!s)
ali@49	1371	s=strstr(aline," you bad ");
ali@49	1372	if (!s)
ali@49	1373	s=strstr(aline," i bad ");
ali@49	1374	if (s)
ali@49	1375	{
ali@49	1376	if (pswit[ECHO_SWITCH])
ali@70	1377	g_print("\n%s\n",aline);
ali@49	1378	if (!pswit[OVERVIEW_SWITCH])
ali@70	1379	g_print(" Line %ld column %ld - Query had/bad error?\n",
ali@70	1380	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49	1381	else
ali@49	1382	cnt_word++;
ali@49	1383	}
ali@49	1384	s=strstr(aline,"; hut ");
ali@49	1385	if (!s)
ali@49	1386	s=strstr(aline,", hut ");
ali@49	1387	if (s)
ali@49	1388	{
ali@49	1389	if (pswit[ECHO_SWITCH])
ali@70	1390	g_print("\n%s\n",aline);
ali@49	1391	if (!pswit[OVERVIEW_SWITCH])
ali@70	1392	g_print(" Line %ld column %ld - Query hut/but error?\n",
ali@70	1393	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49	1394	else
ali@49	1395	cnt_word++;
ali@49	1396	}
ali@49	1397	}
ali@49	1398
ali@49	1399	/*
ali@50	1400	* check_for_mta_from:
ali@50	1401	*
ali@50	1402	* Special case - angled bracket in front of "From" placed there by an
ali@50	1403	* MTA when sending an e-mail.
ali@50	1404	*/
ali@50	1405	void check_for_mta_from(const char *aline)
ali@50	1406	{
ali@50	1407	const char *s;
ali@50	1408	s=strstr(aline,">From");
ali@50	1409	if (s)
ali@50	1410	{
ali@50	1411	if (pswit[ECHO_SWITCH])
ali@70	1412	g_print("\n%s\n",aline);
ali@50	1413	if (!pswit[OVERVIEW_SWITCH])
ali@70	1414	g_print(" Line %ld column %ld - "
ali@70	1415	"Query angled bracket with From\n",
ali@70	1416	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@50	1417	else
ali@50	1418	cnt_punct++;
ali@50	1419	}
ali@50	1420	}
ali@50	1421
ali@50	1422	/*
ali@51	1423	* check_for_orphan_character:
ali@51	1424	*
ali@51	1425	* Check for a single character line -
ali@51	1426	* often an overflow from bad wrapping.
ali@51	1427	*/
ali@51	1428	void check_for_orphan_character(const char *aline)
ali@51	1429	{
ali@70	1430	gunichar c;
ali@70	1431	c=g_utf8_get_char(aline);
ali@70	1432	if (c && !*g_utf8_next_char(aline))
ali@51	1433	{
ali@70	1434	if (c=='I' \|\| c=='V' \|\| c=='X' \|\| c=='L' \|\| g_unichar_isdigit(c))
ali@51	1435	; /* Nothing - ignore numerals alone on a line. */
ali@51	1436	else
ali@51	1437	{
ali@51	1438	if (pswit[ECHO_SWITCH])
ali@70	1439	g_print("\n%s\n",aline);
ali@51	1440	if (!pswit[OVERVIEW_SWITCH])
ali@70	1441	g_print(" Line %ld column 1 - Query single character line\n",
ali@51	1442	linecnt);
ali@51	1443	else
ali@51	1444	cnt_punct++;
ali@51	1445	}
ali@51	1446	}
ali@51	1447	}
ali@51	1448
ali@51	1449	/*
ali@52	1450	* check_for_pling_scanno:
ali@52	1451	*
ali@52	1452	* Check for I" - often should be !
ali@52	1453	*/
ali@52	1454	void check_for_pling_scanno(const char *aline)
ali@52	1455	{
ali@52	1456	const char *s;
ali@52	1457	s=strstr(aline," I\"");
ali@52	1458	if (s)
ali@52	1459	{
ali@52	1460	if (pswit[ECHO_SWITCH])
ali@70	1461	g_print("\n%s\n",aline);
ali@52	1462	if (!pswit[OVERVIEW_SWITCH])
ali@70	1463	g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
ali@70	1464	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@52	1465	else
ali@52	1466	cnt_punct++;
ali@52	1467	}
ali@52	1468	}
ali@52	1469
ali@52	1470	/*
ali@53	1471	* check_for_extra_period:
ali@53	1472	*
ali@53	1473	* Check for period without a capital letter. Cut-down from gutspell.
ali@53	1474	* Only works when it happens on a single line.
ali@53	1475	*/
ali@53	1476	void check_for_extra_period(const char aline,const struct warnings warnings)
ali@53	1477	{
ali@92	1478	const char s,t,s1,sprev;
ali@69	1479	int i;
ali@70	1480	gsize len;
ali@69	1481	gboolean istypo;
ali@69	1482	gchar *testword;
ali@92	1483	gunichar c,nc,pc,*decomposition;
ali@53	1484	if (pswit[PARANOID_SWITCH])
ali@53	1485	{
ali@70	1486	for (t=aline;t=strstr(t,". ");)
ali@53	1487	{
ali@69	1488	if (t==aline)
ali@53	1489	{
ali@70	1490	t=g_utf8_next_char(t);
ali@53	1491	/* start of line punctuation is handled elsewhere */
ali@53	1492	continue;
ali@53	1493	}
ali@70	1494	if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
ali@53	1495	{
ali@70	1496	t=g_utf8_next_char(t);
ali@53	1497	continue;
ali@53	1498	}
ali@53	1499	if (warnings->isDutch)
ali@53	1500	{
ali@53	1501	/* For Frank & Jeroen -- 's Middags case */
ali@70	1502	gunichar c2,c3,c4,c5;
ali@70	1503	c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
ali@70	1504	c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
ali@70	1505	c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
ali@70	1506	c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
ali@92	1507	if (CHAR_IS_APOSTROPHE(c2) &&
ali@92	1508	g_unichar_islower(c3) && c4==CHAR_SPACE &&
ali@92	1509	g_unichar_isupper(c5))
ali@53	1510	{
ali@70	1511	t=g_utf8_next_char(t);
ali@53	1512	continue;
ali@53	1513	}
ali@53	1514	}
ali@70	1515	s1=g_utf8_next_char(g_utf8_next_char(t));
ali@70	1516	while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
ali@99	1517	!g_unichar_isdigit(g_utf8_get_char(s1)))
ali@70	1518	s1=g_utf8_next_char(s1);
ali@70	1519	if (g_unichar_islower(g_utf8_get_char(s1)))
ali@53	1520	{
ali@53	1521	/* we have something to investigate */
ali@69	1522	istypo=TRUE;
ali@53	1523	/* so let's go back and find out */
ali@92	1524	nc=g_utf8_get_char(t);
ali@92	1525	s1=g_utf8_prev_char(t);
ali@92	1526	c=g_utf8_get_char(s1);
ali@92	1527	sprev=g_utf8_prev_char(s1);
ali@92	1528	pc=g_utf8_get_char(sprev);
ali@92	1529	while (s1>=aline &&
ali@92	1530	(g_unichar_isalpha(c) \|\| g_unichar_isdigit(c) \|\|
ali@92	1531	g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
ali@92	1532	g_unichar_isalpha(nc)))
ali@92	1533	{
ali@92	1534	nc=c;
ali@92	1535	s1=sprev;
ali@92	1536	c=pc;
ali@92	1537	sprev=g_utf8_prev_char(s1);
ali@92	1538	pc=g_utf8_get_char(sprev);
ali@92	1539	}
ali@70	1540	s1=g_utf8_next_char(s1);
ali@69	1541	s=strchr(s1,'.');
ali@69	1542	if (s)
ali@69	1543	testword=g_strndup(s1,s-s1);
ali@69	1544	else
ali@69	1545	testword=g_strdup(s1);
ali@53	1546	for (i=0;*abbrev[i];i++)
ali@53	1547	if (!strcmp(testword,abbrev[i]))
ali@69	1548	istypo=FALSE;
ali@70	1549	if (g_unichar_isdigit(g_utf8_get_char(testword)))
ali@69	1550	istypo=FALSE;
ali@70	1551	if (!*g_utf8_next_char(testword))
ali@69	1552	istypo=FALSE;
ali@53	1553	if (isroman(testword))
ali@69	1554	istypo=FALSE;
ali@53	1555	if (istypo)
ali@53	1556	{
ali@69	1557	istypo=FALSE;
ali@70	1558	for (s=testword;*s;s=g_utf8_next_char(s))
ali@70	1559	{
ali@70	1560	decomposition=g_unicode_canonical_decomposition(
ali@70	1561	g_utf8_get_char(s),&len);
ali@70	1562	if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@69	1563	istypo=TRUE;
ali@70	1564	g_free(decomposition);
ali@70	1565	}
ali@53	1566	}
ali@69	1567	if (istypo &&
ali@69	1568	(pswit[VERBOSE_SWITCH] \|\| !g_tree_lookup(qperiod,testword)))
ali@53	1569	{
ali@69	1570	g_tree_insert(qperiod,g_strdup(testword),
ali@69	1571	GINT_TO_POINTER(1));
ali@69	1572	if (pswit[ECHO_SWITCH])
ali@70	1573	g_print("\n%s\n",aline);
ali@69	1574	if (!pswit[OVERVIEW_SWITCH])
ali@70	1575	g_print(" Line %ld column %ld - Extra period?\n",
ali@70	1576	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@69	1577	else
ali@69	1578	cnt_punct++;
ali@53	1579	}
ali@69	1580	g_free(testword);
ali@53	1581	}
ali@70	1582	t=g_utf8_next_char(t);
ali@53	1583	}
ali@53	1584	}
ali@53	1585	}
ali@53	1586
ali@53	1587	/*
ali@54	1588	* check_for_following_punctuation:
ali@54	1589	*
ali@54	1590	* Check for words usually not followed by punctuation.
ali@54	1591	*/
ali@54	1592	void check_for_following_punctuation(const char *aline)
ali@54	1593	{
ali@54	1594	int i;
ali@54	1595	const char s,wordstart;
ali@70	1596	gunichar c;
ali@69	1597	gchar inword,t;
ali@54	1598	if (pswit[TYPO_SWITCH])
ali@54	1599	{
ali@54	1600	for (s=aline;*s;)
ali@54	1601	{
ali@54	1602	wordstart=s;
ali@69	1603	t=getaword(&s);
ali@69	1604	if (!*t)
ali@69	1605	{
ali@69	1606	g_free(t);
ali@54	1607	continue;
ali@69	1608	}
ali@70	1609	inword=g_utf8_strdown(t,-1);
ali@69	1610	g_free(t);
ali@54	1611	for (i=0;*nocomma[i];i++)
ali@54	1612	if (!strcmp(inword,nocomma[i]))
ali@54	1613	{
ali@70	1614	c=g_utf8_get_char(s);
ali@70	1615	if (c==',' \|\| c==';' \|\| c==':')
ali@54	1616	{
ali@54	1617	if (pswit[ECHO_SWITCH])
ali@70	1618	g_print("\n%s\n",aline);
ali@54	1619	if (!pswit[OVERVIEW_SWITCH])
ali@70	1620	g_print(" Line %ld column %ld - "
ali@54	1621	"Query punctuation after %s?\n",
ali@70	1622	linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70	1623	inword);
ali@54	1624	else
ali@54	1625	cnt_punct++;
ali@54	1626	}
ali@54	1627	}
ali@54	1628	for (i=0;*noperiod[i];i++)
ali@54	1629	if (!strcmp(inword,noperiod[i]))
ali@54	1630	{
ali@70	1631	c=g_utf8_get_char(s);
ali@70	1632	if (c=='.' \|\| c=='!')
ali@54	1633	{
ali@54	1634	if (pswit[ECHO_SWITCH])
ali@70	1635	g_print("\n%s\n",aline);
ali@54	1636	if (!pswit[OVERVIEW_SWITCH])
ali@70	1637	g_print(" Line %ld column %ld - "
ali@54	1638	"Query punctuation after %s?\n",
ali@70	1639	linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70	1640	inword);
ali@54	1641	else
ali@54	1642	cnt_punct++;
ali@54	1643	}
ali@54	1644	}
ali@69	1645	g_free(inword);
ali@54	1646	}
ali@54	1647	}
ali@54	1648	}
ali@54	1649
ali@54	1650	/*
ali@55	1651	* check_for_typos:
ali@55	1652	*
ali@55	1653	* Check for commonly mistyped words,
ali@55	1654	* and digits like 0 for O in a word.
ali@55	1655	*/
ali@55	1656	void check_for_typos(const char aline,struct warnings warnings)
ali@55	1657	{
ali@70	1658	const char s,t,nt,wordstart;
ali@70	1659	gchar *inword;
ali@70	1660	gunichar *decomposition;
ali@70	1661	gchar *testword;
ali@70	1662	int i,vowel,consonant,*dupcnt;
ali@70	1663	gboolean isdup,istypo,alower;
ali@92	1664	gunichar c,pc;
ali@70	1665	long offset,len;
ali@70	1666	gsize decomposition_len;
ali@55	1667	for (s=aline;*s;)
ali@55	1668	{
ali@55	1669	wordstart=s;
ali@69	1670	inword=getaword(&s);
ali@55	1671	if (!*inword)
ali@69	1672	{
ali@69	1673	g_free(inword);
ali@55	1674	continue; /* don't bother with empty lines */
ali@69	1675	}
ali@55	1676	if (mixdigit(inword))
ali@55	1677	{
ali@55	1678	if (pswit[ECHO_SWITCH])
ali@70	1679	g_print("\n%s\n",aline);
ali@55	1680	if (!pswit[OVERVIEW_SWITCH])
ali@70	1681	g_print(" Line %ld column %ld - Query digit in %s\n",
ali@70	1682	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
ali@55	1683	else
ali@55	1684	cnt_word++;
ali@55	1685	}
ali@55	1686	/*
ali@55	1687	* Put the word through a series of tests for likely typos and OCR
ali@55	1688	* errors.
ali@55	1689	*/
ali@69	1690	if (pswit[TYPO_SWITCH] \|\| pswit[USERTYPO_SWITCH])
ali@55	1691	{
ali@69	1692	istypo=FALSE;
ali@70	1693	alower=FALSE;
ali@70	1694	for (t=inword;*t;t=g_utf8_next_char(t))
ali@55	1695	{
ali@70	1696	c=g_utf8_get_char(t);
ali@70	1697	nt=g_utf8_next_char(t);
ali@55	1698	/* lowercase for testing */
ali@70	1699	if (g_unichar_islower(c))
ali@70	1700	alower=TRUE;
ali@70	1701	if (alower && (g_unichar_isupper(c) \|\| g_unichar_istitle(c)))
ali@55	1702	{
ali@55	1703	/*
ali@55	1704	* We have an uppercase mid-word. However, there are
ali@55	1705	* common cases:
ali@55	1706	* Mac and Mc like McGill
ali@55	1707	* French contractions like l'Abbe
ali@55	1708	*/
ali@70	1709	offset=g_utf8_pointer_to_offset(inword,t);
ali@92	1710	if (offset>0)
ali@92	1711	pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@92	1712	else
ali@92	1713	pc='\0';
ali@70	1714	if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' \|\|
ali@70	1715	offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
ali@70	1716	g_utf8_get_char(g_utf8_next_char(nt))=='c' \|\|
ali@92	1717	CHAR_IS_APOSTROPHE(pc))
ali@55	1718	; /* do nothing! */
ali@55	1719	else
ali@69	1720	istypo=TRUE;
ali@55	1721	}
ali@55	1722	}
ali@70	1723	testword=g_utf8_casefold(inword,-1);
ali@69	1724	}
ali@69	1725	if (pswit[TYPO_SWITCH])
ali@69	1726	{
ali@55	1727	/*
ali@55	1728	* Check for certain unlikely two-letter combinations at word
ali@55	1729	* start and end.
ali@55	1730	*/
ali@70	1731	len=g_utf8_strlen(testword,-1);
ali@70	1732	if (len>1)
ali@55	1733	{
ali@55	1734	for (i=0;*nostart[i];i++)
ali@70	1735	if (g_str_has_prefix(testword,nostart[i]))
ali@69	1736	istypo=TRUE;
ali@55	1737	for (i=0;*noend[i];i++)
ali@70	1738	if (g_str_has_suffix(testword,noend[i]))
ali@69	1739	istypo=TRUE;
ali@55	1740	}
ali@55	1741	/* ght is common, gbt never. Like that. */
ali@55	1742	if (strstr(testword,"cb"))
ali@69	1743	istypo=TRUE;
ali@55	1744	if (strstr(testword,"gbt"))
ali@69	1745	istypo=TRUE;
ali@55	1746	if (strstr(testword,"pbt"))
ali@69	1747	istypo=TRUE;
ali@55	1748	if (strstr(testword,"tbs"))
ali@69	1749	istypo=TRUE;
ali@55	1750	if (strstr(testword,"mrn"))
ali@69	1751	istypo=TRUE;
ali@55	1752	if (strstr(testword,"ahle"))
ali@69	1753	istypo=TRUE;
ali@55	1754	if (strstr(testword,"ihle"))
ali@69	1755	istypo=TRUE;
ali@55	1756	/*
ali@55	1757	* "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55	1758	* Also "TBI" - frostbite, outbid - but uncommon.
ali@55	1759	* Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55	1760	* numerals, but "ii" is a common scanno.
ali@55	1761	*/
ali@55	1762	if (strstr(testword,"tbi"))
ali@69	1763	istypo=TRUE;
ali@55	1764	if (strstr(testword,"tbe"))
ali@69	1765	istypo=TRUE;
ali@55	1766	if (strstr(testword,"ii"))
ali@69	1767	istypo=TRUE;
ali@55	1768	/*
ali@55	1769	* Check for no vowels or no consonants.
ali@55	1770	* If none, flag a typo.
ali@55	1771	*/
ali@70	1772	if (!istypo && len>1)
ali@55	1773	{
ali@55	1774	vowel=consonant=0;
ali@70	1775	for (t=testword;*t;t=g_utf8_next_char(t))
ali@55	1776	{
ali@70	1777	c=g_utf8_get_char(t);
ali@70	1778	decomposition=
ali@70	1779	g_unicode_canonical_decomposition(c,&decomposition_len);
ali@70	1780	if (c=='y' \|\| g_unichar_isdigit(c))
ali@55	1781	{
ali@55	1782	/* Yah, this is loose. */
ali@55	1783	vowel++;
ali@55	1784	consonant++;
ali@55	1785	}
ali@70	1786	else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@55	1787	vowel++;
ali@55	1788	else
ali@55	1789	consonant++;
ali@70	1790	g_free(decomposition);
ali@55	1791	}
ali@55	1792	if (!vowel \|\| !consonant)
ali@69	1793	istypo=TRUE;
ali@55	1794	}
ali@55	1795	/*
ali@55	1796	* Now exclude the word from being reported if it's in
ali@55	1797	* the okword list.
ali@55	1798	*/
ali@55	1799	for (i=0;*okword[i];i++)
ali@55	1800	if (!strcmp(testword,okword[i]))
ali@69	1801	istypo=FALSE;
ali@55	1802	/*
ali@55	1803	* What looks like a typo may be a Roman numeral.
ali@55	1804	* Exclude these.
ali@55	1805	*/
ali@55	1806	if (istypo && isroman(testword))
ali@69	1807	istypo=FALSE;
ali@55	1808	/* Check the manual list of typos. */
ali@55	1809	if (!istypo)
ali@55	1810	for (i=0;*typo[i];i++)
ali@55	1811	if (!strcmp(testword,typo[i]))
ali@69	1812	istypo=TRUE;
ali@55	1813	/*
ali@55	1814	* Check lowercase s, l, i and m - special cases.
ali@55	1815	* "j" - often a semi-colon gone wrong.
ali@55	1816	* "d" for a missing apostrophe - he d
ali@55	1817	* "n" for "in"
ali@55	1818	*/
ali@70	1819	if (!istypo && len==1 &&
ali@70	1820	g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
ali@69	1821	istypo=TRUE;
ali@55	1822	if (istypo)
ali@55	1823	{
ali@69	1824	dupcnt=g_tree_lookup(qword,testword);
ali@69	1825	if (dupcnt)
ali@69	1826	{
ali@69	1827	(*dupcnt)++;
ali@69	1828	isdup=!pswit[VERBOSE_SWITCH];
ali@69	1829	}
ali@69	1830	else
ali@69	1831	{
ali@69	1832	dupcnt=g_new0(int,1);
ali@69	1833	g_tree_insert(qword,g_strdup(testword),dupcnt);
ali@69	1834	isdup=FALSE;
ali@69	1835	}
ali@55	1836	if (!isdup)
ali@55	1837	{
ali@55	1838	if (pswit[ECHO_SWITCH])
ali@70	1839	g_print("\n%s\n",aline);
ali@55	1840	if (!pswit[OVERVIEW_SWITCH])
ali@55	1841	{
ali@70	1842	g_print(" Line %ld column %ld - Query word %s",
ali@70	1843	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
ali@70	1844	inword);
ali@69	1845	if (!pswit[VERBOSE_SWITCH])
ali@70	1846	g_print(" - not reporting duplicates");
ali@70	1847	g_print("\n");
ali@55	1848	}
ali@55	1849	else
ali@55	1850	cnt_word++;
ali@55	1851	}
ali@55	1852	}
ali@55	1853	}
ali@55	1854	/* check the user's list of typos */
ali@69	1855	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
ali@69	1856	{
ali@69	1857	if (pswit[ECHO_SWITCH])
ali@70	1858	g_print("\n%s\n",aline);
ali@69	1859	if (!pswit[OVERVIEW_SWITCH])
ali@70	1860	g_print(" Line %ld column %ld - Query possible scanno %s\n",
ali@70	1861	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
ali@69	1862	}
ali@69	1863	if (pswit[TYPO_SWITCH] \|\| pswit[USERTYPO_SWITCH])
ali@69	1864	g_free(testword);
ali@55	1865	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55	1866	{
ali@55	1867	/* In paranoid mode, query all 0 and 1 standing alone. */
ali@55	1868	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@55	1869	{
ali@55	1870	if (pswit[ECHO_SWITCH])
ali@70	1871	g_print("\n%s\n",aline);
ali@55	1872	if (!pswit[OVERVIEW_SWITCH])
ali@70	1873	g_print(" Line %ld column %ld - Query standalone %s\n",
ali@70	1874	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
ali@70	1875	inword);
ali@55	1876	else
ali@55	1877	cnt_word++;
ali@55	1878	}
ali@55	1879	}
ali@69	1880	g_free(inword);
ali@55	1881	}
ali@55	1882	}
ali@55	1883
ali@56	1884	/*
ali@56	1885	* check_for_misspaced_punctuation:
ali@56	1886	*
ali@56	1887	* Look for added or missing spaces around punctuation and quotes.
ali@56	1888	* If there is a punctuation character like ! with no space on
ali@56	1889	* either side, suspect a missing!space. If there are spaces on
ali@56	1890	* both sides , assume a typo. If we see a double quote with no
ali@56	1891	* space or punctuation on either side of it, assume unspaced
ali@56	1892	* quotes "like"this.
ali@56	1893	*/
ali@56	1894	void check_for_misspaced_punctuation(const char *aline,
ali@69	1895	struct parities *parities,gboolean isemptyline)
ali@56	1896	{
ali@69	1897	gboolean isacro,isellipsis;
ali@56	1898	const char *s;
ali@70	1899	gunichar c,nc,pc,n2c;
ali@94	1900	int parity;
ali@70	1901	c=g_utf8_get_char(aline);
ali@70	1902	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	1903	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	1904	{
ali@70	1905	pc=c;
ali@70	1906	c=nc;
ali@70	1907	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	1908	/* For each character in the line after the first. */
ali@70	1909	if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
ali@56	1910	{
ali@56	1911	/* we need to suppress warnings for acronyms like M.D. */
ali@69	1912	isacro=FALSE;
ali@56	1913	/* we need to suppress warnings for ellipsis . . . */
ali@69	1914	isellipsis=FALSE;
ali@70	1915	/*
ali@70	1916	* If there are letters on both sides of it or
ali@70	1917	* if it's strict punctuation followed by an alpha.
ali@70	1918	*/
ali@70	1919	if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) \|\|
ali@70	1920	g_utf8_strchr("?!,;:",-1,c)))
ali@56	1921	{
ali@70	1922	if (c=='.')
ali@56	1923	{
ali@70	1924	if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70	1925	g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69	1926	isacro=TRUE;
ali@70	1927	n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70	1928	if (nc && n2c=='.')
ali@69	1929	isacro=TRUE;
ali@56	1930	}
ali@56	1931	if (!isacro)
ali@56	1932	{
ali@56	1933	if (pswit[ECHO_SWITCH])
ali@70	1934	g_print("\n%s\n",aline);
ali@56	1935	if (!pswit[OVERVIEW_SWITCH])
ali@70	1936	g_print(" Line %ld column %ld - Missing space?\n",
ali@70	1937	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1938	else
ali@56	1939	cnt_punct++;
ali@56	1940	}
ali@56	1941	}
ali@70	1942	if (pc==CHAR_SPACE && (nc==CHAR_SPACE \|\| !nc))
ali@56	1943	{
ali@56	1944	/*
ali@56	1945	* If there are spaces on both sides,
ali@56	1946	* or space before and end of line.
ali@56	1947	*/
ali@70	1948	if (c=='.')
ali@56	1949	{
ali@70	1950	if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70	1951	g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69	1952	isellipsis=TRUE;
ali@70	1953	n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70	1954	if (nc && n2c=='.')
ali@69	1955	isellipsis=TRUE;
ali@56	1956	}
ali@56	1957	if (!isemptyline && !isellipsis)
ali@56	1958	{
ali@56	1959	if (pswit[ECHO_SWITCH])
ali@70	1960	g_print("\n%s\n",aline);
ali@56	1961	if (!pswit[OVERVIEW_SWITCH])
ali@70	1962	g_print(" Line %ld column %ld - "
ali@70	1963	"Spaced punctuation?\n",linecnt,
ali@70	1964	g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1965	else
ali@56	1966	cnt_punct++;
ali@56	1967	}
ali@56	1968	}
ali@56	1969	}
ali@56	1970	}
ali@56	1971	/* Split out the characters that CANNOT be preceded by space. */
ali@70	1972	c=g_utf8_get_char(aline);
ali@70	1973	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	1974	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	1975	{
ali@70	1976	pc=c;
ali@70	1977	c=nc;
ali@70	1978	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	1979	/* for each character in the line after the first */
ali@70	1980	if (g_utf8_strchr("?!,;:",-1,c))
ali@56	1981	{
ali@56	1982	/* if it's punctuation that _cannot_ have a space before it */
ali@70	1983	if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
ali@56	1984	{
ali@56	1985	/*
ali@70	1986	* If nc DOES == space,
ali@56	1987	* it was already reported just above.
ali@56	1988	*/
ali@56	1989	if (pswit[ECHO_SWITCH])
ali@70	1990	g_print("\n%s\n",aline);
ali@56	1991	if (!pswit[OVERVIEW_SWITCH])
ali@70	1992	g_print(" Line %ld column %ld - Spaced punctuation?\n",
ali@70	1993	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1994	else
ali@56	1995	cnt_punct++;
ali@56	1996	}
ali@56	1997	}
ali@56	1998	}
ali@56	1999	/*
ali@56	2000	* Special case " .X" where X is any alpha.
ali@56	2001	* This plugs a hole in the acronym code above.
ali@56	2002	* Inelegant, but maintainable.
ali@56	2003	*/
ali@70	2004	c=g_utf8_get_char(aline);
ali@70	2005	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2006	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	2007	{
ali@70	2008	pc=c;
ali@70	2009	c=nc;
ali@70	2010	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	2011	/* for each character in the line after the first */
ali@70	2012	if (c=='.')
ali@56	2013	{
ali@56	2014	/* if it's a period */
ali@70	2015	if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
ali@56	2016	{
ali@56	2017	/*
ali@56	2018	* If the period follows a space and
ali@56	2019	* is followed by a letter.
ali@56	2020	*/
ali@56	2021	if (pswit[ECHO_SWITCH])
ali@70	2022	g_print("\n%s\n",aline);
ali@56	2023	if (!pswit[OVERVIEW_SWITCH])
ali@70	2024	g_print(" Line %ld column %ld - Spaced punctuation?\n",
ali@70	2025	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2026	else
ali@56	2027	cnt_punct++;
ali@56	2028	}
ali@56	2029	}
ali@56	2030	}
ali@70	2031	c=g_utf8_get_char(aline);
ali@70	2032	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2033	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	2034	{
ali@70	2035	pc=c;
ali@70	2036	c=nc;
ali@70	2037	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	2038	/* for each character in the line after the first */
ali@94	2039	if (CHAR_IS_DQUOTE(c))
ali@56	2040	{
ali@70	2041	if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
ali@70	2042	!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc \|\|
ali@70	2043	!g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
ali@56	2044	{
ali@56	2045	if (pswit[ECHO_SWITCH])
ali@70	2046	g_print("\n%s\n",aline);
ali@56	2047	if (!pswit[OVERVIEW_SWITCH])
ali@70	2048	g_print(" Line %ld column %ld - Unspaced quotes?\n",
ali@70	2049	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2050	else
ali@56	2051	cnt_punct++;
ali@56	2052	}
ali@56	2053	}
ali@56	2054	}
ali@56	2055	/* Check parity of quotes. */
ali@70	2056	nc=g_utf8_get_char(aline);
ali@70	2057	for (s=aline;*s;s=g_utf8_next_char(s))
ali@56	2058	{
ali@70	2059	c=nc;
ali@70	2060	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@94	2061	if (CHAR_IS_DQUOTE(c))
ali@56	2062	{
ali@94	2063	if (c==CHAR_DQUOTE)
ali@94	2064	{
ali@94	2065	parities->dquote=!parities->dquote;
ali@94	2066	parity=parities->dquote;
ali@94	2067	}
ali@94	2068	else if (c==CHAR_LD_QUOTE)
ali@94	2069	parity=1;
ali@94	2070	else
ali@94	2071	parity=0;
ali@94	2072	if (!parity)
ali@56	2073	{
ali@56	2074	/* parity even */
ali@99	2075	if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
ali@56	2076	{
ali@56	2077	if (pswit[ECHO_SWITCH])
ali@70	2078	g_print("\n%s\n",aline);
ali@56	2079	if (!pswit[OVERVIEW_SWITCH])
ali@70	2080	g_print(" Line %ld column %ld - "
ali@70	2081	"Wrongspaced quotes?\n",
ali@70	2082	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2083	else
ali@56	2084	cnt_punct++;
ali@56	2085	}
ali@56	2086	}
ali@56	2087	else
ali@56	2088	{
ali@56	2089	/* parity odd */
ali@99	2090	if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
ali@99	2091	!g_utf8_strchr("_-/.'`‘’([{$",-1,nc) \|\| !nc)
ali@56	2092	{
ali@56	2093	if (pswit[ECHO_SWITCH])
ali@70	2094	g_print("\n%s\n",aline);
ali@56	2095	if (!pswit[OVERVIEW_SWITCH])
ali@70	2096	g_print(" Line %ld column %ld - "
ali@70	2097	"Wrongspaced quotes?\n",
ali@70	2098	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2099	else
ali@56	2100	cnt_punct++;
ali@56	2101	}
ali@56	2102	}
ali@56	2103	}
ali@56	2104	}
ali@94	2105	c=g_utf8_get_char(aline);
ali@94	2106	if (CHAR_IS_DQUOTE(c))
ali@56	2107	{
ali@70	2108	if (g_utf8_strchr(",;:!?)]} ",-1,
ali@70	2109	g_utf8_get_char(g_utf8_next_char(aline))))
ali@56	2110	{
ali@56	2111	if (pswit[ECHO_SWITCH])
ali@70	2112	g_print("\n%s\n",aline);
ali@56	2113	if (!pswit[OVERVIEW_SWITCH])
ali@70	2114	g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
ali@56	2115	linecnt);
ali@56	2116	else
ali@56	2117	cnt_punct++;
ali@56	2118	}
ali@56	2119	}
ali@56	2120	if (pswit[SQUOTE_SWITCH])
ali@56	2121	{
ali@70	2122	nc=g_utf8_get_char(aline);
ali@70	2123	for (s=aline;*s;s=g_utf8_next_char(s))
ali@56	2124	{
ali@70	2125	c=nc;
ali@70	2126	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@92	2127	if (CHAR_IS_SQUOTE(c) && (s==aline \|\| s>aline &&
ali@70	2128	!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) \|\|
ali@70	2129	!g_unichar_isalpha(nc)))
ali@56	2130	{
ali@56	2131	parities->squote=!parities->squote;
ali@56	2132	if (!parities->squote)
ali@56	2133	{
ali@56	2134	/* parity even */
ali@70	2135	if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
ali@56	2136	{
ali@56	2137	if (pswit[ECHO_SWITCH])
ali@70	2138	g_print("\n%s\n",aline);
ali@56	2139	if (!pswit[OVERVIEW_SWITCH])
ali@70	2140	g_print(" Line %ld column %ld - "
ali@56	2141	"Wrongspaced singlequotes?\n",
ali@70	2142	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2143	else
ali@56	2144	cnt_punct++;
ali@56	2145	}
ali@56	2146	}
ali@56	2147	else
ali@56	2148	{
ali@56	2149	/* parity odd */
ali@99	2150	if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
ali@70	2151	!g_utf8_strchr("_-/\".'`",-1,nc) \|\| !nc)
ali@56	2152	{
ali@56	2153	if (pswit[ECHO_SWITCH])
ali@70	2154	g_print("\n%s\n",aline);
ali@56	2155	if (!pswit[OVERVIEW_SWITCH])
ali@70	2156	g_print(" Line %ld column %ld - "
ali@56	2157	"Wrongspaced singlequotes?\n",
ali@70	2158	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2159	else
ali@56	2160	cnt_punct++;
ali@56	2161	}
ali@56	2162	}
ali@56	2163	}
ali@56	2164	}
ali@56	2165	}
ali@56	2166	}
ali@56	2167
ali@55	2168	/*
ali@57	2169	* check_for_double_punctuation:
ali@57	2170	*
ali@57	2171	* Look for double punctuation like ,. or ,,
ali@57	2172	* Thanks to DW for the suggestion!
ali@57	2173	* In books with references, ".," and ".;" are common
ali@57	2174	* e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57	2175	* OTOH, from my initial tests, there are also fairly
ali@57	2176	* common errors. What to do? Make these cases paranoid?
ali@57	2177	* ".," is the most common, so warnings->dotcomma is used
ali@57	2178	* to suppress detailed reporting if it occurs often.
ali@57	2179	*/
ali@57	2180	void check_for_double_punctuation(const char aline,struct warnings warnings)
ali@57	2181	{
ali@70	2182	const char *s;
ali@70	2183	gunichar c,nc;
ali@70	2184	nc=g_utf8_get_char(aline);
ali@70	2185	for (s=aline;*s;s=g_utf8_next_char(s))
ali@57	2186	{
ali@70	2187	c=nc;
ali@70	2188	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@57	2189	/* for each punctuation character in the line */
ali@70	2190	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
ali@70	2191	g_utf8_strchr(".?!,;:",-1,nc))
ali@57	2192	{
ali@57	2193	/* followed by punctuation, it's a query, unless . . . */
ali@70	2194	if (c==nc && (c=='.' \|\| c=='?' \|\| c=='!') \|\|
ali@70	2195	!warnings->dotcomma && c=='.' && nc==',' \|\|
ali@70	2196	warnings->isFrench && g_str_has_prefix(s,",...") \|\|
ali@70	2197	warnings->isFrench && g_str_has_prefix(s,"...,") \|\|
ali@70	2198	warnings->isFrench && g_str_has_prefix(s,";...") \|\|
ali@70	2199	warnings->isFrench && g_str_has_prefix(s,"...;") \|\|
ali@70	2200	warnings->isFrench && g_str_has_prefix(s,":...") \|\|
ali@70	2201	warnings->isFrench && g_str_has_prefix(s,"...:") \|\|
ali@70	2202	warnings->isFrench && g_str_has_prefix(s,"!...") \|\|
ali@70	2203	warnings->isFrench && g_str_has_prefix(s,"...!") \|\|
ali@70	2204	warnings->isFrench && g_str_has_prefix(s,"?...") \|\|
ali@70	2205	warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@57	2206	{
ali@70	2207	if (warnings->isFrench && g_str_has_prefix(s,",...") \|\|
ali@70	2208	warnings->isFrench && g_str_has_prefix(s,"...,") \|\|
ali@70	2209	warnings->isFrench && g_str_has_prefix(s,";...") \|\|
ali@70	2210	warnings->isFrench && g_str_has_prefix(s,"...;") \|\|
ali@70	2211	warnings->isFrench && g_str_has_prefix(s,":...") \|\|
ali@70	2212	warnings->isFrench && g_str_has_prefix(s,"...:") \|\|
ali@70	2213	warnings->isFrench && g_str_has_prefix(s,"!...") \|\|
ali@70	2214	warnings->isFrench && g_str_has_prefix(s,"...!") \|\|
ali@70	2215	warnings->isFrench && g_str_has_prefix(s,"?...") \|\|
ali@70	2216	warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@70	2217	{
ali@70	2218	s+=4;
ali@70	2219	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70	2220	}
ali@57	2221	; /* do nothing for .. !! and ?? which can be legit */
ali@57	2222	}
ali@57	2223	else
ali@57	2224	{
ali@57	2225	if (pswit[ECHO_SWITCH])
ali@70	2226	g_print("\n%s\n",aline);
ali@57	2227	if (!pswit[OVERVIEW_SWITCH])
ali@70	2228	g_print(" Line %ld column %ld - Double punctuation?\n",
ali@70	2229	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@57	2230	else
ali@57	2231	cnt_punct++;
ali@57	2232	}
ali@57	2233	}
ali@57	2234	}
ali@57	2235	}
ali@57	2236
ali@57	2237	/*
ali@58	2238	* check_for_spaced_quotes:
ali@58	2239	*/
ali@58	2240	void check_for_spaced_quotes(const char *aline)
ali@58	2241	{
ali@92	2242	int i;
ali@58	2243	const char s,t;
ali@92	2244	const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
ali@92	2245	CHAR_RS_QUOTE};
ali@92	2246	GString *pattern;
ali@58	2247	s=aline;
ali@58	2248	while ((t=strstr(s," \" ")))
ali@58	2249	{
ali@58	2250	if (pswit[ECHO_SWITCH])
ali@70	2251	g_print("\n%s\n",aline);
ali@58	2252	if (!pswit[OVERVIEW_SWITCH])
ali@70	2253	g_print(" Line %ld column %ld - Spaced doublequote?\n",
ali@70	2254	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58	2255	else
ali@58	2256	cnt_punct++;
ali@70	2257	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58	2258	}
ali@92	2259	pattern=g_string_new(NULL);
ali@92	2260	for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
ali@58	2261	{
ali@92	2262	g_string_assign(pattern," ");
ali@92	2263	g_string_append_unichar(pattern,single_quotes[i]);
ali@92	2264	g_string_append_c(pattern,' ');
ali@92	2265	s=aline;
ali@92	2266	while ((t=strstr(s,pattern->str)))
ali@92	2267	{
ali@92	2268	if (pswit[ECHO_SWITCH])
ali@92	2269	g_print("\n%s\n",aline);
ali@92	2270	if (!pswit[OVERVIEW_SWITCH])
ali@92	2271	g_print(" Line %ld column %ld - Spaced singlequote?\n",
ali@92	2272	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@92	2273	else
ali@92	2274	cnt_punct++;
ali@92	2275	s=g_utf8_next_char(g_utf8_next_char(t));
ali@92	2276	}
ali@58	2277	}
ali@92	2278	g_string_free(pattern,TRUE);
ali@58	2279	}
ali@58	2280
ali@58	2281	/*
ali@59	2282	* check_for_miscased_genative:
ali@59	2283	*
ali@59	2284	* Check special case of 'S instead of 's at end of word.
ali@59	2285	*/
ali@59	2286	void check_for_miscased_genative(const char *aline)
ali@59	2287	{
ali@59	2288	const char *s;
ali@70	2289	gunichar c,nc,pc;
ali@69	2290	if (!*aline)
ali@69	2291	return;
ali@70	2292	c=g_utf8_get_char(aline);
ali@70	2293	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2294	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@59	2295	{
ali@70	2296	pc=c;
ali@70	2297	c=nc;
ali@70	2298	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@92	2299	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
ali@59	2300	{
ali@59	2301	if (pswit[ECHO_SWITCH])
ali@70	2302	g_print("\n%s\n",aline);
ali@59	2303	if (!pswit[OVERVIEW_SWITCH])
ali@70	2304	g_print(" Line %ld column %ld - Capital \"S\"?\n",
ali@70	2305	linecnt,g_utf8_pointer_to_offset(aline,s)+2);
ali@59	2306	else
ali@59	2307	cnt_punct++;
ali@59	2308	}
ali@59	2309	}
ali@59	2310	}
ali@59	2311
ali@59	2312	/*
ali@60	2313	* check_end_of_line:
ali@60	2314	*
ali@60	2315	* Now check special cases - start and end of line -
ali@60	2316	* for single and double quotes. Start is sometimes [sic]
ali@60	2317	* but better to query it anyway.
ali@60	2318	* While we're here, check for dash at end of line.
ali@60	2319	*/
ali@60	2320	void check_end_of_line(const char aline,struct warnings warnings)
ali@60	2321	{
ali@70	2322	int lbytes;
ali@70	2323	const char *s;
ali@70	2324	gunichar c1,c2;
ali@70	2325	lbytes=strlen(aline);
ali@70	2326	if (g_utf8_strlen(aline,lbytes)>1)
ali@60	2327	{
ali@70	2328	s=g_utf8_prev_char(aline+lbytes);
ali@70	2329	c1=g_utf8_get_char(s);
ali@70	2330	c2=g_utf8_get_char(g_utf8_prev_char(s));
ali@94	2331	if ((CHAR_IS_DQUOTE(c1) \|\| CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
ali@60	2332	{
ali@60	2333	if (pswit[ECHO_SWITCH])
ali@70	2334	g_print("\n%s\n",aline);
ali@60	2335	if (!pswit[OVERVIEW_SWITCH])
ali@70	2336	g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
ali@70	2337	g_utf8_strlen(aline,lbytes));
ali@70	2338	else
ali@70	2339	cnt_punct++;
ali@70	2340	}
ali@70	2341	c1=g_utf8_get_char(aline);
ali@70	2342	c2=g_utf8_get_char(g_utf8_next_char(aline));
ali@92	2343	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
ali@70	2344	{
ali@70	2345	if (pswit[ECHO_SWITCH])
ali@70	2346	g_print("\n%s\n",aline);
ali@70	2347	if (!pswit[OVERVIEW_SWITCH])
ali@70	2348	g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
ali@60	2349	else
ali@60	2350	cnt_punct++;
ali@60	2351	}
ali@60	2352	/*
ali@60	2353	* Dash at end of line may well be legit - paranoid mode only
ali@60	2354	* and don't report em-dash at line-end.
ali@60	2355	*/
ali@60	2356	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@60	2357	{
ali@70	2358	for (s=g_utf8_prev_char(aline+lbytes);
ali@70	2359	s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
ali@60	2360	;
ali@70	2361	if (g_utf8_get_char(s)=='-' &&
ali@70	2362	g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@60	2363	{
ali@60	2364	if (pswit[ECHO_SWITCH])
ali@70	2365	g_print("\n%s\n",aline);
ali@60	2366	if (!pswit[OVERVIEW_SWITCH])
ali@70	2367	g_print(" Line %ld column %ld - "
ali@70	2368	"Hyphen at end of line?\n",
ali@70	2369	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@60	2370	}
ali@60	2371	}
ali@60	2372	}
ali@60	2373	}
ali@60	2374
ali@60	2375	/*
ali@61	2376	* check_for_unspaced_bracket:
ali@61	2377	*
ali@61	2378	* Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@61	2379	* If so, suspect a scanno like "a]most".
ali@61	2380	*/
ali@61	2381	void check_for_unspaced_bracket(const char *aline)
ali@61	2382	{
ali@70	2383	const char *s;
ali@70	2384	gunichar c,nc,pc;
ali@70	2385	c=g_utf8_get_char(aline);
ali@70	2386	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2387	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@61	2388	{
ali@70	2389	pc=c;
ali@70	2390	c=nc;
ali@70	2391	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70	2392	if (!nc)
ali@70	2393	break;
ali@61	2394	/* for each bracket character in the line except 1st & last */
ali@70	2395	if (g_utf8_strchr("{[()]}",-1,c) &&
ali@70	2396	g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
ali@61	2397	{
ali@61	2398	if (pswit[ECHO_SWITCH])
ali@70	2399	g_print("\n%s\n",aline);
ali@61	2400	if (!pswit[OVERVIEW_SWITCH])
ali@70	2401	g_print(" Line %ld column %ld - Unspaced bracket?\n",
ali@70	2402	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@61	2403	else
ali@61	2404	cnt_punct++;
ali@61	2405	}
ali@61	2406	}
ali@61	2407	}
ali@61	2408
ali@61	2409	/*
ali@62	2410	* check_for_unpunctuated_endquote:
ali@62	2411	*/
ali@62	2412	void check_for_unpunctuated_endquote(const char *aline)
ali@62	2413	{
ali@70	2414	const char *s;
ali@70	2415	gunichar c,nc,pc;
ali@94	2416	QuoteClass qc;
ali@70	2417	c=g_utf8_get_char(aline);
ali@70	2418	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2419	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@62	2420	{
ali@70	2421	pc=c;
ali@70	2422	c=nc;
ali@94	2423	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
ali@70	2424	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@62	2425	/* for each character in the line except 1st */
ali@97	2426	if ((qc==CLOSING_QUOTE \|\| qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
ali@62	2427	{
ali@62	2428	if (pswit[ECHO_SWITCH])
ali@70	2429	g_print("\n%s\n",aline);
ali@62	2430	if (!pswit[OVERVIEW_SWITCH])
ali@70	2431	g_print(" Line %ld column %ld - "
ali@70	2432	"endquote missing punctuation?\n",
ali@70	2433	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@62	2434	else
ali@62	2435	cnt_punct++;
ali@62	2436	}
ali@62	2437	}
ali@62	2438	}
ali@62	2439
ali@62	2440	/*
ali@63	2441	* check_for_html_tag:
ali@63	2442	*
ali@63	2443	* Check for <HTML TAG>.
ali@63	2444	*
ali@63	2445	* If there is a < in the line, followed at some point
ali@63	2446	* by a > then we suspect HTML.
ali@63	2447	*/
ali@63	2448	void check_for_html_tag(const char *aline)
ali@63	2449	{
ali@63	2450	const char open,close;
ali@70	2451	gchar *tag;
ali@70	2452	open=strchr(aline,'<');
ali@63	2453	if (open)
ali@63	2454	{
ali@70	2455	close=strchr(g_utf8_next_char(open),'>');
ali@63	2456	if (close)
ali@63	2457	{
ali@70	2458	if (pswit[ECHO_SWITCH])
ali@70	2459	g_print("\n%s\n",aline);
ali@70	2460	if (!pswit[OVERVIEW_SWITCH])
ali@63	2461	{
ali@70	2462	tag=g_strndup(open,close-open+1);
ali@70	2463	g_print(" Line %ld column %ld - HTML Tag? %s \n",
ali@70	2464	linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
ali@70	2465	g_free(tag);
ali@63	2466	}
ali@70	2467	else
ali@70	2468	cnt_html++;
ali@63	2469	}
ali@63	2470	}
ali@63	2471	}
ali@63	2472
ali@63	2473	/*
ali@64	2474	* check_for_html_entity:
ali@64	2475	*
ali@64	2476	* Check for &symbol; HTML.
ali@64	2477	*
ali@64	2478	* If there is a & in the line, followed at
ali@64	2479	* some point by a ; then we suspect HTML.
ali@64	2480	*/
ali@64	2481	void check_for_html_entity(const char *aline)
ali@64	2482	{
ali@64	2483	const char s,amp,*scolon;
ali@70	2484	gchar *entity;
ali@70	2485	amp=strchr(aline,'&');
ali@64	2486	if (amp)
ali@64	2487	{
ali@70	2488	scolon=strchr(amp,';');
ali@64	2489	if (scolon)
ali@64	2490	{
ali@70	2491	for (s=amp;s<scolon;s=g_utf8_next_char(s))
ali@70	2492	if (g_utf8_get_char(s)==CHAR_SPACE)
ali@70	2493	break; /* Don't report "Jones & Son;" */
ali@70	2494	if (s>=scolon)
ali@64	2495	{
ali@64	2496	if (pswit[ECHO_SWITCH])
ali@70	2497	g_print("\n%s\n",aline);
ali@64	2498	if (!pswit[OVERVIEW_SWITCH])
ali@70	2499	{
ali@70	2500	entity=g_strndup(amp,scolon-amp+1);
ali@70	2501	g_print(" Line %ld column %d - HTML symbol? %s \n",
ali@70	2502	linecnt,(int)(amp-aline)+1,entity);
ali@70	2503	g_free(entity);
ali@70	2504	}
ali@64	2505	else
ali@64	2506	cnt_html++;
ali@64	2507	}
ali@64	2508	}
ali@64	2509	}
ali@64	2510	}
ali@64	2511
ali@65	2512	/*
ali@66	2513	* check_for_omitted_punctuation:
ali@66	2514	*
ali@66	2515	* Check for omitted punctuation at end of paragraph by working back
ali@66	2516	* through prevline. DW.
ali@66	2517	* Need to check this only for "normal" paras.
ali@66	2518	* So what is a "normal" para?
ali@66	2519	* Not normal if one-liner (chapter headings, etc.)
ali@66	2520	* Not normal if doesn't contain at least one locase letter
ali@66	2521	* Not normal if starts with space
ali@66	2522	*/
ali@66	2523	void check_for_omitted_punctuation(const char *prevline,
ali@66	2524	struct line_properties *last,int start_para_line)
ali@66	2525	{
ali@70	2526	gboolean letter_on_line=FALSE;
ali@66	2527	const char *s;
ali@92	2528	gunichar c;
ali@94	2529	gboolean closing_quote;
ali@70	2530	for (s=prevline;*s;s=g_utf8_next_char(s))
ali@70	2531	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70	2532	{
ali@70	2533	letter_on_line=TRUE;
ali@70	2534	break;
ali@70	2535	}
ali@66	2536	/*
ali@66	2537	* This next "if" is a problem.
ali@66	2538	* If we say "start_para_line <= linecnt - 1", that includes
ali@66	2539	* one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@66	2540	* If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@66	2541	* misses genuine one-line paragraphs.
ali@66	2542	*/
ali@70	2543	if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
ali@70	2544	g_utf8_get_char(prevline)>CHAR_SPACE)
ali@66	2545	{
ali@92	2546	s=prevline+strlen(prevline);
ali@92	2547	do
ali@92	2548	{
ali@92	2549	s=g_utf8_prev_char(s);
ali@92	2550	c=g_utf8_get_char(s);
ali@94	2551	if (QUOTE_CLASS(c)==CLOSING_QUOTE \|\| QUOTE_CLASS(c)==NEUTRAL_QUOTE)
ali@94	2552	closing_quote=TRUE;
ali@94	2553	else
ali@94	2554	closing_quote=FALSE;
ali@94	2555	} while (closing_quote && s>prevline);
ali@70	2556	for (;s>prevline;s=g_utf8_prev_char(s))
ali@66	2557	{
ali@70	2558	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@66	2559	{
ali@66	2560	if (pswit[ECHO_SWITCH])
ali@70	2561	g_print("\n%s\n",prevline);
ali@66	2562	if (!pswit[OVERVIEW_SWITCH])
ali@70	2563	g_print(" Line %ld column %ld - "
ali@66	2564	"No punctuation at para end?\n",
ali@70	2565	linecnt-1,g_utf8_strlen(prevline,-1));
ali@66	2566	else
ali@66	2567	cnt_punct++;
ali@66	2568	break;
ali@66	2569	}
ali@97	2570	if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
ali@66	2571	break;
ali@66	2572	}
ali@66	2573	}
ali@66	2574	}
ali@66	2575
ali@69	2576	gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
ali@69	2577	{
ali@69	2578	const char *word=key;
ali@69	2579	int *dupcnt=value;
ali@69	2580	if (*dupcnt)
ali@70	2581	g_print("\nNote: Queried word %s was duplicated %d times\n",
ali@69	2582	word,*dupcnt);
ali@69	2583	return FALSE;
ali@69	2584	}
ali@69	2585
ali@70	2586	void print_as_windows_1252(const char *string)
ali@70	2587	{
ali@70	2588	gsize inbytes,outbytes;
ali@70	2589	gchar buf,bp;
ali@86	2590	static GIConv converter=(GIConv)-1;
ali@70	2591	if (!string)
ali@70	2592	{
ali@70	2593	if (converter!=(GIConv)-1)
ali@70	2594	g_iconv_close(converter);
ali@70	2595	converter=(GIConv)-1;
ali@70	2596	return;
ali@70	2597	}
ali@86	2598	if (converter==(GIConv)-1)
ali@70	2599	converter=g_iconv_open("WINDOWS-1252","UTF-8");
ali@70	2600	if (converter!=(GIConv)-1)
ali@70	2601	{
ali@70	2602	inbytes=outbytes=strlen(string);
ali@70	2603	bp=buf=g_malloc(outbytes+1);
ali@70	2604	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
ali@70	2605	*bp='\0';
ali@70	2606	fputs(buf,stdout);
ali@70	2607	g_free(buf);
ali@70	2608	}
ali@70	2609	else
ali@70	2610	fputs(string,stdout);
ali@70	2611	}
ali@70	2612
/*
 * print_as_utf_8:
 *
 * Write the string to stdout unchanged (no character-set conversion).
 * Counterpart of print_as_windows_1252().
 */
void print_as_utf_8(const char *string)
{
    (void)fputs(string,stdout);
}
ali@72	2617
ali@66	2618	/*
ali@41	2619	* procfile:
ali@41	2620	*
ali@41	2621	* Process one file.
ali@41	2622	*/
ali@69	2623	void procfile(const char *filename)
ali@41	2624	{
ali@65	2625	const char *s;
ali@69	2626	gchar parastart=NULL; / first line of current para */
ali@69	2627	gchar etext,aline;
ali@69	2628	gchar *etext_ptr;
ali@69	2629	GError *err=NULL;
ali@41	2630	struct first_pass_results *first_pass_results;
ali@42	2631	struct warnings *warnings;
ali@43	2632	struct counters counters={0};
ali@45	2633	struct line_properties last={0};
ali@56	2634	struct parities parities={0};
ali@69	2635	struct pending pending={0};
ali@69	2636	gboolean isemptyline;
ali@68	2637	long start_para_line=0;
ali@69	2638	gboolean isnewpara=FALSE,enddash=FALSE;
ali@45	2639	last.start=CHAR_SPACE;
ali@68	2640	linecnt=checked_linecnt=0;
ali@69	2641	etext=read_etext(filename,&err);
ali@69	2642	if (!etext)
ali@41	2643	{
ali@68	2644	if (pswit[STDOUT_SWITCH])
ali@69	2645	fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
ali@68	2646	else
ali@69	2647	fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
ali@41	2648	exit(1);
ali@41	2649	}
ali@70	2650	g_print("\n\nFile: %s\n\n",filename);
ali@69	2651	first_pass_results=first_pass(etext);
ali@42	2652	warnings=report_first_pass(first_pass_results);
ali@69	2653	qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
ali@69	2654	qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@40	2655	/*
ali@40	2656	* Here we go with the main pass. Hold onto yer hat!
ali@40	2657	*/
ali@65	2658	linecnt=0;
ali@69	2659	etext_ptr=etext;
ali@101	2660	while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
ali@40	2661	{
ali@68	2662	linecnt++;
ali@68	2663	if (linecnt==1)
ali@69	2664	isnewpara=TRUE;
ali@70	2665	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
ali@40	2666	continue; // skip DP page separators completely
ali@68	2667	if (linecnt<first_pass_results->firstline \|\|
ali@41	2668	(first_pass_results->footerline>0 &&
ali@41	2669	linecnt>first_pass_results->footerline))
ali@40	2670	{
ali@68	2671	if (pswit[HEADER_SWITCH])
ali@40	2672	{
ali@70	2673	if (g_str_has_prefix(aline,"Title:"))
ali@70	2674	g_print(" %s\n",aline);
ali@70	2675	if (g_str_has_prefix(aline,"Author:"))
ali@70	2676	g_print(" %s\n",aline);
ali@70	2677	if (g_str_has_prefix(aline,"Release Date:"))
ali@70	2678	g_print(" %s\n",aline);
ali@70	2679	if (g_str_has_prefix(aline,"Edition:"))
ali@70	2680	g_print(" %s\n\n",aline);
ali@40	2681	}
ali@68	2682	continue; /* skip through the header */
ali@40	2683	}
ali@68	2684	checked_linecnt++;
ali@65	2685	print_pending(aline,parastart,&pending);
ali@98	2686	isemptyline=analyse_quotes(aline,&counters);
ali@68	2687	if (isnewpara && !isemptyline)
ali@40	2688	{
ali@40	2689	/* This line is the start of a new paragraph. */
ali@68	2690	start_para_line=linecnt;
ali@40	2691	/* Capture its first line in case we want to report it later. */
ali@69	2692	g_free(parastart);
ali@69	2693	parastart=g_strdup(aline);
ali@56	2694	memset(&parities,0,sizeof(parities)); /* restart the quote count */
ali@68	2695	s=aline;
ali@70	2696	while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
ali@70	2697	!g_unichar_isdigit(g_utf8_get_char(s)))
ali@70	2698	s=g_utf8_next_char(s);
ali@70	2699	if (g_unichar_islower(g_utf8_get_char(s)))
ali@40	2700	{
ali@40	2701	/* and its first letter is lowercase */
ali@68	2702	if (pswit[ECHO_SWITCH])
ali@70	2703	g_print("\n%s\n",aline);
ali@68	2704	if (!pswit[OVERVIEW_SWITCH])
ali@70	2705	g_print(" Line %ld column %ld - "
ali@40	2706	"Paragraph starts with lower-case\n",
ali@70	2707	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@68	2708	else
ali@68	2709	cnt_punct++;
ali@40	2710	}
ali@69	2711	isnewpara=FALSE; /* Signal the end of new para processing. */
ali@40	2712	}
ali@68	2713	/* Check for an em-dash broken at line end. */
ali@70	2714	if (enddash && g_utf8_get_char(aline)=='-')
ali@40	2715	{
ali@68	2716	if (pswit[ECHO_SWITCH])
ali@70	2717	g_print("\n%s\n",aline);
ali@68	2718	if (!pswit[OVERVIEW_SWITCH])
ali@70	2719	g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@68	2720	else
ali@68	2721	cnt_punct++;
ali@40	2722	}
ali@69	2723	enddash=FALSE;
ali@70	2724	for (s=g_utf8_prev_char(aline+strlen(aline));
ali@70	2725	g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
ali@40	2726	;
ali@70	2727	if (s>=aline && g_utf8_get_char(s)=='-')
ali@69	2728	enddash=TRUE;
ali@67	2729	check_for_control_characters(aline);
ali@68	2730	if (warnings->bin)
ali@44	2731	check_for_odd_characters(aline,warnings,isemptyline);
ali@68	2732	if (warnings->longline)
ali@45	2733	check_for_long_line(aline);
ali@68	2734	if (warnings->shortline)
ali@45	2735	check_for_short_line(aline,&last);
ali@68	2736	last.blen=last.len;
ali@70	2737	last.len=g_utf8_strlen(aline,-1);
ali@70	2738	last.start=g_utf8_get_char(aline);
ali@46	2739	check_for_starting_punctuation(aline);
ali@68	2740	if (warnings->dash)
ali@40	2741	{
ali@47	2742	check_for_spaced_emdash(aline);
ali@47	2743	check_for_spaced_dash(aline);
ali@40	2744	}
ali@48	2745	check_for_unmarked_paragraphs(aline);
ali@49	2746	check_for_jeebies(aline);
ali@50	2747	check_for_mta_from(aline);
ali@51	2748	check_for_orphan_character(aline);
ali@52	2749	check_for_pling_scanno(aline);
ali@53	2750	check_for_extra_period(aline,warnings);
ali@54	2751	check_for_following_punctuation(aline);
ali@55	2752	check_for_typos(aline,warnings);
ali@56	2753	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57	2754	check_for_double_punctuation(aline,warnings);
ali@58	2755	check_for_spaced_quotes(aline);
ali@59	2756	check_for_miscased_genative(aline);
ali@60	2757	check_end_of_line(aline,warnings);
ali@61	2758	check_for_unspaced_bracket(aline);
ali@68	2759	if (warnings->endquote)
ali@62	2760	check_for_unpunctuated_endquote(aline);
ali@63	2761	check_for_html_tag(aline);
ali@64	2762	check_for_html_entity(aline);
ali@68	2763	if (isemptyline)
ali@40	2764	{
ali@65	2765	check_for_mismatched_quotes(&counters,&pending);
ali@93	2766	counters_reset(&counters);
ali@40	2767	/* let the next iteration know that it's starting a new para */
ali@69	2768	isnewpara=TRUE;
ali@69	2769	if (prevline)
ali@69	2770	check_for_omitted_punctuation(prevline,&last,start_para_line);
ali@40	2771	}
ali@69	2772	g_free(prevline);
ali@69	2773	prevline=g_strdup(aline);
ali@0	2774	}
ali@93	2775	linecnt++;
ali@93	2776	check_for_mismatched_quotes(&counters,&pending);
ali@93	2777	print_pending(NULL,parastart,&pending);
ali@93	2778	reset_pending(&pending);
ali@69	2779	if (prevline)
ali@69	2780	{
ali@69	2781	g_free(prevline);
ali@69	2782	prevline=NULL;
ali@69	2783	}
ali@69	2784	g_free(parastart);
ali@69	2785	g_free(prevline);
ali@69	2786	g_free(etext);
ali@79	2787	if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
ali@69	2788	g_tree_foreach(qword,report_duplicate_queries,NULL);
ali@69	2789	g_tree_unref(qword);
ali@69	2790	g_tree_unref(qperiod);
ali@92	2791	counters_destroy(&counters);
ali@70	2792	g_set_print_handler(NULL);
ali@70	2793	print_as_windows_1252(NULL);
ali@71	2794	if (pswit[MARKUP_SWITCH])
ali@71	2795	loseentities(NULL);
ali@0	2796	}
ali@0	2797
ali@40	2798	/*
ali@40	2799	* flgets:
ali@40	2800	*
ali@101	2801	* Get one line from the input text. The setting of newlines has the following
ali@101	2802	* effect:
ali@101	2803	*
ali@101	2804	* DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
ali@101	2805	*
ali@101	2806	* OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
ali@101	2807	* the newline character.
ali@101	2808	*
ali@101	2809	* UNIX_NEWLINES: Check for the presence of CRs.
ali@101	2810	*
ali@101	2811	* In all cases, check that the last line is correctly terminated.
ali@40	2812	*
ali@40	2813	* Returns: a pointer to the line.
ali@40	2814	*/
ali@101	2815	char flgets(char *etext,long lcnt,int newlines)
ali@0	2816	{
ali@70	2817	gunichar c;
ali@69	2818	gboolean isCR=FALSE;
ali@69	2819	char theline=etext;
ali@70	2820	char *eos=theline;
ali@70	2821	gchar *s;
ali@70	2822	for (;;)
ali@40	2823	{
ali@70	2824	c=g_utf8_get_char(*etext);
ali@99	2825	if (!c)
ali@99	2826	{
ali@99	2827	if (*etext==theline)
ali@99	2828	return NULL;
ali@99	2829	else if (pswit[LINE_END_SWITCH])
ali@99	2830	{
ali@99	2831	if (pswit[ECHO_SWITCH])
ali@99	2832	{
ali@99	2833	s=g_strndup(theline,eos-theline);
ali@99	2834	g_print("\n%s\n",s);
ali@99	2835	g_free(s);
ali@99	2836	}
ali@99	2837	if (!pswit[OVERVIEW_SWITCH])
ali@101	2838	{
ali@101	2839	if (newlines==OS9_NEWLINES)
ali@101	2840	g_print(" Line %ld - No CR?\n",lcnt);
ali@101	2841	else
ali@101	2842	{
ali@101	2843	/* There may, or may not, have been a CR */
ali@101	2844	g_print(" Line %ld - No LF?\n",lcnt);
ali@101	2845	}
ali@101	2846	}
ali@99	2847	else
ali@99	2848	cnt_lineend++;
ali@99	2849	}
ali@99	2850	break;
ali@99	2851	}
ali@70	2852	etext=g_utf8_next_char(etext);
ali@40	2853	/* either way, it's end of line */
ali@69	2854	if (c=='\n')
ali@40	2855	{
ali@101	2856	if (newlines==DOS_NEWLINES && !isCR)
ali@40	2857	{
ali@40	2858	/* Error - a LF without a preceding CR */
ali@68	2859	if (pswit[LINE_END_SWITCH])
ali@40	2860	{
ali@68	2861	if (pswit[ECHO_SWITCH])
ali@70	2862	{
ali@70	2863	s=g_strndup(theline,eos-theline);
ali@70	2864	g_print("\n%s\n",s);
ali@70	2865	g_free(s);
ali@70	2866	}
ali@68	2867	if (!pswit[OVERVIEW_SWITCH])
ali@70	2868	g_print(" Line %ld - No CR?\n",lcnt);
ali@68	2869	else
ali@68	2870	cnt_lineend++;
ali@40	2871	}
ali@40	2872	}
ali@101	2873	break;
ali@40	2874	}
ali@69	2875	if (c=='\r')
ali@40	2876	{
ali@101	2877	if (newlines==OS9_NEWLINES)
ali@101	2878	break;
ali@101	2879	if (isCR \|\| newlines==UNIX_NEWLINES)
ali@40	2880	{
ali@68	2881	if (pswit[LINE_END_SWITCH])
ali@40	2882	{
ali@68	2883	if (pswit[ECHO_SWITCH])
ali@70	2884	{
ali@70	2885	s=g_strndup(theline,eos-theline);
ali@70	2886	g_print("\n%s\n",s);
ali@70	2887	g_free(s);
ali@70	2888	}
ali@68	2889	if (!pswit[OVERVIEW_SWITCH])
ali@101	2890	{
ali@101	2891	if (newlines==UNIX_NEWLINES)
ali@101	2892	g_print(" Line %ld column %ld - Embedded CR?\n",
ali@101	2893	lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
ali@101	2894	else
ali@101	2895	g_print(" Line %ld - Two successive CRs?\n",
ali@101	2896	lcnt);
ali@101	2897	}
ali@68	2898	else
ali@68	2899	cnt_lineend++;
ali@40	2900	}
ali@101	2901	if (newlines==UNIX_NEWLINES)
ali@101	2902	*eos=' ';
ali@40	2903	}
ali@101	2904	if (newlines==DOS_NEWLINES)
ali@101	2905	isCR=TRUE;
ali@40	2906	}
ali@68	2907	else
ali@40	2908	{
ali@68	2909	if (pswit[LINE_END_SWITCH] && isCR)
ali@40	2910	{
ali@68	2911	if (pswit[ECHO_SWITCH])
ali@70	2912	{
ali@70	2913	s=g_strndup(theline,eos-theline);
ali@70	2914	g_print("\n%s\n",s);
ali@70	2915	g_free(s);
ali@70	2916	}
ali@68	2917	if (!pswit[OVERVIEW_SWITCH])
ali@70	2918	g_print(" Line %ld column %ld - CR without LF?\n",
ali@70	2919	lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
ali@68	2920	else
ali@68	2921	cnt_lineend++;
ali@70	2922	*eos=' ';
ali@40	2923	}
ali@69	2924	isCR=FALSE;
ali@70	2925	eos=g_utf8_next_char(eos);
ali@40	2926	}
ali@69	2927	}
ali@70	2928	*eos='\0';
ali@0	2929	if (pswit[MARKUP_SWITCH])
ali@68	2930	postprocess_for_HTML(theline);
ali@0	2931	if (pswit[DP_SWITCH])
ali@68	2932	postprocess_for_DP(theline);
ali@40	2933	return theline;
ali@0	2934	}
ali@0	2935
ali@40	2936	/*
ali@40	2937	* mixdigit:
ali@40	2938	*
ali@40	2939	* Takes a "word" as a parameter, and checks whether it
ali@40	2940	* contains a mixture of alpha and digits. Generally, this is an
ali@40	2941	* error, but may not be for cases like 4th or L5 12s. 3d.
ali@40	2942	*
ali@70	2943	* Returns: TRUE iff an is error found.
ali@40	2944	*/
ali@70	2945	gboolean mixdigit(const char *checkword)
ali@0	2946	{
ali@70	2947	gboolean wehaveadigit,wehavealetter,query;
ali@70	2948	const char s,nondigit;
ali@70	2949	wehaveadigit=wehavealetter=query=FALSE;
ali@70	2950	for (s=checkword;*s;s=g_utf8_next_char(s))
ali@70	2951	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70	2952	wehavealetter=TRUE;
ali@70	2953	else if (g_unichar_isdigit(g_utf8_get_char(s)))
ali@70	2954	wehaveadigit=TRUE;
ali@40	2955	if (wehaveadigit && wehavealetter)
ali@40	2956	{
ali@40	2957	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
ali@70	2958	query=TRUE;
ali@70	2959	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
ali@70	2960	nondigit=g_utf8_next_char(nondigit))
ali@68	2961	;
ali@68	2962	/* digits, ending in st, rd, nd, th of either case */
ali@70	2963	if (!g_ascii_strcasecmp(nondigit,"st") \|\|
ali@70	2964	!g_ascii_strcasecmp(nondigit,"rd") \|\|
ali@70	2965	!g_ascii_strcasecmp(nondigit,"nd") \|\|
ali@70	2966	!g_ascii_strcasecmp(nondigit,"th"))
ali@70	2967	query=FALSE;
ali@70	2968	if (!g_ascii_strcasecmp(nondigit,"sts") \|\|
ali@70	2969	!g_ascii_strcasecmp(nondigit,"rds") \|\|
ali@70	2970	!g_ascii_strcasecmp(nondigit,"nds") \|\|
ali@70	2971	!g_ascii_strcasecmp(nondigit,"ths"))
ali@70	2972	query=FALSE;
ali@70	2973	if (!g_ascii_strcasecmp(nondigit,"stly") \|\|
ali@70	2974	!g_ascii_strcasecmp(nondigit,"rdly") \|\|
ali@70	2975	!g_ascii_strcasecmp(nondigit,"ndly") \|\|
ali@70	2976	!g_ascii_strcasecmp(nondigit,"thly"))
ali@70	2977	query=FALSE;
ali@68	2978	/* digits, ending in l, L, s or d */
ali@70	2979	if (!g_ascii_strcasecmp(nondigit,"l") \|\| !strcmp(nondigit,"s") \|\|
ali@70	2980	!strcmp(nondigit,"d"))
ali@70	2981	query=FALSE;
ali@68	2982	/*
ali@40	2983	* L at the start of a number, representing Britsh pounds, like L500.
ali@70	2984	* This is cute. We know the current word is mixed digit. If the first
ali@68	2985	* letter is L, there must be at least one digit following. If both
ali@68	2986	* digits and letters follow, we have a genuine error, else we have a
ali@68	2987	* capital L followed by digits, and we accept that as a non-error.
ali@40	2988	*/
ali@70	2989	if (g_utf8_get_char(checkword)=='L' &&
ali@70	2990	!mixdigit(g_utf8_next_char(checkword)))
ali@70	2991	query=FALSE;
ali@40	2992	}
ali@40	2993	return query;
ali@0	2994	}
ali@0	2995
ali@40	2996	/*
ali@40	2997	* getaword:
ali@40	2998	*
ali@69	2999	* Extracts the first/next "word" from the line, and returns it.
ali@69	3000	* A word is defined as one English word unit--or at least that's the aim.
ali@69	3001	* "ptr" is advanced to the position in the line where we will start
ali@69	3002	* looking for the next word.
ali@40	3003	*
ali@69	3004	* Returns: A newly-allocated string.
ali@40	3005	*/
ali@69	3006	gchar getaword(const char *ptr)
ali@0	3007	{
ali@70	3008	const char s,t;
ali@69	3009	GString *word;
ali@70	3010	gunichar c,pc;
ali@69	3011	word=g_string_new(NULL);
ali@70	3012	for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
ali@70	3013	!g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
ali@70	3014	*ptr;ptr=g_utf8_next_char(*ptr))
ali@100	3015	{
ali@100	3016	/* Handle exceptions for footnote markers like [1] */
ali@100	3017	if (g_utf8_get_char(*ptr)=='[')
ali@100	3018	{
ali@100	3019	g_string_append_c(word,'[');
ali@100	3020	s=g_utf8_next_char(*ptr);
ali@100	3021	for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
ali@100	3022	g_string_append_unichar(word,g_utf8_get_char(s));
ali@100	3023	if (g_utf8_get_char(s)==']')
ali@100	3024	{
ali@100	3025	g_string_append_c(word,']');
ali@100	3026	*ptr=g_utf8_next_char(s);
ali@100	3027	return g_string_free(word,FALSE);
ali@100	3028	}
ali@100	3029	else
ali@100	3030	g_string_truncate(word,0);
ali@100	3031	}
ali@100	3032	}
ali@40	3033	/*
ali@40	3034	* Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40	3035	* Especially yucky is the case of L1,000
ali@40	3036	* This section looks for a pattern of characters including a digit
ali@40	3037	* followed by a comma or period followed by one or more digits.
ali@40	3038	* If found, it returns this whole pattern as a word; otherwise we discard
ali@40	3039	* the results and resume our normal programming.
ali@40	3040	*/
ali@69	3041	s=*ptr;
ali@70	3042	for (;g_unichar_isdigit(g_utf8_get_char(s)) \|\|
ali@70	3043	g_unichar_isalpha(g_utf8_get_char(s)) \|\|
ali@70	3044	g_utf8_get_char(s)==',' \|\| g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
ali@70	3045	g_string_append_unichar(word,g_utf8_get_char(s));
ali@82	3046	if (word->len)
ali@40	3047	{
ali@82	3048	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
ali@40	3049	{
ali@82	3050	c=g_utf8_get_char(t);
ali@82	3051	pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@82	3052	if ((c=='.' \|\| c==',') && g_unichar_isdigit(pc))
ali@82	3053	{
ali@82	3054	*ptr=s;
ali@82	3055	return g_string_free(word,FALSE);
ali@82	3056	}
ali@40	3057	}
ali@40	3058	}
ali@0	3059	/* we didn't find a punctuated number - do the regular getword thing */
ali@69	3060	g_string_truncate(word,0);
ali@92	3061	c=g_utf8_get_char(*ptr);
ali@92	3062	for (;g_unichar_isdigit(c) \|\| g_unichar_isalpha(c) \|\| CHAR_IS_APOSTROPHE(c);
ali@92	3063	ptr=g_utf8_next_char(ptr),c=g_utf8_get_char(*ptr))
ali@92	3064	g_string_append_unichar(word,c);
ali@69	3065	return g_string_free(word,FALSE);
ali@0	3066	}
ali@0	3067
ali@40	3068	/*
ali@40	3069	* isroman:
ali@40	3070	*
ali@40	3071	* Is this word a Roman Numeral?
ali@40	3072	*
ali@40	3073	* It doesn't actually validate that the number is a valid Roman Numeral--for
ali@40	3074	* example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
ali@40	3075	* what we're here to do. If it passes this, it LOOKS like a Roman numeral.
ali@40	3076	* Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
ali@40	3077	* expressions thereof, except when it came to taxes. Allow any number of M,
ali@40	3078	* an optional D, an optional CM or CD, any number of optional Cs, an optional
ali@40	3079	* XL or an optional XC, an optional IX or IV, an optional V and any number
ali@40	3080	* of optional Is.
ali@40	3081	*/
ali@69	3082	gboolean isroman(const char *t)
ali@0	3083	{
ali@69	3084	const char *s;
ali@40	3085	if (!t \|\| !*t)
ali@69	3086	return FALSE;
ali@40	3087	s=t;
ali@70	3088	while (g_utf8_get_char(t)=='m' && *t)
ali@40	3089	t++;
ali@70	3090	if (g_utf8_get_char(t)=='d')
ali@40	3091	t++;
ali@70	3092	if (g_str_has_prefix(t,"cm"))
ali@40	3093	t+=2;
ali@70	3094	if (g_str_has_prefix(t,"cd"))
ali@40	3095	t+=2;
ali@70	3096	while (g_utf8_get_char(t)=='c' && *t)
ali@40	3097	t++;
ali@70	3098	if (g_str_has_prefix(t,"xl"))
ali@40	3099	t+=2;
ali@70	3100	if (g_str_has_prefix(t,"xc"))
ali@40	3101	t+=2;
ali@70	3102	if (g_utf8_get_char(t)=='l')
ali@40	3103	t++;
ali@70	3104	while (g_utf8_get_char(t)=='x' && *t)
ali@40	3105	t++;
ali@70	3106	if (g_str_has_prefix(t,"ix"))
ali@40	3107	t+=2;
ali@70	3108	if (g_str_has_prefix(t,"iv"))
ali@40	3109	t+=2;
ali@70	3110	if (g_utf8_get_char(t)=='v')
ali@40	3111	t++;
ali@70	3112	while (g_utf8_get_char(t)=='i' && *t)
ali@40	3113	t++;
ali@40	3114	return !*t;
ali@0	3115	}
ali@0	3116
ali@40	3117	/*
ali@40	3118	* postprocess_for_DP:
ali@40	3119	*
ali@40	3120	* Invoked with the -d switch from flgets().
ali@40	3121	* It simply "removes" from the line a hard-coded set of common
ali@40	3122	* DP-specific tags, so that the line passed to the main routine has
ali@40	3123	* been pre-cleaned of DP markup.
ali@40	3124	*/
ali@0	3125	void postprocess_for_DP(char *theline)
ali@0	3126	{
ali@40	3127	char s,t;
ali@0	3128	int i;
ali@0	3129	if (!*theline)
ali@68	3130	return;
ali@40	3131	for (i=0;*DPmarkup[i];i++)
ali@70	3132	while ((s=strstr(theline,DPmarkup[i])))
ali@40	3133	{
ali@68	3134	t=s+strlen(DPmarkup[i]);
ali@70	3135	memmove(s,t,strlen(t)+1);
ali@40	3136	}
ali@0	3137	}
ali@0	3138
/*
 * postprocess_for_HTML:
 *
 * Called from flgets() when MARKUP_SWITCH is set.
 * Strips tags from the line by calling losemarkup() until it reports
 * that nothing more was removed, then hands the line to loseentities()
 * to replace HTML entities, so that the checks see a line that has
 * been pre-cleaned of HTML.
 */
void postprocess_for_HTML(char *theline)
{
    char *removed;

    do
        removed=losemarkup(theline);
    while (removed);
    loseentities(theline);
}
ali@0	3154
ali@0	3155	char losemarkup(char theline)
ali@0	3156	{
ali@40	3157	char s,t;
ali@0	3158	int i;
ali@70	3159	s=strchr(theline,'<');
ali@70	3160	t=s?strchr(s,'>'):NULL;
ali@40	3161	if (!s \|\| !t)
ali@40	3162	return NULL;
ali@40	3163	for (i=0;*markup[i];i++)
ali@70	3164	if (tagcomp(g_utf8_next_char(s),markup[i]))
ali@40	3165	{
ali@70	3166	t=g_utf8_next_char(t);
ali@70	3167	memmove(s,t,strlen(t)+1);
ali@70	3168	return s;
ali@68	3169	}
ali@40	3170	/* It's an unrecognized <xxx>. */
ali@40	3171	return NULL;
ali@0	3172	}
ali@0	3173
ali@71	3174	void loseentities(char *theline)
ali@0	3175	{
ali@0	3176	int i;
ali@71	3177	gsize nb;
ali@71	3178	char amp,scolon;
ali@71	3179	gchar s,t;
ali@71	3180	gunichar c;
ali@71	3181	GTree *entities=NULL;
ali@86	3182	static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
ali@71	3183	if (!theline)
ali@40	3184	{
ali@71	3185	if (entities)
ali@71	3186	g_tree_destroy(entities);
ali@71	3187	entities=NULL;
ali@86	3188	if (translit!=(GIConv)-1)
ali@71	3189	g_iconv_close(translit);
ali@71	3190	translit=(GIConv)-1;
ali@86	3191	if (to_utf8!=(GIConv)-1)
ali@71	3192	g_iconv_close(to_utf8);
ali@71	3193	to_utf8=(GIConv)-1;
ali@71	3194	return;
ali@71	3195	}
ali@71	3196	if (!*theline)
ali@71	3197	return;
ali@71	3198	if (!entities)
ali@71	3199	{
ali@71	3200	entities=g_tree_new((GCompareFunc)strcmp);
ali@71	3201	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
ali@71	3202	g_tree_insert(entities,HTMLentities[i].name,
ali@71	3203	GUINT_TO_POINTER(HTMLentities[i].c));
ali@71	3204	}
ali@71	3205	if (translit==(GIConv)-1)
ali@71	3206	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
ali@71	3207	if (to_utf8==(GIConv)-1)
ali@71	3208	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
ali@71	3209	while((amp=strchr(theline,'&')))
ali@71	3210	{
ali@71	3211	scolon=strchr(amp,';');
ali@71	3212	if (scolon)
ali@40	3213	{
ali@71	3214	if (amp[1]=='#')
ali@71	3215	{
ali@71	3216	if (amp+2+strspn(amp+2,"0123456789")==scolon)
ali@71	3217	c=strtol(amp+2,NULL,10);
ali@71	3218	else if (amp[2]=='x' &&
ali@71	3219	amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
ali@71	3220	c=strtol(amp+3,NULL,16);
ali@71	3221	}
ali@71	3222	else
ali@71	3223	{
ali@71	3224	s=g_strndup(amp+1,scolon-(amp+1));
ali@71	3225	c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
ali@71	3226	g_free(s);
ali@71	3227	}
ali@40	3228	}
ali@71	3229	else
ali@71	3230	c=0;
ali@71	3231	if (c)
ali@71	3232	{
ali@71	3233	theline=amp;
ali@71	3234	if (c<128 \|\| c>=192 && c<=255) /* An ISO-8859-1 character */
ali@71	3235	theline+=g_unichar_to_utf8(c,theline);
ali@71	3236	else
ali@71	3237	{
ali@71	3238	s=g_malloc(6);
ali@71	3239	nb=g_unichar_to_utf8(c,s);
ali@71	3240	t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
ali@71	3241	g_free(s);
ali@71	3242	s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
ali@71	3243	g_free(t);
ali@71	3244	memcpy(theline,s,nb);
ali@71	3245	g_free(s);
ali@71	3246	theline+=nb;
ali@71	3247	}
ali@71	3248	memmove(theline,g_utf8_next_char(scolon),
ali@71	3249	strlen(g_utf8_next_char(scolon))+1);
ali@71	3250	}
ali@71	3251	else
ali@71	3252	theline=g_utf8_next_char(amp);
ali@40	3253	}
ali@0	3254	}
ali@0	3255
ali@70	3256	gboolean tagcomp(const char strin,const char basetag)
ali@0	3257	{
ali@70	3258	gboolean retval;
ali@70	3259	gchar s,t;
ali@70	3260	if (g_utf8_get_char(strin)=='/')
ali@70	3261	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
ali@70	3262	else
ali@70	3263	t=g_utf8_casefold(strin,-1);
ali@70	3264	s=g_utf8_casefold(basetag,-1);
ali@70	3265	retval=g_str_has_prefix(t,s);
ali@70	3266	g_free(s);
ali@70	3267	g_free(t);
ali@70	3268	return retval;
ali@0	3269	}
ali@0	3270
ali@69	3271	void proghelp(GOptionContext *context)
ali@0	3272	{
ali@69	3273	gchar *help;
ali@40	3274	fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40	3275	fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
ali@40	3276	fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
ali@40	3277	fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40	3278	"For details, read the file COPYING.\n",stderr);
ali@40	3279	fputs("This is Free Software; "
ali@40	3280	"you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40	3281	fputs("read the file COPYING for details.\n\n",stderr);
ali@69	3282	help=g_option_context_get_help(context,TRUE,NULL);
ali@69	3283	fputs(help,stderr);
ali@69	3284	g_free(help);
ali@69	3285	fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
ali@40	3286	fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40	3287	"non-ASCII\n",stderr);
ali@40	3288	fputs("characters like accented letters, "
ali@40	3289	"lines longer than 75 or shorter than 55,\n",stderr);
ali@40	3290	fputs("unbalanced quotes or brackets, "
ali@40	3291	"a variety of badly formatted punctuation, \n",stderr);
ali@40	3292	fputs("HTML tags, some likely typos. "
ali@40	3293	"It is NOT a substitute for human judgement.\n",stderr);
ali@0	3294	fputs("\n",stderr);
ali@0	3295	}

author	ali <ali@juiblex.co.uk>
	Sat Oct 26 18:47:33 2013 +0100 (2013-10-26)
changeset 101	f44c530f80da
parent 100	ad92d11d59b8
child 102	ff0aa9b1397a
permissions	-rw-r--r--