bookloupe: bookloupe/bookloupe.c@ad92d11d59b8 (annotated)

ali@0	1	/*************************************************************************/
ali@40	2	/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
ali@68	3	/* */
ali@68	4	/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
ali@68	5	/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
ali@68	6	/* */
ali@0	7	/* This program is free software; you can redistribute it and/or modify */
ali@0	8	/* it under the terms of the GNU General Public License as published by */
ali@0	9	/* the Free Software Foundation; either version 2 of the License, or */
ali@68	10	/* (at your option) any later version. */
ali@68	11	/* */
ali@0	12	/* This program is distributed in the hope that it will be useful, */
ali@68	13	/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
ali@68	14	/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
ali@68	15	/* GNU General Public License for more details. */
ali@68	16	/* */
ali@68	17	/* You should have received a copy of the GNU General Public License */
ali@68	18	/* along with this program. If not, see <http://www.gnu.org/licenses/>. */
ali@0	19	/*************************************************************************/
ali@0	20
ali@0	21	#include <stdio.h>
ali@0	22	#include <stdlib.h>
ali@0	23	#include <string.h>
ali@0	24	#include <ctype.h>
ali@73	25	#ifdef __WIN32__
ali@73	26	#include <windows.h>
ali@73	27	#endif
ali@69	28	#include <glib.h>
ali@69	29	#include <bl/bl.h>
ali@92	30	#include "bookloupe.h"
ali@92	31	#include "counters.h"
ali@93	32	#include "pending.h"
ali@71	33	#include "HTMLentities.h"
ali@0	34
ali@69	35	gchar *prevline;
ali@0	36
/*
 * Common typos.
 *
 * All entries are lower case; the list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
ali@0	70
ali@69	71	GTree *usertypo;
ali@0	72
/*
 * Common abbreviations and other OK words not to query as typos.
 * Terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 * Terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
ali@0	86
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * Terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * Terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
ali@0	103
/*
 * HTML element names (without angle brackets), terminated by an empty
 * string.  NOTE(review): presumably the tags recognised when markup is
 * being ignored; the consumer is outside this section.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * Distributed Proofreaders markup tokens, terminated by an empty string.
 * The block-markup tokens come in open/close pairs.
 * NOTE(review): the first pair was shown as "/", "/" in the listing this
 * was recovered from; it is restored here to the comment-style delimiters
 * that fit the pattern of the other pairs -- confirm against the repository.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
ali@0	114
/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): judging by the name, words that should not normally be
 * followed by a comma; the check that uses it is outside this section.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};

/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): judging by the name, words that should not normally be
 * followed by a period; the check that uses it is outside this section.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
ali@0	128
ali@69	129	gboolean pswit[SWITNO]; /* program switches */
ali@0	130
ali@69	131	static GOptionEntry options[]={
ali@69	132	{ "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@69	133	"Ignore DP-specific markup", NULL },
ali@69	134	{ "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@69	135	"Don't echo queried line", NULL },
ali@69	136	{ "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@69	137	"Check single quotes", NULL },
ali@69	138	{ "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@69	139	"Check common typos", NULL },
ali@69	140	{ "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@69	141	"Require closure of quotes on every paragraph", NULL },
ali@69	142	{ "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@69	143	"Disable paranoid querying of everything", NULL },
ali@69	144	{ "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@69	145	"Disable line end checking", NULL },
ali@69	146	{ "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@69	147	"Overview: just show counts", NULL },
ali@69	148	{ "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@69	149	"Output errors to stdout instead of stderr", NULL },
ali@69	150	{ "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@69	151	"Echo header fields", NULL },
ali@69	152	{ "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@69	153	"Ignore markup in < >", NULL },
ali@69	154	{ "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@69	155	"Use file of user-defined typos", NULL },
ali@69	156	{ "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
ali@69	157	"Defaults for use on www upload", NULL },
ali@69	158	{ "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@69	159	"Verbose - list everything", NULL },
ali@69	160	{ NULL }
ali@69	161	};
ali@0	162
/*
 * Query counters.  The cnt_* values up to cnt_lineend are summed into the
 * "TOTAL QUERIES" figure that main() prints in overview mode.
 */
long cnt_quote;         /* for overview mode, count of quote queries */
long cnt_brack;         /* for overview mode, count of brackets queries */
long cnt_bin;           /* for overview mode, count of non-ASCII queries */
long cnt_odd;           /* for overview mode, count of odd character queries */
long cnt_long;          /* for overview mode, count of long line errors */
long cnt_short;         /* for overview mode, count of short line queries */
long cnt_punct;         /* for overview mode,
                           count of punctuation and spacing queries */
long cnt_dash;          /* for overview mode, count of dash-related queries */
long cnt_word;          /* for overview mode, count of word queries */
long cnt_html;          /* for overview mode, count of html queries */
long cnt_lineend;       /* for overview mode, count of line-end queries */
long cnt_spacend;       /* count of lines with space at end */
long linecnt;           /* count of total lines in the file */
long checked_linecnt;   /* count of lines actually checked */
ali@0	178
ali@69	179	void proghelp(GOptionContext *context);
ali@69	180	void procfile(const char *);
ali@0	181
ali@69	182	gchar *running_from;
ali@0	183
ali@70	184	gboolean mixdigit(const char *);
ali@69	185	gchar getaword(const char *);
ali@69	186	char flgets(char *,long);
ali@0	187	void postprocess_for_HTML(char *);
ali@0	188	char linehasmarkup(char );
ali@0	189	char losemarkup(char );
ali@70	190	gboolean tagcomp(const char ,const char );
ali@71	191	void loseentities(char *);
ali@69	192	gboolean isroman(const char *);
ali@0	193	void postprocess_for_DP(char *);
ali@72	194	void print_as_windows_1252(const char *string);
ali@72	195	void print_as_utf_8(const char *string);
ali@0	196
ali@69	197	GTree qword,qperiod;
ali@68	198
ali@73	199	#ifdef __WIN32__
ali@73	200	UINT saved_cp;
ali@73	201	#endif
ali@73	202
ali@69	203	void parse_options(int argc,char **argv)
ali@0	204	{
ali@69	205	GError *err=NULL;
ali@69	206	GOptionContext *context;
ali@69	207	context=g_option_context_new(
ali@69	208	"file - looks for errors in Project Gutenberg(TM) etexts");
ali@69	209	g_option_context_add_main_entries(context,options,NULL);
ali@69	210	if (!g_option_context_parse(context,argc,argv,&err))
ali@69	211	{
ali@69	212	g_printerr("Bookloupe: %s\n",err->message);
ali@69	213	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
ali@69	214	exit(1);
ali@69	215	}
ali@40	216	/* Paranoid checking is turned OFF, not on, by its switch */
ali@69	217	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
ali@40	218	if (pswit[PARANOID_SWITCH])
ali@69	219	/* if running in paranoid mode, typo checks default to enabled */
ali@69	220	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@40	221	/* Line-end checking is turned OFF, not on, by its switch */
ali@69	222	pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
ali@40	223	/* Echoing is turned OFF, not on, by its switch */
ali@69	224	pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
ali@40	225	if (pswit[OVERVIEW_SWITCH])
ali@40	226	/* just print summary; don't echo */
ali@69	227	pswit[ECHO_SWITCH]=FALSE;
ali@40	228	/*
ali@40	229	* Web uploads - for the moment, this is really just a placeholder
ali@40	230	* until we decide what processing we really want to do on web uploads
ali@40	231	*/
ali@40	232	if (pswit[WEB_SWITCH])
ali@40	233	{
ali@40	234	/* specific override for web uploads */
ali@69	235	pswit[ECHO_SWITCH]=TRUE;
ali@69	236	pswit[SQUOTE_SWITCH]=FALSE;
ali@69	237	pswit[TYPO_SWITCH]=TRUE;
ali@69	238	pswit[QPARA_SWITCH]=FALSE;
ali@69	239	pswit[PARANOID_SWITCH]=TRUE;
ali@69	240	pswit[LINE_END_SWITCH]=FALSE;
ali@69	241	pswit[OVERVIEW_SWITCH]=FALSE;
ali@69	242	pswit[STDOUT_SWITCH]=FALSE;
ali@69	243	pswit[HEADER_SWITCH]=TRUE;
ali@69	244	pswit[VERBOSE_SWITCH]=FALSE;
ali@69	245	pswit[MARKUP_SWITCH]=FALSE;
ali@69	246	pswit[USERTYPO_SWITCH]=FALSE;
ali@69	247	pswit[DP_SWITCH]=FALSE;
ali@40	248	}
ali@69	249	if (*argc<2)
ali@40	250	{
ali@69	251	proghelp(context);
ali@69	252	exit(1);
ali@40	253	}
ali@69	254	g_option_context_free(context);
ali@69	255	}
ali@69	256
ali@69	257	/*
ali@69	258	* read_user_scannos:
ali@69	259	*
ali@69	260	* Read in the user-defined stealth scanno list.
ali@69	261	*/
ali@69	262	void read_user_scannos(void)
ali@69	263	{
ali@69	264	GError *err=NULL;
ali@69	265	gchar *usertypo_file;
ali@69	266	gboolean okay;
ali@69	267	int i;
ali@70	268	gsize len,nb;
ali@70	269	gchar contents,utf8,**lines;
ali@69	270	usertypo_file=g_strdup("bookloupe.typ");
ali@69	271	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	272	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	273	{
ali@69	274	g_clear_error(&err);
ali@69	275	g_free(usertypo_file);
ali@69	276	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
ali@69	277	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	278	}
ali@69	279	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	280	{
ali@69	281	g_clear_error(&err);
ali@69	282	g_free(usertypo_file);
ali@69	283	usertypo_file=g_strdup("gutcheck.typ");
ali@69	284	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	285	}
ali@69	286	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	287	{
ali@69	288	g_clear_error(&err);
ali@69	289	g_free(usertypo_file);
ali@69	290	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
ali@69	291	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69	292	}
ali@69	293	if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69	294	{
ali@69	295	g_free(usertypo_file);
ali@70	296	g_print(" --> I couldn't find bookloupe.typ "
ali@69	297	"-- proceeding without user typos.\n");
ali@69	298	return;
ali@69	299	}
ali@69	300	else if (!okay)
ali@69	301	{
ali@69	302	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
ali@69	303	g_free(usertypo_file);
ali@69	304	g_clear_error(&err);
ali@69	305	exit(1);
ali@69	306	}
ali@72	307	if (g_utf8_validate(contents,len,NULL))
ali@72	308	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@72	309	else
ali@72	310	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
ali@70	311	g_free(contents);
ali@70	312	lines=g_strsplit_set(utf8,"\r\n",0);
ali@70	313	g_free(utf8);
ali@69	314	usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@69	315	for (i=0;lines[i];i++)
ali@69	316	if ((unsigned char )lines[i]>'!')
ali@69	317	g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
ali@69	318	else
ali@69	319	g_free(lines[i]);
ali@69	320	g_free(lines);
ali@69	321	}
ali@69	322
ali@69	323	/*
ali@69	324	* read_etext:
ali@69	325	*
ali@69	326	* Read an etext returning a newly allocated string containing the file
ali@69	327	* contents or NULL on error.
ali@69	328	*/
ali@69	329	gchar read_etext(const char filename,GError **err)
ali@69	330	{
ali@76	331	GError *tmp_err=NULL;
ali@70	332	gchar contents,utf8;
ali@76	333	gsize len,bytes_read,bytes_written;
ali@76	334	int i,line,col;
ali@69	335	if (!g_file_get_contents(filename,&contents,&len,err))
ali@69	336	return NULL;
ali@72	337	if (g_utf8_validate(contents,len,NULL))
ali@72	338	{
ali@72	339	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@72	340	g_set_print_handler(print_as_utf_8);
ali@73	341	#ifdef __WIN32__
ali@73	342	SetConsoleOutputCP(CP_UTF8);
ali@73	343	#endif
ali@72	344	}
ali@72	345	else
ali@72	346	{
ali@76	347	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
ali@76	348	&bytes_written,&tmp_err);
ali@76	349	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
ali@76	350	G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
ali@76	351	{
ali@76	352	line=col=1;
ali@76	353	for(i=0;i<bytes_read;i++)
ali@76	354	if (contents[i]=='\n')
ali@76	355	{
ali@76	356	line++;
ali@76	357	col=1;
ali@76	358	}
ali@76	359	else if (contents[i]!='\r')
ali@76	360	col++;
ali@76	361	g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
ali@76	362	"Input conversion failed. Byte %d at line %d, column %d is not a "
ali@76	363	"valid Windows-1252 character",
ali@76	364	((unsigned char *)contents)[bytes_read],line,col);
ali@76	365	}
ali@76	366	else if (tmp_err)
ali@76	367	g_propagate_error(err,tmp_err);
ali@72	368	g_set_print_handler(print_as_windows_1252);
ali@73	369	#ifdef __WIN32__
ali@73	370	SetConsoleOutputCP(1252);
ali@73	371	#endif
ali@72	372	}
ali@70	373	g_free(contents);
ali@70	374	return utf8;
ali@69	375	}
ali@69	376
/*
 * cleanup_on_exit:
 *
 * atexit() handler.  On Windows it puts back the console output code
 * page saved in saved_cp; on other platforms it does nothing.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
ali@73	383
ali@69	384	int main(int argc,char **argv)
ali@69	385	{
ali@73	386	#ifdef __WIN32__
ali@73	387	atexit(cleanup_on_exit);
ali@73	388	saved_cp=GetConsoleOutputCP();
ali@73	389	#endif
ali@69	390	running_from=g_path_get_dirname(argv[0]);
ali@69	391	parse_options(&argc,&argv);
ali@40	392	if (pswit[USERTYPO_SWITCH])
ali@69	393	read_user_scannos();
ali@40	394	fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@69	395	procfile(argv[1]);
ali@40	396	if (pswit[OVERVIEW_SWITCH])
ali@40	397	{
ali@70	398	g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40	399	checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@70	400	g_print(" --------------- Queries found --------------\n");
ali@68	401	if (cnt_long)
ali@70	402	g_print(" Long lines: %14ld\n",cnt_long);
ali@68	403	if (cnt_short)
ali@70	404	g_print(" Short lines: %14ld\n",cnt_short);
ali@68	405	if (cnt_lineend)
ali@70	406	g_print(" Line-end problems: %14ld\n",cnt_lineend);
ali@68	407	if (cnt_word)
ali@70	408	g_print(" Common typos: %14ld\n",cnt_word);
ali@94	409	if (cnt_quote)
ali@94	410	g_print(" Unmatched quotes: %14ld\n",cnt_quote);
ali@68	411	if (cnt_brack)
ali@70	412	g_print(" Unmatched brackets: %14ld\n",cnt_brack);
ali@68	413	if (cnt_bin)
ali@70	414	g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
ali@68	415	if (cnt_odd)
ali@70	416	g_print(" Proofing characters: %14ld\n",cnt_odd);
ali@68	417	if (cnt_punct)
ali@70	418	g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@68	419	if (cnt_dash)
ali@70	420	g_print(" Non-standard dashes: %14ld\n",cnt_dash);
ali@68	421	if (cnt_html)
ali@70	422	g_print(" Possible HTML tags: %14ld\n",cnt_html);
ali@70	423	g_print("\n");
ali@70	424	g_print(" TOTAL QUERIES %14ld\n",
ali@94	425	cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
ali@94	426	cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40	427	}
ali@69	428	g_free(running_from);
ali@69	429	if (usertypo)
ali@69	430	g_tree_unref(usertypo);
ali@40	431	return 0;
ali@0	432	}
ali@0	433
ali@97	434	void count_dashes(const char line,const char dash,
ali@97	435	struct dash_results *results)
ali@97	436	{
ali@97	437	int i;
ali@97	438	gchar **tokens;
ali@97	439	gunichar pc,nc;
ali@97	440	gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
ali@97	441	if (!*line)
ali@97	442	return;
ali@97	443	tokens=g_strsplit(line,dash,0);
ali@97	444	if (tokens[1])
ali@97	445	results->base++;
ali@97	446	for(i=1;tokens[i];i++)
ali@97	447	{
ali@97	448	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
ali@97	449	nc=g_utf8_get_char(tokens[i]);
ali@97	450	if (g_unichar_isspace(pc) \|\| g_unichar_isspace(nc))
ali@97	451	spaced=TRUE;
ali@97	452	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
ali@97	453	spaced2=TRUE;
ali@97	454	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
ali@97	455	unspaced=TRUE;
ali@97	456	}
ali@97	457	if (spaced)
ali@97	458	results->space++;
ali@97	459	if (spaced2)
ali@97	460	/* count of lines with em-dashes with spaces both sides */
ali@97	461	results->non_PG_space++;
ali@97	462	if (unspaced)
ali@97	463	/* count of lines with PG-type em-dashes with no spaces */
ali@97	464	results->PG_space++;
ali@97	465	g_strfreev(tokens);
ali@97	466	}
ali@97	467
ali@40	468	/*
ali@41	469	* first_pass:
ali@40	470	*
ali@41	471	* Run a first pass - verify that it's a valid PG
ali@41	472	* file, decide whether to report some things that
ali@41	473	* occur many times in the text like long or short
ali@41	474	* lines, non-standard dashes, etc.
ali@40	475	*/
ali@69	476	struct first_pass_results first_pass(const char etext)
ali@0	477	{
ali@70	478	gunichar laststart=CHAR_SPACE;
ali@54	479	const char *s;
ali@69	480	gchar *lc_line;
ali@70	481	int i,j,lbytes,llen;
ali@69	482	gchar **lines;
ali@41	483	unsigned int lastlen=0,lastblen=0;
ali@41	484	long spline=0,nspline=0;
ali@41	485	static struct first_pass_results results={0};
ali@97	486	struct dash_results tmp_dash_results;
ali@69	487	gchar *inword;
ali@94	488	QuoteClass qc;
ali@69	489	lines=g_strsplit(etext,"\n",0);
ali@69	490	for (j=0;lines[j];j++)
ali@40	491	{
ali@70	492	lbytes=strlen(lines[j]);
ali@82	493	while (lbytes>0 && lines[j][lbytes-1]=='\r')
ali@70	494	lines[j][--lbytes]='\0';
ali@70	495	llen=g_utf8_strlen(lines[j],lbytes);
ali@68	496	linecnt++;
ali@69	497	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
ali@69	498	(strstr(lines[j],"PUBLIC DOMAIN") \|\| strstr(lines[j],"COPYRIGHT")))
ali@40	499	{
ali@68	500	if (spline)
ali@70	501	g_print(" --> Duplicate header?\n");
ali@68	502	spline=linecnt+1; /* first line of non-header text, that is */
ali@40	503	}
ali@69	504	if (!strncmp(lines[j],"*** START",9) &&
ali@69	505	strstr(lines[j],"PROJECT GUTENBERG"))
ali@40	506	{
ali@68	507	if (nspline)
ali@70	508	g_print(" --> Duplicate header?\n");
ali@68	509	nspline=linecnt+1; /* first line of non-header text, that is */
ali@40	510	}
ali@68	511	if (spline \|\| nspline)
ali@40	512	{
ali@70	513	lc_line=g_utf8_strdown(lines[j],lbytes);
ali@69	514	if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
ali@40	515	{
ali@69	516	if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
ali@40	517	{
ali@68	518	if (results.footerline)
ali@40	519	{
ali@40	520	/* it's an old-form header - we can detect duplicates */
ali@68	521	if (!nspline)
ali@70	522	g_print(" --> Duplicate footer?\n");
ali@40	523	}
ali@68	524	else
ali@68	525	results.footerline=linecnt;
ali@40	526	}
ali@40	527	}
ali@69	528	g_free(lc_line);
ali@40	529	}
ali@68	530	if (spline)
ali@41	531	results.firstline=spline;
ali@68	532	if (nspline)
ali@41	533	results.firstline=nspline; /* override with new */
ali@68	534	if (results.footerline)
ali@40	535	continue; /* don't count the boilerplate in the footer */
ali@68	536	results.totlen+=llen;
ali@70	537	for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@40	538	{
ali@70	539	if (g_utf8_get_char(s)>127)
ali@41	540	results.binlen++;
ali@70	541	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@41	542	results.alphalen++;
ali@94	543	if (s>lines[j])
ali@94	544	{
ali@94	545	if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
ali@94	546	qc=QUOTE_CLASS(g_utf8_get_char(s));
ali@94	547	else
ali@94	548	qc=INVALID_QUOTE;
ali@94	549	if ((qc==CLOSING_QUOTE \|\| qc==NEUTRAL_QUOTE) &&
ali@97	550	g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
ali@94	551	results.endquote_count++;
ali@94	552	}
ali@40	553	}
ali@69	554	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
ali@69	555	lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41	556	results.shortline++;
ali@70	557	if (lbytes>0 &&
ali@70	558	g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
ali@40	559	cnt_spacend++;
ali@69	560	if (strstr(lines[j],".,"))
ali@41	561	results.dotcomma++;
ali@68	562	/* only count ast lines for ignoring purposes where there is */
ali@68	563	/* locase text on the line */
ali@69	564	if (strchr(lines[j],'*'))
ali@40	565	{
ali@70	566	for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@70	567	if (g_unichar_islower(g_utf8_get_char(s)))
ali@68	568	break;
ali@70	569	if (*s)
ali@41	570	results.astline++;
ali@40	571	}
ali@69	572	if (strchr(lines[j],'/'))
ali@68	573	results.fslashline++;
ali@82	574	if (lbytes>0)
ali@82	575	{
ali@82	576	for (s=g_utf8_prev_char(lines[j]+lbytes);
ali@82	577	s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
ali@82	578	s=g_utf8_prev_char(s))
ali@82	579	;
ali@82	580	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
ali@82	581	g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@82	582	results.hyphens++;
ali@82	583	}
ali@68	584	if (llen>LONGEST_PG_LINE)
ali@41	585	results.longline++;
ali@68	586	if (llen>WAY_TOO_LONG)
ali@41	587	results.verylongline++;
ali@69	588	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
ali@40	589	{
ali@69	590	i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
ali@68	591	if (i>0)
ali@68	592	results.htmcount++;
ali@69	593	if (strstr(lines[j],"<i>"))
ali@41	594	results.htmcount+=4; /* bonus marks! */
ali@40	595	}
ali@68	596	/* Check for spaced em-dashes */
ali@97	597	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
ali@97	598	count_dashes(lines[j],"--",&tmp_dash_results);
ali@97	599	count_dashes(lines[j],"—",&tmp_dash_results);
ali@97	600	if (tmp_dash_results.base)
ali@97	601	results.emdash.base++;
ali@97	602	if (tmp_dash_results.non_PG_space)
ali@97	603	results.emdash.non_PG_space++;
ali@97	604	if (tmp_dash_results.PG_space)
ali@97	605	results.emdash.PG_space++;
ali@69	606	for (s=lines[j];*s;)
ali@40	607	{
ali@69	608	inword=getaword(&s);
ali@68	609	if (!strcmp(inword,"hij") \|\| !strcmp(inword,"niet"))
ali@68	610	results.Dutchcount++;
ali@68	611	if (!strcmp(inword,"dans") \|\| !strcmp(inword,"avec"))
ali@68	612	results.Frenchcount++;
ali@68	613	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@68	614	results.standalone_digit++;
ali@69	615	g_free(inword);
ali@40	616	}
ali@68	617	/* Check for spaced dashes */
ali@69	618	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
ali@41	619	results.spacedash++;
ali@68	620	lastblen=lastlen;
ali@69	621	lastlen=llen;
ali@69	622	laststart=lines[j][0];
ali@40	623	}
ali@69	624	g_strfreev(lines);
ali@41	625	return &results;
ali@41	626	}
ali@41	627
ali@42	628	/*
ali@42	629	* report_first_pass:
ali@42	630	*
ali@42	631	* Make some snap decisions based on the first pass results.
ali@42	632	*/
ali@42	633	struct warnings report_first_pass(struct first_pass_results results)
ali@42	634	{
ali@42	635	static struct warnings warnings={0};
ali@42	636	if (cnt_spacend>0)
ali@70	637	g_print(" --> %ld lines in this file have white space at end\n",
ali@42	638	cnt_spacend);
ali@42	639	warnings.dotcomma=1;
ali@42	640	if (results->dotcomma>5)
ali@42	641	{
ali@68	642	warnings.dotcomma=0;
ali@70	643	g_print(" --> %ld lines in this file contain '.,'. "
ali@42	644	"Not reporting them.\n",results->dotcomma);
ali@42	645	}
ali@42	646	/*
ali@42	647	* If more than 50 lines, or one-tenth, are short,
ali@42	648	* don't bother reporting them.
ali@42	649	*/
ali@42	650	warnings.shortline=1;
ali@42	651	if (results->shortline>50 \|\| results->shortline*10>linecnt)
ali@42	652	{
ali@68	653	warnings.shortline=0;
ali@70	654	g_print(" --> %ld lines in this file are short. "
ali@42	655	"Not reporting short lines.\n",results->shortline);
ali@42	656	}
ali@42	657	/*
ali@42	658	* If more than 50 lines, or one-tenth, are long,
ali@42	659	* don't bother reporting them.
ali@42	660	*/
ali@42	661	warnings.longline=1;
ali@42	662	if (results->longline>50 \|\| results->longline*10>linecnt)
ali@42	663	{
ali@68	664	warnings.longline=0;
ali@70	665	g_print(" --> %ld lines in this file are long. "
ali@42	666	"Not reporting long lines.\n",results->longline);
ali@42	667	}
ali@42	668	/* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42	669	warnings.ast=1;
ali@42	670	if (results->astline>10)
ali@42	671	{
ali@68	672	warnings.ast=0;
ali@70	673	g_print(" --> %ld lines in this file contain asterisks. "
ali@42	674	"Not reporting them.\n",results->astline);
ali@42	675	}
ali@42	676	/*
ali@42	677	* If more than 10 lines contain forward slashes,
ali@42	678	* don't bother reporting them.
ali@42	679	*/
ali@42	680	warnings.fslash=1;
ali@42	681	if (results->fslashline>10)
ali@42	682	{
ali@68	683	warnings.fslash=0;
ali@70	684	g_print(" --> %ld lines in this file contain forward slashes. "
ali@42	685	"Not reporting them.\n",results->fslashline);
ali@42	686	}
ali@42	687	/*
ali@42	688	* If more than 20 lines contain unpunctuated endquotes,
ali@42	689	* don't bother reporting them.
ali@42	690	*/
ali@42	691	warnings.endquote=1;
ali@42	692	if (results->endquote_count>20)
ali@42	693	{
ali@68	694	warnings.endquote=0;
ali@70	695	g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
ali@42	696	"Not reporting them.\n",results->endquote_count);
ali@42	697	}
ali@42	698	/*
ali@42	699	* If more than 15 lines contain standalone digits,
ali@42	700	* don't bother reporting them.
ali@42	701	*/
ali@42	702	warnings.digit=1;
ali@42	703	if (results->standalone_digit>10)
ali@42	704	{
ali@68	705	warnings.digit=0;
ali@70	706	g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
ali@42	707	"Not reporting them.\n",results->standalone_digit);
ali@42	708	}
ali@42	709	/*
ali@42	710	* If more than 20 lines contain hyphens at end,
ali@42	711	* don't bother reporting them.
ali@42	712	*/
ali@42	713	warnings.hyphen=1;
ali@42	714	if (results->hyphens>20)
ali@42	715	{
ali@68	716	warnings.hyphen=0;
ali@70	717	g_print(" --> %ld lines in this file have hyphens at end. "
ali@42	718	"Not reporting them.\n",results->hyphens);
ali@42	719	}
ali@42	720	if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42	721	{
ali@70	722	g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@68	723	pswit[MARKUP_SWITCH]=1;
ali@42	724	}
ali@42	725	if (results->verylongline>0)
ali@70	726	g_print(" --> %ld lines in this file are VERY long!\n",
ali@42	727	results->verylongline);
ali@42	728	/*
ali@42	729	* If there are more non-PG spaced dashes than PG em-dashes,
ali@42	730	* assume it's deliberate.
ali@42	731	* Current PG guidelines say don't use them, but older texts do,
ali@42	732	* and some people insist on them whatever the guidelines say.
ali@42	733	*/
ali@42	734	warnings.dash=1;
ali@97	735	if (results->spacedash+results->emdash.non_PG_space>
ali@97	736	results->emdash.PG_space)
ali@42	737	{
ali@68	738	warnings.dash=0;
ali@70	739	g_print(" --> There are %ld spaced dashes and em-dashes. "
ali@42	740	"Not reporting them.\n",
ali@97	741	results->spacedash+results->emdash.non_PG_space);
ali@42	742	}
ali@42	743	/* If more than a quarter of characters are hi-bit, bug out. */
ali@42	744	warnings.bin=1;
ali@42	745	if (results->binlen*4>results->totlen)
ali@42	746	{
ali@70	747	g_print(" --> This file does not appear to be ASCII. "
ali@42	748	"Terminating. Best of luck with it!\n");
ali@68	749	exit(1);
ali@42	750	}
ali@42	751	if (results->alphalen*4<results->totlen)
ali@42	752	{
ali@70	753	g_print(" --> This file does not appear to be text. "
ali@42	754	"Terminating. Best of luck with it!\n");
ali@68	755	exit(1);
ali@42	756	}
ali@42	757	if (results->binlen*100>results->totlen \|\| results->binlen>100)
ali@42	758	{
ali@70	759	g_print(" --> There are a lot of foreign letters here. "
ali@42	760	"Not reporting them.\n");
ali@68	761	warnings.bin=0;
ali@42	762	}
ali@69	763	warnings.isDutch=FALSE;
ali@42	764	if (results->Dutchcount>50)
ali@42	765	{
ali@69	766	warnings.isDutch=TRUE;
ali@70	767	g_print(" --> This looks like Dutch - "
ali@42	768	"switching off dashes and warnings for 's Middags case.\n");
ali@42	769	}
ali@69	770	warnings.isFrench=FALSE;
ali@42	771	if (results->Frenchcount>50)
ali@42	772	{
ali@69	773	warnings.isFrench=TRUE;
ali@70	774	g_print(" --> This looks like French - "
ali@42	775	"switching off some doublepunct.\n");
ali@42	776	}
ali@42	777	if (results->firstline && results->footerline)
ali@70	778	g_print(" The PG header and footer appear to be already on.\n");
ali@42	779	else
ali@42	780	{
ali@68	781	if (results->firstline)
ali@70	782	g_print(" The PG header is on - no footer.\n");
ali@68	783	if (results->footerline)
ali@70	784	g_print(" The PG footer is on - no header.\n");
ali@42	785	}
ali@70	786	g_print("\n");
ali@42	787	if (pswit[VERBOSE_SWITCH])
ali@42	788	{
ali@68	789	warnings.bin=1;
ali@68	790	warnings.shortline=1;
ali@68	791	warnings.dotcomma=1;
ali@68	792	warnings.longline=1;
ali@68	793	warnings.dash=1;
ali@68	794	warnings.digit=1;
ali@68	795	warnings.ast=1;
ali@68	796	warnings.fslash=1;
ali@68	797	warnings.hyphen=1;
ali@68	798	warnings.endquote=1;
ali@70	799	g_print(" * Verbose output is ON -- you asked for it! *\n");
ali@42	800	}
ali@42	801	if (warnings.isDutch)
ali@68	802	warnings.dash=0;
ali@42	803	if (results->footerline>0 && results->firstline>0 &&
ali@42	804	results->footerline>results->firstline &&
ali@42	805	results->footerline-results->firstline<100)
ali@42	806	{
ali@70	807	g_print(" --> I don't really know where this text starts. \n");
ali@70	808	g_print(" There are no reference points.\n");
ali@70	809	g_print(" I'm going to have to report the header and footer "
ali@42	810	"as well.\n");
ali@68	811	results->firstline=0;
ali@42	812	}
ali@42	813	return &warnings;
ali@42	814	}
ali@42	815
ali@43	816	/*
ali@43	817	* analyse_quotes:
ali@43	818	*
ali@43	819	* Look along the line, accumulate the count of quotes, and see
ali@43	820	* if this is an empty line - i.e. a line with nothing on it
ali@43	821	* but spaces.
ali@43	822	* If line has just spaces, period, * and/or - on it, don't
ali@43	823	* count it, since empty lines with asterisks or dashes to
ali@43	824	* separate sections are common.
ali@43	825	*
ali@69	826	* Returns: TRUE if the line is empty.
ali@43	827	*/
ali@98	828	gboolean analyse_quotes(const char aline,struct counters counters)
ali@43	829	{
ali@68	830	int guessquote=0;
ali@69	831	/* assume the line is empty until proven otherwise */
ali@69	832	gboolean isemptyline=TRUE;
ali@70	833	const char s=aline,sprev,*snext;
ali@70	834	gunichar c;
ali@70	835	sprev=NULL;
ali@94	836	GError *tmp_err=NULL;
ali@43	837	while (*s)
ali@43	838	{
ali@70	839	snext=g_utf8_next_char(s);
ali@70	840	c=g_utf8_get_char(s);
ali@94	841	if (CHAR_IS_DQUOTE(c))
ali@94	842	(void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
ali@94	843	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
ali@43	844	{
ali@43	845	if (s==aline)
ali@43	846	{
ali@43	847	/*
ali@94	848	* At start of line, it can only be a quotation mark.
ali@43	849	* Hardcode a very common exception!
ali@43	850	*/
ali@70	851	if (!g_str_has_prefix(snext,"tis") &&
ali@70	852	!g_str_has_prefix(snext,"Tis"))
ali@94	853	(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@43	854	}
ali@70	855	else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
ali@70	856	g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43	857	/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43	858	;
ali@43	859	/* it's outside a word - let's check it out */
ali@92	860	else if (c==CHAR_OPEN_SQUOTE \|\| c==CHAR_LS_QUOTE \|\|
ali@70	861	g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43	862	{
ali@94	863	/* certainly looks like a quotation mark */
ali@70	864	if (!g_str_has_prefix(snext,"tis") &&
ali@70	865	!g_str_has_prefix(snext,"Tis"))
ali@43	866	/* hardcode a very common exception! */
ali@94	867	{
ali@94	868	if (strchr(".?!,;:",g_utf8_get_char(sprev)))
ali@94	869	(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@94	870	else
ali@94	871	(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
ali@94	872	}
ali@43	873	}
ali@43	874	else
ali@43	875	{
ali@94	876	/* now - is it a quotation mark? */
ali@43	877	guessquote=0; /* accumulate clues */
ali@70	878	if (g_unichar_isalpha(g_utf8_get_char(sprev)))
ali@43	879	{
ali@43	880	/* it follows a letter - could be either */
ali@43	881	guessquote++;
ali@70	882	if (g_utf8_get_char(sprev)=='s')
ali@43	883	{
ali@43	884	/* looks like a plural apostrophe */
ali@43	885	guessquote-=3;
ali@70	886	if (g_utf8_get_char(snext)==CHAR_SPACE)
ali@70	887	/* bonus marks! */
ali@43	888	guessquote-=2;
ali@43	889	}
ali@94	890	if (innermost_quote_matches(counters,c))
ali@94	891	/*
ali@94	892	* Give it the benefit of some doubt,
ali@94	893	* if a squote is already open.
ali@94	894	*/
ali@94	895	guessquote++;
ali@94	896	else
ali@94	897	guessquote--;
ali@94	898	if (guessquote>=0)
ali@94	899	(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
ali@43	900	}
ali@43	901	else
ali@94	902	/* no adjacent letter - it must be a quote of some kind */
ali@94	903	(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@43	904	}
ali@43	905	}
ali@94	906	if (tmp_err)
ali@94	907	{
ali@94	908	if (pswit[ECHO_SWITCH])
ali@94	909	g_print("\n%s\n",aline);
ali@94	910	if (!pswit[OVERVIEW_SWITCH])
ali@94	911	g_print(" Line %ld column %ld - %s\n",
ali@94	912	linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
ali@94	913	g_clear_error(&tmp_err);
ali@94	914	}
ali@70	915	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
ali@70	916	c!='\r' && c!='\n')
ali@69	917	isemptyline=FALSE; /* ignore lines like * * * as spacers */
ali@70	918	if (c==CHAR_UNDERSCORE)
ali@43	919	counters->c_unders++;
ali@93	920	if (c==CHAR_OPEN_SBRACK)
ali@93	921	{
ali@93	922	if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
ali@93	923	!matching_difference(counters,c) && s==aline &&
ali@93	924	g_str_has_prefix(s,"[Illustration:"))
ali@93	925	increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
ali@93	926	else
ali@93	927	increment_matching(counters,c,TRUE);
ali@93	928	}
ali@93	929	else if (c==CHAR_OPEN_CBRACK \|\| c==CHAR_OPEN_RBRACK)
ali@92	930	increment_matching(counters,c,TRUE);
ali@93	931	if (c==CHAR_CLOSE_SBRACK)
ali@93	932	{
ali@93	933	if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
ali@93	934	!matching_difference(counters,c) && !*snext)
ali@93	935	increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
ali@93	936	else
ali@93	937	increment_matching(counters,c,FALSE);
ali@93	938	}
ali@93	939	else if (c==CHAR_CLOSE_CBRACK \|\| c==CHAR_CLOSE_RBRACK)
ali@92	940	increment_matching(counters,c,FALSE);
ali@70	941	sprev=s;
ali@70	942	s=snext;
ali@43	943	}
ali@43	944	return isemptyline;
ali@43	945	}
ali@43	946
ali@41	947	/*
ali@67	948	* check_for_control_characters:
ali@67	949	*
ali@67	950	* Check for invalid or questionable characters in the line
ali@67	951	* Anything above 127 is invalid for plain ASCII, and
ali@67	952	* non-printable control characters should also be flagged.
ali@67	953	* Tabs should generally not be there.
ali@67	954	*/
ali@67	955	void check_for_control_characters(const char *aline)
ali@67	956	{
ali@70	957	gunichar c;
ali@67	958	const char *s;
ali@70	959	for (s=aline;*s;s=g_utf8_next_char(s))
ali@67	960	{
ali@70	961	c=g_utf8_get_char(s);
ali@67	962	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
ali@67	963	{
ali@67	964	if (pswit[ECHO_SWITCH])
ali@70	965	g_print("\n%s\n",aline);
ali@67	966	if (!pswit[OVERVIEW_SWITCH])
ali@70	967	g_print(" Line %ld column %ld - Control character %u\n",
ali@70	968	linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
ali@67	969	else
ali@67	970	cnt_bin++;
ali@67	971	}
ali@67	972	}
ali@67	973	}
ali@67	974
ali@67	975	/*
ali@44	976	* check_for_odd_characters:
ali@44	977	*
ali@44	978	* Check for binary and other odd characters.
ali@44	979	*/
ali@44	980	void check_for_odd_characters(const char aline,const struct warnings warnings,
ali@69	981	gboolean isemptyline)
ali@44	982	{
ali@44	983	/* Don't repeat multiple warnings on one line. */
ali@70	984	gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
ali@70	985	gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
ali@44	986	const char *s;
ali@70	987	gunichar c;
ali@70	988	for (s=aline;*s;s=g_utf8_next_char(s))
ali@44	989	{
ali@70	990	c=g_utf8_get_char(s);
ali@70	991	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' \|\| c>127))
ali@44	992	{
ali@44	993	if (pswit[ECHO_SWITCH])
ali@70	994	g_print("\n%s\n",aline);
ali@44	995	if (!pswit[OVERVIEW_SWITCH])
ali@70	996	if (c>127 && c<160 \|\| c>255)
ali@70	997	g_print(" Line %ld column %ld - "
ali@70	998	"Non-ISO-8859 character %u\n",
ali@70	999	linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44	1000	else
ali@70	1001	g_print(" Line %ld column %ld - "
ali@70	1002	"Non-ASCII character %u\n",
ali@70	1003	linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44	1004	else
ali@44	1005	cnt_bin++;
ali@70	1006	eNon_A=TRUE;
ali@44	1007	}
ali@70	1008	if (!eTab && c==CHAR_TAB)
ali@44	1009	{
ali@44	1010	if (pswit[ECHO_SWITCH])
ali@70	1011	g_print("\n%s\n",aline);
ali@44	1012	if (!pswit[OVERVIEW_SWITCH])
ali@70	1013	g_print(" Line %ld column %ld - Tab character?\n",
ali@70	1014	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1015	else
ali@44	1016	cnt_odd++;
ali@70	1017	eTab=TRUE;
ali@44	1018	}
ali@70	1019	if (!eTilde && c==CHAR_TILDE)
ali@44	1020	{
ali@44	1021	/*
ali@44	1022	* Often used by OCR software to indicate an
ali@44	1023	* unrecognizable character.
ali@44	1024	*/
ali@44	1025	if (pswit[ECHO_SWITCH])
ali@70	1026	g_print("\n%s\n",aline);
ali@44	1027	if (!pswit[OVERVIEW_SWITCH])
ali@70	1028	g_print(" Line %ld column %ld - Tilde character?\n",
ali@70	1029	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1030	else
ali@44	1031	cnt_odd++;
ali@70	1032	eTilde=TRUE;
ali@44	1033	}
ali@70	1034	if (!eCarat && c==CHAR_CARAT)
ali@44	1035	{
ali@44	1036	if (pswit[ECHO_SWITCH])
ali@70	1037	g_print("\n%s\n",aline);
ali@44	1038	if (!pswit[OVERVIEW_SWITCH])
ali@70	1039	g_print(" Line %ld column %ld - Carat character?\n",
ali@70	1040	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1041	else
ali@44	1042	cnt_odd++;
ali@70	1043	eCarat=TRUE;
ali@44	1044	}
ali@70	1045	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
ali@44	1046	{
ali@44	1047	if (pswit[ECHO_SWITCH])
ali@70	1048	g_print("\n%s\n",aline);
ali@44	1049	if (!pswit[OVERVIEW_SWITCH])
ali@70	1050	g_print(" Line %ld column %ld - Forward slash?\n",
ali@70	1051	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1052	else
ali@44	1053	cnt_odd++;
ali@70	1054	eFSlash=TRUE;
ali@44	1055	}
ali@44	1056	/*
ali@44	1057	* Report asterisks only in paranoid mode,
ali@44	1058	* since they're often deliberate.
ali@44	1059	*/
ali@44	1060	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@70	1061	c==CHAR_ASTERISK)
ali@44	1062	{
ali@44	1063	if (pswit[ECHO_SWITCH])
ali@70	1064	g_print("\n%s\n",aline);
ali@44	1065	if (!pswit[OVERVIEW_SWITCH])
ali@70	1066	g_print(" Line %ld column %ld - Asterisk?\n",
ali@70	1067	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44	1068	else
ali@44	1069	cnt_odd++;
ali@70	1070	eAst=TRUE;
ali@44	1071	}
ali@44	1072	}
ali@44	1073	}
ali@44	1074
ali@44	1075	/*
ali@45	1076	* check_for_long_line:
ali@45	1077	*
ali@45	1078	* Check for line too long.
ali@45	1079	*/
ali@45	1080	void check_for_long_line(const char *aline)
ali@45	1081	{
ali@70	1082	if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
ali@45	1083	{
ali@45	1084	if (pswit[ECHO_SWITCH])
ali@70	1085	g_print("\n%s\n",aline);
ali@45	1086	if (!pswit[OVERVIEW_SWITCH])
ali@70	1087	g_print(" Line %ld column %ld - Long line %ld\n",
ali@70	1088	linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
ali@45	1089	else
ali@45	1090	cnt_long++;
ali@45	1091	}
ali@45	1092	}
ali@45	1093
ali@45	1094	/*
ali@45	1095	* check_for_short_line:
ali@45	1096	*
ali@45	1097	* Check for line too short.
ali@45	1098	*
ali@45	1099	* This one is a bit trickier to implement: we don't want to
ali@45	1100	* flag the last line of a paragraph for being short, so we
ali@45	1101	* have to wait until we know that our current line is a
ali@45	1102	* "normal" line, then report the _previous_ line if it was too
ali@45	1103	* short. We also don't want to report indented lines like
ali@45	1104	* chapter heads or formatted quotations. We therefore keep
ali@45	1105	* last->len as the length of the last line examined, and
ali@45	1106	* last->blen as the length of the last but one, and try to
ali@45	1107	* suppress unnecessary warnings by checking that both were of
ali@45	1108	* "normal" length. We keep the first character of the last
ali@45	1109	* line in last->start, and if it was a space, we assume that
ali@45	1110	* the formatting is deliberate. I can't figure out a way to
ali@45	1111	* distinguish something like a quoted verse left-aligned or
ali@45	1112	* the header or footer of a letter from a paragraph of short
ali@45	1113	* lines - maybe if I examined the whole paragraph, and if the
ali@45	1114	* para has less than, say, 8 lines and if all lines are short,
ali@45	1115	* then just assume it's OK? Need to look at some texts to see
ali@45	1116	* how often a formula like this would get the right result.
ali@45	1117	*/
ali@45	1118	void check_for_short_line(const char aline,const struct line_properties last)
ali@45	1119	{
ali@70	1120	if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
ali@70	1121	last->len<SHORTEST_PG_LINE && last->blen>1 &&
ali@70	1122	last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45	1123	{
ali@45	1124	if (pswit[ECHO_SWITCH])
ali@70	1125	g_print("\n%s\n",prevline);
ali@45	1126	if (!pswit[OVERVIEW_SWITCH])
ali@70	1127	g_print(" Line %ld column %ld - Short line %ld?\n",
ali@70	1128	linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
ali@45	1129	else
ali@45	1130	cnt_short++;
ali@45	1131	}
ali@45	1132	}
ali@45	1133
ali@45	1134	/*
ali@46	1135	* check_for_starting_punctuation:
ali@46	1136	*
ali@46	1137	* Look for punctuation other than full ellipses at start of line.
ali@46	1138	*/
ali@46	1139	void check_for_starting_punctuation(const char *aline)
ali@46	1140	{
ali@70	1141	if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
ali@70	1142	!g_str_has_prefix(aline,". . ."))
ali@46	1143	{
ali@46	1144	if (pswit[ECHO_SWITCH])
ali@70	1145	g_print("\n%s\n",aline);
ali@46	1146	if (!pswit[OVERVIEW_SWITCH])
ali@70	1147	g_print(" Line %ld column 1 - Begins with punctuation?\n",
ali@46	1148	linecnt);
ali@46	1149	else
ali@46	1150	cnt_punct++;
ali@46	1151	}
ali@46	1152	}
ali@46	1153
/*
 * str_emdash:
 *
 * Locate the first em-dash in <s>, where an em-dash is either the
 * two-hyphen sequence "--" or the Unicode character "—", whichever
 * comes first.
 *
 * Returns a pointer to the dash and stores in <next> a pointer to the
 * character following it. If there is no dash, returns NULL and leaves
 * <next> untouched.
 */
char *str_emdash(const char *s,const char **next)
{
    const char *hyphens=strstr(s,"--");
    const char *unichar=strstr(s,"—");
    if (hyphens && (!unichar || hyphens<unichar))
    {
        /* "--" comes first (or is the only kind): skip both hyphens */
        *next=g_utf8_next_char(g_utf8_next_char(hyphens));
        return (char *)hyphens;
    }
    if (unichar)
        *next=g_utf8_next_char(unichar);
    return (char *)unichar;
}
ali@97	1187
ali@97	1188	/*
ali@47	1189	* check_for_spaced_emdash:
ali@47	1190	*
ali@47	1191	* Check for spaced em-dashes.
ali@47	1192	*
ali@97	1193	* We must check _all_ occurrences of em-dashes on the line
ali@97	1194	* hence the loop - even if the first dash is OK
ali@47	1195	* there may be another that's wrong later on.
ali@47	1196	*/
ali@47	1197	void check_for_spaced_emdash(const char *aline)
ali@47	1198	{
ali@70	1199	const char s,t,*next;
ali@97	1200	for (s=aline;t=str_emdash(s,&next);s=next)
ali@47	1201	{
ali@70	1202	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE \|\|
ali@70	1203	g_utf8_get_char(next)==CHAR_SPACE)
ali@47	1204	{
ali@47	1205	if (pswit[ECHO_SWITCH])
ali@70	1206	g_print("\n%s\n",aline);
ali@47	1207	if (!pswit[OVERVIEW_SWITCH])
ali@70	1208	g_print(" Line %ld column %ld - Spaced em-dash?\n",
ali@70	1209	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@47	1210	else
ali@47	1211	cnt_dash++;
ali@47	1212	}
ali@47	1213	}
ali@47	1214	}
ali@47	1215
ali@47	1216	/*
ali@47	1217	* check_for_spaced_dash:
ali@47	1218	*
ali@47	1219	* Check for spaced dashes.
ali@47	1220	*/
ali@47	1221	void check_for_spaced_dash(const char *aline)
ali@47	1222	{
ali@47	1223	const char *s;
ali@47	1224	if ((s=strstr(aline," -")))
ali@47	1225	{
ali@70	1226	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
ali@47	1227	{
ali@47	1228	if (pswit[ECHO_SWITCH])
ali@70	1229	g_print("\n%s\n",aline);
ali@47	1230	if (!pswit[OVERVIEW_SWITCH])
ali@70	1231	g_print(" Line %ld column %ld - Spaced dash?\n",
ali@70	1232	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47	1233	else
ali@47	1234	cnt_dash++;
ali@47	1235	}
ali@47	1236	}
ali@47	1237	else if ((s=strstr(aline,"- ")))
ali@47	1238	{
ali@70	1239	if (s==aline \|\| g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@47	1240	{
ali@47	1241	if (pswit[ECHO_SWITCH])
ali@70	1242	g_print("\n%s\n",aline);
ali@47	1243	if (!pswit[OVERVIEW_SWITCH])
ali@70	1244	g_print(" Line %ld column %ld - Spaced dash?\n",
ali@70	1245	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47	1246	else
ali@47	1247	cnt_dash++;
ali@47	1248	}
ali@47	1249	}
ali@47	1250	}
ali@47	1251
ali@47	1252	/*
ali@48	1253	* check_for_unmarked_paragraphs:
ali@48	1254	*
ali@48	1255	* Check for unmarked paragraphs indicated by separate speakers.
ali@48	1256	*
ali@48	1257	* May well be false positive:
ali@48	1258	* "Bravo!" "Wonderful!" called the crowd.
ali@48	1259	* but useful all the same.
ali@48	1260	*/
ali@48	1261	void check_for_unmarked_paragraphs(const char *aline)
ali@48	1262	{
ali@48	1263	const char *s;
ali@48	1264	s=strstr(aline,"\" \"");
ali@48	1265	if (!s)
ali@48	1266	s=strstr(aline,"\" \"");
ali@48	1267	if (s)
ali@48	1268	{
ali@48	1269	if (pswit[ECHO_SWITCH])
ali@70	1270	g_print("\n%s\n",aline);
ali@48	1271	if (!pswit[OVERVIEW_SWITCH])
ali@70	1272	g_print(" Line %ld column %ld - "
ali@70	1273	"Query missing paragraph break?\n",
ali@70	1274	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@48	1275	else
ali@48	1276	cnt_punct++;
ali@48	1277	}
ali@48	1278	}
ali@48	1279
ali@48	1280	/*
ali@49	1281	* check_for_jeebies:
ali@49	1282	*
ali@49	1283	* Check for "to he" and other easy h/b errors.
ali@49	1284	*
ali@49	1285	* This is a very inadequate effort on the h/b problem,
ali@49	1286	* but the phrase "to he" is always an error, whereas "to
ali@49	1287	* be" is quite common.
ali@49	1288	* Similarly, '"Quiet!", be said.' is a non-be error
ali@49	1289	* "to he" is _not_ always an error!:
ali@49	1290	* "Where they went to he couldn't say."
ali@49	1291	* Another false positive:
ali@49	1292	* What would "Cinderella" be without the . . .
ali@49	1293	* and another: "If he wants to he can see for himself."
ali@49	1294	*/
ali@49	1295	void check_for_jeebies(const char *aline)
ali@49	1296	{
ali@49	1297	const char *s;
ali@49	1298	s=strstr(aline," be could ");
ali@49	1299	if (!s)
ali@49	1300	s=strstr(aline," be would ");
ali@49	1301	if (!s)
ali@49	1302	s=strstr(aline," was be ");
ali@49	1303	if (!s)
ali@49	1304	s=strstr(aline," be is ");
ali@49	1305	if (!s)
ali@49	1306	s=strstr(aline," is be ");
ali@49	1307	if (!s)
ali@49	1308	s=strstr(aline,"\", be ");
ali@49	1309	if (!s)
ali@49	1310	s=strstr(aline,"\" be ");
ali@49	1311	if (!s)
ali@49	1312	s=strstr(aline,"\" be ");
ali@49	1313	if (!s)
ali@49	1314	s=strstr(aline," to he ");
ali@49	1315	if (s)
ali@49	1316	{
ali@49	1317	if (pswit[ECHO_SWITCH])
ali@70	1318	g_print("\n%s\n",aline);
ali@49	1319	if (!pswit[OVERVIEW_SWITCH])
ali@70	1320	g_print(" Line %ld column %ld - Query he/be error?\n",
ali@70	1321	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49	1322	else
ali@49	1323	cnt_word++;
ali@49	1324	}
ali@49	1325	s=strstr(aline," the had ");
ali@49	1326	if (!s)
ali@49	1327	s=strstr(aline," a had ");
ali@49	1328	if (!s)
ali@49	1329	s=strstr(aline," they bad ");
ali@49	1330	if (!s)
ali@49	1331	s=strstr(aline," she bad ");
ali@49	1332	if (!s)
ali@49	1333	s=strstr(aline," he bad ");
ali@49	1334	if (!s)
ali@49	1335	s=strstr(aline," you bad ");
ali@49	1336	if (!s)
ali@49	1337	s=strstr(aline," i bad ");
ali@49	1338	if (s)
ali@49	1339	{
ali@49	1340	if (pswit[ECHO_SWITCH])
ali@70	1341	g_print("\n%s\n",aline);
ali@49	1342	if (!pswit[OVERVIEW_SWITCH])
ali@70	1343	g_print(" Line %ld column %ld - Query had/bad error?\n",
ali@70	1344	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49	1345	else
ali@49	1346	cnt_word++;
ali@49	1347	}
ali@49	1348	s=strstr(aline,"; hut ");
ali@49	1349	if (!s)
ali@49	1350	s=strstr(aline,", hut ");
ali@49	1351	if (s)
ali@49	1352	{
ali@49	1353	if (pswit[ECHO_SWITCH])
ali@70	1354	g_print("\n%s\n",aline);
ali@49	1355	if (!pswit[OVERVIEW_SWITCH])
ali@70	1356	g_print(" Line %ld column %ld - Query hut/but error?\n",
ali@70	1357	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49	1358	else
ali@49	1359	cnt_word++;
ali@49	1360	}
ali@49	1361	}
ali@49	1362
ali@49	1363	/*
ali@50	1364	* check_for_mta_from:
ali@50	1365	*
ali@50	1366	* Special case - angled bracket in front of "From" placed there by an
ali@50	1367	* MTA when sending an e-mail.
ali@50	1368	*/
ali@50	1369	void check_for_mta_from(const char *aline)
ali@50	1370	{
ali@50	1371	const char *s;
ali@50	1372	s=strstr(aline,">From");
ali@50	1373	if (s)
ali@50	1374	{
ali@50	1375	if (pswit[ECHO_SWITCH])
ali@70	1376	g_print("\n%s\n",aline);
ali@50	1377	if (!pswit[OVERVIEW_SWITCH])
ali@70	1378	g_print(" Line %ld column %ld - "
ali@70	1379	"Query angled bracket with From\n",
ali@70	1380	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@50	1381	else
ali@50	1382	cnt_punct++;
ali@50	1383	}
ali@50	1384	}
ali@50	1385
ali@50	1386	/*
ali@51	1387	* check_for_orphan_character:
ali@51	1388	*
ali@51	1389	* Check for a single character line -
ali@51	1390	* often an overflow from bad wrapping.
ali@51	1391	*/
ali@51	1392	void check_for_orphan_character(const char *aline)
ali@51	1393	{
ali@70	1394	gunichar c;
ali@70	1395	c=g_utf8_get_char(aline);
ali@70	1396	if (c && !*g_utf8_next_char(aline))
ali@51	1397	{
ali@70	1398	if (c=='I' \|\| c=='V' \|\| c=='X' \|\| c=='L' \|\| g_unichar_isdigit(c))
ali@51	1399	; /* Nothing - ignore numerals alone on a line. */
ali@51	1400	else
ali@51	1401	{
ali@51	1402	if (pswit[ECHO_SWITCH])
ali@70	1403	g_print("\n%s\n",aline);
ali@51	1404	if (!pswit[OVERVIEW_SWITCH])
ali@70	1405	g_print(" Line %ld column 1 - Query single character line\n",
ali@51	1406	linecnt);
ali@51	1407	else
ali@51	1408	cnt_punct++;
ali@51	1409	}
ali@51	1410	}
ali@51	1411	}
ali@51	1412
ali@51	1413	/*
ali@52	1414	* check_for_pling_scanno:
ali@52	1415	*
ali@52	1416	* Check for I" - often should be !
ali@52	1417	*/
ali@52	1418	void check_for_pling_scanno(const char *aline)
ali@52	1419	{
ali@52	1420	const char *s;
ali@52	1421	s=strstr(aline," I\"");
ali@52	1422	if (s)
ali@52	1423	{
ali@52	1424	if (pswit[ECHO_SWITCH])
ali@70	1425	g_print("\n%s\n",aline);
ali@52	1426	if (!pswit[OVERVIEW_SWITCH])
ali@70	1427	g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
ali@70	1428	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@52	1429	else
ali@52	1430	cnt_punct++;
ali@52	1431	}
ali@52	1432	}
ali@52	1433
ali@52	1434	/*
ali@53	1435	* check_for_extra_period:
ali@53	1436	*
ali@53	1437	* Check for period without a capital letter. Cut-down from gutspell.
ali@53	1438	* Only works when it happens on a single line.
ali@53	1439	*/
ali@53	1440	void check_for_extra_period(const char aline,const struct warnings warnings)
ali@53	1441	{
ali@92	1442	const char s,t,s1,sprev;
ali@69	1443	int i;
ali@70	1444	gsize len;
ali@69	1445	gboolean istypo;
ali@69	1446	gchar *testword;
ali@92	1447	gunichar c,nc,pc,*decomposition;
ali@53	1448	if (pswit[PARANOID_SWITCH])
ali@53	1449	{
ali@70	1450	for (t=aline;t=strstr(t,". ");)
ali@53	1451	{
ali@69	1452	if (t==aline)
ali@53	1453	{
ali@70	1454	t=g_utf8_next_char(t);
ali@53	1455	/* start of line punctuation is handled elsewhere */
ali@53	1456	continue;
ali@53	1457	}
ali@70	1458	if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
ali@53	1459	{
ali@70	1460	t=g_utf8_next_char(t);
ali@53	1461	continue;
ali@53	1462	}
ali@53	1463	if (warnings->isDutch)
ali@53	1464	{
ali@53	1465	/* For Frank & Jeroen -- 's Middags case */
ali@70	1466	gunichar c2,c3,c4,c5;
ali@70	1467	c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
ali@70	1468	c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
ali@70	1469	c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
ali@70	1470	c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
ali@92	1471	if (CHAR_IS_APOSTROPHE(c2) &&
ali@92	1472	g_unichar_islower(c3) && c4==CHAR_SPACE &&
ali@92	1473	g_unichar_isupper(c5))
ali@53	1474	{
ali@70	1475	t=g_utf8_next_char(t);
ali@53	1476	continue;
ali@53	1477	}
ali@53	1478	}
ali@70	1479	s1=g_utf8_next_char(g_utf8_next_char(t));
ali@70	1480	while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
ali@99	1481	!g_unichar_isdigit(g_utf8_get_char(s1)))
ali@70	1482	s1=g_utf8_next_char(s1);
ali@70	1483	if (g_unichar_islower(g_utf8_get_char(s1)))
ali@53	1484	{
ali@53	1485	/* we have something to investigate */
ali@69	1486	istypo=TRUE;
ali@53	1487	/* so let's go back and find out */
ali@92	1488	nc=g_utf8_get_char(t);
ali@92	1489	s1=g_utf8_prev_char(t);
ali@92	1490	c=g_utf8_get_char(s1);
ali@92	1491	sprev=g_utf8_prev_char(s1);
ali@92	1492	pc=g_utf8_get_char(sprev);
ali@92	1493	while (s1>=aline &&
ali@92	1494	(g_unichar_isalpha(c) \|\| g_unichar_isdigit(c) \|\|
ali@92	1495	g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
ali@92	1496	g_unichar_isalpha(nc)))
ali@92	1497	{
ali@92	1498	nc=c;
ali@92	1499	s1=sprev;
ali@92	1500	c=pc;
ali@92	1501	sprev=g_utf8_prev_char(s1);
ali@92	1502	pc=g_utf8_get_char(sprev);
ali@92	1503	}
ali@70	1504	s1=g_utf8_next_char(s1);
ali@69	1505	s=strchr(s1,'.');
ali@69	1506	if (s)
ali@69	1507	testword=g_strndup(s1,s-s1);
ali@69	1508	else
ali@69	1509	testword=g_strdup(s1);
ali@53	1510	for (i=0;*abbrev[i];i++)
ali@53	1511	if (!strcmp(testword,abbrev[i]))
ali@69	1512	istypo=FALSE;
ali@70	1513	if (g_unichar_isdigit(g_utf8_get_char(testword)))
ali@69	1514	istypo=FALSE;
ali@70	1515	if (!*g_utf8_next_char(testword))
ali@69	1516	istypo=FALSE;
ali@53	1517	if (isroman(testword))
ali@69	1518	istypo=FALSE;
ali@53	1519	if (istypo)
ali@53	1520	{
ali@69	1521	istypo=FALSE;
ali@70	1522	for (s=testword;*s;s=g_utf8_next_char(s))
ali@70	1523	{
ali@70	1524	decomposition=g_unicode_canonical_decomposition(
ali@70	1525	g_utf8_get_char(s),&len);
ali@70	1526	if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@69	1527	istypo=TRUE;
ali@70	1528	g_free(decomposition);
ali@70	1529	}
ali@53	1530	}
ali@69	1531	if (istypo &&
ali@69	1532	(pswit[VERBOSE_SWITCH] \|\| !g_tree_lookup(qperiod,testword)))
ali@53	1533	{
ali@69	1534	g_tree_insert(qperiod,g_strdup(testword),
ali@69	1535	GINT_TO_POINTER(1));
ali@69	1536	if (pswit[ECHO_SWITCH])
ali@70	1537	g_print("\n%s\n",aline);
ali@69	1538	if (!pswit[OVERVIEW_SWITCH])
ali@70	1539	g_print(" Line %ld column %ld - Extra period?\n",
ali@70	1540	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@69	1541	else
ali@69	1542	cnt_punct++;
ali@53	1543	}
ali@69	1544	g_free(testword);
ali@53	1545	}
ali@70	1546	t=g_utf8_next_char(t);
ali@53	1547	}
ali@53	1548	}
ali@53	1549	}
ali@53	1550
ali@53	1551	/*
ali@54	1552	* check_for_following_punctuation:
ali@54	1553	*
ali@54	1554	* Check for words usually not followed by punctuation.
ali@54	1555	*/
ali@54	1556	void check_for_following_punctuation(const char *aline)
ali@54	1557	{
ali@54	1558	int i;
ali@54	1559	const char s,wordstart;
ali@70	1560	gunichar c;
ali@69	1561	gchar inword,t;
ali@54	1562	if (pswit[TYPO_SWITCH])
ali@54	1563	{
ali@54	1564	for (s=aline;*s;)
ali@54	1565	{
ali@54	1566	wordstart=s;
ali@69	1567	t=getaword(&s);
ali@69	1568	if (!*t)
ali@69	1569	{
ali@69	1570	g_free(t);
ali@54	1571	continue;
ali@69	1572	}
ali@70	1573	inword=g_utf8_strdown(t,-1);
ali@69	1574	g_free(t);
ali@54	1575	for (i=0;*nocomma[i];i++)
ali@54	1576	if (!strcmp(inword,nocomma[i]))
ali@54	1577	{
ali@70	1578	c=g_utf8_get_char(s);
ali@70	1579	if (c==',' \|\| c==';' \|\| c==':')
ali@54	1580	{
ali@54	1581	if (pswit[ECHO_SWITCH])
ali@70	1582	g_print("\n%s\n",aline);
ali@54	1583	if (!pswit[OVERVIEW_SWITCH])
ali@70	1584	g_print(" Line %ld column %ld - "
ali@54	1585	"Query punctuation after %s?\n",
ali@70	1586	linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70	1587	inword);
ali@54	1588	else
ali@54	1589	cnt_punct++;
ali@54	1590	}
ali@54	1591	}
ali@54	1592	for (i=0;*noperiod[i];i++)
ali@54	1593	if (!strcmp(inword,noperiod[i]))
ali@54	1594	{
ali@70	1595	c=g_utf8_get_char(s);
ali@70	1596	if (c=='.' \|\| c=='!')
ali@54	1597	{
ali@54	1598	if (pswit[ECHO_SWITCH])
ali@70	1599	g_print("\n%s\n",aline);
ali@54	1600	if (!pswit[OVERVIEW_SWITCH])
ali@70	1601	g_print(" Line %ld column %ld - "
ali@54	1602	"Query punctuation after %s?\n",
ali@70	1603	linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70	1604	inword);
ali@54	1605	else
ali@54	1606	cnt_punct++;
ali@54	1607	}
ali@54	1608	}
ali@69	1609	g_free(inword);
ali@54	1610	}
ali@54	1611	}
ali@54	1612	}
ali@54	1613
ali@54	1614	/*
ali@55	1615	* check_for_typos:
ali@55	1616	*
ali@55	1617	* Check for commonly mistyped words,
ali@55	1618	* and digits like 0 for O in a word.
ali@55	1619	*/
ali@55	1620	void check_for_typos(const char aline,struct warnings warnings)
ali@55	1621	{
ali@70	1622	const char s,t,nt,wordstart;
ali@70	1623	gchar *inword;
ali@70	1624	gunichar *decomposition;
ali@70	1625	gchar *testword;
ali@70	1626	int i,vowel,consonant,*dupcnt;
ali@70	1627	gboolean isdup,istypo,alower;
ali@92	1628	gunichar c,pc;
ali@70	1629	long offset,len;
ali@70	1630	gsize decomposition_len;
ali@55	1631	for (s=aline;*s;)
ali@55	1632	{
ali@55	1633	wordstart=s;
ali@69	1634	inword=getaword(&s);
ali@55	1635	if (!*inword)
ali@69	1636	{
ali@69	1637	g_free(inword);
ali@55	1638	continue; /* don't bother with empty lines */
ali@69	1639	}
ali@55	1640	if (mixdigit(inword))
ali@55	1641	{
ali@55	1642	if (pswit[ECHO_SWITCH])
ali@70	1643	g_print("\n%s\n",aline);
ali@55	1644	if (!pswit[OVERVIEW_SWITCH])
ali@70	1645	g_print(" Line %ld column %ld - Query digit in %s\n",
ali@70	1646	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
ali@55	1647	else
ali@55	1648	cnt_word++;
ali@55	1649	}
ali@55	1650	/*
ali@55	1651	* Put the word through a series of tests for likely typos and OCR
ali@55	1652	* errors.
ali@55	1653	*/
ali@69	1654	if (pswit[TYPO_SWITCH] \|\| pswit[USERTYPO_SWITCH])
ali@55	1655	{
ali@69	1656	istypo=FALSE;
ali@70	1657	alower=FALSE;
ali@70	1658	for (t=inword;*t;t=g_utf8_next_char(t))
ali@55	1659	{
ali@70	1660	c=g_utf8_get_char(t);
ali@70	1661	nt=g_utf8_next_char(t);
ali@55	1662	/* lowercase for testing */
ali@70	1663	if (g_unichar_islower(c))
ali@70	1664	alower=TRUE;
ali@70	1665	if (alower && (g_unichar_isupper(c) \|\| g_unichar_istitle(c)))
ali@55	1666	{
ali@55	1667	/*
ali@55	1668	* We have an uppercase mid-word. However, there are
ali@55	1669	* common cases:
ali@55	1670	* Mac and Mc like McGill
ali@55	1671	* French contractions like l'Abbe
ali@55	1672	*/
ali@70	1673	offset=g_utf8_pointer_to_offset(inword,t);
ali@92	1674	if (offset>0)
ali@92	1675	pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@92	1676	else
ali@92	1677	pc='\0';
ali@70	1678	if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' \|\|
ali@70	1679	offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
ali@70	1680	g_utf8_get_char(g_utf8_next_char(nt))=='c' \|\|
ali@92	1681	CHAR_IS_APOSTROPHE(pc))
ali@55	1682	; /* do nothing! */
ali@55	1683	else
ali@69	1684	istypo=TRUE;
ali@55	1685	}
ali@55	1686	}
ali@70	1687	testword=g_utf8_casefold(inword,-1);
ali@69	1688	}
ali@69	1689	if (pswit[TYPO_SWITCH])
ali@69	1690	{
ali@55	1691	/*
ali@55	1692	* Check for certain unlikely two-letter combinations at word
ali@55	1693	* start and end.
ali@55	1694	*/
ali@70	1695	len=g_utf8_strlen(testword,-1);
ali@70	1696	if (len>1)
ali@55	1697	{
ali@55	1698	for (i=0;*nostart[i];i++)
ali@70	1699	if (g_str_has_prefix(testword,nostart[i]))
ali@69	1700	istypo=TRUE;
ali@55	1701	for (i=0;*noend[i];i++)
ali@70	1702	if (g_str_has_suffix(testword,noend[i]))
ali@69	1703	istypo=TRUE;
ali@55	1704	}
ali@55	1705	/* ght is common, gbt never. Like that. */
ali@55	1706	if (strstr(testword,"cb"))
ali@69	1707	istypo=TRUE;
ali@55	1708	if (strstr(testword,"gbt"))
ali@69	1709	istypo=TRUE;
ali@55	1710	if (strstr(testword,"pbt"))
ali@69	1711	istypo=TRUE;
ali@55	1712	if (strstr(testword,"tbs"))
ali@69	1713	istypo=TRUE;
ali@55	1714	if (strstr(testword,"mrn"))
ali@69	1715	istypo=TRUE;
ali@55	1716	if (strstr(testword,"ahle"))
ali@69	1717	istypo=TRUE;
ali@55	1718	if (strstr(testword,"ihle"))
ali@69	1719	istypo=TRUE;
ali@55	1720	/*
ali@55	1721	* "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55	1722	* Also "TBI" - frostbite, outbid - but uncommon.
ali@55	1723	* Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55	1724	* numerals, but "ii" is a common scanno.
ali@55	1725	*/
ali@55	1726	if (strstr(testword,"tbi"))
ali@69	1727	istypo=TRUE;
ali@55	1728	if (strstr(testword,"tbe"))
ali@69	1729	istypo=TRUE;
ali@55	1730	if (strstr(testword,"ii"))
ali@69	1731	istypo=TRUE;
ali@55	1732	/*
ali@55	1733	* Check for no vowels or no consonants.
ali@55	1734	* If none, flag a typo.
ali@55	1735	*/
ali@70	1736	if (!istypo && len>1)
ali@55	1737	{
ali@55	1738	vowel=consonant=0;
ali@70	1739	for (t=testword;*t;t=g_utf8_next_char(t))
ali@55	1740	{
ali@70	1741	c=g_utf8_get_char(t);
ali@70	1742	decomposition=
ali@70	1743	g_unicode_canonical_decomposition(c,&decomposition_len);
ali@70	1744	if (c=='y' \|\| g_unichar_isdigit(c))
ali@55	1745	{
ali@55	1746	/* Yah, this is loose. */
ali@55	1747	vowel++;
ali@55	1748	consonant++;
ali@55	1749	}
ali@70	1750	else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@55	1751	vowel++;
ali@55	1752	else
ali@55	1753	consonant++;
ali@70	1754	g_free(decomposition);
ali@55	1755	}
ali@55	1756	if (!vowel \|\| !consonant)
ali@69	1757	istypo=TRUE;
ali@55	1758	}
ali@55	1759	/*
ali@55	1760	* Now exclude the word from being reported if it's in
ali@55	1761	* the okword list.
ali@55	1762	*/
ali@55	1763	for (i=0;*okword[i];i++)
ali@55	1764	if (!strcmp(testword,okword[i]))
ali@69	1765	istypo=FALSE;
ali@55	1766	/*
ali@55	1767	* What looks like a typo may be a Roman numeral.
ali@55	1768	* Exclude these.
ali@55	1769	*/
ali@55	1770	if (istypo && isroman(testword))
ali@69	1771	istypo=FALSE;
ali@55	1772	/* Check the manual list of typos. */
ali@55	1773	if (!istypo)
ali@55	1774	for (i=0;*typo[i];i++)
ali@55	1775	if (!strcmp(testword,typo[i]))
ali@69	1776	istypo=TRUE;
ali@55	1777	/*
ali@55	1778	* Check lowercase s, l, i and m - special cases.
ali@55	1779	* "j" - often a semi-colon gone wrong.
ali@55	1780	* "d" for a missing apostrophe - he d
ali@55	1781	* "n" for "in"
ali@55	1782	*/
ali@70	1783	if (!istypo && len==1 &&
ali@70	1784	g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
ali@69	1785	istypo=TRUE;
ali@55	1786	if (istypo)
ali@55	1787	{
ali@69	1788	dupcnt=g_tree_lookup(qword,testword);
ali@69	1789	if (dupcnt)
ali@69	1790	{
ali@69	1791	(*dupcnt)++;
ali@69	1792	isdup=!pswit[VERBOSE_SWITCH];
ali@69	1793	}
ali@69	1794	else
ali@69	1795	{
ali@69	1796	dupcnt=g_new0(int,1);
ali@69	1797	g_tree_insert(qword,g_strdup(testword),dupcnt);
ali@69	1798	isdup=FALSE;
ali@69	1799	}
ali@55	1800	if (!isdup)
ali@55	1801	{
ali@55	1802	if (pswit[ECHO_SWITCH])
ali@70	1803	g_print("\n%s\n",aline);
ali@55	1804	if (!pswit[OVERVIEW_SWITCH])
ali@55	1805	{
ali@70	1806	g_print(" Line %ld column %ld - Query word %s",
ali@70	1807	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
ali@70	1808	inword);
ali@69	1809	if (!pswit[VERBOSE_SWITCH])
ali@70	1810	g_print(" - not reporting duplicates");
ali@70	1811	g_print("\n");
ali@55	1812	}
ali@55	1813	else
ali@55	1814	cnt_word++;
ali@55	1815	}
ali@55	1816	}
ali@55	1817	}
ali@55	1818	/* check the user's list of typos */
ali@69	1819	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
ali@69	1820	{
ali@69	1821	if (pswit[ECHO_SWITCH])
ali@70	1822	g_print("\n%s\n",aline);
ali@69	1823	if (!pswit[OVERVIEW_SWITCH])
ali@70	1824	g_print(" Line %ld column %ld - Query possible scanno %s\n",
ali@70	1825	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
ali@69	1826	}
ali@69	1827	if (pswit[TYPO_SWITCH] \|\| pswit[USERTYPO_SWITCH])
ali@69	1828	g_free(testword);
ali@55	1829	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55	1830	{
ali@55	1831	/* In paranoid mode, query all 0 and 1 standing alone. */
ali@55	1832	if (!strcmp(inword,"0") \|\| !strcmp(inword,"1"))
ali@55	1833	{
ali@55	1834	if (pswit[ECHO_SWITCH])
ali@70	1835	g_print("\n%s\n",aline);
ali@55	1836	if (!pswit[OVERVIEW_SWITCH])
ali@70	1837	g_print(" Line %ld column %ld - Query standalone %s\n",
ali@70	1838	linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
ali@70	1839	inword);
ali@55	1840	else
ali@55	1841	cnt_word++;
ali@55	1842	}
ali@55	1843	}
ali@69	1844	g_free(inword);
ali@55	1845	}
ali@55	1846	}
ali@55	1847
ali@56	1848	/*
ali@56	1849	* check_for_misspaced_punctuation:
ali@56	1850	*
ali@56	1851	* Look for added or missing spaces around punctuation and quotes.
ali@56	1852	* If there is a punctuation character like ! with no space on
ali@56	1853	* either side, suspect a missing!space. If there are spaces on
ali@56	1854	* both sides , assume a typo. If we see a double quote with no
ali@56	1855	* space or punctuation on either side of it, assume unspaced
ali@56	1856	* quotes "like"this.
ali@56	1857	*/
ali@56	1858	void check_for_misspaced_punctuation(const char *aline,
ali@69	1859	struct parities *parities,gboolean isemptyline)
ali@56	1860	{
ali@69	1861	gboolean isacro,isellipsis;
ali@56	1862	const char *s;
ali@70	1863	gunichar c,nc,pc,n2c;
ali@94	1864	int parity;
ali@70	1865	c=g_utf8_get_char(aline);
ali@70	1866	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	1867	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	1868	{
ali@70	1869	pc=c;
ali@70	1870	c=nc;
ali@70	1871	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	1872	/* For each character in the line after the first. */
ali@70	1873	if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
ali@56	1874	{
ali@56	1875	/* we need to suppress warnings for acronyms like M.D. */
ali@69	1876	isacro=FALSE;
ali@56	1877	/* we need to suppress warnings for ellipsis . . . */
ali@69	1878	isellipsis=FALSE;
ali@70	1879	/*
ali@70	1880	* If there are letters on both sides of it or
ali@70	1881	* if it's strict punctuation followed by an alpha.
ali@70	1882	*/
ali@70	1883	if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) \|\|
ali@70	1884	g_utf8_strchr("?!,;:",-1,c)))
ali@56	1885	{
ali@70	1886	if (c=='.')
ali@56	1887	{
ali@70	1888	if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70	1889	g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69	1890	isacro=TRUE;
ali@70	1891	n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70	1892	if (nc && n2c=='.')
ali@69	1893	isacro=TRUE;
ali@56	1894	}
ali@56	1895	if (!isacro)
ali@56	1896	{
ali@56	1897	if (pswit[ECHO_SWITCH])
ali@70	1898	g_print("\n%s\n",aline);
ali@56	1899	if (!pswit[OVERVIEW_SWITCH])
ali@70	1900	g_print(" Line %ld column %ld - Missing space?\n",
ali@70	1901	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1902	else
ali@56	1903	cnt_punct++;
ali@56	1904	}
ali@56	1905	}
ali@70	1906	if (pc==CHAR_SPACE && (nc==CHAR_SPACE \|\| !nc))
ali@56	1907	{
ali@56	1908	/*
ali@56	1909	* If there are spaces on both sides,
ali@56	1910	* or space before and end of line.
ali@56	1911	*/
ali@70	1912	if (c=='.')
ali@56	1913	{
ali@70	1914	if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70	1915	g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69	1916	isellipsis=TRUE;
ali@70	1917	n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70	1918	if (nc && n2c=='.')
ali@69	1919	isellipsis=TRUE;
ali@56	1920	}
ali@56	1921	if (!isemptyline && !isellipsis)
ali@56	1922	{
ali@56	1923	if (pswit[ECHO_SWITCH])
ali@70	1924	g_print("\n%s\n",aline);
ali@56	1925	if (!pswit[OVERVIEW_SWITCH])
ali@70	1926	g_print(" Line %ld column %ld - "
ali@70	1927	"Spaced punctuation?\n",linecnt,
ali@70	1928	g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1929	else
ali@56	1930	cnt_punct++;
ali@56	1931	}
ali@56	1932	}
ali@56	1933	}
ali@56	1934	}
ali@56	1935	/* Split out the characters that CANNOT be preceded by space. */
ali@70	1936	c=g_utf8_get_char(aline);
ali@70	1937	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	1938	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	1939	{
ali@70	1940	pc=c;
ali@70	1941	c=nc;
ali@70	1942	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	1943	/* for each character in the line after the first */
ali@70	1944	if (g_utf8_strchr("?!,;:",-1,c))
ali@56	1945	{
ali@56	1946	/* if it's punctuation that _cannot_ have a space before it */
ali@70	1947	if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
ali@56	1948	{
ali@56	1949	/*
ali@70	1950	* If nc DOES == space,
ali@56	1951	* it was already reported just above.
ali@56	1952	*/
ali@56	1953	if (pswit[ECHO_SWITCH])
ali@70	1954	g_print("\n%s\n",aline);
ali@56	1955	if (!pswit[OVERVIEW_SWITCH])
ali@70	1956	g_print(" Line %ld column %ld - Spaced punctuation?\n",
ali@70	1957	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1958	else
ali@56	1959	cnt_punct++;
ali@56	1960	}
ali@56	1961	}
ali@56	1962	}
ali@56	1963	/*
ali@56	1964	* Special case " .X" where X is any alpha.
ali@56	1965	* This plugs a hole in the acronym code above.
ali@56	1966	* Inelegant, but maintainable.
ali@56	1967	*/
ali@70	1968	c=g_utf8_get_char(aline);
ali@70	1969	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	1970	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	1971	{
ali@70	1972	pc=c;
ali@70	1973	c=nc;
ali@70	1974	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	1975	/* for each character in the line after the first */
ali@70	1976	if (c=='.')
ali@56	1977	{
ali@56	1978	/* if it's a period */
ali@70	1979	if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
ali@56	1980	{
ali@56	1981	/*
ali@56	1982	* If the period follows a space and
ali@56	1983	* is followed by a letter.
ali@56	1984	*/
ali@56	1985	if (pswit[ECHO_SWITCH])
ali@70	1986	g_print("\n%s\n",aline);
ali@56	1987	if (!pswit[OVERVIEW_SWITCH])
ali@70	1988	g_print(" Line %ld column %ld - Spaced punctuation?\n",
ali@70	1989	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	1990	else
ali@56	1991	cnt_punct++;
ali@56	1992	}
ali@56	1993	}
ali@56	1994	}
ali@70	1995	c=g_utf8_get_char(aline);
ali@70	1996	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	1997	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56	1998	{
ali@70	1999	pc=c;
ali@70	2000	c=nc;
ali@70	2001	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56	2002	/* for each character in the line after the first */
ali@94	2003	if (CHAR_IS_DQUOTE(c))
ali@56	2004	{
ali@70	2005	if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
ali@70	2006	!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc \|\|
ali@70	2007	!g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
ali@56	2008	{
ali@56	2009	if (pswit[ECHO_SWITCH])
ali@70	2010	g_print("\n%s\n",aline);
ali@56	2011	if (!pswit[OVERVIEW_SWITCH])
ali@70	2012	g_print(" Line %ld column %ld - Unspaced quotes?\n",
ali@70	2013	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2014	else
ali@56	2015	cnt_punct++;
ali@56	2016	}
ali@56	2017	}
ali@56	2018	}
ali@56	2019	/* Check parity of quotes. */
ali@70	2020	nc=g_utf8_get_char(aline);
ali@70	2021	for (s=aline;*s;s=g_utf8_next_char(s))
ali@56	2022	{
ali@70	2023	c=nc;
ali@70	2024	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@94	2025	if (CHAR_IS_DQUOTE(c))
ali@56	2026	{
ali@94	2027	if (c==CHAR_DQUOTE)
ali@94	2028	{
ali@94	2029	parities->dquote=!parities->dquote;
ali@94	2030	parity=parities->dquote;
ali@94	2031	}
ali@94	2032	else if (c==CHAR_LD_QUOTE)
ali@94	2033	parity=1;
ali@94	2034	else
ali@94	2035	parity=0;
ali@94	2036	if (!parity)
ali@56	2037	{
ali@56	2038	/* parity even */
ali@99	2039	if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
ali@56	2040	{
ali@56	2041	if (pswit[ECHO_SWITCH])
ali@70	2042	g_print("\n%s\n",aline);
ali@56	2043	if (!pswit[OVERVIEW_SWITCH])
ali@70	2044	g_print(" Line %ld column %ld - "
ali@70	2045	"Wrongspaced quotes?\n",
ali@70	2046	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2047	else
ali@56	2048	cnt_punct++;
ali@56	2049	}
ali@56	2050	}
ali@56	2051	else
ali@56	2052	{
ali@56	2053	/* parity odd */
ali@99	2054	if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
ali@99	2055	!g_utf8_strchr("_-/.'`‘’([{$",-1,nc) \|\| !nc)
ali@56	2056	{
ali@56	2057	if (pswit[ECHO_SWITCH])
ali@70	2058	g_print("\n%s\n",aline);
ali@56	2059	if (!pswit[OVERVIEW_SWITCH])
ali@70	2060	g_print(" Line %ld column %ld - "
ali@70	2061	"Wrongspaced quotes?\n",
ali@70	2062	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2063	else
ali@56	2064	cnt_punct++;
ali@56	2065	}
ali@56	2066	}
ali@56	2067	}
ali@56	2068	}
ali@94	2069	c=g_utf8_get_char(aline);
ali@94	2070	if (CHAR_IS_DQUOTE(c))
ali@56	2071	{
ali@70	2072	if (g_utf8_strchr(",;:!?)]} ",-1,
ali@70	2073	g_utf8_get_char(g_utf8_next_char(aline))))
ali@56	2074	{
ali@56	2075	if (pswit[ECHO_SWITCH])
ali@70	2076	g_print("\n%s\n",aline);
ali@56	2077	if (!pswit[OVERVIEW_SWITCH])
ali@70	2078	g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
ali@56	2079	linecnt);
ali@56	2080	else
ali@56	2081	cnt_punct++;
ali@56	2082	}
ali@56	2083	}
ali@56	2084	if (pswit[SQUOTE_SWITCH])
ali@56	2085	{
ali@70	2086	nc=g_utf8_get_char(aline);
ali@70	2087	for (s=aline;*s;s=g_utf8_next_char(s))
ali@56	2088	{
ali@70	2089	c=nc;
ali@70	2090	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@92	2091	if (CHAR_IS_SQUOTE(c) && (s==aline \|\| s>aline &&
ali@70	2092	!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) \|\|
ali@70	2093	!g_unichar_isalpha(nc)))
ali@56	2094	{
ali@56	2095	parities->squote=!parities->squote;
ali@56	2096	if (!parities->squote)
ali@56	2097	{
ali@56	2098	/* parity even */
ali@70	2099	if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
ali@56	2100	{
ali@56	2101	if (pswit[ECHO_SWITCH])
ali@70	2102	g_print("\n%s\n",aline);
ali@56	2103	if (!pswit[OVERVIEW_SWITCH])
ali@70	2104	g_print(" Line %ld column %ld - "
ali@56	2105	"Wrongspaced singlequotes?\n",
ali@70	2106	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2107	else
ali@56	2108	cnt_punct++;
ali@56	2109	}
ali@56	2110	}
ali@56	2111	else
ali@56	2112	{
ali@56	2113	/* parity odd */
ali@99	2114	if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
ali@70	2115	!g_utf8_strchr("_-/\".'`",-1,nc) \|\| !nc)
ali@56	2116	{
ali@56	2117	if (pswit[ECHO_SWITCH])
ali@70	2118	g_print("\n%s\n",aline);
ali@56	2119	if (!pswit[OVERVIEW_SWITCH])
ali@70	2120	g_print(" Line %ld column %ld - "
ali@56	2121	"Wrongspaced singlequotes?\n",
ali@70	2122	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56	2123	else
ali@56	2124	cnt_punct++;
ali@56	2125	}
ali@56	2126	}
ali@56	2127	}
ali@56	2128	}
ali@56	2129	}
ali@56	2130	}
ali@56	2131
ali@55	2132	/*
ali@57	2133	* check_for_double_punctuation:
ali@57	2134	*
ali@57	2135	* Look for double punctuation like ,. or ,,
ali@57	2136	* Thanks to DW for the suggestion!
ali@57	2137	* In books with references, ".," and ".;" are common
ali@57	2138	* e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57	2139	* OTOH, from my initial tests, there are also fairly
ali@57	2140	* common errors. What to do? Make these cases paranoid?
ali@57	2141	* ".," is the most common, so warnings->dotcomma is used
ali@57	2142	* to suppress detailed reporting if it occurs often.
ali@57	2143	*/
ali@57	2144	void check_for_double_punctuation(const char aline,struct warnings warnings)
ali@57	2145	{
ali@70	2146	const char *s;
ali@70	2147	gunichar c,nc;
ali@70	2148	nc=g_utf8_get_char(aline);
ali@70	2149	for (s=aline;*s;s=g_utf8_next_char(s))
ali@57	2150	{
ali@70	2151	c=nc;
ali@70	2152	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@57	2153	/* for each punctuation character in the line */
ali@70	2154	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
ali@70	2155	g_utf8_strchr(".?!,;:",-1,nc))
ali@57	2156	{
ali@57	2157	/* followed by punctuation, it's a query, unless . . . */
ali@70	2158	if (c==nc && (c=='.' \|\| c=='?' \|\| c=='!') \|\|
ali@70	2159	!warnings->dotcomma && c=='.' && nc==',' \|\|
ali@70	2160	warnings->isFrench && g_str_has_prefix(s,",...") \|\|
ali@70	2161	warnings->isFrench && g_str_has_prefix(s,"...,") \|\|
ali@70	2162	warnings->isFrench && g_str_has_prefix(s,";...") \|\|
ali@70	2163	warnings->isFrench && g_str_has_prefix(s,"...;") \|\|
ali@70	2164	warnings->isFrench && g_str_has_prefix(s,":...") \|\|
ali@70	2165	warnings->isFrench && g_str_has_prefix(s,"...:") \|\|
ali@70	2166	warnings->isFrench && g_str_has_prefix(s,"!...") \|\|
ali@70	2167	warnings->isFrench && g_str_has_prefix(s,"...!") \|\|
ali@70	2168	warnings->isFrench && g_str_has_prefix(s,"?...") \|\|
ali@70	2169	warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@57	2170	{
ali@70	2171	if (warnings->isFrench && g_str_has_prefix(s,",...") \|\|
ali@70	2172	warnings->isFrench && g_str_has_prefix(s,"...,") \|\|
ali@70	2173	warnings->isFrench && g_str_has_prefix(s,";...") \|\|
ali@70	2174	warnings->isFrench && g_str_has_prefix(s,"...;") \|\|
ali@70	2175	warnings->isFrench && g_str_has_prefix(s,":...") \|\|
ali@70	2176	warnings->isFrench && g_str_has_prefix(s,"...:") \|\|
ali@70	2177	warnings->isFrench && g_str_has_prefix(s,"!...") \|\|
ali@70	2178	warnings->isFrench && g_str_has_prefix(s,"...!") \|\|
ali@70	2179	warnings->isFrench && g_str_has_prefix(s,"?...") \|\|
ali@70	2180	warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@70	2181	{
ali@70	2182	s+=4;
ali@70	2183	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70	2184	}
ali@57	2185	; /* do nothing for .. !! and ?? which can be legit */
ali@57	2186	}
ali@57	2187	else
ali@57	2188	{
ali@57	2189	if (pswit[ECHO_SWITCH])
ali@70	2190	g_print("\n%s\n",aline);
ali@57	2191	if (!pswit[OVERVIEW_SWITCH])
ali@70	2192	g_print(" Line %ld column %ld - Double punctuation?\n",
ali@70	2193	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@57	2194	else
ali@57	2195	cnt_punct++;
ali@57	2196	}
ali@57	2197	}
ali@57	2198	}
ali@57	2199	}
ali@57	2200
ali@57	2201	/*
ali@58	2202	* check_for_spaced_quotes:
ali@58	2203	*/
ali@58	2204	void check_for_spaced_quotes(const char *aline)
ali@58	2205	{
ali@92	2206	int i;
ali@58	2207	const char s,t;
ali@92	2208	const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
ali@92	2209	CHAR_RS_QUOTE};
ali@92	2210	GString *pattern;
ali@58	2211	s=aline;
ali@58	2212	while ((t=strstr(s," \" ")))
ali@58	2213	{
ali@58	2214	if (pswit[ECHO_SWITCH])
ali@70	2215	g_print("\n%s\n",aline);
ali@58	2216	if (!pswit[OVERVIEW_SWITCH])
ali@70	2217	g_print(" Line %ld column %ld - Spaced doublequote?\n",
ali@70	2218	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58	2219	else
ali@58	2220	cnt_punct++;
ali@70	2221	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58	2222	}
ali@92	2223	pattern=g_string_new(NULL);
ali@92	2224	for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
ali@58	2225	{
ali@92	2226	g_string_assign(pattern," ");
ali@92	2227	g_string_append_unichar(pattern,single_quotes[i]);
ali@92	2228	g_string_append_c(pattern,' ');
ali@92	2229	s=aline;
ali@92	2230	while ((t=strstr(s,pattern->str)))
ali@92	2231	{
ali@92	2232	if (pswit[ECHO_SWITCH])
ali@92	2233	g_print("\n%s\n",aline);
ali@92	2234	if (!pswit[OVERVIEW_SWITCH])
ali@92	2235	g_print(" Line %ld column %ld - Spaced singlequote?\n",
ali@92	2236	linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@92	2237	else
ali@92	2238	cnt_punct++;
ali@92	2239	s=g_utf8_next_char(g_utf8_next_char(t));
ali@92	2240	}
ali@58	2241	}
ali@92	2242	g_string_free(pattern,TRUE);
ali@58	2243	}
ali@58	2244
ali@58	2245	/*
ali@59	2246	* check_for_miscased_genative:
ali@59	2247	*
ali@59	2248	* Check special case of 'S instead of 's at end of word.
ali@59	2249	*/
ali@59	2250	void check_for_miscased_genative(const char *aline)
ali@59	2251	{
ali@59	2252	const char *s;
ali@70	2253	gunichar c,nc,pc;
ali@69	2254	if (!*aline)
ali@69	2255	return;
ali@70	2256	c=g_utf8_get_char(aline);
ali@70	2257	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2258	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@59	2259	{
ali@70	2260	pc=c;
ali@70	2261	c=nc;
ali@70	2262	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@92	2263	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
ali@59	2264	{
ali@59	2265	if (pswit[ECHO_SWITCH])
ali@70	2266	g_print("\n%s\n",aline);
ali@59	2267	if (!pswit[OVERVIEW_SWITCH])
ali@70	2268	g_print(" Line %ld column %ld - Capital \"S\"?\n",
ali@70	2269	linecnt,g_utf8_pointer_to_offset(aline,s)+2);
ali@59	2270	else
ali@59	2271	cnt_punct++;
ali@59	2272	}
ali@59	2273	}
ali@59	2274	}
ali@59	2275
ali@59	2276	/*
ali@60	2277	* check_end_of_line:
ali@60	2278	*
ali@60	2279	* Now check special cases - start and end of line -
ali@60	2280	* for single and double quotes. Start is sometimes [sic]
ali@60	2281	* but better to query it anyway.
ali@60	2282	* While we're here, check for dash at end of line.
ali@60	2283	*/
ali@60	2284	void check_end_of_line(const char aline,struct warnings warnings)
ali@60	2285	{
ali@70	2286	int lbytes;
ali@70	2287	const char *s;
ali@70	2288	gunichar c1,c2;
ali@70	2289	lbytes=strlen(aline);
ali@70	2290	if (g_utf8_strlen(aline,lbytes)>1)
ali@60	2291	{
ali@70	2292	s=g_utf8_prev_char(aline+lbytes);
ali@70	2293	c1=g_utf8_get_char(s);
ali@70	2294	c2=g_utf8_get_char(g_utf8_prev_char(s));
ali@94	2295	if ((CHAR_IS_DQUOTE(c1) \|\| CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
ali@60	2296	{
ali@60	2297	if (pswit[ECHO_SWITCH])
ali@70	2298	g_print("\n%s\n",aline);
ali@60	2299	if (!pswit[OVERVIEW_SWITCH])
ali@70	2300	g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
ali@70	2301	g_utf8_strlen(aline,lbytes));
ali@70	2302	else
ali@70	2303	cnt_punct++;
ali@70	2304	}
ali@70	2305	c1=g_utf8_get_char(aline);
ali@70	2306	c2=g_utf8_get_char(g_utf8_next_char(aline));
ali@92	2307	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
ali@70	2308	{
ali@70	2309	if (pswit[ECHO_SWITCH])
ali@70	2310	g_print("\n%s\n",aline);
ali@70	2311	if (!pswit[OVERVIEW_SWITCH])
ali@70	2312	g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
ali@60	2313	else
ali@60	2314	cnt_punct++;
ali@60	2315	}
ali@60	2316	/*
ali@60	2317	* Dash at end of line may well be legit - paranoid mode only
ali@60	2318	* and don't report em-dash at line-end.
ali@60	2319	*/
ali@60	2320	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@60	2321	{
ali@70	2322	for (s=g_utf8_prev_char(aline+lbytes);
ali@70	2323	s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
ali@60	2324	;
ali@70	2325	if (g_utf8_get_char(s)=='-' &&
ali@70	2326	g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@60	2327	{
ali@60	2328	if (pswit[ECHO_SWITCH])
ali@70	2329	g_print("\n%s\n",aline);
ali@60	2330	if (!pswit[OVERVIEW_SWITCH])
ali@70	2331	g_print(" Line %ld column %ld - "
ali@70	2332	"Hyphen at end of line?\n",
ali@70	2333	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@60	2334	}
ali@60	2335	}
ali@60	2336	}
ali@60	2337	}
ali@60	2338
ali@60	2339	/*
ali@61	2340	* check_for_unspaced_bracket:
ali@61	2341	*
ali@61	2342	* Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@61	2343	* If so, suspect a scanno like "a]most".
ali@61	2344	*/
ali@61	2345	void check_for_unspaced_bracket(const char *aline)
ali@61	2346	{
ali@70	2347	const char *s;
ali@70	2348	gunichar c,nc,pc;
ali@70	2349	c=g_utf8_get_char(aline);
ali@70	2350	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2351	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@61	2352	{
ali@70	2353	pc=c;
ali@70	2354	c=nc;
ali@70	2355	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70	2356	if (!nc)
ali@70	2357	break;
ali@61	2358	/* for each bracket character in the line except 1st & last */
ali@70	2359	if (g_utf8_strchr("{[()]}",-1,c) &&
ali@70	2360	g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
ali@61	2361	{
ali@61	2362	if (pswit[ECHO_SWITCH])
ali@70	2363	g_print("\n%s\n",aline);
ali@61	2364	if (!pswit[OVERVIEW_SWITCH])
ali@70	2365	g_print(" Line %ld column %ld - Unspaced bracket?\n",
ali@70	2366	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@61	2367	else
ali@61	2368	cnt_punct++;
ali@61	2369	}
ali@61	2370	}
ali@61	2371	}
ali@61	2372
ali@61	2373	/*
ali@62	2374	* check_for_unpunctuated_endquote:
ali@62	2375	*/
ali@62	2376	void check_for_unpunctuated_endquote(const char *aline)
ali@62	2377	{
ali@70	2378	const char *s;
ali@70	2379	gunichar c,nc,pc;
ali@94	2380	QuoteClass qc;
ali@70	2381	c=g_utf8_get_char(aline);
ali@70	2382	nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70	2383	for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@62	2384	{
ali@70	2385	pc=c;
ali@70	2386	c=nc;
ali@94	2387	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
ali@70	2388	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@62	2389	/* for each character in the line except 1st */
ali@97	2390	if ((qc==CLOSING_QUOTE \|\| qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
ali@62	2391	{
ali@62	2392	if (pswit[ECHO_SWITCH])
ali@70	2393	g_print("\n%s\n",aline);
ali@62	2394	if (!pswit[OVERVIEW_SWITCH])
ali@70	2395	g_print(" Line %ld column %ld - "
ali@70	2396	"endquote missing punctuation?\n",
ali@70	2397	linecnt,g_utf8_pointer_to_offset(aline,s));
ali@62	2398	else
ali@62	2399	cnt_punct++;
ali@62	2400	}
ali@62	2401	}
ali@62	2402	}
ali@62	2403
ali@62	2404	/*
ali@63	2405	* check_for_html_tag:
ali@63	2406	*
ali@63	2407	* Check for <HTML TAG>.
ali@63	2408	*
ali@63	2409	* If there is a < in the line, followed at some point
ali@63	2410	* by a > then we suspect HTML.
ali@63	2411	*/
ali@63	2412	void check_for_html_tag(const char *aline)
ali@63	2413	{
ali@63	2414	const char open,close;
ali@70	2415	gchar *tag;
ali@70	2416	open=strchr(aline,'<');
ali@63	2417	if (open)
ali@63	2418	{
ali@70	2419	close=strchr(g_utf8_next_char(open),'>');
ali@63	2420	if (close)
ali@63	2421	{
ali@70	2422	if (pswit[ECHO_SWITCH])
ali@70	2423	g_print("\n%s\n",aline);
ali@70	2424	if (!pswit[OVERVIEW_SWITCH])
ali@63	2425	{
ali@70	2426	tag=g_strndup(open,close-open+1);
ali@70	2427	g_print(" Line %ld column %ld - HTML Tag? %s \n",
ali@70	2428	linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
ali@70	2429	g_free(tag);
ali@63	2430	}
ali@70	2431	else
ali@70	2432	cnt_html++;
ali@63	2433	}
ali@63	2434	}
ali@63	2435	}
ali@63	2436
ali@63	2437	/*
ali@64	2438	* check_for_html_entity:
ali@64	2439	*
ali@64	2440	* Check for &symbol; HTML.
ali@64	2441	*
ali@64	2442	* If there is a & in the line, followed at
ali@64	2443	* some point by a ; then we suspect HTML.
ali@64	2444	*/
ali@64	2445	void check_for_html_entity(const char *aline)
ali@64	2446	{
ali@64	2447	const char s,amp,*scolon;
ali@70	2448	gchar *entity;
ali@70	2449	amp=strchr(aline,'&');
ali@64	2450	if (amp)
ali@64	2451	{
ali@70	2452	scolon=strchr(amp,';');
ali@64	2453	if (scolon)
ali@64	2454	{
ali@70	2455	for (s=amp;s<scolon;s=g_utf8_next_char(s))
ali@70	2456	if (g_utf8_get_char(s)==CHAR_SPACE)
ali@70	2457	break; /* Don't report "Jones & Son;" */
ali@70	2458	if (s>=scolon)
ali@64	2459	{
ali@64	2460	if (pswit[ECHO_SWITCH])
ali@70	2461	g_print("\n%s\n",aline);
ali@64	2462	if (!pswit[OVERVIEW_SWITCH])
ali@70	2463	{
ali@70	2464	entity=g_strndup(amp,scolon-amp+1);
ali@70	2465	g_print(" Line %ld column %d - HTML symbol? %s \n",
ali@70	2466	linecnt,(int)(amp-aline)+1,entity);
ali@70	2467	g_free(entity);
ali@70	2468	}
ali@64	2469	else
ali@64	2470	cnt_html++;
ali@64	2471	}
ali@64	2472	}
ali@64	2473	}
ali@64	2474	}
ali@64	2475
ali@65	2476	/*
ali@66	2477	* check_for_omitted_punctuation:
ali@66	2478	*
ali@66	2479	* Check for omitted punctuation at end of paragraph by working back
ali@66	2480	* through prevline. DW.
ali@66	2481	* Need to check this only for "normal" paras.
ali@66	2482	* So what is a "normal" para?
ali@66	2483	* Not normal if one-liner (chapter headings, etc.)
ali@66	2484	* Not normal if doesn't contain at least one locase letter
ali@66	2485	* Not normal if starts with space
ali@66	2486	*/
ali@66	2487	void check_for_omitted_punctuation(const char *prevline,
ali@66	2488	struct line_properties *last,int start_para_line)
ali@66	2489	{
ali@70	2490	gboolean letter_on_line=FALSE;
ali@66	2491	const char *s;
ali@92	2492	gunichar c;
ali@94	2493	gboolean closing_quote;
ali@70	2494	for (s=prevline;*s;s=g_utf8_next_char(s))
ali@70	2495	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70	2496	{
ali@70	2497	letter_on_line=TRUE;
ali@70	2498	break;
ali@70	2499	}
ali@66	2500	/*
ali@66	2501	* This next "if" is a problem.
ali@66	2502	* If we say "start_para_line <= linecnt - 1", that includes
ali@66	2503	* one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@66	2504	* If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@66	2505	* misses genuine one-line paragraphs.
ali@66	2506	*/
ali@70	2507	if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
ali@70	2508	g_utf8_get_char(prevline)>CHAR_SPACE)
ali@66	2509	{
ali@92	2510	s=prevline+strlen(prevline);
ali@92	2511	do
ali@92	2512	{
ali@92	2513	s=g_utf8_prev_char(s);
ali@92	2514	c=g_utf8_get_char(s);
ali@94	2515	if (QUOTE_CLASS(c)==CLOSING_QUOTE \|\| QUOTE_CLASS(c)==NEUTRAL_QUOTE)
ali@94	2516	closing_quote=TRUE;
ali@94	2517	else
ali@94	2518	closing_quote=FALSE;
ali@94	2519	} while (closing_quote && s>prevline);
ali@70	2520	for (;s>prevline;s=g_utf8_prev_char(s))
ali@66	2521	{
ali@70	2522	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@66	2523	{
ali@66	2524	if (pswit[ECHO_SWITCH])
ali@70	2525	g_print("\n%s\n",prevline);
ali@66	2526	if (!pswit[OVERVIEW_SWITCH])
ali@70	2527	g_print(" Line %ld column %ld - "
ali@66	2528	"No punctuation at para end?\n",
ali@70	2529	linecnt-1,g_utf8_strlen(prevline,-1));
ali@66	2530	else
ali@66	2531	cnt_punct++;
ali@66	2532	break;
ali@66	2533	}
ali@97	2534	if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
ali@66	2535	break;
ali@66	2536	}
ali@66	2537	}
ali@66	2538	}
ali@66	2539
ali@69	2540	gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
ali@69	2541	{
ali@69	2542	const char *word=key;
ali@69	2543	int *dupcnt=value;
ali@69	2544	if (*dupcnt)
ali@70	2545	g_print("\nNote: Queried word %s was duplicated %d times\n",
ali@69	2546	word,*dupcnt);
ali@69	2547	return FALSE;
ali@69	2548	}
ali@69	2549
ali@70	2550	void print_as_windows_1252(const char *string)
ali@70	2551	{
ali@70	2552	gsize inbytes,outbytes;
ali@70	2553	gchar buf,bp;
ali@86	2554	static GIConv converter=(GIConv)-1;
ali@70	2555	if (!string)
ali@70	2556	{
ali@70	2557	if (converter!=(GIConv)-1)
ali@70	2558	g_iconv_close(converter);
ali@70	2559	converter=(GIConv)-1;
ali@70	2560	return;
ali@70	2561	}
ali@86	2562	if (converter==(GIConv)-1)
ali@70	2563	converter=g_iconv_open("WINDOWS-1252","UTF-8");
ali@70	2564	if (converter!=(GIConv)-1)
ali@70	2565	{
ali@70	2566	inbytes=outbytes=strlen(string);
ali@70	2567	bp=buf=g_malloc(outbytes+1);
ali@70	2568	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
ali@70	2569	*bp='\0';
ali@70	2570	fputs(buf,stdout);
ali@70	2571	g_free(buf);
ali@70	2572	}
ali@70	2573	else
ali@70	2574	fputs(string,stdout);
ali@70	2575	}
ali@70	2576
/*
 * print_as_utf_8:
 *
 * Write string to stdout exactly as given, with no conversion.
 */
void print_as_utf_8(const char *string)
{
    FILE *out=stdout;
    fputs(string,out);
}
ali@72	2581
ali@66	2582	/*
ali@41	2583	* procfile:
ali@41	2584	*
ali@41	2585	* Process one file.
ali@41	2586	*/
ali@69	2587	void procfile(const char *filename)
ali@41	2588	{
ali@65	2589	const char *s;
ali@69	2590	gchar parastart=NULL; / first line of current para */
ali@69	2591	gchar etext,aline;
ali@69	2592	gchar *etext_ptr;
ali@69	2593	GError *err=NULL;
ali@41	2594	struct first_pass_results *first_pass_results;
ali@42	2595	struct warnings *warnings;
ali@43	2596	struct counters counters={0};
ali@45	2597	struct line_properties last={0};
ali@56	2598	struct parities parities={0};
ali@69	2599	struct pending pending={0};
ali@69	2600	gboolean isemptyline;
ali@68	2601	long start_para_line=0;
ali@69	2602	gboolean isnewpara=FALSE,enddash=FALSE;
ali@45	2603	last.start=CHAR_SPACE;
ali@68	2604	linecnt=checked_linecnt=0;
ali@69	2605	etext=read_etext(filename,&err);
ali@69	2606	if (!etext)
ali@41	2607	{
ali@68	2608	if (pswit[STDOUT_SWITCH])
ali@69	2609	fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
ali@68	2610	else
ali@69	2611	fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
ali@41	2612	exit(1);
ali@41	2613	}
ali@70	2614	g_print("\n\nFile: %s\n\n",filename);
ali@69	2615	first_pass_results=first_pass(etext);
ali@42	2616	warnings=report_first_pass(first_pass_results);
ali@69	2617	qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
ali@69	2618	qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@40	2619	/*
ali@40	2620	* Here we go with the main pass. Hold onto yer hat!
ali@40	2621	*/
ali@65	2622	linecnt=0;
ali@69	2623	etext_ptr=etext;
ali@69	2624	while ((aline=flgets(&etext_ptr,linecnt+1)))
ali@40	2625	{
ali@68	2626	linecnt++;
ali@68	2627	if (linecnt==1)
ali@69	2628	isnewpara=TRUE;
ali@70	2629	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
ali@40	2630	continue; // skip DP page separators completely
ali@68	2631	if (linecnt<first_pass_results->firstline \|\|
ali@41	2632	(first_pass_results->footerline>0 &&
ali@41	2633	linecnt>first_pass_results->footerline))
ali@40	2634	{
ali@68	2635	if (pswit[HEADER_SWITCH])
ali@40	2636	{
ali@70	2637	if (g_str_has_prefix(aline,"Title:"))
ali@70	2638	g_print(" %s\n",aline);
ali@70	2639	if (g_str_has_prefix(aline,"Author:"))
ali@70	2640	g_print(" %s\n",aline);
ali@70	2641	if (g_str_has_prefix(aline,"Release Date:"))
ali@70	2642	g_print(" %s\n",aline);
ali@70	2643	if (g_str_has_prefix(aline,"Edition:"))
ali@70	2644	g_print(" %s\n\n",aline);
ali@40	2645	}
ali@68	2646	continue; /* skip through the header */
ali@40	2647	}
ali@68	2648	checked_linecnt++;
ali@65	2649	print_pending(aline,parastart,&pending);
ali@98	2650	isemptyline=analyse_quotes(aline,&counters);
ali@68	2651	if (isnewpara && !isemptyline)
ali@40	2652	{
ali@40	2653	/* This line is the start of a new paragraph. */
ali@68	2654	start_para_line=linecnt;
ali@40	2655	/* Capture its first line in case we want to report it later. */
ali@69	2656	g_free(parastart);
ali@69	2657	parastart=g_strdup(aline);
ali@56	2658	memset(&parities,0,sizeof(parities)); /* restart the quote count */
ali@68	2659	s=aline;
ali@70	2660	while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
ali@70	2661	!g_unichar_isdigit(g_utf8_get_char(s)))
ali@70	2662	s=g_utf8_next_char(s);
ali@70	2663	if (g_unichar_islower(g_utf8_get_char(s)))
ali@40	2664	{
ali@40	2665	/* and its first letter is lowercase */
ali@68	2666	if (pswit[ECHO_SWITCH])
ali@70	2667	g_print("\n%s\n",aline);
ali@68	2668	if (!pswit[OVERVIEW_SWITCH])
ali@70	2669	g_print(" Line %ld column %ld - "
ali@40	2670	"Paragraph starts with lower-case\n",
ali@70	2671	linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@68	2672	else
ali@68	2673	cnt_punct++;
ali@40	2674	}
ali@69	2675	isnewpara=FALSE; /* Signal the end of new para processing. */
ali@40	2676	}
ali@68	2677	/* Check for an em-dash broken at line end. */
ali@70	2678	if (enddash && g_utf8_get_char(aline)=='-')
ali@40	2679	{
ali@68	2680	if (pswit[ECHO_SWITCH])
ali@70	2681	g_print("\n%s\n",aline);
ali@68	2682	if (!pswit[OVERVIEW_SWITCH])
ali@70	2683	g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@68	2684	else
ali@68	2685	cnt_punct++;
ali@40	2686	}
ali@69	2687	enddash=FALSE;
ali@70	2688	for (s=g_utf8_prev_char(aline+strlen(aline));
ali@70	2689	g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
ali@40	2690	;
ali@70	2691	if (s>=aline && g_utf8_get_char(s)=='-')
ali@69	2692	enddash=TRUE;
ali@67	2693	check_for_control_characters(aline);
ali@68	2694	if (warnings->bin)
ali@44	2695	check_for_odd_characters(aline,warnings,isemptyline);
ali@68	2696	if (warnings->longline)
ali@45	2697	check_for_long_line(aline);
ali@68	2698	if (warnings->shortline)
ali@45	2699	check_for_short_line(aline,&last);
ali@68	2700	last.blen=last.len;
ali@70	2701	last.len=g_utf8_strlen(aline,-1);
ali@70	2702	last.start=g_utf8_get_char(aline);
ali@46	2703	check_for_starting_punctuation(aline);
ali@68	2704	if (warnings->dash)
ali@40	2705	{
ali@47	2706	check_for_spaced_emdash(aline);
ali@47	2707	check_for_spaced_dash(aline);
ali@40	2708	}
ali@48	2709	check_for_unmarked_paragraphs(aline);
ali@49	2710	check_for_jeebies(aline);
ali@50	2711	check_for_mta_from(aline);
ali@51	2712	check_for_orphan_character(aline);
ali@52	2713	check_for_pling_scanno(aline);
ali@53	2714	check_for_extra_period(aline,warnings);
ali@54	2715	check_for_following_punctuation(aline);
ali@55	2716	check_for_typos(aline,warnings);
ali@56	2717	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57	2718	check_for_double_punctuation(aline,warnings);
ali@58	2719	check_for_spaced_quotes(aline);
ali@59	2720	check_for_miscased_genative(aline);
ali@60	2721	check_end_of_line(aline,warnings);
ali@61	2722	check_for_unspaced_bracket(aline);
ali@68	2723	if (warnings->endquote)
ali@62	2724	check_for_unpunctuated_endquote(aline);
ali@63	2725	check_for_html_tag(aline);
ali@64	2726	check_for_html_entity(aline);
ali@68	2727	if (isemptyline)
ali@40	2728	{
ali@65	2729	check_for_mismatched_quotes(&counters,&pending);
ali@93	2730	counters_reset(&counters);
ali@40	2731	/* let the next iteration know that it's starting a new para */
ali@69	2732	isnewpara=TRUE;
ali@69	2733	if (prevline)
ali@69	2734	check_for_omitted_punctuation(prevline,&last,start_para_line);
ali@40	2735	}
ali@69	2736	g_free(prevline);
ali@69	2737	prevline=g_strdup(aline);
ali@0	2738	}
ali@93	2739	linecnt++;
ali@93	2740	check_for_mismatched_quotes(&counters,&pending);
ali@93	2741	print_pending(NULL,parastart,&pending);
ali@93	2742	reset_pending(&pending);
ali@69	2743	if (prevline)
ali@69	2744	{
ali@69	2745	g_free(prevline);
ali@69	2746	prevline=NULL;
ali@69	2747	}
ali@69	2748	g_free(parastart);
ali@69	2749	g_free(prevline);
ali@69	2750	g_free(etext);
ali@79	2751	if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
ali@69	2752	g_tree_foreach(qword,report_duplicate_queries,NULL);
ali@69	2753	g_tree_unref(qword);
ali@69	2754	g_tree_unref(qperiod);
ali@92	2755	counters_destroy(&counters);
ali@70	2756	g_set_print_handler(NULL);
ali@70	2757	print_as_windows_1252(NULL);
ali@71	2758	if (pswit[MARKUP_SWITCH])
ali@71	2759	loseentities(NULL);
ali@0	2760	}
ali@0	2761
ali@40	2762	/*
ali@40	2763	* flgets:
ali@40	2764	*
ali@69	2765	* Get one line from the input text, checking for
ali@40	2766	* the existence of exactly one CR/LF line-end per line.
ali@40	2767	*
ali@40	2768	* Returns: a pointer to the line.
ali@40	2769	*/
ali@69	2770	char flgets(char *etext,long lcnt)
ali@0	2771	{
ali@70	2772	gunichar c;
ali@69	2773	gboolean isCR=FALSE;
ali@69	2774	char theline=etext;
ali@70	2775	char *eos=theline;
ali@70	2776	gchar *s;
ali@70	2777	for (;;)
ali@40	2778	{
ali@70	2779	c=g_utf8_get_char(*etext);
ali@99	2780	if (!c)
ali@99	2781	{
ali@99	2782	if (*etext==theline)
ali@99	2783	return NULL;
ali@99	2784	else if (pswit[LINE_END_SWITCH])
ali@99	2785	{
ali@99	2786	if (pswit[ECHO_SWITCH])
ali@99	2787	{
ali@99	2788	s=g_strndup(theline,eos-theline);
ali@99	2789	g_print("\n%s\n",s);
ali@99	2790	g_free(s);
ali@99	2791	}
ali@99	2792	if (!pswit[OVERVIEW_SWITCH])
ali@99	2793	/* There may, or may not, have been a CR */
ali@99	2794	g_print(" Line %ld - No LF?\n",lcnt);
ali@99	2795	else
ali@99	2796	cnt_lineend++;
ali@99	2797	}
ali@99	2798	break;
ali@99	2799	}
ali@70	2800	etext=g_utf8_next_char(etext);
ali@40	2801	/* either way, it's end of line */
ali@69	2802	if (c=='\n')
ali@40	2803	{
ali@68	2804	if (isCR)
ali@68	2805	break;
ali@68	2806	else
ali@40	2807	{
ali@40	2808	/* Error - a LF without a preceding CR */
ali@68	2809	if (pswit[LINE_END_SWITCH])
ali@40	2810	{
ali@68	2811	if (pswit[ECHO_SWITCH])
ali@70	2812	{
ali@70	2813	s=g_strndup(theline,eos-theline);
ali@70	2814	g_print("\n%s\n",s);
ali@70	2815	g_free(s);
ali@70	2816	}
ali@68	2817	if (!pswit[OVERVIEW_SWITCH])
ali@70	2818	g_print(" Line %ld - No CR?\n",lcnt);
ali@68	2819	else
ali@68	2820	cnt_lineend++;
ali@40	2821	}
ali@68	2822	break;
ali@40	2823	}
ali@40	2824	}
ali@69	2825	if (c=='\r')
ali@40	2826	{
ali@68	2827	if (isCR)
ali@40	2828	{
ali@40	2829	/* Error - two successive CRs */
ali@68	2830	if (pswit[LINE_END_SWITCH])
ali@40	2831	{
ali@68	2832	if (pswit[ECHO_SWITCH])
ali@70	2833	{
ali@70	2834	s=g_strndup(theline,eos-theline);
ali@70	2835	g_print("\n%s\n",s);
ali@70	2836	g_free(s);
ali@70	2837	}
ali@68	2838	if (!pswit[OVERVIEW_SWITCH])
ali@70	2839	g_print(" Line %ld - Two successive CRs?\n",lcnt);
ali@68	2840	else
ali@68	2841	cnt_lineend++;
ali@40	2842	}
ali@40	2843	}
ali@69	2844	isCR=TRUE;
ali@40	2845	}
ali@68	2846	else
ali@40	2847	{
ali@68	2848	if (pswit[LINE_END_SWITCH] && isCR)
ali@40	2849	{
ali@68	2850	if (pswit[ECHO_SWITCH])
ali@70	2851	{
ali@70	2852	s=g_strndup(theline,eos-theline);
ali@70	2853	g_print("\n%s\n",s);
ali@70	2854	g_free(s);
ali@70	2855	}
ali@68	2856	if (!pswit[OVERVIEW_SWITCH])
ali@70	2857	g_print(" Line %ld column %ld - CR without LF?\n",
ali@70	2858	lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
ali@68	2859	else
ali@68	2860	cnt_lineend++;
ali@70	2861	*eos=' ';
ali@40	2862	}
ali@69	2863	isCR=FALSE;
ali@70	2864	eos=g_utf8_next_char(eos);
ali@40	2865	}
ali@69	2866	}
ali@70	2867	*eos='\0';
ali@0	2868	if (pswit[MARKUP_SWITCH])
ali@68	2869	postprocess_for_HTML(theline);
ali@0	2870	if (pswit[DP_SWITCH])
ali@68	2871	postprocess_for_DP(theline);
ali@40	2872	return theline;
ali@0	2873	}
ali@0	2874
ali@40	2875	/*
ali@40	2876	* mixdigit:
ali@40	2877	*
ali@40	2878	* Takes a "word" as a parameter, and checks whether it
ali@40	2879	* contains a mixture of alpha and digits. Generally, this is an
ali@40	2880	* error, but may not be for cases like 4th or L5 12s. 3d.
ali@40	2881	*
ali@70	2882	* Returns: TRUE iff an is error found.
ali@40	2883	*/
ali@70	2884	gboolean mixdigit(const char *checkword)
ali@0	2885	{
ali@70	2886	gboolean wehaveadigit,wehavealetter,query;
ali@70	2887	const char s,nondigit;
ali@70	2888	wehaveadigit=wehavealetter=query=FALSE;
ali@70	2889	for (s=checkword;*s;s=g_utf8_next_char(s))
ali@70	2890	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70	2891	wehavealetter=TRUE;
ali@70	2892	else if (g_unichar_isdigit(g_utf8_get_char(s)))
ali@70	2893	wehaveadigit=TRUE;
ali@40	2894	if (wehaveadigit && wehavealetter)
ali@40	2895	{
ali@40	2896	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
ali@70	2897	query=TRUE;
ali@70	2898	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
ali@70	2899	nondigit=g_utf8_next_char(nondigit))
ali@68	2900	;
ali@68	2901	/* digits, ending in st, rd, nd, th of either case */
ali@70	2902	if (!g_ascii_strcasecmp(nondigit,"st") \|\|
ali@70	2903	!g_ascii_strcasecmp(nondigit,"rd") \|\|
ali@70	2904	!g_ascii_strcasecmp(nondigit,"nd") \|\|
ali@70	2905	!g_ascii_strcasecmp(nondigit,"th"))
ali@70	2906	query=FALSE;
ali@70	2907	if (!g_ascii_strcasecmp(nondigit,"sts") \|\|
ali@70	2908	!g_ascii_strcasecmp(nondigit,"rds") \|\|
ali@70	2909	!g_ascii_strcasecmp(nondigit,"nds") \|\|
ali@70	2910	!g_ascii_strcasecmp(nondigit,"ths"))
ali@70	2911	query=FALSE;
ali@70	2912	if (!g_ascii_strcasecmp(nondigit,"stly") \|\|
ali@70	2913	!g_ascii_strcasecmp(nondigit,"rdly") \|\|
ali@70	2914	!g_ascii_strcasecmp(nondigit,"ndly") \|\|
ali@70	2915	!g_ascii_strcasecmp(nondigit,"thly"))
ali@70	2916	query=FALSE;
ali@68	2917	/* digits, ending in l, L, s or d */
ali@70	2918	if (!g_ascii_strcasecmp(nondigit,"l") \|\| !strcmp(nondigit,"s") \|\|
ali@70	2919	!strcmp(nondigit,"d"))
ali@70	2920	query=FALSE;
ali@68	2921	/*
ali@40	2922	* L at the start of a number, representing Britsh pounds, like L500.
ali@70	2923	* This is cute. We know the current word is mixed digit. If the first
ali@68	2924	* letter is L, there must be at least one digit following. If both
ali@68	2925	* digits and letters follow, we have a genuine error, else we have a
ali@68	2926	* capital L followed by digits, and we accept that as a non-error.
ali@40	2927	*/
ali@70	2928	if (g_utf8_get_char(checkword)=='L' &&
ali@70	2929	!mixdigit(g_utf8_next_char(checkword)))
ali@70	2930	query=FALSE;
ali@40	2931	}
ali@40	2932	return query;
ali@0	2933	}
ali@0	2934
ali@40	2935	/*
ali@40	2936	* getaword:
ali@40	2937	*
ali@69	2938	* Extracts the first/next "word" from the line, and returns it.
ali@69	2939	* A word is defined as one English word unit--or at least that's the aim.
ali@69	2940	* "ptr" is advanced to the position in the line where we will start
ali@69	2941	* looking for the next word.
ali@40	2942	*
ali@69	2943	* Returns: A newly-allocated string.
ali@40	2944	*/
ali@69	2945	gchar getaword(const char *ptr)
ali@0	2946	{
ali@70	2947	const char s,t;
ali@69	2948	GString *word;
ali@70	2949	gunichar c,pc;
ali@69	2950	word=g_string_new(NULL);
ali@70	2951	for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
ali@70	2952	!g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
ali@70	2953	*ptr;ptr=g_utf8_next_char(*ptr))
ali@100	2954	{
ali@100	2955	/* Handle exceptions for footnote markers like [1] */
ali@100	2956	if (g_utf8_get_char(*ptr)=='[')
ali@100	2957	{
ali@100	2958	g_string_append_c(word,'[');
ali@100	2959	s=g_utf8_next_char(*ptr);
ali@100	2960	for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
ali@100	2961	g_string_append_unichar(word,g_utf8_get_char(s));
ali@100	2962	if (g_utf8_get_char(s)==']')
ali@100	2963	{
ali@100	2964	g_string_append_c(word,']');
ali@100	2965	*ptr=g_utf8_next_char(s);
ali@100	2966	return g_string_free(word,FALSE);
ali@100	2967	}
ali@100	2968	else
ali@100	2969	g_string_truncate(word,0);
ali@100	2970	}
ali@100	2971	}
ali@40	2972	/*
ali@40	2973	* Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40	2974	* Especially yucky is the case of L1,000
ali@40	2975	* This section looks for a pattern of characters including a digit
ali@40	2976	* followed by a comma or period followed by one or more digits.
ali@40	2977	* If found, it returns this whole pattern as a word; otherwise we discard
ali@40	2978	* the results and resume our normal programming.
ali@40	2979	*/
ali@69	2980	s=*ptr;
ali@70	2981	for (;g_unichar_isdigit(g_utf8_get_char(s)) \|\|
ali@70	2982	g_unichar_isalpha(g_utf8_get_char(s)) \|\|
ali@70	2983	g_utf8_get_char(s)==',' \|\| g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
ali@70	2984	g_string_append_unichar(word,g_utf8_get_char(s));
ali@82	2985	if (word->len)
ali@40	2986	{
ali@82	2987	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
ali@40	2988	{
ali@82	2989	c=g_utf8_get_char(t);
ali@82	2990	pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@82	2991	if ((c=='.' \|\| c==',') && g_unichar_isdigit(pc))
ali@82	2992	{
ali@82	2993	*ptr=s;
ali@82	2994	return g_string_free(word,FALSE);
ali@82	2995	}
ali@40	2996	}
ali@40	2997	}
ali@0	2998	/* we didn't find a punctuated number - do the regular getword thing */
ali@69	2999	g_string_truncate(word,0);
ali@92	3000	c=g_utf8_get_char(*ptr);
ali@92	3001	for (;g_unichar_isdigit(c) \|\| g_unichar_isalpha(c) \|\| CHAR_IS_APOSTROPHE(c);
ali@92	3002	ptr=g_utf8_next_char(ptr),c=g_utf8_get_char(*ptr))
ali@92	3003	g_string_append_unichar(word,c);
ali@69	3004	return g_string_free(word,FALSE);
ali@0	3005	}
ali@0	3006
ali@40	3007	/*
ali@40	3008	* isroman:
ali@40	3009	*
ali@40	3010	* Is this word a Roman Numeral?
ali@40	3011	*
ali@40	3012	* It doesn't actually validate that the number is a valid Roman Numeral--for
ali@40	3013	* example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
ali@40	3014	* what we're here to do. If it passes this, it LOOKS like a Roman numeral.
ali@40	3015	* Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
ali@40	3016	* expressions thereof, except when it came to taxes. Allow any number of M,
ali@40	3017	* an optional D, an optional CM or CD, any number of optional Cs, an optional
ali@40	3018	* XL or an optional XC, an optional IX or IV, an optional V and any number
ali@40	3019	* of optional Is.
ali@40	3020	*/
ali@69	3021	gboolean isroman(const char *t)
ali@0	3022	{
ali@69	3023	const char *s;
ali@40	3024	if (!t \|\| !*t)
ali@69	3025	return FALSE;
ali@40	3026	s=t;
ali@70	3027	while (g_utf8_get_char(t)=='m' && *t)
ali@40	3028	t++;
ali@70	3029	if (g_utf8_get_char(t)=='d')
ali@40	3030	t++;
ali@70	3031	if (g_str_has_prefix(t,"cm"))
ali@40	3032	t+=2;
ali@70	3033	if (g_str_has_prefix(t,"cd"))
ali@40	3034	t+=2;
ali@70	3035	while (g_utf8_get_char(t)=='c' && *t)
ali@40	3036	t++;
ali@70	3037	if (g_str_has_prefix(t,"xl"))
ali@40	3038	t+=2;
ali@70	3039	if (g_str_has_prefix(t,"xc"))
ali@40	3040	t+=2;
ali@70	3041	if (g_utf8_get_char(t)=='l')
ali@40	3042	t++;
ali@70	3043	while (g_utf8_get_char(t)=='x' && *t)
ali@40	3044	t++;
ali@70	3045	if (g_str_has_prefix(t,"ix"))
ali@40	3046	t+=2;
ali@70	3047	if (g_str_has_prefix(t,"iv"))
ali@40	3048	t+=2;
ali@70	3049	if (g_utf8_get_char(t)=='v')
ali@40	3050	t++;
ali@70	3051	while (g_utf8_get_char(t)=='i' && *t)
ali@40	3052	t++;
ali@40	3053	return !*t;
ali@0	3054	}
ali@0	3055
ali@40	3056	/*
ali@40	3057	* postprocess_for_DP:
ali@40	3058	*
ali@40	3059	* Invoked with the -d switch from flgets().
ali@40	3060	* It simply "removes" from the line a hard-coded set of common
ali@40	3061	* DP-specific tags, so that the line passed to the main routine has
ali@40	3062	* been pre-cleaned of DP markup.
ali@40	3063	*/
ali@0	3064	void postprocess_for_DP(char *theline)
ali@0	3065	{
ali@40	3066	char s,t;
ali@0	3067	int i;
ali@0	3068	if (!*theline)
ali@68	3069	return;
ali@40	3070	for (i=0;*DPmarkup[i];i++)
ali@70	3071	while ((s=strstr(theline,DPmarkup[i])))
ali@40	3072	{
ali@68	3073	t=s+strlen(DPmarkup[i]);
ali@70	3074	memmove(s,t,strlen(t)+1);
ali@40	3075	}
ali@0	3076	}
ali@0	3077
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * It simply "removes" from the line a hard-coded set of common
 * HTML tags and "replaces" a hard-coded set of common HTML
 * entities, so that the line passed to the main routine has
 * been pre-cleaned of HTML.
 */
void postprocess_for_HTML(char *theline)
{
    /* Each call strips at most one tag; repeat until none is removed. */
    while (losemarkup(theline))
        ;
    loseentities(theline);
}
ali@0	3093
ali@0	3094	char losemarkup(char theline)
ali@0	3095	{
ali@40	3096	char s,t;
ali@0	3097	int i;
ali@70	3098	s=strchr(theline,'<');
ali@70	3099	t=s?strchr(s,'>'):NULL;
ali@40	3100	if (!s \|\| !t)
ali@40	3101	return NULL;
ali@40	3102	for (i=0;*markup[i];i++)
ali@70	3103	if (tagcomp(g_utf8_next_char(s),markup[i]))
ali@40	3104	{
ali@70	3105	t=g_utf8_next_char(t);
ali@70	3106	memmove(s,t,strlen(t)+1);
ali@70	3107	return s;
ali@68	3108	}
ali@40	3109	/* It's an unrecognized <xxx>. */
ali@40	3110	return NULL;
ali@0	3111	}
ali@0	3112
ali@71	3113	void loseentities(char *theline)
ali@0	3114	{
ali@0	3115	int i;
ali@71	3116	gsize nb;
ali@71	3117	char amp,scolon;
ali@71	3118	gchar s,t;
ali@71	3119	gunichar c;
ali@71	3120	GTree *entities=NULL;
ali@86	3121	static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
ali@71	3122	if (!theline)
ali@40	3123	{
ali@71	3124	if (entities)
ali@71	3125	g_tree_destroy(entities);
ali@71	3126	entities=NULL;
ali@86	3127	if (translit!=(GIConv)-1)
ali@71	3128	g_iconv_close(translit);
ali@71	3129	translit=(GIConv)-1;
ali@86	3130	if (to_utf8!=(GIConv)-1)
ali@71	3131	g_iconv_close(to_utf8);
ali@71	3132	to_utf8=(GIConv)-1;
ali@71	3133	return;
ali@71	3134	}
ali@71	3135	if (!*theline)
ali@71	3136	return;
ali@71	3137	if (!entities)
ali@71	3138	{
ali@71	3139	entities=g_tree_new((GCompareFunc)strcmp);
ali@71	3140	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
ali@71	3141	g_tree_insert(entities,HTMLentities[i].name,
ali@71	3142	GUINT_TO_POINTER(HTMLentities[i].c));
ali@71	3143	}
ali@71	3144	if (translit==(GIConv)-1)
ali@71	3145	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
ali@71	3146	if (to_utf8==(GIConv)-1)
ali@71	3147	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
ali@71	3148	while((amp=strchr(theline,'&')))
ali@71	3149	{
ali@71	3150	scolon=strchr(amp,';');
ali@71	3151	if (scolon)
ali@40	3152	{
ali@71	3153	if (amp[1]=='#')
ali@71	3154	{
ali@71	3155	if (amp+2+strspn(amp+2,"0123456789")==scolon)
ali@71	3156	c=strtol(amp+2,NULL,10);
ali@71	3157	else if (amp[2]=='x' &&
ali@71	3158	amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
ali@71	3159	c=strtol(amp+3,NULL,16);
ali@71	3160	}
ali@71	3161	else
ali@71	3162	{
ali@71	3163	s=g_strndup(amp+1,scolon-(amp+1));
ali@71	3164	c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
ali@71	3165	g_free(s);
ali@71	3166	}
ali@40	3167	}
ali@71	3168	else
ali@71	3169	c=0;
ali@71	3170	if (c)
ali@71	3171	{
ali@71	3172	theline=amp;
ali@71	3173	if (c<128 \|\| c>=192 && c<=255) /* An ISO-8859-1 character */
ali@71	3174	theline+=g_unichar_to_utf8(c,theline);
ali@71	3175	else
ali@71	3176	{
ali@71	3177	s=g_malloc(6);
ali@71	3178	nb=g_unichar_to_utf8(c,s);
ali@71	3179	t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
ali@71	3180	g_free(s);
ali@71	3181	s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
ali@71	3182	g_free(t);
ali@71	3183	memcpy(theline,s,nb);
ali@71	3184	g_free(s);
ali@71	3185	theline+=nb;
ali@71	3186	}
ali@71	3187	memmove(theline,g_utf8_next_char(scolon),
ali@71	3188	strlen(g_utf8_next_char(scolon))+1);
ali@71	3189	}
ali@71	3190	else
ali@71	3191	theline=g_utf8_next_char(amp);
ali@40	3192	}
ali@0	3193	}
ali@0	3194
ali@70	3195	gboolean tagcomp(const char strin,const char basetag)
ali@0	3196	{
ali@70	3197	gboolean retval;
ali@70	3198	gchar s,t;
ali@70	3199	if (g_utf8_get_char(strin)=='/')
ali@70	3200	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
ali@70	3201	else
ali@70	3202	t=g_utf8_casefold(strin,-1);
ali@70	3203	s=g_utf8_casefold(basetag,-1);
ali@70	3204	retval=g_str_has_prefix(t,s);
ali@70	3205	g_free(s);
ali@70	3206	g_free(t);
ali@70	3207	return retval;
ali@0	3208	}
ali@0	3209
ali@69	3210	void proghelp(GOptionContext *context)
ali@0	3211	{
ali@69	3212	gchar *help;
ali@40	3213	fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40	3214	fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
ali@40	3215	fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
ali@40	3216	fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40	3217	"For details, read the file COPYING.\n",stderr);
ali@40	3218	fputs("This is Free Software; "
ali@40	3219	"you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40	3220	fputs("read the file COPYING for details.\n\n",stderr);
ali@69	3221	help=g_option_context_get_help(context,TRUE,NULL);
ali@69	3222	fputs(help,stderr);
ali@69	3223	g_free(help);
ali@69	3224	fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
ali@40	3225	fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40	3226	"non-ASCII\n",stderr);
ali@40	3227	fputs("characters like accented letters, "
ali@40	3228	"lines longer than 75 or shorter than 55,\n",stderr);
ali@40	3229	fputs("unbalanced quotes or brackets, "
ali@40	3230	"a variety of badly formatted punctuation, \n",stderr);
ali@40	3231	fputs("HTML tags, some likely typos. "
ali@40	3232	"It is NOT a substitute for human judgement.\n",stderr);
ali@0	3233	fputs("\n",stderr);
ali@0	3234	}

author	ali <ali@juiblex.co.uk>
	Tue Oct 15 09:16:04 2013 +0100 (2013-10-15)
changeset 100	ad92d11d59b8
parent 99	783eff3047bc
child 101	f44c530f80da
permissions	-rw-r--r--