ali@0: /*************************************************************************/
ali@40: /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
ali@68: /*									 */
ali@68: /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
ali@68: /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
ali@68: /*									 */
ali@0: /* This program is free software; you can redistribute it and/or modify  */
ali@0: /* it under the terms of the GNU General Public License as published by  */
ali@0: /* the Free Software Foundation; either version 2 of the License, or     */
ali@68: /* (at your option) any later version.					 */
ali@68: /*									 */
ali@0: /* This program is distributed in the hope that it will be useful,       */
ali@68: /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
ali@68: /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
ali@68: /* GNU General Public License for more details.				 */
ali@68: /*									 */
ali@68: /* You should have received a copy of the GNU General Public License	 */
ali@68: /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
ali@0: /*************************************************************************/
ali@0: 
ali@0: #include <stdio.h>
ali@0: #include <stdlib.h>
ali@0: #include <string.h>
ali@0: #include <ctype.h>
ali@73: #ifdef __WIN32__
ali@73: #include <windows.h>
ali@73: #endif
ali@69: #include <glib.h>
ali@69: #include <bl/bl.h>
ali@99: #include "bookloupe.h"
ali@99: #include "counters.h"
ali@103: #include "pending.h"
ali@71: #include "HTMLentities.h"
ali@0: 
/*
 * Name of the character set selected with set_charset(), or NULL for
 * auto (ISO_8859-1/ASCII or UNICODE).
 */
gchar *charset;
/*
 * Conversion descriptor opened by set_charset() from UTF-8 to charset,
 * or (GIConv)-1 when no descriptor is open (auto and UTF-8).
 */
GIConv charset_validator=(GIConv)-1;

/* NOTE(review): not referenced in this part of the file - confirm its use. */
gchar *prevline;
ali@0: 
/*
 * Common typos and scannos.
 *
 * All entries are lower case; the list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn",
    "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn", "ahve",
    "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna",
    "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt",
    "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn",
    "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev",
    "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte",
    "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits",
    "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver",
    "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu",
    "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas",
    "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh",
    "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr",
    "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri",
    "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs",
    "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe",
    "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn",
    "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha",
    "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl",
    "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera",
    "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle",
    "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving",
    "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis",
    "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause",
    "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit",
    "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy",
    "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers",
    "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn",
    "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo",
    "heside", "chapteb", "chaptee",
    "se",
    ""		/* end of list */
};
ali@0: 
/*
 * User-defined typos, keyed by word. Built by read_user_scannos() and
 * released in main(); stays NULL when no user typo file has been read.
 */
GTree *usertypo;
ali@0: 
/*
 * Common abbreviations and other words that are acceptable and so
 * should not be queried as typos. Terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr",
    "hmm", "h'm", "hmmm", "rd", "sh", "br", "pp", "hm",
    "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii", "hawaii", "hawaiian",
    "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids",
    "frostbite", "frostbitten",
    ""		/* end of list */
};
ali@0: 
/*
 * Common abbreviations which account for a period that would otherwise
 * be unexplained. Terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid",
    "ed", "al", "etc", "op", "cit", "deg",
    "min", "chap", "oz", "mme", "mlle", "mssrs",
    ""		/* end of list */
};
ali@0: 
/*
 * Letter pairs that rarely, if ever, begin a word but which do turn up
 * as scannos or as ordinary letter combinations elsewhere in a word.
 * Terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb",
    "tl", "tn", "rn", "lt", "tj",
    ""		/* end of list */
};
ali@0: 
/*
 * Letter pairs that rarely, if ever, finish a word but which do turn up
 * as scannos or as ordinary letter combinations elsewhere in a word.
 * Terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh",
    "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy",
    ""		/* end of list */
};
ali@0: 
/*
 * Tag names, in lower case and without angle brackets. Terminated by an
 * empty string.
 * NOTE(review): presumably the HTML tags recognised by the markup
 * checks - confirm against the code that consults this table.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center",
    "col", "div", "em", "font",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "head", "hr", "html", "i", "img", "li", "meta", "ol",
    "p", "pre", "small", "span", "strong", "sub", "sup",
    "table", "td", "tfoot", "thead", "title", "tr", "tt",
    "u", "ul",
    ""		/* end of list */
};
ali@0: 
/*
 * Literal markup sequences. Terminated by an empty string.
 * NOTE(review): presumably the DP-specific markup that the --dp switch
 * ignores - confirm against the code that consults this table.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>",
    "/*", "*/",
    "/#", "#/",
    "/$", "$/",
    "<tb>",
    ""		/* end of list */
};
ali@0: 
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): presumably words that are not expected to be directly
 * followed by a comma - confirm against the code that consults this table.
 *
 * Each word is listed once only ("mrs" used to appear twice).
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
ali@0: 
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): presumably words that are not expected to be directly
 * followed by a period - confirm against the code that consults this table.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your",
    "our", "my", "or", "and", "but", "as", "if", "the",
    "its", "it's", "until", "than", "whether", "i'll",
    "whose", "who", "because", "when", "let", "till",
    "very", "an", "among", "those", "into", "whom",
    "having", "thence",
    ""		/* end of list */
};
ali@0: 
/*
 * Program switches, indexed by the *_SWITCH constants. Written by the
 * option tables below, by parse_config_file() and by parse_options().
 */
gboolean pswit[SWITNO];  /* program switches */
/*
 * Argument of --charset, or NULL if not given. Handed to set_charset()
 * and then freed by parse_options().
 */
gchar *opt_charset;

/*
 * Set by the gutcheck compatibility switches (-t and -x); parse_options()
 * turns them into toggles of the typo and paranoid switches.
 */
gboolean typo_compat,paranoid_compat;
ali@186: 
ali@69: static GOptionEntry options[]={
ali@69:     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@69:       "Ignore DP-specific markup", NULL },
ali@186:     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@186:       "Don't ignore DP-specific markup", NULL },
ali@186:     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@186:       "Echo queried line", NULL },
ali@186:     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@69:       "Don't echo queried line", NULL },
ali@69:     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@69:       "Check single quotes", NULL },
ali@186:     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@186:       "Don't check single quotes", NULL },
ali@186:     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@69:       "Check common typos", NULL },
ali@186:     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@186:       "Don't check common typos", NULL },
ali@69:     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@69:       "Require closure of quotes on every paragraph", NULL },
ali@186:     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@186:       "Don't require closure of quotes on every paragraph", NULL },
ali@186:     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
ali@186:       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@186:       "Enable paranoid querying of everything", NULL },
ali@186:     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@69:       "Disable paranoid querying of everything", NULL },
ali@186:     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
ali@186:       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@186:       "Enable line end checking", NULL },
ali@186:     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@186:       "Diable line end checking", NULL },
ali@69:     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@69:       "Overview: just show counts", NULL },
ali@186:     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@186:       "Show individual warnings", NULL },
ali@69:     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@69:       "Output errors to stdout instead of stderr", NULL },
ali@186:     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@186:       "Output errors to stderr instead of stdout", NULL },
ali@69:     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@69:       "Echo header fields", NULL },
ali@186:     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@186:       "Don't echo header fields", NULL },
ali@69:     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@69:       "Ignore markup in < >", NULL },
ali@186:     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@186:       "No special handling for markup in < >", NULL },
ali@69:     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@69:       "Use file of user-defined typos", NULL },
ali@186:     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@186:       "Ignore file of user-defined typos", NULL },
ali@186:     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@186:       "Verbose - list everything", NULL },
ali@186:     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@186:       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@186:       "Switch off verbose mode", NULL },
ali@187:     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
ali@187:       "Set of characters valid for this ebook", "NAME" },
ali@186:     { NULL }
ali@186: };
ali@186: 
/*
 * Options relating to configuration which make no sense from inside
 * a configuration file.
 *
 * They are added to the main option group by parse_options() but, being
 * in a separate table, are never seen by the config file code which
 * only walks options[].
 */

static GOptionEntry config_options[]={
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
      "Dump current config settings", NULL },
    { NULL }
};
ali@186: 
/*
 * Gutcheck compatibility switches. These toggle rather than set, so
 * they are recorded in flags of their own (typo_compat and
 * paranoid_compat) and applied to pswit[] by parse_options() once the
 * whole command line has been parsed.
 */
static GOptionEntry compatibility_options[]={
    { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
      "Toggle checking for common typos", NULL },
    { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
      "Toggle both paranoid mode and common typos", NULL },
    { NULL }
};
ali@0: 
/*
 * Query counts, by category, as reported in overview mode.
 */
long cnt_quote;		/* quote queries */
long cnt_brack;		/* brackets queries */
long cnt_bin;		/* non-ASCII queries */
long cnt_odd;		/* odd character queries */
long cnt_long;		/* long line errors */
long cnt_short;		/* short line queries */
long cnt_punct;		/* punctuation and spacing queries */
long cnt_dash;		/* dash-related queries */
long cnt_word;		/* word queries */
long cnt_html;		/* html queries */
long cnt_lineend;	/* line-end queries */

/*
 * Line counts.
 */
long cnt_spacend;	/* lines with space at end */
long linecnt;		/* total lines in the file */
long checked_linecnt;	/* lines actually checked */
ali@0: 
/* Forward declarations of functions used before they are defined. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/* Directory part of argv[0]; set in main() and searched for support files. */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
/* Installed as GLib print handlers by read_etext(). */
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);
ali@0: 
/* NOTE(review): not referenced in this part of the file - confirm their use. */
GTree *qword,*qperiod;

#ifdef __WIN32__
/* Console output code page on entry to main(); restored at exit. */
UINT saved_cp;
#endif

/* Configuration file loaded by parse_config_file(), or NULL if none found. */
GKeyFile *config;
ali@186: 
ali@186: void config_file_update(GKeyFile *kf)
ali@186: {
ali@186:     int i;
ali@186:     gboolean sw;
ali@186:     for(i=0;options[i].long_name;i++)
ali@186:     {
ali@186: 	if (g_str_has_prefix(options[i].long_name,"no-"))
ali@186: 	    continue;
ali@186: 	if (options[i].arg==G_OPTION_ARG_NONE)
ali@186: 	{
ali@186: 	    sw=*(gboolean *)options[i].arg_data;
ali@186: 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
ali@186: 		sw=!sw;
ali@186: 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
ali@186: 	}
ali@186: 	else
ali@186: 	    g_assert_not_reached();
ali@186:     }
ali@186: }
ali@186: 
ali@186: void config_file_add_comments(GKeyFile *kf)
ali@186: {
ali@186:     int i;
ali@186:     gchar *comment;
ali@186:     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
ali@186:       NULL);
ali@186:     for(i=0;options[i].long_name;i++)
ali@186:     {
ali@186: 	if (g_str_has_prefix(options[i].long_name,"no-"))
ali@186: 	    continue;
ali@186: 	comment=g_strconcat(" ",options[i].description,NULL);
ali@186: 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
ali@186: 	g_free(comment);
ali@186:     }
ali@186: }
ali@186: 
ali@186: void dump_config(void)
ali@186: {
ali@186:     gchar *s;
ali@186:     if (config)
ali@186: 	config_file_update(config);
ali@186:     else
ali@186:     {
ali@186: 	config=g_key_file_new();
ali@186: 	config_file_update(config);
ali@186: 	config_file_add_comments(config);
ali@186:     }
ali@186:     s=g_key_file_to_data(config,NULL,NULL);
ali@186:     if (s)
ali@186: 	g_print("%s",s);
ali@186:     g_free(s);
ali@186: }
ali@186: 
ali@186: GKeyFile *read_config_file(gchar **full_path)
ali@186: {
ali@186:     int i;
ali@186:     GError *err=NULL;
ali@186:     gchar **search_dirs;
ali@186:     gchar *path;
ali@186:     const char *search_path;
ali@186:     GKeyFile *kf;
ali@186:     kf=g_key_file_new();
ali@186:     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
ali@186:     if (search_path)
ali@186:     {
ali@186: #ifdef __WIN32__
ali@186: 	search_dirs=g_strsplit(search_path,";",0);
ali@186: #else
ali@186: 	search_dirs=g_strsplit(search_path,":",0);
ali@186: #endif
ali@186:     }
ali@186:     else
ali@186:     {
ali@186: 	search_dirs=g_new(gchar *,4);
ali@186: 	search_dirs[0]=g_get_current_dir();
ali@186: 	search_dirs[1]=g_strdup(running_from);
ali@186: 	search_dirs[2]=g_strdup(g_get_user_config_dir());
ali@186: 	search_dirs[3]=NULL;
ali@186:     }
ali@186:     for(i=0;search_dirs[i];i++)
ali@186:     {
ali@186: 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
ali@186: 	if (g_key_file_load_from_file(kf,path,
ali@186: 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
ali@186: 	    break;
ali@186: 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@186: 	{
ali@186: 	    g_printerr("Bookloupe: Error reading %s\n",path);
ali@186: 	    g_printerr("%s\n",err->message);
ali@186: 	    exit(1);
ali@186: 	}
ali@186: 	g_clear_error(&err);
ali@186: 	g_free(path);
ali@186: 	path=NULL;
ali@186:     }
ali@186:     if (!search_dirs[i])
ali@186:     {
ali@186: 	g_key_file_free(kf);
ali@186: 	kf=NULL;
ali@186:     }
ali@186:     g_strfreev(search_dirs);
ali@186:     if (full_path && kf)
ali@186: 	*full_path=path;
ali@186:     else
ali@186: 	g_free(path);
ali@186:     return kf;
ali@186: }
ali@186: 
ali@186: void parse_config_file(void)
ali@186: {
ali@186:     int i,j;
ali@186:     gchar *path;
ali@186:     gchar **keys;
ali@186:     gboolean sw;
ali@186:     GError *err=NULL;
ali@186:     config=read_config_file(&path);
ali@186:     if (config)
ali@186: 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
ali@186:     else
ali@186: 	keys=NULL;
ali@186:     if (keys)
ali@186:     {
ali@186: 	for(i=0;keys[i];i++)
ali@186: 	{
ali@186: 	    for(j=0;options[j].long_name;j++)
ali@186: 	    {
ali@186: 		if (g_str_has_prefix(options[j].long_name,"no-"))
ali@186: 		    continue;
ali@186: 		else if (!strcmp(keys[i],options[j].long_name))
ali@186: 		{
ali@186: 		    if (options[j].arg==G_OPTION_ARG_NONE)
ali@186: 		    {
ali@186: 			sw=g_key_file_get_boolean(config,"options",keys[i],
ali@186: 			  &err);
ali@186: 			if (err)
ali@186: 			{
ali@186: 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
ali@186: 			      path,keys[i],err->message);
ali@186: 			    g_clear_error(&err);
ali@186: 			}
ali@186: 			if (options[j].flags&G_OPTION_FLAG_REVERSE)
ali@186: 			    sw=!sw;
ali@186: 			*(gboolean *)options[j].arg_data=sw;
ali@186: 			break;
ali@186: 		    }
ali@186: 		    else
ali@186: 			g_assert_not_reached();
ali@186: 		}
ali@186: 	    }
ali@186: 	    if (!options[j].long_name)
ali@186: 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
ali@186: 		  path,keys[i]);
ali@186: 	}
ali@186: 	g_strfreev(keys);
ali@186:     }
ali@186:     if (config)
ali@186: 	g_free(path);
ali@186: }
ali@186: 
ali@185: gboolean set_charset(const char *name,GError **err)
ali@185: {
ali@185:     /* The various UNICODE encodings all share the same character set. */
ali@185:     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
ali@185:       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
ali@185:       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
ali@185:       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
ali@185:       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
ali@185:     int i;
ali@185:     if (charset)
ali@185: 	g_free(charset);
ali@185:     if (charset_validator!=(GIConv)-1)
ali@185: 	g_iconv_close(charset_validator);
ali@185:     if (!name || !g_strcasecmp(name,"auto"))
ali@185:     {
ali@185: 	charset=NULL;
ali@185: 	charset_validator=(GIConv)-1;
ali@185: 	return TRUE;
ali@185:     }
ali@185:     else
ali@185: 	charset=g_strdup(name);
ali@185:     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
ali@185: 	if (!g_strcasecmp(charset,unicode_aliases[i]))
ali@185: 	{
ali@185: 	    g_free(charset);
ali@185: 	    charset=g_strdup("UTF-8");
ali@185: 	    break;
ali@185: 	}
ali@185:     if (!strcmp(charset,"UTF-8"))
ali@185: 	charset_validator=(GIConv)-1;
ali@185:     else
ali@185:     {
ali@185: 	charset_validator=g_iconv_open(charset,"UTF-8");
ali@185: 	if (charset_validator==(GIConv)-1)
ali@185: 	{
ali@185: 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
ali@185: 	      "Unknown character set \"%s\"",charset);
ali@185: 	    return FALSE;
ali@185: 	}
ali@185:     }
ali@185:     return TRUE;
ali@185: }
ali@185: 
ali@69: void parse_options(int *argc,char ***argv)
ali@0: {
ali@69:     GError *err=NULL;
ali@69:     GOptionContext *context;
ali@186:     GOptionGroup *compatibility;
ali@69:     context=g_option_context_new(
ali@186:       "file - look for errors in Project Gutenberg(TM) etexts");
ali@69:     g_option_context_add_main_entries(context,options,NULL);
ali@186:     g_option_context_add_main_entries(context,config_options,NULL);
ali@186:     compatibility=g_option_group_new("compatibility",
ali@186:       "Options for Compatibility with Gutcheck:",
ali@186:       "Show compatibility options",NULL,NULL);
ali@186:     g_option_group_add_entries(compatibility,compatibility_options);
ali@186:     g_option_context_add_group(context,compatibility);
ali@186:     g_option_context_set_description(context,
ali@186:       "For simplicity, only the switch options which reverse the\n"
ali@186:       "default configuration are listed. In most cases, both vanilla\n"
ali@186:       "and \"no-\" prefixed versions are available for use.");
ali@69:     if (!g_option_context_parse(context,argc,argv,&err))
ali@69:     {
ali@69: 	g_printerr("Bookloupe: %s\n",err->message);
ali@69: 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
ali@69: 	exit(1);
ali@69:     }
ali@186:     if (typo_compat)
ali@69: 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@186:     if (paranoid_compat)
ali@186:     {
ali@186: 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
ali@186: 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@186:     }
ali@40:     /*
ali@40:      * Web uploads - for the moment, this is really just a placeholder
ali@40:      * until we decide what processing we really want to do on web uploads
ali@40:      */
ali@40:     if (pswit[WEB_SWITCH])
ali@40:     {
ali@40: 	/* specific override for web uploads */
ali@69: 	pswit[ECHO_SWITCH]=TRUE;
ali@69: 	pswit[SQUOTE_SWITCH]=FALSE;
ali@69: 	pswit[TYPO_SWITCH]=TRUE;
ali@69: 	pswit[QPARA_SWITCH]=FALSE;
ali@69: 	pswit[PARANOID_SWITCH]=TRUE;
ali@69: 	pswit[LINE_END_SWITCH]=FALSE;
ali@69: 	pswit[OVERVIEW_SWITCH]=FALSE;
ali@69: 	pswit[STDOUT_SWITCH]=FALSE;
ali@69: 	pswit[HEADER_SWITCH]=TRUE;
ali@69: 	pswit[VERBOSE_SWITCH]=FALSE;
ali@69: 	pswit[MARKUP_SWITCH]=FALSE;
ali@69: 	pswit[USERTYPO_SWITCH]=FALSE;
ali@69: 	pswit[DP_SWITCH]=FALSE;
ali@40:     }
ali@185:     if (opt_charset && !set_charset(opt_charset,&err))
ali@185:     {
ali@185: 	g_printerr("%s\n",err->message);
ali@185: 	exit(1);
ali@185:     }
ali@186:     if (pswit[DUMP_CONFIG_SWITCH])
ali@186:     {
ali@186: 	dump_config();
ali@186: 	exit(0);
ali@186:     }
ali@185:     g_free(opt_charset);
ali@185:     opt_charset=NULL;
ali@186:     if (pswit[OVERVIEW_SWITCH])
ali@186: 	/* just print summary; don't echo */
ali@186: 	pswit[ECHO_SWITCH]=FALSE;
ali@69:     if (*argc<2)
ali@40:     {
ali@69: 	proghelp(context);
ali@69: 	exit(1);
ali@40:     }
ali@69:     g_option_context_free(context);
ali@69: }
ali@69: 
ali@69: /*
ali@69:  * read_user_scannos:
ali@69:  *
ali@69:  * Read in the user-defined stealth scanno list.
ali@69:  */
ali@69: void read_user_scannos(void)
ali@69: {
ali@69:     GError *err=NULL;
ali@69:     gchar *usertypo_file;
ali@69:     gboolean okay;
ali@69:     int i;
ali@70:     gsize len,nb;
ali@70:     gchar *contents,*utf8,**lines;
ali@69:     usertypo_file=g_strdup("bookloupe.typ");
ali@69:     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69:     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69:     {
ali@69: 	g_clear_error(&err);
ali@69: 	g_free(usertypo_file);
ali@69: 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
ali@69: 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69:     }
ali@69:     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69:     {
ali@69: 	g_clear_error(&err);
ali@69: 	g_free(usertypo_file);
ali@69: 	usertypo_file=g_strdup("gutcheck.typ");
ali@69: 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69:     }
ali@69:     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69:     {
ali@69: 	g_clear_error(&err);
ali@69: 	g_free(usertypo_file);
ali@69: 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
ali@69: 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69:     }
ali@69:     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69:     {
ali@69: 	g_free(usertypo_file);
ali@70: 	g_print("   --> I couldn't find bookloupe.typ "
ali@69: 	  "-- proceeding without user typos.\n");
ali@69: 	return;
ali@69:     }
ali@69:     else if (!okay)
ali@69:     {
ali@69: 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
ali@69: 	g_free(usertypo_file);
ali@69: 	g_clear_error(&err);
ali@69: 	exit(1);
ali@69:     }
ali@72:     if (g_utf8_validate(contents,len,NULL))
ali@185:     {
ali@72: 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@185: 	if (!charset)
ali@185: 	    (void)set_charset("UNICODE",NULL);
ali@185:     }
ali@72:     else
ali@72: 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
ali@70:     g_free(contents);
ali@70:     lines=g_strsplit_set(utf8,"\r\n",0);
ali@70:     g_free(utf8);
ali@69:     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@69:     for (i=0;lines[i];i++)
ali@69: 	if (*(unsigned char *)lines[i]>'!')
ali@69: 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
ali@69: 	else
ali@69: 	    g_free(lines[i]);
ali@69:     g_free(lines);
ali@69: }
ali@69: 
ali@69: /*
ali@69:  * read_etext:
ali@69:  *
ali@69:  * Read an etext returning a newly allocated string containing the file
ali@69:  * contents or NULL on error.
ali@69:  */
ali@69: gchar *read_etext(const char *filename,GError **err)
ali@69: {
ali@76:     GError *tmp_err=NULL;
ali@70:     gchar *contents,*utf8;
ali@76:     gsize len,bytes_read,bytes_written;
ali@76:     int i,line,col;
ali@69:     if (!g_file_get_contents(filename,&contents,&len,err))
ali@69: 	return NULL;
ali@72:     if (g_utf8_validate(contents,len,NULL))
ali@72:     {
ali@72: 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@72: 	g_set_print_handler(print_as_utf_8);
ali@73: #ifdef __WIN32__
ali@73: 	SetConsoleOutputCP(CP_UTF8);
ali@73: #endif
ali@72:     }
ali@72:     else
ali@72:     {
ali@76: 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
ali@76: 	  &bytes_written,&tmp_err);
ali@76: 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
ali@76: 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
ali@76: 	{
ali@76: 	    line=col=1;
ali@76: 	    for(i=0;i<bytes_read;i++)
ali@76: 		if (contents[i]=='\n')
ali@76: 		{
ali@76: 		    line++;
ali@76: 		    col=1;
ali@76: 		}
ali@76: 		else if (contents[i]!='\r')
ali@76: 		    col++;
ali@76: 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
ali@76: 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
ali@76: 	      "valid Windows-1252 character",
ali@76: 	      ((unsigned char *)contents)[bytes_read],line,col);
ali@76: 	}
ali@76: 	else if (tmp_err)
ali@76: 	    g_propagate_error(err,tmp_err);
ali@72: 	g_set_print_handler(print_as_windows_1252);
ali@73: #ifdef __WIN32__
ali@73: 	SetConsoleOutputCP(1252);
ali@73: #endif
ali@72:     }
ali@70:     g_free(contents);
ali@70:     return utf8;
ali@69: }
ali@69: 
/*
 * cleanup_on_exit:
 *
 * On Windows, put the console output code page back to the value saved
 * in saved_cp (read_etext() changes it to match the etext). Registered
 * with atexit() by main(). Does nothing on other platforms.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
ali@73: 
ali@69: int main(int argc,char **argv)
ali@69: {
ali@73: #ifdef __WIN32__
ali@73:     atexit(cleanup_on_exit);
ali@73:     saved_cp=GetConsoleOutputCP();
ali@73: #endif
ali@69:     running_from=g_path_get_dirname(argv[0]);
ali@186:     /* Paranoid checking is turned OFF, not on, by its switch */
ali@186:     pswit[PARANOID_SWITCH]=TRUE;
ali@186:     /* if running in paranoid mode, typo checks default to enabled */
ali@186:     pswit[TYPO_SWITCH]=TRUE;
ali@186:     /* Line-end checking is turned OFF, not on, by its switch */
ali@186:     pswit[LINE_END_SWITCH]=TRUE;
ali@186:     /* Echoing is turned OFF, not on, by its switch */
ali@186:     pswit[ECHO_SWITCH]=TRUE;
ali@186:     parse_config_file();
ali@69:     parse_options(&argc,&argv);
ali@40:     if (pswit[USERTYPO_SWITCH])
ali@69: 	read_user_scannos();
ali@40:     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@69:     procfile(argv[1]);
ali@40:     if (pswit[OVERVIEW_SWITCH])
ali@40:     {
ali@70: 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40: 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@70: 	g_print("    --------------- Queries found --------------\n");
ali@68: 	if (cnt_long)
ali@70: 	    g_print("    Long lines:		    %14ld\n",cnt_long);
ali@68: 	if (cnt_short)
ali@70: 	    g_print("    Short lines:		   %14ld\n",cnt_short);
ali@68: 	if (cnt_lineend)
ali@70: 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
ali@68: 	if (cnt_word)
ali@70: 	    g_print("    Common typos:		  %14ld\n",cnt_word);
ali@142: 	if (cnt_quote)
ali@142: 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
ali@68: 	if (cnt_brack)
ali@70: 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
ali@68: 	if (cnt_bin)
ali@70: 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
ali@68: 	if (cnt_odd)
ali@70: 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
ali@68: 	if (cnt_punct)
ali@70: 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@68: 	if (cnt_dash)
ali@70: 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
ali@68: 	if (cnt_html)
ali@70: 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
ali@70: 	g_print("\n");
ali@70: 	g_print("    TOTAL QUERIES		  %14ld\n",
ali@142: 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
ali@142: 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40:     }
ali@69:     g_free(running_from);
ali@69:     if (usertypo)
ali@69: 	g_tree_unref(usertypo);
ali@185:     set_charset(NULL,NULL);
ali@186:     if (config)
ali@186: 	g_key_file_free(config);
ali@40:     return 0;
ali@0: }
ali@0: 
ali@147: void count_dashes(const char *line,const char *dash,
ali@147:   struct dash_results *results)
ali@147: {
ali@147:     int i;
ali@147:     gchar **tokens;
ali@147:     gunichar pc,nc;
ali@147:     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
ali@147:     if (!*line)
ali@147: 	return;
ali@147:     tokens=g_strsplit(line,dash,0);
ali@147:     if (tokens[1])
ali@147: 	results->base++;
ali@147:     for(i=1;tokens[i];i++)
ali@147:     {
ali@147: 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
ali@147: 	nc=g_utf8_get_char(tokens[i]);
ali@147: 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
ali@147: 	    spaced=TRUE;
ali@147: 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
ali@147: 	    spaced2=TRUE;
ali@147: 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
ali@147: 	    unspaced=TRUE;
ali@147:     }
ali@147:     if (spaced)
ali@147: 	results->space++;
ali@147:     if (spaced2)
ali@147: 	/* count of lines with em-dashes with spaces both sides */
ali@147: 	results->non_PG_space++;
ali@147:     if (unspaced)
ali@147: 	/* count of lines with PG-type em-dashes with no spaces */
ali@147: 	results->PG_space++;
ali@147:     g_strfreev(tokens);
ali@147: }
ali@147: 
ali@40: /*
ali@41:  * first_pass:
ali@40:  *
ali@41:  * Run a first pass - verify that it's a valid PG
ali@41:  * file, decide whether to report some things that
ali@41:  * occur many times in the text like long or short
ali@41:  * lines, non-standard dashes, etc.
ali@40:  */
ali@69: struct first_pass_results *first_pass(const char *etext)
ali@0: {
ali@70:     gunichar laststart=CHAR_SPACE;
ali@54:     const char *s;
ali@69:     gchar *lc_line;
ali@70:     int i,j,lbytes,llen;
ali@69:     gchar **lines;
ali@41:     unsigned int lastlen=0,lastblen=0;
ali@41:     long spline=0,nspline=0;
ali@41:     static struct first_pass_results results={0};
ali@147:     struct dash_results tmp_dash_results;
ali@69:     gchar *inword;
ali@142:     QuoteClass qc;
ali@69:     lines=g_strsplit(etext,"\n",0);
ali@69:     for (j=0;lines[j];j++)
ali@40:     {
ali@70: 	lbytes=strlen(lines[j]);
ali@82: 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
ali@70: 	    lines[j][--lbytes]='\0';
ali@70: 	llen=g_utf8_strlen(lines[j],lbytes);
ali@68: 	linecnt++;
ali@69: 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
ali@69: 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
ali@40: 	{
ali@68: 	    if (spline)
ali@70: 		g_print("   --> Duplicate header?\n");
ali@68: 	    spline=linecnt+1;   /* first line of non-header text, that is */
ali@40: 	}
ali@69: 	if (!strncmp(lines[j],"*** START",9) &&
ali@69: 	  strstr(lines[j],"PROJECT GUTENBERG"))
ali@40: 	{
ali@68: 	    if (nspline)
ali@70: 		g_print("   --> Duplicate header?\n");
ali@68: 	    nspline=linecnt+1;   /* first line of non-header text, that is */
ali@40: 	}
ali@68: 	if (spline || nspline)
ali@40: 	{
ali@70: 	    lc_line=g_utf8_strdown(lines[j],lbytes);
ali@69: 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
ali@40: 	    {
ali@69: 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
ali@40: 		{
ali@68: 		    if (results.footerline)
ali@40: 		    {
ali@40: 			/* it's an old-form header - we can detect duplicates */
ali@68: 			if (!nspline)
ali@70: 			    g_print("   --> Duplicate footer?\n");
ali@40: 		    }
ali@68: 		    else
ali@68: 			results.footerline=linecnt;
ali@40: 		}
ali@40: 	    }
ali@69: 	    g_free(lc_line);
ali@40: 	}
ali@68: 	if (spline)
ali@41: 	    results.firstline=spline;
ali@68: 	if (nspline)
ali@41: 	    results.firstline=nspline;  /* override with new */
ali@68: 	if (results.footerline)
ali@40: 	    continue;    /* don't count the boilerplate in the footer */
ali@68: 	results.totlen+=llen;
ali@70: 	for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@40: 	{
ali@70: 	    if (g_utf8_get_char(s)>127)
ali@41: 		results.binlen++;
ali@70: 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@41: 		results.alphalen++;
ali@142: 	    if (s>lines[j])
ali@142: 	    {
ali@142: 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
ali@142: 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
ali@142: 		else
ali@142: 		    qc=INVALID_QUOTE;
ali@142: 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
ali@147: 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
ali@142: 		    results.endquote_count++;
ali@142: 	    }
ali@40: 	}
ali@69: 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
ali@69: 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41: 	    results.shortline++;
ali@70: 	if (lbytes>0 &&
ali@70: 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
ali@40: 	    cnt_spacend++;
ali@69: 	if (strstr(lines[j],".,"))
ali@41: 	    results.dotcomma++;
ali@68: 	/* only count ast lines for ignoring purposes where there is */
ali@68: 	/* locase text on the line */
ali@69: 	if (strchr(lines[j],'*'))
ali@40: 	{
ali@70: 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@70: 		if (g_unichar_islower(g_utf8_get_char(s)))
ali@68: 		    break;
ali@70: 	    if (*s)
ali@41: 		results.astline++;
ali@40: 	}
ali@69: 	if (strchr(lines[j],'/'))
ali@68: 	    results.fslashline++;
ali@82: 	if (lbytes>0)
ali@82: 	{
ali@82: 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
ali@82: 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
ali@82: 	      s=g_utf8_prev_char(s))
ali@82: 		;
ali@82: 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
ali@82: 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@82: 		results.hyphens++;
ali@82: 	}
ali@68: 	if (llen>LONGEST_PG_LINE)
ali@41: 	    results.longline++;
ali@68: 	if (llen>WAY_TOO_LONG)
ali@41: 	    results.verylongline++;
ali@69: 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
ali@40: 	{
ali@69: 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
ali@68: 	    if (i>0)
ali@68: 		results.htmcount++;
ali@69: 	    if (strstr(lines[j],"<i>"))
ali@41: 		results.htmcount+=4; /* bonus marks! */
ali@40: 	}
ali@68: 	/* Check for spaced em-dashes */
ali@147: 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
ali@147: 	count_dashes(lines[j],"--",&tmp_dash_results);
ali@147: 	count_dashes(lines[j],"—",&tmp_dash_results);
ali@147: 	if (tmp_dash_results.base)
ali@147: 	    results.emdash.base++;
ali@147: 	if (tmp_dash_results.non_PG_space)
ali@147: 	    results.emdash.non_PG_space++;
ali@147: 	if (tmp_dash_results.PG_space)
ali@147: 	    results.emdash.PG_space++;
ali@69: 	for (s=lines[j];*s;)
ali@40: 	{
ali@69: 	    inword=getaword(&s);
ali@68: 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
ali@68: 		results.Dutchcount++;
ali@68: 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
ali@68: 		results.Frenchcount++;
ali@68: 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
ali@68: 		results.standalone_digit++;
ali@69: 	    g_free(inword);
ali@40: 	}
ali@68: 	/* Check for spaced dashes */
ali@69: 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
ali@41: 	    results.spacedash++;
ali@68: 	lastblen=lastlen;
ali@69: 	lastlen=llen;
ali@69: 	laststart=lines[j][0];
ali@40:     }
ali@69:     g_strfreev(lines);
ali@41:     return &results;
ali@41: }
ali@41: 
ali@42: /*
ali@42:  * report_first_pass:
ali@42:  *
ali@42:  * Make some snap decisions based on the first pass results.
ali@42:  */
ali@42: struct warnings *report_first_pass(struct first_pass_results *results)
ali@42: {
ali@42:     static struct warnings warnings={0};
ali@42:     if (cnt_spacend>0)
ali@70: 	g_print("   --> %ld lines in this file have white space at end\n",
ali@42: 	  cnt_spacend);
ali@42:     warnings.dotcomma=1;
ali@42:     if (results->dotcomma>5)
ali@42:     {
ali@68: 	warnings.dotcomma=0;
ali@70: 	g_print("   --> %ld lines in this file contain '.,'. "
ali@42: 	  "Not reporting them.\n",results->dotcomma);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 50 lines, or one-tenth, are short,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.shortline=1;
ali@42:     if (results->shortline>50 || results->shortline*10>linecnt)
ali@42:     {
ali@68: 	warnings.shortline=0;
ali@70: 	g_print("   --> %ld lines in this file are short. "
ali@42: 	  "Not reporting short lines.\n",results->shortline);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 50 lines, or one-tenth, are long,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.longline=1;
ali@42:     if (results->longline>50 || results->longline*10>linecnt)
ali@42:     {
ali@68: 	warnings.longline=0;
ali@70: 	g_print("   --> %ld lines in this file are long. "
ali@42: 	  "Not reporting long lines.\n",results->longline);
ali@42:     }
ali@42:     /* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42:     warnings.ast=1;
ali@42:     if (results->astline>10)
ali@42:     {
ali@68: 	warnings.ast=0;
ali@70: 	g_print("   --> %ld lines in this file contain asterisks. "
ali@42: 	  "Not reporting them.\n",results->astline);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 10 lines contain forward slashes,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.fslash=1;
ali@42:     if (results->fslashline>10)
ali@42:     {
ali@68: 	warnings.fslash=0;
ali@70: 	g_print("   --> %ld lines in this file contain forward slashes. "
ali@42: 	  "Not reporting them.\n",results->fslashline);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 20 lines contain unpunctuated endquotes,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.endquote=1;
ali@42:     if (results->endquote_count>20)
ali@42:     {
ali@68: 	warnings.endquote=0;
ali@70: 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
ali@42: 	  "Not reporting them.\n",results->endquote_count);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 15 lines contain standalone digits,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.digit=1;
ali@42:     if (results->standalone_digit>10)
ali@42:     {
ali@68: 	warnings.digit=0;
ali@70: 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
ali@42: 	  "Not reporting them.\n",results->standalone_digit);
ali@42:     }
ali@42:     /*
ali@42:      * If more than 20 lines contain hyphens at end,
ali@42:      * don't bother reporting them.
ali@42:      */
ali@42:     warnings.hyphen=1;
ali@42:     if (results->hyphens>20)
ali@42:     {
ali@68: 	warnings.hyphen=0;
ali@70: 	g_print("   --> %ld lines in this file have hyphens at end. "
ali@42: 	  "Not reporting them.\n",results->hyphens);
ali@42:     }
ali@42:     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42:     {
ali@70: 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@68: 	pswit[MARKUP_SWITCH]=1;
ali@42:     }
ali@42:     if (results->verylongline>0)
ali@70: 	g_print("   --> %ld lines in this file are VERY long!\n",
ali@42: 	  results->verylongline);
ali@42:     /*
ali@42:      * If there are more non-PG spaced dashes than PG em-dashes,
ali@42:      * assume it's deliberate.
ali@42:      * Current PG guidelines say don't use them, but older texts do,
ali@42:      * and some people insist on them whatever the guidelines say.
ali@42:      */
ali@42:     warnings.dash=1;
ali@147:     if (results->spacedash+results->emdash.non_PG_space>
ali@147:       results->emdash.PG_space)
ali@42:     {
ali@68: 	warnings.dash=0;
ali@70: 	g_print("   --> There are %ld spaced dashes and em-dashes. "
ali@42: 	  "Not reporting them.\n",
ali@147: 	  results->spacedash+results->emdash.non_PG_space);
ali@42:     }
ali@185:     if (charset)
ali@185: 	warnings.bin=0;
ali@185:     else
ali@42:     {
ali@185: 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
ali@185: 	warnings.bin=1;
ali@185: 	/* If more than a quarter of characters are hi-bit, bug out. */
ali@185: 	if (results->binlen*4>results->totlen)
ali@185: 	{
ali@185: 	    g_print("   --> This file does not appear to be ASCII. "
ali@185: 	      "Terminating. Best of luck with it!\n");
ali@185: 	    exit(1);
ali@185: 	}
ali@185: 	if (results->alphalen*4<results->totlen)
ali@185: 	{
ali@185: 	    g_print("   --> This file does not appear to be text. "
ali@185: 	      "Terminating. Best of luck with it!\n");
ali@185: 	    exit(1);
ali@185: 	}
ali@185: 	if (results->binlen*100>results->totlen || results->binlen>100)
ali@185: 	{
ali@185: 	    g_print("   --> There are a lot of foreign letters here. "
ali@185: 	      "Not reporting them.\n");
ali@185: 	    if (!pswit[VERBOSE_SWITCH])
ali@185: 		warnings.bin=0;
ali@185: 	}
ali@42:     }
ali@69:     warnings.isDutch=FALSE;
ali@42:     if (results->Dutchcount>50)
ali@42:     {
ali@69: 	warnings.isDutch=TRUE;
ali@70: 	g_print("   --> This looks like Dutch - "
ali@42: 	  "switching off dashes and warnings for 's Middags case.\n");
ali@42:     }
ali@69:     warnings.isFrench=FALSE;
ali@42:     if (results->Frenchcount>50)
ali@42:     {
ali@69: 	warnings.isFrench=TRUE;
ali@70: 	g_print("   --> This looks like French - "
ali@42: 	  "switching off some doublepunct.\n");
ali@42:     }
ali@42:     if (results->firstline && results->footerline)
ali@70: 	g_print("    The PG header and footer appear to be already on.\n");
ali@42:     else
ali@42:     {
ali@68: 	if (results->firstline)
ali@70: 	    g_print("    The PG header is on - no footer.\n");
ali@68: 	if (results->footerline)
ali@70: 	    g_print("    The PG footer is on - no header.\n");
ali@42:     }
ali@70:     g_print("\n");
ali@42:     if (pswit[VERBOSE_SWITCH])
ali@42:     {
ali@68: 	warnings.shortline=1;
ali@68: 	warnings.dotcomma=1;
ali@68: 	warnings.longline=1;
ali@68: 	warnings.dash=1;
ali@68: 	warnings.digit=1;
ali@68: 	warnings.ast=1;
ali@68: 	warnings.fslash=1;
ali@68: 	warnings.hyphen=1;
ali@68: 	warnings.endquote=1;
ali@70: 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
ali@42:     }
ali@42:     if (warnings.isDutch)
ali@68: 	warnings.dash=0;
ali@42:     if (results->footerline>0 && results->firstline>0 &&
ali@42:       results->footerline>results->firstline &&
ali@42:       results->footerline-results->firstline<100)
ali@42:     {
ali@70: 	g_print("   --> I don't really know where this text starts. \n");
ali@70: 	g_print("       There are no reference points.\n");
ali@70: 	g_print("       I'm going to have to report the header and footer "
ali@42: 	  "as well.\n");
ali@68: 	results->firstline=0;
ali@42:     }
ali@42:     return &warnings;
ali@42: }
ali@42: 
ali@43: /*
ali@43:  * analyse_quotes:
ali@43:  *
ali@43:  * Look along the line, accumulate the count of quotes, and see
ali@43:  * if this is an empty line - i.e. a line with nothing on it
ali@43:  * but spaces.
ali@43:  * If line has just spaces, period, * and/or - on it, don't
ali@43:  * count it, since empty lines with asterisks or dashes to
ali@43:  * separate sections are common.
ali@43:  *
ali@69:  * Returns: TRUE if the line is empty.
ali@43:  */
ali@164: gboolean analyse_quotes(const char *aline,struct counters *counters)
ali@43: {
ali@68:     int guessquote=0;
ali@69:     /* assume the line is empty until proven otherwise */
ali@69:     gboolean isemptyline=TRUE;
ali@70:     const char *s=aline,*sprev,*snext;
ali@70:     gunichar c;
ali@70:     sprev=NULL;
ali@142:     GError *tmp_err=NULL;
ali@43:     while (*s)
ali@43:     {
ali@70: 	snext=g_utf8_next_char(s);
ali@70: 	c=g_utf8_get_char(s);
ali@142: 	if (CHAR_IS_DQUOTE(c))
ali@142: 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
ali@142: 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
ali@43: 	{
ali@43: 	    if (s==aline)
ali@43: 	    {
ali@43: 		/*
ali@142: 		 * At start of line, it can only be a quotation mark.
ali@43: 		 * Hardcode a very common exception!
ali@43: 		 */
ali@70: 		if (!g_str_has_prefix(snext,"tis") &&
ali@70: 		  !g_str_has_prefix(snext,"Tis"))
ali@142: 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@43: 	    }
ali@70: 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
ali@70: 	      g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43: 		/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43: 		;
ali@43: 	    /* it's outside a word - let's check it out */
ali@99: 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
ali@70: 	      g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43: 	    {
ali@142: 		/* certainly looks like a quotation mark */
ali@70: 		if (!g_str_has_prefix(snext,"tis") &&
ali@70: 		  !g_str_has_prefix(snext,"Tis"))
ali@43: 		    /* hardcode a very common exception! */
ali@142: 		{
ali@142: 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
ali@142: 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@142: 		    else
ali@142: 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
ali@142: 		}
ali@43: 	    }
ali@43: 	    else
ali@43: 	    {
ali@142: 		/* now - is it a quotation mark? */
ali@43: 		guessquote=0;   /* accumulate clues */
ali@70: 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
ali@43: 		{
ali@43: 		    /* it follows a letter - could be either */
ali@43: 		    guessquote++;
ali@70: 		    if (g_utf8_get_char(sprev)=='s')
ali@43: 		    {
ali@43: 			/* looks like a plural apostrophe */
ali@43: 			guessquote-=3;
ali@70: 			if (g_utf8_get_char(snext)==CHAR_SPACE)
ali@70: 			    /* bonus marks! */
ali@43: 			    guessquote-=2;
ali@43: 		    }
ali@142: 		    if (innermost_quote_matches(counters,c))
ali@142: 			/*
ali@142: 			 * Give it the benefit of some doubt,
ali@142: 			 * if a squote is already open.
ali@142: 			 */
ali@142: 			guessquote++;
ali@142: 		    else
ali@142: 			guessquote--;
ali@142: 		    if (guessquote>=0)
ali@142: 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
ali@43: 		}
ali@43: 		else
ali@142: 		    /* no adjacent letter - it must be a quote of some kind */
ali@142: 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@43: 	    }
ali@43: 	}
ali@142: 	if (tmp_err)
ali@142: 	{
ali@142: 	    if (pswit[ECHO_SWITCH])
ali@142: 		g_print("\n%s\n",aline);
ali@142: 	    if (!pswit[OVERVIEW_SWITCH])
ali@142: 		g_print("    Line %ld column %ld - %s\n",
ali@142: 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
ali@142: 	    g_clear_error(&tmp_err);
ali@142: 	}
ali@70: 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
ali@70: 	  c!='\r' && c!='\n')
ali@69: 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
ali@70: 	if (c==CHAR_UNDERSCORE)
ali@43: 	    counters->c_unders++;
ali@103: 	if (c==CHAR_OPEN_SBRACK)
ali@103: 	{
ali@103: 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
ali@103: 	      !matching_difference(counters,c) && s==aline &&
ali@103: 	      g_str_has_prefix(s,"[Illustration:"))
ali@103: 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
ali@103: 	    else
ali@103: 		increment_matching(counters,c,TRUE);
ali@103: 	}
ali@103: 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
ali@99: 	    increment_matching(counters,c,TRUE);
ali@103: 	if (c==CHAR_CLOSE_SBRACK)
ali@103: 	{
ali@103: 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
ali@103: 	      !matching_difference(counters,c) && !*snext)
ali@103: 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
ali@103: 	    else
ali@103: 		increment_matching(counters,c,FALSE);
ali@103: 	}
ali@103: 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
ali@99: 	    increment_matching(counters,c,FALSE);
ali@70: 	sprev=s;
ali@70: 	s=snext;
ali@43:     }
ali@43:     return isemptyline;
ali@43: }
ali@43: 
ali@41: /*
ali@67:  * check_for_control_characters:
ali@67:  *
ali@67:  * Check for invalid or questionable characters in the line
ali@67:  * Anything above 127 is invalid for plain ASCII, and
ali@67:  * non-printable control characters should also be flagged.
ali@67:  * Tabs should generally not be there.
ali@67:  */
ali@67: void check_for_control_characters(const char *aline)
ali@67: {
ali@70:     gunichar c;
ali@67:     const char *s;
ali@70:     for (s=aline;*s;s=g_utf8_next_char(s))
ali@67:     {
ali@70: 	c=g_utf8_get_char(s);
ali@67: 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
ali@67: 	{
ali@67: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@67: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Control character %u\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
ali@67: 	    else
ali@67: 		cnt_bin++;
ali@67: 	}
ali@67:     }
ali@67: }
ali@67: 
ali@67: /*
ali@44:  * check_for_odd_characters:
ali@44:  *
ali@44:  * Check for binary and other odd characters.
ali@44:  */
ali@44: void check_for_odd_characters(const char *aline,const struct warnings *warnings,
ali@69:   gboolean isemptyline)
ali@44: {
ali@44:     /* Don't repeat multiple warnings on one line. */
ali@185:     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
ali@70:     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
ali@44:     const char *s;
ali@70:     gunichar c;
ali@185:     gsize nb;
ali@185:     gchar *t;
ali@70:     for (s=aline;*s;s=g_utf8_next_char(s))
ali@44:     {
ali@70: 	c=g_utf8_get_char(s);
ali@185: 	if (warnings->bin && !eInvalidChar &&
ali@185: 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
ali@44: 	{
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		if (c>127 && c<160 || c>255)
ali@70: 		    g_print("    Line %ld column %ld - "
ali@70: 		      "Non-ISO-8859 character %u\n",
ali@70: 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44: 		else
ali@70: 		    g_print("    Line %ld column %ld - "
ali@70: 		      "Non-ASCII character %u\n",
ali@70: 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44: 	    else
ali@44: 		cnt_bin++;
ali@185: 	    eInvalidChar=TRUE;
ali@185: 	}
ali@185: 	if (!eInvalidChar && charset)
ali@185: 	{
ali@185: 	    if (charset_validator==(GIConv)-1)
ali@185: 	    {
ali@185: 		if (!g_unichar_isdefined(c))
ali@185: 		{
ali@185: 		    if (pswit[ECHO_SWITCH])
ali@185: 			g_print("\n%s\n",aline);
ali@185: 		    if (!pswit[OVERVIEW_SWITCH])
ali@185: 			g_print("    Line %ld column %ld - Unassigned UNICODE "
ali@185: 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
ali@185: 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@185: 		    else
ali@185: 			cnt_bin++;
ali@185: 		    eInvalidChar=TRUE;
ali@185: 		}
ali@185: 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
ali@185: 		  c>=100000 && c<=0x10FFFD)
ali@185: 		{
ali@185: 		    if (pswit[ECHO_SWITCH])
ali@185: 			g_print("\n%s\n",aline);
ali@185: 		    if (!pswit[OVERVIEW_SWITCH])
ali@185: 			g_print("    Line %ld column %ld - Private Use "
ali@185: 			  "character U+%04" G_GINT32_MODIFIER "X\n",
ali@185: 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@185: 		    else
ali@185: 			cnt_bin++;
ali@185: 		    eInvalidChar=TRUE;
ali@185: 		}
ali@185: 	    }
ali@185: 	    else
ali@185: 	    {
ali@185: 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
ali@185: 		  charset_validator,NULL,&nb,NULL);
ali@185: 		if (t)
ali@185: 		    g_free(t);
ali@185: 		else
ali@185: 		{
ali@185: 		    if (pswit[ECHO_SWITCH])
ali@185: 			g_print("\n%s\n",aline);
ali@185: 		    if (!pswit[OVERVIEW_SWITCH])
ali@185: 			g_print("    Line %ld column %ld - Non-%s "
ali@185: 			  "character %u\n",linecnt,
ali@185: 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
ali@185: 		    else
ali@185: 			cnt_bin++;
ali@185: 		    eInvalidChar=TRUE;
ali@185: 		}
ali@185: 	    }
ali@44: 	}
ali@70: 	if (!eTab && c==CHAR_TAB)
ali@44: 	{
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Tab character?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@70: 	    eTab=TRUE;
ali@44: 	}
ali@70: 	if (!eTilde && c==CHAR_TILDE)
ali@44: 	{
ali@44: 	    /*
ali@44: 	     * Often used by OCR software to indicate an
ali@44: 	     * unrecognizable character.
ali@44: 	     */
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Tilde character?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@70: 	    eTilde=TRUE;
ali@44: 	}
ali@70: 	if (!eCarat && c==CHAR_CARAT)
ali@44: 	{  
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Carat character?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@70: 	    eCarat=TRUE;
ali@44: 	}
ali@70: 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
ali@44: 	{  
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Forward slash?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@70: 	    eFSlash=TRUE;
ali@44: 	}
ali@44: 	/*
ali@44: 	 * Report asterisks only in paranoid mode,
ali@44: 	 * since they're often deliberate.
ali@44: 	 */
ali@44: 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@70: 	  c==CHAR_ASTERISK)
ali@44: 	{
ali@44: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@44: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Asterisk?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44: 	    else
ali@44: 		cnt_odd++;
ali@70: 	    eAst=TRUE;
ali@44: 	}
ali@44:     }
ali@44: }
ali@44: 
ali@44: /*
ali@45:  * check_for_long_line:
ali@45:  *
ali@45:  * Check for line too long.
ali@45:  */
ali@45: void check_for_long_line(const char *aline)
ali@45: {
ali@70:     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
ali@45:     {
ali@45: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",aline);
ali@45: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column %ld - Long line %ld\n",
ali@70: 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
ali@45: 	else
ali@45: 	    cnt_long++;
ali@45:     }
ali@45: }
ali@45: 
ali@45: /*
ali@45:  * check_for_short_line:
ali@45:  *
ali@45:  * Check for line too short.
ali@45:  *
ali@45:  * This one is a bit trickier to implement: we don't want to
ali@45:  * flag the last line of a paragraph for being short, so we
ali@45:  * have to wait until we know that our current line is a
ali@45:  * "normal" line, then report the _previous_ line if it was too
ali@45:  * short. We also don't want to report indented lines like
ali@45:  * chapter heads or formatted quotations. We therefore keep
ali@45:  * last->len as the length of the last line examined, and
ali@45:  * last->blen as the length of the last but one, and try to
ali@45:  * suppress unnecessary warnings by checking that both were of
ali@45:  * "normal" length. We keep the first character of the last
ali@45:  * line in last->start, and if it was a space, we assume that
ali@45:  * the formatting is deliberate. I can't figure out a way to
ali@45:  * distinguish something like a quoted verse left-aligned or
ali@45:  * the header or footer of a letter from a paragraph of short
ali@45:  * lines - maybe if I examined the whole paragraph, and if the
ali@45:  * para has less than, say, 8 lines and if all lines are short,
ali@45:  * then just assume it's OK? Need to look at some texts to see
ali@45:  * how often a formula like this would get the right result.
ali@45:  */
ali@45: void check_for_short_line(const char *aline,const struct line_properties *last)
ali@45: {
ali@70:     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
ali@70:       last->len<SHORTEST_PG_LINE && last->blen>1 &&
ali@70:       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45:     {
ali@45: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",prevline);
ali@45: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column %ld - Short line %ld?\n",
ali@70: 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
ali@45: 	else
ali@45: 	    cnt_short++;
ali@45:     }
ali@45: }
ali@45: 
ali@45: /*
ali@46:  * check_for_starting_punctuation:
ali@46:  *
ali@46:  * Look for punctuation other than full ellipses at start of line.
ali@46:  */
ali@46: void check_for_starting_punctuation(const char *aline)
ali@46: {
ali@70:     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
ali@70:       !g_str_has_prefix(aline,". . ."))
ali@46:     {
ali@46: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",aline);
ali@46: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
ali@46: 	      linecnt);
ali@46: 	else
ali@46: 	    cnt_punct++;
ali@46:     }
ali@46: }
ali@46: 
/*
 * str_emdash:
 *
 * Find the first em-dash in <s>, written either as "--" or as a
 * real U+2014 EM DASH, return a pointer to it and set <next> to
 * the character following the dash.
 *
 * The EM DASH is spelled out as its UTF-8 bytes so that it cannot
 * be damaged by a re-encoding of this source file.
 *
 * Returns: a pointer to the dash, or NULL if there is none, in
 * which case <next> is left untouched.
 */
char *str_emdash(const char *s,const char **next)
{
    static const char ascii_dash[]="--";
    static const char utf8_dash[]="\xE2\x80\x94";
    const char *s1,*s2;
    s1=strstr(s,ascii_dash);
    s2=strstr(s,utf8_dash);
    /* The two forms cannot start at the same byte, so one comes first. */
    if (s1 && (!s2 || s1<s2))
    {
	*next=s1+(sizeof(ascii_dash)-1);
	return (char *)s1;
    }
    if (s2)
	*next=s2+(sizeof(utf8_dash)-1);
    return (char *)s2;
}
ali@147: 
ali@147: /*
ali@47:  * check_for_spaced_emdash:
ali@47:  *
ali@47:  * Check for spaced em-dashes.
ali@47:  *
ali@147:  * We must check _all_ occurrences of em-dashes on the line
ali@147:  * hence the loop - even if the first dash is OK
ali@47:  * there may be another that's wrong later on.
ali@47:  */
ali@47: void check_for_spaced_emdash(const char *aline)
ali@47: {
ali@70:     const char *s,*t,*next;
ali@147:     for (s=aline;t=str_emdash(s,&next);s=next)
ali@47:     {
ali@70: 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
ali@70: 	  g_utf8_get_char(next)==CHAR_SPACE)
ali@47: 	{
ali@47: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@47: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@47: 	    else
ali@47: 		cnt_dash++;
ali@47: 	}
ali@47:     }
ali@47: }
ali@47: 
ali@47: /*
ali@47:  * check_for_spaced_dash:
ali@47:  *
ali@47:  * Check for spaced dashes.
ali@47:  */
ali@47: void check_for_spaced_dash(const char *aline)
ali@47: {
ali@47:     const char *s;
ali@47:     if ((s=strstr(aline," -")))
ali@47:     {
ali@70: 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
ali@47: 	{
ali@47: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@47: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Spaced dash?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47: 	    else
ali@47: 		cnt_dash++;
ali@47: 	}
ali@47:     }
ali@47:     else if ((s=strstr(aline,"- ")))
ali@47:     {
ali@70: 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@47: 	{
ali@47: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@47: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Spaced dash?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47: 	    else
ali@47: 		cnt_dash++;
ali@47: 	}
ali@47:     }
ali@47: }
ali@47: 
ali@47: /*
ali@48:  * check_for_unmarked_paragraphs:
ali@48:  *
ali@48:  * Check for unmarked paragraphs indicated by separate speakers.
ali@48:  *
ali@48:  * May well be false positive:
ali@48:  * "Bravo!" "Wonderful!" called the crowd.
ali@48:  * but useful all the same.
ali@48:  */
ali@48: void check_for_unmarked_paragraphs(const char *aline)
ali@48: {
ali@48:     const char *s;
ali@48:     s=strstr(aline,"\"  \"");
ali@48:     if (!s)
ali@48: 	s=strstr(aline,"\" \"");
ali@48:     if (s)
ali@48:     {
ali@48: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",aline);
ali@48: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column %ld - "
ali@70: 	      "Query missing paragraph break?\n",
ali@70: 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@48: 	else
ali@48: 	    cnt_punct++;
ali@48:     }
ali@48: }
ali@48: 
ali@48: /*
ali@49:  * check_for_jeebies:
ali@49:  *
ali@49:  * Check for "to he" and other easy h/b errors.
ali@49:  *
ali@49:  * This is a very inadequate effort on the h/b problem,
ali@49:  * but the phrase "to he" is always an error, whereas "to
ali@49:  * be" is quite common.
ali@49:  * Similarly, '"Quiet!", be said.' is a non-be error
ali@49:  * "to he" is _not_ always an error!:
ali@49:  *       "Where they went to he couldn't say."
ali@49:  * Another false positive:
ali@49:  *       What would "Cinderella" be without the . . .
ali@49:  * and another: "If he wants to he can see for himself."
ali@49:  */
ali@49: void check_for_jeebies(const char *aline)
ali@49: {
ali@49:     const char *s;
ali@49:     s=strstr(aline," be could ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," be would ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," was be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," be is ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," is be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline,"\", be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline,"\" be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline,"\" be ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," to he ");
ali@49:     if (s)
ali@49:     {
ali@49: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",aline);
ali@49: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column %ld - Query he/be error?\n",
ali@70: 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49: 	else
ali@49: 	    cnt_word++;
ali@49:     }
ali@49:     s=strstr(aline," the had ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," a had ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," they bad ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," she bad ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," he bad ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," you bad ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline," i bad ");
ali@49:     if (s)
ali@49:     {
ali@49: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",aline);
ali@49: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
ali@70: 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49: 	else
ali@49: 	    cnt_word++;
ali@49:     }
ali@49:     s=strstr(aline,"; hut ");
ali@49:     if (!s)
ali@49: 	s=strstr(aline,", hut ");
ali@49:     if (s)
ali@49:     {
ali@49: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",aline);
ali@49: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
ali@70: 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49: 	else
ali@49: 	    cnt_word++;
ali@49:     }
ali@49: }
ali@49: 
ali@49: /*
ali@50:  * check_for_mta_from:
ali@50:  *
ali@50:  * Special case - angled bracket in front of "From" placed there by an
ali@50:  * MTA when sending an e-mail.
ali@50:  */
ali@50: void check_for_mta_from(const char *aline)
ali@50: {
ali@50:     const char *s;
ali@50:     s=strstr(aline,">From");
ali@50:     if (s)
ali@50:     {
ali@50: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",aline);
ali@50: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column %ld - "
ali@70: 	      "Query angled bracket with From\n",
ali@70: 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@50: 	else
ali@50: 	    cnt_punct++;
ali@50:     }
ali@50: }
ali@50: 
ali@50: /*
ali@51:  * check_for_orphan_character:
ali@51:  *
ali@51:  * Check for a single character line -
ali@51:  * often an overflow from bad wrapping.
ali@51:  */
ali@51: void check_for_orphan_character(const char *aline)
ali@51: {
ali@70:     gunichar c;
ali@70:     c=g_utf8_get_char(aline);
ali@70:     if (c && !*g_utf8_next_char(aline))
ali@51:     {
ali@70: 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
ali@51: 	    ; /* Nothing - ignore numerals alone on a line. */
ali@51: 	else
ali@51: 	{
ali@51: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@51: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column 1 - Query single character line\n",
ali@51: 		  linecnt);
ali@51: 	    else
ali@51: 		cnt_punct++;
ali@51: 	}
ali@51:     }
ali@51: }
ali@51: 
ali@51: /*
ali@52:  * check_for_pling_scanno:
ali@52:  *
ali@52:  * Check for I" - often should be !
ali@52:  */
ali@52: void check_for_pling_scanno(const char *aline)
ali@52: {
ali@52:     const char *s;
ali@52:     s=strstr(aline," I\"");
ali@52:     if (s)
ali@52:     {
ali@52: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",aline);
ali@52: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
ali@70: 	      linecnt,g_utf8_pointer_to_offset(aline,s));
ali@52: 	else
ali@52: 	    cnt_punct++;
ali@52:     }
ali@52: }
ali@52: 
ali@52: /*
ali@53:  * check_for_extra_period:
ali@53:  *
ali@53:  * Check for period without a capital letter. Cut-down from gutspell.
ali@53:  * Only works when it happens on a single line.
ali@53:  */
ali@53: void check_for_extra_period(const char *aline,const struct warnings *warnings)
ali@53: {
ali@99:     const char *s,*t,*s1,*sprev;
ali@69:     int i;
ali@70:     gsize len;
ali@69:     gboolean istypo;
ali@69:     gchar *testword;
ali@99:     gunichar c,nc,pc,*decomposition;
ali@53:     if (pswit[PARANOID_SWITCH])
ali@53:     {
ali@70: 	for (t=aline;t=strstr(t,". ");)
ali@53: 	{
ali@69: 	    if (t==aline)
ali@53: 	    {
ali@70: 		t=g_utf8_next_char(t);
ali@53: 		/* start of line punctuation is handled elsewhere */
ali@53: 		continue;
ali@53: 	    }
ali@70: 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
ali@53: 	    {
ali@70: 		t=g_utf8_next_char(t);
ali@53: 		continue;
ali@53: 	    }
ali@53: 	    if (warnings->isDutch)
ali@53: 	    {
ali@53: 		/* For Frank & Jeroen -- 's Middags case */
ali@70: 		gunichar c2,c3,c4,c5;
ali@70: 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
ali@70: 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
ali@70: 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
ali@70: 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
ali@99: 		if (CHAR_IS_APOSTROPHE(c2) &&
ali@99: 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
ali@99: 		  g_unichar_isupper(c5))
ali@53: 		{
ali@70: 		    t=g_utf8_next_char(t);
ali@53: 		    continue;
ali@53: 		}
ali@53: 	    }
ali@70: 	    s1=g_utf8_next_char(g_utf8_next_char(t));
ali@70: 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
ali@173: 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
ali@70: 		s1=g_utf8_next_char(s1);
ali@70: 	    if (g_unichar_islower(g_utf8_get_char(s1)))
ali@53: 	    {
ali@53: 		/* we have something to investigate */
ali@69: 		istypo=TRUE;
ali@53: 		/* so let's go back and find out */
ali@99: 		nc=g_utf8_get_char(t);
ali@99: 		s1=g_utf8_prev_char(t);
ali@99: 		c=g_utf8_get_char(s1);
ali@99: 		sprev=g_utf8_prev_char(s1);
ali@99: 		pc=g_utf8_get_char(sprev);
ali@99: 		while (s1>=aline &&
ali@99: 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
ali@99: 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
ali@99: 		  g_unichar_isalpha(nc)))
ali@99: 		{
ali@99: 		    nc=c;
ali@99: 		    s1=sprev;
ali@99: 		    c=pc;
ali@99: 		    sprev=g_utf8_prev_char(s1);
ali@99: 		    pc=g_utf8_get_char(sprev);
ali@99: 		}
ali@70: 		s1=g_utf8_next_char(s1);
ali@69: 		s=strchr(s1,'.');
ali@69: 		if (s)
ali@69: 		    testword=g_strndup(s1,s-s1);
ali@69: 		else
ali@69: 		    testword=g_strdup(s1);
ali@53: 		for (i=0;*abbrev[i];i++)
ali@53: 		    if (!strcmp(testword,abbrev[i]))
ali@69: 			istypo=FALSE;
ali@70: 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
ali@69: 		    istypo=FALSE;
ali@70: 		if (!*g_utf8_next_char(testword))
ali@69: 		    istypo=FALSE;
ali@53: 		if (isroman(testword))
ali@69: 		    istypo=FALSE;
ali@53: 		if (istypo)
ali@53: 		{
ali@69: 		    istypo=FALSE;
ali@70: 		    for (s=testword;*s;s=g_utf8_next_char(s))
ali@70: 		    {
ali@70: 			decomposition=g_unicode_canonical_decomposition(
ali@70: 			  g_utf8_get_char(s),&len);
ali@70: 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@69: 			    istypo=TRUE;
ali@70: 			g_free(decomposition);
ali@70: 		    }
ali@53: 		}
ali@69: 		if (istypo &&
ali@69: 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
ali@53: 		{
ali@69: 		    g_tree_insert(qperiod,g_strdup(testword),
ali@69: 		      GINT_TO_POINTER(1));
ali@69: 		    if (pswit[ECHO_SWITCH])
ali@70: 			g_print("\n%s\n",aline);
ali@69: 		    if (!pswit[OVERVIEW_SWITCH])
ali@70: 			g_print("    Line %ld column %ld - Extra period?\n",
ali@70: 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@69: 		    else
ali@69: 			cnt_punct++;
ali@53: 		}
ali@69: 		g_free(testword);
ali@53: 	    }
ali@70: 	    t=g_utf8_next_char(t);
ali@53: 	}
ali@53:     }
ali@53: }
ali@53: 
ali@53: /*
ali@54:  * check_for_following_punctuation:
ali@54:  *
ali@54:  * Check for words usually not followed by punctuation.
ali@54:  */
ali@54: void check_for_following_punctuation(const char *aline)
ali@54: {
ali@54:     int i;
ali@54:     const char *s,*wordstart;
ali@70:     gunichar c;
ali@69:     gchar *inword,*t;
ali@54:     if (pswit[TYPO_SWITCH])
ali@54:     {
ali@54: 	for (s=aline;*s;)
ali@54: 	{
ali@54: 	    wordstart=s;
ali@69: 	    t=getaword(&s);
ali@69: 	    if (!*t)
ali@69: 	    {
ali@69: 		g_free(t);
ali@54: 		continue;
ali@69: 	    }
ali@70: 	    inword=g_utf8_strdown(t,-1);
ali@69: 	    g_free(t);
ali@54: 	    for (i=0;*nocomma[i];i++)
ali@54: 		if (!strcmp(inword,nocomma[i]))
ali@54: 		{
ali@70: 		    c=g_utf8_get_char(s);
ali@70: 		    if (c==',' || c==';' || c==':')
ali@54: 		    {
ali@54: 			if (pswit[ECHO_SWITCH])
ali@70: 			    g_print("\n%s\n",aline);
ali@54: 			if (!pswit[OVERVIEW_SWITCH])
ali@70: 			    g_print("    Line %ld column %ld - "
ali@54: 			      "Query punctuation after %s?\n",
ali@70: 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70: 			      inword);
ali@54: 			else
ali@54: 			    cnt_punct++;
ali@54: 		    }
ali@54: 		}
ali@54: 	    for (i=0;*noperiod[i];i++)
ali@54: 		if (!strcmp(inword,noperiod[i]))
ali@54: 		{
ali@70: 		    c=g_utf8_get_char(s);
ali@70: 		    if (c=='.' || c=='!')
ali@54: 		    {
ali@54: 			if (pswit[ECHO_SWITCH])
ali@70: 			    g_print("\n%s\n",aline);
ali@54: 			if (!pswit[OVERVIEW_SWITCH])
ali@70: 			    g_print("    Line %ld column %ld - "
ali@54: 			      "Query punctuation after %s?\n",
ali@70: 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70: 			      inword);
ali@54: 			else
ali@54: 			    cnt_punct++;
ali@54: 		    }
ali@54: 		}
ali@69: 	    g_free(inword);
ali@54: 	}
ali@54:     }
ali@54: }
ali@54: 
ali@54: /*
ali@55:  * check_for_typos:
ali@55:  *
ali@55:  * Check for commonly mistyped words,
ali@55:  * and digits like 0 for O in a word.
ali@55:  */
ali@55: void check_for_typos(const char *aline,struct warnings *warnings)
ali@55: {
ali@70:     const char *s,*t,*nt,*wordstart;
ali@70:     gchar *inword;
ali@70:     gunichar *decomposition;
ali@70:     gchar *testword;
ali@70:     int i,vowel,consonant,*dupcnt;
ali@70:     gboolean isdup,istypo,alower;
ali@99:     gunichar c,pc;
ali@70:     long offset,len;
ali@70:     gsize decomposition_len;
ali@55:     for (s=aline;*s;)
ali@55:     {
ali@55: 	wordstart=s;
ali@69: 	inword=getaword(&s);
ali@55: 	if (!*inword)
ali@69: 	{
ali@69: 	    g_free(inword);
ali@55: 	    continue; /* don't bother with empty lines */
ali@69: 	}
ali@55: 	if (mixdigit(inword))
ali@55: 	{
ali@55: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@55: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Query digit in %s\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
ali@55: 	    else
ali@55: 		cnt_word++;
ali@55: 	}
ali@55: 	/*
ali@55: 	 * Put the word through a series of tests for likely typos and OCR
ali@55: 	 * errors.
ali@55: 	 */
ali@69: 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
ali@55: 	{
ali@69: 	    istypo=FALSE;
ali@70: 	    alower=FALSE;
ali@70: 	    for (t=inword;*t;t=g_utf8_next_char(t))
ali@55: 	    {
ali@70: 		c=g_utf8_get_char(t);
ali@70: 		nt=g_utf8_next_char(t);
ali@55: 		/* lowercase for testing */
ali@70: 		if (g_unichar_islower(c))
ali@70: 		    alower=TRUE;
ali@70: 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
ali@55: 		{
ali@55: 		    /*
ali@55: 		     * We have an uppercase mid-word. However, there are
ali@55: 		     * common cases:
ali@55: 		     *   Mac and Mc like McGill
ali@55: 		     *   French contractions like l'Abbe
ali@55: 		     */
ali@70: 		    offset=g_utf8_pointer_to_offset(inword,t);
ali@99: 		    if (offset>0)
ali@99: 			pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@99: 		    else
ali@99: 			pc='\0';
ali@70: 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
ali@70: 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
ali@70: 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
ali@99: 		      CHAR_IS_APOSTROPHE(pc))
ali@55: 			; /* do nothing! */
ali@55: 		    else
ali@69: 			istypo=TRUE;
ali@55: 		}
ali@55: 	    }
ali@70: 	    testword=g_utf8_casefold(inword,-1);
ali@69: 	}
ali@69: 	if (pswit[TYPO_SWITCH])
ali@69: 	{
ali@55: 	    /*
ali@55: 	     * Check for certain unlikely two-letter combinations at word
ali@55: 	     * start and end.
ali@55: 	     */
ali@70: 	    len=g_utf8_strlen(testword,-1);
ali@70: 	    if (len>1)
ali@55: 	    {
ali@55: 		for (i=0;*nostart[i];i++)
ali@70: 		    if (g_str_has_prefix(testword,nostart[i]))
ali@69: 			istypo=TRUE;
ali@55: 		for (i=0;*noend[i];i++)
ali@70: 		    if (g_str_has_suffix(testword,noend[i]))
ali@69: 			istypo=TRUE;
ali@55: 	    }
ali@55: 	    /* ght is common, gbt never. Like that. */
ali@55: 	    if (strstr(testword,"cb"))
ali@69: 		istypo=TRUE;
ali@55: 	    if (strstr(testword,"gbt"))
ali@69: 		istypo=TRUE;
ali@55: 	    if (strstr(testword,"pbt"))
ali@69: 		istypo=TRUE;
ali@55: 	    if (strstr(testword,"tbs"))
ali@69: 		istypo=TRUE;
ali@55: 	    if (strstr(testword,"mrn"))
ali@69: 		istypo=TRUE;
ali@55: 	    if (strstr(testword,"ahle"))
ali@69: 		istypo=TRUE;
ali@55: 	    if (strstr(testword,"ihle"))
ali@69: 		istypo=TRUE;
ali@55: 	    /*
ali@55: 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55: 	     * Also "TBI" - frostbite, outbid - but uncommon.
ali@55: 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55: 	     * numerals, but "ii" is a common scanno.
ali@55: 	     */
ali@55: 	    if (strstr(testword,"tbi"))
ali@69: 		istypo=TRUE;
ali@55: 	    if (strstr(testword,"tbe"))
ali@69: 		istypo=TRUE;
ali@55: 	    if (strstr(testword,"ii"))
ali@69: 		istypo=TRUE;
ali@55: 	    /*
ali@55: 	     * Check for no vowels or no consonants.
ali@55: 	     * If none, flag a typo.
ali@55: 	     */
ali@70: 	    if (!istypo && len>1)
ali@55: 	    {
ali@55: 		vowel=consonant=0;
ali@70: 		for (t=testword;*t;t=g_utf8_next_char(t))
ali@55: 		{
ali@70: 		    c=g_utf8_get_char(t);
ali@70: 		    decomposition=
ali@70: 		      g_unicode_canonical_decomposition(c,&decomposition_len);
ali@70: 		    if (c=='y' || g_unichar_isdigit(c))
ali@55: 		    {
ali@55: 			/* Yah, this is loose. */
ali@55: 			vowel++;
ali@55: 			consonant++;
ali@55: 		    }
ali@70: 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@55: 			vowel++;
ali@55: 		    else
ali@55: 			consonant++;
ali@70: 		    g_free(decomposition);
ali@55: 		}
ali@55: 		if (!vowel || !consonant)
ali@69: 		    istypo=TRUE;
ali@55: 	    }
ali@55: 	    /*
ali@55: 	     * Now exclude the word from being reported if it's in
ali@55: 	     * the okword list.
ali@55: 	     */
ali@55: 	    for (i=0;*okword[i];i++)
ali@55: 		if (!strcmp(testword,okword[i]))
ali@69: 		    istypo=FALSE;
ali@55: 	    /*
ali@55: 	     * What looks like a typo may be a Roman numeral.
ali@55: 	     * Exclude these.
ali@55: 	     */
ali@55: 	    if (istypo && isroman(testword))
ali@69: 		istypo=FALSE;
ali@55: 	    /* Check the manual list of typos. */
ali@55: 	    if (!istypo)
ali@55: 		for (i=0;*typo[i];i++)
ali@55: 		    if (!strcmp(testword,typo[i]))
ali@69: 			istypo=TRUE;
ali@55: 	    /*
ali@55: 	     * Check lowercase s, l, i and m - special cases.
ali@55: 	     *   "j" - often a semi-colon gone wrong.
ali@55: 	     *   "d" for a missing apostrophe - he d
ali@55: 	     *   "n" for "in"
ali@55: 	     */
ali@70: 	    if (!istypo && len==1 &&
ali@70: 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
ali@69: 		istypo=TRUE;
ali@55: 	    if (istypo)
ali@55: 	    {
ali@69: 		dupcnt=g_tree_lookup(qword,testword);
ali@69: 		if (dupcnt)
ali@69: 		{
ali@69: 		    (*dupcnt)++;
ali@69: 		    isdup=!pswit[VERBOSE_SWITCH];
ali@69: 		}
ali@69: 		else
ali@69: 		{
ali@69: 		    dupcnt=g_new0(int,1);
ali@69: 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
ali@69: 		    isdup=FALSE;
ali@69: 		}
ali@55: 		if (!isdup)
ali@55: 		{
ali@55: 		    if (pswit[ECHO_SWITCH])
ali@70: 			g_print("\n%s\n",aline);
ali@55: 		    if (!pswit[OVERVIEW_SWITCH])
ali@55: 		    {
ali@70: 			g_print("    Line %ld column %ld - Query word %s",
ali@70: 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
ali@70: 			  inword);
ali@69: 			if (!pswit[VERBOSE_SWITCH])
ali@70: 			    g_print(" - not reporting duplicates");
ali@70: 			g_print("\n");
ali@55: 		    }
ali@55: 		    else
ali@55: 			cnt_word++;
ali@55: 		}
ali@55: 	    }
ali@55: 	}
ali@55: 	/* check the user's list of typos */
ali@69: 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
ali@69: 	{
ali@69: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@69: 	    if (!pswit[OVERVIEW_SWITCH])  
ali@70: 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
ali@69: 	}
ali@69: 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
ali@69: 	    g_free(testword);
ali@55: 	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55: 	{
ali@55: 	    /* In paranoid mode, query all 0 and 1 standing alone. */
ali@55: 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
ali@55: 	    {
ali@55: 		if (pswit[ECHO_SWITCH])
ali@70: 		    g_print("\n%s\n",aline);
ali@55: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		    g_print("    Line %ld column %ld - Query standalone %s\n",
ali@70: 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
ali@70: 		      inword);
ali@55: 		else
ali@55: 		    cnt_word++;
ali@55: 	    }
ali@55: 	}
ali@69: 	g_free(inword);
ali@55:     }
ali@55: }
ali@55: 
ali@56: /*
ali@56:  * check_for_misspaced_punctuation:
ali@56:  *
ali@56:  * Look for added or missing spaces around punctuation and quotes.
ali@56:  * If there is a punctuation character like ! with no space on
ali@56:  * either side, suspect a missing!space. If there are spaces on
ali@56:  * both sides , assume a typo. If we see a double quote with no
ali@56:  * space or punctuation on either side of it, assume unspaced
ali@56:  * quotes "like"this.
ali@56:  */
ali@56: void check_for_misspaced_punctuation(const char *aline,
ali@69:   struct parities *parities,gboolean isemptyline)
ali@56: {
ali@69:     gboolean isacro,isellipsis;
ali@56:     const char *s;
ali@70:     gunichar c,nc,pc,n2c;
ali@142:     int parity;
ali@70:     c=g_utf8_get_char(aline);
ali@70:     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70:     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56:     {
ali@70: 	pc=c;
ali@70: 	c=nc;
ali@70: 	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56: 	/* For each character in the line after the first. */
ali@70: 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
ali@56: 	{
ali@56: 	    /* we need to suppress warnings for acronyms like M.D. */
ali@69: 	    isacro=FALSE;
ali@56: 	    /* we need to suppress warnings for ellipsis . . . */
ali@69: 	    isellipsis=FALSE;
ali@70: 	    /*
ali@70: 	     * If there are letters on both sides of it or
ali@70: 	     * if it's strict punctuation followed by an alpha.
ali@70: 	     */
ali@70: 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
ali@70: 	      g_utf8_strchr("?!,;:",-1,c)))
ali@56: 	    {
ali@70: 		if (c=='.')
ali@56: 		{
ali@70: 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70: 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69: 			isacro=TRUE;
ali@70: 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70: 		    if (nc && n2c=='.')
ali@69: 			isacro=TRUE;
ali@56: 		}
ali@56: 		if (!isacro)
ali@56: 		{
ali@56: 		    if (pswit[ECHO_SWITCH])
ali@70: 			g_print("\n%s\n",aline);
ali@56: 		    if (!pswit[OVERVIEW_SWITCH])
ali@70: 			g_print("    Line %ld column %ld - Missing space?\n",
ali@70: 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56: 		    else
ali@56: 			cnt_punct++;
ali@56: 		}
ali@56: 	    }
ali@70: 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
ali@56: 	    {
ali@56: 		/*
ali@56: 		 * If there are spaces on both sides,
ali@56: 		 * or space before and end of line.
ali@56: 		 */
ali@70: 		if (c=='.')
ali@56: 		{
ali@70: 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70: 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69: 			isellipsis=TRUE;
ali@70: 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70: 		    if (nc && n2c=='.')
ali@69: 			isellipsis=TRUE;
ali@56: 		}
ali@56: 		if (!isemptyline && !isellipsis)
ali@56: 		{
ali@56: 		    if (pswit[ECHO_SWITCH])
ali@70: 			g_print("\n%s\n",aline);
ali@56: 		    if (!pswit[OVERVIEW_SWITCH])
ali@70: 			g_print("    Line %ld column %ld - "
ali@70: 			  "Spaced punctuation?\n",linecnt,
ali@70: 			  g_utf8_pointer_to_offset(aline,s)+1);
ali@56: 		    else
ali@56: 			cnt_punct++;
ali@56: 		}
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56:     /* Split out the characters that CANNOT be preceded by space. */
ali@70:     c=g_utf8_get_char(aline);
ali@70:     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70:     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56:     {
ali@70: 	pc=c;
ali@70: 	c=nc;
ali@70: 	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56: 	/* for each character in the line after the first */
ali@70: 	if (g_utf8_strchr("?!,;:",-1,c))
ali@56: 	{
ali@56: 	    /* if it's punctuation that _cannot_ have a space before it */
ali@70: 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
ali@56: 	    {
ali@56: 		/*
ali@70: 		 * If nc DOES == space,
ali@56: 		 * it was already reported just above.
ali@56: 		 */
ali@56: 		if (pswit[ECHO_SWITCH])
ali@70: 		    g_print("\n%s\n",aline);
ali@56: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
ali@70: 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56: 		else
ali@56: 		    cnt_punct++;
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56:     /*
ali@56:      * Special case " .X" where X is any alpha.
ali@56:      * This plugs a hole in the acronym code above.
ali@56:      * Inelegant, but maintainable.
ali@56:      */
ali@70:     c=g_utf8_get_char(aline);
ali@70:     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70:     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56:     {
ali@70: 	pc=c;
ali@70: 	c=nc;
ali@70: 	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56: 	/* for each character in the line after the first */
ali@70: 	if (c=='.')
ali@56: 	{
ali@56: 	    /* if it's a period */
ali@70: 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
ali@56: 	    {
ali@56: 		/*
ali@56: 		 * If the period follows a space and
ali@56: 		 * is followed by a letter.
ali@56: 		 */
ali@56: 		if (pswit[ECHO_SWITCH])
ali@70: 		    g_print("\n%s\n",aline);
ali@56: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
ali@70: 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56: 		else
ali@56: 		    cnt_punct++;
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@70:     c=g_utf8_get_char(aline);
ali@70:     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70:     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56:     {
ali@70: 	pc=c;
ali@70: 	c=nc;
ali@70: 	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56: 	/* for each character in the line after the first */
ali@142: 	if (CHAR_IS_DQUOTE(c))
ali@56: 	{
ali@70: 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
ali@70: 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
ali@70: 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
ali@56: 	    {
ali@56: 		if (pswit[ECHO_SWITCH])
ali@70: 		    g_print("\n%s\n",aline);
ali@56: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
ali@70: 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56: 		else
ali@56: 		    cnt_punct++;
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56:     /* Check parity of quotes. */
ali@70:     nc=g_utf8_get_char(aline);
ali@70:     for (s=aline;*s;s=g_utf8_next_char(s))
ali@56:     {
ali@70: 	c=nc;
ali@70: 	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@142: 	if (CHAR_IS_DQUOTE(c))
ali@56: 	{
ali@142: 	    if (c==CHAR_DQUOTE)
ali@142: 	    {
ali@142: 		parities->dquote=!parities->dquote;
ali@142: 		parity=parities->dquote;
ali@142: 	    }
ali@142: 	    else if (c==CHAR_LD_QUOTE)
ali@142: 		parity=1;
ali@142: 	    else
ali@142: 		parity=0;
ali@142: 	    if (!parity)
ali@56: 	    {
ali@56: 		/* parity even */
ali@173: 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
ali@56: 		{
ali@56: 		    if (pswit[ECHO_SWITCH])
ali@70: 			g_print("\n%s\n",aline);
ali@56: 		    if (!pswit[OVERVIEW_SWITCH])
ali@70: 			g_print("    Line %ld column %ld - "
ali@70: 			  "Wrongspaced quotes?\n",
ali@70: 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56: 		    else
ali@56: 			cnt_punct++;
ali@56: 		}
ali@56: 	    }
ali@56: 	    else
ali@56: 	    {
ali@56: 		/* parity odd */
ali@173: 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
ali@173: 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
ali@56: 		{
ali@56: 		    if (pswit[ECHO_SWITCH])
ali@70: 			g_print("\n%s\n",aline);
ali@56: 		    if (!pswit[OVERVIEW_SWITCH])
ali@70: 			g_print("    Line %ld column %ld - "
ali@70: 			  "Wrongspaced quotes?\n",
ali@70: 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56: 		    else
ali@56: 			cnt_punct++;
ali@56: 		}
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@142:     c=g_utf8_get_char(aline);
ali@142:     if (CHAR_IS_DQUOTE(c))
ali@56:     {
ali@70: 	if (g_utf8_strchr(",;:!?)]} ",-1,
ali@70: 	  g_utf8_get_char(g_utf8_next_char(aline))))
ali@56: 	{
ali@56: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@56: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
ali@56: 		  linecnt);
ali@56: 	    else
ali@56: 		cnt_punct++;
ali@56: 	}
ali@56:     }
ali@56:     if (pswit[SQUOTE_SWITCH])
ali@56:     {
ali@70: 	nc=g_utf8_get_char(aline);
ali@70: 	for (s=aline;*s;s=g_utf8_next_char(s))
ali@56: 	{
ali@70: 	    c=nc;
ali@70: 	    nc=g_utf8_get_char(g_utf8_next_char(s));
ali@99: 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
ali@70: 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
ali@70: 	      !g_unichar_isalpha(nc)))
ali@56: 	    {
ali@56: 		parities->squote=!parities->squote;
ali@56: 		if (!parities->squote)
ali@56: 		{
ali@56: 		    /* parity even */
ali@70: 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
ali@56: 		    {
ali@56: 			if (pswit[ECHO_SWITCH])
ali@70: 			    g_print("\n%s\n",aline);
ali@56: 			if (!pswit[OVERVIEW_SWITCH])
ali@70: 			    g_print("    Line %ld column %ld - "
ali@56: 			      "Wrongspaced singlequotes?\n",
ali@70: 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56: 			else
ali@56: 			    cnt_punct++;
ali@56: 		    }
ali@56: 		}
ali@56: 		else
ali@56: 		{
ali@56: 		    /* parity odd */
ali@173: 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
ali@70: 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
ali@56: 		    {
ali@56: 			if (pswit[ECHO_SWITCH])
ali@70: 			    g_print("\n%s\n",aline);
ali@56: 			if (!pswit[OVERVIEW_SWITCH])
ali@70: 			    g_print("    Line %ld column %ld - "
ali@56: 			      "Wrongspaced singlequotes?\n",
ali@70: 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56: 			else
ali@56: 			    cnt_punct++;
ali@56: 		    }
ali@56: 		}
ali@56: 	    }
ali@56: 	}
ali@56:     }
ali@56: }
ali@56: 
ali@55: /*
ali@57:  * check_for_double_punctuation:
ali@57:  *
ali@57:  * Look for double punctuation like ,. or ,,
ali@57:  * Thanks to DW for the suggestion!
ali@57:  * In books with references, ".," and ".;" are common
ali@57:  * e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57:  * OTOH, from my initial tests, there are also fairly
ali@57:  * common errors. What to do? Make these cases paranoid?
ali@57:  * ".," is the most common, so warnings->dotcomma is used
ali@57:  * to suppress detailed reporting if it occurs often.
ali@57:  */
ali@57: void check_for_double_punctuation(const char *aline,struct warnings *warnings)
ali@57: {
ali@70:     const char *s;
ali@70:     gunichar c,nc;
ali@70:     nc=g_utf8_get_char(aline);
ali@70:     for (s=aline;*s;s=g_utf8_next_char(s))
ali@57:     {
ali@70: 	c=nc;
ali@70: 	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@57: 	/* for each punctuation character in the line */
ali@70: 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
ali@70: 	  g_utf8_strchr(".?!,;:",-1,nc))
ali@57: 	{
ali@57: 	    /* followed by punctuation, it's a query, unless . . . */
ali@70: 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
ali@70: 	      !warnings->dotcomma && c=='.' && nc==',' ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
ali@70: 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@57: 	    {
ali@70: 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
ali@70: 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
ali@70: 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
ali@70: 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
ali@70: 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
ali@70: 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
ali@70: 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
ali@70: 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
ali@70: 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
ali@70: 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@70: 		{
ali@70: 		    s+=4;
ali@70: 		    nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70: 		}
ali@57: 		; /* do nothing for .. !! and ?? which can be legit */
ali@57: 	    }
ali@57: 	    else
ali@57: 	    {
ali@57: 		if (pswit[ECHO_SWITCH])
ali@70: 		    g_print("\n%s\n",aline);
ali@57: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		    g_print("    Line %ld column %ld - Double punctuation?\n",
ali@70: 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@57: 		else
ali@57: 		    cnt_punct++;
ali@57: 	    }
ali@57: 	}
ali@57:     }
ali@57: }
ali@57: 
ali@57: /*
ali@58:  * check_for_spaced_quotes:
ali@58:  */
ali@58: void check_for_spaced_quotes(const char *aline)
ali@58: {
ali@99:     int i;
ali@58:     const char *s,*t;
ali@99:     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
ali@99:       CHAR_RS_QUOTE};
ali@99:     GString *pattern;
ali@58:     s=aline;
ali@58:     while ((t=strstr(s," \" ")))
ali@58:     {
ali@58: 	if (pswit[ECHO_SWITCH])
ali@70: 	    g_print("\n%s\n",aline);
ali@58: 	if (!pswit[OVERVIEW_SWITCH])
ali@70: 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
ali@70: 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58: 	else
ali@58: 	    cnt_punct++;
ali@70: 	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58:     }
ali@99:     pattern=g_string_new(NULL);
ali@99:     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
ali@58:     {
ali@99: 	g_string_assign(pattern," ");
ali@99: 	g_string_append_unichar(pattern,single_quotes[i]);
ali@99: 	g_string_append_c(pattern,' ');
ali@99: 	s=aline;
ali@99: 	while ((t=strstr(s,pattern->str)))
ali@99: 	{
ali@99: 	    if (pswit[ECHO_SWITCH])
ali@99: 		g_print("\n%s\n",aline);
ali@99: 	    if (!pswit[OVERVIEW_SWITCH])
ali@99: 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
ali@99: 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@99: 	    else
ali@99: 		cnt_punct++;
ali@99: 	    s=g_utf8_next_char(g_utf8_next_char(t));
ali@99: 	}
ali@58:     }
ali@99:     g_string_free(pattern,TRUE);
ali@58: }
ali@58: 
ali@58: /*
ali@59:  * check_for_miscased_genative:
ali@59:  *
ali@59:  * Check special case of 'S instead of 's at end of word.
ali@59:  */
ali@59: void check_for_miscased_genative(const char *aline)
ali@59: {
ali@59:     const char *s;
ali@70:     gunichar c,nc,pc;
ali@69:     if (!*aline)
ali@69: 	return;
ali@70:     c=g_utf8_get_char(aline);
ali@70:     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70:     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@59:     {
ali@70: 	pc=c;
ali@70: 	c=nc;
ali@70: 	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@99: 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
ali@59: 	{
ali@59: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@59: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
ali@59: 	    else
ali@59: 		cnt_punct++;
ali@59: 	}
ali@59:     }
ali@59: }
ali@59: 
ali@59: /*
ali@60:  * check_end_of_line:
ali@60:  *
ali@60:  * Now check special cases - start and end of line -
ali@60:  * for single and double quotes. Start is sometimes [sic]
ali@60:  * but better to query it anyway.
ali@60:  * While we're here, check for dash at end of line.
ali@60:  */
ali@60: void check_end_of_line(const char *aline,struct warnings *warnings)
ali@60: {
ali@70:     int lbytes;
ali@70:     const char *s;
ali@70:     gunichar c1,c2;
ali@70:     lbytes=strlen(aline);
ali@70:     if (g_utf8_strlen(aline,lbytes)>1)
ali@60:     {
ali@70: 	s=g_utf8_prev_char(aline+lbytes);
ali@70: 	c1=g_utf8_get_char(s);
ali@70: 	c2=g_utf8_get_char(g_utf8_prev_char(s));
ali@142: 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
ali@60: 	{
ali@60: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@60: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
ali@70: 		  g_utf8_strlen(aline,lbytes));
ali@70: 	    else
ali@70: 		cnt_punct++;
ali@70: 	}
ali@70: 	c1=g_utf8_get_char(aline);
ali@70: 	c2=g_utf8_get_char(g_utf8_next_char(aline));
ali@99: 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
ali@70: 	{
ali@70: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@70: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
ali@60: 	    else
ali@60: 		cnt_punct++;
ali@60: 	}
ali@60: 	/*
ali@60: 	 * Dash at end of line may well be legit - paranoid mode only
ali@60: 	 * and don't report em-dash at line-end.
ali@60: 	 */
ali@60: 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@60: 	{
ali@70: 	    for (s=g_utf8_prev_char(aline+lbytes);
ali@70: 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
ali@60: 		;
ali@70: 	    if (g_utf8_get_char(s)=='-' &&
ali@70: 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@60: 	    {
ali@60: 		if (pswit[ECHO_SWITCH])
ali@70: 		    g_print("\n%s\n",aline);
ali@60: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		    g_print("    Line %ld column %ld - "
ali@70: 		      "Hyphen at end of line?\n",
ali@70: 		      linecnt,g_utf8_pointer_to_offset(aline,s));
ali@60: 	    }
ali@60: 	}
ali@60:     }
ali@60: }
ali@60: 
ali@60: /*
ali@61:  * check_for_unspaced_bracket:
ali@61:  *
ali@61:  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@61:  * If so, suspect a scanno like "a]most".
ali@61:  */
ali@61: void check_for_unspaced_bracket(const char *aline)
ali@61: {
ali@70:     const char *s;
ali@70:     gunichar c,nc,pc;
ali@70:     c=g_utf8_get_char(aline);
ali@70:     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70:     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@61:     {
ali@70: 	pc=c;
ali@70: 	c=nc;
ali@70: 	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70: 	if (!nc)
ali@70: 	    break;
ali@61: 	/* for each bracket character in the line except 1st & last */
ali@70: 	if (g_utf8_strchr("{[()]}",-1,c) &&
ali@70: 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
ali@61: 	{
ali@61: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@61: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s));
ali@61: 	    else
ali@61: 		cnt_punct++;
ali@61: 	}
ali@61:     }
ali@61: }
ali@61: 
ali@61: /*
ali@62:  * check_for_unpunctuated_endquote:
ali@62:  */
ali@62: void check_for_unpunctuated_endquote(const char *aline)
ali@62: {
ali@70:     const char *s;
ali@70:     gunichar c,nc,pc;
ali@142:     QuoteClass qc;
ali@70:     c=g_utf8_get_char(aline);
ali@70:     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70:     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@62:     {
ali@70: 	pc=c;
ali@70: 	c=nc;
ali@142: 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
ali@70: 	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@62: 	/* for each character in the line except 1st */
ali@147: 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
ali@62: 	{
ali@62: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@62: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column %ld - "
ali@70: 		  "endquote missing punctuation?\n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,s));
ali@62: 	    else
ali@62: 		cnt_punct++;
ali@62: 	}
ali@62:     }
ali@62: }
ali@62: 
ali@62: /*
ali@63:  * check_for_html_tag:
ali@63:  *
ali@63:  * Check for <HTML TAG>.
ali@63:  *
ali@63:  * If there is a < in the line, followed at some point
ali@63:  * by a > then we suspect HTML.
ali@63:  */
ali@63: void check_for_html_tag(const char *aline)
ali@63: {
ali@63:     const char *open,*close;
ali@70:     gchar *tag;
ali@70:     open=strchr(aline,'<');
ali@63:     if (open)
ali@63:     {
ali@70: 	close=strchr(g_utf8_next_char(open),'>');
ali@63: 	if (close)
ali@63: 	{
ali@70: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@70: 	    if (!pswit[OVERVIEW_SWITCH])
ali@63: 	    {
ali@70: 		tag=g_strndup(open,close-open+1);
ali@70: 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
ali@70: 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
ali@70: 		g_free(tag);
ali@63: 	    }
ali@70: 	    else
ali@70: 		cnt_html++;
ali@63: 	}
ali@63:     }
ali@63: }
ali@63: 
ali@63: /*
ali@64:  * check_for_html_entity:
ali@64:  *
ali@64:  * Check for &symbol; HTML.
ali@64:  *
ali@64:  * If there is a & in the line, followed at
ali@64:  * some point by a ; then we suspect HTML.
ali@64:  */
ali@64: void check_for_html_entity(const char *aline)
ali@64: {
ali@64:     const char *s,*amp,*scolon;
ali@70:     gchar *entity;
ali@70:     amp=strchr(aline,'&');
ali@64:     if (amp)
ali@64:     {
ali@70: 	scolon=strchr(amp,';');
ali@64: 	if (scolon)
ali@64: 	{
ali@70: 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
ali@70: 		if (g_utf8_get_char(s)==CHAR_SPACE)
ali@70: 		    break;		/* Don't report "Jones & Son;" */
ali@70: 	    if (s>=scolon)
ali@64: 	    {
ali@64: 		if (pswit[ECHO_SWITCH])
ali@70: 		    g_print("\n%s\n",aline);
ali@64: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		{
ali@70: 		    entity=g_strndup(amp,scolon-amp+1);
ali@70: 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
ali@70: 		      linecnt,(int)(amp-aline)+1,entity);
ali@70: 		    g_free(entity);
ali@70: 		}
ali@64: 		else
ali@64: 		    cnt_html++;
ali@64: 	    }
ali@64: 	}
ali@64:     }
ali@64: }
ali@64: 
ali@65: /*
ali@66:  * check_for_omitted_punctuation:
ali@66:  *
ali@66:  * Check for omitted punctuation at end of paragraph by working back
ali@66:  * through prevline. DW.
ali@66:  * Need to check this only for "normal" paras.
ali@66:  * So what is a "normal" para?
ali@66:  *    Not normal if one-liner (chapter headings, etc.)
ali@66:  *    Not normal if doesn't contain at least one locase letter
ali@66:  *    Not normal if starts with space
ali@66:  */
ali@66: void check_for_omitted_punctuation(const char *prevline,
ali@66:   struct line_properties *last,int start_para_line)
ali@66: {
ali@70:     gboolean letter_on_line=FALSE;
ali@66:     const char *s;
ali@99:     gunichar c;
ali@142:     gboolean closing_quote;
ali@70:     for (s=prevline;*s;s=g_utf8_next_char(s))
ali@70: 	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70: 	{
ali@70: 	    letter_on_line=TRUE;
ali@70: 	    break;
ali@70: 	}
ali@66:     /*
ali@66:      * This next "if" is a problem.
ali@66:      * If we say "start_para_line <= linecnt - 1", that includes
ali@66:      * one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@66:      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@66:      * misses genuine one-line paragraphs.
ali@66:      */
ali@70:     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
ali@70:       g_utf8_get_char(prevline)>CHAR_SPACE)
ali@66:     {
ali@99: 	s=prevline+strlen(prevline);
ali@99: 	do
ali@99: 	{
ali@99: 	    s=g_utf8_prev_char(s);
ali@99: 	    c=g_utf8_get_char(s);
ali@142: 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
ali@142: 		closing_quote=TRUE;
ali@142: 	    else
ali@142: 		closing_quote=FALSE;
ali@142: 	} while (closing_quote && s>prevline);
ali@70: 	for (;s>prevline;s=g_utf8_prev_char(s))
ali@66: 	{
ali@70: 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@66: 	    {
ali@66: 		if (pswit[ECHO_SWITCH])
ali@70: 		    g_print("\n%s\n",prevline);
ali@66: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		    g_print("    Line %ld column %ld - "
ali@66: 		      "No punctuation at para end?\n",
ali@70: 		      linecnt-1,g_utf8_strlen(prevline,-1));
ali@66: 		else
ali@66: 		    cnt_punct++;
ali@66: 		break;
ali@66: 	    }
ali@147: 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
ali@66: 		break;
ali@66: 	}
ali@66:     }
ali@66: }
ali@66: 
ali@69: gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
ali@69: {
ali@69:     const char *word=key;
ali@69:     int *dupcnt=value;
ali@69:     if (*dupcnt)
ali@70: 	g_print("\nNote: Queried word %s was duplicated %d times\n",
ali@69: 	  word,*dupcnt);
ali@69:     return FALSE;
ali@69: }
ali@69: 
ali@70: void print_as_windows_1252(const char *string)
ali@70: {
ali@70:     gsize inbytes,outbytes;
ali@70:     gchar *buf,*bp;
ali@86:     static GIConv converter=(GIConv)-1;
ali@70:     if (!string)
ali@70:     {
ali@70: 	if (converter!=(GIConv)-1)
ali@70: 	    g_iconv_close(converter);
ali@70: 	converter=(GIConv)-1;
ali@70: 	return;
ali@70:     }
ali@86:     if (converter==(GIConv)-1)
ali@70: 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
ali@70:     if (converter!=(GIConv)-1)
ali@70:     {
ali@70: 	inbytes=outbytes=strlen(string);
ali@70: 	bp=buf=g_malloc(outbytes+1);
ali@70: 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
ali@70: 	*bp='\0';
ali@70: 	fputs(buf,stdout);
ali@70: 	g_free(buf);
ali@70:     }
ali@70:     else
ali@70: 	fputs(string,stdout);
ali@70: }
ali@70: 
/*
 * print_as_utf_8:
 *
 * Write the string to stdout exactly as it is, with no conversion.
 */
void print_as_utf_8(const char *string)
{
    fprintf(stdout, "%s", string);
}
ali@72: 
ali@66: /*
ali@41:  * procfile:
ali@41:  *
ali@41:  * Process one file.
ali@41:  */
ali@69: void procfile(const char *filename)
ali@41: {
ali@65:     const char *s;
ali@69:     gchar *parastart=NULL;	/* first line of current para */
ali@69:     gchar *etext,*aline;
ali@69:     gchar *etext_ptr;
ali@69:     GError *err=NULL;
ali@41:     struct first_pass_results *first_pass_results;
ali@42:     struct warnings *warnings;
ali@43:     struct counters counters={0};
ali@45:     struct line_properties last={0};
ali@56:     struct parities parities={0};
ali@69:     struct pending pending={0};
ali@69:     gboolean isemptyline;
ali@68:     long start_para_line=0;
ali@69:     gboolean isnewpara=FALSE,enddash=FALSE;
ali@45:     last.start=CHAR_SPACE;
ali@68:     linecnt=checked_linecnt=0;
ali@69:     etext=read_etext(filename,&err);
ali@69:     if (!etext)
ali@41:     {
ali@68: 	if (pswit[STDOUT_SWITCH])
ali@69: 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
ali@68: 	else
ali@69: 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
ali@41: 	exit(1);
ali@41:     }
ali@70:     g_print("\n\nFile: %s\n\n",filename);
ali@69:     first_pass_results=first_pass(etext);
ali@42:     warnings=report_first_pass(first_pass_results);
ali@69:     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
ali@69:     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@40:     /*
ali@40:      * Here we go with the main pass. Hold onto yer hat!
ali@40:      */
ali@65:     linecnt=0;
ali@69:     etext_ptr=etext;
ali@69:     while ((aline=flgets(&etext_ptr,linecnt+1)))
ali@40:     {
ali@68: 	linecnt++;
ali@68: 	if (linecnt==1)
ali@69: 	    isnewpara=TRUE;
ali@70: 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
ali@40: 	    continue;    // skip DP page separators completely
ali@68: 	if (linecnt<first_pass_results->firstline ||
ali@41: 	  (first_pass_results->footerline>0 &&
ali@41: 	  linecnt>first_pass_results->footerline))
ali@40: 	{
ali@68: 	    if (pswit[HEADER_SWITCH])
ali@40: 	    {
ali@70: 		if (g_str_has_prefix(aline,"Title:"))
ali@70: 		    g_print("    %s\n",aline);
ali@70: 		if (g_str_has_prefix(aline,"Author:"))
ali@70: 		    g_print("    %s\n",aline);
ali@70: 		if (g_str_has_prefix(aline,"Release Date:"))
ali@70: 		    g_print("    %s\n",aline);
ali@70: 		if (g_str_has_prefix(aline,"Edition:"))
ali@70: 		    g_print("    %s\n\n",aline);
ali@40: 	    }
ali@68: 	    continue;		/* skip through the header */
ali@40: 	}
ali@68: 	checked_linecnt++;
ali@65: 	print_pending(aline,parastart,&pending);
ali@164: 	isemptyline=analyse_quotes(aline,&counters);
ali@68: 	if (isnewpara && !isemptyline)
ali@40: 	{
ali@40: 	    /* This line is the start of a new paragraph. */
ali@68: 	    start_para_line=linecnt;
ali@40: 	    /* Capture its first line in case we want to report it later. */
ali@69: 	    g_free(parastart);
ali@69: 	    parastart=g_strdup(aline);
ali@56: 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
ali@68: 	    s=aline;
ali@70: 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
ali@70: 	      !g_unichar_isdigit(g_utf8_get_char(s)))
ali@70: 		s=g_utf8_next_char(s);
ali@70: 	    if (g_unichar_islower(g_utf8_get_char(s)))
ali@40: 	    {
ali@40: 		/* and its first letter is lowercase */
ali@68: 		if (pswit[ECHO_SWITCH])
ali@70: 		    g_print("\n%s\n",aline);
ali@68: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		    g_print("    Line %ld column %ld - "
ali@40: 		      "Paragraph starts with lower-case\n",
ali@70: 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@68: 		else
ali@68: 		    cnt_punct++;
ali@40: 	    }
ali@69: 	    isnewpara=FALSE; /* Signal the end of new para processing. */
ali@40: 	}
ali@68: 	/* Check for an em-dash broken at line end. */
ali@70: 	if (enddash && g_utf8_get_char(aline)=='-')
ali@40: 	{
ali@68: 	    if (pswit[ECHO_SWITCH])
ali@70: 		g_print("\n%s\n",aline);
ali@68: 	    if (!pswit[OVERVIEW_SWITCH])
ali@70: 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@68: 	    else
ali@68: 		cnt_punct++;
ali@40: 	}
ali@69: 	enddash=FALSE;
ali@70: 	for (s=g_utf8_prev_char(aline+strlen(aline));
ali@70: 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
ali@40: 	    ;
ali@70: 	if (s>=aline && g_utf8_get_char(s)=='-')
ali@69: 	    enddash=TRUE;
ali@67: 	check_for_control_characters(aline);
ali@185: 	check_for_odd_characters(aline,warnings,isemptyline);
ali@68: 	if (warnings->longline)
ali@45: 	    check_for_long_line(aline);
ali@68: 	if (warnings->shortline)
ali@45: 	    check_for_short_line(aline,&last);
ali@68: 	last.blen=last.len;
ali@70: 	last.len=g_utf8_strlen(aline,-1);
ali@70: 	last.start=g_utf8_get_char(aline);
ali@46: 	check_for_starting_punctuation(aline);
ali@68: 	if (warnings->dash)
ali@40: 	{
ali@47: 	    check_for_spaced_emdash(aline);
ali@47: 	    check_for_spaced_dash(aline);
ali@40: 	}
ali@48: 	check_for_unmarked_paragraphs(aline);
ali@49: 	check_for_jeebies(aline);
ali@50: 	check_for_mta_from(aline);
ali@51: 	check_for_orphan_character(aline);
ali@52: 	check_for_pling_scanno(aline);
ali@53: 	check_for_extra_period(aline,warnings);
ali@54: 	check_for_following_punctuation(aline);
ali@55: 	check_for_typos(aline,warnings);
ali@56: 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57: 	check_for_double_punctuation(aline,warnings);
ali@58: 	check_for_spaced_quotes(aline);
ali@59: 	check_for_miscased_genative(aline);
ali@60: 	check_end_of_line(aline,warnings);
ali@61: 	check_for_unspaced_bracket(aline);
ali@68: 	if (warnings->endquote)
ali@62: 	    check_for_unpunctuated_endquote(aline);
ali@63: 	check_for_html_tag(aline);
ali@64: 	check_for_html_entity(aline);
ali@68: 	if (isemptyline)
ali@40: 	{
ali@65: 	    check_for_mismatched_quotes(&counters,&pending);
ali@103: 	    counters_reset(&counters);
ali@40: 	    /* let the next iteration know that it's starting a new para */
ali@69: 	    isnewpara=TRUE;
ali@69: 	    if (prevline)
ali@69: 		check_for_omitted_punctuation(prevline,&last,start_para_line);
ali@40: 	}
ali@69: 	g_free(prevline);
ali@69: 	prevline=g_strdup(aline);
ali@0:     }
ali@103:     linecnt++;
ali@103:     check_for_mismatched_quotes(&counters,&pending);
ali@103:     print_pending(NULL,parastart,&pending);
ali@103:     reset_pending(&pending);
ali@69:     if (prevline)
ali@69:     {
ali@69: 	g_free(prevline);
ali@69: 	prevline=NULL;
ali@69:     }
ali@69:     g_free(parastart);
ali@69:     g_free(prevline);
ali@69:     g_free(etext);
ali@79:     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
ali@69: 	g_tree_foreach(qword,report_duplicate_queries,NULL);
ali@69:     g_tree_unref(qword);
ali@69:     g_tree_unref(qperiod);
ali@99:     counters_destroy(&counters);
ali@70:     g_set_print_handler(NULL);
ali@70:     print_as_windows_1252(NULL);
ali@71:     if (pswit[MARKUP_SWITCH])  
ali@71: 	loseentities(NULL);
ali@0: }
ali@0: 
ali@40: /*
ali@40:  * flgets:
ali@40:  *
ali@69:  * Get one line from the input text, checking for
ali@40:  * the existence of exactly one CR/LF line-end per line.
ali@40:  *
ali@40:  * Returns: a pointer to the line.
ali@40:  */
ali@69: char *flgets(char **etext,long lcnt)
ali@0: {
ali@70:     gunichar c;
ali@69:     gboolean isCR=FALSE;
ali@69:     char *theline=*etext;
ali@70:     char *eos=theline;
ali@70:     gchar *s;
ali@70:     for (;;)
ali@40:     {
ali@70: 	c=g_utf8_get_char(*etext);
ali@173: 	if (!c)
ali@173: 	{
ali@173: 	    if (*etext==theline)
ali@173: 		return NULL;
ali@173: 	    else if (pswit[LINE_END_SWITCH])
ali@173: 	    {
ali@173: 		if (pswit[ECHO_SWITCH])
ali@173: 		{
ali@173: 		    s=g_strndup(theline,eos-theline);
ali@173: 		    g_print("\n%s\n",s);
ali@173: 		    g_free(s);
ali@173: 		}
ali@173: 		if (!pswit[OVERVIEW_SWITCH])
ali@173: 		    /* There may, or may not, have been a CR */
ali@173: 		    g_print("    Line %ld - No LF?\n",lcnt);
ali@173: 		else
ali@173: 		    cnt_lineend++;
ali@173: 	    }
ali@173: 	    break;
ali@173: 	}
ali@70: 	*etext=g_utf8_next_char(*etext);
ali@40: 	/* either way, it's end of line */
ali@69: 	if (c=='\n')
ali@40: 	{
ali@68: 	    if (isCR)
ali@68: 		break;
ali@68: 	    else
ali@40: 	    {
ali@40: 		/* Error - a LF without a preceding CR */
ali@68: 		if (pswit[LINE_END_SWITCH])
ali@40: 		{
ali@68: 		    if (pswit[ECHO_SWITCH])
ali@70: 		    {
ali@70: 			s=g_strndup(theline,eos-theline);
ali@70: 			g_print("\n%s\n",s);
ali@70: 			g_free(s);
ali@70: 		    }
ali@68: 		    if (!pswit[OVERVIEW_SWITCH])
ali@70: 			g_print("    Line %ld - No CR?\n",lcnt);
ali@68: 		    else
ali@68: 			cnt_lineend++;
ali@40: 		}
ali@68: 		break;
ali@40: 	    }
ali@40: 	}
ali@69: 	if (c=='\r')
ali@40: 	{
ali@68: 	    if (isCR)
ali@40: 	    {
ali@40: 		/* Error - two successive CRs */
ali@68: 		if (pswit[LINE_END_SWITCH])
ali@40: 		{
ali@68: 		    if (pswit[ECHO_SWITCH])
ali@70: 		    {
ali@70: 			s=g_strndup(theline,eos-theline);
ali@70: 			g_print("\n%s\n",s);
ali@70: 			g_free(s);
ali@70: 		    }
ali@68: 		    if (!pswit[OVERVIEW_SWITCH])
ali@70: 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
ali@68: 		    else
ali@68: 			cnt_lineend++;
ali@40: 		}
ali@40: 	    }
ali@69: 	    isCR=TRUE;
ali@40: 	}
ali@68: 	else
ali@40: 	{
ali@68: 	    if (pswit[LINE_END_SWITCH] && isCR)
ali@40: 	    {
ali@68: 		if (pswit[ECHO_SWITCH])
ali@70: 		{
ali@70: 		    s=g_strndup(theline,eos-theline);
ali@70: 		    g_print("\n%s\n",s);
ali@70: 		    g_free(s);
ali@70: 		}
ali@68: 		if (!pswit[OVERVIEW_SWITCH])
ali@70: 		    g_print("    Line %ld column %ld - CR without LF?\n",
ali@70: 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
ali@68: 		else
ali@68: 		    cnt_lineend++;
ali@70: 		*eos=' ';
ali@40: 	    }
ali@69: 	    isCR=FALSE;
ali@70: 	    eos=g_utf8_next_char(eos);
ali@40: 	}
ali@69:     }
ali@70:     *eos='\0';
ali@0:     if (pswit[MARKUP_SWITCH])  
ali@68: 	postprocess_for_HTML(theline);
ali@0:     if (pswit[DP_SWITCH])  
ali@68: 	postprocess_for_DP(theline);
ali@40:     return theline;
ali@0: }
ali@0: 
ali@40: /*
ali@40:  * mixdigit:
ali@40:  *
ali@40:  * Takes a "word" as a parameter, and checks whether it
ali@40:  * contains a mixture of alpha and digits. Generally, this is an
ali@40:  * error, but may not be for cases like 4th or L5 12s. 3d.
ali@40:  *
ali@70:  * Returns: TRUE iff an is error found.
ali@40:  */
ali@70: gboolean mixdigit(const char *checkword)
ali@0: {
ali@70:     gboolean wehaveadigit,wehavealetter,query;
ali@70:     const char *s,*nondigit;
ali@70:     wehaveadigit=wehavealetter=query=FALSE;
ali@70:     for (s=checkword;*s;s=g_utf8_next_char(s))
ali@70: 	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70: 	    wehavealetter=TRUE;
ali@70: 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
ali@70: 	    wehaveadigit=TRUE;
ali@40:     if (wehaveadigit && wehavealetter)
ali@40:     {
ali@40: 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
ali@70: 	query=TRUE;
ali@70: 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
ali@70: 	  nondigit=g_utf8_next_char(nondigit))
ali@68: 	    ;
ali@68: 	/* digits, ending in st, rd, nd, th of either case */
ali@70: 	if (!g_ascii_strcasecmp(nondigit,"st") ||
ali@70: 	  !g_ascii_strcasecmp(nondigit,"rd") ||
ali@70: 	  !g_ascii_strcasecmp(nondigit,"nd") ||
ali@70: 	  !g_ascii_strcasecmp(nondigit,"th"))
ali@70: 	    query=FALSE;
ali@70: 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
ali@70: 	  !g_ascii_strcasecmp(nondigit,"rds") ||
ali@70: 	  !g_ascii_strcasecmp(nondigit,"nds") ||
ali@70: 	  !g_ascii_strcasecmp(nondigit,"ths"))
ali@70: 	    query=FALSE;
ali@70: 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
ali@70: 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
ali@70: 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
ali@70: 	  !g_ascii_strcasecmp(nondigit,"thly"))
ali@70: 	    query=FALSE;
ali@68: 	/* digits, ending in l, L, s or d */
ali@70: 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
ali@70: 	  !strcmp(nondigit,"d"))
ali@70: 	    query=FALSE;
ali@68: 	/*
ali@40: 	 * L at the start of a number, representing Britsh pounds, like L500.
ali@70: 	 * This is cute. We know the current word is mixed digit. If the first
ali@68: 	 * letter is L, there must be at least one digit following. If both
ali@68: 	 * digits and letters follow, we have a genuine error, else we have a
ali@68: 	 * capital L followed by digits, and we accept that as a non-error.
ali@40: 	 */
ali@70: 	if (g_utf8_get_char(checkword)=='L' &&
ali@70: 	  !mixdigit(g_utf8_next_char(checkword)))
ali@70: 	    query=FALSE;
ali@40:     }
ali@40:     return query;
ali@0: }
ali@0: 
ali@40: /*
ali@40:  * getaword:
ali@40:  *
ali@69:  * Extracts the first/next "word" from the line, and returns it.
ali@69:  * A word is defined as one English word unit--or at least that's the aim.
ali@69:  * "ptr" is advanced to the position in the line where we will start
ali@69:  * looking for the next word.
ali@40:  *
ali@69:  * Returns: A newly-allocated string.
ali@40:  */
ali@69: gchar *getaword(const char **ptr)
ali@0: {
ali@70:     const char *s,*t;
ali@69:     GString *word;
ali@70:     gunichar c,pc;
ali@69:     word=g_string_new(NULL);
ali@70:     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
ali@70:       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
ali@70:       **ptr;*ptr=g_utf8_next_char(*ptr))
ali@174:     {
ali@174: 	/* Handle exceptions for footnote markers like [1] */
ali@174: 	if (g_utf8_get_char(*ptr)=='[')
ali@174: 	{
ali@174: 	    g_string_append_c(word,'[');
ali@174: 	    s=g_utf8_next_char(*ptr);
ali@174: 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
ali@174: 		g_string_append_unichar(word,g_utf8_get_char(s));
ali@174: 	    if (g_utf8_get_char(s)==']')
ali@174: 	    {
ali@174: 		g_string_append_c(word,']');
ali@174: 		*ptr=g_utf8_next_char(s);
ali@174: 		return g_string_free(word,FALSE);
ali@174: 	    }
ali@174: 	    else
ali@174: 		g_string_truncate(word,0);
ali@174: 	}
ali@174:     }
ali@40:     /*
ali@40:      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40:      * Especially yucky is the case of L1,000
ali@40:      * This section looks for a pattern of characters including a digit
ali@40:      * followed by a comma or period followed by one or more digits.
ali@40:      * If found, it returns this whole pattern as a word; otherwise we discard
ali@40:      * the results and resume our normal programming.
ali@40:      */
ali@69:     s=*ptr;
ali@70:     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
ali@70:       g_unichar_isalpha(g_utf8_get_char(s)) ||
ali@70:       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
ali@70: 	g_string_append_unichar(word,g_utf8_get_char(s));
ali@82:     if (word->len)
ali@40:     {
ali@82: 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
ali@40: 	{
ali@82: 	    c=g_utf8_get_char(t);
ali@82: 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@82: 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
ali@82: 	    {
ali@82: 		*ptr=s;
ali@82: 		return g_string_free(word,FALSE);
ali@82: 	    }
ali@40: 	}
ali@40:     }
ali@0:     /* we didn't find a punctuated number - do the regular getword thing */
ali@69:     g_string_truncate(word,0);
ali@99:     c=g_utf8_get_char(*ptr);
ali@99:     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
ali@99:       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
ali@99: 	g_string_append_unichar(word,c);
ali@69:     return g_string_free(word,FALSE);
ali@0: }
ali@0: 
ali@40: /*
ali@40:  * isroman:
ali@40:  *
ali@40:  * Is this word a Roman Numeral?
ali@40:  *
ali@40:  * It doesn't actually validate that the number is a valid Roman Numeral--for
ali@40:  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
ali@40:  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
ali@40:  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
ali@40:  * expressions thereof, except when it came to taxes. Allow any number of M,
ali@40:  * an optional D, an optional CM or CD, any number of optional Cs, an optional
ali@40:  * XL or an optional XC, an optional IX or IV, an optional V and any number
ali@40:  * of optional Is.
ali@40:  */
ali@69: gboolean isroman(const char *t)
ali@0: {
ali@69:     const char *s;
ali@40:     if (!t || !*t)
ali@69: 	return FALSE;
ali@40:     s=t;
ali@70:     while (g_utf8_get_char(t)=='m' && *t)
ali@40: 	t++;
ali@70:     if (g_utf8_get_char(t)=='d')
ali@40: 	t++;
ali@70:     if (g_str_has_prefix(t,"cm"))
ali@40: 	t+=2;
ali@70:     if (g_str_has_prefix(t,"cd"))
ali@40: 	t+=2;
ali@70:     while (g_utf8_get_char(t)=='c' && *t)
ali@40: 	t++;
ali@70:     if (g_str_has_prefix(t,"xl"))
ali@40: 	t+=2;
ali@70:     if (g_str_has_prefix(t,"xc"))
ali@40: 	t+=2;
ali@70:     if (g_utf8_get_char(t)=='l')
ali@40: 	t++;
ali@70:     while (g_utf8_get_char(t)=='x' && *t)
ali@40: 	t++;
ali@70:     if (g_str_has_prefix(t,"ix"))
ali@40: 	t+=2;
ali@70:     if (g_str_has_prefix(t,"iv"))
ali@40: 	t+=2;
ali@70:     if (g_utf8_get_char(t)=='v')
ali@40: 	t++;
ali@70:     while (g_utf8_get_char(t)=='i' && *t)
ali@40: 	t++;
ali@40:     return !*t;
ali@0: }
ali@0: 
ali@40: /*
ali@40:  * postprocess_for_DP:
ali@40:  *
ali@40:  * Invoked with the -d switch from flgets().
ali@40:  * It simply "removes" from the line a hard-coded set of common
ali@40:  * DP-specific tags, so that the line passed to the main routine has
ali@40:  * been pre-cleaned of DP markup.
ali@40:  */
ali@0: void postprocess_for_DP(char *theline)
ali@0: {
ali@40:     char *s,*t;
ali@0:     int i;
ali@0:     if (!*theline) 
ali@68: 	return;
ali@40:     for (i=0;*DPmarkup[i];i++)
ali@70: 	while ((s=strstr(theline,DPmarkup[i])))
ali@40: 	{
ali@68: 	    t=s+strlen(DPmarkup[i]);
ali@70: 	    memmove(s,t,strlen(t)+1);
ali@40: 	}
ali@0: }
ali@0: 
/*
 * postprocess_for_HTML:
 *
 * Called from flgets() when MARKUP_SWITCH is set.
 * Keeps calling losemarkup() on the line until it reports that it
 * removed nothing, then has loseentities() replace the HTML entities,
 * so that the checks see the line without that HTML.
 */
void postprocess_for_HTML(char *theline)
{
    char *removed;

    do {
        removed = losemarkup(theline);
    } while (removed);
    loseentities(theline);
}
ali@0: 
ali@0: char *losemarkup(char *theline)
ali@0: {
ali@40:     char *s,*t;
ali@0:     int i;
ali@70:     s=strchr(theline,'<');
ali@70:     t=s?strchr(s,'>'):NULL;
ali@40:     if (!s || !t)
ali@40: 	return NULL;
ali@40:     for (i=0;*markup[i];i++)
ali@70: 	if (tagcomp(g_utf8_next_char(s),markup[i]))
ali@40: 	{
ali@70: 	    t=g_utf8_next_char(t);
ali@70: 	    memmove(s,t,strlen(t)+1);
ali@70: 	    return s;
ali@68: 	}
ali@40:     /* It's an unrecognized <xxx>. */
ali@40:     return NULL;
ali@0: }
ali@0: 
ali@71: void loseentities(char *theline)
ali@0: {
ali@0:     int i;
ali@71:     gsize nb;
ali@71:     char *amp,*scolon;
ali@71:     gchar *s,*t;
ali@71:     gunichar c;
ali@71:     GTree *entities=NULL;
ali@86:     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
ali@71:     if (!theline)
ali@40:     {
ali@71: 	if (entities)
ali@71: 	    g_tree_destroy(entities);
ali@71: 	entities=NULL;
ali@86: 	if (translit!=(GIConv)-1)
ali@71: 	    g_iconv_close(translit);
ali@71: 	translit=(GIConv)-1;
ali@86: 	if (to_utf8!=(GIConv)-1)
ali@71: 	    g_iconv_close(to_utf8);
ali@71: 	to_utf8=(GIConv)-1;
ali@71: 	return;
ali@71:     }
ali@71:     if (!*theline)
ali@71: 	return;
ali@71:     if (!entities)
ali@71:     {
ali@71: 	entities=g_tree_new((GCompareFunc)strcmp);
ali@71: 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
ali@71: 	    g_tree_insert(entities,HTMLentities[i].name,
ali@71: 	      GUINT_TO_POINTER(HTMLentities[i].c));
ali@71:     }
ali@71:     if (translit==(GIConv)-1)
ali@71: 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
ali@71:     if (to_utf8==(GIConv)-1)
ali@71: 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
ali@71:     while((amp=strchr(theline,'&')))
ali@71:     {
ali@71: 	scolon=strchr(amp,';');
ali@71: 	if (scolon)
ali@40: 	{
ali@71: 	    if (amp[1]=='#')
ali@71: 	    {
ali@71: 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
ali@71: 		    c=strtol(amp+2,NULL,10);
ali@71: 		else if (amp[2]=='x' &&
ali@71: 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
ali@71: 		    c=strtol(amp+3,NULL,16);
ali@71: 	    }
ali@71: 	    else
ali@71: 	    {
ali@71: 		s=g_strndup(amp+1,scolon-(amp+1));
ali@71: 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
ali@71: 		g_free(s);
ali@71: 	    }
ali@40: 	}
ali@71: 	else
ali@71: 	    c=0;
ali@71: 	if (c)
ali@71: 	{
ali@71: 	    theline=amp;
ali@71: 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
ali@71: 		theline+=g_unichar_to_utf8(c,theline);
ali@71: 	    else
ali@71: 	    {
ali@71: 		s=g_malloc(6);
ali@71: 		nb=g_unichar_to_utf8(c,s);
ali@71: 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
ali@71: 		g_free(s);
ali@71: 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
ali@71: 		g_free(t);
ali@71: 		memcpy(theline,s,nb);
ali@71: 		g_free(s);
ali@71: 		theline+=nb;
ali@71: 	    }
ali@71: 	    memmove(theline,g_utf8_next_char(scolon),
ali@71: 	      strlen(g_utf8_next_char(scolon))+1);
ali@71: 	}
ali@71: 	else
ali@71: 	    theline=g_utf8_next_char(amp);
ali@40:     }
ali@0: }
ali@0: 
ali@70: gboolean tagcomp(const char *strin,const char *basetag)
ali@0: {
ali@70:     gboolean retval;
ali@70:     gchar *s,*t;
ali@70:     if (g_utf8_get_char(strin)=='/')
ali@70: 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
ali@70:     else
ali@70: 	t=g_utf8_casefold(strin,-1);
ali@70:     s=g_utf8_casefold(basetag,-1);
ali@70:     retval=g_str_has_prefix(t,s);
ali@70:     g_free(s);
ali@70:     g_free(t);
ali@70:     return retval;
ali@0: }
ali@0: 
ali@69: void proghelp(GOptionContext *context)
ali@0: {
ali@69:     gchar *help;
ali@40:     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40:     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
ali@40:     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
ali@40:     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40:       "For details, read the file COPYING.\n",stderr);
ali@40:     fputs("This is Free Software; "
ali@40:       "you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40:     fputs("read the file COPYING for details.\n\n",stderr);
ali@69:     help=g_option_context_get_help(context,TRUE,NULL);
ali@69:     fputs(help,stderr);
ali@69:     g_free(help);
ali@69:     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
ali@40:     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40:       "non-ASCII\n",stderr);
ali@40:     fputs("characters like accented letters, "
ali@40:       "lines longer than 75 or shorter than 55,\n",stderr);
ali@40:     fputs("unbalanced quotes or brackets, "
ali@40:       "a variety of badly formatted punctuation, \n",stderr);
ali@40:     fputs("HTML tags, some likely typos. "
ali@40:       "It is NOT a substitute for human judgement.\n",stderr);
ali@0:     fputs("\n",stderr);
ali@0: }