/*************************************************************************/
/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
/*									 */
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
/*									 */
/* This program is free software; you can redistribute it and/or modify  */
/* it under the terms of the GNU General Public License as published by  */
/* the Free Software Foundation; either version 2 of the License, or     */
/* (at your option) any later version.					 */
/*									 */
/* This program is distributed in the hope that it will be useful,       */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
/* GNU General Public License for more details.				 */
/*									 */
/* You should have received a copy of the GNU General Public License	 */
/* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
/*************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <glib.h>
#include <bl/bl.h>

/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably holds the previously processed line -- confirm against the
 * code that uses it.
 */
gchar *prevline;

/*
 * Common typos.
 *
 * Every entry is lower case. The final empty string presumably serves as
 * the end-of-list marker for whatever code walks this table.
 */
char *typo[] = {
    /* Mostly keyboard slips: transposed, dropped or doubled letters. */
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor",

    /*
     * From here on the entries mostly look like OCR misreadings
     * (b for h, h for b, rn for m, and the like).
     */
    "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se",

    ""
};

/*
 * User-defined typos. Built by read_user_scannos() (called from main()
 * when USERTYPO_SWITCH is set): each key is one line of the typo file and
 * every value is GINT_TO_POINTER(1). Stays NULL when no typo file was
 * loaded; main() unrefs it on exit if it is set.
 */
GTree *usertypo;

/*
 * Common abbreviations and other OK words not to query as typos.
 * The list ends with an empty string.
 */
char *okword[] = {
    /* titles, units and other abbreviations */
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr",
    "hmm", "h'm", "hmmm", "rd", "sh", "br", "pp", "hm",
    "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    /* ordinary words */
    "pompeii", "hawaii", "hawaiian",
    "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten",
    ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 * The list ends with an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid",
    "ed", "al", "etc", "op", "cit", "deg",
    "min", "chap", "oz", "mme", "mlle", "mssrs",
    ""
};

/*
 * Two-letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list ends with an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb",
    "tl", "tn", "rn", "lt", "tj",
    ""
};

/*
 * Two-letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list ends with an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh",
    "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy",
    ""
};

/*
 * HTML element names (lower case, without angle brackets), in alphabetical
 * order. The list ends with an empty string.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br",
    "center", "col", "div", "em", "font",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "head", "hr", "html", "i", "img", "li", "meta",
    "ol", "p", "pre", "small", "span", "strong", "sub", "sup",
    "table", "td", "tfoot", "thead", "title", "tr", "tt",
    "u", "ul",
    ""
};

/*
 * Markup strings specific to DP (see the "dp" option: "Ignore DP-specific
 * markup"). The list ends with an empty string.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>",
    "/*", "*/",
    "/#", "#/",
    "/$", "$/",
    "<tb>",
    ""
};

/*
 * NOTE(review): judging by the name, words that should not be followed by
 * a comma -- confirm against the code that consults this table.
 * "mrs" is listed twice. The list ends with an empty string.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's",
    "its", "whose", "every", "i'll", "your", "my",
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd", "pp",
    "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "i'm", "during", "let", "toward", "among",
    ""
};

/*
 * NOTE(review): judging by the name, words that should not be followed by
 * a period -- confirm against the code that consults this table.
 * The list ends with an empty string.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my",
    "or", "and", "but", "as", "if", "the", "its", "it's",
    "until", "than", "whether", "i'll", "whose", "who", "because", "when",
    "let", "till", "very", "an", "among", "those", "into", "whom",
    "having", "thence",
    ""
};

/*
 * Lower-case vowels: the five plain ones followed by accented forms.
 * NOTE(review): the bytes stored for the accented characters depend on the
 * encoding of this source file -- confirm it matches the encoding of the
 * texts being checked.
 */
char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";

/*
 * HTML character entities. Each row gives the named form, the numeric
 * form and a plain-text replacement. "&deg;" and "&pound;" each appear
 * twice (once near the top, once in numeric order). The table is closed by
 * a row whose first two members are empty strings and whose textent is
 * NULL.
 */
struct {
    char *htmlent;	/* named entity, e.g. "&amp;" */
    char *htmlnum;	/* numeric entity, e.g. "&#38;" */
    char *textent;	/* plain-text replacement */
} entities[] = {
    { "&amp;",    "&#38;",   "&" },         /* ampersand */
    { "&lt;",     "&#60;",   "<" },         /* less-than sign */
    { "&gt;",     "&#62;",   ">" },         /* greater-than sign */
    { "&deg;",    "&#176;",  " degrees" },  /* degree sign */
    { "&pound;",  "&#163;",  "L" },         /* pound sign */
    { "&quot;",   "&#34;",   "\"" },        /* quotation mark = APL quote */
    { "&OElig;",  "&#338;",  "OE" },        /* latin capital ligature OE */
    { "&oelig;",  "&#339;",  "oe" },        /* latin small ligature oe */
    { "&Scaron;", "&#352;",  "S" },         /* latin capital letter S with caron */
    { "&scaron;", "&#353;",  "s" },         /* latin small letter s with caron */
    { "&Yuml;",   "&#376;",  "Y" },         /* latin capital letter Y with diaeresis */
    { "&circ;",   "&#710;",  "" },          /* modifier letter circumflex accent */
    { "&tilde;",  "&#732;",  "~" },         /* small tilde, U+02DC ISOdia */
    { "&ensp;",   "&#8194;", " " },         /* en space, U+2002 ISOpub */
    { "&emsp;",   "&#8195;", " " },         /* em space, U+2003 ISOpub */
    { "&thinsp;", "&#8201;", " " },         /* thin space, U+2009 ISOpub */
    { "&ndash;",  "&#8211;", "-" },         /* en dash, U+2013 ISOpub */
    { "&mdash;",  "&#8212;", "--" },        /* em dash, U+2014 ISOpub */
    { "&rsquo;",  "&#8217;", "'" },         /* right single quotation mark */
    { "&sbquo;",  "&#8218;", "'" },         /* single low-9 quotation mark */
    { "&ldquo;",  "&#8220;", "\"" },        /* left double quotation mark */
    { "&rdquo;",  "&#8221;", "\"" },        /* right double quotation mark */
    { "&bdquo;",  "&#8222;", "\"" },        /* double low-9 quotation mark */
    { "&lsaquo;", "&#8249;", "\"" },        /* single left-pointing angle quotation mark */
    { "&rsaquo;", "&#8250;", "\"" },        /* single right-pointing angle quotation mark */
    { "&nbsp;",   "&#160;",  " " },         /* no-break space = non-breaking space, */
    { "&iexcl;",  "&#161;",  "!" },         /* inverted exclamation mark */
    { "&cent;",   "&#162;",  "c" },         /* cent sign */
    { "&pound;",  "&#163;",  "L" },         /* pound sign */
    { "&curren;", "&#164;",  "$" },         /* currency sign */
    { "&yen;",    "&#165;",  "Y" },         /* yen sign = yuan sign */
    { "&sect;",   "&#167;",  "--" },        /* section sign */
    { "&uml;",    "&#168;",  " " },         /* diaeresis = spacing diaeresis */
    { "&copy;",   "&#169;",  "(C) " },      /* copyright sign */
    { "&ordf;",   "&#170;",  " " },         /* feminine ordinal indicator */
    { "&laquo;",  "&#171;",  "\"" },        /* left-pointing double angle quotation mark */
    { "&shy;",    "&#173;",  "-" },         /* soft hyphen = discretionary hyphen */
    { "&reg;",    "&#174;",  "(R) " },      /* registered sign = registered trade mark sign */
    { "&macr;",   "&#175;",  " " },         /* macron = spacing macron = overline */
    { "&deg;",    "&#176;",  " degrees" },  /* degree sign */
    { "&plusmn;", "&#177;",  "+-" },        /* plus-minus sign = plus-or-minus sign */
    { "&sup2;",   "&#178;",  "2" },         /* superscript two = superscript digit two */
    { "&sup3;",   "&#179;",  "3" },         /* superscript three = superscript digit three */
    { "&acute;",  "&#180;",  " " },         /* acute accent = spacing acute */
    { "&micro;",  "&#181;",  "m" },         /* micro sign */
    { "&para;",   "&#182;",  "--" },        /* pilcrow sign = paragraph sign */
    { "&cedil;",  "&#184;",  " " },         /* cedilla = spacing cedilla */
    { "&sup1;",   "&#185;",  "1" },         /* superscript one = superscript digit one */
    { "&ordm;",   "&#186;",  " " },         /* masculine ordinal indicator */
    { "&raquo;",  "&#187;",  "\"" },        /* right-pointing double angle quotation mark */
    { "&frac14;", "&#188;",  "1/4" },       /* vulgar fraction one quarter */
    { "&frac12;", "&#189;",  "1/2" },       /* vulgar fraction one half */
    { "&frac34;", "&#190;",  "3/4" },       /* vulgar fraction three quarters */
    { "&iquest;", "&#191;",  "?" },         /* inverted question mark */
    { "&Agrave;", "&#192;",  "A" },         /* latin capital letter A with grave */
    { "&Aacute;", "&#193;",  "A" },         /* latin capital letter A with acute */
    { "&Acirc;",  "&#194;",  "A" },         /* latin capital letter A with circumflex */
    { "&Atilde;", "&#195;",  "A" },         /* latin capital letter A with tilde */
    { "&Auml;",   "&#196;",  "A" },         /* latin capital letter A with diaeresis */
    { "&Aring;",  "&#197;",  "A" },         /* latin capital letter A with ring above */
    { "&AElig;",  "&#198;",  "AE" },        /* latin capital letter AE */
    { "&Ccedil;", "&#199;",  "C" },         /* latin capital letter C with cedilla */
    { "&Egrave;", "&#200;",  "E" },         /* latin capital letter E with grave */
    { "&Eacute;", "&#201;",  "E" },         /* latin capital letter E with acute */
    { "&Ecirc;",  "&#202;",  "E" },         /* latin capital letter E with circumflex */
    { "&Euml;",   "&#203;",  "E" },         /* latin capital letter E with diaeresis */
    { "&Igrave;", "&#204;",  "I" },         /* latin capital letter I with grave */
    { "&Iacute;", "&#205;",  "I" },         /* latin capital letter I with acute */
    { "&Icirc;",  "&#206;",  "I" },         /* latin capital letter I with circumflex */
    { "&Iuml;",   "&#207;",  "I" },         /* latin capital letter I with diaeresis */
    { "&ETH;",    "&#208;",  "E" },         /* latin capital letter ETH */
    { "&Ntilde;", "&#209;",  "N" },         /* latin capital letter N with tilde */
    { "&Ograve;", "&#210;",  "O" },         /* latin capital letter O with grave */
    { "&Oacute;", "&#211;",  "O" },         /* latin capital letter O with acute */
    { "&Ocirc;",  "&#212;",  "O" },         /* latin capital letter O with circumflex */
    { "&Otilde;", "&#213;",  "O" },         /* latin capital letter O with tilde */
    { "&Ouml;",   "&#214;",  "O" },         /* latin capital letter O with diaeresis */
    { "&times;",  "&#215;",  "*" },         /* multiplication sign */
    { "&Oslash;", "&#216;",  "O" },         /* latin capital letter O with stroke */
    { "&Ugrave;", "&#217;",  "U" },         /* latin capital letter U with grave */
    { "&Uacute;", "&#218;",  "U" },         /* latin capital letter U with acute */
    { "&Ucirc;",  "&#219;",  "U" },         /* latin capital letter U with circumflex */
    { "&Uuml;",   "&#220;",  "U" },         /* latin capital letter U with diaeresis */
    { "&Yacute;", "&#221;",  "Y" },         /* latin capital letter Y with acute */
    { "&THORN;",  "&#222;",  "TH" },        /* latin capital letter THORN */
    { "&szlig;",  "&#223;",  "sz" },        /* latin small letter sharp s = ess-zed */
    { "&agrave;", "&#224;",  "a" },         /* latin small letter a with grave */
    { "&aacute;", "&#225;",  "a" },         /* latin small letter a with acute */
    { "&acirc;",  "&#226;",  "a" },         /* latin small letter a with circumflex */
    { "&atilde;", "&#227;",  "a" },         /* latin small letter a with tilde */
    { "&auml;",   "&#228;",  "a" },         /* latin small letter a with diaeresis */
    { "&aring;",  "&#229;",  "a" },         /* latin small letter a with ring above */
    { "&aelig;",  "&#230;",  "ae" },        /* latin small letter ae */
    { "&ccedil;", "&#231;",  "c" },         /* latin small letter c with cedilla */
    { "&egrave;", "&#232;",  "e" },         /* latin small letter e with grave */
    { "&eacute;", "&#233;",  "e" },         /* latin small letter e with acute */
    { "&ecirc;",  "&#234;",  "e" },         /* latin small letter e with circumflex */
    { "&euml;",   "&#235;",  "e" },         /* latin small letter e with diaeresis */
    { "&igrave;", "&#236;",  "i" },         /* latin small letter i with grave */
    { "&iacute;", "&#237;",  "i" },         /* latin small letter i with acute */
    { "&icirc;",  "&#238;",  "i" },         /* latin small letter i with circumflex */
    { "&iuml;",   "&#239;",  "i" },         /* latin small letter i with diaeresis */
    { "&eth;",    "&#240;",  "eth" },       /* latin small letter eth */
    { "&ntilde;", "&#241;",  "n" },         /* latin small letter n with tilde */
    { "&ograve;", "&#242;",  "o" },         /* latin small letter o with grave */
    { "&oacute;", "&#243;",  "o" },         /* latin small letter o with acute */
    { "&ocirc;",  "&#244;",  "o" },         /* latin small letter o with circumflex */
    { "&otilde;", "&#245;",  "o" },         /* latin small letter o with tilde */
    { "&ouml;",   "&#246;",  "o" },         /* latin small letter o with diaeresis */
    { "&divide;", "&#247;",  "/" },         /* division sign */
    { "&oslash;", "&#248;",  "o" },         /* latin small letter o with stroke */
    { "&ugrave;", "&#249;",  "u" },         /* latin small letter u with grave */
    { "&uacute;", "&#250;",  "u" },         /* latin small letter u with acute */
    { "&ucirc;",  "&#251;",  "u" },         /* latin small letter u with circumflex */
    { "&uuml;",   "&#252;",  "u" },         /* latin small letter u with diaeresis */
    { "&yacute;", "&#253;",  "y" },         /* latin small letter y with acute */
    { "&thorn;",  "&#254;",  "th" },        /* latin small letter thorn */
    { "&yuml;",   "&#255;",  "y" },         /* latin small letter y with diaeresis */
    { "", "" }                              /* end of table; textent is NULL */
};

/*
 * Special characters, written as character literals. On an ASCII execution
 * character set these have the same values as the numeric codes noted
 * alongside.
 */
#define CHAR_SPACE         ' '   /* 32 */
#define CHAR_TAB           '\t'  /* 9 */
#define CHAR_LF            '\n'  /* 10 */
#define CHAR_CR            '\r'  /* 13 */
#define CHAR_DQUOTE        '"'   /* 34 */
#define CHAR_SQUOTE        '\''  /* 39 */
#define CHAR_OPEN_SQUOTE   '`'   /* 96 */
#define CHAR_TILDE         '~'   /* 126 */
#define CHAR_ASTERISK      '*'   /* 42 */
#define CHAR_FORESLASH     '/'   /* 47 */
#define CHAR_CARAT         '^'   /* 94 */

#define CHAR_UNDERSCORE    '_'
#define CHAR_OPEN_CBRACK   '{'
#define CHAR_CLOSE_CBRACK  '}'
#define CHAR_OPEN_RBRACK   '('
#define CHAR_CLOSE_RBRACK  ')'
#define CHAR_OPEN_SBRACK   '['
#define CHAR_CLOSE_SBRACK  ']'

/* Line lengths (in characters) regarded as normal, long and short for PG. */
#define SHORTEST_PG_LINE  55
#define LONGEST_PG_LINE   75
#define WAY_TOO_LONG      80

/*
 * Indices into pswit[]. The notes give the command-line option that is
 * bound to each switch in options[].
 */
enum {
    ECHO_SWITCH = 0,	/* --noecho: echo queried lines (inverted after parsing) */
    SQUOTE_SWITCH,	/* --squote: check single quotes */
    TYPO_SWITCH,	/* --typo: check common typos */
    QPARA_SWITCH,	/* --qpara: require closure of quotes on every paragraph */
    PARANOID_SWITCH,	/* --relaxed: paranoid querying (inverted after parsing) */
    LINE_END_SWITCH,	/* --line-end: line end checking (inverted after parsing) */
    OVERVIEW_SWITCH,	/* --overview: just show counts */
    STDOUT_SWITCH,	/* --stdout: output errors to stdout instead of stderr */
    HEADER_SWITCH,	/* --header: echo header fields */
    WEB_SWITCH,		/* --web: defaults for use on www upload */
    VERBOSE_SWITCH,	/* --verbose: list everything */
    MARKUP_SWITCH,	/* --markup: ignore markup in < > */
    USERTYPO_SWITCH,	/* --usertypo: use file of user-defined typos */
    DP_SWITCH,		/* --dp: ignore DP-specific markup */
    SWITNO		/* number of switches; keep last */
};

/* Program switches, indexed by the *_SWITCH enumerators. */
gboolean pswit[SWITNO];

/*
 * Command-line options. Every option is a plain flag that sets its
 * pswit[] slot; parse_options() afterwards inverts the switches that are
 * "on by default". The order here is the order used by --help.
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, &pswit[DP_SWITCH],
      "Ignore DP-specific markup", NULL },
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, &pswit[ECHO_SWITCH],
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, &pswit[SQUOTE_SWITCH],
      "Check single quotes", NULL },
    { "typo", 't', 0, G_OPTION_ARG_NONE, &pswit[TYPO_SWITCH],
      "Check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, &pswit[QPARA_SWITCH],
      "Require closure of quotes on every paragraph", NULL },
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, &pswit[PARANOID_SWITCH],
      "Disable paranoid querying of everything", NULL },
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, &pswit[LINE_END_SWITCH],
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, &pswit[OVERVIEW_SWITCH],
      "Overview: just show counts", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, &pswit[STDOUT_SWITCH],
      "Output errors to stdout instead of stderr", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, &pswit[HEADER_SWITCH],
      "Echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, &pswit[MARKUP_SWITCH],
      "Ignore markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, &pswit[USERTYPO_SWITCH],
      "Use file of user-defined typos", NULL },
    { "web", 'w', 0, G_OPTION_ARG_NONE, &pswit[WEB_SWITCH],
      "Defaults for use on www upload", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, &pswit[VERBOSE_SWITCH],
      "Verbose - list everything", NULL },
    /* all-zero entry terminates the list */
    { NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL }
};

/*
 * Query counts. Those in the first group are printed by main() in
 * overview mode and summed into the "TOTAL QUERIES" figure.
 */
long cnt_dquot;		/* doublequote queries */
long cnt_squot;		/* singlequote queries */
long cnt_brack;		/* brackets queries */
long cnt_bin;		/* non-ASCII queries */
long cnt_odd;		/* odd character queries */
long cnt_long;		/* long line errors */
long cnt_short;		/* short line queries */
long cnt_punct;		/* punctuation and spacing queries */
long cnt_dash;		/* dash-related queries */
long cnt_word;		/* word queries */
long cnt_html;		/* html queries */
long cnt_lineend;	/* line-end queries */

/* Line statistics. */
long cnt_spacend;	/* lines with space at end */
long linecnt;		/* total lines in the file */
long checked_linecnt;	/* lines actually checked */

/* Directory part of argv[0]; set and freed by main(). */
gchar *running_from;

/*
 * NOTE(review): these trees are not used in the visible part of the file;
 * confirm their purpose against the code that fills them.
 */
GTree *qword,*qperiod;

/* Called from parse_options() and main() respectively. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/* Word and character helpers. */
gchar *getaword(const char **);
int mixdigit(const char *);
gboolean isroman(const char *);
gboolean gcisalpha(unsigned char);
gboolean gcisdigit(unsigned char);
gboolean gcisletter(unsigned char);

/* Line input. */
char *flgets(char **,long);

/* Markup and entity handling. */
void postprocess_for_HTML(char *);
void postprocess_for_DP(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
char *loseentities(char *);
int tagcomp(const char *,const char *);

/*
 * Statistics gathered by first_pass(). Apart from firstline and
 * footerline, all figures cover only the lines seen before the footer.
 */
struct first_pass_results {
    long firstline;		/* line number following the PG header line */
    long astline;		/* lines with '*' and a lower-case ASCII letter */
    long footerline;		/* line number of the "end ... project gutenberg" line */
    long totlen;		/* total length of the lines */
    long binlen;		/* bytes above 127 */
    long alphalen;		/* characters accepted by gcisalpha() */
    long endquote_count;	/* double quotes directly after a letter */
    long shortline;		/* short lines */
    long dotcomma;		/* lines containing ".," */
    long fslashline;		/* lines containing '/' */
    long hyphens;		/* lines ending in a single hyphen */
    long longline;		/* lines longer than LONGEST_PG_LINE */
    long verylongline;		/* lines longer than WAY_TOO_LONG */
    long htmcount;		/* score for lines that look like HTML */
    long standalone_digit;	/* words that are just "0" or "1" */
    long spacedash;		/* lines with " -" not followed by '-' */
    long emdash;		/* lines containing "--" after column 1 */
    long space_emdash;		/* ... with a space on either side */
    long non_PG_space_emdash;	/* ... with spaces on both sides */
    long PG_space_emdash;	/* ... with no space on either side */
    int Dutchcount;		/* occurrences of "hij" or "niet" */
    int Frenchcount;		/* occurrences of "dans" or "avec" */
};

/*
 * Reporting decisions made by report_first_pass(). For the flags it is
 * seen to set (shortline, longline, dotcomma, ast, fslash, endquote), 1
 * means "report each occurrence" and 0 means "too many; don't report".
 * NOTE(review): the remaining members are set outside the visible part of
 * the file -- presumably with the same convention; confirm.
 */
struct warnings {
    int shortline;
    int longline;
    int bin;
    int dash;
    int dotcomma;
    int ast;
    int fslash;
    int digit;
    int hyphen;
    int endquote;
    gboolean isDutch;
    gboolean isFrench;
};

/*
 * NOTE(review): used outside the visible part of the file; judging by the
 * member names these are running counts of quotes, underscores and
 * brackets -- confirm against the code that updates them.
 */
struct counters {
    long quot;
    int c_unders;
    int c_brack;
    int s_brack;
    int r_brack;
    int open_single_quote;
    int close_single_quote;
};

/*
 * NOTE(review): used outside the visible part of the file; presumably the
 * length and first character of a line, as tracked by lastlen, lastblen
 * and laststart in first_pass() -- confirm.
 */
struct line_properties {
    unsigned int len;
    unsigned int blen;
    char start;
};

/*
 * NOTE(review): used outside the visible part of the file; presumably the
 * open/closed state of double and single quotes -- confirm.
 */
struct parities {
    int dquote;
    int squote;
};

/*
 * NOTE(review): used outside the visible part of the file; presumably
 * messages held back about unmatched quotes, brackets and underscores --
 * confirm, including who owns the strings.
 */
struct pending {
    char *dquote;
    char *squote;
    char *rbrack;
    char *sbrack;
    char *cbrack;
    char *unders;
    long squot;
};

/*
 * parse_options:
 *
 * Parse the command line into pswit[] using the options[] table, then
 * turn the raw flags into effective settings. On return the recognised
 * options have been removed from argv by the GOption parser. Exits with
 * status 1 if parsing fails or if no argument is left after the options.
 */
void parse_options(int *argc,char ***argv)
{
    /* Settings forced by --web, whatever else was given. */
    static const struct {
        int sw;
        int value;
    } web_overrides[]={
        { ECHO_SWITCH,     TRUE  },
        { SQUOTE_SWITCH,   FALSE },
        { TYPO_SWITCH,     TRUE  },
        { QPARA_SWITCH,    FALSE },
        { PARANOID_SWITCH, TRUE  },
        { LINE_END_SWITCH, FALSE },
        { OVERVIEW_SWITCH, FALSE },
        { STDOUT_SWITCH,   FALSE },
        { HEADER_SWITCH,   TRUE  },
        { VERBOSE_SWITCH,  FALSE },
        { MARKUP_SWITCH,   FALSE },
        { USERTYPO_SWITCH, FALSE },
        { DP_SWITCH,       FALSE }
    };
    GOptionContext *ctx;
    GError *parse_err=NULL;
    unsigned int k;

    ctx=g_option_context_new(
      "file - looks for errors in Project Gutenberg(TM) etexts");
    g_option_context_add_main_entries(ctx,options,NULL);
    if (!g_option_context_parse(ctx,argc,argv,&parse_err))
    {
        g_printerr("Bookloupe: %s\n",parse_err->message);
        g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
        exit(1);
    }

    /*
     * Paranoid checking and line-end checking are turned OFF, not on,
     * by their switches.
     */
    pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
    pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
    /* In paranoid mode typo checks default to enabled, so -t disables them. */
    if (pswit[PARANOID_SWITCH])
        pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
    /*
     * Echoing is likewise turned OFF by its switch, and overview mode
     * (just print summary) never echoes.
     */
    pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH] && !pswit[OVERVIEW_SWITCH];

    /*
     * Web uploads - for the moment, this is really just a placeholder
     * until we decide what processing we really want to do on web uploads
     */
    if (pswit[WEB_SWITCH])
    {
        for (k=0;k<sizeof web_overrides/sizeof web_overrides[0];k++)
            pswit[web_overrides[k].sw]=web_overrides[k].value;
    }

    /* Nothing left to use as the file argument: show help and give up. */
    if (*argc<2)
    {
        proghelp(ctx);
        exit(1);
    }
    g_option_context_free(ctx);
}

/*
 * read_user_scannos:
 *
 * Read in the user-defined stealth scanno list.
 */
void read_user_scannos(void)
{
    GError *err=NULL;
    gchar *usertypo_file;
    gboolean okay;
    int i;
    gsize len;
    gchar *contents,**lines;
    usertypo_file=g_strdup("bookloupe.typ");
    okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
    {
	g_clear_error(&err);
	g_free(usertypo_file);
	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
    }
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
    {
	g_clear_error(&err);
	g_free(usertypo_file);
	usertypo_file=g_strdup("gutcheck.typ");
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
    }
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
    {
	g_clear_error(&err);
	g_free(usertypo_file);
	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
    }
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
    {
	g_free(usertypo_file);
	printf("   --> I couldn't find bookloupe.typ "
	  "-- proceeding without user typos.\n");
	return;
    }
    else if (!okay)
    {
	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
	g_free(usertypo_file);
	g_clear_error(&err);
	exit(1);
    }
    lines=g_strsplit(contents,"\n",0);
    usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
    for (i=0;lines[i];i++)
	if (*(unsigned char *)lines[i]>'!')
	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
	else
	    g_free(lines[i]);
    g_free(lines);
}

#if 0
/*
 * read_etext:
 *
 * Read an etext returning an array of lines. Lines are normally expected
 * to be terminated by CR LF. Solitary LFs delimit lines but are left
 * embedded at the end of the line for further processing. Solitary CRs
 * do not delimit lines.
 */
/* Disabled variant (inside #if 0): returns a NULL-terminated line array. */
gchar **read_etext(const char *filename,GError **err)
{
    int i;
    const char *s,*t;
    gchar *contents;
    gchar **raw_lines;
    GPtrArray *lines;
    gsize len;
    if (!g_file_get_contents(filename,&contents,&len,err))
	return NULL;
    /* First cut at every CR LF pair... */
    raw_lines=g_strsplit(contents,"\r\n",0);
    /* NOTE(review): contents is never freed in this variant. */
    lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
    for (i=0;raw_lines[i];i++)
    {
	/* ...then cut each piece again at any solitary LF it contains. */
	t=strchr(raw_lines[i],'\n');
	if (t)
	{
	    s=raw_lines[i];
	    while ((t=strchr(s,'\n')))
	    {
		/* t-s+1 keeps the LF at the end of the copied line */
		g_ptr_array_add(lines,g_strndup(s,t-s+1));
		s=t+1;
	    }
	    /* whatever follows the last LF (possibly an empty string) */
	    g_ptr_array_add(lines,g_strdup(s));
	    g_free(raw_lines[i]);
	}
	else
	    /* no LF inside: the piece is moved into the result, not copied */
	    g_ptr_array_add(lines,raw_lines[i]);
    }
    /* The strings were moved or freed above; release only the vector. */
    g_free(raw_lines);
    g_ptr_array_add(lines,NULL);
    /* FALSE: free the GPtrArray wrapper but hand back its pointer data. */
    return (gchar **)g_ptr_array_free(lines,FALSE);
}
#else
/*
 * read_etext:
 *
 * Load the whole of the named file into memory.
 *
 * Returns a newly allocated string holding the file contents, or NULL on
 * error, in which case err is set by g_file_get_contents().
 */
gchar *read_etext(const char *filename,GError **err)
{
    gchar *text=NULL;
    /* The length is not needed, so NULL is passed for it. */
    return g_file_get_contents(filename,&text,NULL,err)?text:NULL;
}
#endif

/*
 * main:
 *
 * Parse the options, optionally load the user typo list, process the
 * file named by the first remaining argument and, in overview mode,
 * print the query counts. Always returns 0.
 */
int main(int argc,char **argv)
{
    long total_queries;
    running_from=g_path_get_dirname(argv[0]);
    parse_options(&argc,&argv);
    if (pswit[USERTYPO_SWITCH])
    {
        read_user_scannos();
    }
    fprintf(stderr,"bookloupe: Check and report on an e-text\n");
    /* parse_options() has already checked that argv[1] exists. */
    procfile(argv[1]);
    if (pswit[OVERVIEW_SWITCH])
    {
        printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
          checked_linecnt,linecnt,linecnt-checked_linecnt);
        printf("    --------------- Queries found --------------\n");
        /* Only the categories that actually occurred are listed. */
        if (cnt_long)
        {
            printf("    Long lines:		    %14ld\n",cnt_long);
        }
        if (cnt_short)
        {
            printf("    Short lines:		   %14ld\n",cnt_short);
        }
        if (cnt_lineend)
        {
            printf("    Line-end problems:	     %14ld\n",cnt_lineend);
        }
        if (cnt_word)
        {
            printf("    Common typos:		  %14ld\n",cnt_word);
        }
        if (cnt_dquot)
        {
            printf("    Unmatched quotes:	      %14ld\n",cnt_dquot);
        }
        if (cnt_squot)
        {
            printf("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
        }
        if (cnt_brack)
        {
            printf("    Unmatched brackets:	    %14ld\n",cnt_brack);
        }
        if (cnt_bin)
        {
            printf("    Non-ASCII characters:	  %14ld\n",cnt_bin);
        }
        if (cnt_odd)
        {
            printf("    Proofing characters:	   %14ld\n",cnt_odd);
        }
        if (cnt_punct)
        {
            printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);
        }
        if (cnt_dash)
        {
            printf("    Non-standard dashes:	   %14ld\n",cnt_dash);
        }
        if (cnt_html)
        {
            printf("    Possible HTML tags:	    %14ld\n",cnt_html);
        }
        printf("\n");
        total_queries=cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
          cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend;
        printf("    TOTAL QUERIES		  %14ld\n",total_queries);
    }
    g_free(running_from);
    /* usertypo is only set if a typo file was loaded. */
    if (usertypo)
    {
        g_tree_unref(usertypo);
    }
    return 0;
}

/*
 * first_pass:
 *
 * Run a first pass - verify that it's a valid PG
 * file, decide whether to report some things that
 * occur many times in the text like long or short
 * lines, non-standard dashes, etc.
 */
struct first_pass_results *first_pass(const char *etext)
{
    char laststart=CHAR_SPACE;
    const char *s;
    gchar *lc_line;
    int i,j,llen;
    gchar **lines;
    unsigned int lastlen=0,lastblen=0;
    long spline=0,nspline=0;
    static struct first_pass_results results={0};
    gchar *inword;
    lines=g_strsplit(etext,"\n",0);
    for (j=0;lines[j];j++)
    {
	llen=strlen(lines[j]);
	while(lines[j][llen-1]=='\r')
	    lines[j][llen--]='\0';
	linecnt++;
	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
	{
	    if (spline)
		printf("   --> Duplicate header?\n");
	    spline=linecnt+1;   /* first line of non-header text, that is */
	}
	if (!strncmp(lines[j],"*** START",9) &&
	  strstr(lines[j],"PROJECT GUTENBERG"))
	{
	    if (nspline)
		printf("   --> Duplicate header?\n");
	    nspline=linecnt+1;   /* first line of non-header text, that is */
	}
	if (spline || nspline)
	{
	    lc_line=g_ascii_strdown(lines[j],llen);
	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
	    {
		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
		{
		    if (results.footerline)
		    {
			/* it's an old-form header - we can detect duplicates */
			if (!nspline)
			    printf("   --> Duplicate footer?\n");
		    }
		    else
			results.footerline=linecnt;
		}
	    }
	    g_free(lc_line);
	}
	if (spline)
	    results.firstline=spline;
	if (nspline)
	    results.firstline=nspline;  /* override with new */
	if (results.footerline)
	    continue;    /* don't count the boilerplate in the footer */
	results.totlen+=llen;
	for (i=0;i<llen;i++)
	{
	    if ((unsigned char)lines[j][i]>127)
		results.binlen++;
	    if (gcisalpha(lines[j][i]))
		results.alphalen++;
	    if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))
		results.endquote_count++;
	}
	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
	    results.shortline++;
	if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
	    cnt_spacend++;
	if (strstr(lines[j],".,"))
	    results.dotcomma++;
	/* only count ast lines for ignoring purposes where there is */
	/* locase text on the line */
	if (strchr(lines[j],'*'))
	{
	    for (s=lines[j];*s;s++)
		if (*s>='a' && *s<='z')
		    break;
	     if (*s)
		results.astline++;
	}
	if (strchr(lines[j],'/'))
	    results.fslashline++;
	for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
	    ;
	if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')
	    results.hyphens++;
	if (llen>LONGEST_PG_LINE)
	    results.longline++;
	if (llen>WAY_TOO_LONG)
	    results.verylongline++;
	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
	{
	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
	    if (i>0)
		results.htmcount++;
	    if (strstr(lines[j],"<i>"))
		results.htmcount+=4; /* bonus marks! */
	}
	/* Check for spaced em-dashes */
	if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
	{
	    results.emdash++;
	    if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))
		results.space_emdash++;
	    if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
		/* count of em-dashes with spaces both sides */
		results.non_PG_space_emdash++;
	    if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
		/* count of PG-type em-dashes with no spaces */
		results.PG_space_emdash++;
	}
	for (s=lines[j];*s;)
	{
	    inword=getaword(&s);
	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
		results.Dutchcount++;
	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
		results.Frenchcount++;
	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
		results.standalone_digit++;
	    g_free(inword);
	}
	/* Check for spaced dashes */
	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
	    results.spacedash++;
	lastblen=lastlen;
	lastlen=llen;
	laststart=lines[j][0];
    }
    g_strfreev(lines);
    return &results;
}

/*
 * report_first_pass:
 *
 * Make some snap decisions based on the first pass results.
 */
struct warnings *report_first_pass(struct first_pass_results *results)
{
    static struct warnings warnings={0};
    if (cnt_spacend>0)
	printf("   --> %ld lines in this file have white space at end\n",
	  cnt_spacend);
    warnings.dotcomma=1;
    if (results->dotcomma>5)
    {
	warnings.dotcomma=0;
	printf("   --> %ld lines in this file contain '.,'. "
	  "Not reporting them.\n",results->dotcomma);
    }
    /*
     * If more than 50 lines, or one-tenth, are short,
     * don't bother reporting them.
     */
    warnings.shortline=1;
    if (results->shortline>50 || results->shortline*10>linecnt)
    {
	warnings.shortline=0;
	printf("   --> %ld lines in this file are short. "
	  "Not reporting short lines.\n",results->shortline);
    }
    /*
     * If more than 50 lines, or one-tenth, are long,
     * don't bother reporting them.
     */
    warnings.longline=1;
    if (results->longline>50 || results->longline*10>linecnt)
    {
	warnings.longline=0;
	printf("   --> %ld lines in this file are long. "
	  "Not reporting long lines.\n",results->longline);
    }
    /* If more than 10 lines contain asterisks, don't bother reporting them. */
    warnings.ast=1;
    if (results->astline>10)
    {
	warnings.ast=0;
	printf("   --> %ld lines in this file contain asterisks. "
	  "Not reporting them.\n",results->astline);
    }
    /*
     * If more than 10 lines contain forward slashes,
     * don't bother reporting them.
     */
    warnings.fslash=1;
    if (results->fslashline>10)
    {
	warnings.fslash=0;
	printf("   --> %ld lines in this file contain forward slashes. "
	  "Not reporting them.\n",results->fslashline);
    }
    /*
     * If more than 20 lines contain unpunctuated endquotes,
     * don't bother reporting them.
     */
    warnings.endquote=1;
    if (results->endquote_count>20)
    {
	warnings.endquote=0;
	printf("   --> %ld lines in this file contain unpunctuated endquotes. "
	  "Not reporting them.\n",results->endquote_count);
    }
    /*
     * If more than 15 lines contain standalone digits,
     * don't bother reporting them.
     */
    warnings.digit=1;
    if (results->standalone_digit>10)
    {
	warnings.digit=0;
	printf("   --> %ld lines in this file contain standalone 0s and 1s. "
	  "Not reporting them.\n",results->standalone_digit);
    }
    /*
     * If more than 20 lines contain hyphens at end,
     * don't bother reporting them.
     */
    warnings.hyphen=1;
    if (results->hyphens>20)
    {
	warnings.hyphen=0;
	printf("   --> %ld lines in this file have hyphens at end. "
	  "Not reporting them.\n",results->hyphens);
    }
    if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
    {
	printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
	pswit[MARKUP_SWITCH]=1;
    }
    if (results->verylongline>0)
	printf("   --> %ld lines in this file are VERY long!\n",
	  results->verylongline);
    /*
     * If there are more non-PG spaced dashes than PG em-dashes,
     * assume it's deliberate.
     * Current PG guidelines say don't use them, but older texts do,
     * and some people insist on them whatever the guidelines say.
     */
    warnings.dash=1;
    if (results->spacedash+results->non_PG_space_emdash>
      results->PG_space_emdash)
    {
	warnings.dash=0;
	printf("   --> There are %ld spaced dashes and em-dashes. "
	  "Not reporting them.\n",
	  results->spacedash+results->non_PG_space_emdash);
    }
    /* If more than a quarter of characters are hi-bit, bug out. */
    warnings.bin=1;
    if (results->binlen*4>results->totlen)
    {
	printf("   --> This file does not appear to be ASCII. "
	  "Terminating. Best of luck with it!\n");
	exit(1);
    }
    if (results->alphalen*4<results->totlen)
    {
	printf("   --> This file does not appear to be text. "
	  "Terminating. Best of luck with it!\n");
	exit(1);
    }
    if (results->binlen*100>results->totlen || results->binlen>100)
    {
	printf("   --> There are a lot of foreign letters here. "
	  "Not reporting them.\n");
	warnings.bin=0;
    }
    warnings.isDutch=FALSE;
    if (results->Dutchcount>50)
    {
	warnings.isDutch=TRUE;
	printf("   --> This looks like Dutch - "
	  "switching off dashes and warnings for 's Middags case.\n");
    }
    warnings.isFrench=FALSE;
    if (results->Frenchcount>50)
    {
	warnings.isFrench=TRUE;
	printf("   --> This looks like French - "
	  "switching off some doublepunct.\n");
    }
    if (results->firstline && results->footerline)
	printf("    The PG header and footer appear to be already on.\n");
    else
    {
	if (results->firstline)
	    printf("    The PG header is on - no footer.\n");
	if (results->footerline)
	    printf("    The PG footer is on - no header.\n");
    }
    printf("\n");
    if (pswit[VERBOSE_SWITCH])
    {
	warnings.bin=1;
	warnings.shortline=1;
	warnings.dotcomma=1;
	warnings.longline=1;
	warnings.dash=1;
	warnings.digit=1;
	warnings.ast=1;
	warnings.fslash=1;
	warnings.hyphen=1;
	warnings.endquote=1;
	printf("   *** Verbose output is ON -- you asked for it! ***\n");
    }
    if (warnings.isDutch)
	warnings.dash=0;
    if (results->footerline>0 && results->firstline>0 &&
      results->footerline>results->firstline &&
      results->footerline-results->firstline<100)
    {
	printf("   --> I don't really know where this text starts. \n");
	printf("       There are no reference points.\n");
	printf("       I'm going to have to report the header and footer "
	  "as well.\n");
	results->firstline=0;
    }
    return &warnings;
}

/*
 * analyse_quotes:
 *
 * Look along the line, accumulate the count of quotes, and see
 * if this is an empty line - i.e. a line with nothing on it
 * but spaces.
 * If line has just spaces, period, * and/or - on it (CR and LF are
 * ignored too), don't count it, since empty lines with asterisks or
 * dashes to separate sections are common.
 *
 * The double quote, single quote, underscore and bracket tallies in
 * counters are updated in place; brackets are counted up on opening
 * and down on closing.
 *
 * Returns: TRUE if the line is empty.
 */
gboolean analyse_quotes(const char *aline,struct counters *counters)
{
    int guessquote=0;
    /* assume the line is empty until proven otherwise */
    gboolean isemptyline=TRUE;
    const char *s=aline;
    while (*s)
    {
	if (*s==CHAR_DQUOTE)
	    counters->quot++;
	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
	{
	    if (s==aline)
	    {
		/*
		 * At start of line, it can only be an openquote.
		 * Hardcode a very common exception!
		 * The word begins directly after the quote, so compare
		 * from s+1 (as the mid-line case below does); s+2
		 * skipped the 't' and could point past the terminator
		 * of a one-character line.
		 */
		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
		    counters->open_single_quote++;
	    }
	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
		/* Do nothing! it's definitely an apostrophe, not a quote */
		;
	    /* it's outside a word - let's check it out */
	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
	    {
		/* it damwell better BE an openquote */
		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
		    /* hardcode a very common exception! */
		    counters->open_single_quote++;
	    }
	    else
	    {
		/* now - is it a closequote? */
		guessquote=0;   /* accumulate clues */
		if (gcisalpha(s[-1]))
		{
		    /* it follows a letter - could be either */
		    guessquote++;
		    if (s[-1]=='s')
		    {
			/* looks like a plural apostrophe */
			guessquote-=3;
			if (s[1]==CHAR_SPACE)  /* bonus marks! */
			    guessquote-=2;
		    }
		}
		/* it doesn't have a letter either side */
		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
		    guessquote+=8; /* looks like a closequote */
		else
		    guessquote++;
		if (counters->open_single_quote>counters->close_single_quote)
		    /*
		     * Give it the benefit of some doubt,
		     * if a squote is already open.
		     */
		    guessquote++;
		else
		    guessquote--;
		/* a non-negative score is counted as a closequote */
		if (guessquote>=0)
		    counters->close_single_quote++;
	    }
	}
	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
	  *s!=13 && *s!=10)
	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
	if (*s==CHAR_UNDERSCORE)
	    counters->c_unders++;
	if (*s==CHAR_OPEN_CBRACK)
	    counters->c_brack++;
	if (*s==CHAR_CLOSE_CBRACK)
	    counters->c_brack--;
	if (*s==CHAR_OPEN_RBRACK)
	    counters->r_brack++;
	if (*s==CHAR_CLOSE_RBRACK)
	    counters->r_brack--;
	if (*s==CHAR_OPEN_SBRACK)
	    counters->s_brack++;
	if (*s==CHAR_CLOSE_SBRACK)
	    counters->s_brack--;
	s++;
    }
    return isemptyline;
}

/*
 * check_for_control_characters:
 *
 * Flag every character below space on the line, other than
 * line feed, carriage return and tab.
 * In overview mode the hit is only counted; otherwise the line
 * number, column and character code are printed.
 */
void check_for_control_characters(const char *aline)
{
    const unsigned char *p;
    for (p=(const unsigned char *)aline;*p;p++)
    {
	/* printable, or one of the tolerated control characters */
	if (*p>=CHAR_SPACE || *p==CHAR_LF || *p==CHAR_CR || *p==CHAR_TAB)
	    continue;
	if (pswit[ECHO_SWITCH])
	    printf("\n%s\n",aline);
	if (pswit[OVERVIEW_SWITCH])
	    cnt_bin++;
	else
	    printf("    Line %ld column %d - Control character %d\n",
	      linecnt,(int)((const char *)p-aline)+1,*p);
    }
}

/*
 * check_for_odd_characters:
 *
 * Query binary and other unusual characters: non-ASCII and control
 * characters, tabs, tildes, carats, forward slashes and (in paranoid
 * mode only) asterisks. Each kind is reported at most once per line.
 * Forward slash and asterisk reports also depend on the matching
 * switch in warnings; asterisks are never reported on an empty line.
 */
void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  gboolean isemptyline)
{
    /* One flag per kind, so a line is not queried twice for the same thing. */
    gboolean seen_bin=FALSE,seen_tab=FALSE,seen_tilde=FALSE;
    gboolean seen_carat=FALSE,seen_fslash=FALSE,seen_ast=FALSE;
    const char *p;
    unsigned char uc;
    int column;
    for (p=aline;*p;p++)
    {
	uc=(unsigned char)*p;
	column=(int)(p-aline)+1;
	if (!seen_bin && (uc>127 || (*p<CHAR_SPACE && *p!=9 && *p!='\n')))
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (pswit[OVERVIEW_SWITCH])
		cnt_bin++;
	    else if (uc>127 && uc<160)
		printf("    Line %ld column %d - "
		  "Non-ISO-8859 character %d\n",linecnt,column,uc);
	    else
		printf("    Line %ld column %d - Non-ASCII character %d\n",
		  linecnt,column,uc);
	    seen_bin=TRUE;
	}
	if (*p==CHAR_TAB && !seen_tab)
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (pswit[OVERVIEW_SWITCH])
		cnt_odd++;
	    else
		printf("    Line %ld column %d - Tab character?\n",
		  linecnt,column);
	    seen_tab=TRUE;
	}
	if (*p==CHAR_TILDE && !seen_tilde)
	{
	    /* OCR software often emits a tilde for a character it can't read. */
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (pswit[OVERVIEW_SWITCH])
		cnt_odd++;
	    else
		printf("    Line %ld column %d - Tilde character?\n",
		  linecnt,column);
	    seen_tilde=TRUE;
	}
	if (*p==CHAR_CARAT && !seen_carat)
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (pswit[OVERVIEW_SWITCH])
		cnt_odd++;
	    else
		printf("    Line %ld column %d - Carat character?\n",
		  linecnt,column);
	    seen_carat=TRUE;
	}
	if (*p==CHAR_FORESLASH && !seen_fslash && warnings->fslash)
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (pswit[OVERVIEW_SWITCH])
		cnt_odd++;
	    else
		printf("    Line %ld column %d - Forward slash?\n",
		  linecnt,column);
	    seen_fslash=TRUE;
	}
	/* Asterisks are often deliberate, so only paranoid mode asks. */
	if (*p==CHAR_ASTERISK && !seen_ast && !isemptyline &&
	  pswit[PARANOID_SWITCH] && warnings->ast)
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (pswit[OVERVIEW_SWITCH])
		cnt_odd++;
	    else
		printf("    Line %ld column %d - Asterisk?\n",
		  linecnt,column);
	    seen_ast=TRUE;
	}
    }
}

/*
 * check_for_long_line:
 *
 * Query a line longer than LONGEST_PG_LINE. The line length is
 * reported both as the column and as the length.
 */
void check_for_long_line(const char *aline)
{
    size_t len=strlen(aline);
    if (len<=LONGEST_PG_LINE)
	return;
    if (pswit[ECHO_SWITCH])
	printf("\n%s\n",aline);
    if (pswit[OVERVIEW_SWITCH])
	cnt_long++;
    else
	printf("    Line %ld column %d - Long line %d\n",
	  linecnt,(int)len,(int)len);
}

/*
 * check_for_short_line:
 *
 * Check for line too short.
 *
 * This one is a bit trickier to implement: we don't want to
 * flag the last line of a paragraph for being short, so we
 * have to wait until we know that our current line is a
 * "normal" line, then report the _previous_ line if it was too
 * short. We also don't want to report indented lines like
 * chapter heads or formatted quotations. We therefore keep
 * last->len as the length of the last line examined, and
 * last->blen as the length of the last but one, and try to
 * suppress unnecessary warnings by checking that both were of
 * "normal" length. We keep the first character of the last
 * line in last->start, and if it was a space, we assume that
 * the formatting is deliberate. I can't figure out a way to
 * distinguish something like a quoted verse left-aligned or
 * the header or footer of a letter from a paragraph of short
 * lines - maybe if I examined the whole paragraph, and if the
 * para has less than, say, 8 lines and if all lines are short,
 * then just assume it's OK? Need to look at some texts to see
 * how often a formula like this would get the right result.
 */
void check_for_short_line(const char *aline,const struct line_properties *last)
{
    int plen;
    /* The current line has to be a real line before we judge the last one. */
    if (strlen(aline)<=1)
	return;
    /* The previous line must be non-trivial but under the minimum length... */
    if (last->len<=1 || last->len>=SHORTEST_PG_LINE)
	return;
    /* ...and the one before it must have been over the minimum. */
    if (last->blen<=1 || last->blen<=SHORTEST_PG_LINE)
	return;
    /* A leading space on the previous line suggests deliberate layout. */
    if (last->start==CHAR_SPACE)
	return;
    /* The query is about the previous line, hence prevline and linecnt-1. */
    if (pswit[ECHO_SWITCH])
	printf("\n%s\n",prevline);
    if (pswit[OVERVIEW_SWITCH])
	cnt_short++;
    else
    {
	plen=(int)strlen(prevline);
	printf("    Line %ld column %d - Short line %d?\n",
	  linecnt-1,plen,plen);
    }
}

/*
 * check_for_starting_punctuation:
 *
 * Query a line whose first character is one of . ? ! , ; :
 * unless the line opens with a spaced ellipsis ". . .".
 */
void check_for_starting_punctuation(const char *aline)
{
    switch (*aline)
    {
	case '.':
	case '?':
	case '!':
	case ',':
	case ';':
	case ':':
	    break;
	default:
	    /* includes the empty line */
	    return;
    }
    if (!strncmp(aline,". . .",5))
	return;
    if (pswit[ECHO_SWITCH])
	printf("\n%s\n",aline);
    if (pswit[OVERVIEW_SWITCH])
	cnt_punct++;
    else
	printf("    Line %ld column 1 - Begins with punctuation?\n",
	  linecnt);
}

/*
 * check_for_spaced_emdash:
 *
 * Query every "--" on the line that has a space directly before
 * or directly after it.
 *
 * All occurrences are examined, not just the first: an acceptable
 * em-dash early in the line may be followed by a spaced one.
 */
void check_for_spaced_emdash(const char *aline)
{
    const char *dash;
    for (dash=strstr(aline,"--");dash;dash=strstr(dash+2,"--"))
    {
	/* neither side is a space - nothing to say about this one */
	if (!(dash>aline && dash[-1]==CHAR_SPACE) && dash[2]!=CHAR_SPACE)
	    continue;
	if (pswit[ECHO_SWITCH])
	    printf("\n%s\n",aline);
	if (pswit[OVERVIEW_SWITCH])
	    cnt_dash++;
	else
	    printf("    Line %ld column %d - Spaced em-dash?\n",
	      linecnt,(int)(dash-aline)+1);
    }
}

/*
 * check_for_spaced_dash:
 *
 * Check for spaced dashes.
 */
void check_for_spaced_dash(const char *aline)
{
    const char *s;
    if ((s=strstr(aline," -")))
    {
	if (s[2]!='-')
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf("    Line %ld column %d - Spaced dash?\n",
		  linecnt,(int)(s-aline)+1);
	    else
		cnt_dash++;
	}
    }
    else if ((s=strstr(aline,"- ")))
    {
	if (s==aline || s[-1]!='-')
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf("    Line %ld column %d - Spaced dash?\n",
		  linecnt,(int)(s-aline)+1);
	    else
		cnt_dash++;
	}
    }
}

/*
 * check_for_unmarked_paragraphs:
 *
 * Query a closing double quote followed, after one or two spaces,
 * by another double quote: a change of speaker usually deserves a
 * paragraph break.
 *
 * This may well be a false positive, as in
 * "Bravo!" "Wonderful!" called the crowd.
 * but it is useful all the same.
 */
void check_for_unmarked_paragraphs(const char *aline)
{
    /* the two-space form is looked for first */
    const char *gap=strstr(aline,"\"  \"");
    if (!gap)
	gap=strstr(aline,"\" \"");
    if (!gap)
	return;
    if (pswit[ECHO_SWITCH])
	printf("\n%s\n",aline);
    if (pswit[OVERVIEW_SWITCH])
	cnt_punct++;
    else
	printf("    Line %ld column %d - Query missing paragraph break?\n",
	  linecnt,(int)(gap-aline)+1);
}

/*
 * Return the match for the first phrase, in list order, of the
 * NULL-terminated array phrases that occurs in aline, or NULL.
 */
static const char *jeebies_first_match(const char *aline,
  const char *const *phrases)
{
    const char *hit;
    for (;*phrases;phrases++)
    {
	hit=strstr(aline,*phrases);
	if (hit)
	    return hit;
    }
    return NULL;
}

/*
 * check_for_jeebies:
 *
 * Check for "to he" and other easy h/b errors.
 *
 * This is a very inadequate effort on the h/b problem,
 * but the phrase "to he" is always an error, whereas "to
 * be" is quite common.
 * Similarly, '"Quiet!", be said.' is a non-be error
 * "to he" is _not_ always an error!:
 *       "Where they went to he couldn't say."
 * Another false positive:
 *       What would "Cinderella" be without the . . .
 * and another: "If he wants to he can see for himself."
 *
 * Three families are looked for in turn (he/be, had/bad, hut/but);
 * each produces at most one query per line, positioned at the first
 * phrase of its list that is found.
 */
void check_for_jeebies(const char *aline)
{
    static const char *const hebe[]={
	" be could "," be would "," was be "," be is "," is be ",
	"\", be ","\" be "," to he ",NULL
    };
    static const char *const hadbad[]={
	" the had "," a had "," they bad "," she bad "," he bad ",
	" you bad "," i bad ",NULL
    };
    static const char *const hutbut[]={
	"; hut ",", hut ",NULL
    };
    const char *hit;
    hit=jeebies_first_match(aline,hebe);
    if (hit)
    {
	if (pswit[ECHO_SWITCH])
	    printf("\n%s\n",aline);
	if (pswit[OVERVIEW_SWITCH])
	    cnt_word++;
	else
	    printf("    Line %ld column %d - Query he/be error?\n",
	      linecnt,(int)(hit-aline)+1);
    }
    hit=jeebies_first_match(aline,hadbad);
    if (hit)
    {
	if (pswit[ECHO_SWITCH])
	    printf("\n%s\n",aline);
	if (pswit[OVERVIEW_SWITCH])
	    cnt_word++;
	else
	    printf("    Line %ld column %d - Query had/bad error?\n",
	      linecnt,(int)(hit-aline)+1);
    }
    hit=jeebies_first_match(aline,hutbut);
    if (hit)
    {
	if (pswit[ECHO_SWITCH])
	    printf("\n%s\n",aline);
	if (pswit[OVERVIEW_SWITCH])
	    cnt_word++;
	else
	    printf("    Line %ld column %d - Query hut/but error?\n",
	      linecnt,(int)(hit-aline)+1);
    }
}

/*
 * check_for_mta_from:
 *
 * Special case - query ">From" anywhere on the line: the angled
 * bracket is what an MTA puts in front of "From" when sending
 * an e-mail.
 */
void check_for_mta_from(const char *aline)
{
    const char *hit=strstr(aline,">From");
    if (!hit)
	return;
    if (pswit[ECHO_SWITCH])
	printf("\n%s\n",aline);
    if (pswit[OVERVIEW_SWITCH])
	cnt_punct++;
    else
	printf("    Line %ld column %d - Query angled bracket with From\n",
	  linecnt,(int)(hit-aline)+1);
}

/*
 * check_for_orphan_character:
 *
 * Query a line consisting of exactly one character - often an
 * overflow from bad wrapping. A lone digit or a lone I, V, X or L
 * is taken to be a numeral and left alone.
 */
void check_for_orphan_character(const char *aline)
{
    /* only lines of length one are of interest */
    if (!*aline || aline[1])
	return;
    switch (*aline)
    {
	case 'I':
	case 'V':
	case 'X':
	case 'L':
	    return;
	default:
	    break;
    }
    if (gcisdigit(*aline))
	return;
    if (pswit[ECHO_SWITCH])
	printf("\n%s\n",aline);
    if (pswit[OVERVIEW_SWITCH])
	cnt_punct++;
    else
	printf("    Line %ld column 1 - Query single character line\n",
	  linecnt);
}

/*
 * check_for_pling_scanno:
 *
 * Check for I" - often should be !
 *
 * Only the first occurrence of a space, a capital I and a double
 * quote on the line is queried.
 */
void check_for_pling_scanno(const char *aline)
{
    const char *s;
    s=strstr(aline," I\"");
    if (s)
    {
	if (pswit[ECHO_SWITCH])
	    printf("\n%s\n",aline);
	if (!pswit[OVERVIEW_SWITCH])
	    /*
	     * s-aline is a ptrdiff_t, which is not necessarily a long,
	     * so convert it explicitly to match %ld.
	     * NOTE(review): unlike the other checks this prints the
	     * zero-based offset of the space with no +1; the value is
	     * kept as it was so existing output does not change.
	     */
	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",
	      linecnt,(long)(s-aline));
	else
	    cnt_punct++;
    }
}

/*
 * check_for_extra_period:
 *
 * Check for period without a capital letter. Cut-down from gutspell.
 * Only works when it happens on a single line.
 *
 * Runs in paranoid mode only. For every ". " that directly follows a
 * letter and whose next letter or digit is a lowercase letter, the
 * word before the period is examined. It is not queried if it is a
 * listed abbreviation, starts with a digit, is a single character,
 * is a Roman numeral or contains no vowel. Unless verbose mode is on,
 * each distinct word is queried only once (tracked in qperiod).
 */
void check_for_extra_period(const char *aline,const struct warnings *warnings)
{
    const char *s,*t,*s1;
    int i;
    gboolean istypo;
    gchar *testword;
    if (!pswit[PARANOID_SWITCH])
	return;
    for (t=strstr(aline,". ");t;t=strstr(t+1,". "))
    {
	/* start of line punctuation is handled elsewhere */
	if (t==aline)
	    continue;
	if (!gcisalpha(t[-1]))
	    continue;
	/* For Frank & Jeroen -- 's Middags case */
	if (warnings->isDutch &&
	  t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
	  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
	    continue;
	/* find the first letter or digit after the period */
	s1=t+2;
	while (*s1 && !gcisalpha(*s1) && !isdigit((unsigned char)*s1))
	    s1++;
	if (*s1<'a' || *s1>'z')
	    continue;
	/* we have something to investigate */
	istypo=TRUE;
	/*
	 * So let's go back and find where the word before the period
	 * starts. Letters and digits belong to the word, as does an
	 * apostrophe with a letter on both sides. The scan never
	 * looks at or moves to anything before aline[0]: an
	 * apostrophe in column one has no character before it and
	 * so ends the word.
	 */
	s1=t;
	while (s1>aline)
	{
	    char c=s1[-1];
	    if (gcisalpha(c) || gcisdigit(c))
		;
	    else if (c==CHAR_SQUOTE && s1-1>aline &&
	      gcisalpha(*s1) && gcisalpha(s1[-2]))
		;
	    else
		break;
	    s1--;
	}
	s=strchr(s1,'.');
	if (s)
	    testword=g_strndup(s1,s-s1);
	else
	    testword=g_strdup(s1);
	for (i=0;*abbrev[i];i++)
	    if (!strcmp(testword,abbrev[i]))
		istypo=FALSE;
	if (gcisdigit(*testword))
	    istypo=FALSE;
	if (!testword[1])
	    istypo=FALSE;
	if (isroman(testword))
	    istypo=FALSE;
	if (istypo)
	{
	    /* a word without any vowel is not queried */
	    istypo=FALSE;
	    for (i=0;testword[i];i++)
		if (strchr(vowels,testword[i]))
		    istypo=TRUE;
	}
	if (istypo &&
	  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
	{
	    /* remember the word so that it is not queried again */
	    g_tree_insert(qperiod,g_strdup(testword),
	      GINT_TO_POINTER(1));
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf("    Line %ld column %d - Extra period?\n",
		  linecnt,(int)(t-aline)+1);
	    else
		cnt_punct++;
	}
	g_free(testword);
    }
}

/*
 * check_for_following_punctuation:
 *
 * Check for words usually not followed by punctuation.
 */
void check_for_following_punctuation(const char *aline)
{
    int i;
    const char *s,*wordstart;
    gchar *inword,*t;
    if (pswit[TYPO_SWITCH])
    {
	for (s=aline;*s;)
	{
	    wordstart=s;
	    t=getaword(&s);
	    if (!*t)
	    {
		g_free(t);
		continue;
	    }
	    inword=g_ascii_strdown(t,-1);
	    g_free(t);
	    for (i=0;*nocomma[i];i++)
		if (!strcmp(inword,nocomma[i]))
		{
		    if (*s==',' || *s==';' || *s==':')
		    {
			if (pswit[ECHO_SWITCH])
			    printf("\n%s\n",aline);
			if (!pswit[OVERVIEW_SWITCH])
			    printf("    Line %ld column %d - "
			      "Query punctuation after %s?\n",
			      linecnt,(int)(s-aline)+1,inword);
			else
			    cnt_punct++;
		    }
		}
	    for (i=0;*noperiod[i];i++)
		if (!strcmp(inword,noperiod[i]))
		{
		    if (*s=='.' || *s=='!')
		    {
			if (pswit[ECHO_SWITCH])
			    printf("\n%s\n",aline);
			if (!pswit[OVERVIEW_SWITCH])
			    printf("    Line %ld column %d - "
			      "Query punctuation after %s?\n",
			      linecnt,(int)(s-aline)+1,inword);
			else
			    cnt_punct++;
		    }
		}
	    g_free(inword);
	}
    }
}

/*
 * check_for_typos:
 *
 * Check for commonly mistyped words,
 * and digits like 0 for O in a word.
 *
 * Every word on the line goes through, in order:
 *   - a query for digits mixed into the word (always);
 *   - the built-in typo heuristics (typo switch only), each distinct
 *     word being queried once unless verbose mode is on;
 *   - the user's own typo list, if one is loaded and the word was
 *     not already judged a typo;
 *   - a query for a standalone 0 or 1 (paranoid mode, and only while
 *     warnings->digit is set).
 */
void check_for_typos(const char *aline,struct warnings *warnings)
{
    const char *s,*wordstart;
    gchar *inword,*testword;
    int i,wlen,alower,vowel,consonant,*dupcnt;
    gboolean isdup,istypo;
    for (s=aline;*s;)
    {
	wordstart=s;
	inword=getaword(&s);
	if (!*inword)
	{
	    g_free(inword);
	    continue; /* don't bother with empty lines */
	}
	/*
	 * Give these a defined value for every word: they are read
	 * further down even when neither typo switch is set, in which
	 * case nothing else assigns them.
	 */
	istypo=FALSE;
	testword=NULL;
	if (mixdigit(inword))
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf("    Line %ld column %d - Query digit in %s\n",
		  linecnt,(int)(wordstart-aline)+1,inword);
	    else
		cnt_word++;
	}
	/*
	 * Put the word through a series of tests for likely typos and OCR
	 * errors.
	 */
	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
	{
	    testword=g_strdup(inword);
	    alower=0;
	    wlen=(int)strlen(testword);
	    for (i=0;i<wlen;i++)
	    {
		/* lowercase for testing */
		if (testword[i]>='a' && testword[i]<='z')
		    alower=1;
		if (alower && testword[i]>='A' && testword[i]<='Z')
		{
		    /*
		     * We have an uppercase mid-word. However, there are
		     * common cases:
		     *   Mac and Mc like McGill
		     *   French contractions like l'Abbe
		     * (The characters before position i have already
		     * been lowercased, hence the lowercase comparisons.)
		     */
		    if (i==2 && testword[0]=='m' && testword[1]=='c' ||
		      i==3 && testword[0]=='m' && testword[1]=='a' &&
		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
			; /* do nothing! */
		    else
			istypo=TRUE;
		}
		/* tolower() needs a value representable as unsigned char */
		testword[i]=(char)tolower((unsigned char)testword[i]);
	    }
	}
	if (pswit[TYPO_SWITCH])
	{
	    /*
	     * Check for certain unlikely two-letter combinations at word
	     * start and end.
	     */
	    if (strlen(testword)>1)
	    {
		for (i=0;*nostart[i];i++)
		    if (!strncmp(testword,nostart[i],2))
			istypo=TRUE;
		for (i=0;*noend[i];i++)
		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))
			istypo=TRUE;
	    }
	    /* ght is common, gbt never. Like that. */
	    if (strstr(testword,"cb"))
		istypo=TRUE;
	    if (strstr(testword,"gbt"))
		istypo=TRUE;
	    if (strstr(testword,"pbt"))
		istypo=TRUE;
	    if (strstr(testword,"tbs"))
		istypo=TRUE;
	    if (strstr(testword,"mrn"))
		istypo=TRUE;
	    if (strstr(testword,"ahle"))
		istypo=TRUE;
	    if (strstr(testword,"ihle"))
		istypo=TRUE;
	    /*
	     * "TBE" does happen - like HEARTBEAT - but uncommon.
	     * Also "TBI" - frostbite, outbid - but uncommon.
	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
	     * numerals, but "ii" is a common scanno.
	     */
	    if (strstr(testword,"tbi"))
		istypo=TRUE;
	    if (strstr(testword,"tbe"))
		istypo=TRUE;
	    if (strstr(testword,"ii"))
		istypo=TRUE;
	    /*
	     * Check for no vowels or no consonants.
	     * If none, flag a typo.
	     */
	    if (!istypo && strlen(testword)>1)
	    {
		vowel=consonant=0;
		for (i=0;testword[i];i++)
		{
		    if (testword[i]=='y' || gcisdigit(testword[i]))
		    {
			/* Yah, this is loose. */
			vowel++;
			consonant++;
		    }
		    else if (strchr(vowels,testword[i]))
			vowel++;
		    else
			consonant++;
		}
		if (!vowel || !consonant)
		    istypo=TRUE;
	    }
	    /*
	     * Now exclude the word from being reported if it's in
	     * the okword list.
	     */
	    for (i=0;*okword[i];i++)
		if (!strcmp(testword,okword[i]))
		    istypo=FALSE;
	    /*
	     * What looks like a typo may be a Roman numeral.
	     * Exclude these.
	     */
	    if (istypo && isroman(testword))
		istypo=FALSE;
	    /* Check the manual list of typos. */
	    if (!istypo)
		for (i=0;*typo[i];i++)
		    if (!strcmp(testword,typo[i]))
			istypo=TRUE;
	    /*
	     * Check lowercase s, l, i and m - special cases.
	     *   "j" - often a semi-colon gone wrong.
	     *   "d" for a missing apostrophe - he d
	     *   "n" for "in"
	     * The test is on inword, so only the lowercase form counts.
	     */
	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
		istypo=TRUE;
	    if (istypo)
	    {
		/* qword maps each queried word to its occurrence count */
		dupcnt=g_tree_lookup(qword,testword);
		if (dupcnt)
		{
		    (*dupcnt)++;
		    isdup=!pswit[VERBOSE_SWITCH];
		}
		else
		{
		    dupcnt=g_new0(int,1);
		    g_tree_insert(qword,g_strdup(testword),dupcnt);
		    isdup=FALSE;
		}
		if (!isdup)
		{
		    if (pswit[ECHO_SWITCH])
			printf("\n%s\n",aline);
		    if (!pswit[OVERVIEW_SWITCH])
		    {
			printf("    Line %ld column %d - Query word %s",
			  linecnt,(int)(wordstart-aline)+1,inword);
			if (!pswit[VERBOSE_SWITCH])
			    printf(" - not reporting duplicates");
			printf("\n");
		    }
		    else
			cnt_word++;
		}
	    }
	}
	/*
	 * Check the user's list of typos. testword only exists when one
	 * of the typo switches is set, so it is tested before use.
	 */
	if (!istypo && testword && usertypo &&
	  g_tree_lookup(usertypo,testword))
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])  
		printf("    Line %ld column %d - Query possible scanno %s\n",
		  linecnt,(int)(wordstart-aline)+2,inword);
	}
	/* g_free() accepts NULL, so no switch test is needed here */
	g_free(testword);
	if (pswit[PARANOID_SWITCH] && warnings->digit)
	{
	    /* In paranoid mode, query all 0 and 1 standing alone. */
	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
	    {
		if (pswit[ECHO_SWITCH])
		    printf("\n%s\n",aline);
		if (!pswit[OVERVIEW_SWITCH])
		    printf("    Line %ld column %d - Query standalone %s\n",
		      linecnt,(int)(wordstart-aline)+2,inword);
		else
		    cnt_word++;
	    }
	}
	g_free(inword);
    }
}

/*
 * check_for_misspaced_punctuation:
 *
 * Look for added or missing spaces around punctuation and quotes.
 * If there is a punctuation character like ! with no space on
 * either side, suspect a missing!space. If there are spaces on
 * both sides , assume a typo. If we see a double quote with no
 * space or punctuation on either side of it, assume unspaced
 * quotes "like"this.
 */
void check_for_misspaced_punctuation(const char *aline,
  struct parities *parities,gboolean isemptyline)
{
    /*
     * The line is scanned several times, once per kind of query, so
     * queries come out grouped by kind rather than by column.
     * parities carries the open/closed state of double and single
     * quotes and is toggled here for every quote seen.
     */
    int i,llen;
    gboolean isacro,isellipsis;
    const char *s;
    llen=strlen(aline);
    /* Pass 1: punctuation with no space where one is expected, */
    /* or with spaces on both sides. */
    for (i=1;i<llen;i++)
    {
	/* For each character in the line after the first. */
	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
	{
	    /* we need to suppress warnings for acronyms like M.D. */
	    isacro=FALSE;
	    /* we need to suppress warnings for ellipsis . . . */
	    isellipsis=FALSE;
	    /* if there are letters on both sides of it or ... */
	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
	    {
		/* ...if it's strict punctuation followed by an alpha */
		if (aline[i]=='.')
		{
		    /* another period two characters away: an acronym */
		    if (i>2 && aline[i-2]=='.')
			isacro=TRUE;
		    if (i+2<llen && aline[i+2]=='.')
			isacro=TRUE;
		}
		if (!isacro)
		{
		    if (pswit[ECHO_SWITCH])
			printf("\n%s\n",aline);
		    if (!pswit[OVERVIEW_SWITCH])
			printf("    Line %ld column %d - Missing space?\n",
			  linecnt,i+1);
		    else
			cnt_punct++;
		}
	    }
	    if (aline[i-1]==CHAR_SPACE &&
	      (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
	    {
		/*
		 * If there are spaces on both sides,
		 * or space before and end of line.
		 */
		if (aline[i]=='.')
		{
		    /* another period two characters away: an ellipsis */
		    if (i>2 && aline[i-2]=='.')
			isellipsis=TRUE;
		    if (i+2<llen && aline[i+2]=='.')
			isellipsis=TRUE;
		}
		if (!isemptyline && !isellipsis)
		{
		    if (pswit[ECHO_SWITCH])
			printf("\n%s\n",aline);
		    if (!pswit[OVERVIEW_SWITCH])
			printf("    Line %ld column %d - "
			  "Spaced punctuation?\n",linecnt,i+1);
		    else
			cnt_punct++;
		}
	    }
	}
    }
    /* Split out the characters that CANNOT be preceded by space. */
    for (i=1;i<llen;i++)
    {
	/* for each character in the line after the first */
	if (strchr("?!,;:",aline[i]))
	{
	    /* if it's punctuation that _cannot_ have a space before it */
	    if (aline[i-1]==CHAR_SPACE && !isemptyline &&
	      aline[i+1]!=CHAR_SPACE)
	    {
		/*
		 * If aline[i+1] DOES == space,
		 * it was already reported just above.
		 */
		if (pswit[ECHO_SWITCH])
		    printf("\n%s\n",aline);
		if (!pswit[OVERVIEW_SWITCH])
		    printf("    Line %ld column %d - Spaced punctuation?\n",
		      linecnt,i+1);
		else
		    cnt_punct++;
	    }
	}
    }
    /*
     * Special case " .X" where X is any alpha.
     * This plugs a hole in the acronym code above.
     * Inelegant, but maintainable.
     */
    for (i=1;i<llen;i++)
    {
	/* for each character in the line after the first */
	if (aline[i]=='.')
	{
	    /* if it's a period */
	    if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
	    {
		/*
		 * If the period follows a space and
		 * is followed by a letter.
		 */
		if (pswit[ECHO_SWITCH])
		    printf("\n%s\n",aline);
		if (!pswit[OVERVIEW_SWITCH])
		    printf("    Line %ld column %d - Spaced punctuation?\n",
		      linecnt,i+1);
		else
		    cnt_punct++;
	    }
	}
    }
    /* Double quotes with neither space nor punctuation beside them. */
    for (i=1;i<llen;i++)
    {
	/* for each character in the line after the first */
	if (aline[i]==CHAR_DQUOTE)
	{
	    if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
	      !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
	      !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
	    {
		if (pswit[ECHO_SWITCH])
		    printf("\n%s\n",aline);
		if (!pswit[OVERVIEW_SWITCH])
		    printf("    Line %ld column %d - Unspaced quotes?\n",
		      linecnt,i+1);
		else
		    cnt_punct++;
	    }
	}
    }
    /* Check parity of quotes. */
    for (s=aline;*s;s++)
    {
	if (*s==CHAR_DQUOTE)
	{
	    parities->dquote=!parities->dquote;
	    if (!parities->dquote)
	    {
		/* parity even */
		if (!strchr("_-.'`/,;:!?)]} ",s[1]))
		{
		    if (pswit[ECHO_SWITCH])
			printf("\n%s\n",aline);
		    if (!pswit[OVERVIEW_SWITCH])
			printf("    Line %ld column %d - "
			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
		    else
			cnt_punct++;
		}
	    }
	    else
	    {
		/*
		 * parity odd
		 * (isdigit() needs a value representable as
		 * unsigned char, hence the cast)
		 */
		if (!gcisalpha(s[1]) && !isdigit((unsigned char)s[1]) &&
		  !strchr("_-/.'`([{$",s[1]) || !s[1])
		{
		    if (pswit[ECHO_SWITCH])
			printf("\n%s\n",aline);
		    if (!pswit[OVERVIEW_SWITCH])
			printf("    Line %ld column %d - "
			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
		    else
			cnt_punct++;
		}
	    }
	}
    }
    /* A double quote in column one followed by closing punctuation. */
    if (*aline==CHAR_DQUOTE)
    {
	if (strchr(",;:!?)]} ",aline[1]))
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf("    Line %ld column 1 - Wrongspaced quotes?\n",
		  linecnt);
	    else
		cnt_punct++;
	}
    }
    /* The same parity check for single quotes, if asked for. */
    if (pswit[SQUOTE_SWITCH])
    {
	for (s=aline;*s;s++)
	{
	    /* only quotes without a letter on both sides are considered */
	    if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
	      (s==aline || s>aline && !gcisalpha(s[-1]) ||
	      !gcisalpha(s[1])))
	    {
		parities->squote=!parities->squote;
		if (!parities->squote)
		{
		    /* parity even */
		    if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
		    {
			if (pswit[ECHO_SWITCH])
			    printf("\n%s\n",aline);
			if (!pswit[OVERVIEW_SWITCH])
			    printf("    Line %ld column %d - "
			      "Wrongspaced singlequotes?\n",
			      linecnt,(int)(s-aline)+1);
			else
			    cnt_punct++;
		    }
		}
		else
		{
		    /* parity odd */
		    if (!gcisalpha(s[1]) && !isdigit((unsigned char)s[1]) &&
		      !strchr("_-/\".'`",s[1]) || !s[1])
		    {
			if (pswit[ECHO_SWITCH])
			    printf("\n%s\n",aline);
			if (!pswit[OVERVIEW_SWITCH])
			    printf("    Line %ld column %d - "
			      "Wrongspaced singlequotes?\n",
			      linecnt,(int)(s-aline)+1);
			else
			    cnt_punct++;
		    }
		}
	    }
	}
    }
}

/*
 * check_for_double_punctuation:
 *
 * Look for double punctuation like ,. or ,,
 * Thanks to DW for the suggestion!
 * In books with references, ".," and ".;" are common
 * e.g. "etc., etc.," and vol. 1.; vol 3.;
 * OTOH, from my initial tests, there are also fairly
 * common errors. What to do? Make these cases paranoid?
 * ".," is the most common, so warnings->dotcomma is used
 * to suppress detailed reporting if it occurs often.
 */
void check_for_double_punctuation(const char *aline,struct warnings *warnings)
{
    /* Ellipsis/punctuation combinations that are accepted in French texts. */
    static const char *const french_ellipsis[]={
	",...","...,",";...","...;",":...","...:","!...","...!","?...","...?"
    };
    static const char punctuation[]=".?!,;:";
    const int len=strlen(aline);
    int pos,k,is_french,is_repeat,is_dotcomma;
    char cur,next;
    for (pos=0;pos<len;pos++)
    {
	cur=aline[pos];
	next=aline[pos+1];
	/*
	 * Only a punctuation character directly followed by another one
	 * is of interest. strchr() also matches the terminating NUL, so
	 * rule that out explicitly.
	 */
	if (!cur || !next || !strchr(punctuation,cur) ||
	  !strchr(punctuation,next))
	    continue;
	is_french=0;
	if (warnings->isFrench)
	    for (k=0;k<(int)(sizeof french_ellipsis/sizeof *french_ellipsis);k++)
		if (!strncmp(aline+pos,french_ellipsis[k],4))
		{
		    is_french=1;
		    break;
		}
	/* .. !! and ?? can be legit */
	is_repeat=cur==next && (cur=='.' || cur=='?' || cur=='!');
	/* ".," is let through while warnings->dotcomma is unset */
	is_dotcomma=!warnings->dotcomma && cur=='.' && next==',';
	if (is_repeat || is_dotcomma || is_french)
	{
	    /* Nothing to query; step over a French ellipsis. */
	    if (is_french)
		pos+=4;
	    continue;
	}
	if (pswit[ECHO_SWITCH])
	    printf("\n%s\n",aline);
	if (!pswit[OVERVIEW_SWITCH])
	    printf("    Line %ld column %d - Double punctuation?\n",
	      linecnt,pos+1);
	else
	    cnt_punct++;
    }
}

/*
 * check_for_spaced_quotes:
 */
void check_for_spaced_quotes(const char *aline)
{
    /* Each quote character with a space on both sides, and its query. */
    static const struct
    {
	const char *pattern;
	const char *format;
    } spaced[]={
	{" \" ","    Line %ld column %d - Spaced doublequote?\n"},
	{" ' ","    Line %ld column %d - Spaced singlequote?\n"},
	{" ` ","    Line %ld column %d - Spaced singlequote?\n"},
    };
    const char *hit;
    unsigned int k;
    for (k=0;k<sizeof spaced/sizeof spaced[0];k++)
    {
	/*
	 * Resume each search on the trailing space of the previous match
	 * so that consecutive spaced quotes are all found.
	 */
	for (hit=strstr(aline,spaced[k].pattern);hit;
	  hit=strstr(hit+2,spaced[k].pattern))
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf(spaced[k].format,linecnt,(int)(hit-aline+1));
	    else
		cnt_punct++;
	}
    }
}

/*
 * check_for_miscased_genative:
 *
 * Check special case of 'S instead of 's at end of word.
 */
void check_for_miscased_genative(const char *aline)
{
    const char *quote;
    if (*aline=='\0')
	return;
    /* Start at the second character so there is always a predecessor. */
    for (quote=aline+1;*quote;quote++)
    {
	if (*quote!=CHAR_SQUOTE || quote[1]!='S')
	    continue;
	/* Only a lower-case ASCII letter before the quote is suspicious. */
	if (quote[-1]<'a' || quote[-1]>'z')
	    continue;
	if (pswit[ECHO_SWITCH])
	    printf("\n%s\n",aline);
	if (!pswit[OVERVIEW_SWITCH])
	    printf("    Line %ld column %d - Capital \"S\"?\n",
	      linecnt,(int)(quote-aline+2));
	else
	    cnt_punct++;
    }
}

/*
 * check_end_of_line:
 *
 * Now check special cases - start and end of line -
 * for single and double quotes. Start is sometimes [sic]
 * but better to query it anyway.
 * While we're here, check for dash at end of line.
 */
void check_end_of_line(const char *aline,struct warnings *warnings)
{
    int i,llen;
    llen=strlen(aline);
    /* Lines of fewer than two characters have nothing to compare. */
    if (llen>1)
    {
	/* A quote as the last character, preceded by a space. */
	if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
	  aline[llen-1]==CHAR_OPEN_SQUOTE)
	    if (aline[llen-2]==CHAR_SPACE)
	    {
		if (pswit[ECHO_SWITCH])
		    printf("\n%s\n",aline);
		if (!pswit[OVERVIEW_SWITCH])
		    printf("    Line %ld column %d - Spaced quote?\n",
		      linecnt,llen);
		else
		    cnt_punct++;
	    }
	/* A single quote as the first character, followed by a space. */
	if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
	  aline[1]==CHAR_SPACE)
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf("    Line %ld column 1 - Spaced quote?\n",linecnt);
	    else
		cnt_punct++;
	}
	/*
	 * Dash at end of line may well be legit - paranoid mode only
	 * and don't report em-dash at line-end.
	 */
	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
	{
	    /* Step back over trailing spaces and control characters. */
	    for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
		;
	    /*
	     * The scan can stop at i==0 (e.g. "-  "). There is then no
	     * preceding character, so the hyphen cannot be part of an
	     * em-dash and aline[i-1] must not be read.
	     */
	    if (aline[i]=='-' && (i==0 || aline[i-1]!='-'))
	    {
		if (pswit[ECHO_SWITCH])
		    printf("\n%s\n",aline);
		if (!pswit[OVERVIEW_SWITCH])
		    printf("    Line %ld column %d - Hyphen at end of line?\n",
		      linecnt,i);
	    }
	}
    }
}

/*
 * check_for_unspaced_bracket:
 *
 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
 * If so, suspect a scanno like "a]most".
 */
void check_for_unspaced_bracket(const char *aline)
{
    const char *p;
    if (!*aline)
	return;
    /*
     * Visit every character except the first and the last: stop as soon
     * as the character after the current one is the terminator.
     */
    for (p=aline+1;p[0] && p[1];p++)
    {
	if (!strchr("{[()]}",*p))
	    continue;
	/* A bracket is only suspect with a letter on both sides. */
	if (!gcisalpha(p[-1]) || !gcisalpha(p[1]))
	    continue;
	if (pswit[ECHO_SWITCH])
	    printf("\n%s\n",aline);
	if (!pswit[OVERVIEW_SWITCH])
	    printf("    Line %ld column %d - Unspaced bracket?\n",
	      linecnt,(int)(p-aline));
	else
	    cnt_punct++;
    }
}

/*
 * check_for_unpunctuated_endquote:
 */
void check_for_unpunctuated_endquote(const char *aline)
{
    int i,llen;
    llen=strlen(aline);
    for (i=1;i<llen;i++)
    {
	/*
	 * for each character in the line except 1st:
	 * a doublequote directly after a letter has no punctuation
	 * before it. The cast keeps isalpha() defined for 8-bit
	 * characters, which are negative where plain char is signed.
	 */
	if (aline[i]==CHAR_DQUOTE && isalpha((unsigned char)aline[i-1]))
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf("    Line %ld column %d - "
		  "endquote missing punctuation?\n",linecnt,i);
	    else
		cnt_punct++;
	}
    }
}

/*
 * check_for_html_tag:
 *
 * Check for <HTML TAG>.
 *
 * If there is a < in the line, followed at some point
 * by a > then we suspect HTML.
 */
void check_for_html_tag(const char *aline)
{
    int i;
    const char *open,*close;
    open=strchr(aline,'<');
    if (!open)
	return;
    /*
     * Look for the closing '>' after the '<', not from the start of the
     * line; otherwise a stray '>' earlier in the line hides a real tag.
     */
    close=strchr(open,'>');
    if (!close)
	return;
    /* Length of the suspected tag, including both angle brackets. */
    i=(int)(close-open+1);
    if (pswit[ECHO_SWITCH])
	printf("\n%s\n",aline);
    if (!pswit[OVERVIEW_SWITCH])
	printf("    Line %ld column %d - HTML Tag? %*.*s \n",
	  linecnt,(int)(open-aline)+1,i,i,open);
    else
	cnt_html++;
}

/*
 * check_for_html_entity:
 *
 * Check for &symbol; HTML.
 *
 * If there is a & in the line, followed at
 * some point by a ; then we suspect HTML.
 */
void check_for_html_entity(const char *aline)
{
    int i;
    const char *s,*amp,*scolon;
    amp=strchr(aline,'&');
    if (!amp)
	return;
    /*
     * Look for the ';' after the '&', not from the start of the line;
     * otherwise a semicolon earlier in the line hides a real entity.
     */
    scolon=strchr(amp,';');
    if (!scolon)
	return;
    for (s=amp;s<scolon;s++)
	if (*s==CHAR_SPACE)
	    return;		/* Don't report "Jones & Son;" */
    /* Length of the suspected entity, including the '&' and the ';'. */
    i=(int)(scolon-amp+1);
    if (pswit[ECHO_SWITCH])
	printf("\n%s\n",aline);
    if (!pswit[OVERVIEW_SWITCH])
	printf("    Line %ld column %d - HTML symbol? %*.*s \n",
	  linecnt,(int)(amp-aline)+1,i,i,amp);
    else
	cnt_html++;
}

/*
 * show_pending_query:
 *
 * Print one stored query, preceded by the first line of the
 * paragraph when lines are being echoed.
 */
static void show_pending_query(const char *parastart,const char *query)
{
    if (pswit[ECHO_SWITCH])
	printf("\n%s\n",parastart);
    puts(query);
}

/*
 * print_pending:
 *
 * Output (or, in overview mode, count) the queries stored at the end
 * of the previous paragraph, then free and clear them.
 * A stored doublequote query is suppressed when this line starts with
 * a doublequote, and a stored singlequote query when it starts with a
 * singlequote, unless the QPARA switch is set (or, for singlequotes,
 * pending->squot is set). Bracket and underscore queries are always
 * output.
 */
void print_pending(const char *aline,const char *parastart,
  struct pending *pending)
{
    const char *first;
    /* Find the first non-space character of the new line. */
    for (first=aline;*first==' ';first++)
	;
    if (pending->dquote)
    {
	if (pswit[QPARA_SWITCH] || *first!=CHAR_DQUOTE)
	{
	    if (pswit[OVERVIEW_SWITCH])
		cnt_dquot++;
	    else
		show_pending_query(parastart,pending->dquote);
	}
	g_free(pending->dquote);
	pending->dquote=NULL;
    }
    if (pending->squote)
    {
	if (pending->squot || pswit[QPARA_SWITCH] ||
	  (*first!=CHAR_SQUOTE && *first!=CHAR_OPEN_SQUOTE))
	{
	    if (pswit[OVERVIEW_SWITCH])
		cnt_squot++;
	    else
		show_pending_query(parastart,pending->squote);
	}
	g_free(pending->squote);
	pending->squote=NULL;
    }
    if (pending->rbrack)
    {
	if (pswit[OVERVIEW_SWITCH])
	    cnt_brack++;
	else
	    show_pending_query(parastart,pending->rbrack);
	g_free(pending->rbrack);
	pending->rbrack=NULL;
    }
    if (pending->sbrack)
    {
	if (pswit[OVERVIEW_SWITCH])
	    cnt_brack++;
	else
	    show_pending_query(parastart,pending->sbrack);
	g_free(pending->sbrack);
	pending->sbrack=NULL;
    }
    if (pending->cbrack)
    {
	if (pswit[OVERVIEW_SWITCH])
	    cnt_brack++;
	else
	    show_pending_query(parastart,pending->cbrack);
	g_free(pending->cbrack);
	pending->cbrack=NULL;
    }
    if (pending->unders)
    {
	/* Underscore queries are tallied with the bracket count. */
	if (pswit[OVERVIEW_SWITCH])
	    cnt_brack++;
	else
	    show_pending_query(parastart,pending->unders);
	g_free(pending->unders);
	pending->unders=NULL;
    }
}

/*
 * check_for_mismatched_quotes:
 *
 * At end of paragraph, check for mismatched quotes.
 *
 * We don't want to report an error immediately, since it is a
 * common convention to omit the quotes at end of paragraph if
 * the next paragraph is a continuation of the same speaker.
 * Where this is the case, the next para should begin with a
 * quote, so we store the warning message and only display it
 * at the top of the next iteration if the new para doesn't
 * start with a quote.
 * The -p switch overrides this default, and warns of unclosed
 * quotes on _every_ paragraph, whether the next begins with a
 * quote or not.
 */
void check_for_mismatched_quotes(const struct counters *counters,
  struct pending *pending)
{
    /* An odd number of doublequotes cannot pair up. */
    if (counters->quot%2!=0)
	pending->dquote=
	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
    /* Singlequotes are only checked on request and when one was opened. */
    if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
      counters->open_single_quote!=counters->close_single_quote)
    {
	pending->squote=
	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
	/*
	 * Anything other than exactly one unclosed quote is flagged to
	 * be noted regardless of the first char of the next para.
	 */
	if (counters->open_single_quote!=counters->close_single_quote+1)
	    pending->squot=1;
    }
    /* A non-zero bracket counter means that bracket kind is unbalanced. */
    if (counters->r_brack!=0)
	pending->rbrack=
	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
    if (counters->s_brack!=0)
	pending->sbrack=
	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
    if (counters->c_brack!=0)
	pending->cbrack=
	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
    if (counters->c_unders%2!=0)
	pending->unders=
	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
}

/*
 * check_for_omitted_punctuation:
 *
 * Check for omitted punctuation at end of paragraph by working back
 * through prevline. DW.
 * Need to check this only for "normal" paras.
 * So what is a "normal" para?
 *    Not normal if one-liner (chapter headings, etc.)
 *    Not normal if doesn't contain at least one locase letter
 *    Not normal if starts with space
 */
void check_for_omitted_punctuation(const char *prevline,
  struct line_properties *last,int start_para_line)
{
    const char *p;
    int has_letter=0,pos;
    /* Does the line contain at least one letter? */
    for (p=prevline;*p;p++)
	if (gcisletter(*p))
	{
	    has_letter=1;
	    break;
	}
    /*
     * Using "start_para_line >= linecnt - 1" to bail out excludes
     * one-line "paragraphs" like chapter heads (which would give lots of
     * false positives) at the price of missing genuine one-line
     * paragraphs.
     */
    if (!has_letter || last->blen<=2 || start_para_line>=linecnt-1 ||
      *prevline<=CHAR_SPACE)
	return;
    /* Step back over any quotes that close the line. */
    pos=strlen(prevline)-1;
    while ((prevline[pos]==CHAR_DQUOTE || prevline[pos]==CHAR_SQUOTE) &&
      prevline[pos]>CHAR_SPACE && pos>0)
	pos--;
    /*
     * Work backwards: reaching a letter before any closing punctuation
     * or bracket means the paragraph ended without punctuation.
     */
    for (;pos>0;pos--)
    {
	if (gcisalpha(prevline[pos]))
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",prevline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf("    Line %ld column %d - "
		  "No punctuation at para end?\n",
		  linecnt-1,(int)strlen(prevline));
	    else
		cnt_punct++;
	    return;
	}
	if (strchr("-.:!([{?}])",prevline[pos]))
	    return;
    }
}

/*
 * Tree traversal callback: key is the queried word and value points to
 * the number of times it was duplicated. Words with a zero count are
 * not mentioned. Always returns FALSE; data is unused.
 */
gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
{
    const int duplicates=*(int *)value;
    if (duplicates!=0)
	printf("\nNote: Queried word %s was duplicated %d times\n",
	  (const char *)key,duplicates);
    return FALSE;
}

/*
 * procfile:
 *
 * Process one file.
 */
void procfile(const char *filename)
{
    const char *s;
    gchar *parastart=NULL;	/* first line of current para */
    gchar *etext,*aline;
    gchar *etext_ptr;
    GError *err=NULL;
    struct first_pass_results *first_pass_results;
    struct warnings *warnings;
    struct counters counters={0};
    struct line_properties last={0};
    struct parities parities={0};
    struct pending pending={0};
    gboolean isemptyline;
    long start_para_line=0;
    gboolean isnewpara=FALSE,enddash=FALSE;
    last.start=CHAR_SPACE;
    linecnt=checked_linecnt=0;
    etext=read_etext(filename,&err);
    if (!etext)
    {
	/* Can't read the file: report where the user asked and give up. */
	if (pswit[STDOUT_SWITCH])
	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
	else
	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
	exit(1);
    }
    fprintf(stdout,"\n\nFile: %s\n\n",filename);
    first_pass_results=first_pass(etext);
    warnings=report_first_pass(first_pass_results);
    qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
    qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
    /*
     * Here we go with the main pass. Hold onto yer hat!
     */
    linecnt=0;
    etext_ptr=etext;
    while ((aline=flgets(&etext_ptr,linecnt+1)))
    {
	linecnt++;
	if (linecnt==1)
	    isnewpara=TRUE;
	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
	    continue;    // skip DP page separators completely
	if (linecnt<first_pass_results->firstline ||
	  (first_pass_results->footerline>0 &&
	  linecnt>first_pass_results->footerline))
	{
	    if (pswit[HEADER_SWITCH])
	    {
		if (!strncmp(aline,"Title:",6))
		    printf("    %s\n",aline);
		if (!strncmp(aline,"Author:",7))
		    printf("    %s\n",aline);
		if (!strncmp(aline,"Release Date:",13))
		    printf("    %s\n",aline);
		if (!strncmp(aline,"Edition:",8))
		    printf("    %s\n\n",aline);
	    }
	    continue;		/* skip through the header */
	}
	checked_linecnt++;
	/* Deal with anything held over from the end of the last para. */
	print_pending(aline,parastart,&pending);
	memset(&pending,0,sizeof(pending));
	isemptyline=analyse_quotes(aline,&counters);
	if (isnewpara && !isemptyline)
	{
	    /* This line is the start of a new paragraph. */
	    start_para_line=linecnt;
	    /* Capture its first line in case we want to report it later. */
	    g_free(parastart);
	    parastart=g_strdup(aline);
	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
	    s=aline;
	    while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
		s++;
	    if (*s>='a' && *s<='z')
	    {
		/* and its first letter is lowercase */
		if (pswit[ECHO_SWITCH])
		    printf("\n%s\n",aline);
		if (!pswit[OVERVIEW_SWITCH])
		    printf("    Line %ld column %d - "
		      "Paragraph starts with lower-case\n",
		      linecnt,(int)(s-aline)+1);
		else
		    cnt_punct++;
	    }
	    isnewpara=FALSE; /* Signal the end of new para processing. */
	}
	/* Check for an em-dash broken at line end. */
	if (enddash && *aline=='-')
	{
	    if (pswit[ECHO_SWITCH])
		printf("\n%s\n",aline);
	    if (!pswit[OVERVIEW_SWITCH])
		printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);
	    else
		cnt_punct++;
	}
	/*
	 * Note whether this line ends in a dash (ignoring trailing spaces)
	 * for the benefit of the next iteration. An empty line has no last
	 * character, so don't step back before the start of the line.
	 */
	enddash=FALSE;
	if (*aline)
	{
	    for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
		;
	    if (*s=='-')
		enddash=TRUE;
	}
	check_for_control_characters(aline);
	if (warnings->bin)
	    check_for_odd_characters(aline,warnings,isemptyline);
	if (warnings->longline)
	    check_for_long_line(aline);
	if (warnings->shortline)
	    check_for_short_line(aline,&last);
	last.blen=last.len;
	last.len=strlen(aline);
	last.start=aline[0];
	check_for_starting_punctuation(aline);
	if (warnings->dash)
	{
	    check_for_spaced_emdash(aline);
	    check_for_spaced_dash(aline);
	}
	check_for_unmarked_paragraphs(aline);
	check_for_jeebies(aline);
	check_for_mta_from(aline);
	check_for_orphan_character(aline);
	check_for_pling_scanno(aline);
	check_for_extra_period(aline,warnings);
	check_for_following_punctuation(aline);
	check_for_typos(aline,warnings);
	check_for_misspaced_punctuation(aline,&parities,isemptyline);
	check_for_double_punctuation(aline,warnings);
	check_for_spaced_quotes(aline);
	check_for_miscased_genative(aline);
	check_end_of_line(aline,warnings);
	check_for_unspaced_bracket(aline);
	if (warnings->endquote)
	    check_for_unpunctuated_endquote(aline);
	check_for_html_tag(aline);
	check_for_html_entity(aline);
	if (isemptyline)
	{
	    /* An empty line ends the paragraph: run the per-para checks. */
	    check_for_mismatched_quotes(&counters,&pending);
	    memset(&counters,0,sizeof(counters));
	    /* let the next iteration know that it's starting a new para */
	    isnewpara=TRUE;
	    if (prevline)
		check_for_omitted_punctuation(prevline,&last,start_para_line);
	}
	g_free(prevline);
	prevline=g_strdup(aline);
    }
    g_free(prevline);
    prevline=NULL;
    g_free(parastart);
    /*
     * Queries stored on the very last iteration are never passed to
     * print_pending(); release them rather than leak them.
     */
    g_free(pending.dquote);
    g_free(pending.squote);
    g_free(pending.rbrack);
    g_free(pending.sbrack);
    g_free(pending.cbrack);
    g_free(pending.unders);
    g_free(etext);
    if (!pswit[OVERVIEW_SWITCH])
	g_tree_foreach(qword,report_duplicate_queries,NULL);
    g_tree_unref(qword);
    g_tree_unref(qperiod);
}

/*
 * flgets:
 *
 * Get one line from the input text, checking for
 * the existence of exactly one CR/LF line-end per line.
 *
 * Returns: a pointer to the line.
 */
char *flgets(char **etext,long lcnt)
{
    /*
     * The line is parsed in place: the returned pointer is the value of
     * *etext on entry, the buffer is NUL-terminated where the line is
     * taken to end, and *etext is left just past the LF that ended the
     * line. lcnt is only used as the line number in queries.
     * Returns NULL when the NUL at the end of the text is reached, so
     * any characters after the last LF are not returned as a line.
     */
    char c;
    int len;		/* number of non-CR characters seen so far */
    gboolean isCR=FALSE;	/* was the previous character a CR? */
    char *theline=*etext;
    len=0;
    for(;;)
    {
	c=*(*etext)++;
	if (!c)
	    return NULL;
	/* either way, it's end of line */
	if (c=='\n')
	{
	    if (isCR)
		break;
	    else
	    {
		/* Error - a LF without a preceding CR */
		if (pswit[LINE_END_SWITCH])
		{
		    if (pswit[ECHO_SWITCH])
			printf("\n%*.*s\n",len,len,theline);
		    if (!pswit[OVERVIEW_SWITCH])
			printf("    Line %ld - No CR?\n",lcnt);
		    else
			cnt_lineend++;
		}
		break;
	    }
	}
	if (c=='\r')
	{
	    if (isCR)
	    {
		/* Error - two successive CRs */
		if (pswit[LINE_END_SWITCH])
		{
		    if (pswit[ECHO_SWITCH])
			printf("\n%*.*s\n",len,len,theline);
		    if (!pswit[OVERVIEW_SWITCH])
			printf("    Line %ld - Two successive CRs?\n",lcnt);
		    else
			cnt_lineend++;
		}
	    }
	    isCR=TRUE;
	}
	else
	{
	    /* An ordinary character following a CR: the CR had no LF. */
	    if (pswit[LINE_END_SWITCH] && isCR)
	    {
		if (pswit[ECHO_SWITCH])
		    printf("\n%*.*s\n",len,len,theline);
		if (!pswit[OVERVIEW_SWITCH])
		    printf("    Line %ld column %d - CR without LF?\n",
		      lcnt,len+1);
		else
		    cnt_lineend++;
		theline[len]=' ';
	    }
	    isCR=FALSE;
	    len++;
	}
    }
    /*
     * NOTE(review): len is not advanced for CR characters although they
     * occupy a position in the buffer, so when a CR occurs in the middle
     * of a line theline[len] is before the real end of the line and the
     * terminator written below cuts the line short - confirm whether
     * embedded CRs were meant to be dropped or replaced.
     */
    theline[len]='\0';
    if (pswit[MARKUP_SWITCH])  
	postprocess_for_HTML(theline);
    if (pswit[DP_SWITCH])  
	postprocess_for_DP(theline);
    return theline;
}

/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it
 * contains a mixture of alpha and digits. Generally, this is an
 * error, but may not be for cases like 4th or L5 12s. 3d.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(const char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    const char *s;
    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
	if (gcisalpha(*s))
	    wehavealetter=1;
	else
	    if (gcisdigit(*s))
		wehaveadigit=1;
    if (wehaveadigit && wehavealetter)
    {
	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
	query=1;
	wl=strlen(checkword);
	/* Count the digits that the word starts with. */
	for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
	    ;
	/* digits, ending in st, rd, nd, th of either case */
	if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") ||
	  !g_ascii_strcasecmp(checkword+wl-2,"rd") ||
	  !g_ascii_strcasecmp(checkword+wl-2,"nd") ||
	  !g_ascii_strcasecmp(checkword+wl-2,"th")))
	    query=0;
	/* digits, ending in sts, rds, nds, ths of either case */
	if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") ||
	  !g_ascii_strcasecmp(checkword+wl-3,"rds") ||
	  !g_ascii_strcasecmp(checkword+wl-3,"nds") ||
	  !g_ascii_strcasecmp(checkword+wl-3,"ths")))
	    query=0;
	/*
	 * digits, ending in stly, rdly, ndly, thly of either case.
	 * The suffix is four characters long, so the word must be the
	 * leading digits plus four; this also guarantees that
	 * checkword+wl-4 lies within the word.
	 */
	if (firstdigits+4==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") ||
	  !g_ascii_strcasecmp(checkword+wl-4,"rdly") ||
	  !g_ascii_strcasecmp(checkword+wl-4,"ndly") ||
	  !g_ascii_strcasecmp(checkword+wl-4,"thly")))
	    query=0;
	/* digits, ending in l, L, s or d */
	if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
	  checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
	    query=0;
	/*
	 * L at the start of a number, representing British pounds, like L500.
	 * This is cute. We know the current word is mixeddigit. If the first
	 * letter is L, there must be at least one digit following. If both
	 * digits and letters follow, we have a genuine error, else we have a
	 * capital L followed by digits, and we accept that as a non-error.
	 */
	if (checkword[0]=='L' && !mixdigit(checkword+1))
	    query=0;
    }
    return query;
}

/*
 * getaword:
 *
 * Extracts the first/next "word" from the line, and returns it.
 * A word is defined as one English word unit--or at least that's the aim.
 * "ptr" is advanced to the position in the line where we will start
 * looking for the next word.
 *
 * Returns: A newly-allocated string.
 */
gchar *getaword(const char **ptr)
{
    int i;
    const char *s;
    GString *word;
    word=g_string_new(NULL);
    /* Skip anything that can't start a word. */
    for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)
	;
    /*
     * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
     * Especially yucky is the case of L1,000
     * This section looks for a pattern of characters including a digit
     * followed by a comma or period followed by one or more digits.
     * If found, it returns this whole pattern as a word; otherwise we discard
     * the results and resume our normal programming.
     */
    s=*ptr;
    for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)
	g_string_append_c(word,*s);
    /* i never reaches the first or last character, so i-1 and i+1 exist */
    for (i=1;i+1<word->len;i++)
    {
	if (word->str[i]=='.' || word->str[i]==',')
	{
	    /* the separator must have a digit on both sides */
	    if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i+1]))
	    {
		*ptr=s;
		return g_string_free(word,FALSE);
	    }
	}
    }
    /* we didn't find a punctuated number - do the regular getword thing */
    g_string_truncate(word,0);
    for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)
	g_string_append_c(word,**ptr);
    return g_string_free(word,FALSE);
}

/*
 * isroman:
 *
 * Is this word a Roman Numeral?
 *
 * It doesn't actually validate that the number is a valid Roman Numeral--for
 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
 * expressions thereof, except when it came to taxes. Allow any number of M,
 * an optional D, an optional CM or CD, any number of optional Cs, an optional
 * XL or an optional XC, an optional L, any number of optional Xs, an optional
 * IX or IV, an optional V and any number of optional Is. Only lower-case
 * letters are recognised.
 */
gboolean isroman(const char *t)
{
    if (!t || !*t)
	return FALSE;
    /* thousands, then five hundred */
    while (*t=='m')
	t++;
    if (*t=='d')
	t++;
    /* nine hundred, four hundred, then any run of hundreds */
    if (!strncmp(t,"cm",2))
	t+=2;
    if (!strncmp(t,"cd",2))
	t+=2;
    while (*t=='c')
	t++;
    /* forty, ninety, fifty, then any run of tens */
    if (!strncmp(t,"xl",2))
	t+=2;
    if (!strncmp(t,"xc",2))
	t+=2;
    if (*t=='l')
	t++;
    while (*t=='x')
	t++;
    /* nine, four, five, then any run of units */
    if (!strncmp(t,"ix",2))
	t+=2;
    if (!strncmp(t,"iv",2))
	t+=2;
    if (*t=='v')
	t++;
    while (*t=='i')
	t++;
    /* It looks like a numeral only if nothing is left over. */
    return *t=='\0';
}

/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * If we use the standard function, 8-bit accented characters break
 * words, so that tete with accented characters appears to be two words, "t"
 * and "t", with 8-bit characters between them. This causes over-reporting of
 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
 */
gboolean gcisalpha(unsigned char c)
{
    switch (c)
    {
	/* codes below 192 that are accepted as letters */
	case 140:
	case 142:
	case 156:
	case 158:
	case 159:
	    return TRUE;
	/* codes from 192 up that are not accepted as letters */
	case 208:
	case 215:
	case 222:
	case 240:
	case 247:
	case 254:
	    return FALSE;
	default:
	    break;
    }
    /* every other code from 192 up counts as a letter */
    if (c>=192)
	return TRUE;
    /* what remains is a letter only if it is an ASCII one */
    if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
	return TRUE;
    return FALSE;
}

/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts.
 */
gboolean gcisdigit(unsigned char c)
{
    /* Only the ten ASCII digits qualify. */
    if (c<'0' || c>'9')
	return 0;
    return 1;
}

/*
 * gcisletter:
 *
 * A version of isletter() that doesn't get confused in 8-bit texts.
 * NB: this is ISO-8859-1-specific.
 */
gboolean gcisletter(unsigned char c)
{
    /* Every code from 192 up is treated as a letter. */
    if (c>=192)
	return 1;
    if (c>='a' && c<='z')
	return 1;
    if (c>='A' && c<='Z')
	return 1;
    return 0;
}

/*
 * postprocess_for_DP:
 *
 * Invoked with the -d switch from flgets().
 * It simply "removes" from the line a hard-coded set of common
 * DP-specific tags, so that the line passed to the main routine has
 * been pre-cleaned of DP markup.
 */
void postprocess_for_DP(char *theline)
{
    char *hit;
    size_t taglen;
    int i;
    if (*theline=='\0')
	return;
    /* The DPmarkup table is ended by an empty string. */
    for (i=0;*DPmarkup[i];i++)
    {
	taglen=strlen(DPmarkup[i]);
	/*
	 * Close up the line over each occurrence of the tag, always
	 * searching again from the start of the line.
	 */
	while ((hit=strstr(theline,DPmarkup[i])))
	    memmove(hit,hit+taglen,strlen(hit+taglen)+1);
    }
}

/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * It simply "removes" from the line a hard-coded set of common
 * HTML tags and "replaces" a hard-coded set of common HTML
 * entities, so that the line passed to the main routine has
 * been pre-cleaned of HTML.
 */
void postprocess_for_HTML(char *theline)
{
    /* Markup can only be present if both angle brackets are. */
    const int may_have_tags=strchr(theline,'<') && strchr(theline,'>');
    if (may_have_tags)
    {
	/* Keep going until no further tag is removed. */
	for (;losemarkup(theline);)
	    ;
    }
    /* Likewise for entities, one replacement per call. */
    for (;loseentities(theline);)
	;
}

/*
 * losemarkup:
 *
 * Remove the first <...> in the line if it starts with one of the tags
 * in the markup table (which is ended by an empty string).
 *
 * Returns: a pointer to where the tag was if one was removed, or NULL
 * if the line has no complete tag or the tag isn't a recognized one.
 */
char *losemarkup(char *theline)
{
    char *s,*t;
    int i;
    if (!*theline) 
	return NULL;
    s=strchr(theline,'<');
    if (!s)
	return NULL;
    /* The closing bracket must come after the opening one. */
    t=strchr(s,'>');
    if (!t)
	return NULL;
    for (i=0;*markup[i];i++)
	if (!tagcomp(s+1,markup[i]))
	{
	    /*
	     * Close up the line over the tag. Source and destination
	     * overlap, so this has to be memmove() rather than strcpy().
	     * The terminator is moved as well, which also covers a tag
	     * at the very end of the line.
	     */
	    memmove(s,t+1,strlen(t+1)+1);
	    return s;
	}
    /* It's an unrecognized <xxx>. */
    return NULL;
}

/*
 * substitute_entity:
 *
 * Replace the first occurrence of entity in theline with replacement,
 * working in place.
 * NOTE(review): like the code it replaces, this assumes the replacement
 * is no longer than the entity so that the line cannot grow - confirm
 * against the entities table.
 *
 * Returns: 1 if a replacement was made, 0 if entity doesn't occur.
 */
static int substitute_entity(char *theline,const char *entity,
  const char *replacement)
{
    size_t entlen,replen;
    char *s;
    s=strstr(theline,entity);
    if (!s)
	return 0;
    entlen=strlen(entity);
    replen=strlen(replacement);
    /* Move the tail (and its terminator) first, then drop in the text. */
    memmove(s+replen,s+entlen,strlen(s+entlen)+1);
    memcpy(s,replacement,replen);
    return 1;
}

/*
 * loseentities:
 *
 * Replace one HTML entity in the line with its text equivalent. Named
 * entities are tried first, in table order, then numeric ones. Each
 * table scan stops at the first empty string.
 *
 * Returns: theline if a replacement was made, or NULL if the line
 * contains none of the known entities.
 */
char *loseentities(char *theline)
{
    int i;
    if (!*theline) 
	return NULL;
    for (i=0;*entities[i].htmlent;i++)
	if (substitute_entity(theline,entities[i].htmlent,
	  entities[i].textent))
	    return theline;
    for (i=0;*entities[i].htmlnum;i++)
	if (substitute_entity(theline,entities[i].htmlnum,
	  entities[i].textent))
	    return theline;
    return NULL;
}

/*
 * tagcomp:
 *
 * Compare the text following a '<' with a tag name, ignoring case and
 * one leading slash in strin. Only the characters up to the end of the
 * shorter string are compared, so this is a prefix match.
 *
 * Returns: 0 if they match, 1 if they differ.
 */
int tagcomp(const char *strin,const char *basetag)
{
    const char *s,*t;
    s=basetag;
    t=strin;
    if (*t=='/')
	t++; /* ignore a slash */
    while (*s && *t)
    {
	/*
	 * tolower() is only defined for unsigned char values (and EOF),
	 * so 8-bit characters must not be passed as negative numbers.
	 */
	if (tolower((unsigned char)*s)!=tolower((unsigned char)*t))
	    return 1;
	s++;
	t++;
    }
    return 0;
}

/*
 * proghelp:
 *
 * Write the version, copyright and licence notices, the option help
 * generated from context, and a short description to stderr.
 */
void proghelp(GOptionContext *context)
{
    gchar *help;
    fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
    fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
    fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
    fputs("Bookloupe comes with ABSOLUTELY NO WARRANTY. "
      "For details, read the file COPYING.\n",stderr);
    fputs("This is Free Software; "
      "you may redistribute it under certain conditions (GPL);\n",stderr);
    fputs("read the file COPYING for details.\n\n",stderr);
    /* The option summary is a newly-allocated string that we must free. */
    help=g_option_context_get_help(context,TRUE,NULL);
    fputs(help,stderr);
    g_free(help);
    fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
    fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
      "non-ASCII\n",stderr);
    fputs("characters like accented letters, "
      "lines longer than 75 or shorter than 55,\n",stderr);
    fputs("unbalanced quotes or brackets, "
      "a variety of badly formatted punctuation, \n",stderr);
    fputs("HTML tags, some likely typos. "
      "It is NOT a substitute for human judgement.\n",stderr);
    fputs("\n",stderr);
}