bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sun Oct 27 16:58:50 2013 +0000 (2013-10-27)
changeset 201 f1d85b36e188
parent 200 8e0ba1a088c4
parent 185 a6d93c9932ac
child 202 c25e023cb9fe
permissions -rw-r--r--
Merge bug #13: Character sets
ali@0
     1
/*************************************************************************/
ali@40
     2
/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
ali@68
     3
/*									 */
ali@68
     4
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
ali@68
     5
/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
ali@68
     6
/*									 */
ali@0
     7
/* This program is free software; you can redistribute it and/or modify  */
ali@0
     8
/* it under the terms of the GNU General Public License as published by  */
ali@0
     9
/* the Free Software Foundation; either version 2 of the License, or     */
ali@68
    10
/* (at your option) any later version.					 */
ali@68
    11
/*									 */
ali@0
    12
/* This program is distributed in the hope that it will be useful,       */
ali@68
    13
/* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
ali@68
    14
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
ali@68
    15
/* GNU General Public License for more details.				 */
ali@68
    16
/*									 */
ali@68
    17
/* You should have received a copy of the GNU General Public License	 */
ali@68
    18
/* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
ali@0
    19
/*************************************************************************/
ali@0
    20
ali@0
    21
#include <stdio.h>
ali@0
    22
#include <stdlib.h>
ali@0
    23
#include <string.h>
ali@0
    24
#include <ctype.h>
ali@73
    25
#ifdef __WIN32__
ali@73
    26
#include <windows.h>
ali@73
    27
#endif
ali@69
    28
#include <glib.h>
ali@69
    29
#include <bl/bl.h>
ali@99
    30
#include "bookloupe.h"
ali@99
    31
#include "counters.h"
ali@103
    32
#include "pending.h"
ali@71
    33
#include "HTMLentities.h"
ali@0
    34
ali@185
    35
gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
ali@185
    36
GIConv charset_validator=(GIConv)-1;
ali@185
    37
ali@69
    38
gchar *prevline;
ali@0
    39
ali@40
    40
/*
 * Common typos, all in lower case. The list is closed by an empty
 * string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se",
    ""  /* end of list */
};
ali@0
    73
ali@69
    74
GTree *usertypo;
ali@0
    75
ali@40
    76
/*
 * Common abbreviations and other acceptable words which should not be
 * queried as typos. The list is closed by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten",
    ""  /* end of list */
};
ali@0
    83
ali@40
    84
/*
 * Common abbreviations that account for otherwise unexplained periods.
 * The list is closed by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
    ""  /* end of list */
};
ali@0
    89
ali@40
    90
/*
 * Two-letter combinations that rarely, if ever, begin a word but which
 * are common scannos or otherwise common letter combinations.
 * The list is closed by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj",
    ""  /* end of list */
};
ali@0
    97
ali@40
    98
/*
 * Two-letter combinations that rarely, if ever, finish a word but which
 * are common scannos or otherwise common letter combinations.
 * The list is closed by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy",
    ""  /* end of list */
};
ali@0
   106
ali@40
   107
/*
 * Lower-case HTML tag names. The list is closed by an empty string.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul",
    ""  /* end of list */
};
ali@0
   113
ali@40
   114
/*
 * DP-specific markup sequences. The list is closed by an empty string.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>",
    ""  /* end of list */
};
ali@0
   117
ali@40
   118
/*
 * NOTE(review): going by the name, these are words that should not be
 * directly followed by a comma -- confirm against the code that uses it.
 * The list is closed by an empty string.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among",
    ""  /* end of list */
};
ali@0
   124
ali@40
   125
/*
 * NOTE(review): going by the name, these are words that should not be
 * directly followed by a period -- confirm against the code that uses it.
 * The list is closed by an empty string.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence",
    ""  /* end of list */
};
ali@0
   131
ali@69
   132
gboolean pswit[SWITNO];  /* program switches */
ali@185
   133
gchar *opt_charset;
ali@0
   134
ali@198
   135
gboolean typo_compat,paranoid_compat;
ali@198
   136
ali@69
   137
static GOptionEntry options[]={
ali@69
   138
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@69
   139
      "Ignore DP-specific markup", NULL },
ali@198
   140
    { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   141
      G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@198
   142
      "Don't ignore DP-specific markup", NULL },
ali@198
   143
    { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@198
   144
      "Echo queried line", NULL },
ali@198
   145
    { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
ali@198
   146
      G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@69
   147
      "Don't echo queried line", NULL },
ali@69
   148
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@69
   149
      "Check single quotes", NULL },
ali@198
   150
    { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   151
      G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@198
   152
      "Don't check single quotes", NULL },
ali@198
   153
    { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@69
   154
      "Check common typos", NULL },
ali@198
   155
    { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   156
      G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@198
   157
      "Don't check common typos", NULL },
ali@69
   158
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@69
   159
      "Require closure of quotes on every paragraph", NULL },
ali@198
   160
    { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   161
      G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@198
   162
      "Don't require closure of quotes on every paragraph", NULL },
ali@198
   163
    { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
ali@198
   164
      G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@198
   165
      "Enable paranoid querying of everything", NULL },
ali@198
   166
    { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
ali@198
   167
      G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@69
   168
      "Disable paranoid querying of everything", NULL },
ali@198
   169
    { "line-end", 0, G_OPTION_FLAG_HIDDEN,
ali@198
   170
      G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@198
   171
      "Enable line end checking", NULL },
ali@198
   172
    { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
ali@198
   173
      G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@69
   174
      "Disable line end checking", NULL },
ali@69
   175
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@69
   176
      "Overview: just show counts", NULL },
ali@198
   177
    { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   178
      G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@198
   179
      "Show individual warnings", NULL },
ali@69
   180
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@69
   181
      "Output errors to stdout instead of stderr", NULL },
ali@198
   182
    { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   183
      G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@198
   184
      "Output errors to stderr instead of stdout", NULL },
ali@69
   185
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@69
   186
      "Echo header fields", NULL },
ali@198
   187
    { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   188
      G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@198
   189
      "Don't echo header fields", NULL },
ali@69
   190
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@69
   191
      "Ignore markup in < >", NULL },
ali@198
   192
    { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   193
      G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@198
   194
      "No special handling for markup in < >", NULL },
ali@69
   195
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@69
   196
      "Use file of user-defined typos", NULL },
ali@198
   197
    { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   198
      G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@198
   199
      "Ignore file of user-defined typos", NULL },
ali@198
   200
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@198
   201
      "Verbose - list everything", NULL },
ali@198
   202
    { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
ali@198
   203
      G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@198
   204
      "Switch off verbose mode", NULL },
ali@198
   205
    { NULL }
ali@198
   206
};
ali@198
   207
ali@198
   208
/*
ali@198
   209
 * Options relating to configuration which make no sense from inside
ali@198
   210
 * a configuration file.
ali@198
   211
 */
ali@198
   212
ali@198
   213
static GOptionEntry config_options[]={
ali@69
   214
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
ali@69
   215
      "Defaults for use on www upload", NULL },
ali@198
   216
    { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
ali@198
   217
      "Dump current config settings", NULL },
ali@185
   218
    { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
ali@185
   219
      "Set of characters valid for this ebook", "NAME" },
ali@69
   220
    { NULL }
ali@69
   221
};
ali@0
   222
ali@198
   223
static GOptionEntry compatibility_options[]={
ali@198
   224
    { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
ali@198
   225
      "Toggle checking for common typos", NULL },
ali@198
   226
    { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
ali@198
   227
      "Toggle both paranoid mode and common typos", NULL },
ali@69
   228
    { NULL }
ali@69
   229
};
ali@0
   230
ali@142
   231
/* Query counts kept for overview mode, one per category. */
long cnt_quote;         /* quote queries */
long cnt_brack;         /* brackets queries */
long cnt_bin;           /* non-ASCII queries */
long cnt_odd;           /* odd character queries */
long cnt_long;          /* long line errors */
long cnt_short;         /* short line queries */
long cnt_punct;         /* punctuation and spacing queries */
long cnt_dash;          /* dash-related queries */
long cnt_word;          /* word queries */
long cnt_html;          /* html queries */
long cnt_lineend;       /* line-end queries */

/* Line statistics. */
long cnt_spacend;       /* lines with space at end */
long linecnt;           /* total lines in the file */
long checked_linecnt;   /* lines actually checked */
ali@0
   246
ali@69
   247
void proghelp(GOptionContext *context);
ali@69
   248
void procfile(const char *);
ali@0
   249
ali@69
   250
gchar *running_from;
ali@0
   251
ali@70
   252
gboolean mixdigit(const char *);
ali@69
   253
gchar *getaword(const char **);
ali@199
   254
char *flgets(char **,long,int);
ali@0
   255
void postprocess_for_HTML(char *);
ali@0
   256
char *linehasmarkup(char *);
ali@0
   257
char *losemarkup(char *);
ali@70
   258
gboolean tagcomp(const char *,const char *);
ali@71
   259
void loseentities(char *);
ali@69
   260
gboolean isroman(const char *);
ali@0
   261
void postprocess_for_DP(char *);
ali@72
   262
void print_as_windows_1252(const char *string);
ali@72
   263
void print_as_utf_8(const char *string);
ali@0
   264
ali@69
   265
GTree *qword,*qperiod;
ali@68
   266
ali@73
   267
#ifdef __WIN32__
ali@73
   268
UINT saved_cp;
ali@73
   269
#endif
ali@73
   270
ali@198
   271
GKeyFile *config;
ali@198
   272
ali@198
   273
/*
 * config_file_update:
 *
 * Store the current value of every switch in options[] in the "options"
 * group of kf, keyed by the option's long name. The "no-" variants are
 * skipped. An entry flagged G_OPTION_FLAG_REVERSE is stored inverted.
 */
void config_file_update(GKeyFile *kf)
{
    const GOptionEntry *entry;
    gboolean value;
    for(entry=options;entry->long_name;entry++)
    {
        if (g_str_has_prefix(entry->long_name,"no-"))
            continue;
        if (entry->arg!=G_OPTION_ARG_NONE)
        {
            /* Only boolean switches are expected in options[] */
            g_assert_not_reached();
            continue;
        }
        value=*(gboolean *)entry->arg_data;
        if (entry->flags&G_OPTION_FLAG_REVERSE)
            value=!value;
        g_key_file_set_boolean(kf,"options",entry->long_name,value);
    }
}
ali@198
   292
ali@198
   293
/*
 * config_file_add_comments:
 *
 * Put a heading comment at the top of kf and attach each option's
 * description, prefixed with a space, as a comment on its key in the
 * "options" group. The "no-" variants are skipped.
 */
void config_file_add_comments(GKeyFile *kf)
{
    const GOptionEntry *entry;
    gchar *text;
    g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
      NULL);
    for(entry=options;entry->long_name;entry++)
        if (!g_str_has_prefix(entry->long_name,"no-"))
        {
            text=g_strconcat(" ",entry->description,NULL);
            g_key_file_set_comment(kf,"options",entry->long_name,text,NULL);
            g_free(text);
        }
}
ali@198
   308
ali@198
   309
void dump_config(void)
ali@198
   310
{
ali@198
   311
    gchar *s;
ali@198
   312
    if (config)
ali@198
   313
	config_file_update(config);
ali@198
   314
    else
ali@198
   315
    {
ali@198
   316
	config=g_key_file_new();
ali@198
   317
	config_file_update(config);
ali@198
   318
	config_file_add_comments(config);
ali@198
   319
    }
ali@198
   320
    s=g_key_file_to_data(config,NULL,NULL);
ali@198
   321
    if (s)
ali@198
   322
	g_print("%s",s);
ali@198
   323
    g_free(s);
ali@198
   324
}
ali@198
   325
ali@198
   326
/*
 * read_config_file:
 *
 * Look for "bookloupe.ini" and load the first copy found.
 *
 * The directories searched are those listed in the environment variable
 * BOOKLOUPE_CONFIG_PATH (separated by ";" on Windows, ":" elsewhere).
 * If that is not set, the current directory, the directory bookloupe
 * was run from and the user's configuration directory are tried, in
 * that order.
 *
 * A file that exists but cannot be loaded is a fatal error: a message
 * is printed and the program exits with status 1.
 *
 * Returns the loaded key file, or NULL if no file was found. If
 * full_path is not NULL, it receives the newly allocated path of the
 * file that was loaded (to be freed with g_free()), or NULL if no file
 * was found.
 */
GKeyFile *read_config_file(gchar **full_path)
{
    int i;
    GError *err=NULL;
    gchar **search_dirs;
    /*
     * Must start out as NULL: with an empty search list the loop below
     * never assigns it, yet it is freed or handed back afterwards.
     */
    gchar *path=NULL;
    const char *search_path;
    GKeyFile *kf;
    kf=g_key_file_new();
    search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
    if (search_path)
    {
#ifdef __WIN32__
        search_dirs=g_strsplit(search_path,";",0);
#else
        search_dirs=g_strsplit(search_path,":",0);
#endif
    }
    else
    {
        search_dirs=g_new(gchar *,4);
        search_dirs[0]=g_get_current_dir();
        search_dirs[1]=g_strdup(running_from);
        search_dirs[2]=g_strdup(g_get_user_config_dir());
        search_dirs[3]=NULL;
    }
    for(i=0;search_dirs[i];i++)
    {
        path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
        if (g_key_file_load_from_file(kf,path,
          G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
            break;
        /* A missing file just means trying the next directory */
        if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
        {
            g_printerr("Bookloupe: Error reading %s\n",path);
            g_printerr("%s\n",err->message);
            exit(1);
        }
        g_clear_error(&err);
        g_free(path);
        path=NULL;
    }
    if (!search_dirs[i])
    {
        /* Ran off the end of the list: nothing was loaded */
        g_key_file_free(kf);
        kf=NULL;
    }
    g_strfreev(search_dirs);
    /* path is only non-NULL here when a file was loaded */
    if (full_path)
        *full_path=path;
    else
        g_free(path);
    return kf;
}
ali@198
   380
ali@198
   381
void parse_config_file(void)
ali@198
   382
{
ali@198
   383
    int i,j;
ali@198
   384
    gchar *path;
ali@198
   385
    gchar **keys;
ali@198
   386
    gboolean sw;
ali@198
   387
    GError *err=NULL;
ali@198
   388
    config=read_config_file(&path);
ali@198
   389
    if (config)
ali@198
   390
	keys=g_key_file_get_keys(config,"options",NULL,NULL);
ali@198
   391
    else
ali@198
   392
	keys=NULL;
ali@198
   393
    if (keys)
ali@198
   394
    {
ali@198
   395
	for(i=0;keys[i];i++)
ali@198
   396
	{
ali@198
   397
	    for(j=0;options[j].long_name;j++)
ali@198
   398
	    {
ali@198
   399
		if (g_str_has_prefix(options[j].long_name,"no-"))
ali@198
   400
		    continue;
ali@198
   401
		else if (!strcmp(keys[i],options[j].long_name))
ali@198
   402
		{
ali@198
   403
		    if (options[j].arg==G_OPTION_ARG_NONE)
ali@198
   404
		    {
ali@198
   405
			sw=g_key_file_get_boolean(config,"options",keys[i],
ali@198
   406
			  &err);
ali@198
   407
			if (err)
ali@198
   408
			{
ali@198
   409
			    g_printerr("Bookloupe: %s: options.%s: %s\n",
ali@198
   410
			      path,keys[i],err->message);
ali@198
   411
			    g_clear_error(&err);
ali@198
   412
			}
ali@198
   413
			if (options[j].flags&G_OPTION_FLAG_REVERSE)
ali@198
   414
			    sw=!sw;
ali@198
   415
			*(gboolean *)options[j].arg_data=sw;
ali@198
   416
			break;
ali@198
   417
		    }
ali@198
   418
		    else
ali@198
   419
			g_assert_not_reached();
ali@198
   420
		}
ali@198
   421
	    }
ali@198
   422
	    if (!options[j].long_name)
ali@198
   423
		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
ali@198
   424
		  path,keys[i]);
ali@198
   425
	}
ali@198
   426
	g_strfreev(keys);
ali@198
   427
    }
ali@198
   428
    if (config)
ali@198
   429
	g_free(path);
ali@198
   430
}
ali@198
   431
ali@185
   432
/*
 * set_charset:
 *
 * Select the set of characters considered valid for the ebook.
 *
 * name is an encoding name, or NULL or "auto" for automatic selection,
 * which leaves charset as NULL. Every UNICODE encoding is recorded as
 * "UTF-8" since they all share one character set. For any other name a
 * converter from UTF-8 to that encoding is opened in charset_validator.
 * Names are compared without regard to (ASCII) case.
 *
 * Any previous setting is released first. Returns TRUE on success. If
 * the encoding is not known to iconv, FALSE is returned and err is set;
 * charset then still holds the rejected name with no converter open.
 */
gboolean set_charset(const char *name,GError **err)
{
    /* The various UNICODE encodings all share the same character set. */
    static const char *const unicode_aliases[]={ "UCS-2", "UCS-2BE",
      "UCS-2LE", "UCS-4", "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE",
      "UNICODEBIG", "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE",
      "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16",
      "UTF16BE", "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
    gsize i;
    gchar *requested=NULL;
    /*
     * Take the copy before releasing the old value, in case name points
     * at the current charset. The ASCII comparison is used so that the
     * result does not depend on the locale.
     */
    if (name && g_ascii_strcasecmp(name,"auto"))
        requested=g_strdup(name);
    g_free(charset);
    charset=NULL;
    if (charset_validator!=(GIConv)-1)
    {
        g_iconv_close(charset_validator);
        charset_validator=(GIConv)-1;
    }
    if (!requested)
        return TRUE;
    for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
        if (!g_ascii_strcasecmp(requested,unicode_aliases[i]))
        {
            g_free(requested);
            requested=g_strdup("UTF-8");
            break;
        }
    charset=requested;
    /* No converter is opened for UTF-8 */
    if (!strcmp(charset,"UTF-8"))
        return TRUE;
    charset_validator=g_iconv_open(charset,"UTF-8");
    if (charset_validator==(GIConv)-1)
    {
        g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
          "Unknown character set \"%s\"",charset);
        return FALSE;
    }
    return TRUE;
}
ali@185
   474
ali@69
   475
void parse_options(int *argc,char ***argv)
ali@0
   476
{
ali@69
   477
    GError *err=NULL;
ali@69
   478
    GOptionContext *context;
ali@198
   479
    GOptionGroup *compatibility;
ali@69
   480
    context=g_option_context_new(
ali@198
   481
      "file - look for errors in Project Gutenberg(TM) etexts");
ali@69
   482
    g_option_context_add_main_entries(context,options,NULL);
ali@198
   483
    g_option_context_add_main_entries(context,config_options,NULL);
ali@198
   484
    compatibility=g_option_group_new("compatibility",
ali@198
   485
      "Options for Compatibility with Gutcheck:",
ali@198
   486
      "Show compatibility options",NULL,NULL);
ali@198
   487
    g_option_group_add_entries(compatibility,compatibility_options);
ali@198
   488
    g_option_context_add_group(context,compatibility);
ali@198
   489
    g_option_context_set_description(context,
ali@198
   490
      "For simplicity, only the switch options which reverse the\n"
ali@198
   491
      "default configuration are listed. In most cases, both vanilla\n"
ali@198
   492
      "and \"no-\" prefixed versions are available for use.");
ali@69
   493
    if (!g_option_context_parse(context,argc,argv,&err))
ali@69
   494
    {
ali@69
   495
	g_printerr("Bookloupe: %s\n",err->message);
ali@69
   496
	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
ali@69
   497
	exit(1);
ali@69
   498
    }
ali@198
   499
    if (typo_compat)
ali@69
   500
	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@198
   501
    if (paranoid_compat)
ali@198
   502
    {
ali@198
   503
	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
ali@198
   504
	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@198
   505
    }
ali@40
   506
    /*
ali@40
   507
     * Web uploads - for the moment, this is really just a placeholder
ali@40
   508
     * until we decide what processing we really want to do on web uploads
ali@40
   509
     */
ali@40
   510
    if (pswit[WEB_SWITCH])
ali@40
   511
    {
ali@40
   512
	/* specific override for web uploads */
ali@69
   513
	pswit[ECHO_SWITCH]=TRUE;
ali@69
   514
	pswit[SQUOTE_SWITCH]=FALSE;
ali@69
   515
	pswit[TYPO_SWITCH]=TRUE;
ali@69
   516
	pswit[QPARA_SWITCH]=FALSE;
ali@69
   517
	pswit[PARANOID_SWITCH]=TRUE;
ali@69
   518
	pswit[LINE_END_SWITCH]=FALSE;
ali@69
   519
	pswit[OVERVIEW_SWITCH]=FALSE;
ali@69
   520
	pswit[STDOUT_SWITCH]=FALSE;
ali@69
   521
	pswit[HEADER_SWITCH]=TRUE;
ali@69
   522
	pswit[VERBOSE_SWITCH]=FALSE;
ali@69
   523
	pswit[MARKUP_SWITCH]=FALSE;
ali@69
   524
	pswit[USERTYPO_SWITCH]=FALSE;
ali@69
   525
	pswit[DP_SWITCH]=FALSE;
ali@40
   526
    }
ali@185
   527
    if (opt_charset && !set_charset(opt_charset,&err))
ali@185
   528
    {
ali@185
   529
	g_printerr("%s\n",err->message);
ali@185
   530
	exit(1);
ali@185
   531
    }
ali@198
   532
    if (pswit[DUMP_CONFIG_SWITCH])
ali@198
   533
    {
ali@198
   534
	dump_config();
ali@198
   535
	exit(0);
ali@198
   536
    }
ali@185
   537
    g_free(opt_charset);
ali@185
   538
    opt_charset=NULL;
ali@198
   539
    if (pswit[OVERVIEW_SWITCH])
ali@198
   540
	/* just print summary; don't echo */
ali@198
   541
	pswit[ECHO_SWITCH]=FALSE;
ali@69
   542
    if (*argc<2)
ali@40
   543
    {
ali@69
   544
	proghelp(context);
ali@69
   545
	exit(1);
ali@40
   546
    }
ali@69
   547
    g_option_context_free(context);
ali@69
   548
}
ali@69
   549
ali@69
   550
/*
ali@69
   551
 * read_user_scannos:
ali@69
   552
 *
ali@69
   553
 * Read in the user-defined stealth scanno list.
ali@69
   554
 */
ali@69
   555
void read_user_scannos(void)
ali@69
   556
{
ali@69
   557
    GError *err=NULL;
ali@69
   558
    gchar *usertypo_file;
ali@69
   559
    gboolean okay;
ali@69
   560
    int i;
ali@70
   561
    gsize len,nb;
ali@70
   562
    gchar *contents,*utf8,**lines;
ali@69
   563
    usertypo_file=g_strdup("bookloupe.typ");
ali@69
   564
    okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   565
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   566
    {
ali@69
   567
	g_clear_error(&err);
ali@69
   568
	g_free(usertypo_file);
ali@69
   569
	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
ali@69
   570
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   571
    }
ali@69
   572
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   573
    {
ali@69
   574
	g_clear_error(&err);
ali@69
   575
	g_free(usertypo_file);
ali@69
   576
	usertypo_file=g_strdup("gutcheck.typ");
ali@69
   577
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   578
    }
ali@69
   579
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   580
    {
ali@69
   581
	g_clear_error(&err);
ali@69
   582
	g_free(usertypo_file);
ali@69
   583
	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
ali@69
   584
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   585
    }
ali@69
   586
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   587
    {
ali@69
   588
	g_free(usertypo_file);
ali@70
   589
	g_print("   --> I couldn't find bookloupe.typ "
ali@69
   590
	  "-- proceeding without user typos.\n");
ali@69
   591
	return;
ali@69
   592
    }
ali@69
   593
    else if (!okay)
ali@69
   594
    {
ali@69
   595
	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
ali@69
   596
	g_free(usertypo_file);
ali@69
   597
	g_clear_error(&err);
ali@69
   598
	exit(1);
ali@69
   599
    }
ali@72
   600
    if (g_utf8_validate(contents,len,NULL))
ali@185
   601
    {
ali@72
   602
	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@185
   603
	if (!charset)
ali@185
   604
	    (void)set_charset("UNICODE",NULL);
ali@185
   605
    }
ali@72
   606
    else
ali@72
   607
	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
ali@70
   608
    g_free(contents);
ali@70
   609
    lines=g_strsplit_set(utf8,"\r\n",0);
ali@70
   610
    g_free(utf8);
ali@69
   611
    usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@69
   612
    for (i=0;lines[i];i++)
ali@69
   613
	if (*(unsigned char *)lines[i]>'!')
ali@69
   614
	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
ali@69
   615
	else
ali@69
   616
	    g_free(lines[i]);
ali@69
   617
    g_free(lines);
ali@69
   618
}
ali@69
   619
ali@69
   620
/*
ali@69
   621
 * read_etext:
ali@69
   622
 *
ali@69
   623
 * Read an etext returning a newly allocated string containing the file
ali@69
   624
 * contents or NULL on error.
ali@69
   625
 */
ali@69
   626
gchar *read_etext(const char *filename,GError **err)
ali@69
   627
{
ali@76
   628
    GError *tmp_err=NULL;
ali@70
   629
    gchar *contents,*utf8;
ali@76
   630
    gsize len,bytes_read,bytes_written;
ali@76
   631
    int i,line,col;
ali@69
   632
    if (!g_file_get_contents(filename,&contents,&len,err))
ali@69
   633
	return NULL;
ali@72
   634
    if (g_utf8_validate(contents,len,NULL))
ali@72
   635
    {
ali@72
   636
	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@72
   637
	g_set_print_handler(print_as_utf_8);
ali@73
   638
#ifdef __WIN32__
ali@73
   639
	SetConsoleOutputCP(CP_UTF8);
ali@73
   640
#endif
ali@72
   641
    }
ali@72
   642
    else
ali@72
   643
    {
ali@76
   644
	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
ali@76
   645
	  &bytes_written,&tmp_err);
ali@76
   646
	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
ali@76
   647
	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
ali@76
   648
	{
ali@76
   649
	    line=col=1;
ali@76
   650
	    for(i=0;i<bytes_read;i++)
ali@76
   651
		if (contents[i]=='\n')
ali@76
   652
		{
ali@76
   653
		    line++;
ali@76
   654
		    col=1;
ali@76
   655
		}
ali@76
   656
		else if (contents[i]!='\r')
ali@76
   657
		    col++;
ali@76
   658
	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
ali@76
   659
	      "Input conversion failed. Byte %d at line %d, column %d is not a "
ali@76
   660
	      "valid Windows-1252 character",
ali@76
   661
	      ((unsigned char *)contents)[bytes_read],line,col);
ali@76
   662
	}
ali@76
   663
	else if (tmp_err)
ali@76
   664
	    g_propagate_error(err,tmp_err);
ali@72
   665
	g_set_print_handler(print_as_windows_1252);
ali@73
   666
#ifdef __WIN32__
ali@73
   667
	SetConsoleOutputCP(1252);
ali@73
   668
#endif
ali@72
   669
    }
ali@70
   670
    g_free(contents);
ali@70
   671
    return utf8;
ali@69
   672
}
ali@69
   673
ali@73
   674
/*
 * cleanup_on_exit:
 *
 * On Windows, put the console output code page back to the value held
 * in saved_cp. Does nothing on other platforms.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    /* Nothing useful can be done if this fails, so the result is ignored. */
    (void)SetConsoleOutputCP(saved_cp);
#endif
}
ali@73
   680
ali@69
   681
int main(int argc,char **argv)
ali@69
   682
{
ali@73
   683
#ifdef __WIN32__
ali@73
   684
    atexit(cleanup_on_exit);
ali@73
   685
    saved_cp=GetConsoleOutputCP();
ali@73
   686
#endif
ali@69
   687
    running_from=g_path_get_dirname(argv[0]);
ali@198
   688
    /* Paranoid checking is turned OFF, not on, by its switch */
ali@198
   689
    pswit[PARANOID_SWITCH]=TRUE;
ali@198
   690
    /* if running in paranoid mode, typo checks default to enabled */
ali@198
   691
    pswit[TYPO_SWITCH]=TRUE;
ali@198
   692
    /* Line-end checking is turned OFF, not on, by its switch */
ali@198
   693
    pswit[LINE_END_SWITCH]=TRUE;
ali@198
   694
    /* Echoing is turned OFF, not on, by its switch */
ali@198
   695
    pswit[ECHO_SWITCH]=TRUE;
ali@198
   696
    parse_config_file();
ali@69
   697
    parse_options(&argc,&argv);
ali@40
   698
    if (pswit[USERTYPO_SWITCH])
ali@69
   699
	read_user_scannos();
ali@40
   700
    fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@69
   701
    procfile(argv[1]);
ali@40
   702
    if (pswit[OVERVIEW_SWITCH])
ali@40
   703
    {
ali@70
   704
	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40
   705
	  checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@70
   706
	g_print("    --------------- Queries found --------------\n");
ali@68
   707
	if (cnt_long)
ali@70
   708
	    g_print("    Long lines:		    %14ld\n",cnt_long);
ali@68
   709
	if (cnt_short)
ali@70
   710
	    g_print("    Short lines:		   %14ld\n",cnt_short);
ali@68
   711
	if (cnt_lineend)
ali@70
   712
	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
ali@68
   713
	if (cnt_word)
ali@70
   714
	    g_print("    Common typos:		  %14ld\n",cnt_word);
ali@142
   715
	if (cnt_quote)
ali@142
   716
	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
ali@68
   717
	if (cnt_brack)
ali@70
   718
	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
ali@68
   719
	if (cnt_bin)
ali@70
   720
	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
ali@68
   721
	if (cnt_odd)
ali@70
   722
	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
ali@68
   723
	if (cnt_punct)
ali@70
   724
	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@68
   725
	if (cnt_dash)
ali@70
   726
	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
ali@68
   727
	if (cnt_html)
ali@70
   728
	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
ali@70
   729
	g_print("\n");
ali@70
   730
	g_print("    TOTAL QUERIES		  %14ld\n",
ali@142
   731
	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
ali@142
   732
	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40
   733
    }
ali@69
   734
    g_free(running_from);
ali@69
   735
    if (usertypo)
ali@69
   736
	g_tree_unref(usertypo);
ali@185
   737
    set_charset(NULL,NULL);
ali@198
   738
    if (config)
ali@198
   739
	g_key_file_free(config);
ali@40
   740
    return 0;
ali@0
   741
}
ali@0
   742
ali@147
   743
void count_dashes(const char *line,const char *dash,
ali@147
   744
  struct dash_results *results)
ali@147
   745
{
ali@147
   746
    int i;
ali@147
   747
    gchar **tokens;
ali@147
   748
    gunichar pc,nc;
ali@147
   749
    gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
ali@147
   750
    if (!*line)
ali@147
   751
	return;
ali@147
   752
    tokens=g_strsplit(line,dash,0);
ali@147
   753
    if (tokens[1])
ali@147
   754
	results->base++;
ali@147
   755
    for(i=1;tokens[i];i++)
ali@147
   756
    {
ali@147
   757
	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
ali@147
   758
	nc=g_utf8_get_char(tokens[i]);
ali@147
   759
	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
ali@147
   760
	    spaced=TRUE;
ali@147
   761
	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
ali@147
   762
	    spaced2=TRUE;
ali@147
   763
	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
ali@147
   764
	    unspaced=TRUE;
ali@147
   765
    }
ali@147
   766
    if (spaced)
ali@147
   767
	results->space++;
ali@147
   768
    if (spaced2)
ali@147
   769
	/* count of lines with em-dashes with spaces both sides */
ali@147
   770
	results->non_PG_space++;
ali@147
   771
    if (unspaced)
ali@147
   772
	/* count of lines with PG-type em-dashes with no spaces */
ali@147
   773
	results->PG_space++;
ali@147
   774
    g_strfreev(tokens);
ali@147
   775
}
ali@147
   776
ali@40
   777
/*
ali@41
   778
 * first_pass:
ali@40
   779
 *
ali@41
   780
 * Run a first pass - verify that it's a valid PG
ali@41
   781
 * file, decide whether to report some things that
ali@41
   782
 * occur many times in the text like long or short
ali@41
   783
 * lines, non-standard dashes, etc.
ali@40
   784
 */
ali@69
   785
struct first_pass_results *first_pass(const char *etext)
ali@0
   786
{
ali@70
   787
    gunichar laststart=CHAR_SPACE;
ali@54
   788
    const char *s;
ali@69
   789
    gchar *lc_line;
ali@70
   790
    int i,j,lbytes,llen;
ali@69
   791
    gchar **lines;
ali@41
   792
    unsigned int lastlen=0,lastblen=0;
ali@41
   793
    long spline=0,nspline=0;
ali@41
   794
    static struct first_pass_results results={0};
ali@147
   795
    struct dash_results tmp_dash_results;
ali@69
   796
    gchar *inword;
ali@142
   797
    QuoteClass qc;
ali@69
   798
    lines=g_strsplit(etext,"\n",0);
ali@199
   799
    if (!lines[0])
ali@199
   800
    {
ali@199
   801
	/* An empty etext has no terminators */
ali@199
   802
	results.newlines=DOS_NEWLINES;
ali@199
   803
    }
ali@199
   804
    else if (!lines[1])
ali@199
   805
    {
ali@199
   806
	/*
ali@199
   807
	 * If there are no LFs, we don't have UNIX-style
ali@199
   808
	 * terminators, but we might have OS9-style ones.
ali@199
   809
	 */
ali@199
   810
	results.newlines=OS9_NEWLINES;
ali@199
   811
	g_strfreev(lines);
ali@199
   812
	lines=g_strsplit(etext,"\r",0);
ali@199
   813
	if (!lines[0] || !lines[1])
ali@199
   814
	    /* Looks like we don't have any terminators at all */
ali@199
   815
	    results.newlines=DOS_NEWLINES;
ali@199
   816
    }
ali@199
   817
    else
ali@199
   818
    {
ali@199
   819
	/* We might have UNIX-style terminators */
ali@199
   820
	results.newlines=UNIX_NEWLINES;
ali@199
   821
    }
ali@69
   822
    for (j=0;lines[j];j++)
ali@40
   823
    {
ali@70
   824
	lbytes=strlen(lines[j]);
ali@199
   825
	if (lbytes>0 && lines[j][lbytes-1]=='\r')
ali@199
   826
	{
ali@199
   827
	    results.newlines=DOS_NEWLINES;
ali@199
   828
	    do
ali@199
   829
	    {
ali@199
   830
		lines[j][--lbytes]='\0';
ali@199
   831
	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
ali@199
   832
	}
ali@70
   833
	llen=g_utf8_strlen(lines[j],lbytes);
ali@68
   834
	linecnt++;
ali@69
   835
	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
ali@69
   836
	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
ali@40
   837
	{
ali@68
   838
	    if (spline)
ali@70
   839
		g_print("   --> Duplicate header?\n");
ali@68
   840
	    spline=linecnt+1;   /* first line of non-header text, that is */
ali@40
   841
	}
ali@69
   842
	if (!strncmp(lines[j],"*** START",9) &&
ali@69
   843
	  strstr(lines[j],"PROJECT GUTENBERG"))
ali@40
   844
	{
ali@68
   845
	    if (nspline)
ali@70
   846
		g_print("   --> Duplicate header?\n");
ali@68
   847
	    nspline=linecnt+1;   /* first line of non-header text, that is */
ali@40
   848
	}
ali@68
   849
	if (spline || nspline)
ali@40
   850
	{
ali@70
   851
	    lc_line=g_utf8_strdown(lines[j],lbytes);
ali@69
   852
	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
ali@40
   853
	    {
ali@69
   854
		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
ali@40
   855
		{
ali@68
   856
		    if (results.footerline)
ali@40
   857
		    {
ali@40
   858
			/* it's an old-form header - we can detect duplicates */
ali@68
   859
			if (!nspline)
ali@70
   860
			    g_print("   --> Duplicate footer?\n");
ali@40
   861
		    }
ali@68
   862
		    else
ali@68
   863
			results.footerline=linecnt;
ali@40
   864
		}
ali@40
   865
	    }
ali@69
   866
	    g_free(lc_line);
ali@40
   867
	}
ali@68
   868
	if (spline)
ali@41
   869
	    results.firstline=spline;
ali@68
   870
	if (nspline)
ali@41
   871
	    results.firstline=nspline;  /* override with new */
ali@68
   872
	if (results.footerline)
ali@40
   873
	    continue;    /* don't count the boilerplate in the footer */
ali@68
   874
	results.totlen+=llen;
ali@70
   875
	for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@40
   876
	{
ali@70
   877
	    if (g_utf8_get_char(s)>127)
ali@41
   878
		results.binlen++;
ali@70
   879
	    if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@41
   880
		results.alphalen++;
ali@142
   881
	    if (s>lines[j])
ali@142
   882
	    {
ali@142
   883
		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
ali@142
   884
		    qc=QUOTE_CLASS(g_utf8_get_char(s));
ali@142
   885
		else
ali@142
   886
		    qc=INVALID_QUOTE;
ali@142
   887
		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
ali@147
   888
		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
ali@142
   889
		    results.endquote_count++;
ali@142
   890
	    }
ali@40
   891
	}
ali@69
   892
	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
ali@69
   893
	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41
   894
	    results.shortline++;
ali@70
   895
	if (lbytes>0 &&
ali@70
   896
	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
ali@40
   897
	    cnt_spacend++;
ali@69
   898
	if (strstr(lines[j],".,"))
ali@41
   899
	    results.dotcomma++;
ali@68
   900
	/* only count ast lines for ignoring purposes where there is */
ali@68
   901
	/* locase text on the line */
ali@69
   902
	if (strchr(lines[j],'*'))
ali@40
   903
	{
ali@70
   904
	    for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@70
   905
		if (g_unichar_islower(g_utf8_get_char(s)))
ali@68
   906
		    break;
ali@70
   907
	    if (*s)
ali@41
   908
		results.astline++;
ali@40
   909
	}
ali@69
   910
	if (strchr(lines[j],'/'))
ali@68
   911
	    results.fslashline++;
ali@82
   912
	if (lbytes>0)
ali@82
   913
	{
ali@82
   914
	    for (s=g_utf8_prev_char(lines[j]+lbytes);
ali@82
   915
	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
ali@82
   916
	      s=g_utf8_prev_char(s))
ali@82
   917
		;
ali@82
   918
	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
ali@82
   919
	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@82
   920
		results.hyphens++;
ali@82
   921
	}
ali@68
   922
	if (llen>LONGEST_PG_LINE)
ali@41
   923
	    results.longline++;
ali@68
   924
	if (llen>WAY_TOO_LONG)
ali@41
   925
	    results.verylongline++;
ali@69
   926
	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
ali@40
   927
	{
ali@69
   928
	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
ali@68
   929
	    if (i>0)
ali@68
   930
		results.htmcount++;
ali@69
   931
	    if (strstr(lines[j],"<i>"))
ali@41
   932
		results.htmcount+=4; /* bonus marks! */
ali@40
   933
	}
ali@68
   934
	/* Check for spaced em-dashes */
ali@147
   935
	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
ali@147
   936
	count_dashes(lines[j],"--",&tmp_dash_results);
ali@147
   937
	count_dashes(lines[j],"—",&tmp_dash_results);
ali@147
   938
	if (tmp_dash_results.base)
ali@147
   939
	    results.emdash.base++;
ali@147
   940
	if (tmp_dash_results.non_PG_space)
ali@147
   941
	    results.emdash.non_PG_space++;
ali@147
   942
	if (tmp_dash_results.PG_space)
ali@147
   943
	    results.emdash.PG_space++;
ali@69
   944
	for (s=lines[j];*s;)
ali@40
   945
	{
ali@69
   946
	    inword=getaword(&s);
ali@68
   947
	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
ali@68
   948
		results.Dutchcount++;
ali@68
   949
	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
ali@68
   950
		results.Frenchcount++;
ali@68
   951
	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
ali@68
   952
		results.standalone_digit++;
ali@69
   953
	    g_free(inword);
ali@40
   954
	}
ali@68
   955
	/* Check for spaced dashes */
ali@69
   956
	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
ali@41
   957
	    results.spacedash++;
ali@68
   958
	lastblen=lastlen;
ali@69
   959
	lastlen=llen;
ali@69
   960
	laststart=lines[j][0];
ali@40
   961
    }
ali@69
   962
    g_strfreev(lines);
ali@41
   963
    return &results;
ali@41
   964
}
ali@41
   965
ali@42
   966
/*
ali@42
   967
 * report_first_pass:
ali@42
   968
 *
ali@42
   969
 * Make some snap decisions based on the first pass results.
ali@42
   970
 */
ali@42
   971
struct warnings *report_first_pass(struct first_pass_results *results)
ali@42
   972
{
ali@42
   973
    static struct warnings warnings={0};
ali@199
   974
    warnings.newlines=results->newlines;
ali@199
   975
    if (warnings.newlines==UNIX_NEWLINES)
ali@199
   976
	g_print("   --> No lines in this file have a CR. Not reporting them. "
ali@199
   977
	  "Project Gutenberg requires that all lineends be CR-LF.\n");
ali@199
   978
    else if (warnings.newlines==OS9_NEWLINES)
ali@199
   979
	g_print("   --> No lines in this file have a LF. Not reporting them. "
ali@199
   980
	  "Project Gutenberg requires that all lineends be CR-LF.\n");
ali@42
   981
    if (cnt_spacend>0)
ali@70
   982
	g_print("   --> %ld lines in this file have white space at end\n",
ali@42
   983
	  cnt_spacend);
ali@42
   984
    warnings.dotcomma=1;
ali@42
   985
    if (results->dotcomma>5)
ali@42
   986
    {
ali@68
   987
	warnings.dotcomma=0;
ali@70
   988
	g_print("   --> %ld lines in this file contain '.,'. "
ali@42
   989
	  "Not reporting them.\n",results->dotcomma);
ali@42
   990
    }
ali@42
   991
    /*
ali@42
   992
     * If more than 50 lines, or one-tenth, are short,
ali@42
   993
     * don't bother reporting them.
ali@42
   994
     */
ali@42
   995
    warnings.shortline=1;
ali@42
   996
    if (results->shortline>50 || results->shortline*10>linecnt)
ali@42
   997
    {
ali@68
   998
	warnings.shortline=0;
ali@70
   999
	g_print("   --> %ld lines in this file are short. "
ali@42
  1000
	  "Not reporting short lines.\n",results->shortline);
ali@42
  1001
    }
ali@42
  1002
    /*
ali@42
  1003
     * If more than 50 lines, or one-tenth, are long,
ali@42
  1004
     * don't bother reporting them.
ali@42
  1005
     */
ali@42
  1006
    warnings.longline=1;
ali@42
  1007
    if (results->longline>50 || results->longline*10>linecnt)
ali@42
  1008
    {
ali@68
  1009
	warnings.longline=0;
ali@70
  1010
	g_print("   --> %ld lines in this file are long. "
ali@42
  1011
	  "Not reporting long lines.\n",results->longline);
ali@42
  1012
    }
ali@42
  1013
    /* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42
  1014
    warnings.ast=1;
ali@42
  1015
    if (results->astline>10)
ali@42
  1016
    {
ali@68
  1017
	warnings.ast=0;
ali@70
  1018
	g_print("   --> %ld lines in this file contain asterisks. "
ali@42
  1019
	  "Not reporting them.\n",results->astline);
ali@42
  1020
    }
ali@42
  1021
    /*
ali@42
  1022
     * If more than 10 lines contain forward slashes,
ali@42
  1023
     * don't bother reporting them.
ali@42
  1024
     */
ali@42
  1025
    warnings.fslash=1;
ali@42
  1026
    if (results->fslashline>10)
ali@42
  1027
    {
ali@68
  1028
	warnings.fslash=0;
ali@70
  1029
	g_print("   --> %ld lines in this file contain forward slashes. "
ali@42
  1030
	  "Not reporting them.\n",results->fslashline);
ali@42
  1031
    }
ali@42
  1032
    /*
ali@42
  1033
     * If more than 20 lines contain unpunctuated endquotes,
ali@42
  1034
     * don't bother reporting them.
ali@42
  1035
     */
ali@42
  1036
    warnings.endquote=1;
ali@42
  1037
    if (results->endquote_count>20)
ali@42
  1038
    {
ali@68
  1039
	warnings.endquote=0;
ali@70
  1040
	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
ali@42
  1041
	  "Not reporting them.\n",results->endquote_count);
ali@42
  1042
    }
ali@42
  1043
    /*
ali@42
  1044
     * If more than 15 lines contain standalone digits,
ali@42
  1045
     * don't bother reporting them.
ali@42
  1046
     */
ali@42
  1047
    warnings.digit=1;
ali@42
  1048
    if (results->standalone_digit>10)
ali@42
  1049
    {
ali@68
  1050
	warnings.digit=0;
ali@70
  1051
	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
ali@42
  1052
	  "Not reporting them.\n",results->standalone_digit);
ali@42
  1053
    }
ali@42
  1054
    /*
ali@42
  1055
     * If more than 20 lines contain hyphens at end,
ali@42
  1056
     * don't bother reporting them.
ali@42
  1057
     */
ali@42
  1058
    warnings.hyphen=1;
ali@42
  1059
    if (results->hyphens>20)
ali@42
  1060
    {
ali@68
  1061
	warnings.hyphen=0;
ali@70
  1062
	g_print("   --> %ld lines in this file have hyphens at end. "
ali@42
  1063
	  "Not reporting them.\n",results->hyphens);
ali@42
  1064
    }
ali@42
  1065
    if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42
  1066
    {
ali@70
  1067
	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@68
  1068
	pswit[MARKUP_SWITCH]=1;
ali@42
  1069
    }
ali@42
  1070
    if (results->verylongline>0)
ali@70
  1071
	g_print("   --> %ld lines in this file are VERY long!\n",
ali@42
  1072
	  results->verylongline);
ali@42
  1073
    /*
ali@42
  1074
     * If there are more non-PG spaced dashes than PG em-dashes,
ali@42
  1075
     * assume it's deliberate.
ali@42
  1076
     * Current PG guidelines say don't use them, but older texts do,
ali@42
  1077
     * and some people insist on them whatever the guidelines say.
ali@42
  1078
     */
ali@42
  1079
    warnings.dash=1;
ali@147
  1080
    if (results->spacedash+results->emdash.non_PG_space>
ali@147
  1081
      results->emdash.PG_space)
ali@42
  1082
    {
ali@68
  1083
	warnings.dash=0;
ali@70
  1084
	g_print("   --> There are %ld spaced dashes and em-dashes. "
ali@42
  1085
	  "Not reporting them.\n",
ali@147
  1086
	  results->spacedash+results->emdash.non_PG_space);
ali@42
  1087
    }
ali@185
  1088
    if (charset)
ali@185
  1089
	warnings.bin=0;
ali@185
  1090
    else
ali@42
  1091
    {
ali@185
  1092
	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
ali@185
  1093
	warnings.bin=1;
ali@185
  1094
	/* If more than a quarter of characters are hi-bit, bug out. */
ali@185
  1095
	if (results->binlen*4>results->totlen)
ali@185
  1096
	{
ali@185
  1097
	    g_print("   --> This file does not appear to be ASCII. "
ali@185
  1098
	      "Terminating. Best of luck with it!\n");
ali@185
  1099
	    exit(1);
ali@185
  1100
	}
ali@185
  1101
	if (results->alphalen*4<results->totlen)
ali@185
  1102
	{
ali@185
  1103
	    g_print("   --> This file does not appear to be text. "
ali@185
  1104
	      "Terminating. Best of luck with it!\n");
ali@185
  1105
	    exit(1);
ali@185
  1106
	}
ali@185
  1107
	if (results->binlen*100>results->totlen || results->binlen>100)
ali@185
  1108
	{
ali@185
  1109
	    g_print("   --> There are a lot of foreign letters here. "
ali@185
  1110
	      "Not reporting them.\n");
ali@185
  1111
	    if (!pswit[VERBOSE_SWITCH])
ali@185
  1112
		warnings.bin=0;
ali@185
  1113
	}
ali@42
  1114
    }
ali@69
  1115
    warnings.isDutch=FALSE;
ali@42
  1116
    if (results->Dutchcount>50)
ali@42
  1117
    {
ali@69
  1118
	warnings.isDutch=TRUE;
ali@70
  1119
	g_print("   --> This looks like Dutch - "
ali@42
  1120
	  "switching off dashes and warnings for 's Middags case.\n");
ali@42
  1121
    }
ali@69
  1122
    warnings.isFrench=FALSE;
ali@42
  1123
    if (results->Frenchcount>50)
ali@42
  1124
    {
ali@69
  1125
	warnings.isFrench=TRUE;
ali@70
  1126
	g_print("   --> This looks like French - "
ali@42
  1127
	  "switching off some doublepunct.\n");
ali@42
  1128
    }
ali@42
  1129
    if (results->firstline && results->footerline)
ali@70
  1130
	g_print("    The PG header and footer appear to be already on.\n");
ali@42
  1131
    else
ali@42
  1132
    {
ali@68
  1133
	if (results->firstline)
ali@70
  1134
	    g_print("    The PG header is on - no footer.\n");
ali@68
  1135
	if (results->footerline)
ali@70
  1136
	    g_print("    The PG footer is on - no header.\n");
ali@42
  1137
    }
ali@70
  1138
    g_print("\n");
ali@42
  1139
    if (pswit[VERBOSE_SWITCH])
ali@42
  1140
    {
ali@68
  1141
	warnings.shortline=1;
ali@68
  1142
	warnings.dotcomma=1;
ali@68
  1143
	warnings.longline=1;
ali@68
  1144
	warnings.dash=1;
ali@68
  1145
	warnings.digit=1;
ali@68
  1146
	warnings.ast=1;
ali@68
  1147
	warnings.fslash=1;
ali@68
  1148
	warnings.hyphen=1;
ali@68
  1149
	warnings.endquote=1;
ali@70
  1150
	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
ali@42
  1151
    }
ali@42
  1152
    if (warnings.isDutch)
ali@68
  1153
	warnings.dash=0;
ali@42
  1154
    if (results->footerline>0 && results->firstline>0 &&
ali@42
  1155
      results->footerline>results->firstline &&
ali@42
  1156
      results->footerline-results->firstline<100)
ali@42
  1157
    {
ali@70
  1158
	g_print("   --> I don't really know where this text starts. \n");
ali@70
  1159
	g_print("       There are no reference points.\n");
ali@70
  1160
	g_print("       I'm going to have to report the header and footer "
ali@42
  1161
	  "as well.\n");
ali@68
  1162
	results->firstline=0;
ali@42
  1163
    }
ali@42
  1164
    return &warnings;
ali@42
  1165
}
ali@42
  1166
ali@43
  1167
/*
ali@43
  1168
 * analyse_quotes:
ali@43
  1169
 *
ali@43
  1170
 * Look along the line, accumulate the count of quotes, and see
ali@43
  1171
 * if this is an empty line - i.e. a line with nothing on it
ali@43
  1172
 * but spaces.
ali@43
  1173
 * If line has just spaces, period, * and/or - on it, don't
ali@43
  1174
 * count it, since empty lines with asterisks or dashes to
ali@43
  1175
 * separate sections are common.
ali@43
  1176
 *
ali@69
  1177
 * Returns: TRUE if the line is empty.
ali@43
  1178
 */
ali@164
  1179
gboolean analyse_quotes(const char *aline,struct counters *counters)
ali@43
  1180
{
ali@68
  1181
    int guessquote=0;
ali@69
  1182
    /* assume the line is empty until proven otherwise */
ali@69
  1183
    gboolean isemptyline=TRUE;
ali@70
  1184
    const char *s=aline,*sprev,*snext;
ali@70
  1185
    gunichar c;
ali@70
  1186
    sprev=NULL;
ali@142
  1187
    GError *tmp_err=NULL;
ali@43
  1188
    while (*s)
ali@43
  1189
    {
ali@70
  1190
	snext=g_utf8_next_char(s);
ali@70
  1191
	c=g_utf8_get_char(s);
ali@142
  1192
	if (CHAR_IS_DQUOTE(c))
ali@142
  1193
	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
ali@142
  1194
	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
ali@43
  1195
	{
ali@43
  1196
	    if (s==aline)
ali@43
  1197
	    {
ali@43
  1198
		/*
ali@142
  1199
		 * At start of line, it can only be a quotation mark.
ali@43
  1200
		 * Hardcode a very common exception!
ali@43
  1201
		 */
ali@70
  1202
		if (!g_str_has_prefix(snext,"tis") &&
ali@70
  1203
		  !g_str_has_prefix(snext,"Tis"))
ali@142
  1204
		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@43
  1205
	    }
ali@70
  1206
	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
ali@70
  1207
	      g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43
  1208
		/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43
  1209
		;
ali@43
  1210
	    /* it's outside a word - let's check it out */
ali@99
  1211
	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
ali@70
  1212
	      g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43
  1213
	    {
ali@142
  1214
		/* certainly looks like a quotation mark */
ali@70
  1215
		if (!g_str_has_prefix(snext,"tis") &&
ali@70
  1216
		  !g_str_has_prefix(snext,"Tis"))
ali@43
  1217
		    /* hardcode a very common exception! */
ali@142
  1218
		{
ali@142
  1219
		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
ali@142
  1220
			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@142
  1221
		    else
ali@142
  1222
			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
ali@142
  1223
		}
ali@43
  1224
	    }
ali@43
  1225
	    else
ali@43
  1226
	    {
ali@142
  1227
		/* now - is it a quotation mark? */
ali@43
  1228
		guessquote=0;   /* accumulate clues */
ali@70
  1229
		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
ali@43
  1230
		{
ali@43
  1231
		    /* it follows a letter - could be either */
ali@43
  1232
		    guessquote++;
ali@70
  1233
		    if (g_utf8_get_char(sprev)=='s')
ali@43
  1234
		    {
ali@43
  1235
			/* looks like a plural apostrophe */
ali@43
  1236
			guessquote-=3;
ali@70
  1237
			if (g_utf8_get_char(snext)==CHAR_SPACE)
ali@70
  1238
			    /* bonus marks! */
ali@43
  1239
			    guessquote-=2;
ali@43
  1240
		    }
ali@142
  1241
		    if (innermost_quote_matches(counters,c))
ali@142
  1242
			/*
ali@142
  1243
			 * Give it the benefit of some doubt,
ali@142
  1244
			 * if a squote is already open.
ali@142
  1245
			 */
ali@142
  1246
			guessquote++;
ali@142
  1247
		    else
ali@142
  1248
			guessquote--;
ali@142
  1249
		    if (guessquote>=0)
ali@142
  1250
			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
ali@43
  1251
		}
ali@43
  1252
		else
ali@142
  1253
		    /* no adjacent letter - it must be a quote of some kind */
ali@142
  1254
		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
ali@43
  1255
	    }
ali@43
  1256
	}
ali@142
  1257
	if (tmp_err)
ali@142
  1258
	{
ali@142
  1259
	    if (pswit[ECHO_SWITCH])
ali@142
  1260
		g_print("\n%s\n",aline);
ali@142
  1261
	    if (!pswit[OVERVIEW_SWITCH])
ali@142
  1262
		g_print("    Line %ld column %ld - %s\n",
ali@142
  1263
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
ali@142
  1264
	    g_clear_error(&tmp_err);
ali@142
  1265
	}
ali@70
  1266
	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
ali@70
  1267
	  c!='\r' && c!='\n')
ali@69
  1268
	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
ali@70
  1269
	if (c==CHAR_UNDERSCORE)
ali@43
  1270
	    counters->c_unders++;
ali@103
  1271
	if (c==CHAR_OPEN_SBRACK)
ali@103
  1272
	{
ali@103
  1273
	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
ali@103
  1274
	      !matching_difference(counters,c) && s==aline &&
ali@103
  1275
	      g_str_has_prefix(s,"[Illustration:"))
ali@103
  1276
		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
ali@103
  1277
	    else
ali@103
  1278
		increment_matching(counters,c,TRUE);
ali@103
  1279
	}
ali@103
  1280
	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
ali@99
  1281
	    increment_matching(counters,c,TRUE);
ali@103
  1282
	if (c==CHAR_CLOSE_SBRACK)
ali@103
  1283
	{
ali@103
  1284
	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
ali@103
  1285
	      !matching_difference(counters,c) && !*snext)
ali@103
  1286
		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
ali@103
  1287
	    else
ali@103
  1288
		increment_matching(counters,c,FALSE);
ali@103
  1289
	}
ali@103
  1290
	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
ali@99
  1291
	    increment_matching(counters,c,FALSE);
ali@70
  1292
	sprev=s;
ali@70
  1293
	s=snext;
ali@43
  1294
    }
ali@43
  1295
    return isemptyline;
ali@43
  1296
}
ali@43
  1297
ali@41
  1298
/*
ali@67
  1299
 * check_for_control_characters:
ali@67
  1300
 *
ali@67
  1301
 * Check for invalid or questionable characters in the line
ali@67
  1302
 * Anything above 127 is invalid for plain ASCII, and
ali@67
  1303
 * non-printable control characters should also be flagged.
ali@67
  1304
 * Tabs should generally not be there.
ali@67
  1305
 */
ali@67
  1306
void check_for_control_characters(const char *aline)
ali@67
  1307
{
ali@70
  1308
    gunichar c;
ali@67
  1309
    const char *s;
ali@70
  1310
    for (s=aline;*s;s=g_utf8_next_char(s))
ali@67
  1311
    {
ali@70
  1312
	c=g_utf8_get_char(s);
ali@67
  1313
	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
ali@67
  1314
	{
ali@67
  1315
	    if (pswit[ECHO_SWITCH])
ali@70
  1316
		g_print("\n%s\n",aline);
ali@67
  1317
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1318
		g_print("    Line %ld column %ld - Control character %u\n",
ali@70
  1319
		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
ali@67
  1320
	    else
ali@67
  1321
		cnt_bin++;
ali@67
  1322
	}
ali@67
  1323
    }
ali@67
  1324
}
ali@67
  1325
ali@67
  1326
/*
ali@44
  1327
 * check_for_odd_characters:
ali@44
  1328
 *
ali@44
  1329
 * Check for binary and other odd characters.
ali@44
  1330
 */
ali@44
  1331
void check_for_odd_characters(const char *aline,const struct warnings *warnings,
ali@69
  1332
  gboolean isemptyline)
ali@44
  1333
{
ali@44
  1334
    /* Don't repeat multiple warnings on one line. */
ali@185
  1335
    gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
ali@70
  1336
    gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
ali@44
  1337
    const char *s;
ali@70
  1338
    gunichar c;
ali@185
  1339
    gsize nb;
ali@185
  1340
    gchar *t;
ali@70
  1341
    for (s=aline;*s;s=g_utf8_next_char(s))
ali@44
  1342
    {
ali@70
  1343
	c=g_utf8_get_char(s);
ali@185
  1344
	if (warnings->bin && !eInvalidChar &&
ali@185
  1345
	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
ali@44
  1346
	{
ali@44
  1347
	    if (pswit[ECHO_SWITCH])
ali@70
  1348
		g_print("\n%s\n",aline);
ali@44
  1349
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1350
		if (c>127 && c<160 || c>255)
ali@70
  1351
		    g_print("    Line %ld column %ld - "
ali@70
  1352
		      "Non-ISO-8859 character %u\n",
ali@70
  1353
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44
  1354
		else
ali@70
  1355
		    g_print("    Line %ld column %ld - "
ali@70
  1356
		      "Non-ASCII character %u\n",
ali@70
  1357
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44
  1358
	    else
ali@44
  1359
		cnt_bin++;
ali@185
  1360
	    eInvalidChar=TRUE;
ali@185
  1361
	}
ali@185
  1362
	if (!eInvalidChar && charset)
ali@185
  1363
	{
ali@185
  1364
	    if (charset_validator==(GIConv)-1)
ali@185
  1365
	    {
ali@185
  1366
		if (!g_unichar_isdefined(c))
ali@185
  1367
		{
ali@185
  1368
		    if (pswit[ECHO_SWITCH])
ali@185
  1369
			g_print("\n%s\n",aline);
ali@185
  1370
		    if (!pswit[OVERVIEW_SWITCH])
ali@185
  1371
			g_print("    Line %ld column %ld - Unassigned UNICODE "
ali@185
  1372
			  "code point U+%04" G_GINT32_MODIFIER "X\n",
ali@185
  1373
			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@185
  1374
		    else
ali@185
  1375
			cnt_bin++;
ali@185
  1376
		    eInvalidChar=TRUE;
ali@185
  1377
		}
ali@185
  1378
		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
ali@185
  1379
		  c>=100000 && c<=0x10FFFD)
ali@185
  1380
		{
ali@185
  1381
		    if (pswit[ECHO_SWITCH])
ali@185
  1382
			g_print("\n%s\n",aline);
ali@185
  1383
		    if (!pswit[OVERVIEW_SWITCH])
ali@185
  1384
			g_print("    Line %ld column %ld - Private Use "
ali@185
  1385
			  "character U+%04" G_GINT32_MODIFIER "X\n",
ali@185
  1386
			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@185
  1387
		    else
ali@185
  1388
			cnt_bin++;
ali@185
  1389
		    eInvalidChar=TRUE;
ali@185
  1390
		}
ali@185
  1391
	    }
ali@185
  1392
	    else
ali@185
  1393
	    {
ali@185
  1394
		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
ali@185
  1395
		  charset_validator,NULL,&nb,NULL);
ali@185
  1396
		if (t)
ali@185
  1397
		    g_free(t);
ali@185
  1398
		else
ali@185
  1399
		{
ali@185
  1400
		    if (pswit[ECHO_SWITCH])
ali@185
  1401
			g_print("\n%s\n",aline);
ali@185
  1402
		    if (!pswit[OVERVIEW_SWITCH])
ali@185
  1403
			g_print("    Line %ld column %ld - Non-%s "
ali@185
  1404
			  "character %u\n",linecnt,
ali@185
  1405
			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
ali@185
  1406
		    else
ali@185
  1407
			cnt_bin++;
ali@185
  1408
		    eInvalidChar=TRUE;
ali@185
  1409
		}
ali@185
  1410
	    }
ali@44
  1411
	}
ali@70
  1412
	if (!eTab && c==CHAR_TAB)
ali@44
  1413
	{
ali@44
  1414
	    if (pswit[ECHO_SWITCH])
ali@70
  1415
		g_print("\n%s\n",aline);
ali@44
  1416
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1417
		g_print("    Line %ld column %ld - Tab character?\n",
ali@70
  1418
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
  1419
	    else
ali@44
  1420
		cnt_odd++;
ali@70
  1421
	    eTab=TRUE;
ali@44
  1422
	}
ali@70
  1423
	if (!eTilde && c==CHAR_TILDE)
ali@44
  1424
	{
ali@44
  1425
	    /*
ali@44
  1426
	     * Often used by OCR software to indicate an
ali@44
  1427
	     * unrecognizable character.
ali@44
  1428
	     */
ali@44
  1429
	    if (pswit[ECHO_SWITCH])
ali@70
  1430
		g_print("\n%s\n",aline);
ali@44
  1431
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1432
		g_print("    Line %ld column %ld - Tilde character?\n",
ali@70
  1433
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
  1434
	    else
ali@44
  1435
		cnt_odd++;
ali@70
  1436
	    eTilde=TRUE;
ali@44
  1437
	}
ali@70
  1438
	if (!eCarat && c==CHAR_CARAT)
ali@44
  1439
	{  
ali@44
  1440
	    if (pswit[ECHO_SWITCH])
ali@70
  1441
		g_print("\n%s\n",aline);
ali@44
  1442
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1443
		g_print("    Line %ld column %ld - Carat character?\n",
ali@70
  1444
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
  1445
	    else
ali@44
  1446
		cnt_odd++;
ali@70
  1447
	    eCarat=TRUE;
ali@44
  1448
	}
ali@70
  1449
	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
ali@44
  1450
	{  
ali@44
  1451
	    if (pswit[ECHO_SWITCH])
ali@70
  1452
		g_print("\n%s\n",aline);
ali@44
  1453
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1454
		g_print("    Line %ld column %ld - Forward slash?\n",
ali@70
  1455
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
  1456
	    else
ali@44
  1457
		cnt_odd++;
ali@70
  1458
	    eFSlash=TRUE;
ali@44
  1459
	}
ali@44
  1460
	/*
ali@44
  1461
	 * Report asterisks only in paranoid mode,
ali@44
  1462
	 * since they're often deliberate.
ali@44
  1463
	 */
ali@44
  1464
	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@70
  1465
	  c==CHAR_ASTERISK)
ali@44
  1466
	{
ali@44
  1467
	    if (pswit[ECHO_SWITCH])
ali@70
  1468
		g_print("\n%s\n",aline);
ali@44
  1469
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1470
		g_print("    Line %ld column %ld - Asterisk?\n",
ali@70
  1471
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
  1472
	    else
ali@44
  1473
		cnt_odd++;
ali@70
  1474
	    eAst=TRUE;
ali@44
  1475
	}
ali@44
  1476
    }
ali@44
  1477
}
ali@44
  1478
ali@44
  1479
/*
ali@45
  1480
 * check_for_long_line:
ali@45
  1481
 *
ali@45
  1482
 * Check for line too long.
ali@45
  1483
 */
ali@45
  1484
void check_for_long_line(const char *aline)
ali@45
  1485
{
ali@70
  1486
    if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
ali@45
  1487
    {
ali@45
  1488
	if (pswit[ECHO_SWITCH])
ali@70
  1489
	    g_print("\n%s\n",aline);
ali@45
  1490
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1491
	    g_print("    Line %ld column %ld - Long line %ld\n",
ali@70
  1492
	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
ali@45
  1493
	else
ali@45
  1494
	    cnt_long++;
ali@45
  1495
    }
ali@45
  1496
}
ali@45
  1497
ali@45
  1498
/*
ali@45
  1499
 * check_for_short_line:
ali@45
  1500
 *
ali@45
  1501
 * Check for line too short.
ali@45
  1502
 *
ali@45
  1503
 * This one is a bit trickier to implement: we don't want to
ali@45
  1504
 * flag the last line of a paragraph for being short, so we
ali@45
  1505
 * have to wait until we know that our current line is a
ali@45
  1506
 * "normal" line, then report the _previous_ line if it was too
ali@45
  1507
 * short. We also don't want to report indented lines like
ali@45
  1508
 * chapter heads or formatted quotations. We therefore keep
ali@45
  1509
 * last->len as the length of the last line examined, and
ali@45
  1510
 * last->blen as the length of the last but one, and try to
ali@45
  1511
 * suppress unnecessary warnings by checking that both were of
ali@45
  1512
 * "normal" length. We keep the first character of the last
ali@45
  1513
 * line in last->start, and if it was a space, we assume that
ali@45
  1514
 * the formatting is deliberate. I can't figure out a way to
ali@45
  1515
 * distinguish something like a quoted verse left-aligned or
ali@45
  1516
 * the header or footer of a letter from a paragraph of short
ali@45
  1517
 * lines - maybe if I examined the whole paragraph, and if the
ali@45
  1518
 * para has less than, say, 8 lines and if all lines are short,
ali@45
  1519
 * then just assume it's OK? Need to look at some texts to see
ali@45
  1520
 * how often a formula like this would get the right result.
ali@45
  1521
 */
ali@45
  1522
void check_for_short_line(const char *aline,const struct line_properties *last)
ali@45
  1523
{
ali@70
  1524
    if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
ali@70
  1525
      last->len<SHORTEST_PG_LINE && last->blen>1 &&
ali@70
  1526
      last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45
  1527
    {
ali@45
  1528
	if (pswit[ECHO_SWITCH])
ali@70
  1529
	    g_print("\n%s\n",prevline);
ali@45
  1530
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1531
	    g_print("    Line %ld column %ld - Short line %ld?\n",
ali@70
  1532
	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
ali@45
  1533
	else
ali@45
  1534
	    cnt_short++;
ali@45
  1535
    }
ali@45
  1536
}
ali@45
  1537
ali@45
  1538
/*
ali@46
  1539
 * check_for_starting_punctuation:
ali@46
  1540
 *
ali@46
  1541
 * Look for punctuation other than full ellipses at start of line.
ali@46
  1542
 */
ali@46
  1543
void check_for_starting_punctuation(const char *aline)
ali@46
  1544
{
ali@70
  1545
    if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
ali@70
  1546
      !g_str_has_prefix(aline,". . ."))
ali@46
  1547
    {
ali@46
  1548
	if (pswit[ECHO_SWITCH])
ali@70
  1549
	    g_print("\n%s\n",aline);
ali@46
  1550
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1551
	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
ali@46
  1552
	      linecnt);
ali@46
  1553
	else
ali@46
  1554
	    cnt_punct++;
ali@46
  1555
    }
ali@46
  1556
}
ali@46
  1557
ali@46
  1558
/*
 * str_emdash:
 *
 * Locate the first em-dash in <s>, written either as two hyphens or as
 * the UNICODE em-dash character, whichever occurs earlier. Returns a
 * pointer to it and sets <next> to the character following the dash.
 * Returns NULL, leaving <next> untouched, if there is no em-dash.
 */
char *str_emdash(const char *s,const char **next)
{
    const char *hyphens,*emdash;
    hyphens=strstr(s,"--");
    emdash=strstr(s,"—");
    if (hyphens && (!emdash || hyphens<emdash))
    {
	/* Both hyphens are single-byte characters. */
	*next=hyphens+2;
	return (char *)hyphens;
    }
    if (emdash)
	*next=g_utf8_next_char(emdash);
    return (char *)emdash;
}
ali@147
  1591
ali@147
  1592
/*
ali@47
  1593
 * check_for_spaced_emdash:
ali@47
  1594
 *
ali@47
  1595
 * Check for spaced em-dashes.
ali@47
  1596
 *
ali@147
  1597
 * We must check _all_ occurrences of em-dashes on the line
ali@147
  1598
 * hence the loop - even if the first dash is OK
ali@47
  1599
 * there may be another that's wrong later on.
ali@47
  1600
 */
ali@47
  1601
void check_for_spaced_emdash(const char *aline)
ali@47
  1602
{
ali@70
  1603
    const char *s,*t,*next;
ali@147
  1604
    for (s=aline;t=str_emdash(s,&next);s=next)
ali@47
  1605
    {
ali@70
  1606
	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
ali@70
  1607
	  g_utf8_get_char(next)==CHAR_SPACE)
ali@47
  1608
	{
ali@47
  1609
	    if (pswit[ECHO_SWITCH])
ali@70
  1610
		g_print("\n%s\n",aline);
ali@47
  1611
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1612
		g_print("    Line %ld column %ld - Spaced em-dash?\n",
ali@70
  1613
		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@47
  1614
	    else
ali@47
  1615
		cnt_dash++;
ali@47
  1616
	}
ali@47
  1617
    }
ali@47
  1618
}
ali@47
  1619
ali@47
  1620
/*
ali@47
  1621
 * check_for_spaced_dash:
ali@47
  1622
 *
ali@47
  1623
 * Check for spaced dashes.
ali@47
  1624
 */
ali@47
  1625
void check_for_spaced_dash(const char *aline)
ali@47
  1626
{
ali@47
  1627
    const char *s;
ali@47
  1628
    if ((s=strstr(aline," -")))
ali@47
  1629
    {
ali@70
  1630
	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
ali@47
  1631
	{
ali@47
  1632
	    if (pswit[ECHO_SWITCH])
ali@70
  1633
		g_print("\n%s\n",aline);
ali@47
  1634
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1635
		g_print("    Line %ld column %ld - Spaced dash?\n",
ali@70
  1636
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47
  1637
	    else
ali@47
  1638
		cnt_dash++;
ali@47
  1639
	}
ali@47
  1640
    }
ali@47
  1641
    else if ((s=strstr(aline,"- ")))
ali@47
  1642
    {
ali@70
  1643
	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@47
  1644
	{
ali@47
  1645
	    if (pswit[ECHO_SWITCH])
ali@70
  1646
		g_print("\n%s\n",aline);
ali@47
  1647
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1648
		g_print("    Line %ld column %ld - Spaced dash?\n",
ali@70
  1649
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47
  1650
	    else
ali@47
  1651
		cnt_dash++;
ali@47
  1652
	}
ali@47
  1653
    }
ali@47
  1654
}
ali@47
  1655
ali@47
  1656
/*
ali@48
  1657
 * check_for_unmarked_paragraphs:
ali@48
  1658
 *
ali@48
  1659
 * Check for unmarked paragraphs indicated by separate speakers.
ali@48
  1660
 *
ali@48
  1661
 * May well be false positive:
ali@48
  1662
 * "Bravo!" "Wonderful!" called the crowd.
ali@48
  1663
 * but useful all the same.
ali@48
  1664
 */
ali@48
  1665
void check_for_unmarked_paragraphs(const char *aline)
ali@48
  1666
{
ali@48
  1667
    const char *s;
ali@48
  1668
    s=strstr(aline,"\"  \"");
ali@48
  1669
    if (!s)
ali@48
  1670
	s=strstr(aline,"\" \"");
ali@48
  1671
    if (s)
ali@48
  1672
    {
ali@48
  1673
	if (pswit[ECHO_SWITCH])
ali@70
  1674
	    g_print("\n%s\n",aline);
ali@48
  1675
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1676
	    g_print("    Line %ld column %ld - "
ali@70
  1677
	      "Query missing paragraph break?\n",
ali@70
  1678
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@48
  1679
	else
ali@48
  1680
	    cnt_punct++;
ali@48
  1681
    }
ali@48
  1682
}
ali@48
  1683
ali@48
  1684
/*
ali@49
  1685
 * check_for_jeebies:
ali@49
  1686
 *
ali@49
  1687
 * Check for "to he" and other easy h/b errors.
ali@49
  1688
 *
ali@49
  1689
 * This is a very inadequate effort on the h/b problem,
ali@49
  1690
 * but the phrase "to he" is always an error, whereas "to
ali@49
  1691
 * be" is quite common.
ali@49
  1692
 * Similarly, '"Quiet!", be said.' is a non-be error
ali@49
  1693
 * "to he" is _not_ always an error!:
ali@49
  1694
 *       "Where they went to he couldn't say."
ali@49
  1695
 * Another false positive:
ali@49
  1696
 *       What would "Cinderella" be without the . . .
ali@49
  1697
 * and another: "If he wants to he can see for himself."
ali@49
  1698
 */
ali@49
  1699
void check_for_jeebies(const char *aline)
ali@49
  1700
{
ali@49
  1701
    const char *s;
ali@49
  1702
    s=strstr(aline," be could ");
ali@49
  1703
    if (!s)
ali@49
  1704
	s=strstr(aline," be would ");
ali@49
  1705
    if (!s)
ali@49
  1706
	s=strstr(aline," was be ");
ali@49
  1707
    if (!s)
ali@49
  1708
	s=strstr(aline," be is ");
ali@49
  1709
    if (!s)
ali@49
  1710
	s=strstr(aline," is be ");
ali@49
  1711
    if (!s)
ali@49
  1712
	s=strstr(aline,"\", be ");
ali@49
  1713
    if (!s)
ali@49
  1714
	s=strstr(aline,"\" be ");
ali@49
  1715
    if (!s)
ali@49
  1716
	s=strstr(aline,"\" be ");
ali@49
  1717
    if (!s)
ali@49
  1718
	s=strstr(aline," to he ");
ali@49
  1719
    if (s)
ali@49
  1720
    {
ali@49
  1721
	if (pswit[ECHO_SWITCH])
ali@70
  1722
	    g_print("\n%s\n",aline);
ali@49
  1723
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1724
	    g_print("    Line %ld column %ld - Query he/be error?\n",
ali@70
  1725
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49
  1726
	else
ali@49
  1727
	    cnt_word++;
ali@49
  1728
    }
ali@49
  1729
    s=strstr(aline," the had ");
ali@49
  1730
    if (!s)
ali@49
  1731
	s=strstr(aline," a had ");
ali@49
  1732
    if (!s)
ali@49
  1733
	s=strstr(aline," they bad ");
ali@49
  1734
    if (!s)
ali@49
  1735
	s=strstr(aline," she bad ");
ali@49
  1736
    if (!s)
ali@49
  1737
	s=strstr(aline," he bad ");
ali@49
  1738
    if (!s)
ali@49
  1739
	s=strstr(aline," you bad ");
ali@49
  1740
    if (!s)
ali@49
  1741
	s=strstr(aline," i bad ");
ali@49
  1742
    if (s)
ali@49
  1743
    {
ali@49
  1744
	if (pswit[ECHO_SWITCH])
ali@70
  1745
	    g_print("\n%s\n",aline);
ali@49
  1746
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1747
	    g_print("    Line %ld column %ld - Query had/bad error?\n",
ali@70
  1748
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49
  1749
	else
ali@49
  1750
	    cnt_word++;
ali@49
  1751
    }
ali@49
  1752
    s=strstr(aline,"; hut ");
ali@49
  1753
    if (!s)
ali@49
  1754
	s=strstr(aline,", hut ");
ali@49
  1755
    if (s)
ali@49
  1756
    {
ali@49
  1757
	if (pswit[ECHO_SWITCH])
ali@70
  1758
	    g_print("\n%s\n",aline);
ali@49
  1759
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1760
	    g_print("    Line %ld column %ld - Query hut/but error?\n",
ali@70
  1761
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49
  1762
	else
ali@49
  1763
	    cnt_word++;
ali@49
  1764
    }
ali@49
  1765
}
ali@49
  1766
ali@49
  1767
/*
ali@50
  1768
 * check_for_mta_from:
ali@50
  1769
 *
ali@50
  1770
 * Special case - angled bracket in front of "From" placed there by an
ali@50
  1771
 * MTA when sending an e-mail.
ali@50
  1772
 */
ali@50
  1773
void check_for_mta_from(const char *aline)
ali@50
  1774
{
ali@50
  1775
    const char *s;
ali@50
  1776
    s=strstr(aline,">From");
ali@50
  1777
    if (s)
ali@50
  1778
    {
ali@50
  1779
	if (pswit[ECHO_SWITCH])
ali@70
  1780
	    g_print("\n%s\n",aline);
ali@50
  1781
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1782
	    g_print("    Line %ld column %ld - "
ali@70
  1783
	      "Query angled bracket with From\n",
ali@70
  1784
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@50
  1785
	else
ali@50
  1786
	    cnt_punct++;
ali@50
  1787
    }
ali@50
  1788
}
ali@50
  1789
ali@50
  1790
/*
ali@51
  1791
 * check_for_orphan_character:
ali@51
  1792
 *
ali@51
  1793
 * Check for a single character line -
ali@51
  1794
 * often an overflow from bad wrapping.
ali@51
  1795
 */
ali@51
  1796
void check_for_orphan_character(const char *aline)
ali@51
  1797
{
ali@70
  1798
    gunichar c;
ali@70
  1799
    c=g_utf8_get_char(aline);
ali@70
  1800
    if (c && !*g_utf8_next_char(aline))
ali@51
  1801
    {
ali@70
  1802
	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
ali@51
  1803
	    ; /* Nothing - ignore numerals alone on a line. */
ali@51
  1804
	else
ali@51
  1805
	{
ali@51
  1806
	    if (pswit[ECHO_SWITCH])
ali@70
  1807
		g_print("\n%s\n",aline);
ali@51
  1808
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1809
		g_print("    Line %ld column 1 - Query single character line\n",
ali@51
  1810
		  linecnt);
ali@51
  1811
	    else
ali@51
  1812
		cnt_punct++;
ali@51
  1813
	}
ali@51
  1814
    }
ali@51
  1815
}
ali@51
  1816
ali@51
  1817
/*
ali@52
  1818
 * check_for_pling_scanno:
ali@52
  1819
 *
ali@52
  1820
 * Check for I" - often should be !
ali@52
  1821
 */
ali@52
  1822
void check_for_pling_scanno(const char *aline)
ali@52
  1823
{
ali@52
  1824
    const char *s;
ali@52
  1825
    s=strstr(aline," I\"");
ali@52
  1826
    if (s)
ali@52
  1827
    {
ali@52
  1828
	if (pswit[ECHO_SWITCH])
ali@70
  1829
	    g_print("\n%s\n",aline);
ali@52
  1830
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1831
	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
ali@70
  1832
	      linecnt,g_utf8_pointer_to_offset(aline,s));
ali@52
  1833
	else
ali@52
  1834
	    cnt_punct++;
ali@52
  1835
    }
ali@52
  1836
}
ali@52
  1837
ali@52
  1838
/*
ali@53
  1839
 * check_for_extra_period:
ali@53
  1840
 *
ali@53
  1841
 * Check for period without a capital letter. Cut-down from gutspell.
ali@53
  1842
 * Only works when it happens on a single line.
ali@53
  1843
 */
ali@53
  1844
void check_for_extra_period(const char *aline,const struct warnings *warnings)
ali@53
  1845
{
ali@99
  1846
    const char *s,*t,*s1,*sprev;
ali@69
  1847
    int i;
ali@70
  1848
    gsize len;
ali@69
  1849
    gboolean istypo;
ali@69
  1850
    gchar *testword;
ali@99
  1851
    gunichar c,nc,pc,*decomposition;
ali@53
  1852
    if (pswit[PARANOID_SWITCH])
ali@53
  1853
    {
ali@70
  1854
	for (t=aline;t=strstr(t,". ");)
ali@53
  1855
	{
ali@69
  1856
	    if (t==aline)
ali@53
  1857
	    {
ali@70
  1858
		t=g_utf8_next_char(t);
ali@53
  1859
		/* start of line punctuation is handled elsewhere */
ali@53
  1860
		continue;
ali@53
  1861
	    }
ali@70
  1862
	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
ali@53
  1863
	    {
ali@70
  1864
		t=g_utf8_next_char(t);
ali@53
  1865
		continue;
ali@53
  1866
	    }
ali@53
  1867
	    if (warnings->isDutch)
ali@53
  1868
	    {
ali@53
  1869
		/* For Frank & Jeroen -- 's Middags case */
ali@70
  1870
		gunichar c2,c3,c4,c5;
ali@70
  1871
		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
ali@70
  1872
		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
ali@70
  1873
		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
ali@70
  1874
		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
ali@99
  1875
		if (CHAR_IS_APOSTROPHE(c2) &&
ali@99
  1876
		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
ali@99
  1877
		  g_unichar_isupper(c5))
ali@53
  1878
		{
ali@70
  1879
		    t=g_utf8_next_char(t);
ali@53
  1880
		    continue;
ali@53
  1881
		}
ali@53
  1882
	    }
ali@70
  1883
	    s1=g_utf8_next_char(g_utf8_next_char(t));
ali@70
  1884
	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
ali@173
  1885
	      !g_unichar_isdigit(g_utf8_get_char(s1)))
ali@70
  1886
		s1=g_utf8_next_char(s1);
ali@70
  1887
	    if (g_unichar_islower(g_utf8_get_char(s1)))
ali@53
  1888
	    {
ali@53
  1889
		/* we have something to investigate */
ali@69
  1890
		istypo=TRUE;
ali@53
  1891
		/* so let's go back and find out */
ali@99
  1892
		nc=g_utf8_get_char(t);
ali@99
  1893
		s1=g_utf8_prev_char(t);
ali@99
  1894
		c=g_utf8_get_char(s1);
ali@99
  1895
		sprev=g_utf8_prev_char(s1);
ali@99
  1896
		pc=g_utf8_get_char(sprev);
ali@99
  1897
		while (s1>=aline &&
ali@99
  1898
		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
ali@99
  1899
		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
ali@99
  1900
		  g_unichar_isalpha(nc)))
ali@99
  1901
		{
ali@99
  1902
		    nc=c;
ali@99
  1903
		    s1=sprev;
ali@99
  1904
		    c=pc;
ali@99
  1905
		    sprev=g_utf8_prev_char(s1);
ali@99
  1906
		    pc=g_utf8_get_char(sprev);
ali@99
  1907
		}
ali@70
  1908
		s1=g_utf8_next_char(s1);
ali@69
  1909
		s=strchr(s1,'.');
ali@69
  1910
		if (s)
ali@69
  1911
		    testword=g_strndup(s1,s-s1);
ali@69
  1912
		else
ali@69
  1913
		    testword=g_strdup(s1);
ali@53
  1914
		for (i=0;*abbrev[i];i++)
ali@53
  1915
		    if (!strcmp(testword,abbrev[i]))
ali@69
  1916
			istypo=FALSE;
ali@70
  1917
		if (g_unichar_isdigit(g_utf8_get_char(testword)))
ali@69
  1918
		    istypo=FALSE;
ali@70
  1919
		if (!*g_utf8_next_char(testword))
ali@69
  1920
		    istypo=FALSE;
ali@53
  1921
		if (isroman(testword))
ali@69
  1922
		    istypo=FALSE;
ali@53
  1923
		if (istypo)
ali@53
  1924
		{
ali@69
  1925
		    istypo=FALSE;
ali@70
  1926
		    for (s=testword;*s;s=g_utf8_next_char(s))
ali@70
  1927
		    {
ali@70
  1928
			decomposition=g_unicode_canonical_decomposition(
ali@70
  1929
			  g_utf8_get_char(s),&len);
ali@70
  1930
			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@69
  1931
			    istypo=TRUE;
ali@70
  1932
			g_free(decomposition);
ali@70
  1933
		    }
ali@53
  1934
		}
ali@69
  1935
		if (istypo &&
ali@69
  1936
		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
ali@53
  1937
		{
ali@69
  1938
		    g_tree_insert(qperiod,g_strdup(testword),
ali@69
  1939
		      GINT_TO_POINTER(1));
ali@69
  1940
		    if (pswit[ECHO_SWITCH])
ali@70
  1941
			g_print("\n%s\n",aline);
ali@69
  1942
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1943
			g_print("    Line %ld column %ld - Extra period?\n",
ali@70
  1944
			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@69
  1945
		    else
ali@69
  1946
			cnt_punct++;
ali@53
  1947
		}
ali@69
  1948
		g_free(testword);
ali@53
  1949
	    }
ali@70
  1950
	    t=g_utf8_next_char(t);
ali@53
  1951
	}
ali@53
  1952
    }
ali@53
  1953
}
ali@53
  1954
ali@53
  1955
/*
ali@54
  1956
 * check_for_following_punctuation:
ali@54
  1957
 *
ali@54
  1958
 * Check for words usually not followed by punctuation.
ali@54
  1959
 */
ali@54
  1960
void check_for_following_punctuation(const char *aline)
ali@54
  1961
{
ali@54
  1962
    int i;
ali@54
  1963
    const char *s,*wordstart;
ali@70
  1964
    gunichar c;
ali@69
  1965
    gchar *inword,*t;
ali@54
  1966
    if (pswit[TYPO_SWITCH])
ali@54
  1967
    {
ali@54
  1968
	for (s=aline;*s;)
ali@54
  1969
	{
ali@54
  1970
	    wordstart=s;
ali@69
  1971
	    t=getaword(&s);
ali@69
  1972
	    if (!*t)
ali@69
  1973
	    {
ali@69
  1974
		g_free(t);
ali@54
  1975
		continue;
ali@69
  1976
	    }
ali@70
  1977
	    inword=g_utf8_strdown(t,-1);
ali@69
  1978
	    g_free(t);
ali@54
  1979
	    for (i=0;*nocomma[i];i++)
ali@54
  1980
		if (!strcmp(inword,nocomma[i]))
ali@54
  1981
		{
ali@70
  1982
		    c=g_utf8_get_char(s);
ali@70
  1983
		    if (c==',' || c==';' || c==':')
ali@54
  1984
		    {
ali@54
  1985
			if (pswit[ECHO_SWITCH])
ali@70
  1986
			    g_print("\n%s\n",aline);
ali@54
  1987
			if (!pswit[OVERVIEW_SWITCH])
ali@70
  1988
			    g_print("    Line %ld column %ld - "
ali@54
  1989
			      "Query punctuation after %s?\n",
ali@70
  1990
			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70
  1991
			      inword);
ali@54
  1992
			else
ali@54
  1993
			    cnt_punct++;
ali@54
  1994
		    }
ali@54
  1995
		}
ali@54
  1996
	    for (i=0;*noperiod[i];i++)
ali@54
  1997
		if (!strcmp(inword,noperiod[i]))
ali@54
  1998
		{
ali@70
  1999
		    c=g_utf8_get_char(s);
ali@70
  2000
		    if (c=='.' || c=='!')
ali@54
  2001
		    {
ali@54
  2002
			if (pswit[ECHO_SWITCH])
ali@70
  2003
			    g_print("\n%s\n",aline);
ali@54
  2004
			if (!pswit[OVERVIEW_SWITCH])
ali@70
  2005
			    g_print("    Line %ld column %ld - "
ali@54
  2006
			      "Query punctuation after %s?\n",
ali@70
  2007
			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70
  2008
			      inword);
ali@54
  2009
			else
ali@54
  2010
			    cnt_punct++;
ali@54
  2011
		    }
ali@54
  2012
		}
ali@69
  2013
	    g_free(inword);
ali@54
  2014
	}
ali@54
  2015
    }
ali@54
  2016
}
ali@54
  2017
ali@54
  2018
/*
ali@55
  2019
 * check_for_typos:
ali@55
  2020
 *
ali@55
  2021
 * Check for commonly mistyped words,
ali@55
  2022
 * and digits like 0 for O in a word.
ali@55
  2023
 */
ali@55
  2024
void check_for_typos(const char *aline,struct warnings *warnings)
ali@55
  2025
{
ali@70
  2026
    const char *s,*t,*nt,*wordstart;
ali@70
  2027
    gchar *inword;
ali@70
  2028
    gunichar *decomposition;
ali@70
  2029
    gchar *testword;
ali@70
  2030
    int i,vowel,consonant,*dupcnt;
ali@70
  2031
    gboolean isdup,istypo,alower;
ali@99
  2032
    gunichar c,pc;
ali@70
  2033
    long offset,len;
ali@70
  2034
    gsize decomposition_len;
ali@55
  2035
    for (s=aline;*s;)
ali@55
  2036
    {
ali@55
  2037
	wordstart=s;
ali@69
  2038
	inword=getaword(&s);
ali@55
  2039
	if (!*inword)
ali@69
  2040
	{
ali@69
  2041
	    g_free(inword);
ali@55
  2042
	    continue; /* don't bother with empty lines */
ali@69
  2043
	}
ali@55
  2044
	if (mixdigit(inword))
ali@55
  2045
	{
ali@55
  2046
	    if (pswit[ECHO_SWITCH])
ali@70
  2047
		g_print("\n%s\n",aline);
ali@55
  2048
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2049
		g_print("    Line %ld column %ld - Query digit in %s\n",
ali@70
  2050
		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
ali@55
  2051
	    else
ali@55
  2052
		cnt_word++;
ali@55
  2053
	}
ali@55
  2054
	/*
ali@55
  2055
	 * Put the word through a series of tests for likely typos and OCR
ali@55
  2056
	 * errors.
ali@55
  2057
	 */
ali@69
  2058
	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
ali@55
  2059
	{
ali@69
  2060
	    istypo=FALSE;
ali@70
  2061
	    alower=FALSE;
ali@70
  2062
	    for (t=inword;*t;t=g_utf8_next_char(t))
ali@55
  2063
	    {
ali@70
  2064
		c=g_utf8_get_char(t);
ali@70
  2065
		nt=g_utf8_next_char(t);
ali@55
  2066
		/* lowercase for testing */
ali@70
  2067
		if (g_unichar_islower(c))
ali@70
  2068
		    alower=TRUE;
ali@70
  2069
		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
ali@55
  2070
		{
ali@55
  2071
		    /*
ali@55
  2072
		     * We have an uppercase mid-word. However, there are
ali@55
  2073
		     * common cases:
ali@55
  2074
		     *   Mac and Mc like McGill
ali@55
  2075
		     *   French contractions like l'Abbe
ali@55
  2076
		     */
ali@70
  2077
		    offset=g_utf8_pointer_to_offset(inword,t);
ali@99
  2078
		    if (offset>0)
ali@99
  2079
			pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@99
  2080
		    else
ali@99
  2081
			pc='\0';
ali@70
  2082
		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
ali@70
  2083
		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
ali@70
  2084
		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
ali@99
  2085
		      CHAR_IS_APOSTROPHE(pc))
ali@55
  2086
			; /* do nothing! */
ali@55
  2087
		    else
ali@69
  2088
			istypo=TRUE;
ali@55
  2089
		}
ali@55
  2090
	    }
ali@70
  2091
	    testword=g_utf8_casefold(inword,-1);
ali@69
  2092
	}
ali@69
  2093
	if (pswit[TYPO_SWITCH])
ali@69
  2094
	{
ali@55
  2095
	    /*
ali@55
  2096
	     * Check for certain unlikely two-letter combinations at word
ali@55
  2097
	     * start and end.
ali@55
  2098
	     */
ali@70
  2099
	    len=g_utf8_strlen(testword,-1);
ali@70
  2100
	    if (len>1)
ali@55
  2101
	    {
ali@55
  2102
		for (i=0;*nostart[i];i++)
ali@70
  2103
		    if (g_str_has_prefix(testword,nostart[i]))
ali@69
  2104
			istypo=TRUE;
ali@55
  2105
		for (i=0;*noend[i];i++)
ali@70
  2106
		    if (g_str_has_suffix(testword,noend[i]))
ali@69
  2107
			istypo=TRUE;
ali@55
  2108
	    }
ali@55
  2109
	    /* ght is common, gbt never. Like that. */
ali@55
  2110
	    if (strstr(testword,"cb"))
ali@69
  2111
		istypo=TRUE;
ali@55
  2112
	    if (strstr(testword,"gbt"))
ali@69
  2113
		istypo=TRUE;
ali@55
  2114
	    if (strstr(testword,"pbt"))
ali@69
  2115
		istypo=TRUE;
ali@55
  2116
	    if (strstr(testword,"tbs"))
ali@69
  2117
		istypo=TRUE;
ali@55
  2118
	    if (strstr(testword,"mrn"))
ali@69
  2119
		istypo=TRUE;
ali@55
  2120
	    if (strstr(testword,"ahle"))
ali@69
  2121
		istypo=TRUE;
ali@55
  2122
	    if (strstr(testword,"ihle"))
ali@69
  2123
		istypo=TRUE;
ali@55
  2124
	    /*
ali@55
  2125
	     * "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55
  2126
	     * Also "TBI" - frostbite, outbid - but uncommon.
ali@55
  2127
	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55
  2128
	     * numerals, but "ii" is a common scanno.
ali@55
  2129
	     */
ali@55
  2130
	    if (strstr(testword,"tbi"))
ali@69
  2131
		istypo=TRUE;
ali@55
  2132
	    if (strstr(testword,"tbe"))
ali@69
  2133
		istypo=TRUE;
ali@55
  2134
	    if (strstr(testword,"ii"))
ali@69
  2135
		istypo=TRUE;
ali@55
  2136
	    /*
ali@55
  2137
	     * Check for no vowels or no consonants.
ali@55
  2138
	     * If none, flag a typo.
ali@55
  2139
	     */
ali@70
  2140
	    if (!istypo && len>1)
ali@55
  2141
	    {
ali@55
  2142
		vowel=consonant=0;
ali@70
  2143
		for (t=testword;*t;t=g_utf8_next_char(t))
ali@55
  2144
		{
ali@70
  2145
		    c=g_utf8_get_char(t);
ali@70
  2146
		    decomposition=
ali@70
  2147
		      g_unicode_canonical_decomposition(c,&decomposition_len);
ali@70
  2148
		    if (c=='y' || g_unichar_isdigit(c))
ali@55
  2149
		    {
ali@55
  2150
			/* Yah, this is loose. */
ali@55
  2151
			vowel++;
ali@55
  2152
			consonant++;
ali@55
  2153
		    }
ali@70
  2154
		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@55
  2155
			vowel++;
ali@55
  2156
		    else
ali@55
  2157
			consonant++;
ali@70
  2158
		    g_free(decomposition);
ali@55
  2159
		}
ali@55
  2160
		if (!vowel || !consonant)
ali@69
  2161
		    istypo=TRUE;
ali@55
  2162
	    }
ali@55
  2163
	    /*
ali@55
  2164
	     * Now exclude the word from being reported if it's in
ali@55
  2165
	     * the okword list.
ali@55
  2166
	     */
ali@55
  2167
	    for (i=0;*okword[i];i++)
ali@55
  2168
		if (!strcmp(testword,okword[i]))
ali@69
  2169
		    istypo=FALSE;
ali@55
  2170
	    /*
ali@55
  2171
	     * What looks like a typo may be a Roman numeral.
ali@55
  2172
	     * Exclude these.
ali@55
  2173
	     */
ali@55
  2174
	    if (istypo && isroman(testword))
ali@69
  2175
		istypo=FALSE;
ali@55
  2176
	    /* Check the manual list of typos. */
ali@55
  2177
	    if (!istypo)
ali@55
  2178
		for (i=0;*typo[i];i++)
ali@55
  2179
		    if (!strcmp(testword,typo[i]))
ali@69
  2180
			istypo=TRUE;
ali@55
  2181
	    /*
ali@55
  2182
	     * Check lowercase s, l, i and m - special cases.
ali@55
  2183
	     *   "j" - often a semi-colon gone wrong.
ali@55
  2184
	     *   "d" for a missing apostrophe - he d
ali@55
  2185
	     *   "n" for "in"
ali@55
  2186
	     */
ali@70
  2187
	    if (!istypo && len==1 &&
ali@70
  2188
	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
ali@69
  2189
		istypo=TRUE;
ali@55
  2190
	    if (istypo)
ali@55
  2191
	    {
ali@69
  2192
		dupcnt=g_tree_lookup(qword,testword);
ali@69
  2193
		if (dupcnt)
ali@69
  2194
		{
ali@69
  2195
		    (*dupcnt)++;
ali@69
  2196
		    isdup=!pswit[VERBOSE_SWITCH];
ali@69
  2197
		}
ali@69
  2198
		else
ali@69
  2199
		{
ali@69
  2200
		    dupcnt=g_new0(int,1);
ali@69
  2201
		    g_tree_insert(qword,g_strdup(testword),dupcnt);
ali@69
  2202
		    isdup=FALSE;
ali@69
  2203
		}
ali@55
  2204
		if (!isdup)
ali@55
  2205
		{
ali@55
  2206
		    if (pswit[ECHO_SWITCH])
ali@70
  2207
			g_print("\n%s\n",aline);
ali@55
  2208
		    if (!pswit[OVERVIEW_SWITCH])
ali@55
  2209
		    {
ali@70
  2210
			g_print("    Line %ld column %ld - Query word %s",
ali@70
  2211
			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
ali@70
  2212
			  inword);
ali@69
  2213
			if (!pswit[VERBOSE_SWITCH])
ali@70
  2214
			    g_print(" - not reporting duplicates");
ali@70
  2215
			g_print("\n");
ali@55
  2216
		    }
ali@55
  2217
		    else
ali@55
  2218
			cnt_word++;
ali@55
  2219
		}
ali@55
  2220
	    }
ali@55
  2221
	}
ali@55
  2222
	/* check the user's list of typos */
ali@69
  2223
	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
ali@69
  2224
	{
ali@69
  2225
	    if (pswit[ECHO_SWITCH])
ali@70
  2226
		g_print("\n%s\n",aline);
ali@69
  2227
	    if (!pswit[OVERVIEW_SWITCH])  
ali@70
  2228
		g_print("    Line %ld column %ld - Query possible scanno %s\n",
ali@70
  2229
		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
ali@69
  2230
	}
ali@69
  2231
	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
ali@69
  2232
	    g_free(testword);
ali@55
  2233
	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55
  2234
	{
ali@55
  2235
	    /* In paranoid mode, query all 0 and 1 standing alone. */
ali@55
  2236
	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
ali@55
  2237
	    {
ali@55
  2238
		if (pswit[ECHO_SWITCH])
ali@70
  2239
		    g_print("\n%s\n",aline);
ali@55
  2240
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2241
		    g_print("    Line %ld column %ld - Query standalone %s\n",
ali@70
  2242
		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
ali@70
  2243
		      inword);
ali@55
  2244
		else
ali@55
  2245
		    cnt_word++;
ali@55
  2246
	    }
ali@55
  2247
	}
ali@69
  2248
	g_free(inword);
ali@55
  2249
    }
ali@55
  2250
}
ali@55
  2251
ali@56
  2252
/*
ali@56
  2253
 * check_for_misspaced_punctuation:
ali@56
  2254
 *
ali@56
  2255
 * Look for added or missing spaces around punctuation and quotes.
ali@56
  2256
 * If there is a punctuation character like ! with no space on
ali@56
  2257
 * either side, suspect a missing!space. If there are spaces on
ali@56
  2258
 * both sides , assume a typo. If we see a double quote with no
ali@56
  2259
 * space or punctuation on either side of it, assume unspaced
ali@56
  2260
 * quotes "like"this.
ali@56
  2261
 */
ali@56
  2262
void check_for_misspaced_punctuation(const char *aline,
ali@69
  2263
  struct parities *parities,gboolean isemptyline)
ali@56
  2264
{
ali@69
  2265
    gboolean isacro,isellipsis;
ali@56
  2266
    const char *s;
ali@70
  2267
    gunichar c,nc,pc,n2c;
ali@142
  2268
    int parity;
ali@70
  2269
    c=g_utf8_get_char(aline);
ali@70
  2270
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2271
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56
  2272
    {
ali@70
  2273
	pc=c;
ali@70
  2274
	c=nc;
ali@70
  2275
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56
  2276
	/* For each character in the line after the first. */
ali@70
  2277
	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
ali@56
  2278
	{
ali@56
  2279
	    /* we need to suppress warnings for acronyms like M.D. */
ali@69
  2280
	    isacro=FALSE;
ali@56
  2281
	    /* we need to suppress warnings for ellipsis . . . */
ali@69
  2282
	    isellipsis=FALSE;
ali@70
  2283
	    /*
ali@70
  2284
	     * If there are letters on both sides of it or
ali@70
  2285
	     * if it's strict punctuation followed by an alpha.
ali@70
  2286
	     */
ali@70
  2287
	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
ali@70
  2288
	      g_utf8_strchr("?!,;:",-1,c)))
ali@56
  2289
	    {
ali@70
  2290
		if (c=='.')
ali@56
  2291
		{
ali@70
  2292
		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70
  2293
		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69
  2294
			isacro=TRUE;
ali@70
  2295
		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70
  2296
		    if (nc && n2c=='.')
ali@69
  2297
			isacro=TRUE;
ali@56
  2298
		}
ali@56
  2299
		if (!isacro)
ali@56
  2300
		{
ali@56
  2301
		    if (pswit[ECHO_SWITCH])
ali@70
  2302
			g_print("\n%s\n",aline);
ali@56
  2303
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2304
			g_print("    Line %ld column %ld - Missing space?\n",
ali@70
  2305
			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2306
		    else
ali@56
  2307
			cnt_punct++;
ali@56
  2308
		}
ali@56
  2309
	    }
ali@70
  2310
	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
ali@56
  2311
	    {
ali@56
  2312
		/*
ali@56
  2313
		 * If there are spaces on both sides,
ali@56
  2314
		 * or space before and end of line.
ali@56
  2315
		 */
ali@70
  2316
		if (c=='.')
ali@56
  2317
		{
ali@70
  2318
		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70
  2319
		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69
  2320
			isellipsis=TRUE;
ali@70
  2321
		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70
  2322
		    if (nc && n2c=='.')
ali@69
  2323
			isellipsis=TRUE;
ali@56
  2324
		}
ali@56
  2325
		if (!isemptyline && !isellipsis)
ali@56
  2326
		{
ali@56
  2327
		    if (pswit[ECHO_SWITCH])
ali@70
  2328
			g_print("\n%s\n",aline);
ali@56
  2329
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2330
			g_print("    Line %ld column %ld - "
ali@70
  2331
			  "Spaced punctuation?\n",linecnt,
ali@70
  2332
			  g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2333
		    else
ali@56
  2334
			cnt_punct++;
ali@56
  2335
		}
ali@56
  2336
	    }
ali@56
  2337
	}
ali@56
  2338
    }
ali@56
  2339
    /* Split out the characters that CANNOT be preceded by space. */
ali@70
  2340
    c=g_utf8_get_char(aline);
ali@70
  2341
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2342
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56
  2343
    {
ali@70
  2344
	pc=c;
ali@70
  2345
	c=nc;
ali@70
  2346
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56
  2347
	/* for each character in the line after the first */
ali@70
  2348
	if (g_utf8_strchr("?!,;:",-1,c))
ali@56
  2349
	{
ali@56
  2350
	    /* if it's punctuation that _cannot_ have a space before it */
ali@70
  2351
	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
ali@56
  2352
	    {
ali@56
  2353
		/*
ali@70
  2354
		 * If nc DOES == space,
ali@56
  2355
		 * it was already reported just above.
ali@56
  2356
		 */
ali@56
  2357
		if (pswit[ECHO_SWITCH])
ali@70
  2358
		    g_print("\n%s\n",aline);
ali@56
  2359
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2360
		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
ali@70
  2361
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2362
		else
ali@56
  2363
		    cnt_punct++;
ali@56
  2364
	    }
ali@56
  2365
	}
ali@56
  2366
    }
ali@56
  2367
    /*
ali@56
  2368
     * Special case " .X" where X is any alpha.
ali@56
  2369
     * This plugs a hole in the acronym code above.
ali@56
  2370
     * Inelegant, but maintainable.
ali@56
  2371
     */
ali@70
  2372
    c=g_utf8_get_char(aline);
ali@70
  2373
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2374
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56
  2375
    {
ali@70
  2376
	pc=c;
ali@70
  2377
	c=nc;
ali@70
  2378
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56
  2379
	/* for each character in the line after the first */
ali@70
  2380
	if (c=='.')
ali@56
  2381
	{
ali@56
  2382
	    /* if it's a period */
ali@70
  2383
	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
ali@56
  2384
	    {
ali@56
  2385
		/*
ali@56
  2386
		 * If the period follows a space and
ali@56
  2387
		 * is followed by a letter.
ali@56
  2388
		 */
ali@56
  2389
		if (pswit[ECHO_SWITCH])
ali@70
  2390
		    g_print("\n%s\n",aline);
ali@56
  2391
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2392
		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
ali@70
  2393
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2394
		else
ali@56
  2395
		    cnt_punct++;
ali@56
  2396
	    }
ali@56
  2397
	}
ali@56
  2398
    }
ali@70
  2399
    c=g_utf8_get_char(aline);
ali@70
  2400
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2401
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56
  2402
    {
ali@70
  2403
	pc=c;
ali@70
  2404
	c=nc;
ali@70
  2405
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56
  2406
	/* for each character in the line after the first */
ali@142
  2407
	if (CHAR_IS_DQUOTE(c))
ali@56
  2408
	{
ali@70
  2409
	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
ali@70
  2410
	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
ali@70
  2411
	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
ali@56
  2412
	    {
ali@56
  2413
		if (pswit[ECHO_SWITCH])
ali@70
  2414
		    g_print("\n%s\n",aline);
ali@56
  2415
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2416
		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
ali@70
  2417
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2418
		else
ali@56
  2419
		    cnt_punct++;
ali@56
  2420
	    }
ali@56
  2421
	}
ali@56
  2422
    }
ali@56
  2423
    /* Check parity of quotes. */
ali@70
  2424
    nc=g_utf8_get_char(aline);
ali@70
  2425
    for (s=aline;*s;s=g_utf8_next_char(s))
ali@56
  2426
    {
ali@70
  2427
	c=nc;
ali@70
  2428
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@142
  2429
	if (CHAR_IS_DQUOTE(c))
ali@56
  2430
	{
ali@142
  2431
	    if (c==CHAR_DQUOTE)
ali@142
  2432
	    {
ali@142
  2433
		parities->dquote=!parities->dquote;
ali@142
  2434
		parity=parities->dquote;
ali@142
  2435
	    }
ali@142
  2436
	    else if (c==CHAR_LD_QUOTE)
ali@142
  2437
		parity=1;
ali@142
  2438
	    else
ali@142
  2439
		parity=0;
ali@142
  2440
	    if (!parity)
ali@56
  2441
	    {
ali@56
  2442
		/* parity even */
ali@173
  2443
		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
ali@56
  2444
		{
ali@56
  2445
		    if (pswit[ECHO_SWITCH])
ali@70
  2446
			g_print("\n%s\n",aline);
ali@56
  2447
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2448
			g_print("    Line %ld column %ld - "
ali@70
  2449
			  "Wrongspaced quotes?\n",
ali@70
  2450
			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2451
		    else
ali@56
  2452
			cnt_punct++;
ali@56
  2453
		}
ali@56
  2454
	    }
ali@56
  2455
	    else
ali@56
  2456
	    {
ali@56
  2457
		/* parity odd */
ali@173
  2458
		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
ali@173
  2459
		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
ali@56
  2460
		{
ali@56
  2461
		    if (pswit[ECHO_SWITCH])
ali@70
  2462
			g_print("\n%s\n",aline);
ali@56
  2463
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2464
			g_print("    Line %ld column %ld - "
ali@70
  2465
			  "Wrongspaced quotes?\n",
ali@70
  2466
			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2467
		    else
ali@56
  2468
			cnt_punct++;
ali@56
  2469
		}
ali@56
  2470
	    }
ali@56
  2471
	}
ali@56
  2472
    }
ali@142
  2473
    c=g_utf8_get_char(aline);
ali@142
  2474
    if (CHAR_IS_DQUOTE(c))
ali@56
  2475
    {
ali@70
  2476
	if (g_utf8_strchr(",;:!?)]} ",-1,
ali@70
  2477
	  g_utf8_get_char(g_utf8_next_char(aline))))
ali@56
  2478
	{
ali@56
  2479
	    if (pswit[ECHO_SWITCH])
ali@70
  2480
		g_print("\n%s\n",aline);
ali@56
  2481
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2482
		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
ali@56
  2483
		  linecnt);
ali@56
  2484
	    else
ali@56
  2485
		cnt_punct++;
ali@56
  2486
	}
ali@56
  2487
    }
ali@56
  2488
    if (pswit[SQUOTE_SWITCH])
ali@56
  2489
    {
ali@70
  2490
	nc=g_utf8_get_char(aline);
ali@70
  2491
	for (s=aline;*s;s=g_utf8_next_char(s))
ali@56
  2492
	{
ali@70
  2493
	    c=nc;
ali@70
  2494
	    nc=g_utf8_get_char(g_utf8_next_char(s));
ali@99
  2495
	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
ali@70
  2496
	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
ali@70
  2497
	      !g_unichar_isalpha(nc)))
ali@56
  2498
	    {
ali@56
  2499
		parities->squote=!parities->squote;
ali@56
  2500
		if (!parities->squote)
ali@56
  2501
		{
ali@56
  2502
		    /* parity even */
ali@70
  2503
		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
ali@56
  2504
		    {
ali@56
  2505
			if (pswit[ECHO_SWITCH])
ali@70
  2506
			    g_print("\n%s\n",aline);
ali@56
  2507
			if (!pswit[OVERVIEW_SWITCH])
ali@70
  2508
			    g_print("    Line %ld column %ld - "
ali@56
  2509
			      "Wrongspaced singlequotes?\n",
ali@70
  2510
			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2511
			else
ali@56
  2512
			    cnt_punct++;
ali@56
  2513
		    }
ali@56
  2514
		}
ali@56
  2515
		else
ali@56
  2516
		{
ali@56
  2517
		    /* parity odd */
ali@173
  2518
		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
ali@70
  2519
		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
ali@56
  2520
		    {
ali@56
  2521
			if (pswit[ECHO_SWITCH])
ali@70
  2522
			    g_print("\n%s\n",aline);
ali@56
  2523
			if (!pswit[OVERVIEW_SWITCH])
ali@70
  2524
			    g_print("    Line %ld column %ld - "
ali@56
  2525
			      "Wrongspaced singlequotes?\n",
ali@70
  2526
			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2527
			else
ali@56
  2528
			    cnt_punct++;
ali@56
  2529
		    }
ali@56
  2530
		}
ali@56
  2531
	    }
ali@56
  2532
	}
ali@56
  2533
    }
ali@56
  2534
}
ali@56
  2535
ali@55
  2536
/*
ali@57
  2537
 * check_for_double_punctuation:
ali@57
  2538
 *
ali@57
  2539
 * Look for double punctuation like ,. or ,,
ali@57
  2540
 * Thanks to DW for the suggestion!
ali@57
  2541
 * In books with references, ".," and ".;" are common
ali@57
  2542
 * e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57
  2543
 * OTOH, from my initial tests, there are also fairly
ali@57
  2544
 * common errors. What to do? Make these cases paranoid?
ali@57
  2545
 * ".," is the most common, so warnings->dotcomma is used
ali@57
  2546
 * to suppress detailed reporting if it occurs often.
ali@57
  2547
 */
ali@57
  2548
void check_for_double_punctuation(const char *aline,struct warnings *warnings)
ali@57
  2549
{
ali@70
  2550
    const char *s;
ali@70
  2551
    gunichar c,nc;
ali@70
  2552
    nc=g_utf8_get_char(aline);
ali@70
  2553
    for (s=aline;*s;s=g_utf8_next_char(s))
ali@57
  2554
    {
ali@70
  2555
	c=nc;
ali@70
  2556
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@57
  2557
	/* for each punctuation character in the line */
ali@70
  2558
	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
ali@70
  2559
	  g_utf8_strchr(".?!,;:",-1,nc))
ali@57
  2560
	{
ali@57
  2561
	    /* followed by punctuation, it's a query, unless . . . */
ali@70
  2562
	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
ali@70
  2563
	      !warnings->dotcomma && c=='.' && nc==',' ||
ali@70
  2564
	      warnings->isFrench && g_str_has_prefix(s,",...") ||
ali@70
  2565
	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
ali@70
  2566
	      warnings->isFrench && g_str_has_prefix(s,";...") ||
ali@70
  2567
	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
ali@70
  2568
	      warnings->isFrench && g_str_has_prefix(s,":...") ||
ali@70
  2569
	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
ali@70
  2570
	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
ali@70
  2571
	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
ali@70
  2572
	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
ali@70
  2573
	      warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@57
  2574
	    {
ali@70
  2575
		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
ali@70
  2576
		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
ali@70
  2577
		  warnings->isFrench && g_str_has_prefix(s,";...") ||
ali@70
  2578
		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
ali@70
  2579
		  warnings->isFrench && g_str_has_prefix(s,":...") ||
ali@70
  2580
		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
ali@70
  2581
		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
ali@70
  2582
		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
ali@70
  2583
		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
ali@70
  2584
		  warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@70
  2585
		{
ali@70
  2586
		    s+=4;
ali@70
  2587
		    nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70
  2588
		}
ali@57
  2589
		; /* do nothing for .. !! and ?? which can be legit */
ali@57
  2590
	    }
ali@57
  2591
	    else
ali@57
  2592
	    {
ali@57
  2593
		if (pswit[ECHO_SWITCH])
ali@70
  2594
		    g_print("\n%s\n",aline);
ali@57
  2595
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2596
		    g_print("    Line %ld column %ld - Double punctuation?\n",
ali@70
  2597
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@57
  2598
		else
ali@57
  2599
		    cnt_punct++;
ali@57
  2600
	    }
ali@57
  2601
	}
ali@57
  2602
    }
ali@57
  2603
}
ali@57
  2604
ali@57
  2605
/*
ali@58
  2606
 * check_for_spaced_quotes:
ali@58
  2607
 */
ali@58
  2608
void check_for_spaced_quotes(const char *aline)
ali@58
  2609
{
ali@99
  2610
    int i;
ali@58
  2611
    const char *s,*t;
ali@99
  2612
    const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
ali@99
  2613
      CHAR_RS_QUOTE};
ali@99
  2614
    GString *pattern;
ali@58
  2615
    s=aline;
ali@58
  2616
    while ((t=strstr(s," \" ")))
ali@58
  2617
    {
ali@58
  2618
	if (pswit[ECHO_SWITCH])
ali@70
  2619
	    g_print("\n%s\n",aline);
ali@58
  2620
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  2621
	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
ali@70
  2622
	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58
  2623
	else
ali@58
  2624
	    cnt_punct++;
ali@70
  2625
	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58
  2626
    }
ali@99
  2627
    pattern=g_string_new(NULL);
ali@99
  2628
    for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
ali@58
  2629
    {
ali@99
  2630
	g_string_assign(pattern," ");
ali@99
  2631
	g_string_append_unichar(pattern,single_quotes[i]);
ali@99
  2632
	g_string_append_c(pattern,' ');
ali@99
  2633
	s=aline;
ali@99
  2634
	while ((t=strstr(s,pattern->str)))
ali@99
  2635
	{
ali@99
  2636
	    if (pswit[ECHO_SWITCH])
ali@99
  2637
		g_print("\n%s\n",aline);
ali@99
  2638
	    if (!pswit[OVERVIEW_SWITCH])
ali@99
  2639
		g_print("    Line %ld column %ld - Spaced singlequote?\n",
ali@99
  2640
		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@99
  2641
	    else
ali@99
  2642
		cnt_punct++;
ali@99
  2643
	    s=g_utf8_next_char(g_utf8_next_char(t));
ali@99
  2644
	}
ali@58
  2645
    }
ali@99
  2646
    g_string_free(pattern,TRUE);
ali@58
  2647
}
ali@58
  2648
ali@58
  2649
/*
ali@59
  2650
 * check_for_miscased_genative:
ali@59
  2651
 *
ali@59
  2652
 * Check special case of 'S instead of 's at end of word.
ali@59
  2653
 */
ali@59
  2654
void check_for_miscased_genative(const char *aline)
ali@59
  2655
{
ali@59
  2656
    const char *s;
ali@70
  2657
    gunichar c,nc,pc;
ali@69
  2658
    if (!*aline)
ali@69
  2659
	return;
ali@70
  2660
    c=g_utf8_get_char(aline);
ali@70
  2661
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2662
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@59
  2663
    {
ali@70
  2664
	pc=c;
ali@70
  2665
	c=nc;
ali@70
  2666
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@99
  2667
	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
ali@59
  2668
	{
ali@59
  2669
	    if (pswit[ECHO_SWITCH])
ali@70
  2670
		g_print("\n%s\n",aline);
ali@59
  2671
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2672
		g_print("    Line %ld column %ld - Capital \"S\"?\n",
ali@70
  2673
		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
ali@59
  2674
	    else
ali@59
  2675
		cnt_punct++;
ali@59
  2676
	}
ali@59
  2677
    }
ali@59
  2678
}
ali@59
  2679
ali@59
  2680
/*
ali@60
  2681
 * check_end_of_line:
ali@60
  2682
 *
ali@60
  2683
 * Now check special cases - start and end of line -
ali@60
  2684
 * for single and double quotes. Start is sometimes [sic]
ali@60
  2685
 * but better to query it anyway.
ali@60
  2686
 * While we're here, check for dash at end of line.
ali@60
  2687
 */
ali@60
  2688
void check_end_of_line(const char *aline,struct warnings *warnings)
ali@60
  2689
{
ali@70
  2690
    int lbytes;
ali@70
  2691
    const char *s;
ali@70
  2692
    gunichar c1,c2;
ali@70
  2693
    lbytes=strlen(aline);
ali@70
  2694
    if (g_utf8_strlen(aline,lbytes)>1)
ali@60
  2695
    {
ali@70
  2696
	s=g_utf8_prev_char(aline+lbytes);
ali@70
  2697
	c1=g_utf8_get_char(s);
ali@70
  2698
	c2=g_utf8_get_char(g_utf8_prev_char(s));
ali@142
  2699
	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
ali@60
  2700
	{
ali@60
  2701
	    if (pswit[ECHO_SWITCH])
ali@70
  2702
		g_print("\n%s\n",aline);
ali@60
  2703
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2704
		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
ali@70
  2705
		  g_utf8_strlen(aline,lbytes));
ali@70
  2706
	    else
ali@70
  2707
		cnt_punct++;
ali@70
  2708
	}
ali@70
  2709
	c1=g_utf8_get_char(aline);
ali@70
  2710
	c2=g_utf8_get_char(g_utf8_next_char(aline));
ali@99
  2711
	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
ali@70
  2712
	{
ali@70
  2713
	    if (pswit[ECHO_SWITCH])
ali@70
  2714
		g_print("\n%s\n",aline);
ali@70
  2715
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2716
		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
ali@60
  2717
	    else
ali@60
  2718
		cnt_punct++;
ali@60
  2719
	}
ali@60
  2720
	/*
ali@60
  2721
	 * Dash at end of line may well be legit - paranoid mode only
ali@60
  2722
	 * and don't report em-dash at line-end.
ali@60
  2723
	 */
ali@60
  2724
	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@60
  2725
	{
ali@70
  2726
	    for (s=g_utf8_prev_char(aline+lbytes);
ali@70
  2727
	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
ali@60
  2728
		;
ali@70
  2729
	    if (g_utf8_get_char(s)=='-' &&
ali@70
  2730
	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@60
  2731
	    {
ali@60
  2732
		if (pswit[ECHO_SWITCH])
ali@70
  2733
		    g_print("\n%s\n",aline);
ali@60
  2734
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2735
		    g_print("    Line %ld column %ld - "
ali@70
  2736
		      "Hyphen at end of line?\n",
ali@70
  2737
		      linecnt,g_utf8_pointer_to_offset(aline,s));
ali@60
  2738
	    }
ali@60
  2739
	}
ali@60
  2740
    }
ali@60
  2741
}
ali@60
  2742
ali@60
  2743
/*
ali@61
  2744
 * check_for_unspaced_bracket:
ali@61
  2745
 *
ali@61
  2746
 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@61
  2747
 * If so, suspect a scanno like "a]most".
ali@61
  2748
 */
ali@61
  2749
void check_for_unspaced_bracket(const char *aline)
ali@61
  2750
{
ali@70
  2751
    const char *s;
ali@70
  2752
    gunichar c,nc,pc;
ali@70
  2753
    c=g_utf8_get_char(aline);
ali@70
  2754
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2755
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@61
  2756
    {
ali@70
  2757
	pc=c;
ali@70
  2758
	c=nc;
ali@70
  2759
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70
  2760
	if (!nc)
ali@70
  2761
	    break;
ali@61
  2762
	/* for each bracket character in the line except 1st & last */
ali@70
  2763
	if (g_utf8_strchr("{[()]}",-1,c) &&
ali@70
  2764
	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
ali@61
  2765
	{
ali@61
  2766
	    if (pswit[ECHO_SWITCH])
ali@70
  2767
		g_print("\n%s\n",aline);
ali@61
  2768
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2769
		g_print("    Line %ld column %ld - Unspaced bracket?\n",
ali@70
  2770
		  linecnt,g_utf8_pointer_to_offset(aline,s));
ali@61
  2771
	    else
ali@61
  2772
		cnt_punct++;
ali@61
  2773
	}
ali@61
  2774
    }
ali@61
  2775
}
ali@61
  2776
ali@61
  2777
/*
ali@62
  2778
 * check_for_unpunctuated_endquote:
ali@62
  2779
 */
ali@62
  2780
void check_for_unpunctuated_endquote(const char *aline)
ali@62
  2781
{
ali@70
  2782
    const char *s;
ali@70
  2783
    gunichar c,nc,pc;
ali@142
  2784
    QuoteClass qc;
ali@70
  2785
    c=g_utf8_get_char(aline);
ali@70
  2786
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2787
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@62
  2788
    {
ali@70
  2789
	pc=c;
ali@70
  2790
	c=nc;
ali@142
  2791
	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
ali@70
  2792
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@62
  2793
	/* for each character in the line except 1st */
ali@147
  2794
	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
ali@62
  2795
	{
ali@62
  2796
	    if (pswit[ECHO_SWITCH])
ali@70
  2797
		g_print("\n%s\n",aline);
ali@62
  2798
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2799
		g_print("    Line %ld column %ld - "
ali@70
  2800
		  "endquote missing punctuation?\n",
ali@70
  2801
		  linecnt,g_utf8_pointer_to_offset(aline,s));
ali@62
  2802
	    else
ali@62
  2803
		cnt_punct++;
ali@62
  2804
	}
ali@62
  2805
    }
ali@62
  2806
}
ali@62
  2807
ali@62
  2808
/*
ali@63
  2809
 * check_for_html_tag:
ali@63
  2810
 *
ali@63
  2811
 * Check for <HTML TAG>.
ali@63
  2812
 *
ali@63
  2813
 * If there is a < in the line, followed at some point
ali@63
  2814
 * by a > then we suspect HTML.
ali@63
  2815
 */
ali@63
  2816
void check_for_html_tag(const char *aline)
ali@63
  2817
{
ali@63
  2818
    const char *open,*close;
ali@70
  2819
    gchar *tag;
ali@70
  2820
    open=strchr(aline,'<');
ali@63
  2821
    if (open)
ali@63
  2822
    {
ali@70
  2823
	close=strchr(g_utf8_next_char(open),'>');
ali@63
  2824
	if (close)
ali@63
  2825
	{
ali@70
  2826
	    if (pswit[ECHO_SWITCH])
ali@70
  2827
		g_print("\n%s\n",aline);
ali@70
  2828
	    if (!pswit[OVERVIEW_SWITCH])
ali@63
  2829
	    {
ali@70
  2830
		tag=g_strndup(open,close-open+1);
ali@70
  2831
		g_print("    Line %ld column %ld - HTML Tag? %s \n",
ali@70
  2832
		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
ali@70
  2833
		g_free(tag);
ali@63
  2834
	    }
ali@70
  2835
	    else
ali@70
  2836
		cnt_html++;
ali@63
  2837
	}
ali@63
  2838
    }
ali@63
  2839
}
ali@63
  2840
ali@63
  2841
/*
ali@64
  2842
 * check_for_html_entity:
ali@64
  2843
 *
ali@64
  2844
 * Check for &symbol; HTML.
ali@64
  2845
 *
ali@64
  2846
 * If there is a & in the line, followed at
ali@64
  2847
 * some point by a ; then we suspect HTML.
ali@64
  2848
 */
ali@64
  2849
void check_for_html_entity(const char *aline)
ali@64
  2850
{
ali@64
  2851
    const char *s,*amp,*scolon;
ali@70
  2852
    gchar *entity;
ali@70
  2853
    amp=strchr(aline,'&');
ali@64
  2854
    if (amp)
ali@64
  2855
    {
ali@70
  2856
	scolon=strchr(amp,';');
ali@64
  2857
	if (scolon)
ali@64
  2858
	{
ali@70
  2859
	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
ali@70
  2860
		if (g_utf8_get_char(s)==CHAR_SPACE)
ali@70
  2861
		    break;		/* Don't report "Jones & Son;" */
ali@70
  2862
	    if (s>=scolon)
ali@64
  2863
	    {
ali@64
  2864
		if (pswit[ECHO_SWITCH])
ali@70
  2865
		    g_print("\n%s\n",aline);
ali@64
  2866
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2867
		{
ali@70
  2868
		    entity=g_strndup(amp,scolon-amp+1);
ali@70
  2869
		    g_print("    Line %ld column %d - HTML symbol? %s \n",
ali@70
  2870
		      linecnt,(int)(amp-aline)+1,entity);
ali@70
  2871
		    g_free(entity);
ali@70
  2872
		}
ali@64
  2873
		else
ali@64
  2874
		    cnt_html++;
ali@64
  2875
	    }
ali@64
  2876
	}
ali@64
  2877
    }
ali@64
  2878
}
ali@64
  2879
ali@65
  2880
/*
ali@66
  2881
 * check_for_omitted_punctuation:
ali@66
  2882
 *
ali@66
  2883
 * Check for omitted punctuation at end of paragraph by working back
ali@66
  2884
 * through prevline. DW.
ali@66
  2885
 * Need to check this only for "normal" paras.
ali@66
  2886
 * So what is a "normal" para?
ali@66
  2887
 *    Not normal if one-liner (chapter headings, etc.)
ali@66
  2888
 *    Not normal if doesn't contain at least one locase letter
ali@66
  2889
 *    Not normal if starts with space
ali@66
  2890
 */
ali@66
  2891
void check_for_omitted_punctuation(const char *prevline,
ali@66
  2892
  struct line_properties *last,int start_para_line)
ali@66
  2893
{
ali@70
  2894
    gboolean letter_on_line=FALSE;
ali@66
  2895
    const char *s;
ali@99
  2896
    gunichar c;
ali@142
  2897
    gboolean closing_quote;
ali@70
  2898
    for (s=prevline;*s;s=g_utf8_next_char(s))
ali@70
  2899
	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70
  2900
	{
ali@70
  2901
	    letter_on_line=TRUE;
ali@70
  2902
	    break;
ali@70
  2903
	}
ali@66
  2904
    /*
ali@66
  2905
     * This next "if" is a problem.
ali@66
  2906
     * If we say "start_para_line <= linecnt - 1", that includes
ali@66
  2907
     * one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@66
  2908
     * If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@66
  2909
     * misses genuine one-line paragraphs.
ali@66
  2910
     */
ali@70
  2911
    if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
ali@70
  2912
      g_utf8_get_char(prevline)>CHAR_SPACE)
ali@66
  2913
    {
ali@99
  2914
	s=prevline+strlen(prevline);
ali@99
  2915
	do
ali@99
  2916
	{
ali@99
  2917
	    s=g_utf8_prev_char(s);
ali@99
  2918
	    c=g_utf8_get_char(s);
ali@142
  2919
	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
ali@142
  2920
		closing_quote=TRUE;
ali@142
  2921
	    else
ali@142
  2922
		closing_quote=FALSE;
ali@142
  2923
	} while (closing_quote && s>prevline);
ali@70
  2924
	for (;s>prevline;s=g_utf8_prev_char(s))
ali@66
  2925
	{
ali@70
  2926
	    if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@66
  2927
	    {
ali@66
  2928
		if (pswit[ECHO_SWITCH])
ali@70
  2929
		    g_print("\n%s\n",prevline);
ali@66
  2930
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2931
		    g_print("    Line %ld column %ld - "
ali@66
  2932
		      "No punctuation at para end?\n",
ali@70
  2933
		      linecnt-1,g_utf8_strlen(prevline,-1));
ali@66
  2934
		else
ali@66
  2935
		    cnt_punct++;
ali@66
  2936
		break;
ali@66
  2937
	    }
ali@147
  2938
	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
ali@66
  2939
		break;
ali@66
  2940
	}
ali@66
  2941
    }
ali@66
  2942
}
ali@66
  2943
ali@69
  2944
/*
 * report_duplicate_queries:
 *
 * Print a note for one queried word if it was seen again after it
 * was first queried. key is the word and value points to its int
 * repeat counter; data is not used. Always returns FALSE.
 * NOTE(review): presumably a traversal callback for the qword tree,
 * where FALSE means "keep going" - confirm against the caller.
 */
gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
{
    const int repeats=*(int *)value;
    if (repeats!=0)
	g_print("\nNote: Queried word %s was duplicated %d times\n",
	  (const char *)key,repeats);
    return FALSE;
}
ali@69
  2953
ali@70
  2954
void print_as_windows_1252(const char *string)
ali@70
  2955
{
ali@70
  2956
    gsize inbytes,outbytes;
ali@70
  2957
    gchar *buf,*bp;
ali@86
  2958
    static GIConv converter=(GIConv)-1;
ali@70
  2959
    if (!string)
ali@70
  2960
    {
ali@70
  2961
	if (converter!=(GIConv)-1)
ali@70
  2962
	    g_iconv_close(converter);
ali@70
  2963
	converter=(GIConv)-1;
ali@70
  2964
	return;
ali@70
  2965
    }
ali@86
  2966
    if (converter==(GIConv)-1)
ali@70
  2967
	converter=g_iconv_open("WINDOWS-1252","UTF-8");
ali@70
  2968
    if (converter!=(GIConv)-1)
ali@70
  2969
    {
ali@70
  2970
	inbytes=outbytes=strlen(string);
ali@70
  2971
	bp=buf=g_malloc(outbytes+1);
ali@70
  2972
	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
ali@70
  2973
	*bp='\0';
ali@70
  2974
	fputs(buf,stdout);
ali@70
  2975
	g_free(buf);
ali@70
  2976
    }
ali@70
  2977
    else
ali@70
  2978
	fputs(string,stdout);
ali@70
  2979
}
ali@70
  2980
ali@72
  2981
/*
 * print_as_utf_8:
 *
 * Write the string to stdout exactly as given (no conversion is done).
 */
void print_as_utf_8(const char *string)
{
    (void)fputs(string,stdout);
}
ali@72
  2985
ali@66
  2986
/*
ali@41
  2987
 * procfile:
ali@41
  2988
 *
ali@41
  2989
 * Process one file.
ali@41
  2990
 */
ali@69
  2991
void procfile(const char *filename)
ali@41
  2992
{
ali@65
  2993
    const char *s;
ali@69
  2994
    gchar *parastart=NULL;	/* first line of current para */
ali@69
  2995
    gchar *etext,*aline;
ali@69
  2996
    gchar *etext_ptr;
ali@69
  2997
    GError *err=NULL;
ali@41
  2998
    struct first_pass_results *first_pass_results;
ali@42
  2999
    struct warnings *warnings;
ali@43
  3000
    struct counters counters={0};
ali@45
  3001
    struct line_properties last={0};
ali@56
  3002
    struct parities parities={0};
ali@69
  3003
    struct pending pending={0};
ali@69
  3004
    gboolean isemptyline;
ali@68
  3005
    long start_para_line=0;
ali@69
  3006
    gboolean isnewpara=FALSE,enddash=FALSE;
ali@45
  3007
    last.start=CHAR_SPACE;
ali@68
  3008
    linecnt=checked_linecnt=0;
ali@69
  3009
    etext=read_etext(filename,&err);
ali@69
  3010
    if (!etext)
ali@41
  3011
    {
ali@68
  3012
	if (pswit[STDOUT_SWITCH])
ali@69
  3013
	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
ali@68
  3014
	else
ali@69
  3015
	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
ali@41
  3016
	exit(1);
ali@41
  3017
    }
ali@70
  3018
    g_print("\n\nFile: %s\n\n",filename);
ali@69
  3019
    first_pass_results=first_pass(etext);
ali@42
  3020
    warnings=report_first_pass(first_pass_results);
ali@69
  3021
    qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
ali@69
  3022
    qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@40
  3023
    /*
ali@40
  3024
     * Here we go with the main pass. Hold onto yer hat!
ali@40
  3025
     */
ali@65
  3026
    linecnt=0;
ali@69
  3027
    etext_ptr=etext;
ali@199
  3028
    while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
ali@40
  3029
    {
ali@68
  3030
	linecnt++;
ali@68
  3031
	if (linecnt==1)
ali@69
  3032
	    isnewpara=TRUE;
ali@70
  3033
	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
ali@40
  3034
	    continue;    // skip DP page separators completely
ali@68
  3035
	if (linecnt<first_pass_results->firstline ||
ali@41
  3036
	  (first_pass_results->footerline>0 &&
ali@41
  3037
	  linecnt>first_pass_results->footerline))
ali@40
  3038
	{
ali@68
  3039
	    if (pswit[HEADER_SWITCH])
ali@40
  3040
	    {
ali@70
  3041
		if (g_str_has_prefix(aline,"Title:"))
ali@70
  3042
		    g_print("    %s\n",aline);
ali@70
  3043
		if (g_str_has_prefix(aline,"Author:"))
ali@70
  3044
		    g_print("    %s\n",aline);
ali@70
  3045
		if (g_str_has_prefix(aline,"Release Date:"))
ali@70
  3046
		    g_print("    %s\n",aline);
ali@70
  3047
		if (g_str_has_prefix(aline,"Edition:"))
ali@70
  3048
		    g_print("    %s\n\n",aline);
ali@40
  3049
	    }
ali@68
  3050
	    continue;		/* skip through the header */
ali@40
  3051
	}
ali@68
  3052
	checked_linecnt++;
ali@65
  3053
	print_pending(aline,parastart,&pending);
ali@164
  3054
	isemptyline=analyse_quotes(aline,&counters);
ali@68
  3055
	if (isnewpara && !isemptyline)
ali@40
  3056
	{
ali@40
  3057
	    /* This line is the start of a new paragraph. */
ali@68
  3058
	    start_para_line=linecnt;
ali@40
  3059
	    /* Capture its first line in case we want to report it later. */
ali@69
  3060
	    g_free(parastart);
ali@69
  3061
	    parastart=g_strdup(aline);
ali@56
  3062
	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
ali@68
  3063
	    s=aline;
ali@70
  3064
	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
ali@70
  3065
	      !g_unichar_isdigit(g_utf8_get_char(s)))
ali@70
  3066
		s=g_utf8_next_char(s);
ali@70
  3067
	    if (g_unichar_islower(g_utf8_get_char(s)))
ali@40
  3068
	    {
ali@40
  3069
		/* and its first letter is lowercase */
ali@68
  3070
		if (pswit[ECHO_SWITCH])
ali@70
  3071
		    g_print("\n%s\n",aline);
ali@68
  3072
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  3073
		    g_print("    Line %ld column %ld - "
ali@40
  3074
		      "Paragraph starts with lower-case\n",
ali@70
  3075
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@68
  3076
		else
ali@68
  3077
		    cnt_punct++;
ali@40
  3078
	    }
ali@69
  3079
	    isnewpara=FALSE; /* Signal the end of new para processing. */
ali@40
  3080
	}
ali@68
  3081
	/* Check for an em-dash broken at line end. */
ali@70
  3082
	if (enddash && g_utf8_get_char(aline)=='-')
ali@40
  3083
	{
ali@68
  3084
	    if (pswit[ECHO_SWITCH])
ali@70
  3085
		g_print("\n%s\n",aline);
ali@68
  3086
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  3087
		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@68
  3088
	    else
ali@68
  3089
		cnt_punct++;
ali@40
  3090
	}
ali@69
  3091
	enddash=FALSE;
ali@70
  3092
	for (s=g_utf8_prev_char(aline+strlen(aline));
ali@70
  3093
	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
ali@40
  3094
	    ;
ali@70
  3095
	if (s>=aline && g_utf8_get_char(s)=='-')
ali@69
  3096
	    enddash=TRUE;
ali@67
  3097
	check_for_control_characters(aline);
ali@185
  3098
	check_for_odd_characters(aline,warnings,isemptyline);
ali@68
  3099
	if (warnings->longline)
ali@45
  3100
	    check_for_long_line(aline);
ali@68
  3101
	if (warnings->shortline)
ali@45
  3102
	    check_for_short_line(aline,&last);
ali@68
  3103
	last.blen=last.len;
ali@70
  3104
	last.len=g_utf8_strlen(aline,-1);
ali@70
  3105
	last.start=g_utf8_get_char(aline);
ali@46
  3106
	check_for_starting_punctuation(aline);
ali@68
  3107
	if (warnings->dash)
ali@40
  3108
	{
ali@47
  3109
	    check_for_spaced_emdash(aline);
ali@47
  3110
	    check_for_spaced_dash(aline);
ali@40
  3111
	}
ali@48
  3112
	check_for_unmarked_paragraphs(aline);
ali@49
  3113
	check_for_jeebies(aline);
ali@50
  3114
	check_for_mta_from(aline);
ali@51
  3115
	check_for_orphan_character(aline);
ali@52
  3116
	check_for_pling_scanno(aline);
ali@53
  3117
	check_for_extra_period(aline,warnings);
ali@54
  3118
	check_for_following_punctuation(aline);
ali@55
  3119
	check_for_typos(aline,warnings);
ali@56
  3120
	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57
  3121
	check_for_double_punctuation(aline,warnings);
ali@58
  3122
	check_for_spaced_quotes(aline);
ali@59
  3123
	check_for_miscased_genative(aline);
ali@60
  3124
	check_end_of_line(aline,warnings);
ali@61
  3125
	check_for_unspaced_bracket(aline);
ali@68
  3126
	if (warnings->endquote)
ali@62
  3127
	    check_for_unpunctuated_endquote(aline);
ali@63
  3128
	check_for_html_tag(aline);
ali@64
  3129
	check_for_html_entity(aline);
ali@68
  3130
	if (isemptyline)
ali@40
  3131
	{
ali@65
  3132
	    check_for_mismatched_quotes(&counters,&pending);
ali@103
  3133
	    counters_reset(&counters);
ali@40
  3134
	    /* let the next iteration know that it's starting a new para */
ali@69
  3135
	    isnewpara=TRUE;
ali@69
  3136
	    if (prevline)
ali@69
  3137
		check_for_omitted_punctuation(prevline,&last,start_para_line);
ali@40
  3138
	}
ali@69
  3139
	g_free(prevline);
ali@69
  3140
	prevline=g_strdup(aline);
ali@0
  3141
    }
ali@103
  3142
    linecnt++;
ali@103
  3143
    check_for_mismatched_quotes(&counters,&pending);
ali@103
  3144
    print_pending(NULL,parastart,&pending);
ali@103
  3145
    reset_pending(&pending);
ali@69
  3146
    if (prevline)
ali@69
  3147
    {
ali@69
  3148
	g_free(prevline);
ali@69
  3149
	prevline=NULL;
ali@69
  3150
    }
ali@69
  3151
    g_free(parastart);
ali@69
  3152
    g_free(prevline);
ali@69
  3153
    g_free(etext);
ali@79
  3154
    if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
ali@69
  3155
	g_tree_foreach(qword,report_duplicate_queries,NULL);
ali@69
  3156
    g_tree_unref(qword);
ali@69
  3157
    g_tree_unref(qperiod);
ali@99
  3158
    counters_destroy(&counters);
ali@70
  3159
    g_set_print_handler(NULL);
ali@70
  3160
    print_as_windows_1252(NULL);
ali@71
  3161
    if (pswit[MARKUP_SWITCH])  
ali@71
  3162
	loseentities(NULL);
ali@0
  3163
}
ali@0
  3164
ali@40
  3165
/*
ali@40
  3166
 * flgets:
ali@40
  3167
 *
ali@199
  3168
 * Get one line from the input text. The setting of newlines has the following
ali@199
  3169
 * effect:
ali@199
  3170
 *
ali@199
  3171
 * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
ali@199
  3172
 *
ali@199
  3173
 * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
ali@199
  3174
 *		 the newline character.
ali@199
  3175
 *
ali@199
  3176
 * UNIX_NEWLINES: Check for the presence of CRs.
ali@199
  3177
 *
ali@199
  3178
 * In all cases, check that the last line is correctly terminated.
ali@40
  3179
 *
ali@40
  3180
 * Returns: a pointer to the line.
ali@40
  3181
 */
ali@199
  3182
char *flgets(char **etext,long lcnt,int newlines)
ali@0
  3183
{
ali@70
  3184
    gunichar c;
ali@69
  3185
    gboolean isCR=FALSE;
ali@69
  3186
    char *theline=*etext;
ali@70
  3187
    char *eos=theline;
ali@70
  3188
    gchar *s;
ali@70
  3189
    for (;;)
ali@40
  3190
    {
ali@70
  3191
	c=g_utf8_get_char(*etext);
ali@173
  3192
	if (!c)
ali@173
  3193
	{
ali@173
  3194
	    if (*etext==theline)
ali@173
  3195
		return NULL;
ali@173
  3196
	    else if (pswit[LINE_END_SWITCH])
ali@173
  3197
	    {
ali@173
  3198
		if (pswit[ECHO_SWITCH])
ali@173
  3199
		{
ali@173
  3200
		    s=g_strndup(theline,eos-theline);
ali@173
  3201
		    g_print("\n%s\n",s);
ali@173
  3202
		    g_free(s);
ali@173
  3203
		}
ali@173
  3204
		if (!pswit[OVERVIEW_SWITCH])
ali@199
  3205
		{
ali@199
  3206
		    if (newlines==OS9_NEWLINES)
ali@199
  3207
			g_print("    Line %ld - No CR?\n",lcnt);
ali@199
  3208
		    else
ali@199
  3209
		    {
ali@199
  3210
			/* There may, or may not, have been a CR */
ali@199
  3211
			g_print("    Line %ld - No LF?\n",lcnt);
ali@199
  3212
		    }
ali@199
  3213
		}
ali@173
  3214
		else
ali@173
  3215
		    cnt_lineend++;
ali@173
  3216
	    }
ali@173
  3217
	    break;
ali@173
  3218
	}
ali@70
  3219
	*etext=g_utf8_next_char(*etext);
ali@40
  3220
	/* either way, it's end of line */
ali@69
  3221
	if (c=='\n')
ali@40
  3222
	{
ali@199
  3223
	    if (newlines==DOS_NEWLINES && !isCR)
ali@40
  3224
	    {
ali@40
  3225
		/* Error - a LF without a preceding CR */
ali@68
  3226
		if (pswit[LINE_END_SWITCH])
ali@40
  3227
		{
ali@68
  3228
		    if (pswit[ECHO_SWITCH])
ali@70
  3229
		    {
ali@70
  3230
			s=g_strndup(theline,eos-theline);
ali@70
  3231
			g_print("\n%s\n",s);
ali@70
  3232
			g_free(s);
ali@70
  3233
		    }
ali@68
  3234
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  3235
			g_print("    Line %ld - No CR?\n",lcnt);
ali@68
  3236
		    else
ali@68
  3237
			cnt_lineend++;
ali@40
  3238
		}
ali@40
  3239
	    }
ali@199
  3240
	    break;
ali@40
  3241
	}
ali@69
  3242
	if (c=='\r')
ali@40
  3243
	{
ali@199
  3244
	    if (newlines==OS9_NEWLINES)
ali@199
  3245
		break;
ali@199
  3246
	    if (isCR || newlines==UNIX_NEWLINES)
ali@40
  3247
	    {
ali@68
  3248
		if (pswit[LINE_END_SWITCH])
ali@40
  3249
		{
ali@68
  3250
		    if (pswit[ECHO_SWITCH])
ali@70
  3251
		    {
ali@70
  3252
			s=g_strndup(theline,eos-theline);
ali@70
  3253
			g_print("\n%s\n",s);
ali@70
  3254
			g_free(s);
ali@70
  3255
		    }
ali@68
  3256
		    if (!pswit[OVERVIEW_SWITCH])
ali@199
  3257
		    {
ali@199
  3258
			if (newlines==UNIX_NEWLINES)
ali@199
  3259
			    g_print("    Line %ld column %ld - Embedded CR?\n",
ali@199
  3260
			      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
ali@199
  3261
			else
ali@199
  3262
			    g_print("    Line %ld - Two successive CRs?\n",
ali@199
  3263
			      lcnt);
ali@199
  3264
		    }
ali@68
  3265
		    else
ali@68
  3266
			cnt_lineend++;
ali@40
  3267
		}
ali@199
  3268
		if (newlines==UNIX_NEWLINES)
ali@199
  3269
		    *eos=' ';
ali@40
  3270
	    }
ali@199
  3271
	    if (newlines==DOS_NEWLINES)
ali@199
  3272
		isCR=TRUE;
ali@40
  3273
	}
ali@68
  3274
	else
ali@40
  3275
	{
ali@68
  3276
	    if (pswit[LINE_END_SWITCH] && isCR)
ali@40
  3277
	    {
ali@68
  3278
		if (pswit[ECHO_SWITCH])
ali@70
  3279
		{
ali@70
  3280
		    s=g_strndup(theline,eos-theline);
ali@70
  3281
		    g_print("\n%s\n",s);
ali@70
  3282
		    g_free(s);
ali@70
  3283
		}
ali@68
  3284
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  3285
		    g_print("    Line %ld column %ld - CR without LF?\n",
ali@70
  3286
		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
ali@68
  3287
		else
ali@68
  3288
		    cnt_lineend++;
ali@70
  3289
		*eos=' ';
ali@40
  3290
	    }
ali@69
  3291
	    isCR=FALSE;
ali@70
  3292
	    eos=g_utf8_next_char(eos);
ali@40
  3293
	}
ali@69
  3294
    }
ali@70
  3295
    *eos='\0';
ali@0
  3296
    if (pswit[MARKUP_SWITCH])  
ali@68
  3297
	postprocess_for_HTML(theline);
ali@0
  3298
    if (pswit[DP_SWITCH])  
ali@68
  3299
	postprocess_for_DP(theline);
ali@40
  3300
    return theline;
ali@0
  3301
}
ali@0
  3302
ali@40
  3303
/*
ali@40
  3304
 * mixdigit:
ali@40
  3305
 *
ali@40
  3306
 * Takes a "word" as a parameter, and checks whether it
ali@40
  3307
 * contains a mixture of alpha and digits. Generally, this is an
ali@40
  3308
 * error, but may not be for cases like 4th or L5 12s. 3d.
ali@40
  3309
 *
ali@70
  3310
 * Returns: TRUE iff an is error found.
ali@40
  3311
 */
ali@70
  3312
gboolean mixdigit(const char *checkword)
ali@0
  3313
{
ali@70
  3314
    gboolean wehaveadigit,wehavealetter,query;
ali@70
  3315
    const char *s,*nondigit;
ali@70
  3316
    wehaveadigit=wehavealetter=query=FALSE;
ali@70
  3317
    for (s=checkword;*s;s=g_utf8_next_char(s))
ali@70
  3318
	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70
  3319
	    wehavealetter=TRUE;
ali@70
  3320
	else if (g_unichar_isdigit(g_utf8_get_char(s)))
ali@70
  3321
	    wehaveadigit=TRUE;
ali@40
  3322
    if (wehaveadigit && wehavealetter)
ali@40
  3323
    {
ali@40
  3324
	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
ali@70
  3325
	query=TRUE;
ali@70
  3326
	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
ali@70
  3327
	  nondigit=g_utf8_next_char(nondigit))
ali@68
  3328
	    ;
ali@68
  3329
	/* digits, ending in st, rd, nd, th of either case */
ali@70
  3330
	if (!g_ascii_strcasecmp(nondigit,"st") ||
ali@70
  3331
	  !g_ascii_strcasecmp(nondigit,"rd") ||
ali@70
  3332
	  !g_ascii_strcasecmp(nondigit,"nd") ||
ali@70
  3333
	  !g_ascii_strcasecmp(nondigit,"th"))
ali@70
  3334
	    query=FALSE;
ali@70
  3335
	if (!g_ascii_strcasecmp(nondigit,"sts") ||
ali@70
  3336
	  !g_ascii_strcasecmp(nondigit,"rds") ||
ali@70
  3337
	  !g_ascii_strcasecmp(nondigit,"nds") ||
ali@70
  3338
	  !g_ascii_strcasecmp(nondigit,"ths"))
ali@70
  3339
	    query=FALSE;
ali@70
  3340
	if (!g_ascii_strcasecmp(nondigit,"stly") ||
ali@70
  3341
	  !g_ascii_strcasecmp(nondigit,"rdly") ||
ali@70
  3342
	  !g_ascii_strcasecmp(nondigit,"ndly") ||
ali@70
  3343
	  !g_ascii_strcasecmp(nondigit,"thly"))
ali@70
  3344
	    query=FALSE;
ali@68
  3345
	/* digits, ending in l, L, s or d */
ali@70
  3346
	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
ali@70
  3347
	  !strcmp(nondigit,"d"))
ali@70
  3348
	    query=FALSE;
ali@68
  3349
	/*
ali@40
  3350
	 * L at the start of a number, representing Britsh pounds, like L500.
ali@70
  3351
	 * This is cute. We know the current word is mixed digit. If the first
ali@68
  3352
	 * letter is L, there must be at least one digit following. If both
ali@68
  3353
	 * digits and letters follow, we have a genuine error, else we have a
ali@68
  3354
	 * capital L followed by digits, and we accept that as a non-error.
ali@40
  3355
	 */
ali@70
  3356
	if (g_utf8_get_char(checkword)=='L' &&
ali@70
  3357
	  !mixdigit(g_utf8_next_char(checkword)))
ali@70
  3358
	    query=FALSE;
ali@40
  3359
    }
ali@40
  3360
    return query;
ali@0
  3361
}
ali@0
  3362
ali@40
  3363
/*
ali@40
  3364
 * getaword:
ali@40
  3365
 *
ali@69
  3366
 * Extracts the first/next "word" from the line, and returns it.
ali@69
  3367
 * A word is defined as one English word unit--or at least that's the aim.
ali@69
  3368
 * "ptr" is advanced to the position in the line where we will start
ali@69
  3369
 * looking for the next word.
ali@40
  3370
 *
ali@69
  3371
 * Returns: A newly-allocated string.
ali@40
  3372
 */
ali@69
  3373
gchar *getaword(const char **ptr)
ali@0
  3374
{
ali@70
  3375
    const char *s,*t;
ali@69
  3376
    GString *word;
ali@70
  3377
    gunichar c,pc;
ali@69
  3378
    word=g_string_new(NULL);
ali@70
  3379
    for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
ali@70
  3380
      !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
ali@70
  3381
      **ptr;*ptr=g_utf8_next_char(*ptr))
ali@174
  3382
    {
ali@174
  3383
	/* Handle exceptions for footnote markers like [1] */
ali@174
  3384
	if (g_utf8_get_char(*ptr)=='[')
ali@174
  3385
	{
ali@174
  3386
	    g_string_append_c(word,'[');
ali@174
  3387
	    s=g_utf8_next_char(*ptr);
ali@174
  3388
	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
ali@174
  3389
		g_string_append_unichar(word,g_utf8_get_char(s));
ali@174
  3390
	    if (g_utf8_get_char(s)==']')
ali@174
  3391
	    {
ali@174
  3392
		g_string_append_c(word,']');
ali@174
  3393
		*ptr=g_utf8_next_char(s);
ali@174
  3394
		return g_string_free(word,FALSE);
ali@174
  3395
	    }
ali@174
  3396
	    else
ali@174
  3397
		g_string_truncate(word,0);
ali@174
  3398
	}
ali@174
  3399
    }
ali@40
  3400
    /*
ali@40
  3401
     * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40
  3402
     * Especially yucky is the case of L1,000
ali@40
  3403
     * This section looks for a pattern of characters including a digit
ali@40
  3404
     * followed by a comma or period followed by one or more digits.
ali@40
  3405
     * If found, it returns this whole pattern as a word; otherwise we discard
ali@40
  3406
     * the results and resume our normal programming.
ali@40
  3407
     */
ali@69
  3408
    s=*ptr;
ali@70
  3409
    for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
ali@70
  3410
      g_unichar_isalpha(g_utf8_get_char(s)) ||
ali@70
  3411
      g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
ali@70
  3412
	g_string_append_unichar(word,g_utf8_get_char(s));
ali@82
  3413
    if (word->len)
ali@40
  3414
    {
ali@82
  3415
	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
ali@40
  3416
	{
ali@82
  3417
	    c=g_utf8_get_char(t);
ali@82
  3418
	    pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@82
  3419
	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
ali@82
  3420
	    {
ali@82
  3421
		*ptr=s;
ali@82
  3422
		return g_string_free(word,FALSE);
ali@82
  3423
	    }
ali@40
  3424
	}
ali@40
  3425
    }
ali@0
  3426
    /* we didn't find a punctuated number - do the regular getword thing */
ali@69
  3427
    g_string_truncate(word,0);
ali@99
  3428
    c=g_utf8_get_char(*ptr);
ali@99
  3429
    for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
ali@99
  3430
      *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
ali@99
  3431
	g_string_append_unichar(word,c);
ali@69
  3432
    return g_string_free(word,FALSE);
ali@0
  3433
}
ali@0
  3434
ali@40
  3435
/*
ali@40
  3436
 * isroman:
ali@40
  3437
 *
ali@40
  3438
 * Is this word a Roman Numeral?
ali@40
  3439
 *
ali@40
  3440
 * It doesn't actually validate that the number is a valid Roman Numeral--for
ali@40
  3441
 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
ali@40
  3442
 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
ali@40
  3443
 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
ali@40
  3444
 * expressions thereof, except when it came to taxes. Allow any number of M,
ali@40
  3445
 * an optional D, an optional CM or CD, any number of optional Cs, an optional
ali@40
  3446
 * XL or an optional XC, an optional IX or IV, an optional V and any number
ali@40
  3447
 * of optional Is.
ali@40
  3448
 */
ali@69
  3449
gboolean isroman(const char *t)
ali@0
  3450
{
ali@69
  3451
    const char *s;
ali@40
  3452
    if (!t || !*t)
ali@69
  3453
	return FALSE;
ali@40
  3454
    s=t;
ali@70
  3455
    while (g_utf8_get_char(t)=='m' && *t)
ali@40
  3456
	t++;
ali@70
  3457
    if (g_utf8_get_char(t)=='d')
ali@40
  3458
	t++;
ali@70
  3459
    if (g_str_has_prefix(t,"cm"))
ali@40
  3460
	t+=2;
ali@70
  3461
    if (g_str_has_prefix(t,"cd"))
ali@40
  3462
	t+=2;
ali@70
  3463
    while (g_utf8_get_char(t)=='c' && *t)
ali@40
  3464
	t++;
ali@70
  3465
    if (g_str_has_prefix(t,"xl"))
ali@40
  3466
	t+=2;
ali@70
  3467
    if (g_str_has_prefix(t,"xc"))
ali@40
  3468
	t+=2;
ali@70
  3469
    if (g_utf8_get_char(t)=='l')
ali@40
  3470
	t++;
ali@70
  3471
    while (g_utf8_get_char(t)=='x' && *t)
ali@40
  3472
	t++;
ali@70
  3473
    if (g_str_has_prefix(t,"ix"))
ali@40
  3474
	t+=2;
ali@70
  3475
    if (g_str_has_prefix(t,"iv"))
ali@40
  3476
	t+=2;
ali@70
  3477
    if (g_utf8_get_char(t)=='v')
ali@40
  3478
	t++;
ali@70
  3479
    while (g_utf8_get_char(t)=='i' && *t)
ali@40
  3480
	t++;
ali@40
  3481
    return !*t;
ali@0
  3482
}
ali@0
  3483
ali@40
  3484
/*
ali@40
  3485
 * postprocess_for_DP:
ali@40
  3486
 *
ali@40
  3487
 * Invoked with the -d switch from flgets().
ali@40
  3488
 * It simply "removes" from the line a hard-coded set of common
ali@40
  3489
 * DP-specific tags, so that the line passed to the main routine has
ali@40
  3490
 * been pre-cleaned of DP markup.
ali@40
  3491
 */
ali@0
  3492
void postprocess_for_DP(char *theline)
ali@0
  3493
{
ali@40
  3494
    char *s,*t;
ali@0
  3495
    int i;
ali@0
  3496
    if (!*theline) 
ali@68
  3497
	return;
ali@40
  3498
    for (i=0;*DPmarkup[i];i++)
ali@70
  3499
	while ((s=strstr(theline,DPmarkup[i])))
ali@40
  3500
	{
ali@68
  3501
	    t=s+strlen(DPmarkup[i]);
ali@70
  3502
	    memmove(s,t,strlen(t)+1);
ali@40
  3503
	}
ali@0
  3504
}
ali@0
  3505
ali@40
  3506
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Calls losemarkup() repeatedly until it reports that there is nothing
 * more it can remove, then calls loseentities() to replace HTML entities,
 * so that the line passed to the main routine has been pre-cleaned of HTML.
 * The line is modified in place.
 */
void postprocess_for_HTML(char *theline)
{
    char *removed;
    do
        removed=losemarkup(theline);
    while (removed);
    loseentities(theline);
}
ali@0
  3521
ali@0
  3522
char *losemarkup(char *theline)
ali@0
  3523
{
ali@40
  3524
    char *s,*t;
ali@0
  3525
    int i;
ali@70
  3526
    s=strchr(theline,'<');
ali@70
  3527
    t=s?strchr(s,'>'):NULL;
ali@40
  3528
    if (!s || !t)
ali@40
  3529
	return NULL;
ali@40
  3530
    for (i=0;*markup[i];i++)
ali@70
  3531
	if (tagcomp(g_utf8_next_char(s),markup[i]))
ali@40
  3532
	{
ali@70
  3533
	    t=g_utf8_next_char(t);
ali@70
  3534
	    memmove(s,t,strlen(t)+1);
ali@70
  3535
	    return s;
ali@68
  3536
	}
ali@40
  3537
    /* It's an unrecognized <xxx>. */
ali@40
  3538
    return NULL;
ali@0
  3539
}
ali@0
  3540
ali@71
  3541
void loseentities(char *theline)
ali@0
  3542
{
ali@0
  3543
    int i;
ali@71
  3544
    gsize nb;
ali@71
  3545
    char *amp,*scolon;
ali@71
  3546
    gchar *s,*t;
ali@71
  3547
    gunichar c;
ali@71
  3548
    GTree *entities=NULL;
ali@86
  3549
    static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
ali@71
  3550
    if (!theline)
ali@40
  3551
    {
ali@71
  3552
	if (entities)
ali@71
  3553
	    g_tree_destroy(entities);
ali@71
  3554
	entities=NULL;
ali@86
  3555
	if (translit!=(GIConv)-1)
ali@71
  3556
	    g_iconv_close(translit);
ali@71
  3557
	translit=(GIConv)-1;
ali@86
  3558
	if (to_utf8!=(GIConv)-1)
ali@71
  3559
	    g_iconv_close(to_utf8);
ali@71
  3560
	to_utf8=(GIConv)-1;
ali@71
  3561
	return;
ali@71
  3562
    }
ali@71
  3563
    if (!*theline)
ali@71
  3564
	return;
ali@71
  3565
    if (!entities)
ali@71
  3566
    {
ali@71
  3567
	entities=g_tree_new((GCompareFunc)strcmp);
ali@71
  3568
	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
ali@71
  3569
	    g_tree_insert(entities,HTMLentities[i].name,
ali@71
  3570
	      GUINT_TO_POINTER(HTMLentities[i].c));
ali@71
  3571
    }
ali@71
  3572
    if (translit==(GIConv)-1)
ali@71
  3573
	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
ali@71
  3574
    if (to_utf8==(GIConv)-1)
ali@71
  3575
	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
ali@71
  3576
    while((amp=strchr(theline,'&')))
ali@71
  3577
    {
ali@71
  3578
	scolon=strchr(amp,';');
ali@71
  3579
	if (scolon)
ali@40
  3580
	{
ali@71
  3581
	    if (amp[1]=='#')
ali@71
  3582
	    {
ali@71
  3583
		if (amp+2+strspn(amp+2,"0123456789")==scolon)
ali@71
  3584
		    c=strtol(amp+2,NULL,10);
ali@71
  3585
		else if (amp[2]=='x' &&
ali@71
  3586
		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
ali@71
  3587
		    c=strtol(amp+3,NULL,16);
ali@71
  3588
	    }
ali@71
  3589
	    else
ali@71
  3590
	    {
ali@71
  3591
		s=g_strndup(amp+1,scolon-(amp+1));
ali@71
  3592
	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
ali@71
  3593
		g_free(s);
ali@71
  3594
	    }
ali@40
  3595
	}
ali@71
  3596
	else
ali@71
  3597
	    c=0;
ali@71
  3598
	if (c)
ali@71
  3599
	{
ali@71
  3600
	    theline=amp;
ali@71
  3601
	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
ali@71
  3602
		theline+=g_unichar_to_utf8(c,theline);
ali@71
  3603
	    else
ali@71
  3604
	    {
ali@71
  3605
		s=g_malloc(6);
ali@71
  3606
		nb=g_unichar_to_utf8(c,s);
ali@71
  3607
		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
ali@71
  3608
		g_free(s);
ali@71
  3609
		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
ali@71
  3610
		g_free(t);
ali@71
  3611
		memcpy(theline,s,nb);
ali@71
  3612
		g_free(s);
ali@71
  3613
		theline+=nb;
ali@71
  3614
	    }
ali@71
  3615
	    memmove(theline,g_utf8_next_char(scolon),
ali@71
  3616
	      strlen(g_utf8_next_char(scolon))+1);
ali@71
  3617
	}
ali@71
  3618
	else
ali@71
  3619
	    theline=g_utf8_next_char(amp);
ali@40
  3620
    }
ali@0
  3621
}
ali@0
  3622
ali@70
  3623
gboolean tagcomp(const char *strin,const char *basetag)
ali@0
  3624
{
ali@70
  3625
    gboolean retval;
ali@70
  3626
    gchar *s,*t;
ali@70
  3627
    if (g_utf8_get_char(strin)=='/')
ali@70
  3628
	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
ali@70
  3629
    else
ali@70
  3630
	t=g_utf8_casefold(strin,-1);
ali@70
  3631
    s=g_utf8_casefold(basetag,-1);
ali@70
  3632
    retval=g_str_has_prefix(t,s);
ali@70
  3633
    g_free(s);
ali@70
  3634
    g_free(t);
ali@70
  3635
    return retval;
ali@0
  3636
}
ali@0
  3637
ali@69
  3638
/*
 * proghelp:
 *
 * Write the version and copyright banner, the option help generated from
 * "context", and a short description of what bookloupe does to stderr.
 */
void proghelp(GOptionContext *context)
{
    static const char *const banner[]={
        "Bookloupe version " PACKAGE_VERSION ".\n",
        "Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",
        "Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",
        "Bookloupe comes with ABSOLUTELY NO WARRANTY. "
          "For details, read the file COPYING.\n",
        "This is Free Software; "
          "you may redistribute it under certain conditions (GPL);\n",
        "read the file COPYING for details.\n\n"
    };
    static const char *const description[]={
        "Sample usage: bookloupe warpeace.txt\n\n",
        "Bookloupe queries anything it thinks shouldn't be in a PG text; "
          "non-ASCII\n",
        "characters like accented letters, "
          "lines longer than 75 or shorter than 55,\n",
        "unbalanced quotes or brackets, "
          "a variety of badly formatted punctuation, \n",
        "HTML tags, some likely typos. "
          "It is NOT a substitute for human judgement.\n",
        "\n"
    };
    gchar *help;
    guint i;
    for (i=0;i<G_N_ELEMENTS(banner);i++)
        fputs(banner[i],stderr);
    /* TRUE asks for the main help only; the result must be freed. */
    help=g_option_context_get_help(context,TRUE,NULL);
    fputs(help,stderr);
    g_free(help);
    for (i=0;i<G_N_ELEMENTS(description);i++)
        fputs(description[i],stderr);
}