bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Thu May 30 18:33:44 2013 +0100 (2013-05-30)
changeset 72 52d4a7f926b4
parent 71 82d3cc398b54
child 73 cffa80824f8c
permissions -rw-r--r--
Support WINDOWS-1252 characters encoded as UTF-8
ali@0
     1
/*************************************************************************/
ali@40
     2
/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
ali@68
     3
/*									 */
ali@68
     4
/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
ali@68
     5
/* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
ali@68
     6
/*									 */
ali@0
     7
/* This program is free software; you can redistribute it and/or modify  */
ali@0
     8
/* it under the terms of the GNU General Public License as published by  */
ali@0
     9
/* the Free Software Foundation; either version 2 of the License, or     */
ali@68
    10
/* (at your option) any later version.					 */
ali@68
    11
/*									 */
ali@0
    12
/* This program is distributed in the hope that it will be useful,       */
ali@68
    13
/* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
ali@68
    14
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
ali@68
    15
/* GNU General Public License for more details.				 */
ali@68
    16
/*									 */
ali@68
    17
/* You should have received a copy of the GNU General Public License	 */
ali@68
    18
/* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
ali@0
    19
/*************************************************************************/
ali@0
    20
ali@0
    21
#include <stdio.h>
ali@0
    22
#include <stdlib.h>
ali@0
    23
#include <string.h>
ali@0
    24
#include <ctype.h>
ali@69
    25
#include <glib.h>
ali@69
    26
#include <bl/bl.h>
ali@71
    27
#include "HTMLentities.h"
ali@0
    28
ali@69
    29
gchar *prevline;
ali@0
    30
ali@40
    31
/* Common typos. */
ali@40
    32
char *typo[] = {
ali@40
    33
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
ali@40
    34
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
ali@40
    35
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
ali@40
    36
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
ali@40
    37
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
ali@40
    38
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
ali@40
    39
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
ali@40
    40
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
ali@40
    41
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
ali@40
    42
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
ali@40
    43
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
ali@40
    44
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
ali@40
    45
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
ali@40
    46
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
ali@40
    47
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
ali@40
    48
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
ali@40
    49
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
ali@40
    50
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
ali@40
    51
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
ali@40
    52
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
ali@40
    53
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
ali@40
    54
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
ali@40
    55
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
ali@40
    56
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
ali@40
    57
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
ali@40
    58
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
ali@40
    59
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
ali@40
    60
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
ali@40
    61
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
ali@40
    62
    "se", ""
ali@40
    63
};
ali@0
    64
ali@69
    65
GTree *usertypo;
ali@0
    66
ali@40
    67
/* Common abbreviations and other OK words not to query as typos. */
ali@40
    68
char *okword[] = {
ali@40
    69
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
ali@40
    70
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
ali@40
    71
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
ali@40
    72
    "outbid", "outbids", "frostbite", "frostbitten", ""
ali@40
    73
};
ali@0
    74
ali@40
    75
/* Common abbreviations that cause otherwise unexplained periods. */
ali@40
    76
char *abbrev[] = {
ali@40
    77
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
ali@40
    78
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
ali@40
    79
};
ali@0
    80
ali@40
    81
/*
ali@40
    82
 * Two-Letter combinations that rarely if ever start words,
ali@40
    83
 * but are common scannos or otherwise common letter combinations.
ali@40
    84
 */
ali@40
    85
char *nostart[] = {
ali@40
    86
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
ali@40
    87
};
ali@0
    88
ali@40
    89
/*
ali@40
    90
 * Two-Letter combinations that rarely if ever end words,
ali@40
    91
 * but are common scannos or otherwise common letter combinations.
ali@40
    92
 */
ali@40
    93
char *noend[] = {
ali@40
    94
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
ali@40
    95
    "sw", "gr", "sl", "cl", "iy", ""
ali@40
    96
};
ali@0
    97
ali@40
    98
/*
 * HTML element names (without angle brackets), terminated by an
 * empty string.
 * NOTE(review): presumably the set of tags recognised when markup in
 * < > is examined; the code that reads this table is outside this
 * view - confirm.
 */
char *markup[] = {
ali@40
    99
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
ali@40
   100
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
ali@40
   101
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
ali@40
   102
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
ali@40
   103
};
ali@0
   104
ali@40
   105
/*
 * Literal markup sequences, terminated by an empty string.
 * NOTE(review): presumably the Distributed Proofreaders markup that the
 * --dp switch ("Ignore DP-specific markup") refers to; the code that
 * reads this table is outside this view - confirm.
 */
char *DPmarkup[] = {
ali@40
   106
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
ali@40
   107
};
ali@0
   108
ali@40
   109
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): the name suggests these are words that are not expected
 * to be followed directly by a comma; the lookup code is outside this
 * view - confirm. "mrs" appears twice in the list.
 */
char *nocomma[] = {
ali@40
   110
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
ali@40
   111
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
ali@40
   112
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
ali@40
   113
    "during", "let", "toward", "among", ""
ali@40
   114
};
ali@0
   115
ali@40
   116
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): the name suggests these are words that are not expected
 * to be followed directly by a period; the lookup code is outside this
 * view - confirm.
 */
char *noperiod[] = {
ali@40
   117
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
ali@40
   118
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
ali@40
   119
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
ali@40
   120
    "among", "those", "into", "whom", "having", "thence", ""
ali@40
   121
}; 
ali@0
   122
ali@40
   123
/* special characters */
ali@68
   124
#define CHAR_SPACE	  32
ali@68
   125
#define CHAR_TAB	   9
ali@68
   126
#define CHAR_LF		  10
ali@68
   127
#define CHAR_CR		  13
ali@68
   128
#define CHAR_DQUOTE	  34
ali@68
   129
#define CHAR_SQUOTE	  39
ali@0
   130
#define CHAR_OPEN_SQUOTE  96
ali@68
   131
#define CHAR_TILDE	 126
ali@68
   132
#define CHAR_ASTERISK	  42
ali@68
   133
#define CHAR_FORESLASH	  47
ali@68
   134
#define CHAR_CARAT	  94
ali@0
   135
ali@0
   136
#define CHAR_UNDERSCORE    '_'
ali@0
   137
#define CHAR_OPEN_CBRACK   '{'
ali@0
   138
#define CHAR_CLOSE_CBRACK  '}'
ali@0
   139
#define CHAR_OPEN_RBRACK   '('
ali@0
   140
#define CHAR_CLOSE_RBRACK  ')'
ali@0
   141
#define CHAR_OPEN_SBRACK   '['
ali@0
   142
#define CHAR_CLOSE_SBRACK  ']'
ali@0
   143
ali@40
   144
/* longest and shortest normal PG line lengths */
ali@0
   145
#define LONGEST_PG_LINE   75
ali@0
   146
#define WAY_TOO_LONG      80
ali@0
   147
#define SHORTEST_PG_LINE  55
ali@0
   148
ali@69
   149
/*
 * Indices into the pswit[] array of program switches; each one is the
 * target of an entry in the options[] table. SWITNO is not a switch
 * itself: it is the number of switches and is used to size pswit[].
 *
 * Note that parse_options() inverts ECHO_SWITCH, PARANOID_SWITCH and
 * LINE_END_SWITCH after parsing (and TYPO_SWITCH when paranoid mode is
 * on), so for those a TRUE value in pswit[] means the check is enabled
 * even though the command line option turns it off.
 */
enum {
ali@69
   150
    ECHO_SWITCH,
ali@69
   151
    SQUOTE_SWITCH,
ali@69
   152
    TYPO_SWITCH,
ali@69
   153
    QPARA_SWITCH,
ali@69
   154
    PARANOID_SWITCH,
ali@69
   155
    LINE_END_SWITCH,
ali@69
   156
    OVERVIEW_SWITCH,
ali@69
   157
    STDOUT_SWITCH,
ali@69
   158
    HEADER_SWITCH,
ali@69
   159
    WEB_SWITCH,
ali@69
   160
    VERBOSE_SWITCH,
ali@69
   161
    MARKUP_SWITCH,
ali@69
   162
    USERTYPO_SWITCH,
ali@69
   163
    DP_SWITCH,
ali@69
   164
    SWITNO
ali@69
   165
};
ali@0
   166
ali@69
   167
gboolean pswit[SWITNO];  /* program switches */
ali@0
   168
ali@69
   169
static GOptionEntry options[]={
ali@69
   170
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
ali@69
   171
      "Ignore DP-specific markup", NULL },
ali@69
   172
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
ali@69
   173
      "Don't echo queried line", NULL },
ali@69
   174
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
ali@69
   175
      "Check single quotes", NULL },
ali@69
   176
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
ali@69
   177
      "Check common typos", NULL },
ali@69
   178
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
ali@69
   179
      "Require closure of quotes on every paragraph", NULL },
ali@69
   180
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
ali@69
   181
      "Disable paranoid querying of everything", NULL },
ali@69
   182
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
ali@69
   183
      "Disable line end checking", NULL },
ali@69
   184
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
ali@69
   185
      "Overview: just show counts", NULL },
ali@69
   186
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
ali@69
   187
      "Output errors to stdout instead of stderr", NULL },
ali@69
   188
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
ali@69
   189
      "Echo header fields", NULL },
ali@69
   190
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
ali@69
   191
      "Ignore markup in < >", NULL },
ali@69
   192
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
ali@69
   193
      "Use file of user-defined typos", NULL },
ali@69
   194
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
ali@69
   195
      "Defaults for use on www upload", NULL },
ali@69
   196
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
ali@69
   197
      "Verbose - list everything", NULL },
ali@69
   198
    { NULL }
ali@69
   199
};
ali@0
   200
ali@68
   201
long cnt_dquot;		/* for overview mode, count of doublequote queries */
ali@68
   202
long cnt_squot;		/* for overview mode, count of singlequote queries */
ali@68
   203
long cnt_brack;		/* for overview mode, count of brackets queries */
ali@68
   204
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
ali@68
   205
long cnt_odd;		/* for overview mode, count of odd character queries */
ali@68
   206
long cnt_long;		/* for overview mode, count of long line errors */
ali@68
   207
long cnt_short;		/* for overview mode, count of short line queries */
ali@68
   208
long cnt_punct;		/* for overview mode,
ali@68
   209
			   count of punctuation and spacing queries */
ali@68
   210
long cnt_dash;		/* for overview mode, count of dash-related queries */
ali@68
   211
long cnt_word;		/* for overview mode, count of word queries */
ali@68
   212
long cnt_html;		/* for overview mode, count of html queries */
ali@68
   213
long cnt_lineend;	/* for overview mode, count of line-end queries */
ali@68
   214
long cnt_spacend;	/* count of lines with space at end */
ali@68
   215
long linecnt;		/* count of total lines in the file */
ali@68
   216
long checked_linecnt;	/* count of lines actually checked */
ali@0
   217
ali@69
   218
void proghelp(GOptionContext *context);
ali@69
   219
void procfile(const char *);
ali@0
   220
ali@69
   221
gchar *running_from;
ali@0
   222
ali@70
   223
gboolean mixdigit(const char *);
ali@69
   224
gchar *getaword(const char **);
ali@69
   225
char *flgets(char **,long);
ali@0
   226
void postprocess_for_HTML(char *);
ali@0
   227
char *linehasmarkup(char *);
ali@0
   228
char *losemarkup(char *);
ali@70
   229
gboolean tagcomp(const char *,const char *);
ali@71
   230
void loseentities(char *);
ali@69
   231
gboolean isroman(const char *);
ali@0
   232
void postprocess_for_DP(char *);
ali@72
   233
void print_as_windows_1252(const char *string);
ali@72
   234
void print_as_utf_8(const char *string);
ali@0
   235
ali@69
   236
GTree *qword,*qperiod;
ali@68
   237
ali@68
   238
/*
 * Statistics gathered by first_pass(). Apart from firstline and
 * footerline, everything is a count taken over the lines that precede
 * the detected footer.
 *
 *   firstline		line number of the first line after the header
 *   footerline		line number of the footer line (0 if none seen)
 *   astline		lines containing '*' and a lower-case letter
 *   totlen		total length of all lines, in characters
 *   binlen		characters with a code point above 127
 *   alphalen		alphabetic characters
 *   endquote_count	double quotes immediately preceded by a letter
 *   shortline		short lines that follow a longer line
 *   dotcomma		lines containing ".,"
 *   fslashline		lines containing '/'
 *   hyphens		lines whose last non-blank character is a lone '-'
 *   longline		lines longer than LONGEST_PG_LINE
 *   verylongline	lines longer than WAY_TOO_LONG
 *   htmcount		score for lines with '<' followed by '>'
 *			(4 extra for each line containing "<i>")
 *   standalone_digit	words that are exactly "0" or "1"
 *   spacedash		lines where the first " -" is not followed by '-'
 *   emdash		lines with "--" after the first character
 *   space_emdash	... with a space on at least one side of it
 *   non_PG_space_emdash ... with a space on both sides of it
 *   PG_space_emdash	... with no space on either side of it
 *   Dutchcount		occurrences of the words "hij" or "niet"
 *   Frenchcount	occurrences of the words "dans" or "avec"
 */
struct first_pass_results {
ali@68
   239
    long firstline,astline;
ali@68
   240
    long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
ali@68
   241
    long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
ali@68
   242
    long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
ali@68
   243
    int Dutchcount,Frenchcount;
ali@68
   244
};
ali@68
   245
ali@68
   246
/*
 * Decisions taken by report_first_pass() on what to report.
 * For dotcomma, shortline, longline and ast, report_first_pass() stores
 * 1 to keep reporting that kind of query and 0 to suppress it because
 * the first pass found too many instances.
 * NOTE(review): the remaining members are set in code outside this
 * view; presumably they follow the same convention - confirm.
 */
struct warnings {
ali@68
   247
    int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
ali@69
   248
    int endquote;
ali@69
   249
    gboolean isDutch,isFrench;
ali@68
   250
};
ali@68
   251
ali@68
   252
/*
 * Running tallies of quotes, underscores and brackets.
 * NOTE(review): the code that updates these is outside this view;
 * presumably quot counts double quotes, c_unders counts underscores,
 * and c_brack/s_brack/r_brack track curly, square and round brackets
 * (cf. the CHAR_*_CBRACK/SBRACK/RBRACK macros) - confirm.
 */
struct counters {
ali@68
   253
    long quot;
ali@68
   254
    int c_unders,c_brack,s_brack,r_brack;
ali@68
   255
    int open_single_quote,close_single_quote;
ali@68
   256
};
ali@68
   257
ali@68
   258
/*
 * Properties remembered about a line of text.
 * NOTE(review): used outside this view; presumably len is the length
 * of the previous line, blen the length of the line before that and
 * start its first character, mirroring lastlen/lastblen/laststart in
 * first_pass() - confirm.
 */
struct line_properties {
ali@68
   259
    unsigned int len,blen;
ali@70
   260
    gunichar start;
ali@68
   261
};
ali@68
   262
ali@68
   263
/*
 * Quote state for double and single quotes.
 * NOTE(review): used outside this view; presumably tracks whether an
 * odd or even number of each kind of quote has been seen - confirm.
 */
struct parities {
ali@68
   264
    int dquote,squote;
ali@68
   265
};
ali@68
   266
ali@68
   267
/*
 * One string per kind of quote, bracket and underscore, plus a count.
 * NOTE(review): used outside this view; presumably each string is a
 * query message held back until it is known whether the construct is
 * closed, and squot counts held-back single quote queries - confirm,
 * including who owns and frees the strings.
 */
struct pending {
ali@69
   268
    char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
ali@68
   269
    long squot;
ali@68
   270
};
ali@0
   271
ali@69
   272
void parse_options(int *argc,char ***argv)
ali@0
   273
{
ali@69
   274
    GError *err=NULL;
ali@69
   275
    GOptionContext *context;
ali@69
   276
    context=g_option_context_new(
ali@69
   277
      "file - looks for errors in Project Gutenberg(TM) etexts");
ali@69
   278
    g_option_context_add_main_entries(context,options,NULL);
ali@69
   279
    if (!g_option_context_parse(context,argc,argv,&err))
ali@69
   280
    {
ali@69
   281
	g_printerr("Bookloupe: %s\n",err->message);
ali@69
   282
	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
ali@69
   283
	exit(1);
ali@69
   284
    }
ali@40
   285
    /* Paranoid checking is turned OFF, not on, by its switch */
ali@69
   286
    pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
ali@40
   287
    if (pswit[PARANOID_SWITCH])
ali@69
   288
	/* if running in paranoid mode, typo checks default to enabled */
ali@69
   289
	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
ali@40
   290
    /* Line-end checking is turned OFF, not on, by its switch */
ali@69
   291
    pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
ali@40
   292
    /* Echoing is turned OFF, not on, by its switch */
ali@69
   293
    pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
ali@40
   294
    if (pswit[OVERVIEW_SWITCH])
ali@40
   295
	/* just print summary; don't echo */
ali@69
   296
	pswit[ECHO_SWITCH]=FALSE;
ali@40
   297
    /*
ali@40
   298
     * Web uploads - for the moment, this is really just a placeholder
ali@40
   299
     * until we decide what processing we really want to do on web uploads
ali@40
   300
     */
ali@40
   301
    if (pswit[WEB_SWITCH])
ali@40
   302
    {
ali@40
   303
	/* specific override for web uploads */
ali@69
   304
	pswit[ECHO_SWITCH]=TRUE;
ali@69
   305
	pswit[SQUOTE_SWITCH]=FALSE;
ali@69
   306
	pswit[TYPO_SWITCH]=TRUE;
ali@69
   307
	pswit[QPARA_SWITCH]=FALSE;
ali@69
   308
	pswit[PARANOID_SWITCH]=TRUE;
ali@69
   309
	pswit[LINE_END_SWITCH]=FALSE;
ali@69
   310
	pswit[OVERVIEW_SWITCH]=FALSE;
ali@69
   311
	pswit[STDOUT_SWITCH]=FALSE;
ali@69
   312
	pswit[HEADER_SWITCH]=TRUE;
ali@69
   313
	pswit[VERBOSE_SWITCH]=FALSE;
ali@69
   314
	pswit[MARKUP_SWITCH]=FALSE;
ali@69
   315
	pswit[USERTYPO_SWITCH]=FALSE;
ali@69
   316
	pswit[DP_SWITCH]=FALSE;
ali@40
   317
    }
ali@69
   318
    if (*argc<2)
ali@40
   319
    {
ali@69
   320
	proghelp(context);
ali@69
   321
	exit(1);
ali@40
   322
    }
ali@69
   323
    g_option_context_free(context);
ali@69
   324
}
ali@69
   325
ali@69
   326
/*
ali@69
   327
 * read_user_scannos:
ali@69
   328
 *
ali@69
   329
 * Read in the user-defined stealth scanno list.
ali@69
   330
 */
ali@69
   331
void read_user_scannos(void)
ali@69
   332
{
ali@69
   333
    GError *err=NULL;
ali@69
   334
    gchar *usertypo_file;
ali@69
   335
    gboolean okay;
ali@69
   336
    int i;
ali@70
   337
    gsize len,nb;
ali@70
   338
    gchar *contents,*utf8,**lines;
ali@69
   339
    usertypo_file=g_strdup("bookloupe.typ");
ali@69
   340
    okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   341
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   342
    {
ali@69
   343
	g_clear_error(&err);
ali@69
   344
	g_free(usertypo_file);
ali@69
   345
	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
ali@69
   346
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   347
    }
ali@69
   348
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   349
    {
ali@69
   350
	g_clear_error(&err);
ali@69
   351
	g_free(usertypo_file);
ali@69
   352
	usertypo_file=g_strdup("gutcheck.typ");
ali@69
   353
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   354
    }
ali@69
   355
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   356
    {
ali@69
   357
	g_clear_error(&err);
ali@69
   358
	g_free(usertypo_file);
ali@69
   359
	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
ali@69
   360
	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
ali@69
   361
    }
ali@69
   362
    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
ali@69
   363
    {
ali@69
   364
	g_free(usertypo_file);
ali@70
   365
	g_print("   --> I couldn't find bookloupe.typ "
ali@69
   366
	  "-- proceeding without user typos.\n");
ali@69
   367
	return;
ali@69
   368
    }
ali@69
   369
    else if (!okay)
ali@69
   370
    {
ali@69
   371
	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
ali@69
   372
	g_free(usertypo_file);
ali@69
   373
	g_clear_error(&err);
ali@69
   374
	exit(1);
ali@69
   375
    }
ali@72
   376
    if (g_utf8_validate(contents,len,NULL))
ali@72
   377
	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@72
   378
    else
ali@72
   379
	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
ali@70
   380
    g_free(contents);
ali@70
   381
    lines=g_strsplit_set(utf8,"\r\n",0);
ali@70
   382
    g_free(utf8);
ali@69
   383
    usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@69
   384
    for (i=0;lines[i];i++)
ali@69
   385
	if (*(unsigned char *)lines[i]>'!')
ali@69
   386
	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
ali@69
   387
	else
ali@69
   388
	    g_free(lines[i]);
ali@69
   389
    g_free(lines);
ali@69
   390
}
ali@69
   391
ali@69
   392
/*
ali@69
   393
 * read_etext:
ali@69
   394
 *
ali@69
   395
 * Read an etext returning a newly allocated string containing the file
ali@69
   396
 * contents or NULL on error.
ali@69
   397
 */
ali@69
   398
gchar *read_etext(const char *filename,GError **err)
ali@69
   399
{
ali@70
   400
    gchar *contents,*utf8;
ali@70
   401
    gsize len,nb;
ali@69
   402
    if (!g_file_get_contents(filename,&contents,&len,err))
ali@69
   403
	return NULL;
ali@72
   404
    if (g_utf8_validate(contents,len,NULL))
ali@72
   405
    {
ali@72
   406
	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
ali@72
   407
	g_set_print_handler(print_as_utf_8);
ali@72
   408
    }
ali@72
   409
    else
ali@72
   410
    {
ali@72
   411
	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
ali@72
   412
	g_set_print_handler(print_as_windows_1252);
ali@72
   413
    }
ali@70
   414
    g_free(contents);
ali@70
   415
    return utf8;
ali@69
   416
}
ali@69
   417
ali@69
   418
int main(int argc,char **argv)
ali@69
   419
{
ali@69
   420
    running_from=g_path_get_dirname(argv[0]);
ali@69
   421
    parse_options(&argc,&argv);
ali@40
   422
    if (pswit[USERTYPO_SWITCH])
ali@69
   423
	read_user_scannos();
ali@40
   424
    fprintf(stderr,"bookloupe: Check and report on an e-text\n");
ali@69
   425
    procfile(argv[1]);
ali@40
   426
    if (pswit[OVERVIEW_SWITCH])
ali@40
   427
    {
ali@70
   428
	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
ali@40
   429
	  checked_linecnt,linecnt,linecnt-checked_linecnt);
ali@70
   430
	g_print("    --------------- Queries found --------------\n");
ali@68
   431
	if (cnt_long)
ali@70
   432
	    g_print("    Long lines:		    %14ld\n",cnt_long);
ali@68
   433
	if (cnt_short)
ali@70
   434
	    g_print("    Short lines:		   %14ld\n",cnt_short);
ali@68
   435
	if (cnt_lineend)
ali@70
   436
	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
ali@68
   437
	if (cnt_word)
ali@70
   438
	    g_print("    Common typos:		  %14ld\n",cnt_word);
ali@68
   439
	if (cnt_dquot)
ali@70
   440
	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
ali@68
   441
	if (cnt_squot)
ali@70
   442
	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
ali@68
   443
	if (cnt_brack)
ali@70
   444
	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
ali@68
   445
	if (cnt_bin)
ali@70
   446
	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
ali@68
   447
	if (cnt_odd)
ali@70
   448
	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
ali@68
   449
	if (cnt_punct)
ali@70
   450
	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
ali@68
   451
	if (cnt_dash)
ali@70
   452
	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
ali@68
   453
	if (cnt_html)
ali@70
   454
	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
ali@70
   455
	g_print("\n");
ali@70
   456
	g_print("    TOTAL QUERIES		  %14ld\n",
ali@68
   457
	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
ali@68
   458
	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
ali@40
   459
    }
ali@69
   460
    g_free(running_from);
ali@69
   461
    if (usertypo)
ali@69
   462
	g_tree_unref(usertypo);
ali@40
   463
    return 0;
ali@0
   464
}
ali@0
   465
ali@40
   466
/*
ali@41
   467
 * first_pass:
ali@40
   468
 *
ali@41
   469
 * Run a first pass - verify that it's a valid PG
ali@41
   470
 * file, decide whether to report some things that
ali@41
   471
 * occur many times in the text like long or short
ali@41
   472
 * lines, non-standard dashes, etc.
ali@40
   473
 */
ali@69
   474
struct first_pass_results *first_pass(const char *etext)
ali@0
   475
{
ali@70
   476
    gunichar laststart=CHAR_SPACE;
ali@54
   477
    const char *s;
ali@69
   478
    gchar *lc_line;
ali@70
   479
    int i,j,lbytes,llen;
ali@69
   480
    gchar **lines;
ali@41
   481
    unsigned int lastlen=0,lastblen=0;
ali@41
   482
    long spline=0,nspline=0;
ali@41
   483
    static struct first_pass_results results={0};
ali@69
   484
    gchar *inword;
ali@69
   485
    lines=g_strsplit(etext,"\n",0);
ali@69
   486
    for (j=0;lines[j];j++)
ali@40
   487
    {
ali@70
   488
	lbytes=strlen(lines[j]);
ali@70
   489
	while (lines[j][lbytes-1]=='\r')
ali@70
   490
	    lines[j][--lbytes]='\0';
ali@70
   491
	llen=g_utf8_strlen(lines[j],lbytes);
ali@68
   492
	linecnt++;
ali@69
   493
	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
ali@69
   494
	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
ali@40
   495
	{
ali@68
   496
	    if (spline)
ali@70
   497
		g_print("   --> Duplicate header?\n");
ali@68
   498
	    spline=linecnt+1;   /* first line of non-header text, that is */
ali@40
   499
	}
ali@69
   500
	if (!strncmp(lines[j],"*** START",9) &&
ali@69
   501
	  strstr(lines[j],"PROJECT GUTENBERG"))
ali@40
   502
	{
ali@68
   503
	    if (nspline)
ali@70
   504
		g_print("   --> Duplicate header?\n");
ali@68
   505
	    nspline=linecnt+1;   /* first line of non-header text, that is */
ali@40
   506
	}
ali@68
   507
	if (spline || nspline)
ali@40
   508
	{
ali@70
   509
	    lc_line=g_utf8_strdown(lines[j],lbytes);
ali@69
   510
	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
ali@40
   511
	    {
ali@69
   512
		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
ali@40
   513
		{
ali@68
   514
		    if (results.footerline)
ali@40
   515
		    {
ali@40
   516
			/* it's an old-form header - we can detect duplicates */
ali@68
   517
			if (!nspline)
ali@70
   518
			    g_print("   --> Duplicate footer?\n");
ali@40
   519
		    }
ali@68
   520
		    else
ali@68
   521
			results.footerline=linecnt;
ali@40
   522
		}
ali@40
   523
	    }
ali@69
   524
	    g_free(lc_line);
ali@40
   525
	}
ali@68
   526
	if (spline)
ali@41
   527
	    results.firstline=spline;
ali@68
   528
	if (nspline)
ali@41
   529
	    results.firstline=nspline;  /* override with new */
ali@68
   530
	if (results.footerline)
ali@40
   531
	    continue;    /* don't count the boilerplate in the footer */
ali@68
   532
	results.totlen+=llen;
ali@70
   533
	for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@40
   534
	{
ali@70
   535
	    if (g_utf8_get_char(s)>127)
ali@41
   536
		results.binlen++;
ali@70
   537
	    if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@41
   538
		results.alphalen++;
ali@70
   539
	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
ali@70
   540
	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
ali@41
   541
		results.endquote_count++;
ali@40
   542
	}
ali@69
   543
	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
ali@69
   544
	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
ali@41
   545
	    results.shortline++;
ali@70
   546
	if (lbytes>0 &&
ali@70
   547
	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
ali@40
   548
	    cnt_spacend++;
ali@69
   549
	if (strstr(lines[j],".,"))
ali@41
   550
	    results.dotcomma++;
ali@68
   551
	/* only count ast lines for ignoring purposes where there is */
ali@68
   552
	/* locase text on the line */
ali@69
   553
	if (strchr(lines[j],'*'))
ali@40
   554
	{
ali@70
   555
	    for (s=lines[j];*s;s=g_utf8_next_char(s))
ali@70
   556
		if (g_unichar_islower(g_utf8_get_char(s)))
ali@68
   557
		    break;
ali@70
   558
	    if (*s)
ali@41
   559
		results.astline++;
ali@40
   560
	}
ali@69
   561
	if (strchr(lines[j],'/'))
ali@68
   562
	    results.fslashline++;
ali@70
   563
	for (s=g_utf8_prev_char(lines[j]+lbytes);
ali@70
   564
	  s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
ali@40
   565
	    ;
ali@70
   566
	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
ali@70
   567
	  g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@41
   568
	    results.hyphens++;
ali@68
   569
	if (llen>LONGEST_PG_LINE)
ali@41
   570
	    results.longline++;
ali@68
   571
	if (llen>WAY_TOO_LONG)
ali@41
   572
	    results.verylongline++;
ali@69
   573
	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
ali@40
   574
	{
ali@69
   575
	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
ali@68
   576
	    if (i>0)
ali@68
   577
		results.htmcount++;
ali@69
   578
	    if (strstr(lines[j],"<i>"))
ali@41
   579
		results.htmcount+=4; /* bonus marks! */
ali@40
   580
	}
ali@68
   581
	/* Check for spaced em-dashes */
ali@70
   582
	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
ali@40
   583
	{
ali@68
   584
	    results.emdash++;
ali@70
   585
	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
ali@41
   586
		results.space_emdash++;
ali@70
   587
	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
ali@40
   588
		/* count of em-dashes with spaces both sides */
ali@41
   589
		results.non_PG_space_emdash++;
ali@70
   590
	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
ali@40
   591
		/* count of PG-type em-dashes with no spaces */
ali@41
   592
		results.PG_space_emdash++;
ali@40
   593
	}
ali@69
   594
	for (s=lines[j];*s;)
ali@40
   595
	{
ali@69
   596
	    inword=getaword(&s);
ali@68
   597
	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
ali@68
   598
		results.Dutchcount++;
ali@68
   599
	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
ali@68
   600
		results.Frenchcount++;
ali@68
   601
	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
ali@68
   602
		results.standalone_digit++;
ali@69
   603
	    g_free(inword);
ali@40
   604
	}
ali@68
   605
	/* Check for spaced dashes */
ali@69
   606
	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
ali@41
   607
	    results.spacedash++;
ali@68
   608
	lastblen=lastlen;
ali@69
   609
	lastlen=llen;
ali@69
   610
	laststart=lines[j][0];
ali@40
   611
    }
ali@69
   612
    g_strfreev(lines);
ali@41
   613
    return &results;
ali@41
   614
}
ali@41
   615
ali@42
   616
/*
ali@42
   617
 * report_first_pass:
ali@42
   618
 *
ali@42
   619
 * Make some snap decisions based on the first pass results.
ali@42
   620
 */
ali@42
   621
struct warnings *report_first_pass(struct first_pass_results *results)
ali@42
   622
{
ali@42
   623
    static struct warnings warnings={0};
ali@42
   624
    if (cnt_spacend>0)
ali@70
   625
	g_print("   --> %ld lines in this file have white space at end\n",
ali@42
   626
	  cnt_spacend);
ali@42
   627
    warnings.dotcomma=1;
ali@42
   628
    if (results->dotcomma>5)
ali@42
   629
    {
ali@68
   630
	warnings.dotcomma=0;
ali@70
   631
	g_print("   --> %ld lines in this file contain '.,'. "
ali@42
   632
	  "Not reporting them.\n",results->dotcomma);
ali@42
   633
    }
ali@42
   634
    /*
ali@42
   635
     * If more than 50 lines, or one-tenth, are short,
ali@42
   636
     * don't bother reporting them.
ali@42
   637
     */
ali@42
   638
    warnings.shortline=1;
ali@42
   639
    if (results->shortline>50 || results->shortline*10>linecnt)
ali@42
   640
    {
ali@68
   641
	warnings.shortline=0;
ali@70
   642
	g_print("   --> %ld lines in this file are short. "
ali@42
   643
	  "Not reporting short lines.\n",results->shortline);
ali@42
   644
    }
ali@42
   645
    /*
ali@42
   646
     * If more than 50 lines, or one-tenth, are long,
ali@42
   647
     * don't bother reporting them.
ali@42
   648
     */
ali@42
   649
    warnings.longline=1;
ali@42
   650
    if (results->longline>50 || results->longline*10>linecnt)
ali@42
   651
    {
ali@68
   652
	warnings.longline=0;
ali@70
   653
	g_print("   --> %ld lines in this file are long. "
ali@42
   654
	  "Not reporting long lines.\n",results->longline);
ali@42
   655
    }
ali@42
   656
    /* If more than 10 lines contain asterisks, don't bother reporting them. */
ali@42
   657
    warnings.ast=1;
ali@42
   658
    if (results->astline>10)
ali@42
   659
    {
ali@68
   660
	warnings.ast=0;
ali@70
   661
	g_print("   --> %ld lines in this file contain asterisks. "
ali@42
   662
	  "Not reporting them.\n",results->astline);
ali@42
   663
    }
ali@42
   664
    /*
ali@42
   665
     * If more than 10 lines contain forward slashes,
ali@42
   666
     * don't bother reporting them.
ali@42
   667
     */
ali@42
   668
    warnings.fslash=1;
ali@42
   669
    if (results->fslashline>10)
ali@42
   670
    {
ali@68
   671
	warnings.fslash=0;
ali@70
   672
	g_print("   --> %ld lines in this file contain forward slashes. "
ali@42
   673
	  "Not reporting them.\n",results->fslashline);
ali@42
   674
    }
ali@42
   675
    /*
ali@42
   676
     * If more than 20 lines contain unpunctuated endquotes,
ali@42
   677
     * don't bother reporting them.
ali@42
   678
     */
ali@42
   679
    warnings.endquote=1;
ali@42
   680
    if (results->endquote_count>20)
ali@42
   681
    {
ali@68
   682
	warnings.endquote=0;
ali@70
   683
	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
ali@42
   684
	  "Not reporting them.\n",results->endquote_count);
ali@42
   685
    }
ali@42
   686
    /*
ali@42
   687
     * If more than 15 lines contain standalone digits,
ali@42
   688
     * don't bother reporting them.
ali@42
   689
     */
ali@42
   690
    warnings.digit=1;
ali@42
   691
    if (results->standalone_digit>10)
ali@42
   692
    {
ali@68
   693
	warnings.digit=0;
ali@70
   694
	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
ali@42
   695
	  "Not reporting them.\n",results->standalone_digit);
ali@42
   696
    }
ali@42
   697
    /*
ali@42
   698
     * If more than 20 lines contain hyphens at end,
ali@42
   699
     * don't bother reporting them.
ali@42
   700
     */
ali@42
   701
    warnings.hyphen=1;
ali@42
   702
    if (results->hyphens>20)
ali@42
   703
    {
ali@68
   704
	warnings.hyphen=0;
ali@70
   705
	g_print("   --> %ld lines in this file have hyphens at end. "
ali@42
   706
	  "Not reporting them.\n",results->hyphens);
ali@42
   707
    }
ali@42
   708
    if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
ali@42
   709
    {
ali@70
   710
	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
ali@68
   711
	pswit[MARKUP_SWITCH]=1;
ali@42
   712
    }
ali@42
   713
    if (results->verylongline>0)
ali@70
   714
	g_print("   --> %ld lines in this file are VERY long!\n",
ali@42
   715
	  results->verylongline);
ali@42
   716
    /*
ali@42
   717
     * If there are more non-PG spaced dashes than PG em-dashes,
ali@42
   718
     * assume it's deliberate.
ali@42
   719
     * Current PG guidelines say don't use them, but older texts do,
ali@42
   720
     * and some people insist on them whatever the guidelines say.
ali@42
   721
     */
ali@42
   722
    warnings.dash=1;
ali@42
   723
    if (results->spacedash+results->non_PG_space_emdash>
ali@42
   724
      results->PG_space_emdash)
ali@42
   725
    {
ali@68
   726
	warnings.dash=0;
ali@70
   727
	g_print("   --> There are %ld spaced dashes and em-dashes. "
ali@42
   728
	  "Not reporting them.\n",
ali@42
   729
	  results->spacedash+results->non_PG_space_emdash);
ali@42
   730
    }
ali@42
   731
    /* If more than a quarter of characters are hi-bit, bug out. */
ali@42
   732
    warnings.bin=1;
ali@42
   733
    if (results->binlen*4>results->totlen)
ali@42
   734
    {
ali@70
   735
	g_print("   --> This file does not appear to be ASCII. "
ali@42
   736
	  "Terminating. Best of luck with it!\n");
ali@68
   737
	exit(1);
ali@42
   738
    }
ali@42
   739
    if (results->alphalen*4<results->totlen)
ali@42
   740
    {
ali@70
   741
	g_print("   --> This file does not appear to be text. "
ali@42
   742
	  "Terminating. Best of luck with it!\n");
ali@68
   743
	exit(1);
ali@42
   744
    }
ali@42
   745
    if (results->binlen*100>results->totlen || results->binlen>100)
ali@42
   746
    {
ali@70
   747
	g_print("   --> There are a lot of foreign letters here. "
ali@42
   748
	  "Not reporting them.\n");
ali@68
   749
	warnings.bin=0;
ali@42
   750
    }
ali@69
   751
    warnings.isDutch=FALSE;
ali@42
   752
    if (results->Dutchcount>50)
ali@42
   753
    {
ali@69
   754
	warnings.isDutch=TRUE;
ali@70
   755
	g_print("   --> This looks like Dutch - "
ali@42
   756
	  "switching off dashes and warnings for 's Middags case.\n");
ali@42
   757
    }
ali@69
   758
    warnings.isFrench=FALSE;
ali@42
   759
    if (results->Frenchcount>50)
ali@42
   760
    {
ali@69
   761
	warnings.isFrench=TRUE;
ali@70
   762
	g_print("   --> This looks like French - "
ali@42
   763
	  "switching off some doublepunct.\n");
ali@42
   764
    }
ali@42
   765
    if (results->firstline && results->footerline)
ali@70
   766
	g_print("    The PG header and footer appear to be already on.\n");
ali@42
   767
    else
ali@42
   768
    {
ali@68
   769
	if (results->firstline)
ali@70
   770
	    g_print("    The PG header is on - no footer.\n");
ali@68
   771
	if (results->footerline)
ali@70
   772
	    g_print("    The PG footer is on - no header.\n");
ali@42
   773
    }
ali@70
   774
    g_print("\n");
ali@42
   775
    if (pswit[VERBOSE_SWITCH])
ali@42
   776
    {
ali@68
   777
	warnings.bin=1;
ali@68
   778
	warnings.shortline=1;
ali@68
   779
	warnings.dotcomma=1;
ali@68
   780
	warnings.longline=1;
ali@68
   781
	warnings.dash=1;
ali@68
   782
	warnings.digit=1;
ali@68
   783
	warnings.ast=1;
ali@68
   784
	warnings.fslash=1;
ali@68
   785
	warnings.hyphen=1;
ali@68
   786
	warnings.endquote=1;
ali@70
   787
	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
ali@42
   788
    }
ali@42
   789
    if (warnings.isDutch)
ali@68
   790
	warnings.dash=0;
ali@42
   791
    if (results->footerline>0 && results->firstline>0 &&
ali@42
   792
      results->footerline>results->firstline &&
ali@42
   793
      results->footerline-results->firstline<100)
ali@42
   794
    {
ali@70
   795
	g_print("   --> I don't really know where this text starts. \n");
ali@70
   796
	g_print("       There are no reference points.\n");
ali@70
   797
	g_print("       I'm going to have to report the header and footer "
ali@42
   798
	  "as well.\n");
ali@68
   799
	results->firstline=0;
ali@42
   800
    }
ali@42
   801
    return &warnings;
ali@42
   802
}
ali@42
   803
ali@43
   804
/*
ali@43
   805
 * analyse_quotes:
ali@43
   806
 *
ali@43
   807
 * Look along the line, accumulate the count of quotes, and see
ali@43
   808
 * if this is an empty line - i.e. a line with nothing on it
ali@43
   809
 * but spaces.
ali@43
   810
 * If line has just spaces, period, * and/or - on it, don't
ali@43
   811
 * count it, since empty lines with asterisks or dashes to
ali@43
   812
 * separate sections are common.
ali@43
   813
 *
ali@69
   814
 * Returns: TRUE if the line is empty.
ali@43
   815
 */
ali@69
   816
gboolean analyse_quotes(const char *aline,struct counters *counters)
ali@43
   817
{
ali@68
   818
    int guessquote=0;
ali@69
   819
    /* assume the line is empty until proven otherwise */
ali@69
   820
    gboolean isemptyline=TRUE;
ali@70
   821
    const char *s=aline,*sprev,*snext;
ali@70
   822
    gunichar c;
ali@70
   823
    sprev=NULL;
ali@43
   824
    while (*s)
ali@43
   825
    {
ali@70
   826
	snext=g_utf8_next_char(s);
ali@70
   827
	c=g_utf8_get_char(s);
ali@70
   828
	if (c==CHAR_DQUOTE)
ali@43
   829
	    counters->quot++;
ali@70
   830
	if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
ali@43
   831
	{
ali@43
   832
	    if (s==aline)
ali@43
   833
	    {
ali@43
   834
		/*
ali@43
   835
		 * At start of line, it can only be an openquote.
ali@43
   836
		 * Hardcode a very common exception!
ali@43
   837
		 */
ali@70
   838
		if (!g_str_has_prefix(snext,"tis") &&
ali@70
   839
		  !g_str_has_prefix(snext,"Tis"))
ali@43
   840
		    counters->open_single_quote++;
ali@43
   841
	    }
ali@70
   842
	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
ali@70
   843
	      g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43
   844
		/* Do nothing! it's definitely an apostrophe, not a quote */
ali@43
   845
		;
ali@43
   846
	    /* it's outside a word - let's check it out */
ali@70
   847
	    else if (c==CHAR_OPEN_SQUOTE ||
ali@70
   848
	      g_unichar_isalpha(g_utf8_get_char(snext)))
ali@43
   849
	    {
ali@43
   850
		/* it damwell better BE an openquote */
ali@70
   851
		if (!g_str_has_prefix(snext,"tis") &&
ali@70
   852
		  !g_str_has_prefix(snext,"Tis"))
ali@43
   853
		    /* hardcode a very common exception! */
ali@43
   854
		    counters->open_single_quote++;
ali@43
   855
	    }
ali@43
   856
	    else
ali@43
   857
	    {
ali@43
   858
		/* now - is it a closequote? */
ali@43
   859
		guessquote=0;   /* accumulate clues */
ali@70
   860
		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
ali@43
   861
		{
ali@43
   862
		    /* it follows a letter - could be either */
ali@43
   863
		    guessquote++;
ali@70
   864
		    if (g_utf8_get_char(sprev)=='s')
ali@43
   865
		    {
ali@43
   866
			/* looks like a plural apostrophe */
ali@43
   867
			guessquote-=3;
ali@70
   868
			if (g_utf8_get_char(snext)==CHAR_SPACE)
ali@70
   869
			    /* bonus marks! */
ali@43
   870
			    guessquote-=2;
ali@43
   871
		    }
ali@43
   872
		}
ali@43
   873
		/* it doesn't have a letter either side */
ali@70
   874
		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
ali@70
   875
		  strchr(".?!,;: ",g_utf8_get_char(snext)))
ali@43
   876
		    guessquote+=8; /* looks like a closequote */
ali@43
   877
		else
ali@43
   878
		    guessquote++;
ali@43
   879
		if (counters->open_single_quote>counters->close_single_quote)
ali@43
   880
		    /*
ali@43
   881
		     * Give it the benefit of some doubt,
ali@43
   882
		     * if a squote is already open.
ali@43
   883
		     */
ali@43
   884
		    guessquote++;
ali@43
   885
		else
ali@43
   886
		    guessquote--;
ali@43
   887
		if (guessquote>=0)
ali@43
   888
		    counters->close_single_quote++;
ali@43
   889
	    }
ali@43
   890
	}
ali@70
   891
	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
ali@70
   892
	  c!='\r' && c!='\n')
ali@69
   893
	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
ali@70
   894
	if (c==CHAR_UNDERSCORE)
ali@43
   895
	    counters->c_unders++;
ali@70
   896
	if (c==CHAR_OPEN_CBRACK)
ali@43
   897
	    counters->c_brack++;
ali@70
   898
	if (c==CHAR_CLOSE_CBRACK)
ali@43
   899
	    counters->c_brack--;
ali@70
   900
	if (c==CHAR_OPEN_RBRACK)
ali@43
   901
	    counters->r_brack++;
ali@70
   902
	if (c==CHAR_CLOSE_RBRACK)
ali@43
   903
	    counters->r_brack--;
ali@70
   904
	if (c==CHAR_OPEN_SBRACK)
ali@43
   905
	    counters->s_brack++;
ali@70
   906
	if (c==CHAR_CLOSE_SBRACK)
ali@43
   907
	    counters->s_brack--;
ali@70
   908
	sprev=s;
ali@70
   909
	s=snext;
ali@43
   910
    }
ali@43
   911
    return isemptyline;
ali@43
   912
}
ali@43
   913
ali@41
   914
/*
ali@67
   915
 * check_for_control_characters:
ali@67
   916
 *
ali@67
   917
 * Check for invalid or questionable characters in the line
ali@67
   918
 * Anything above 127 is invalid for plain ASCII, and
ali@67
   919
 * non-printable control characters should also be flagged.
ali@67
   920
 * Tabs should generally not be there.
ali@67
   921
 */
ali@67
   922
void check_for_control_characters(const char *aline)
ali@67
   923
{
ali@70
   924
    gunichar c;
ali@67
   925
    const char *s;
ali@70
   926
    for (s=aline;*s;s=g_utf8_next_char(s))
ali@67
   927
    {
ali@70
   928
	c=g_utf8_get_char(s);
ali@67
   929
	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
ali@67
   930
	{
ali@67
   931
	    if (pswit[ECHO_SWITCH])
ali@70
   932
		g_print("\n%s\n",aline);
ali@67
   933
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
   934
		g_print("    Line %ld column %ld - Control character %u\n",
ali@70
   935
		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
ali@67
   936
	    else
ali@67
   937
		cnt_bin++;
ali@67
   938
	}
ali@67
   939
    }
ali@67
   940
}
ali@67
   941
ali@67
   942
/*
ali@44
   943
 * check_for_odd_characters:
ali@44
   944
 *
ali@44
   945
 * Check for binary and other odd characters.
ali@44
   946
 */
ali@44
   947
void check_for_odd_characters(const char *aline,const struct warnings *warnings,
ali@69
   948
  gboolean isemptyline)
ali@44
   949
{
ali@44
   950
    /* Don't repeat multiple warnings on one line. */
ali@70
   951
    gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
ali@70
   952
    gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
ali@44
   953
    const char *s;
ali@70
   954
    gunichar c;
ali@70
   955
    for (s=aline;*s;s=g_utf8_next_char(s))
ali@44
   956
    {
ali@70
   957
	c=g_utf8_get_char(s);
ali@70
   958
	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
ali@44
   959
	{
ali@44
   960
	    if (pswit[ECHO_SWITCH])
ali@70
   961
		g_print("\n%s\n",aline);
ali@44
   962
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
   963
		if (c>127 && c<160 || c>255)
ali@70
   964
		    g_print("    Line %ld column %ld - "
ali@70
   965
		      "Non-ISO-8859 character %u\n",
ali@70
   966
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44
   967
		else
ali@70
   968
		    g_print("    Line %ld column %ld - "
ali@70
   969
		      "Non-ASCII character %u\n",
ali@70
   970
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
ali@44
   971
	    else
ali@44
   972
		cnt_bin++;
ali@70
   973
	    eNon_A=TRUE;
ali@44
   974
	}
ali@70
   975
	if (!eTab && c==CHAR_TAB)
ali@44
   976
	{
ali@44
   977
	    if (pswit[ECHO_SWITCH])
ali@70
   978
		g_print("\n%s\n",aline);
ali@44
   979
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
   980
		g_print("    Line %ld column %ld - Tab character?\n",
ali@70
   981
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
   982
	    else
ali@44
   983
		cnt_odd++;
ali@70
   984
	    eTab=TRUE;
ali@44
   985
	}
ali@70
   986
	if (!eTilde && c==CHAR_TILDE)
ali@44
   987
	{
ali@44
   988
	    /*
ali@44
   989
	     * Often used by OCR software to indicate an
ali@44
   990
	     * unrecognizable character.
ali@44
   991
	     */
ali@44
   992
	    if (pswit[ECHO_SWITCH])
ali@70
   993
		g_print("\n%s\n",aline);
ali@44
   994
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
   995
		g_print("    Line %ld column %ld - Tilde character?\n",
ali@70
   996
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
   997
	    else
ali@44
   998
		cnt_odd++;
ali@70
   999
	    eTilde=TRUE;
ali@44
  1000
	}
ali@70
  1001
	if (!eCarat && c==CHAR_CARAT)
ali@44
  1002
	{  
ali@44
  1003
	    if (pswit[ECHO_SWITCH])
ali@70
  1004
		g_print("\n%s\n",aline);
ali@44
  1005
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1006
		g_print("    Line %ld column %ld - Carat character?\n",
ali@70
  1007
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
  1008
	    else
ali@44
  1009
		cnt_odd++;
ali@70
  1010
	    eCarat=TRUE;
ali@44
  1011
	}
ali@70
  1012
	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
ali@44
  1013
	{  
ali@44
  1014
	    if (pswit[ECHO_SWITCH])
ali@70
  1015
		g_print("\n%s\n",aline);
ali@44
  1016
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1017
		g_print("    Line %ld column %ld - Forward slash?\n",
ali@70
  1018
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
  1019
	    else
ali@44
  1020
		cnt_odd++;
ali@70
  1021
	    eFSlash=TRUE;
ali@44
  1022
	}
ali@44
  1023
	/*
ali@44
  1024
	 * Report asterisks only in paranoid mode,
ali@44
  1025
	 * since they're often deliberate.
ali@44
  1026
	 */
ali@44
  1027
	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
ali@70
  1028
	  c==CHAR_ASTERISK)
ali@44
  1029
	{
ali@44
  1030
	    if (pswit[ECHO_SWITCH])
ali@70
  1031
		g_print("\n%s\n",aline);
ali@44
  1032
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1033
		g_print("    Line %ld column %ld - Asterisk?\n",
ali@70
  1034
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@44
  1035
	    else
ali@44
  1036
		cnt_odd++;
ali@70
  1037
	    eAst=TRUE;
ali@44
  1038
	}
ali@44
  1039
    }
ali@44
  1040
}
ali@44
  1041
ali@44
  1042
/*
ali@45
  1043
 * check_for_long_line:
ali@45
  1044
 *
ali@45
  1045
 * Check for line too long.
ali@45
  1046
 */
ali@45
  1047
void check_for_long_line(const char *aline)
ali@45
  1048
{
ali@70
  1049
    if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
ali@45
  1050
    {
ali@45
  1051
	if (pswit[ECHO_SWITCH])
ali@70
  1052
	    g_print("\n%s\n",aline);
ali@45
  1053
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1054
	    g_print("    Line %ld column %ld - Long line %ld\n",
ali@70
  1055
	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
ali@45
  1056
	else
ali@45
  1057
	    cnt_long++;
ali@45
  1058
    }
ali@45
  1059
}
ali@45
  1060
ali@45
  1061
/*
ali@45
  1062
 * check_for_short_line:
ali@45
  1063
 *
ali@45
  1064
 * Check for line too short.
ali@45
  1065
 *
ali@45
  1066
 * This one is a bit trickier to implement: we don't want to
ali@45
  1067
 * flag the last line of a paragraph for being short, so we
ali@45
  1068
 * have to wait until we know that our current line is a
ali@45
  1069
 * "normal" line, then report the _previous_ line if it was too
ali@45
  1070
 * short. We also don't want to report indented lines like
ali@45
  1071
 * chapter heads or formatted quotations. We therefore keep
ali@45
  1072
 * last->len as the length of the last line examined, and
ali@45
  1073
 * last->blen as the length of the last but one, and try to
ali@45
  1074
 * suppress unnecessary warnings by checking that both were of
ali@45
  1075
 * "normal" length. We keep the first character of the last
ali@45
  1076
 * line in last->start, and if it was a space, we assume that
ali@45
  1077
 * the formatting is deliberate. I can't figure out a way to
ali@45
  1078
 * distinguish something like a quoted verse left-aligned or
ali@45
  1079
 * the header or footer of a letter from a paragraph of short
ali@45
  1080
 * lines - maybe if I examined the whole paragraph, and if the
ali@45
  1081
 * para has less than, say, 8 lines and if all lines are short,
ali@45
  1082
 * then just assume it's OK? Need to look at some texts to see
ali@45
  1083
 * how often a formula like this would get the right result.
ali@45
  1084
 */
ali@45
  1085
void check_for_short_line(const char *aline,const struct line_properties *last)
ali@45
  1086
{
ali@70
  1087
    if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
ali@70
  1088
      last->len<SHORTEST_PG_LINE && last->blen>1 &&
ali@70
  1089
      last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
ali@45
  1090
    {
ali@45
  1091
	if (pswit[ECHO_SWITCH])
ali@70
  1092
	    g_print("\n%s\n",prevline);
ali@45
  1093
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1094
	    g_print("    Line %ld column %ld - Short line %ld?\n",
ali@70
  1095
	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
ali@45
  1096
	else
ali@45
  1097
	    cnt_short++;
ali@45
  1098
    }
ali@45
  1099
}
ali@45
  1100
ali@45
  1101
/*
ali@46
  1102
 * check_for_starting_punctuation:
ali@46
  1103
 *
ali@46
  1104
 * Look for punctuation other than full ellipses at start of line.
ali@46
  1105
 */
ali@46
  1106
void check_for_starting_punctuation(const char *aline)
ali@46
  1107
{
ali@70
  1108
    if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
ali@70
  1109
      !g_str_has_prefix(aline,". . ."))
ali@46
  1110
    {
ali@46
  1111
	if (pswit[ECHO_SWITCH])
ali@70
  1112
	    g_print("\n%s\n",aline);
ali@46
  1113
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1114
	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
ali@46
  1115
	      linecnt);
ali@46
  1116
	else
ali@46
  1117
	    cnt_punct++;
ali@46
  1118
    }
ali@46
  1119
}
ali@46
  1120
ali@46
  1121
/*
ali@47
  1122
 * check_for_spaced_emdash:
ali@47
  1123
 *
ali@47
  1124
 * Check for spaced em-dashes.
ali@47
  1125
 *
ali@47
  1126
 * We must check _all_ occurrences of "--" on the line
ali@47
  1127
 * hence the loop - even if the first double-dash is OK
ali@47
  1128
 * there may be another that's wrong later on.
ali@47
  1129
 */
ali@47
  1130
void check_for_spaced_emdash(const char *aline)
ali@47
  1131
{
ali@70
  1132
    const char *s,*t,*next;
ali@70
  1133
    for (s=aline;t=strstr(s,"--");s=next)
ali@47
  1134
    {
ali@70
  1135
	next=g_utf8_next_char(g_utf8_next_char(t));
ali@70
  1136
	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
ali@70
  1137
	  g_utf8_get_char(next)==CHAR_SPACE)
ali@47
  1138
	{
ali@47
  1139
	    if (pswit[ECHO_SWITCH])
ali@70
  1140
		g_print("\n%s\n",aline);
ali@47
  1141
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1142
		g_print("    Line %ld column %ld - Spaced em-dash?\n",
ali@70
  1143
		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@47
  1144
	    else
ali@47
  1145
		cnt_dash++;
ali@47
  1146
	}
ali@47
  1147
    }
ali@47
  1148
}
ali@47
  1149
ali@47
  1150
/*
ali@47
  1151
 * check_for_spaced_dash:
ali@47
  1152
 *
ali@47
  1153
 * Check for spaced dashes.
ali@47
  1154
 */
ali@47
  1155
void check_for_spaced_dash(const char *aline)
ali@47
  1156
{
ali@47
  1157
    const char *s;
ali@47
  1158
    if ((s=strstr(aline," -")))
ali@47
  1159
    {
ali@70
  1160
	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
ali@47
  1161
	{
ali@47
  1162
	    if (pswit[ECHO_SWITCH])
ali@70
  1163
		g_print("\n%s\n",aline);
ali@47
  1164
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1165
		g_print("    Line %ld column %ld - Spaced dash?\n",
ali@70
  1166
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47
  1167
	    else
ali@47
  1168
		cnt_dash++;
ali@47
  1169
	}
ali@47
  1170
    }
ali@47
  1171
    else if ((s=strstr(aline,"- ")))
ali@47
  1172
    {
ali@70
  1173
	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@47
  1174
	{
ali@47
  1175
	    if (pswit[ECHO_SWITCH])
ali@70
  1176
		g_print("\n%s\n",aline);
ali@47
  1177
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1178
		g_print("    Line %ld column %ld - Spaced dash?\n",
ali@70
  1179
		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@47
  1180
	    else
ali@47
  1181
		cnt_dash++;
ali@47
  1182
	}
ali@47
  1183
    }
ali@47
  1184
}
ali@47
  1185
ali@47
  1186
/*
ali@48
  1187
 * check_for_unmarked_paragraphs:
ali@48
  1188
 *
ali@48
  1189
 * Check for unmarked paragraphs indicated by separate speakers.
ali@48
  1190
 *
ali@48
  1191
 * May well be false positive:
ali@48
  1192
 * "Bravo!" "Wonderful!" called the crowd.
ali@48
  1193
 * but useful all the same.
ali@48
  1194
 */
ali@48
  1195
void check_for_unmarked_paragraphs(const char *aline)
ali@48
  1196
{
ali@48
  1197
    const char *s;
ali@48
  1198
    s=strstr(aline,"\"  \"");
ali@48
  1199
    if (!s)
ali@48
  1200
	s=strstr(aline,"\" \"");
ali@48
  1201
    if (s)
ali@48
  1202
    {
ali@48
  1203
	if (pswit[ECHO_SWITCH])
ali@70
  1204
	    g_print("\n%s\n",aline);
ali@48
  1205
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1206
	    g_print("    Line %ld column %ld - "
ali@70
  1207
	      "Query missing paragraph break?\n",
ali@70
  1208
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@48
  1209
	else
ali@48
  1210
	    cnt_punct++;
ali@48
  1211
    }
ali@48
  1212
}
ali@48
  1213
ali@48
  1214
/*
ali@49
  1215
 * check_for_jeebies:
ali@49
  1216
 *
ali@49
  1217
 * Check for "to he" and other easy h/b errors.
ali@49
  1218
 *
ali@49
  1219
 * This is a very inadequate effort on the h/b problem,
ali@49
  1220
 * but the phrase "to he" is always an error, whereas "to
ali@49
  1221
 * be" is quite common.
ali@49
  1222
 * Similarly, '"Quiet!", be said.' is a non-be error
ali@49
  1223
 * "to he" is _not_ always an error!:
ali@49
  1224
 *       "Where they went to he couldn't say."
ali@49
  1225
 * Another false positive:
ali@49
  1226
 *       What would "Cinderella" be without the . . .
ali@49
  1227
 * and another: "If he wants to he can see for himself."
ali@49
  1228
 */
ali@49
  1229
void check_for_jeebies(const char *aline)
ali@49
  1230
{
ali@49
  1231
    const char *s;
ali@49
  1232
    s=strstr(aline," be could ");
ali@49
  1233
    if (!s)
ali@49
  1234
	s=strstr(aline," be would ");
ali@49
  1235
    if (!s)
ali@49
  1236
	s=strstr(aline," was be ");
ali@49
  1237
    if (!s)
ali@49
  1238
	s=strstr(aline," be is ");
ali@49
  1239
    if (!s)
ali@49
  1240
	s=strstr(aline," is be ");
ali@49
  1241
    if (!s)
ali@49
  1242
	s=strstr(aline,"\", be ");
ali@49
  1243
    if (!s)
ali@49
  1244
	s=strstr(aline,"\" be ");
ali@49
  1245
    if (!s)
ali@49
  1246
	s=strstr(aline,"\" be ");
ali@49
  1247
    if (!s)
ali@49
  1248
	s=strstr(aline," to he ");
ali@49
  1249
    if (s)
ali@49
  1250
    {
ali@49
  1251
	if (pswit[ECHO_SWITCH])
ali@70
  1252
	    g_print("\n%s\n",aline);
ali@49
  1253
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1254
	    g_print("    Line %ld column %ld - Query he/be error?\n",
ali@70
  1255
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49
  1256
	else
ali@49
  1257
	    cnt_word++;
ali@49
  1258
    }
ali@49
  1259
    s=strstr(aline," the had ");
ali@49
  1260
    if (!s)
ali@49
  1261
	s=strstr(aline," a had ");
ali@49
  1262
    if (!s)
ali@49
  1263
	s=strstr(aline," they bad ");
ali@49
  1264
    if (!s)
ali@49
  1265
	s=strstr(aline," she bad ");
ali@49
  1266
    if (!s)
ali@49
  1267
	s=strstr(aline," he bad ");
ali@49
  1268
    if (!s)
ali@49
  1269
	s=strstr(aline," you bad ");
ali@49
  1270
    if (!s)
ali@49
  1271
	s=strstr(aline," i bad ");
ali@49
  1272
    if (s)
ali@49
  1273
    {
ali@49
  1274
	if (pswit[ECHO_SWITCH])
ali@70
  1275
	    g_print("\n%s\n",aline);
ali@49
  1276
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1277
	    g_print("    Line %ld column %ld - Query had/bad error?\n",
ali@70
  1278
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49
  1279
	else
ali@49
  1280
	    cnt_word++;
ali@49
  1281
    }
ali@49
  1282
    s=strstr(aline,"; hut ");
ali@49
  1283
    if (!s)
ali@49
  1284
	s=strstr(aline,", hut ");
ali@49
  1285
    if (s)
ali@49
  1286
    {
ali@49
  1287
	if (pswit[ECHO_SWITCH])
ali@70
  1288
	    g_print("\n%s\n",aline);
ali@49
  1289
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1290
	    g_print("    Line %ld column %ld - Query hut/but error?\n",
ali@70
  1291
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@49
  1292
	else
ali@49
  1293
	    cnt_word++;
ali@49
  1294
    }
ali@49
  1295
}
ali@49
  1296
ali@49
  1297
/*
ali@50
  1298
 * check_for_mta_from:
ali@50
  1299
 *
ali@50
  1300
 * Special case - angled bracket in front of "From" placed there by an
ali@50
  1301
 * MTA when sending an e-mail.
ali@50
  1302
 */
ali@50
  1303
void check_for_mta_from(const char *aline)
ali@50
  1304
{
ali@50
  1305
    const char *s;
ali@50
  1306
    s=strstr(aline,">From");
ali@50
  1307
    if (s)
ali@50
  1308
    {
ali@50
  1309
	if (pswit[ECHO_SWITCH])
ali@70
  1310
	    g_print("\n%s\n",aline);
ali@50
  1311
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1312
	    g_print("    Line %ld column %ld - "
ali@70
  1313
	      "Query angled bracket with From\n",
ali@70
  1314
	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@50
  1315
	else
ali@50
  1316
	    cnt_punct++;
ali@50
  1317
    }
ali@50
  1318
}
ali@50
  1319
ali@50
  1320
/*
ali@51
  1321
 * check_for_orphan_character:
ali@51
  1322
 *
ali@51
  1323
 * Check for a single character line -
ali@51
  1324
 * often an overflow from bad wrapping.
ali@51
  1325
 */
ali@51
  1326
void check_for_orphan_character(const char *aline)
ali@51
  1327
{
ali@70
  1328
    gunichar c;
ali@70
  1329
    c=g_utf8_get_char(aline);
ali@70
  1330
    if (c && !*g_utf8_next_char(aline))
ali@51
  1331
    {
ali@70
  1332
	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
ali@51
  1333
	    ; /* Nothing - ignore numerals alone on a line. */
ali@51
  1334
	else
ali@51
  1335
	{
ali@51
  1336
	    if (pswit[ECHO_SWITCH])
ali@70
  1337
		g_print("\n%s\n",aline);
ali@51
  1338
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1339
		g_print("    Line %ld column 1 - Query single character line\n",
ali@51
  1340
		  linecnt);
ali@51
  1341
	    else
ali@51
  1342
		cnt_punct++;
ali@51
  1343
	}
ali@51
  1344
    }
ali@51
  1345
}
ali@51
  1346
ali@51
  1347
/*
ali@52
  1348
 * check_for_pling_scanno:
ali@52
  1349
 *
ali@52
  1350
 * Check for I" - often should be !
ali@52
  1351
 */
ali@52
  1352
void check_for_pling_scanno(const char *aline)
ali@52
  1353
{
ali@52
  1354
    const char *s;
ali@52
  1355
    s=strstr(aline," I\"");
ali@52
  1356
    if (s)
ali@52
  1357
    {
ali@52
  1358
	if (pswit[ECHO_SWITCH])
ali@70
  1359
	    g_print("\n%s\n",aline);
ali@52
  1360
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  1361
	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
ali@70
  1362
	      linecnt,g_utf8_pointer_to_offset(aline,s));
ali@52
  1363
	else
ali@52
  1364
	    cnt_punct++;
ali@52
  1365
    }
ali@52
  1366
}
ali@52
  1367
ali@52
  1368
/*
ali@53
  1369
 * check_for_extra_period:
ali@53
  1370
 *
ali@53
  1371
 * Check for period without a capital letter. Cut-down from gutspell.
ali@53
  1372
 * Only works when it happens on a single line.
ali@53
  1373
 */
ali@53
  1374
void check_for_extra_period(const char *aline,const struct warnings *warnings)
ali@53
  1375
{
ali@53
  1376
    const char *s,*t,*s1;
ali@69
  1377
    int i;
ali@70
  1378
    gsize len;
ali@69
  1379
    gboolean istypo;
ali@69
  1380
    gchar *testword;
ali@70
  1381
    gunichar *decomposition;
ali@53
  1382
    if (pswit[PARANOID_SWITCH])
ali@53
  1383
    {
ali@70
  1384
	for (t=aline;t=strstr(t,". ");)
ali@53
  1385
	{
ali@69
  1386
	    if (t==aline)
ali@53
  1387
	    {
ali@70
  1388
		t=g_utf8_next_char(t);
ali@53
  1389
		/* start of line punctuation is handled elsewhere */
ali@53
  1390
		continue;
ali@53
  1391
	    }
ali@70
  1392
	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
ali@53
  1393
	    {
ali@70
  1394
		t=g_utf8_next_char(t);
ali@53
  1395
		continue;
ali@53
  1396
	    }
ali@53
  1397
	    if (warnings->isDutch)
ali@53
  1398
	    {
ali@53
  1399
		/* For Frank & Jeroen -- 's Middags case */
ali@70
  1400
		gunichar c2,c3,c4,c5;
ali@70
  1401
		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
ali@70
  1402
		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
ali@70
  1403
		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
ali@70
  1404
		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
ali@70
  1405
		if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
ali@70
  1406
		  c4==CHAR_SPACE && g_unichar_isupper(c5))
ali@53
  1407
		{
ali@70
  1408
		    t=g_utf8_next_char(t);
ali@53
  1409
		    continue;
ali@53
  1410
		}
ali@53
  1411
	    }
ali@70
  1412
	    s1=g_utf8_next_char(g_utf8_next_char(t));
ali@70
  1413
	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
ali@70
  1414
	      !isdigit(g_utf8_get_char(s1)))
ali@70
  1415
		s1=g_utf8_next_char(s1);
ali@70
  1416
	    if (g_unichar_islower(g_utf8_get_char(s1)))
ali@53
  1417
	    {
ali@53
  1418
		/* we have something to investigate */
ali@69
  1419
		istypo=TRUE;
ali@53
  1420
		/* so let's go back and find out */
ali@70
  1421
		for (s1=g_utf8_prev_char(t);s1>=aline &&
ali@70
  1422
		  (g_unichar_isalpha(g_utf8_get_char(s1)) ||
ali@70
  1423
		  g_unichar_isdigit(g_utf8_get_char(s1)) ||
ali@70
  1424
		  g_utf8_get_char(s1)==CHAR_SQUOTE &&
ali@70
  1425
		  g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
ali@70
  1426
		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
ali@70
  1427
		  s1=g_utf8_prev_char(s1))
ali@53
  1428
		    ;
ali@70
  1429
		s1=g_utf8_next_char(s1);
ali@69
  1430
		s=strchr(s1,'.');
ali@69
  1431
		if (s)
ali@69
  1432
		    testword=g_strndup(s1,s-s1);
ali@69
  1433
		else
ali@69
  1434
		    testword=g_strdup(s1);
ali@53
  1435
		for (i=0;*abbrev[i];i++)
ali@53
  1436
		    if (!strcmp(testword,abbrev[i]))
ali@69
  1437
			istypo=FALSE;
ali@70
  1438
		if (g_unichar_isdigit(g_utf8_get_char(testword)))
ali@69
  1439
		    istypo=FALSE;
ali@70
  1440
		if (!*g_utf8_next_char(testword))
ali@69
  1441
		    istypo=FALSE;
ali@53
  1442
		if (isroman(testword))
ali@69
  1443
		    istypo=FALSE;
ali@53
  1444
		if (istypo)
ali@53
  1445
		{
ali@69
  1446
		    istypo=FALSE;
ali@70
  1447
		    for (s=testword;*s;s=g_utf8_next_char(s))
ali@70
  1448
		    {
ali@70
  1449
			decomposition=g_unicode_canonical_decomposition(
ali@70
  1450
			  g_utf8_get_char(s),&len);
ali@70
  1451
			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@69
  1452
			    istypo=TRUE;
ali@70
  1453
			g_free(decomposition);
ali@70
  1454
		    }
ali@53
  1455
		}
ali@69
  1456
		if (istypo &&
ali@69
  1457
		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
ali@53
  1458
		{
ali@69
  1459
		    g_tree_insert(qperiod,g_strdup(testword),
ali@69
  1460
		      GINT_TO_POINTER(1));
ali@69
  1461
		    if (pswit[ECHO_SWITCH])
ali@70
  1462
			g_print("\n%s\n",aline);
ali@69
  1463
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1464
			g_print("    Line %ld column %ld - Extra period?\n",
ali@70
  1465
			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@69
  1466
		    else
ali@69
  1467
			cnt_punct++;
ali@53
  1468
		}
ali@69
  1469
		g_free(testword);
ali@53
  1470
	    }
ali@70
  1471
	    t=g_utf8_next_char(t);
ali@53
  1472
	}
ali@53
  1473
    }
ali@53
  1474
}
ali@53
  1475
ali@53
  1476
/*
ali@54
  1477
 * check_for_following_punctuation:
ali@54
  1478
 *
ali@54
  1479
 * Check for words usually not followed by punctuation.
ali@54
  1480
 */
ali@54
  1481
void check_for_following_punctuation(const char *aline)
ali@54
  1482
{
ali@54
  1483
    int i;
ali@54
  1484
    const char *s,*wordstart;
ali@70
  1485
    gunichar c;
ali@69
  1486
    gchar *inword,*t;
ali@54
  1487
    if (pswit[TYPO_SWITCH])
ali@54
  1488
    {
ali@54
  1489
	for (s=aline;*s;)
ali@54
  1490
	{
ali@54
  1491
	    wordstart=s;
ali@69
  1492
	    t=getaword(&s);
ali@69
  1493
	    if (!*t)
ali@69
  1494
	    {
ali@69
  1495
		g_free(t);
ali@54
  1496
		continue;
ali@69
  1497
	    }
ali@70
  1498
	    inword=g_utf8_strdown(t,-1);
ali@69
  1499
	    g_free(t);
ali@54
  1500
	    for (i=0;*nocomma[i];i++)
ali@54
  1501
		if (!strcmp(inword,nocomma[i]))
ali@54
  1502
		{
ali@70
  1503
		    c=g_utf8_get_char(s);
ali@70
  1504
		    if (c==',' || c==';' || c==':')
ali@54
  1505
		    {
ali@54
  1506
			if (pswit[ECHO_SWITCH])
ali@70
  1507
			    g_print("\n%s\n",aline);
ali@54
  1508
			if (!pswit[OVERVIEW_SWITCH])
ali@70
  1509
			    g_print("    Line %ld column %ld - "
ali@54
  1510
			      "Query punctuation after %s?\n",
ali@70
  1511
			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70
  1512
			      inword);
ali@54
  1513
			else
ali@54
  1514
			    cnt_punct++;
ali@54
  1515
		    }
ali@54
  1516
		}
ali@54
  1517
	    for (i=0;*noperiod[i];i++)
ali@54
  1518
		if (!strcmp(inword,noperiod[i]))
ali@54
  1519
		{
ali@70
  1520
		    c=g_utf8_get_char(s);
ali@70
  1521
		    if (c=='.' || c=='!')
ali@54
  1522
		    {
ali@54
  1523
			if (pswit[ECHO_SWITCH])
ali@70
  1524
			    g_print("\n%s\n",aline);
ali@54
  1525
			if (!pswit[OVERVIEW_SWITCH])
ali@70
  1526
			    g_print("    Line %ld column %ld - "
ali@54
  1527
			      "Query punctuation after %s?\n",
ali@70
  1528
			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
ali@70
  1529
			      inword);
ali@54
  1530
			else
ali@54
  1531
			    cnt_punct++;
ali@54
  1532
		    }
ali@54
  1533
		}
ali@69
  1534
	    g_free(inword);
ali@54
  1535
	}
ali@54
  1536
    }
ali@54
  1537
}
ali@54
  1538
ali@54
  1539
/*
ali@55
  1540
 * check_for_typos:
ali@55
  1541
 *
ali@55
  1542
 * Check for commonly mistyped words,
ali@55
  1543
 * and digits like 0 for O in a word.
ali@55
  1544
 */
ali@55
  1545
void check_for_typos(const char *aline,struct warnings *warnings)
ali@55
  1546
{
ali@70
  1547
    const char *s,*t,*nt,*wordstart;
ali@70
  1548
    gchar *inword;
ali@70
  1549
    gunichar *decomposition;
ali@70
  1550
    gchar *testword;
ali@70
  1551
    int i,vowel,consonant,*dupcnt;
ali@70
  1552
    gboolean isdup,istypo,alower;
ali@70
  1553
    gunichar c;
ali@70
  1554
    long offset,len;
ali@70
  1555
    gsize decomposition_len;
ali@55
  1556
    for (s=aline;*s;)
ali@55
  1557
    {
ali@55
  1558
	wordstart=s;
ali@69
  1559
	inword=getaword(&s);
ali@55
  1560
	if (!*inword)
ali@69
  1561
	{
ali@69
  1562
	    g_free(inword);
ali@55
  1563
	    continue; /* don't bother with empty lines */
ali@69
  1564
	}
ali@55
  1565
	if (mixdigit(inword))
ali@55
  1566
	{
ali@55
  1567
	    if (pswit[ECHO_SWITCH])
ali@70
  1568
		g_print("\n%s\n",aline);
ali@55
  1569
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1570
		g_print("    Line %ld column %ld - Query digit in %s\n",
ali@70
  1571
		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
ali@55
  1572
	    else
ali@55
  1573
		cnt_word++;
ali@55
  1574
	}
ali@55
  1575
	/*
ali@55
  1576
	 * Put the word through a series of tests for likely typos and OCR
ali@55
  1577
	 * errors.
ali@55
  1578
	 */
ali@69
  1579
	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
ali@55
  1580
	{
ali@69
  1581
	    istypo=FALSE;
ali@70
  1582
	    alower=FALSE;
ali@70
  1583
	    for (t=inword;*t;t=g_utf8_next_char(t))
ali@55
  1584
	    {
ali@70
  1585
		c=g_utf8_get_char(t);
ali@70
  1586
		nt=g_utf8_next_char(t);
ali@55
  1587
		/* lowercase for testing */
ali@70
  1588
		if (g_unichar_islower(c))
ali@70
  1589
		    alower=TRUE;
ali@70
  1590
		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
ali@55
  1591
		{
ali@55
  1592
		    /*
ali@55
  1593
		     * We have an uppercase mid-word. However, there are
ali@55
  1594
		     * common cases:
ali@55
  1595
		     *   Mac and Mc like McGill
ali@55
  1596
		     *   French contractions like l'Abbe
ali@55
  1597
		     */
ali@70
  1598
		    offset=g_utf8_pointer_to_offset(inword,t);
ali@70
  1599
		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
ali@70
  1600
		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
ali@70
  1601
		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
ali@70
  1602
		      offset>0 &&
ali@70
  1603
		      g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
ali@55
  1604
			; /* do nothing! */
ali@55
  1605
		    else
ali@69
  1606
			istypo=TRUE;
ali@55
  1607
		}
ali@55
  1608
	    }
ali@70
  1609
	    testword=g_utf8_casefold(inword,-1);
ali@69
  1610
	}
ali@69
  1611
	if (pswit[TYPO_SWITCH])
ali@69
  1612
	{
ali@55
  1613
	    /*
ali@55
  1614
	     * Check for certain unlikely two-letter combinations at word
ali@55
  1615
	     * start and end.
ali@55
  1616
	     */
ali@70
  1617
	    len=g_utf8_strlen(testword,-1);
ali@70
  1618
	    if (len>1)
ali@55
  1619
	    {
ali@55
  1620
		for (i=0;*nostart[i];i++)
ali@70
  1621
		    if (g_str_has_prefix(testword,nostart[i]))
ali@69
  1622
			istypo=TRUE;
ali@55
  1623
		for (i=0;*noend[i];i++)
ali@70
  1624
		    if (g_str_has_suffix(testword,noend[i]))
ali@69
  1625
			istypo=TRUE;
ali@55
  1626
	    }
ali@55
  1627
	    /* ght is common, gbt never. Like that. */
ali@55
  1628
	    if (strstr(testword,"cb"))
ali@69
  1629
		istypo=TRUE;
ali@55
  1630
	    if (strstr(testword,"gbt"))
ali@69
  1631
		istypo=TRUE;
ali@55
  1632
	    if (strstr(testword,"pbt"))
ali@69
  1633
		istypo=TRUE;
ali@55
  1634
	    if (strstr(testword,"tbs"))
ali@69
  1635
		istypo=TRUE;
ali@55
  1636
	    if (strstr(testword,"mrn"))
ali@69
  1637
		istypo=TRUE;
ali@55
  1638
	    if (strstr(testword,"ahle"))
ali@69
  1639
		istypo=TRUE;
ali@55
  1640
	    if (strstr(testword,"ihle"))
ali@69
  1641
		istypo=TRUE;
ali@55
  1642
	    /*
ali@55
  1643
	     * "TBE" does happen - like HEARTBEAT - but uncommon.
ali@55
  1644
	     * Also "TBI" - frostbite, outbid - but uncommon.
ali@55
  1645
	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
ali@55
  1646
	     * numerals, but "ii" is a common scanno.
ali@55
  1647
	     */
ali@55
  1648
	    if (strstr(testword,"tbi"))
ali@69
  1649
		istypo=TRUE;
ali@55
  1650
	    if (strstr(testword,"tbe"))
ali@69
  1651
		istypo=TRUE;
ali@55
  1652
	    if (strstr(testword,"ii"))
ali@69
  1653
		istypo=TRUE;
ali@55
  1654
	    /*
ali@55
  1655
	     * Check for no vowels or no consonants.
ali@55
  1656
	     * If none, flag a typo.
ali@55
  1657
	     */
ali@70
  1658
	    if (!istypo && len>1)
ali@55
  1659
	    {
ali@55
  1660
		vowel=consonant=0;
ali@70
  1661
		for (t=testword;*t;t=g_utf8_next_char(t))
ali@55
  1662
		{
ali@70
  1663
		    c=g_utf8_get_char(t);
ali@70
  1664
		    decomposition=
ali@70
  1665
		      g_unicode_canonical_decomposition(c,&decomposition_len);
ali@70
  1666
		    if (c=='y' || g_unichar_isdigit(c))
ali@55
  1667
		    {
ali@55
  1668
			/* Yah, this is loose. */
ali@55
  1669
			vowel++;
ali@55
  1670
			consonant++;
ali@55
  1671
		    }
ali@70
  1672
		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
ali@55
  1673
			vowel++;
ali@55
  1674
		    else
ali@55
  1675
			consonant++;
ali@70
  1676
		    g_free(decomposition);
ali@55
  1677
		}
ali@55
  1678
		if (!vowel || !consonant)
ali@69
  1679
		    istypo=TRUE;
ali@55
  1680
	    }
ali@55
  1681
	    /*
ali@55
  1682
	     * Now exclude the word from being reported if it's in
ali@55
  1683
	     * the okword list.
ali@55
  1684
	     */
ali@55
  1685
	    for (i=0;*okword[i];i++)
ali@55
  1686
		if (!strcmp(testword,okword[i]))
ali@69
  1687
		    istypo=FALSE;
ali@55
  1688
	    /*
ali@55
  1689
	     * What looks like a typo may be a Roman numeral.
ali@55
  1690
	     * Exclude these.
ali@55
  1691
	     */
ali@55
  1692
	    if (istypo && isroman(testword))
ali@69
  1693
		istypo=FALSE;
ali@55
  1694
	    /* Check the manual list of typos. */
ali@55
  1695
	    if (!istypo)
ali@55
  1696
		for (i=0;*typo[i];i++)
ali@55
  1697
		    if (!strcmp(testword,typo[i]))
ali@69
  1698
			istypo=TRUE;
ali@55
  1699
	    /*
ali@55
  1700
	     * Check lowercase s, l, i and m - special cases.
ali@55
  1701
	     *   "j" - often a semi-colon gone wrong.
ali@55
  1702
	     *   "d" for a missing apostrophe - he d
ali@55
  1703
	     *   "n" for "in"
ali@55
  1704
	     */
ali@70
  1705
	    if (!istypo && len==1 &&
ali@70
  1706
	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
ali@69
  1707
		istypo=TRUE;
ali@55
  1708
	    if (istypo)
ali@55
  1709
	    {
ali@69
  1710
		dupcnt=g_tree_lookup(qword,testword);
ali@69
  1711
		if (dupcnt)
ali@69
  1712
		{
ali@69
  1713
		    (*dupcnt)++;
ali@69
  1714
		    isdup=!pswit[VERBOSE_SWITCH];
ali@69
  1715
		}
ali@69
  1716
		else
ali@69
  1717
		{
ali@69
  1718
		    dupcnt=g_new0(int,1);
ali@69
  1719
		    g_tree_insert(qword,g_strdup(testword),dupcnt);
ali@69
  1720
		    isdup=FALSE;
ali@69
  1721
		}
ali@55
  1722
		if (!isdup)
ali@55
  1723
		{
ali@55
  1724
		    if (pswit[ECHO_SWITCH])
ali@70
  1725
			g_print("\n%s\n",aline);
ali@55
  1726
		    if (!pswit[OVERVIEW_SWITCH])
ali@55
  1727
		    {
ali@70
  1728
			g_print("    Line %ld column %ld - Query word %s",
ali@70
  1729
			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
ali@70
  1730
			  inword);
ali@69
  1731
			if (!pswit[VERBOSE_SWITCH])
ali@70
  1732
			    g_print(" - not reporting duplicates");
ali@70
  1733
			g_print("\n");
ali@55
  1734
		    }
ali@55
  1735
		    else
ali@55
  1736
			cnt_word++;
ali@55
  1737
		}
ali@55
  1738
	    }
ali@55
  1739
	}
ali@55
  1740
	/* check the user's list of typos */
ali@69
  1741
	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
ali@69
  1742
	{
ali@69
  1743
	    if (pswit[ECHO_SWITCH])
ali@70
  1744
		g_print("\n%s\n",aline);
ali@69
  1745
	    if (!pswit[OVERVIEW_SWITCH])  
ali@70
  1746
		g_print("    Line %ld column %ld - Query possible scanno %s\n",
ali@70
  1747
		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
ali@69
  1748
	}
ali@69
  1749
	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
ali@69
  1750
	    g_free(testword);
ali@55
  1751
	if (pswit[PARANOID_SWITCH] && warnings->digit)
ali@55
  1752
	{
ali@55
  1753
	    /* In paranoid mode, query all 0 and 1 standing alone. */
ali@55
  1754
	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
ali@55
  1755
	    {
ali@55
  1756
		if (pswit[ECHO_SWITCH])
ali@70
  1757
		    g_print("\n%s\n",aline);
ali@55
  1758
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  1759
		    g_print("    Line %ld column %ld - Query standalone %s\n",
ali@70
  1760
		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
ali@70
  1761
		      inword);
ali@55
  1762
		else
ali@55
  1763
		    cnt_word++;
ali@55
  1764
	    }
ali@55
  1765
	}
ali@69
  1766
	g_free(inword);
ali@55
  1767
    }
ali@55
  1768
}
ali@55
  1769
ali@56
  1770
/*
ali@56
  1771
 * check_for_misspaced_punctuation:
ali@56
  1772
 *
ali@56
  1773
 * Look for added or missing spaces around punctuation and quotes.
ali@56
  1774
 * If there is a punctuation character like ! with no space on
ali@56
  1775
 * either side, suspect a missing!space. If there are spaces on
ali@56
  1776
 * both sides , assume a typo. If we see a double quote with no
ali@56
  1777
 * space or punctuation on either side of it, assume unspaced
ali@56
  1778
 * quotes "like"this.
ali@56
  1779
 */
ali@56
  1780
void check_for_misspaced_punctuation(const char *aline,
ali@69
  1781
  struct parities *parities,gboolean isemptyline)
ali@56
  1782
{
ali@69
  1783
    gboolean isacro,isellipsis;
ali@56
  1784
    const char *s;
ali@70
  1785
    gunichar c,nc,pc,n2c;
ali@70
  1786
    c=g_utf8_get_char(aline);
ali@70
  1787
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  1788
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56
  1789
    {
ali@70
  1790
	pc=c;
ali@70
  1791
	c=nc;
ali@70
  1792
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56
  1793
	/* For each character in the line after the first. */
ali@70
  1794
	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
ali@56
  1795
	{
ali@56
  1796
	    /* we need to suppress warnings for acronyms like M.D. */
ali@69
  1797
	    isacro=FALSE;
ali@56
  1798
	    /* we need to suppress warnings for ellipsis . . . */
ali@69
  1799
	    isellipsis=FALSE;
ali@70
  1800
	    /*
ali@70
  1801
	     * If there are letters on both sides of it or
ali@70
  1802
	     * if it's strict punctuation followed by an alpha.
ali@70
  1803
	     */
ali@70
  1804
	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
ali@70
  1805
	      g_utf8_strchr("?!,;:",-1,c)))
ali@56
  1806
	    {
ali@70
  1807
		if (c=='.')
ali@56
  1808
		{
ali@70
  1809
		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70
  1810
		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69
  1811
			isacro=TRUE;
ali@70
  1812
		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70
  1813
		    if (nc && n2c=='.')
ali@69
  1814
			isacro=TRUE;
ali@56
  1815
		}
ali@56
  1816
		if (!isacro)
ali@56
  1817
		{
ali@56
  1818
		    if (pswit[ECHO_SWITCH])
ali@70
  1819
			g_print("\n%s\n",aline);
ali@56
  1820
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1821
			g_print("    Line %ld column %ld - Missing space?\n",
ali@70
  1822
			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  1823
		    else
ali@56
  1824
			cnt_punct++;
ali@56
  1825
		}
ali@56
  1826
	    }
ali@70
  1827
	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
ali@56
  1828
	    {
ali@56
  1829
		/*
ali@56
  1830
		 * If there are spaces on both sides,
ali@56
  1831
		 * or space before and end of line.
ali@56
  1832
		 */
ali@70
  1833
		if (c=='.')
ali@56
  1834
		{
ali@70
  1835
		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
ali@70
  1836
		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
ali@69
  1837
			isellipsis=TRUE;
ali@70
  1838
		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
ali@70
  1839
		    if (nc && n2c=='.')
ali@69
  1840
			isellipsis=TRUE;
ali@56
  1841
		}
ali@56
  1842
		if (!isemptyline && !isellipsis)
ali@56
  1843
		{
ali@56
  1844
		    if (pswit[ECHO_SWITCH])
ali@70
  1845
			g_print("\n%s\n",aline);
ali@56
  1846
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1847
			g_print("    Line %ld column %ld - "
ali@70
  1848
			  "Spaced punctuation?\n",linecnt,
ali@70
  1849
			  g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  1850
		    else
ali@56
  1851
			cnt_punct++;
ali@56
  1852
		}
ali@56
  1853
	    }
ali@56
  1854
	}
ali@56
  1855
    }
ali@56
  1856
    /* Split out the characters that CANNOT be preceded by space. */
ali@70
  1857
    c=g_utf8_get_char(aline);
ali@70
  1858
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  1859
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56
  1860
    {
ali@70
  1861
	pc=c;
ali@70
  1862
	c=nc;
ali@70
  1863
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56
  1864
	/* for each character in the line after the first */
ali@70
  1865
	if (g_utf8_strchr("?!,;:",-1,c))
ali@56
  1866
	{
ali@56
  1867
	    /* if it's punctuation that _cannot_ have a space before it */
ali@70
  1868
	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
ali@56
  1869
	    {
ali@56
  1870
		/*
ali@70
  1871
		 * If nc DOES == space,
ali@56
  1872
		 * it was already reported just above.
ali@56
  1873
		 */
ali@56
  1874
		if (pswit[ECHO_SWITCH])
ali@70
  1875
		    g_print("\n%s\n",aline);
ali@56
  1876
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  1877
		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
ali@70
  1878
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  1879
		else
ali@56
  1880
		    cnt_punct++;
ali@56
  1881
	    }
ali@56
  1882
	}
ali@56
  1883
    }
ali@56
  1884
    /*
ali@56
  1885
     * Special case " .X" where X is any alpha.
ali@56
  1886
     * This plugs a hole in the acronym code above.
ali@56
  1887
     * Inelegant, but maintainable.
ali@56
  1888
     */
ali@70
  1889
    c=g_utf8_get_char(aline);
ali@70
  1890
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  1891
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56
  1892
    {
ali@70
  1893
	pc=c;
ali@70
  1894
	c=nc;
ali@70
  1895
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56
  1896
	/* for each character in the line after the first */
ali@70
  1897
	if (c=='.')
ali@56
  1898
	{
ali@56
  1899
	    /* if it's a period */
ali@70
  1900
	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
ali@56
  1901
	    {
ali@56
  1902
		/*
ali@56
  1903
		 * If the period follows a space and
ali@56
  1904
		 * is followed by a letter.
ali@56
  1905
		 */
ali@56
  1906
		if (pswit[ECHO_SWITCH])
ali@70
  1907
		    g_print("\n%s\n",aline);
ali@56
  1908
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  1909
		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
ali@70
  1910
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  1911
		else
ali@56
  1912
		    cnt_punct++;
ali@56
  1913
	    }
ali@56
  1914
	}
ali@56
  1915
    }
ali@70
  1916
    c=g_utf8_get_char(aline);
ali@70
  1917
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  1918
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@56
  1919
    {
ali@70
  1920
	pc=c;
ali@70
  1921
	c=nc;
ali@70
  1922
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@56
  1923
	/* for each character in the line after the first */
ali@70
  1924
	if (c==CHAR_DQUOTE)
ali@56
  1925
	{
ali@70
  1926
	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
ali@70
  1927
	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
ali@70
  1928
	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
ali@56
  1929
	    {
ali@56
  1930
		if (pswit[ECHO_SWITCH])
ali@70
  1931
		    g_print("\n%s\n",aline);
ali@56
  1932
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  1933
		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
ali@70
  1934
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  1935
		else
ali@56
  1936
		    cnt_punct++;
ali@56
  1937
	    }
ali@56
  1938
	}
ali@56
  1939
    }
ali@56
  1940
    /* Check parity of quotes. */
ali@70
  1941
    nc=g_utf8_get_char(aline);
ali@70
  1942
    for (s=aline;*s;s=g_utf8_next_char(s))
ali@56
  1943
    {
ali@70
  1944
	c=nc;
ali@70
  1945
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70
  1946
	if (c==CHAR_DQUOTE)
ali@56
  1947
	{
ali@56
  1948
	    parities->dquote=!parities->dquote;
ali@56
  1949
	    if (!parities->dquote)
ali@56
  1950
	    {
ali@56
  1951
		/* parity even */
ali@70
  1952
		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
ali@56
  1953
		{
ali@56
  1954
		    if (pswit[ECHO_SWITCH])
ali@70
  1955
			g_print("\n%s\n",aline);
ali@56
  1956
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1957
			g_print("    Line %ld column %ld - "
ali@70
  1958
			  "Wrongspaced quotes?\n",
ali@70
  1959
			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  1960
		    else
ali@56
  1961
			cnt_punct++;
ali@56
  1962
		}
ali@56
  1963
	    }
ali@56
  1964
	    else
ali@56
  1965
	    {
ali@56
  1966
		/* parity odd */
ali@70
  1967
		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
ali@70
  1968
		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
ali@56
  1969
		{
ali@56
  1970
		    if (pswit[ECHO_SWITCH])
ali@70
  1971
			g_print("\n%s\n",aline);
ali@56
  1972
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1973
			g_print("    Line %ld column %ld - "
ali@70
  1974
			  "Wrongspaced quotes?\n",
ali@70
  1975
			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  1976
		    else
ali@56
  1977
			cnt_punct++;
ali@56
  1978
		}
ali@56
  1979
	    }
ali@56
  1980
	}
ali@56
  1981
    }
ali@70
  1982
    if (g_utf8_get_char(aline)==CHAR_DQUOTE)
ali@56
  1983
    {
ali@70
  1984
	if (g_utf8_strchr(",;:!?)]} ",-1,
ali@70
  1985
	  g_utf8_get_char(g_utf8_next_char(aline))))
ali@56
  1986
	{
ali@56
  1987
	    if (pswit[ECHO_SWITCH])
ali@70
  1988
		g_print("\n%s\n",aline);
ali@56
  1989
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  1990
		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
ali@56
  1991
		  linecnt);
ali@56
  1992
	    else
ali@56
  1993
		cnt_punct++;
ali@56
  1994
	}
ali@56
  1995
    }
ali@56
  1996
    if (pswit[SQUOTE_SWITCH])
ali@56
  1997
    {
ali@70
  1998
	nc=g_utf8_get_char(aline);
ali@70
  1999
	for (s=aline;*s;s=g_utf8_next_char(s))
ali@56
  2000
	{
ali@70
  2001
	    c=nc;
ali@70
  2002
	    nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70
  2003
	    if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
ali@70
  2004
	      s>aline &&
ali@70
  2005
	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
ali@70
  2006
	      !g_unichar_isalpha(nc)))
ali@56
  2007
	    {
ali@56
  2008
		parities->squote=!parities->squote;
ali@56
  2009
		if (!parities->squote)
ali@56
  2010
		{
ali@56
  2011
		    /* parity even */
ali@70
  2012
		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
ali@56
  2013
		    {
ali@56
  2014
			if (pswit[ECHO_SWITCH])
ali@70
  2015
			    g_print("\n%s\n",aline);
ali@56
  2016
			if (!pswit[OVERVIEW_SWITCH])
ali@70
  2017
			    g_print("    Line %ld column %ld - "
ali@56
  2018
			      "Wrongspaced singlequotes?\n",
ali@70
  2019
			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2020
			else
ali@56
  2021
			    cnt_punct++;
ali@56
  2022
		    }
ali@56
  2023
		}
ali@56
  2024
		else
ali@56
  2025
		{
ali@56
  2026
		    /* parity odd */
ali@70
  2027
		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
ali@70
  2028
		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
ali@56
  2029
		    {
ali@56
  2030
			if (pswit[ECHO_SWITCH])
ali@70
  2031
			    g_print("\n%s\n",aline);
ali@56
  2032
			if (!pswit[OVERVIEW_SWITCH])
ali@70
  2033
			    g_print("    Line %ld column %ld - "
ali@56
  2034
			      "Wrongspaced singlequotes?\n",
ali@70
  2035
			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@56
  2036
			else
ali@56
  2037
			    cnt_punct++;
ali@56
  2038
		    }
ali@56
  2039
		}
ali@56
  2040
	    }
ali@56
  2041
	}
ali@56
  2042
    }
ali@56
  2043
}
ali@56
  2044
ali@55
  2045
/*
ali@57
  2046
 * check_for_double_punctuation:
ali@57
  2047
 *
ali@57
  2048
 * Look for double punctuation like ,. or ,,
ali@57
  2049
 * Thanks to DW for the suggestion!
ali@57
  2050
 * In books with references, ".," and ".;" are common
ali@57
  2051
 * e.g. "etc., etc.," and vol. 1.; vol 3.;
ali@57
  2052
 * OTOH, from my initial tests, there are also fairly
ali@57
  2053
 * common errors. What to do? Make these cases paranoid?
ali@57
  2054
 * ".," is the most common, so warnings->dotcomma is used
ali@57
  2055
 * to suppress detailed reporting if it occurs often.
ali@57
  2056
 */
ali@57
  2057
void check_for_double_punctuation(const char *aline,struct warnings *warnings)
ali@57
  2058
{
ali@70
  2059
    const char *s;
ali@70
  2060
    gunichar c,nc;
ali@70
  2061
    nc=g_utf8_get_char(aline);
ali@70
  2062
    for (s=aline;*s;s=g_utf8_next_char(s))
ali@57
  2063
    {
ali@70
  2064
	c=nc;
ali@70
  2065
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@57
  2066
	/* for each punctuation character in the line */
ali@70
  2067
	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
ali@70
  2068
	  g_utf8_strchr(".?!,;:",-1,nc))
ali@57
  2069
	{
ali@57
  2070
	    /* followed by punctuation, it's a query, unless . . . */
ali@70
  2071
	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
ali@70
  2072
	      !warnings->dotcomma && c=='.' && nc==',' ||
ali@70
  2073
	      warnings->isFrench && g_str_has_prefix(s,",...") ||
ali@70
  2074
	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
ali@70
  2075
	      warnings->isFrench && g_str_has_prefix(s,";...") ||
ali@70
  2076
	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
ali@70
  2077
	      warnings->isFrench && g_str_has_prefix(s,":...") ||
ali@70
  2078
	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
ali@70
  2079
	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
ali@70
  2080
	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
ali@70
  2081
	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
ali@70
  2082
	      warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@57
  2083
	    {
ali@70
  2084
		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
ali@70
  2085
		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
ali@70
  2086
		  warnings->isFrench && g_str_has_prefix(s,";...") ||
ali@70
  2087
		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
ali@70
  2088
		  warnings->isFrench && g_str_has_prefix(s,":...") ||
ali@70
  2089
		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
ali@70
  2090
		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
ali@70
  2091
		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
ali@70
  2092
		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
ali@70
  2093
		  warnings->isFrench && g_str_has_prefix(s,"...?"))
ali@70
  2094
		{
ali@70
  2095
		    s+=4;
ali@70
  2096
		    nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70
  2097
		}
ali@57
  2098
		; /* do nothing for .. !! and ?? which can be legit */
ali@57
  2099
	    }
ali@57
  2100
	    else
ali@57
  2101
	    {
ali@57
  2102
		if (pswit[ECHO_SWITCH])
ali@70
  2103
		    g_print("\n%s\n",aline);
ali@57
  2104
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2105
		    g_print("    Line %ld column %ld - Double punctuation?\n",
ali@70
  2106
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@57
  2107
		else
ali@57
  2108
		    cnt_punct++;
ali@57
  2109
	    }
ali@57
  2110
	}
ali@57
  2111
    }
ali@57
  2112
}
ali@57
  2113
ali@57
  2114
/*
ali@58
  2115
 * check_for_spaced_quotes:
ali@58
  2116
 */
ali@58
  2117
void check_for_spaced_quotes(const char *aline)
ali@58
  2118
{
ali@58
  2119
    const char *s,*t;
ali@58
  2120
    s=aline;
ali@58
  2121
    while ((t=strstr(s," \" ")))
ali@58
  2122
    {
ali@58
  2123
	if (pswit[ECHO_SWITCH])
ali@70
  2124
	    g_print("\n%s\n",aline);
ali@58
  2125
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  2126
	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
ali@70
  2127
	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58
  2128
	else
ali@58
  2129
	    cnt_punct++;
ali@70
  2130
	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58
  2131
    }
ali@58
  2132
    s=aline;
ali@58
  2133
    while ((t=strstr(s," ' ")))
ali@58
  2134
    {
ali@58
  2135
	if (pswit[ECHO_SWITCH])
ali@70
  2136
	    g_print("\n%s\n",aline);
ali@58
  2137
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  2138
	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
ali@70
  2139
	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58
  2140
	else
ali@58
  2141
	    cnt_punct++;
ali@70
  2142
	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58
  2143
    }
ali@58
  2144
    s=aline;
ali@58
  2145
    while ((t=strstr(s," ` ")))
ali@58
  2146
    {
ali@58
  2147
	if (pswit[ECHO_SWITCH])
ali@70
  2148
	    g_print("\n%s\n",aline);
ali@58
  2149
	if (!pswit[OVERVIEW_SWITCH])
ali@70
  2150
	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
ali@70
  2151
	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
ali@58
  2152
	else
ali@58
  2153
	    cnt_punct++;
ali@70
  2154
	s=g_utf8_next_char(g_utf8_next_char(t));
ali@58
  2155
    }
ali@58
  2156
}
ali@58
  2157
ali@58
  2158
/*
ali@59
  2159
 * check_for_miscased_genative:
ali@59
  2160
 *
ali@59
  2161
 * Check special case of 'S instead of 's at end of word.
ali@59
  2162
 */
ali@59
  2163
void check_for_miscased_genative(const char *aline)
ali@59
  2164
{
ali@59
  2165
    const char *s;
ali@70
  2166
    gunichar c,nc,pc;
ali@69
  2167
    if (!*aline)
ali@69
  2168
	return;
ali@70
  2169
    c=g_utf8_get_char(aline);
ali@70
  2170
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2171
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@59
  2172
    {
ali@70
  2173
	pc=c;
ali@70
  2174
	c=nc;
ali@70
  2175
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70
  2176
	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
ali@59
  2177
	{
ali@59
  2178
	    if (pswit[ECHO_SWITCH])
ali@70
  2179
		g_print("\n%s\n",aline);
ali@59
  2180
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2181
		g_print("    Line %ld column %ld - Capital \"S\"?\n",
ali@70
  2182
		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
ali@59
  2183
	    else
ali@59
  2184
		cnt_punct++;
ali@59
  2185
	}
ali@59
  2186
    }
ali@59
  2187
}
ali@59
  2188
ali@59
  2189
/*
ali@60
  2190
 * check_end_of_line:
ali@60
  2191
 *
ali@60
  2192
 * Now check special cases - start and end of line -
ali@60
  2193
 * for single and double quotes. Start is sometimes [sic]
ali@60
  2194
 * but better to query it anyway.
ali@60
  2195
 * While we're here, check for dash at end of line.
ali@60
  2196
 */
ali@60
  2197
void check_end_of_line(const char *aline,struct warnings *warnings)
ali@60
  2198
{
ali@70
  2199
    int lbytes;
ali@70
  2200
    const char *s;
ali@70
  2201
    gunichar c1,c2;
ali@70
  2202
    lbytes=strlen(aline);
ali@70
  2203
    if (g_utf8_strlen(aline,lbytes)>1)
ali@60
  2204
    {
ali@70
  2205
	s=g_utf8_prev_char(aline+lbytes);
ali@70
  2206
	c1=g_utf8_get_char(s);
ali@70
  2207
	c2=g_utf8_get_char(g_utf8_prev_char(s));
ali@70
  2208
	if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
ali@70
  2209
	  c2==CHAR_SPACE)
ali@60
  2210
	{
ali@60
  2211
	    if (pswit[ECHO_SWITCH])
ali@70
  2212
		g_print("\n%s\n",aline);
ali@60
  2213
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2214
		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
ali@70
  2215
		  g_utf8_strlen(aline,lbytes));
ali@70
  2216
	    else
ali@70
  2217
		cnt_punct++;
ali@70
  2218
	}
ali@70
  2219
	c1=g_utf8_get_char(aline);
ali@70
  2220
	c2=g_utf8_get_char(g_utf8_next_char(aline));
ali@70
  2221
	if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
ali@70
  2222
	{
ali@70
  2223
	    if (pswit[ECHO_SWITCH])
ali@70
  2224
		g_print("\n%s\n",aline);
ali@70
  2225
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2226
		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
ali@60
  2227
	    else
ali@60
  2228
		cnt_punct++;
ali@60
  2229
	}
ali@60
  2230
	/*
ali@60
  2231
	 * Dash at end of line may well be legit - paranoid mode only
ali@60
  2232
	 * and don't report em-dash at line-end.
ali@60
  2233
	 */
ali@60
  2234
	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
ali@60
  2235
	{
ali@70
  2236
	    for (s=g_utf8_prev_char(aline+lbytes);
ali@70
  2237
	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
ali@60
  2238
		;
ali@70
  2239
	    if (g_utf8_get_char(s)=='-' &&
ali@70
  2240
	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
ali@60
  2241
	    {
ali@60
  2242
		if (pswit[ECHO_SWITCH])
ali@70
  2243
		    g_print("\n%s\n",aline);
ali@60
  2244
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2245
		    g_print("    Line %ld column %ld - "
ali@70
  2246
		      "Hyphen at end of line?\n",
ali@70
  2247
		      linecnt,g_utf8_pointer_to_offset(aline,s));
ali@60
  2248
	    }
ali@60
  2249
	}
ali@60
  2250
    }
ali@60
  2251
}
ali@60
  2252
ali@60
  2253
/*
ali@61
  2254
 * check_for_unspaced_bracket:
ali@61
  2255
 *
ali@61
  2256
 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
ali@61
  2257
 * If so, suspect a scanno like "a]most".
ali@61
  2258
 */
ali@61
  2259
void check_for_unspaced_bracket(const char *aline)
ali@61
  2260
{
ali@70
  2261
    const char *s;
ali@70
  2262
    gunichar c,nc,pc;
ali@70
  2263
    c=g_utf8_get_char(aline);
ali@70
  2264
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2265
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@61
  2266
    {
ali@70
  2267
	pc=c;
ali@70
  2268
	c=nc;
ali@70
  2269
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@70
  2270
	if (!nc)
ali@70
  2271
	    break;
ali@61
  2272
	/* for each bracket character in the line except 1st & last */
ali@70
  2273
	if (g_utf8_strchr("{[()]}",-1,c) &&
ali@70
  2274
	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
ali@61
  2275
	{
ali@61
  2276
	    if (pswit[ECHO_SWITCH])
ali@70
  2277
		g_print("\n%s\n",aline);
ali@61
  2278
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2279
		g_print("    Line %ld column %ld - Unspaced bracket?\n",
ali@70
  2280
		  linecnt,g_utf8_pointer_to_offset(aline,s));
ali@61
  2281
	    else
ali@61
  2282
		cnt_punct++;
ali@61
  2283
	}
ali@61
  2284
    }
ali@61
  2285
}
ali@61
  2286
ali@61
  2287
/*
ali@62
  2288
 * check_for_unpunctuated_endquote:
ali@62
  2289
 */
ali@62
  2290
void check_for_unpunctuated_endquote(const char *aline)
ali@62
  2291
{
ali@70
  2292
    const char *s;
ali@70
  2293
    gunichar c,nc,pc;
ali@70
  2294
    c=g_utf8_get_char(aline);
ali@70
  2295
    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
ali@70
  2296
    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
ali@62
  2297
    {
ali@70
  2298
	pc=c;
ali@70
  2299
	c=nc;
ali@70
  2300
	nc=g_utf8_get_char(g_utf8_next_char(s));
ali@62
  2301
	/* for each character in the line except 1st */
ali@70
  2302
	if (c==CHAR_DQUOTE && isalpha(pc))
ali@62
  2303
	{
ali@62
  2304
	    if (pswit[ECHO_SWITCH])
ali@70
  2305
		g_print("\n%s\n",aline);
ali@62
  2306
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2307
		g_print("    Line %ld column %ld - "
ali@70
  2308
		  "endquote missing punctuation?\n",
ali@70
  2309
		  linecnt,g_utf8_pointer_to_offset(aline,s));
ali@62
  2310
	    else
ali@62
  2311
		cnt_punct++;
ali@62
  2312
	}
ali@62
  2313
    }
ali@62
  2314
}
ali@62
  2315
ali@62
  2316
/*
ali@63
  2317
 * check_for_html_tag:
ali@63
  2318
 *
ali@63
  2319
 * Check for <HTML TAG>.
ali@63
  2320
 *
ali@63
  2321
 * If there is a < in the line, followed at some point
ali@63
  2322
 * by a > then we suspect HTML.
ali@63
  2323
 */
ali@63
  2324
void check_for_html_tag(const char *aline)
ali@63
  2325
{
ali@63
  2326
    const char *open,*close;
ali@70
  2327
    gchar *tag;
ali@70
  2328
    open=strchr(aline,'<');
ali@63
  2329
    if (open)
ali@63
  2330
    {
ali@70
  2331
	close=strchr(g_utf8_next_char(open),'>');
ali@63
  2332
	if (close)
ali@63
  2333
	{
ali@70
  2334
	    if (pswit[ECHO_SWITCH])
ali@70
  2335
		g_print("\n%s\n",aline);
ali@70
  2336
	    if (!pswit[OVERVIEW_SWITCH])
ali@63
  2337
	    {
ali@70
  2338
		tag=g_strndup(open,close-open+1);
ali@70
  2339
		g_print("    Line %ld column %ld - HTML Tag? %s \n",
ali@70
  2340
		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
ali@70
  2341
		g_free(tag);
ali@63
  2342
	    }
ali@70
  2343
	    else
ali@70
  2344
		cnt_html++;
ali@63
  2345
	}
ali@63
  2346
    }
ali@63
  2347
}
ali@63
  2348
ali@63
  2349
/*
ali@64
  2350
 * check_for_html_entity:
ali@64
  2351
 *
ali@64
  2352
 * Check for &symbol; HTML.
ali@64
  2353
 *
ali@64
  2354
 * If there is a & in the line, followed at
ali@64
  2355
 * some point by a ; then we suspect HTML.
ali@64
  2356
 */
ali@64
  2357
void check_for_html_entity(const char *aline)
ali@64
  2358
{
ali@64
  2359
    const char *s,*amp,*scolon;
ali@70
  2360
    gchar *entity;
ali@70
  2361
    amp=strchr(aline,'&');
ali@64
  2362
    if (amp)
ali@64
  2363
    {
ali@70
  2364
	scolon=strchr(amp,';');
ali@64
  2365
	if (scolon)
ali@64
  2366
	{
ali@70
  2367
	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
ali@70
  2368
		if (g_utf8_get_char(s)==CHAR_SPACE)
ali@70
  2369
		    break;		/* Don't report "Jones & Son;" */
ali@70
  2370
	    if (s>=scolon)
ali@64
  2371
	    {
ali@64
  2372
		if (pswit[ECHO_SWITCH])
ali@70
  2373
		    g_print("\n%s\n",aline);
ali@64
  2374
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2375
		{
ali@70
  2376
		    entity=g_strndup(amp,scolon-amp+1);
ali@70
  2377
		    g_print("    Line %ld column %d - HTML symbol? %s \n",
ali@70
  2378
		      linecnt,(int)(amp-aline)+1,entity);
ali@70
  2379
		    g_free(entity);
ali@70
  2380
		}
ali@64
  2381
		else
ali@64
  2382
		    cnt_html++;
ali@64
  2383
	    }
ali@64
  2384
	}
ali@64
  2385
    }
ali@64
  2386
}
ali@64
  2387
ali@65
  2388
/*
ali@65
  2389
 * print_pending:
ali@65
  2390
 *
ali@65
  2391
 * If we are in a state of unbalanced quotes, and this line
ali@65
  2392
 * doesn't begin with a quote, output the stored error message.
ali@65
  2393
 * If the -P switch was used, print the warning even if the
ali@65
  2394
 * new para starts with quotes.
ali@65
  2395
 */
ali@65
  2396
void print_pending(const char *aline,const char *parastart,
ali@65
  2397
  struct pending *pending)
ali@65
  2398
{
ali@65
  2399
    const char *s;
ali@70
  2400
    gunichar c;
ali@65
  2401
    s=aline;
ali@65
  2402
    while (*s==' ')
ali@65
  2403
	s++;
ali@70
  2404
    c=g_utf8_get_char(s);
ali@69
  2405
    if (pending->dquote)
ali@69
  2406
    {
ali@70
  2407
	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
ali@65
  2408
	{
ali@65
  2409
	    if (!pswit[OVERVIEW_SWITCH])
ali@65
  2410
	    {
ali@65
  2411
		if (pswit[ECHO_SWITCH])
ali@70
  2412
		    g_print("\n%s\n",parastart);
ali@70
  2413
		g_print("%s\n",pending->dquote);
ali@65
  2414
	    }
ali@65
  2415
	    else
ali@65
  2416
		cnt_dquot++;
ali@65
  2417
	}
ali@69
  2418
	g_free(pending->dquote);
ali@69
  2419
	pending->dquote=NULL;
ali@69
  2420
    }
ali@69
  2421
    if (pending->squote)
ali@65
  2422
    {
ali@70
  2423
	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
ali@65
  2424
	  pending->squot)
ali@65
  2425
	{
ali@65
  2426
	    if (!pswit[OVERVIEW_SWITCH])
ali@65
  2427
	    {
ali@65
  2428
		if (pswit[ECHO_SWITCH])
ali@70
  2429
		    g_print("\n%s\n",parastart);
ali@70
  2430
		g_print("%s\n",pending->squote);
ali@65
  2431
	    }
ali@65
  2432
	    else
ali@65
  2433
		cnt_squot++;
ali@65
  2434
	}
ali@69
  2435
	g_free(pending->squote);
ali@69
  2436
	pending->squote=NULL;
ali@65
  2437
    }
ali@69
  2438
    if (pending->rbrack)
ali@65
  2439
    {
ali@65
  2440
	if (!pswit[OVERVIEW_SWITCH])
ali@65
  2441
	{
ali@65
  2442
	    if (pswit[ECHO_SWITCH])
ali@70
  2443
		g_print("\n%s\n",parastart);
ali@70
  2444
	    g_print("%s\n",pending->rbrack);
ali@65
  2445
	}
ali@65
  2446
	else
ali@65
  2447
	    cnt_brack++;
ali@69
  2448
	g_free(pending->rbrack);
ali@69
  2449
	pending->rbrack=NULL;
ali@65
  2450
    }
ali@69
  2451
    if (pending->sbrack)
ali@65
  2452
    {
ali@65
  2453
	if (!pswit[OVERVIEW_SWITCH])
ali@65
  2454
	{
ali@65
  2455
	    if (pswit[ECHO_SWITCH])
ali@70
  2456
		g_print("\n%s\n",parastart);
ali@70
  2457
	    g_print("%s\n",pending->sbrack);
ali@65
  2458
	}
ali@65
  2459
	else
ali@65
  2460
	    cnt_brack++;
ali@69
  2461
	g_free(pending->sbrack);
ali@69
  2462
	pending->sbrack=NULL;
ali@65
  2463
    }
ali@69
  2464
    if (pending->cbrack)
ali@65
  2465
    {
ali@65
  2466
	if (!pswit[OVERVIEW_SWITCH])
ali@65
  2467
	{
ali@65
  2468
	    if (pswit[ECHO_SWITCH])
ali@70
  2469
		g_print("\n%s\n",parastart);
ali@70
  2470
	    g_print("%s\n",pending->cbrack);
ali@65
  2471
	}
ali@65
  2472
	else
ali@65
  2473
	    cnt_brack++;
ali@69
  2474
	g_free(pending->cbrack);
ali@69
  2475
	pending->cbrack=NULL;
ali@65
  2476
    }
ali@69
  2477
    if (pending->unders)
ali@65
  2478
    {
ali@65
  2479
	if (!pswit[OVERVIEW_SWITCH])
ali@65
  2480
	{
ali@65
  2481
	    if (pswit[ECHO_SWITCH])
ali@70
  2482
		g_print("\n%s\n",parastart);
ali@70
  2483
	    g_print("%s\n",pending->unders);
ali@65
  2484
	}
ali@65
  2485
	else
ali@65
  2486
	    cnt_brack++;
ali@69
  2487
	g_free(pending->unders);
ali@69
  2488
	pending->unders=NULL;
ali@65
  2489
    }
ali@65
  2490
}
ali@65
  2491
ali@65
  2492
/*
ali@65
  2493
 * check_for_mismatched_quotes:
ali@65
  2494
 *
ali@65
  2495
 * At end of paragraph, check for mismatched quotes.
ali@65
  2496
 *
ali@65
  2497
 * We don't want to report an error immediately, since it is a
ali@65
  2498
 * common convention to omit the quotes at end of paragraph if
ali@65
  2499
 * the next paragraph is a continuation of the same speaker.
ali@65
  2500
 * Where this is the case, the next para should begin with a
ali@65
  2501
 * quote, so we store the warning message and only display it
ali@65
  2502
 * at the top of the next iteration if the new para doesn't
ali@65
  2503
 * start with a quote.
ali@65
  2504
 * The -p switch overrides this default, and warns of unclosed
ali@65
  2505
 * quotes on _every_ paragraph, whether the next begins with a
ali@65
  2506
 * quote or not.
ali@65
  2507
 */
ali@65
  2508
void check_for_mismatched_quotes(const struct counters *counters,
ali@65
  2509
  struct pending *pending)
ali@65
  2510
{
ali@65
  2511
    if (counters->quot%2)
ali@69
  2512
	pending->dquote=
ali@69
  2513
	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
ali@65
  2514
    if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
ali@65
  2515
      counters->open_single_quote!=counters->close_single_quote)
ali@69
  2516
	pending->squote=
ali@69
  2517
	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
ali@65
  2518
    if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
ali@65
  2519
      counters->open_single_quote!=counters->close_single_quote &&
ali@65
  2520
      counters->open_single_quote!=counters->close_single_quote+1)
ali@65
  2521
	/*
ali@65
  2522
	 * Flag it to be noted regardless of the
ali@65
  2523
	 * first char of the next para.
ali@65
  2524
	 */
ali@65
  2525
	pending->squot=1;
ali@65
  2526
    if (counters->r_brack)
ali@69
  2527
	pending->rbrack=
ali@69
  2528
	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
ali@65
  2529
    if (counters->s_brack)
ali@69
  2530
	pending->sbrack=
ali@69
  2531
	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
ali@65
  2532
    if (counters->c_brack)
ali@69
  2533
	pending->cbrack=
ali@69
  2534
	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
ali@65
  2535
    if (counters->c_unders%2)
ali@69
  2536
	pending->unders=
ali@69
  2537
	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
ali@65
  2538
}
ali@65
  2539
ali@64
  2540
/*
ali@66
  2541
 * check_for_omitted_punctuation:
ali@66
  2542
 *
ali@66
  2543
 * Check for omitted punctuation at end of paragraph by working back
ali@66
  2544
 * through prevline. DW.
ali@66
  2545
 * Need to check this only for "normal" paras.
ali@66
  2546
 * So what is a "normal" para?
ali@66
  2547
 *    Not normal if one-liner (chapter headings, etc.)
ali@66
  2548
 *    Not normal if doesn't contain at least one locase letter
ali@66
  2549
 *    Not normal if starts with space
ali@66
  2550
 */
ali@66
  2551
void check_for_omitted_punctuation(const char *prevline,
ali@66
  2552
  struct line_properties *last,int start_para_line)
ali@66
  2553
{
ali@70
  2554
    gboolean letter_on_line=FALSE;
ali@66
  2555
    const char *s;
ali@70
  2556
    for (s=prevline;*s;s=g_utf8_next_char(s))
ali@70
  2557
	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70
  2558
	{
ali@70
  2559
	    letter_on_line=TRUE;
ali@70
  2560
	    break;
ali@70
  2561
	}
ali@66
  2562
    /*
ali@66
  2563
     * This next "if" is a problem.
ali@66
  2564
     * If we say "start_para_line <= linecnt - 1", that includes
ali@66
  2565
     * one-line "paragraphs" like chapter heads. Lotsa false positives.
ali@66
  2566
     * If we say "start_para_line < linecnt - 1" it doesn't, but then it
ali@66
  2567
     * misses genuine one-line paragraphs.
ali@66
  2568
     */
ali@70
  2569
    if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
ali@70
  2570
      g_utf8_get_char(prevline)>CHAR_SPACE)
ali@66
  2571
    {
ali@70
  2572
	for (s=g_utf8_prev_char(prevline+strlen(prevline));
ali@70
  2573
	  (g_utf8_get_char(s)==CHAR_DQUOTE ||
ali@70
  2574
	  g_utf8_get_char(s)==CHAR_SQUOTE) &&
ali@70
  2575
	  g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
ali@70
  2576
	  s=g_utf8_prev_char(s))
ali@66
  2577
	    ;
ali@70
  2578
	for (;s>prevline;s=g_utf8_prev_char(s))
ali@66
  2579
	{
ali@70
  2580
	    if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@66
  2581
	    {
ali@66
  2582
		if (pswit[ECHO_SWITCH])
ali@70
  2583
		    g_print("\n%s\n",prevline);
ali@66
  2584
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2585
		    g_print("    Line %ld column %ld - "
ali@66
  2586
		      "No punctuation at para end?\n",
ali@70
  2587
		      linecnt-1,g_utf8_strlen(prevline,-1));
ali@66
  2588
		else
ali@66
  2589
		    cnt_punct++;
ali@66
  2590
		break;
ali@66
  2591
	    }
ali@70
  2592
	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
ali@66
  2593
		break;
ali@66
  2594
	}
ali@66
  2595
    }
ali@66
  2596
}
ali@66
  2597
ali@69
  2598
/*
 * report_duplicate_queries:
 *
 * Tree traversal callback: key is a queried word and value points to
 * the int holding its duplicate count. A note is printed for every
 * word whose count is non-zero. data is unused.
 *
 * Returns: FALSE always.
 */
gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
{
    const int duplicates=*(const int *)value;
    if (duplicates)
	g_print("\nNote: Queried word %s was duplicated %d times\n",
	  (const char *)key,duplicates);
    return FALSE;
}
ali@69
  2607
ali@70
  2608
void print_as_windows_1252(const char *string)
ali@70
  2609
{
ali@70
  2610
    gsize inbytes,outbytes;
ali@70
  2611
    gchar *buf,*bp;
ali@70
  2612
    GIConv converter=(GIConv)-1;
ali@70
  2613
    if (!string)
ali@70
  2614
    {
ali@70
  2615
	if (converter!=(GIConv)-1)
ali@70
  2616
	    g_iconv_close(converter);
ali@70
  2617
	converter=(GIConv)-1;
ali@70
  2618
	return;
ali@70
  2619
    }
ali@70
  2620
    if (converter=(GIConv)-1)
ali@70
  2621
	converter=g_iconv_open("WINDOWS-1252","UTF-8");
ali@70
  2622
    if (converter!=(GIConv)-1)
ali@70
  2623
    {
ali@70
  2624
	inbytes=outbytes=strlen(string);
ali@70
  2625
	bp=buf=g_malloc(outbytes+1);
ali@70
  2626
	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
ali@70
  2627
	*bp='\0';
ali@70
  2628
	fputs(buf,stdout);
ali@70
  2629
	g_free(buf);
ali@70
  2630
    }
ali@70
  2631
    else
ali@70
  2632
	fputs(string,stdout);
ali@70
  2633
}
ali@70
  2634
ali@72
  2635
/*
 * print_as_utf_8:
 *
 * Write string to stdout exactly as it is, with no conversion.
 */
void print_as_utf_8(const char *string)
{
    (void)fputs(string,stdout);
}
ali@72
  2639
ali@66
  2640
/*
ali@41
  2641
 * procfile:
ali@41
  2642
 *
ali@41
  2643
 * Process one file.
ali@41
  2644
 */
ali@69
  2645
void procfile(const char *filename)
ali@41
  2646
{
ali@65
  2647
    const char *s;
ali@69
  2648
    gchar *parastart=NULL;	/* first line of current para */
ali@69
  2649
    gchar *etext,*aline;
ali@69
  2650
    gchar *etext_ptr;
ali@69
  2651
    GError *err=NULL;
ali@41
  2652
    struct first_pass_results *first_pass_results;
ali@42
  2653
    struct warnings *warnings;
ali@43
  2654
    struct counters counters={0};
ali@45
  2655
    struct line_properties last={0};
ali@56
  2656
    struct parities parities={0};
ali@69
  2657
    struct pending pending={0};
ali@69
  2658
    gboolean isemptyline;
ali@68
  2659
    long start_para_line=0;
ali@69
  2660
    gboolean isnewpara=FALSE,enddash=FALSE;
ali@45
  2661
    last.start=CHAR_SPACE;
ali@68
  2662
    linecnt=checked_linecnt=0;
ali@69
  2663
    etext=read_etext(filename,&err);
ali@69
  2664
    if (!etext)
ali@41
  2665
    {
ali@68
  2666
	if (pswit[STDOUT_SWITCH])
ali@69
  2667
	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
ali@68
  2668
	else
ali@69
  2669
	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
ali@41
  2670
	exit(1);
ali@41
  2671
    }
ali@70
  2672
    g_print("\n\nFile: %s\n\n",filename);
ali@69
  2673
    first_pass_results=first_pass(etext);
ali@42
  2674
    warnings=report_first_pass(first_pass_results);
ali@69
  2675
    qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
ali@69
  2676
    qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
ali@40
  2677
    /*
ali@40
  2678
     * Here we go with the main pass. Hold onto yer hat!
ali@40
  2679
     */
ali@65
  2680
    linecnt=0;
ali@69
  2681
    etext_ptr=etext;
ali@69
  2682
    while ((aline=flgets(&etext_ptr,linecnt+1)))
ali@40
  2683
    {
ali@68
  2684
	linecnt++;
ali@68
  2685
	if (linecnt==1)
ali@69
  2686
	    isnewpara=TRUE;
ali@70
  2687
	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
ali@40
  2688
	    continue;    // skip DP page separators completely
ali@68
  2689
	if (linecnt<first_pass_results->firstline ||
ali@41
  2690
	  (first_pass_results->footerline>0 &&
ali@41
  2691
	  linecnt>first_pass_results->footerline))
ali@40
  2692
	{
ali@68
  2693
	    if (pswit[HEADER_SWITCH])
ali@40
  2694
	    {
ali@70
  2695
		if (g_str_has_prefix(aline,"Title:"))
ali@70
  2696
		    g_print("    %s\n",aline);
ali@70
  2697
		if (g_str_has_prefix(aline,"Author:"))
ali@70
  2698
		    g_print("    %s\n",aline);
ali@70
  2699
		if (g_str_has_prefix(aline,"Release Date:"))
ali@70
  2700
		    g_print("    %s\n",aline);
ali@70
  2701
		if (g_str_has_prefix(aline,"Edition:"))
ali@70
  2702
		    g_print("    %s\n\n",aline);
ali@40
  2703
	    }
ali@68
  2704
	    continue;		/* skip through the header */
ali@40
  2705
	}
ali@68
  2706
	checked_linecnt++;
ali@65
  2707
	print_pending(aline,parastart,&pending);
ali@65
  2708
	memset(&pending,0,sizeof(pending));
ali@43
  2709
	isemptyline=analyse_quotes(aline,&counters);
ali@68
  2710
	if (isnewpara && !isemptyline)
ali@40
  2711
	{
ali@40
  2712
	    /* This line is the start of a new paragraph. */
ali@68
  2713
	    start_para_line=linecnt;
ali@40
  2714
	    /* Capture its first line in case we want to report it later. */
ali@69
  2715
	    g_free(parastart);
ali@69
  2716
	    parastart=g_strdup(aline);
ali@56
  2717
	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
ali@68
  2718
	    s=aline;
ali@70
  2719
	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
ali@70
  2720
	      !g_unichar_isdigit(g_utf8_get_char(s)))
ali@70
  2721
		s=g_utf8_next_char(s);
ali@70
  2722
	    if (g_unichar_islower(g_utf8_get_char(s)))
ali@40
  2723
	    {
ali@40
  2724
		/* and its first letter is lowercase */
ali@68
  2725
		if (pswit[ECHO_SWITCH])
ali@70
  2726
		    g_print("\n%s\n",aline);
ali@68
  2727
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2728
		    g_print("    Line %ld column %ld - "
ali@40
  2729
		      "Paragraph starts with lower-case\n",
ali@70
  2730
		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
ali@68
  2731
		else
ali@68
  2732
		    cnt_punct++;
ali@40
  2733
	    }
ali@69
  2734
	    isnewpara=FALSE; /* Signal the end of new para processing. */
ali@40
  2735
	}
ali@68
  2736
	/* Check for an em-dash broken at line end. */
ali@70
  2737
	if (enddash && g_utf8_get_char(aline)=='-')
ali@40
  2738
	{
ali@68
  2739
	    if (pswit[ECHO_SWITCH])
ali@70
  2740
		g_print("\n%s\n",aline);
ali@68
  2741
	    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2742
		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
ali@68
  2743
	    else
ali@68
  2744
		cnt_punct++;
ali@40
  2745
	}
ali@69
  2746
	enddash=FALSE;
ali@70
  2747
	for (s=g_utf8_prev_char(aline+strlen(aline));
ali@70
  2748
	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
ali@40
  2749
	    ;
ali@70
  2750
	if (s>=aline && g_utf8_get_char(s)=='-')
ali@69
  2751
	    enddash=TRUE;
ali@67
  2752
	check_for_control_characters(aline);
ali@68
  2753
	if (warnings->bin)
ali@44
  2754
	    check_for_odd_characters(aline,warnings,isemptyline);
ali@68
  2755
	if (warnings->longline)
ali@45
  2756
	    check_for_long_line(aline);
ali@68
  2757
	if (warnings->shortline)
ali@45
  2758
	    check_for_short_line(aline,&last);
ali@68
  2759
	last.blen=last.len;
ali@70
  2760
	last.len=g_utf8_strlen(aline,-1);
ali@70
  2761
	last.start=g_utf8_get_char(aline);
ali@46
  2762
	check_for_starting_punctuation(aline);
ali@68
  2763
	if (warnings->dash)
ali@40
  2764
	{
ali@47
  2765
	    check_for_spaced_emdash(aline);
ali@47
  2766
	    check_for_spaced_dash(aline);
ali@40
  2767
	}
ali@48
  2768
	check_for_unmarked_paragraphs(aline);
ali@49
  2769
	check_for_jeebies(aline);
ali@50
  2770
	check_for_mta_from(aline);
ali@51
  2771
	check_for_orphan_character(aline);
ali@52
  2772
	check_for_pling_scanno(aline);
ali@53
  2773
	check_for_extra_period(aline,warnings);
ali@54
  2774
	check_for_following_punctuation(aline);
ali@55
  2775
	check_for_typos(aline,warnings);
ali@56
  2776
	check_for_misspaced_punctuation(aline,&parities,isemptyline);
ali@57
  2777
	check_for_double_punctuation(aline,warnings);
ali@58
  2778
	check_for_spaced_quotes(aline);
ali@59
  2779
	check_for_miscased_genative(aline);
ali@60
  2780
	check_end_of_line(aline,warnings);
ali@61
  2781
	check_for_unspaced_bracket(aline);
ali@68
  2782
	if (warnings->endquote)
ali@62
  2783
	    check_for_unpunctuated_endquote(aline);
ali@63
  2784
	check_for_html_tag(aline);
ali@64
  2785
	check_for_html_entity(aline);
ali@68
  2786
	if (isemptyline)
ali@40
  2787
	{
ali@65
  2788
	    check_for_mismatched_quotes(&counters,&pending);
ali@43
  2789
	    memset(&counters,0,sizeof(counters));
ali@40
  2790
	    /* let the next iteration know that it's starting a new para */
ali@69
  2791
	    isnewpara=TRUE;
ali@69
  2792
	    if (prevline)
ali@69
  2793
		check_for_omitted_punctuation(prevline,&last,start_para_line);
ali@40
  2794
	}
ali@69
  2795
	g_free(prevline);
ali@69
  2796
	prevline=g_strdup(aline);
ali@0
  2797
    }
ali@69
  2798
    if (prevline)
ali@69
  2799
    {
ali@69
  2800
	g_free(prevline);
ali@69
  2801
	prevline=NULL;
ali@69
  2802
    }
ali@69
  2803
    g_free(parastart);
ali@69
  2804
    g_free(prevline);
ali@69
  2805
    g_free(etext);
ali@0
  2806
    if (!pswit[OVERVIEW_SWITCH])
ali@69
  2807
	g_tree_foreach(qword,report_duplicate_queries,NULL);
ali@69
  2808
    g_tree_unref(qword);
ali@69
  2809
    g_tree_unref(qperiod);
ali@70
  2810
    g_set_print_handler(NULL);
ali@70
  2811
    print_as_windows_1252(NULL);
ali@71
  2812
    if (pswit[MARKUP_SWITCH])  
ali@71
  2813
	loseentities(NULL);
ali@0
  2814
}
ali@0
  2815
ali@40
  2816
/*
ali@40
  2817
 * flgets:
ali@40
  2818
 *
ali@69
  2819
 * Get one line from the input text, checking for
ali@40
  2820
 * the existence of exactly one CR/LF line-end per line.
ali@40
  2821
 *
ali@40
  2822
 * Returns: a pointer to the line.
ali@40
  2823
 */
ali@69
  2824
char *flgets(char **etext,long lcnt)
ali@0
  2825
{
ali@70
  2826
    gunichar c;
ali@69
  2827
    gboolean isCR=FALSE;
ali@69
  2828
    char *theline=*etext;
ali@70
  2829
    char *eos=theline;
ali@70
  2830
    gchar *s;
ali@70
  2831
    for (;;)
ali@40
  2832
    {
ali@70
  2833
	c=g_utf8_get_char(*etext);
ali@70
  2834
	*etext=g_utf8_next_char(*etext);
ali@69
  2835
	if (!c)
ali@68
  2836
	    return NULL;
ali@40
  2837
	/* either way, it's end of line */
ali@69
  2838
	if (c=='\n')
ali@40
  2839
	{
ali@68
  2840
	    if (isCR)
ali@68
  2841
		break;
ali@68
  2842
	    else
ali@40
  2843
	    {
ali@40
  2844
		/* Error - a LF without a preceding CR */
ali@68
  2845
		if (pswit[LINE_END_SWITCH])
ali@40
  2846
		{
ali@68
  2847
		    if (pswit[ECHO_SWITCH])
ali@70
  2848
		    {
ali@70
  2849
			s=g_strndup(theline,eos-theline);
ali@70
  2850
			g_print("\n%s\n",s);
ali@70
  2851
			g_free(s);
ali@70
  2852
		    }
ali@68
  2853
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2854
			g_print("    Line %ld - No CR?\n",lcnt);
ali@68
  2855
		    else
ali@68
  2856
			cnt_lineend++;
ali@40
  2857
		}
ali@68
  2858
		break;
ali@40
  2859
	    }
ali@40
  2860
	}
ali@69
  2861
	if (c=='\r')
ali@40
  2862
	{
ali@68
  2863
	    if (isCR)
ali@40
  2864
	    {
ali@40
  2865
		/* Error - two successive CRs */
ali@68
  2866
		if (pswit[LINE_END_SWITCH])
ali@40
  2867
		{
ali@68
  2868
		    if (pswit[ECHO_SWITCH])
ali@70
  2869
		    {
ali@70
  2870
			s=g_strndup(theline,eos-theline);
ali@70
  2871
			g_print("\n%s\n",s);
ali@70
  2872
			g_free(s);
ali@70
  2873
		    }
ali@68
  2874
		    if (!pswit[OVERVIEW_SWITCH])
ali@70
  2875
			g_print("    Line %ld - Two successive CRs?\n",lcnt);
ali@68
  2876
		    else
ali@68
  2877
			cnt_lineend++;
ali@40
  2878
		}
ali@40
  2879
	    }
ali@69
  2880
	    isCR=TRUE;
ali@40
  2881
	}
ali@68
  2882
	else
ali@40
  2883
	{
ali@68
  2884
	    if (pswit[LINE_END_SWITCH] && isCR)
ali@40
  2885
	    {
ali@68
  2886
		if (pswit[ECHO_SWITCH])
ali@70
  2887
		{
ali@70
  2888
		    s=g_strndup(theline,eos-theline);
ali@70
  2889
		    g_print("\n%s\n",s);
ali@70
  2890
		    g_free(s);
ali@70
  2891
		}
ali@68
  2892
		if (!pswit[OVERVIEW_SWITCH])
ali@70
  2893
		    g_print("    Line %ld column %ld - CR without LF?\n",
ali@70
  2894
		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
ali@68
  2895
		else
ali@68
  2896
		    cnt_lineend++;
ali@70
  2897
		*eos=' ';
ali@40
  2898
	    }
ali@69
  2899
	    isCR=FALSE;
ali@70
  2900
	    eos=g_utf8_next_char(eos);
ali@40
  2901
	}
ali@69
  2902
    }
ali@70
  2903
    *eos='\0';
ali@0
  2904
    if (pswit[MARKUP_SWITCH])  
ali@68
  2905
	postprocess_for_HTML(theline);
ali@0
  2906
    if (pswit[DP_SWITCH])  
ali@68
  2907
	postprocess_for_DP(theline);
ali@40
  2908
    return theline;
ali@0
  2909
}
ali@0
  2910
ali@40
  2911
/*
ali@40
  2912
 * mixdigit:
ali@40
  2913
 *
ali@40
  2914
 * Takes a "word" as a parameter, and checks whether it
ali@40
  2915
 * contains a mixture of alpha and digits. Generally, this is an
ali@40
  2916
 * error, but may not be for cases like 4th or L5 12s. 3d.
ali@40
  2917
 *
ali@70
  2918
 * Returns: TRUE iff an is error found.
ali@40
  2919
 */
ali@70
  2920
gboolean mixdigit(const char *checkword)
ali@0
  2921
{
ali@70
  2922
    gboolean wehaveadigit,wehavealetter,query;
ali@70
  2923
    const char *s,*nondigit;
ali@70
  2924
    wehaveadigit=wehavealetter=query=FALSE;
ali@70
  2925
    for (s=checkword;*s;s=g_utf8_next_char(s))
ali@70
  2926
	if (g_unichar_isalpha(g_utf8_get_char(s)))
ali@70
  2927
	    wehavealetter=TRUE;
ali@70
  2928
	else if (g_unichar_isdigit(g_utf8_get_char(s)))
ali@70
  2929
	    wehaveadigit=TRUE;
ali@40
  2930
    if (wehaveadigit && wehavealetter)
ali@40
  2931
    {
ali@40
  2932
	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
ali@70
  2933
	query=TRUE;
ali@70
  2934
	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
ali@70
  2935
	  nondigit=g_utf8_next_char(nondigit))
ali@68
  2936
	    ;
ali@68
  2937
	/* digits, ending in st, rd, nd, th of either case */
ali@70
  2938
	if (!g_ascii_strcasecmp(nondigit,"st") ||
ali@70
  2939
	  !g_ascii_strcasecmp(nondigit,"rd") ||
ali@70
  2940
	  !g_ascii_strcasecmp(nondigit,"nd") ||
ali@70
  2941
	  !g_ascii_strcasecmp(nondigit,"th"))
ali@70
  2942
	    query=FALSE;
ali@70
  2943
	if (!g_ascii_strcasecmp(nondigit,"sts") ||
ali@70
  2944
	  !g_ascii_strcasecmp(nondigit,"rds") ||
ali@70
  2945
	  !g_ascii_strcasecmp(nondigit,"nds") ||
ali@70
  2946
	  !g_ascii_strcasecmp(nondigit,"ths"))
ali@70
  2947
	    query=FALSE;
ali@70
  2948
	if (!g_ascii_strcasecmp(nondigit,"stly") ||
ali@70
  2949
	  !g_ascii_strcasecmp(nondigit,"rdly") ||
ali@70
  2950
	  !g_ascii_strcasecmp(nondigit,"ndly") ||
ali@70
  2951
	  !g_ascii_strcasecmp(nondigit,"thly"))
ali@70
  2952
	    query=FALSE;
ali@68
  2953
	/* digits, ending in l, L, s or d */
ali@70
  2954
	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
ali@70
  2955
	  !strcmp(nondigit,"d"))
ali@70
  2956
	    query=FALSE;
ali@68
  2957
	/*
ali@40
  2958
	 * L at the start of a number, representing Britsh pounds, like L500.
ali@70
  2959
	 * This is cute. We know the current word is mixed digit. If the first
ali@68
  2960
	 * letter is L, there must be at least one digit following. If both
ali@68
  2961
	 * digits and letters follow, we have a genuine error, else we have a
ali@68
  2962
	 * capital L followed by digits, and we accept that as a non-error.
ali@40
  2963
	 */
ali@70
  2964
	if (g_utf8_get_char(checkword)=='L' &&
ali@70
  2965
	  !mixdigit(g_utf8_next_char(checkword)))
ali@70
  2966
	    query=FALSE;
ali@40
  2967
    }
ali@40
  2968
    return query;
ali@0
  2969
}
ali@0
  2970
ali@40
  2971
/*
ali@40
  2972
 * getaword:
ali@40
  2973
 *
ali@69
  2974
 * Extracts the first/next "word" from the line, and returns it.
ali@69
  2975
 * A word is defined as one English word unit--or at least that's the aim.
ali@69
  2976
 * "ptr" is advanced to the position in the line where we will start
ali@69
  2977
 * looking for the next word.
ali@40
  2978
 *
ali@69
  2979
 * Returns: A newly-allocated string.
ali@40
  2980
 */
ali@69
  2981
gchar *getaword(const char **ptr)
ali@0
  2982
{
ali@70
  2983
    const char *s,*t;
ali@69
  2984
    GString *word;
ali@70
  2985
    gunichar c,pc;
ali@69
  2986
    word=g_string_new(NULL);
ali@70
  2987
    for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
ali@70
  2988
      !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
ali@70
  2989
      **ptr;*ptr=g_utf8_next_char(*ptr))
ali@40
  2990
	;
ali@40
  2991
    /*
ali@40
  2992
     * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
ali@40
  2993
     * Especially yucky is the case of L1,000
ali@40
  2994
     * This section looks for a pattern of characters including a digit
ali@40
  2995
     * followed by a comma or period followed by one or more digits.
ali@40
  2996
     * If found, it returns this whole pattern as a word; otherwise we discard
ali@40
  2997
     * the results and resume our normal programming.
ali@40
  2998
     */
ali@69
  2999
    s=*ptr;
ali@70
  3000
    for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
ali@70
  3001
      g_unichar_isalpha(g_utf8_get_char(s)) ||
ali@70
  3002
      g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
ali@70
  3003
	g_string_append_unichar(word,g_utf8_get_char(s));
ali@70
  3004
    for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
ali@70
  3005
      t=g_utf8_next_char(t))
ali@40
  3006
    {
ali@70
  3007
	c=g_utf8_get_char(t);
ali@70
  3008
	pc=g_utf8_get_char(g_utf8_prev_char(t));
ali@70
  3009
	if ((c=='.' || c==',') && g_unichar_isdigit(pc))
ali@40
  3010
	{
ali@70
  3011
	    *ptr=s;
ali@70
  3012
	    return g_string_free(word,FALSE);
ali@40
  3013
	}
ali@40
  3014
    }
ali@0
  3015
    /* we didn't find a punctuated number - do the regular getword thing */
ali@69
  3016
    g_string_truncate(word,0);
ali@70
  3017
    for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
ali@70
  3018
      g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
ali@70
  3019
      g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
ali@70
  3020
	g_string_append_unichar(word,g_utf8_get_char(*ptr));
ali@69
  3021
    return g_string_free(word,FALSE);
ali@0
  3022
}
ali@0
  3023
ali@40
  3024
/*
ali@40
  3025
 * isroman:
ali@40
  3026
 *
ali@40
  3027
 * Is this word a Roman Numeral?
ali@40
  3028
 *
ali@40
  3029
 * It doesn't actually validate that the number is a valid Roman Numeral--for
ali@40
  3030
 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
ali@40
  3031
 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
ali@40
  3032
 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
ali@40
  3033
 * expressions thereof, except when it came to taxes. Allow any number of M,
ali@40
  3034
 * an optional D, an optional CM or CD, any number of optional Cs, an optional
ali@40
  3035
 * XL or an optional XC, an optional IX or IV, an optional V and any number
ali@40
  3036
 * of optional Is.
ali@40
  3037
 */
ali@69
  3038
gboolean isroman(const char *t)
ali@0
  3039
{
ali@69
  3040
    const char *s;
ali@40
  3041
    if (!t || !*t)
ali@69
  3042
	return FALSE;
ali@40
  3043
    s=t;
ali@70
  3044
    while (g_utf8_get_char(t)=='m' && *t)
ali@40
  3045
	t++;
ali@70
  3046
    if (g_utf8_get_char(t)=='d')
ali@40
  3047
	t++;
ali@70
  3048
    if (g_str_has_prefix(t,"cm"))
ali@40
  3049
	t+=2;
ali@70
  3050
    if (g_str_has_prefix(t,"cd"))
ali@40
  3051
	t+=2;
ali@70
  3052
    while (g_utf8_get_char(t)=='c' && *t)
ali@40
  3053
	t++;
ali@70
  3054
    if (g_str_has_prefix(t,"xl"))
ali@40
  3055
	t+=2;
ali@70
  3056
    if (g_str_has_prefix(t,"xc"))
ali@40
  3057
	t+=2;
ali@70
  3058
    if (g_utf8_get_char(t)=='l')
ali@40
  3059
	t++;
ali@70
  3060
    while (g_utf8_get_char(t)=='x' && *t)
ali@40
  3061
	t++;
ali@70
  3062
    if (g_str_has_prefix(t,"ix"))
ali@40
  3063
	t+=2;
ali@70
  3064
    if (g_str_has_prefix(t,"iv"))
ali@40
  3065
	t+=2;
ali@70
  3066
    if (g_utf8_get_char(t)=='v')
ali@40
  3067
	t++;
ali@70
  3068
    while (g_utf8_get_char(t)=='i' && *t)
ali@40
  3069
	t++;
ali@40
  3070
    return !*t;
ali@0
  3071
}
ali@0
  3072
ali@40
  3073
/*
ali@40
  3074
 * postprocess_for_DP:
ali@40
  3075
 *
ali@40
  3076
 * Invoked with the -d switch from flgets().
ali@40
  3077
 * It simply "removes" from the line a hard-coded set of common
ali@40
  3078
 * DP-specific tags, so that the line passed to the main routine has
ali@40
  3079
 * been pre-cleaned of DP markup.
ali@40
  3080
 */
ali@0
  3081
void postprocess_for_DP(char *theline)
ali@0
  3082
{
ali@40
  3083
    char *s,*t;
ali@0
  3084
    int i;
ali@0
  3085
    if (!*theline) 
ali@68
  3086
	return;
ali@40
  3087
    for (i=0;*DPmarkup[i];i++)
ali@70
  3088
	while ((s=strstr(theline,DPmarkup[i])))
ali@40
  3089
	{
ali@68
  3090
	    t=s+strlen(DPmarkup[i]);
ali@70
  3091
	    memmove(s,t,strlen(t)+1);
ali@40
  3092
	}
ali@0
  3093
}
ali@0
  3094
ali@40
  3095
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Pre-cleans the line of HTML, in place: losemarkup() is called until it
 * reports that it removed nothing, and then loseentities() is applied to
 * what remains.
 */
void postprocess_for_HTML(char *theline)
{
    /* Each successful call removes one tag; stop at the first failure. */
    for (;;)
	if (!losemarkup(theline))
	    break;
    loseentities(theline);
}
ali@0
  3110
ali@0
  3111
char *losemarkup(char *theline)
ali@0
  3112
{
ali@40
  3113
    char *s,*t;
ali@0
  3114
    int i;
ali@70
  3115
    s=strchr(theline,'<');
ali@70
  3116
    t=s?strchr(s,'>'):NULL;
ali@40
  3117
    if (!s || !t)
ali@40
  3118
	return NULL;
ali@40
  3119
    for (i=0;*markup[i];i++)
ali@70
  3120
	if (tagcomp(g_utf8_next_char(s),markup[i]))
ali@40
  3121
	{
ali@70
  3122
	    t=g_utf8_next_char(t);
ali@70
  3123
	    memmove(s,t,strlen(t)+1);
ali@70
  3124
	    return s;
ali@68
  3125
	}
ali@40
  3126
    /* It's an unrecognized <xxx>. */
ali@40
  3127
    return NULL;
ali@0
  3128
}
ali@0
  3129
ali@71
  3130
void loseentities(char *theline)
ali@0
  3131
{
ali@0
  3132
    int i;
ali@71
  3133
    gsize nb;
ali@71
  3134
    char *amp,*scolon;
ali@71
  3135
    gchar *s,*t;
ali@71
  3136
    gunichar c;
ali@71
  3137
    GTree *entities=NULL;
ali@71
  3138
    GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
ali@71
  3139
    if (!theline)
ali@40
  3140
    {
ali@71
  3141
	if (entities)
ali@71
  3142
	    g_tree_destroy(entities);
ali@71
  3143
	entities=NULL;
ali@71
  3144
	if (translit==(GIConv)-1)
ali@71
  3145
	    g_iconv_close(translit);
ali@71
  3146
	translit=(GIConv)-1;
ali@71
  3147
	if (to_utf8==(GIConv)-1)
ali@71
  3148
	    g_iconv_close(to_utf8);
ali@71
  3149
	to_utf8=(GIConv)-1;
ali@71
  3150
	return;
ali@71
  3151
    }
ali@71
  3152
    if (!*theline)
ali@71
  3153
	return;
ali@71
  3154
    if (!entities)
ali@71
  3155
    {
ali@71
  3156
	entities=g_tree_new((GCompareFunc)strcmp);
ali@71
  3157
	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
ali@71
  3158
	    g_tree_insert(entities,HTMLentities[i].name,
ali@71
  3159
	      GUINT_TO_POINTER(HTMLentities[i].c));
ali@71
  3160
    }
ali@71
  3161
    if (translit==(GIConv)-1)
ali@71
  3162
	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
ali@71
  3163
    if (to_utf8==(GIConv)-1)
ali@71
  3164
	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
ali@71
  3165
    while((amp=strchr(theline,'&')))
ali@71
  3166
    {
ali@71
  3167
	scolon=strchr(amp,';');
ali@71
  3168
	if (scolon)
ali@40
  3169
	{
ali@71
  3170
	    if (amp[1]=='#')
ali@71
  3171
	    {
ali@71
  3172
		if (amp+2+strspn(amp+2,"0123456789")==scolon)
ali@71
  3173
		    c=strtol(amp+2,NULL,10);
ali@71
  3174
		else if (amp[2]=='x' &&
ali@71
  3175
		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
ali@71
  3176
		    c=strtol(amp+3,NULL,16);
ali@71
  3177
	    }
ali@71
  3178
	    else
ali@71
  3179
	    {
ali@71
  3180
		s=g_strndup(amp+1,scolon-(amp+1));
ali@71
  3181
	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
ali@71
  3182
		g_free(s);
ali@71
  3183
	    }
ali@40
  3184
	}
ali@71
  3185
	else
ali@71
  3186
	    c=0;
ali@71
  3187
	if (c)
ali@71
  3188
	{
ali@71
  3189
	    theline=amp;
ali@71
  3190
	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
ali@71
  3191
		theline+=g_unichar_to_utf8(c,theline);
ali@71
  3192
	    else
ali@71
  3193
	    {
ali@71
  3194
		s=g_malloc(6);
ali@71
  3195
		nb=g_unichar_to_utf8(c,s);
ali@71
  3196
		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
ali@71
  3197
		g_free(s);
ali@71
  3198
		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
ali@71
  3199
		g_free(t);
ali@71
  3200
		memcpy(theline,s,nb);
ali@71
  3201
		g_free(s);
ali@71
  3202
		theline+=nb;
ali@71
  3203
	    }
ali@71
  3204
	    memmove(theline,g_utf8_next_char(scolon),
ali@71
  3205
	      strlen(g_utf8_next_char(scolon))+1);
ali@71
  3206
	}
ali@71
  3207
	else
ali@71
  3208
	    theline=g_utf8_next_char(amp);
ali@40
  3209
    }
ali@0
  3210
}
ali@0
  3211
ali@70
  3212
gboolean tagcomp(const char *strin,const char *basetag)
ali@0
  3213
{
ali@70
  3214
    gboolean retval;
ali@70
  3215
    gchar *s,*t;
ali@70
  3216
    if (g_utf8_get_char(strin)=='/')
ali@70
  3217
	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
ali@70
  3218
    else
ali@70
  3219
	t=g_utf8_casefold(strin,-1);
ali@70
  3220
    s=g_utf8_casefold(basetag,-1);
ali@70
  3221
    retval=g_str_has_prefix(t,s);
ali@70
  3222
    g_free(s);
ali@70
  3223
    g_free(t);
ali@70
  3224
    return retval;
ali@0
  3225
}
ali@0
  3226
ali@69
  3227
/*
 * proghelp:
 *
 * Writes the version and copyright banner, the option help generated from
 * "context", a sample command line and a short description of the program
 * to stderr.
 */
void proghelp(GOptionContext *context)
{
    gchar *help;
    fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
    fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
    fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
    fputs("Bookloupe comes with ABSOLUTELY NO WARRANTY. "
      "For details, read the file COPYING.\n",stderr);
    fputs("This is Free Software; "
      "you may redistribute it under certain conditions (GPL);\n",stderr);
    fputs("read the file COPYING for details.\n\n",stderr);
    /* The help text is allocated for us and must be freed after printing. */
    help=g_option_context_get_help(context,TRUE,NULL);
    fputs(help,stderr);
    g_free(help);
    fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
    fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
      "non-ASCII\n",stderr);
    fputs("characters like accented letters, "
      "lines longer than 75 or shorter than 55,\n",stderr);
    fputs("unbalanced quotes or brackets, "
      "a variety of badly formatted punctuation, \n",stderr);
    fputs("HTML tags, some likely typos. "
      "It is NOT a substitute for human judgement.\n",stderr);
    fputs("\n",stderr);
}