# HG changeset patch # User ali # Date 1327660216 0 # Node ID f600b0d1fc5dc428bfd7d0e747f034a40e30b949 # Parent 2189044102317a21cbf66282ac1f92068d4e7989 Rename to bookloupe diff -r 218904410231 -r f600b0d1fc5d .hgignore --- a/.hgignore Fri Jan 27 00:28:11 2012 +0000 +++ b/.hgignore Fri Jan 27 10:30:16 2012 +0000 @@ -1,5 +1,5 @@ -gutcheck-.*\.tar\.gz -gutcheck-.*/ +bookloupe-.*\.tar\.gz +bookloupe-.*/ Makefile$ Makefile\.in aclocal\.m4 @@ -17,6 +17,6 @@ .*\.la .*\.lo .*\.exe -gutcheck/gutcheck\.typ -gutcheck/gutcheck -test/harness/gc-test +bookloupe/bookloupe\.typ +bookloupe/bookloupe +test/harness/loupe-test diff -r 218904410231 -r f600b0d1fc5d Makefile.am --- a/Makefile.am Fri Jan 27 00:28:11 2012 +0000 +++ b/Makefile.am Fri Jan 27 10:30:16 2012 +0000 @@ -1,1 +1,1 @@ -SUBDIRS=gclib gutcheck test doc +SUBDIRS=bl bookloupe test doc diff -r 218904410231 -r f600b0d1fc5d README --- a/README Fri Jan 27 00:28:11 2012 +0000 +++ b/README Fri Jan 27 10:30:16 2012 +0000 @@ -1,10 +1,10 @@ - gutcheck - ======== + bookloupe + ========= General installation instructions can be found in INSTALL. The following aim to give a quick overview and some help for specific systems. Documentation -for gutcheck itself can be found in doc/gutcheck.txt and for the test -framework in doc/gc-test.txt. +for bookloupe itself can be found in doc/bookloupe.txt and for the test +framework in doc/loupe-test.txt. Linux ----- @@ -43,12 +43,12 @@ % sudo yum install mingw32-gcc pkgconfig mingw32-glib2-static \ mingw32-gettext-static mingw32-iconv-static % ./configure --host=i686-w64-mingw32 --disable-shared \ - --bindir=/gutcheck --datadir=/ + --bindir=/bookloupe --datadir=/ % make % mkdir build % make install DESTDIR=`pwd`/build -The contents of the build/gutcheck directory can then be copied to a +The contents of the build/bookloupe directory can then be copied to a Microsoft Windows machine. 
Depending on the version of mingw32-gcc you use, you may need to specify a
diff -r 218904410231 -r f600b0d1fc5d bl/Makefile.am
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/Makefile.am Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,10 @@
+INCLUDES=-I$(top_srcdir)
+AM_CFLAGS=$(GLIB_CFLAGS)
+LIBS=$(GLIB_LIBS)
+
+noinst_LTLIBRARIES=libbl.la
+libbl_la_SOURCES=bl.h textfileutils.c textfileutils.h spawn.c spawn.h
+if !HAVE_GLIB
+libbl_la_SOURCES+=macros.h types.h fileutils.c fileutils.h mem.c mem.h \
+	strfuncs.c strfuncs.h blstring.c blstring.h utils.c utils.h
+endif
diff -r 218904410231 -r f600b0d1fc5d bl/bl.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/bl.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,40 @@
+/*
+ * Umbrella header for libbl. With GLib the bl names are mapped onto their
+ * GLib equivalents; without it the bundled replacements are included.
+ */
+#if HAVE_GLIB
+
+#include <glib.h>
+#define BL_DIR_SEPARATOR G_DIR_SEPARATOR
+#define BL_DIR_SEPARATOR_S G_DIR_SEPARATOR_S
+#define BL_IS_DIR_SEPARATOR(c) G_IS_DIR_SEPARATOR(c)
+#define boolean gboolean
+#define String GString
+#define mem_new0 g_new0
+#define mem_free g_free
+#define str_dup g_strdup
+#define str_ndup g_strndup
+#define path_get_basename g_path_get_basename
+#define file_get_contents(filename,contents,length) \
+    g_file_get_contents(filename,contents,length,NULL)
+#define string_new g_string_new
+#define string_append g_string_append
+#define string_append_len g_string_append_len
+#define string_append_c g_string_append_c
+#define string_free g_string_free
+#define string_set_size g_string_set_size
+
+#else /* !HAVE_GLIB */
+
+#include <bl/macros.h> /* NOTE(review): these seven follow the !HAVE_GLIB header list in bl/Makefile.am; confirm the order */
+#include <bl/types.h>
+#include <bl/fileutils.h>
+#include <bl/mem.h>
+#include <bl/strfuncs.h>
+#include <bl/blstring.h>
+#include <bl/utils.h>
+
+#endif /* HAVE_GLIB */
+
+#include <bl/textfileutils.h>
+#include <bl/spawn.h>
diff -r 218904410231 -r f600b0d1fc5d bl/blstring.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/blstring.c Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,94 @@
+#include /* NOTE(review): the six include targets are missing in this copy; restore them from the repository */
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Strings which manage their own memory
+ */
+
+/*
+ * Create a String holding a copy of init (NULL is treated as "").
+ * The result is released with string_free().
+ */
+String *string_new(const char *init)
+{
+    String *string=mem_new(String,1);
+    if (!init)
+        init="";
+    string->len=strlen(init);
+    string->alloc=string->len+1; /* one extra byte for the terminating nul */
+    string->str=str_dup(init);
+    return string;
+}
+
+/*
+ * Free a string and either return the contents (if free_segment is FALSE)
+ * or free the contents as well and return NULL (if free_segment is TRUE).
+ */
+char *string_free(String *string,boolean free_segment)
+{
+    char *retval;
+    if (free_segment)
+    {
+        mem_free(string->str);
+        retval=NULL;
+    }
+    else
+        retval=string->str; /* ownership of the buffer passes to the caller */
+    mem_free(string);
+    return retval;
+}
+
+/*
+ * Append a byte to string.
+ */
+void string_append_c(String *string,char c)
+{
+    if (string->len+1==string->alloc) /* no room left for c plus the nul */
+    {
+        string->alloc*=2;
+        string->str=mem_renew(char,string->str,string->alloc);
+    }
+    string->str[string->len++]=c;
+    string->str[string->len]='\0';
+}
+
+/*
+ * Append len bytes from s to string. len may be passed as <0 if s is
+ * a nul-terminated string of unknown length.
+ */
+void string_append_len(String *string,const char *s,ssize_t len)
+{
+    if (len<0)
+        len=strlen(s);
+    if (string->len+len>=string->alloc)
+    {
+        /* keep doubling until the new contents and the nul both fit */
+        while (string->len+len>=string->alloc)
+            string->alloc*=2;
+        string->str=mem_renew(char,string->str,string->alloc);
+    }
+    memcpy(string->str+string->len,s,len);
+    string->len+=len;
+    string->str[string->len]='\0';
+}
+
+/*
+ * Sets the length of a String. If the length is less than the current length,
+ * the string will be truncated. If the length is greater than the current
+ * length, the contents of the newly added area are undefined. (However, as
+ * always, string->str[string->len] will be a nul byte.)
+ */
+void string_set_size(String *string,size_t len)
+{
+    if (len>=string->alloc) /* need room for len bytes plus the nul */
+    {
+        while (len>=string->alloc)
+            string->alloc*=2;
+        string->str=mem_renew(char,string->str,string->alloc);
+    }
+    string->len=len;
+    string->str[string->len]='\0';
+}
diff -r 218904410231 -r f600b0d1fc5d bl/blstring.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/blstring.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,19 @@
+#ifndef BL_STRING_H
+#define BL_STRING_H
+
+#include /* NOTE(review): both include targets are missing in this copy; restore them from the repository */
+#include
+
+typedef struct {
+    char *str; /* nul-terminated contents */
+    size_t alloc,len; /* bytes allocated for str; bytes in use, not counting the nul */
+} String;
+
+String *string_new(const char *init);
+char *string_free(String *string,boolean free_segment);
+void string_append_c(String *string,char c);
+void string_append_len(String *string,const char *s,ssize_t len);
+#define string_append(string,s) string_append_len(string,s,-1)
+void string_set_size(String *string,size_t len); /* defined in blstring.c and called from spawn.c */
+
+#endif /* BL_STRING_H */
diff -r 218904410231 -r f600b0d1fc5d bl/fileutils.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/fileutils.c Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,46 @@
+#include /* NOTE(review): the six include targets are missing in this copy; restore them from the repository */
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Read a file into memory (which should be freed with mem_free when no
+ * longer required). Returns FALSE on error and outputs a suitable error
+ * message to stderr.
+ */
+boolean file_get_contents(const char *filename,char **contents,size_t *length)
+{
+    FILE *fp;
+    size_t n;
+    char *buffer;
+    String *string;
+    fp=fopen(filename,"rb");
+    if (!fp)
+    {
+        perror(filename);
+        return FALSE;
+    }
+    buffer=mem_new(char,1024);
+    string=string_new(NULL);
+    do
+    {
+        n=fread(buffer,1,1024,fp);
+        if (ferror(fp)) /* fread() reports failure through ferror(); n is unsigned, so n<0 could never be true */
+        {
+            perror(filename);
+            string_free(string,TRUE);
+            mem_free(buffer);
+            fclose(fp); /* fp came from fopen(), so it is closed rather than free()d */
+            return FALSE;
+        }
+        string_append_len(string,buffer,n);
+    } while(n); /* a zero-length read means end of file */
+    mem_free(buffer);
+    if (length)
+        *length=string->len;
+    *contents=string_free(string,FALSE); /* keep the buffer, drop the String wrapper */
+    fclose(fp);
+    return TRUE;
+}
diff -r 218904410231 -r f600b0d1fc5d bl/fileutils.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/fileutils.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,8 @@
+#ifndef BL_FILEUTILS_H
+#define BL_FILEUTILS_H
+
+#include /* NOTE(review): include target is missing in this copy; restore it from the repository */
+
+boolean file_get_contents(const char *filename,char **contents,size_t *length);
+
+#endif /* BL_FILEUTILS_H */
diff -r 218904410231 -r f600b0d1fc5d bl/macros.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/macros.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,7 @@
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#ifndef TRUE
+#define TRUE (!FALSE)
+#endif
diff -r 218904410231 -r f600b0d1fc5d bl/mem.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/mem.c Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,57 @@
+#include /* NOTE(review): the four include targets are missing in this copy; restore them from the repository */
+#include
+#include
+#include
+
+/*
+ * A memory allocator that aborts on failure (so that the caller never
+ * needs to handle out of memory, which we assume is very unlikely to
+ * happen under normal circumstances on any modern machine).
+ */
+void *mem_alloc(size_t nmemb,size_t size)
+{
+    void *ptr=NULL;
+    /* Refuse requests whose byte count would wrap around size_t. */
+    if (!size || nmemb<=(size_t)-1/size)
+        ptr=malloc(nmemb*size);
+    if (!ptr)
+    {
+        fprintf(stderr,
+          "Not enough memory to allocate %lu elements of %lu bytes.\n",
+          (unsigned long)nmemb,(unsigned long)size);
+        abort();
+    }
+    return ptr;
+}
+
+/*
+ * As mem_alloc, but new memory is cleared to zero.
+ */
+void *mem_alloc0(size_t nmemb,size_t size)
+{
+    void *ptr=calloc(nmemb,size);
+    if (!ptr)
+    {
+        fprintf(stderr,
+          "Not enough memory to allocate %lu elements of %lu bytes.\n",
+          (unsigned long)nmemb,(unsigned long)size);
+        abort();
+    }
+    return ptr;
+}
+
+/*
+ * Grow or shrink a memory block, aborting on failure.
+ */
+void *mem_realloc(void *ptr,size_t nmemb,size_t size)
+{
+    ptr=(size && nmemb>(size_t)-1/size)?NULL:realloc(ptr,nmemb*size); /* a wrapped byte count is treated as a failed allocation */
+    if (!ptr)
+    {
+        fprintf(stderr,
+          "Not enough memory to allocate %lu elements of %lu bytes.\n",
+          (unsigned long)nmemb,(unsigned long)size);
+        abort();
+    }
+    return ptr;
+}
diff -r 218904410231 -r f600b0d1fc5d bl/mem.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/mem.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,15 @@
+#ifndef BL_MEM_H
+#define BL_MEM_H
+
+#include <stdlib.h> /* size_t, and free() for mem_free */
+
+void *mem_alloc(size_t nmemb,size_t size);
+void *mem_alloc0(size_t nmemb,size_t size);
+void *mem_realloc(void *ptr,size_t nmemb,size_t size);
+
+#define mem_new(type,n) ((type *)mem_alloc(n,sizeof(type)))
+#define mem_new0(type,n) ((type *)mem_alloc0(n,sizeof(type)))
+#define mem_renew(type,ptr,n) ((type *)mem_realloc(ptr,n,sizeof(type)))
+#define mem_free(ptr) free(ptr)
+
+#endif /* BL_MEM_H */
diff -r 218904410231 -r f600b0d1fc5d bl/spawn.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/spawn.c Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,100 @@
+#include /* NOTE(review): the bare include targets in this file are missing in this copy; restore them from the repository */
+#include
+#ifndef WIN32
+#include <sys/wait.h>
+#endif
+#include
+
+#define SPAWN_BUFSIZE 128
+
+/*
+ * Run the command given by the NULL-terminated vector argv and wait for it
+ * to finish. If standard_output is non-NULL it receives the command's
+ * output in a newly allocated string; if exit_status is non-NULL it
+ * receives the command's exit code. Returns FALSE (after printing a
+ * message to stderr) if the command could not be run.
+ * NOTE(review): the popen() fallback joins argv with spaces and no
+ * quoting, so arguments containing spaces or shell metacharacters are
+ * not passed through intact.
+ */
+boolean spawn_sync(char **argv,char **standard_output,int *exit_status)
+{
+/* Don't use g_spawn_sync on WIN32 for now to avoid needing the helper */
+#if HAVE_GLIB && !defined(WIN32)
+    char *standard_error=NULL;
+    GError *error=NULL;
+    gboolean retval;
+    GSpawnFlags flags=G_SPAWN_SEARCH_PATH;
+    if (!standard_output)
+        flags|=G_SPAWN_STDOUT_TO_DEV_NULL; /* add to, rather than replace, G_SPAWN_SEARCH_PATH */
+    retval=g_spawn_sync(NULL,argv,NULL,flags,NULL,NULL,standard_output,
+      &standard_error,exit_status,&error);
+    if (standard_error) /* nothing to relay if the spawn itself failed */
+        fputs(standard_error,stderr);
+    g_free(standard_error);
+    if (!retval)
+    {
+        fprintf(stderr,"%s\n",error->message);
+        g_error_free(error);
+    }
+    else if (exit_status)
+        *exit_status=WEXITSTATUS(*exit_status);
+    return retval;
+#else
+    FILE *fp;
+    int i,r;
+    size_t n,len;
+    String *command_line,*string;
+    command_line=string_new(NULL);
+    for(i=0;argv[i];i++)
+    {
+        if (i)
+            string_append_c(command_line,' ');
+        string_append(command_line,argv[i]);
+    }
+    fp=popen(command_line->str,"r");
+    if (!fp)
+    {
+        perror(command_line->str); /* report before the command line is freed */
+        string_free(command_line,TRUE);
+        return FALSE;
+    }
+    string_free(command_line,TRUE);
+    string=string_new(NULL);
+    do
+    {
+        len=string->len;
+        string_set_size(string,len+SPAWN_BUFSIZE); /* make room for the next chunk */
+        n=fread(string->str+len,1,SPAWN_BUFSIZE,fp);
+        if (ferror(fp)) /* n is unsigned, so errors are detected with ferror() */
+        {
+            perror("fread");
+            (void)pclose(fp);
+            string_free(string,TRUE);
+            return FALSE;
+        }
+        string_set_size(string,len+n); /* trim to what was actually read */
+    } while(n);
+    r=pclose(fp);
+    if (r<0)
+    {
+        perror("pclose");
+        string_free(string,TRUE);
+        return FALSE;
+    }
+    else
+    {
+        if (exit_status)
+#ifdef WIN32
+            *exit_status=r;
+#else
+            *exit_status=WEXITSTATUS(r); /* match the exit code reported by the GLib branch */
+#endif
+        if (standard_output)
+            *standard_output=string_free(string,FALSE);
+        else
+            string_free(string,TRUE);
+        return TRUE;
+    }
+#endif
+}
diff -r 218904410231 -r f600b0d1fc5d bl/spawn.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/spawn.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,8 @@
+#ifndef BL_SPAWN_H
+#define BL_SPAWN_H
+
+#include /* NOTE(review): include target is missing in this copy; restore it from the repository */
+
+boolean spawn_sync(char **argv,char **standard_output,int *exit_status);
+
+#endif /* BL_SPAWN_H */
diff -r 218904410231 -r f600b0d1fc5d bl/strfuncs.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/strfuncs.c Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,26 @@
+#include /* NOTE(review): the four include targets are missing in this copy; restore them from the repository */
+#include
+#include
+#include
+
+/*
+ * Like strndup, but only returns NULL if str is NULL.
+ * Note that this routine copies n bytes rather than n characters.
+ */
+char *str_ndup(const char *str,size_t n)
+{
+    char *dup;
+    if (!str)
+        return NULL;
+    dup=mem_alloc0(n+1,1); /* zero-filled, so the copy is always nul-terminated */
+    strncpy(dup,str,n);
+    return dup;
+}
+
+/*
+ * Like strdup, but only returns NULL if str is NULL.
+ */
+char *str_dup(const char *str)
+{
+    return str?str_ndup(str,strlen(str)):NULL; /* strlen(NULL) is undefined, so the documented NULL case is handled here */
+}
diff -r 218904410231 -r f600b0d1fc5d bl/strfuncs.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/strfuncs.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,9 @@
+#ifndef BL_STRFUNCS_H
+#define BL_STRFUNCS_H
+
+#include <stddef.h> /* size_t */
+
+char *str_dup(const char *str);
+char *str_ndup(const char *str,size_t n);
+
+#endif /* BL_STRFUNCS_H */
diff -r 218904410231 -r f600b0d1fc5d bl/textfileutils.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/textfileutils.c Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,33 @@
+#include /* NOTE(review): the three include targets are missing in this copy; restore them from the repository */
+#include
+#include
+
+/*
+ * Read a file into memory (which should be freed with mem_free when no
+ * longer required). Returns FALSE on error and outputs a suitable error
+ * message to stderr.
+ * DOS-style line endings are handled transparently even on platforms which
+ * don't normally use this format.
+ */
+boolean file_get_contents_text(const char *filename,char **contents,
+  size_t *length)
+{
+    size_t i; /* same type as raw_length, which it is compared against */
+    char *raw;
+    size_t raw_length;
+    String *string;
+    if (!file_get_contents(filename,&raw,&raw_length))
+        return FALSE;
+    string=string_new(NULL);
+    for(i=0;i<raw_length;i++) /* NOTE(review): this copy was truncated from "i<" to "string->"; the next four lines are reconstructed to fit the hunk length - confirm */
+        if (raw[i]!='\r')
+            string_append_c(string,raw[i]);
+    mem_free(raw);
+    if (length)
+        *length=string->len;
+    if (contents)
+        *contents=string_free(string,FALSE);
+    else
+        string_free(string,TRUE);
+    return TRUE;
+}
diff -r 218904410231 -r f600b0d1fc5d bl/textfileutils.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/textfileutils.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,9 @@
+#ifndef BL_TEXTFILEUTILS_H
+#define BL_TEXTFILEUTILS_H
+
+#include /* NOTE(review): include target is missing in this copy; restore it from the repository */
+
+boolean file_get_contents_text(const char *filename,char **contents,
+  size_t *length);
+
+#endif /* BL_TEXTFILEUTILS_H */
diff -r 218904410231 -r f600b0d1fc5d bl/types.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/types.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,6 @@
+#ifndef BL_TYPES_H
+#define BL_TYPES_H
+
+typedef int boolean;
+
+#endif /* BL_TYPES_H */
diff -r 218904410231 -r f600b0d1fc5d bl/utils.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/utils.c Fri Jan
27 10:30:16 2012 +0000
@@ -0,0 +1,48 @@
+#include /* NOTE(review): the six include targets are missing in this copy; restore them from the repository */
+#include
+#include
+#include
+#include
+#include
+
+#define is_valid_drive(d) (((d)>='a' && (d)<='z') || ((d)>='A' && (d)<='Z'))
+
+/*
+ * Return a newly allocated copy of the final component of filename.
+ * Trailing directory separators are ignored. A name made up solely of
+ * separators (on Windows, optionally preceded by a drive letter) yields a
+ * single separator, and the empty string yields ".".
+ */
+char *path_get_basename(const char *filename)
+{
+    ssize_t end,start;
+    size_t n;
+    char *name;
+    if (!filename[0])
+        return str_dup(".");
+    /* Step back over any trailing separators. */
+    for (end=strlen(filename)-1;end>=0;end--)
+        if (!BL_IS_DIR_SEPARATOR(filename[end]))
+            break;
+    if (end<0)
+        /* nothing but separators */
+        return str_dup(BL_DIR_SEPARATOR_S);
+#ifdef WIN32
+    if (end==1 && is_valid_drive(filename[0]) && filename[1]==':')
+        /* a bare drive, possibly followed by separators */
+        return str_dup(BL_DIR_SEPARATOR_S);
+#endif
+    /* Find the separator (if any) in front of the last component. */
+    for (start=end;start>=0;start--)
+        if (BL_IS_DIR_SEPARATOR(filename[start]))
+            break;
+#ifdef WIN32
+    if (start<0 && is_valid_drive(filename[0]) && filename[1]==':')
+        start=1; /* skip a leading "X:" drive prefix */
+#endif
+    n=end-start;
+    name=mem_alloc(n+1,1);
+    memcpy(name,filename+start+1,n);
+    name[n]='\0';
+    return name;
+}
diff -r 218904410231 -r f600b0d1fc5d bl/utils.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bl/utils.h Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,16 @@
+#ifndef BL_UTIL_H
+#define BL_UTIL_H
+
+#ifdef WIN32
+#define BL_DIR_SEPARATOR '\\'
+#define BL_DIR_SEPARATOR_S "\\"
+#define BL_IS_DIR_SEPARATOR(c) ((c)==BL_DIR_SEPARATOR || (c)=='/')
+#else
+#define BL_DIR_SEPARATOR '/'
+#define BL_DIR_SEPARATOR_S "/"
+#define BL_IS_DIR_SEPARATOR(c) ((c)==BL_DIR_SEPARATOR)
+#endif
+
+char *path_get_basename(const char *filename);
+
+#endif /* BL_UTIL_H */
diff -r 218904410231 -r f600b0d1fc5d
bookloupe/Makefile.am --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bookloupe/Makefile.am Fri Jan 27 10:30:16 2012 +0000 @@ -0,0 +1,8 @@ +bin_PROGRAMS=bookloupe +pkgdata_DATA=bookloupe.typ + +bookloupe.typ: bookloupe.typ.in + sed 's/$$/\r/' $< > $@ + +EXTRA_DIST=bookloupe.typ.in +CLEANFILES=bookloupe.typ diff -r 218904410231 -r f600b0d1fc5d bookloupe/bookloupe.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bookloupe/bookloupe.c Fri Jan 27 10:30:16 2012 +0000 @@ -0,0 +1,2982 @@ +/*************************************************************************/ +/* gutcheck - check for assorted weirdnesses in a PG candidate text file */ +/* */ +/* Version 0.991 */ +/* Copyright 2000-2005 Jim Tinsley */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the */ +/* Free Software Foundation, Inc., */ +/* 59 Temple Place, */ +/* Suite 330, */ +/* Boston, MA 02111-1307 USA */ +/* */ +/* */ +/* */ +/* Overview comments: */ +/* */ +/* If you're reading this, you're either interested in how to detect */ +/* formatting errors, or very very bored. */ +/* */ +/* Gutcheck is a homebrew formatting checker specifically for */ +/* spotting common formatting problems in a PG e-text. I typically */ +/* run it once or twice on a file I'm about to submit; it usually */ +/* finds a few formatting problems. 
It also usually finds lots of */
+/* queries that aren't problems at all; it _really_ doesn't like */
+/* the standard PG header, for example. It's optimized for straight */
+/* prose; poetry and non-fiction involving tables tend to trigger */
+/* false alarms. */
+/* */
+/* The code of gutcheck is not very interesting, but the experience */
+/* of what constitutes a possible error may be, and the best way to */
+/* illustrate that is by example. */
+/* */
+/* */
+/* Here are some common typos found in PG texts that gutcheck */
+/* will flag as errors: */
+/* */
+/* "Look!John , over there!" */
+/* */
+/* &so is this; */
+/* Margaret said: " Now you should start for school." */
+/* Margaret said: "Now you should start for school. (if end of para) */
+/* The horse is said to he worth a lot. */
+/* 0K - this'11 make you look close1y. */
+/* "If you do. you'll regret it!" */
+/* */
+/* There are some complications . The extra space left around that */
+/* period was an error . . . but that ellipsis wasn't. */
+/* */
+/* The last line of a paragraph */
+/* is usually short. */
+/* */
+/* This period is an error.But the periods in a.m. aren't. */
+/* */
+/* Checks that are do-able but not (well) implemented are: */
+/* Single-quote checking. */
+/* Despite 3 attempts at it, singlequote checking is still */
+/* crap in gutcheck. It may not be possible without analysis */
+/* of the whole paragraph. */
+/* */
+/*************************************************************************/
+
+
+#include /* NOTE(review): the four include targets are missing in this copy; restore them from the repository */
+#include
+#include
+#include
+
+#define MAXWORDLEN 80 /* max length of one word */
+#define LINEBUFSIZE 2048 /* buffer size for an input line */
+
+#define MAX_USER_TYPOS 1000 /* capacity of the usertypo[] table */
+#define USERTYPO_FILE "bookloupe.typ" /* was "gutcheck.typ"; the build now generates and installs bookloupe.typ */
+
+#ifndef MAX_PATH
+#define MAX_PATH 16384
+#endif
+
+char aline[LINEBUFSIZE];
+char prevline[LINEBUFSIZE];
+
+    /* Common typos.
*/
+char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad",
+                "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om",
+                "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
+                "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign",
+                "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere",
+                "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev",
+                "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk",
+                "owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
+                "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry",
+                "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
+                "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae",
+                "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
+                "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
+                "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera",
+                "yeras", "yersa", "yoiu", "youve", "ytou", "yuor",
+                /* added h/b words for version 12 - removed a few with "tbe" v.25 */
+                "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind",
+                "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates",
+                "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing",
+                "helieve", "henefit", "hetter", "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh",
+                "meanwbile", "memher", "memhers", "numher", "numhers",
+                "perbaps", "prohlem", "puhlic", "witbout",
+                /* and a few more for .18 */
+                "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo",
+                "heside", "chapteb", "chaptee", "se",
+                ""}; /* every word list here ends with an empty string; presumably the end-of-list sentinel - confirm in the lookup code */
+
+char *usertypo[MAX_USER_TYPOS]; /* room for MAX_USER_TYPOS user-defined typos; filled elsewhere (not visible here) */
+
+    /* Common abbreviations and other OK words not to query as typos. */
+    /* 0.99 last-minute - removed "ms" */
+char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br",
+                  "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian",
+                  "hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten",
+                  ""};
+
+    /* Common abbreviations that cause otherwise unexplained periods. */
+char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit",
+                  "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
+                  ""};
+    /* Two-Letter combinations that rarely if ever start words, */
+    /* but are common scannos or otherwise common letter */
+    /* combinations.
*/
+char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl",
+                   "tn", "rn", "lt", "tj",
+                   "" };
+
+    /* Two-Letter combinations that rarely if ever end words */
+    /* but are common scannos or otherwise common letter */
+    /* combinations */
+char *noend[] = { "cb", "gb", "pb", "sb", "tb",
+                 "wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl",
+                 "iy",
+                 ""};
+
+char *markup[] = { "a", "b", "big", "blockquote", "body", "br", "center",
+                  "col", "div", "em", "font", "h1", "h2", "h3", "h4",
+                  "h5", "h6", "head", "hr", "html", "i", "img", "li",
+                  "meta", "ol", "p", "pre", "small", "span", "strong",
+                  "sub", "sup", "table", "td", "tfoot", "thead", "title",
+                  "tr", "tt", "u", "ul",
+                  ""};
+
+char *DPmarkup[] = { "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", /* NOTE(review): the three tags were blank in this copy; restored as <sc>, </sc>, <tb> - confirm */
+                    ""}; /* added .991 */
+
+char *nocomma[] = { "the", "it's", "their", "an", "mrs", "a", "our", "that's",
+                   "its", "whose", "every", "i'll", "your", "my",
+                   "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd",
+                   "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
+                   "i'm", "during", "let", "toward", "among",
+                   ""};
+
+
+char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
+                    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
+                    "i'll", "whose", "who", "because", "when", "let", "till", "very",
+                    "an", "among", "those", "into", "whom", "having", "thence",
+                    ""};
+
+
+char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* Carlo's old suggestion, updated .991 */
+
+struct { /* NOTE(review): the name and number columns were decoded to literal characters in this copy; restored from the per-row comments - confirm */
+    char *htmlent; /* named form, e.g. "&amp;" */
+    char *htmlnum; /* numeric form, e.g. "&#38;" */
+    char *textent; /* plain-text stand-in */
+    } entities[] = { "&amp;", "&#38;", "&",
+                "&lt;", "&#60;", "<",
+                "&gt;", "&#62;", ">",
+                "&deg;", "&#176;", " degrees",
+                "&pound;", "&#163;", "L",
+                "&quot;", "&#34;", "\"", /* -- quotation mark = APL quote, */
+                "&OElig;", "&#338;", "OE", /* -- latin capital ligature OE, */
+                "&oelig;", "&#339;", "oe", /* -- latin small ligature oe, U+0153 ISOlat2 --> */
+                "&Scaron;", "&#352;", "S", /* -- latin capital letter S with caron, */
+                "&scaron;", "&#353;", "s", /* -- latin small letter s with caron, */
+                "&Yuml;", "&#376;", "Y", /* -- latin capital letter Y with diaeresis, */
+                "&circ;", "&#710;", "", /* -- modifier letter circumflex accent, */
+                "&tilde;", "&#732;", "~", /* -- small tilde, U+02DC ISOdia --> */
+                "&ensp;", "&#8194;", " ", /* -- en space, U+2002 ISOpub --> */
+                "&emsp;", "&#8195;", " ", /* -- em space, U+2003 ISOpub --> */
+                "&thinsp;", "&#8201;", " ", /* -- thin space, U+2009 ISOpub --> */
+                "&ndash;", "&#8211;", "-", /* -- en dash, U+2013 ISOpub --> */
+                "&mdash;", "&#8212;", "--", /* -- em dash, U+2014 ISOpub --> */
+                "&lsquo;", "&#8216;", "'", /* -- left single quotation mark, */
+                "&rsquo;", "&#8217;", "'", /* -- right single quotation mark, */
+                "&sbquo;", "&#8218;", "'", /* -- single low-9 quotation mark, U+201A NEW --> */
+                "&ldquo;", "&#8220;", "\"", /* -- left double quotation mark, */
+                "&rdquo;", "&#8221;", "\"", /* -- right double quotation mark, */
+                "&bdquo;", "&#8222;", "\"", /* -- double low-9 quotation mark, U+201E NEW --> */
+                "&lsaquo;", "&#8249;", "\"", /* -- single left-pointing angle quotation mark, */
+                "&rsaquo;", "&#8250;", "\"", /* -- single right-pointing angle quotation mark, */
+                "&nbsp;", "&#160;", " ", /* -- no-break space = non-breaking space, */
+                "&iexcl;", "&#161;", "!", /* -- inverted exclamation mark, U+00A1 ISOnum --> */
+                "&cent;", "&#162;", "c", /* -- cent sign, U+00A2 ISOnum --> */
+                "&pound;", "&#163;", "L", /* -- pound sign, U+00A3 ISOnum --> */
+                "&curren;", "&#164;", "$", /* -- currency sign, U+00A4 ISOnum --> */
+                "&yen;", "&#165;", "Y", /* -- yen sign = yuan sign, U+00A5 ISOnum --> */
+                "&sect;", "&#167;", "--", /* -- section sign, U+00A7 ISOnum --> */
+                "&uml;", "&#168;", " ", /* -- diaeresis = spacing diaeresis, */
+                "&copy;", "&#169;", "(C) ", /* -- copyright sign, U+00A9 ISOnum --> */
+                "&ordf;", "&#170;", " ", /* -- feminine ordinal indicator, U+00AA ISOnum --> */
+                "&laquo;", "&#171;", "\"", /* -- left-pointing double angle quotation mark */
+                "&shy;", "&#173;", "-", /* -- soft hyphen = discretionary hyphen, */
+                "&reg;", "&#174;", "(R) ", /* -- registered sign = registered trade mark sign, */
+                "&macr;", "&#175;", " ", /* -- macron = spacing macron = overline */
+                "&deg;", "&#176;", " degrees", /* -- degree sign, U+00B0 ISOnum --> */
+                "&plusmn;", "&#177;", "+-", /* -- plus-minus sign = plus-or-minus sign, */
+                "&sup2;", "&#178;", "2", /* -- superscript two = superscript digit two */
+                "&sup3;", "&#179;", "3", /* -- superscript three = superscript digit three */
+                "&acute;", "&#180;", " ", /* -- acute accent = spacing acute, */
+                "&micro;", "&#181;", "m", /* -- micro sign, U+00B5 ISOnum --> */
+                "&para;", "&#182;", "--", /* -- pilcrow sign = paragraph sign, */
+                "&cedil;", "&#184;", " ", /* -- cedilla = spacing cedilla, U+00B8 ISOdia --> */
+                "&sup1;", "&#185;", "1", /* -- superscript one = superscript digit one, */
+                "&ordm;", "&#186;", " ", /* -- masculine ordinal indicator, */
+                "&raquo;", "&#187;", "\"", /* -- right-pointing double angle quotation mark */
+                "&frac14;", "&#188;", "1/4", /* -- vulgar fraction one quarter */
+                "&frac12;", "&#189;", "1/2", /* -- vulgar fraction one half */
+                "&frac34;", "&#190;", "3/4", /* -- vulgar fraction three quarters */
+                "&iquest;", "&#191;", "?", /* -- inverted question mark */
+                "&Agrave;", "&#192;", "A", /* -- latin capital letter A with grave */
+                "&Aacute;", "&#193;", "A", /* -- latin capital letter A with acute, */
+                "&Acirc;", "&#194;", "A", /* -- latin capital letter A with circumflex, */
+                "&Atilde;", "&#195;", "A", /* -- latin capital letter A with tilde, */
+                "&Auml;", "&#196;", "A", /* -- latin capital letter A with diaeresis, */
+                "&Aring;", "&#197;", "A", /* -- latin capital letter A with ring above */
+                "&AElig;", "&#198;", "AE", /* -- latin capital letter AE */
+                "&Ccedil;", "&#199;", "C", /* -- latin capital letter C with cedilla, */
+                "&Egrave;", "&#200;", "E", /* -- latin capital letter E with grave, */
+                "&Eacute;", "&#201;", "E", /* -- latin capital letter E with acute, */
+                "&Ecirc;", "&#202;", "E", /* -- latin capital letter E with circumflex, */
+                "&Euml;", "&#203;", "E", /* -- latin capital letter E with diaeresis, */
+                "&Igrave;", "&#204;", "I", /* -- latin capital letter I with grave, */
+                "&Iacute;", "&#205;", "I", /* -- latin capital letter I with acute, */
+                "&Icirc;", "&#206;", "I", /* -- latin capital letter I with circumflex, */
+                "&Iuml;", "&#207;", "I", /* -- latin capital letter I with diaeresis, */
+                "&ETH;", "&#208;", "E", /* -- latin capital letter ETH, U+00D0 ISOlat1 --> */
+                "&Ntilde;", "&#209;", "N", /* -- latin capital letter N with tilde, */
+                "&Ograve;", "&#210;", "O", /* -- latin capital letter O with grave, */
+                "&Oacute;", "&#211;", "O", /* -- latin capital letter O with acute, */
+                "&Ocirc;", "&#212;", "O", /* -- latin capital letter O with circumflex, */
+                "&Otilde;", "&#213;", "O", /* -- latin capital letter O with tilde, */
+                "&Ouml;", "&#214;", "O", /* -- latin capital letter O with diaeresis, */
+                "&times;", "&#215;", "*", /* -- multiplication sign, U+00D7 ISOnum --> */
+                "&Oslash;", "&#216;", "O", /* -- latin capital letter O with stroke */
+                "&Ugrave;", "&#217;", "U", /* -- latin capital letter U with grave, */
+                "&Uacute;", "&#218;", "U", /* -- latin capital letter U with acute, */
+                "&Ucirc;", "&#219;", "U", /* -- latin capital letter U with circumflex, */
+                "&Uuml;", "&#220;", "U", /* -- latin capital letter U with diaeresis, */
+                "&Yacute;", "&#221;", "Y", /* -- latin capital letter Y with acute, */
+                "&THORN;", "&#222;", "TH", /* -- latin capital letter THORN, */
+                "&szlig;", "&#223;", "sz", /* -- latin small letter sharp s = ess-zed, */
+                "&agrave;", "&#224;", "a", /* -- latin small letter a with grave */
+                "&aacute;", "&#225;", "a", /* -- latin small letter a with acute, */
+                "&acirc;", "&#226;", "a", /* -- latin small letter a with circumflex, */
+                "&atilde;", "&#227;", "a", /* -- latin small letter a with tilde, */
+                "&auml;", "&#228;", "a", /* -- latin small letter a with diaeresis, */
+                "&aring;", "&#229;", "a", /* -- latin small letter a with ring above */
+                "&aelig;", "&#230;", "ae", /* -- latin small letter ae */
+                "&ccedil;", "&#231;", "c", /* -- latin small letter c with cedilla, */
+                "&egrave;", "&#232;", "e", /* -- latin small letter e with grave, */
+                "&eacute;", "&#233;", "e", /* -- latin small letter e with acute, */
+                "&ecirc;", "&#234;", "e", /* -- latin small letter e with circumflex, */
+                "&euml;", "&#235;", "e", /* -- latin small letter e with diaeresis, */
+                "&igrave;", "&#236;", "i", /* -- latin small letter i with grave, */
+                "&iacute;", "&#237;", "i", /* -- latin small letter i with acute, */
+                "&icirc;", "&#238;", "i", /* -- latin small letter i with circumflex, */
+                "&iuml;", "&#239;", "i", /* -- latin small letter i with diaeresis, */
+                "&eth;", "&#240;", "eth", /* -- latin small letter eth, U+00F0 ISOlat1 --> */
+                "&ntilde;", "&#241;", "n", /* -- latin small letter n with tilde, */
+                "&ograve;", "&#242;", "o", /* -- latin small letter o with grave, */
+                "&oacute;", "&#243;", "o", /* -- latin small letter o with acute, */
+                "&ocirc;", "&#244;", "o", /* -- latin small letter o with circumflex, */
+                "&otilde;", "&#245;", "o", /* -- latin small letter o with tilde, */
+                "&ouml;", "&#246;", "o", /* -- latin small letter o with diaeresis, */
+                "&divide;", "&#247;", "/", /* -- division sign, U+00F7 ISOnum --> */
+                "&oslash;", "&#248;", "o", /* -- latin small letter o with stroke, */
+                "&ugrave;", "&#249;", "u", /* -- latin small letter u with grave, */
+                "&uacute;", "&#250;", "u", /* -- latin small letter u with acute, */
+                "&ucirc;", "&#251;", "u", /* -- latin small letter u with circumflex, */
+                "&uuml;", "&#252;", "u", /* -- latin small letter u with diaeresis, */
+                "&yacute;", "&#253;", "y", /* -- latin small letter y with acute, */
+                "&thorn;", "&#254;", "th", /* -- latin small letter thorn, */
+                "&yuml;", "&#255;", "y", /* -- latin small letter y with diaeresis, */
+                "", "" };
+
+/* ---- list of special characters ---- */
+#define CHAR_SPACE 32
+#define CHAR_TAB 9
+#define CHAR_LF 10
+#define CHAR_CR 13
+#define CHAR_DQUOTE 34
+#define CHAR_SQUOTE 39
+#define CHAR_OPEN_SQUOTE 96
+#define CHAR_TILDE 126
+#define CHAR_ASTERISK 42
+#define CHAR_FORESLASH 47
+#define CHAR_CARAT 94
+
+#define CHAR_UNDERSCORE '_'
+#define CHAR_OPEN_CBRACK '{'
+#define CHAR_CLOSE_CBRACK '}'
+#define CHAR_OPEN_RBRACK '('
+#define CHAR_CLOSE_RBRACK ')'
+#define CHAR_OPEN_SBRACK '['
+#define CHAR_CLOSE_SBRACK ']'
+
+
+
+
+
+/* ---- longest and shortest normal PG line lengths ----*/
+#define LONGEST_PG_LINE 75
+#define WAY_TOO_LONG 80
+#define SHORTEST_PG_LINE 55
+
+#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
+                                  /* D - ignore DP-specific markup */
+                                  /* E - echo queried line */
+                                  /* S - check single quotes */
+                                  /* T - check common typos */
+                                  /* P - require closure of quotes on */
+                                  /* every paragraph */
+                                  /* X - "Trust no one" :-) Paranoid! */
+                                  /* Queries everything */
+                                  /* L - line end checking defaults on */
+                                  /* -L turns it off */
+                                  /* O - overview. Just shows counts.
*/ + /* Y - puts errors to stdout */ + /* instead of stderr */ + /* H - Echoes header fields */ + /* M - Ignore markup in < > */ + /* U - Use file of User-defined Typos*/ + /* W - Defaults for use on Web upload*/ + /* V - Verbose - list EVERYTHING! */ +#define SWITNO 14 /* max number of switch parms */ + /* - used for defining array-size */ +#define MINARGS 1 /* minimum no of args excl switches */ +#define MAXARGS 1 /* maximum no of args excl switches */ + +int pswit[SWITNO]; /* program switches set by SWITCHES */ + +#define ECHO_SWITCH 0 +#define SQUOTE_SWITCH 1 +#define TYPO_SWITCH 2 +#define QPARA_SWITCH 3 +#define PARANOID_SWITCH 4 +#define LINE_END_SWITCH 5 +#define OVERVIEW_SWITCH 6 +#define STDOUT_SWITCH 7 +#define HEADER_SWITCH 8 +#define WEB_SWITCH 9 +#define VERBOSE_SWITCH 10 +#define MARKUP_SWITCH 11 +#define USERTYPO_SWITCH 12 +#define DP_SWITCH 13 + + + +long cnt_dquot; /* for overview mode, count of doublequote queries */ +long cnt_squot; /* for overview mode, count of singlequote queries */ +long cnt_brack; /* for overview mode, count of brackets queries */ +long cnt_bin; /* for overview mode, count of non-ASCII queries */ +long cnt_odd; /* for overview mode, count of odd character queries */ +long cnt_long; /* for overview mode, count of long line errors */ +long cnt_short; /* for overview mode, count of short line queries */ +long cnt_punct; /* for overview mode, count of punctuation and spacing queries */ +long cnt_dash; /* for overview mode, count of dash-related queries */ +long cnt_word; /* for overview mode, count of word queries */ +long cnt_html; /* for overview mode, count of html queries */ +long cnt_lineend; /* for overview mode, count of line-end queries */ +long cnt_spacend; /* count of lines with space at end V .21 */ +long linecnt; /* count of total lines in the file */ +long checked_linecnt; /* count of lines actually gutchecked V .26 */ + +void proghelp(void); +void procfile(char *); + +#define LOW_THRESHOLD 0 +#define 
HIGH_THRESHOLD 1 + +#define START 0 +#define END 1 +#define PREV 0 +#define NEXT 1 +#define FIRST_OF_PAIR 0 +#define SECOND_OF_PAIR 1 + +#define MAX_WORDPAIR 1000 + +char running_from[MAX_PATH]; + +int mixdigit(char *); +char *getaword(char *, char *); +int matchword(char *, char *); +char *flgets(char *, int, FILE *, long); +void lowerit(char *); +int gcisalpha(unsigned char); +int gcisdigit(unsigned char); +int gcisletter(unsigned char); +char *gcstrchr(char *s, char c); +void postprocess_for_HTML(char *); +char *linehasmarkup(char *); +char *losemarkup(char *); +int tagcomp(char *, char *); +char *loseentities(char *); +int isroman(char *); +int usertypo_count; +void postprocess_for_DP(char *); + +char wrk[LINEBUFSIZE]; + +/* This is disgustingly lazy, predefining max words & lengths, */ +/* but now I'm out of 16-bit restrictions, what's a couple of K? */ +#define MAX_QWORD 50 +#define MAX_QWORD_LENGTH 40 +char qword[MAX_QWORD][MAX_QWORD_LENGTH]; +char qperiod[MAX_QWORD][MAX_QWORD_LENGTH]; +signed int dupcnt[MAX_QWORD]; + + + + +int main(int argc, char **argv) +{ + char *argsw, *s; + int i, switno, invarg; + char usertypo_file[MAX_PATH]; + FILE *usertypofile; + + + if (strlen(argv[0]) < sizeof(running_from)) + strcpy(running_from, argv[0]); /* save the path to the executable gutcheck */ + + /* find out what directory we're running from */ + for (s = running_from + strlen(running_from); *s != '/' && *s != '\\' && s >= running_from; s--) + *s = 0; + + + switno = strlen(SWITCHES); + for (i = switno ; --i >0 ; ) + pswit[i] = 0; /* initialise switches */ + + /* Standard loop to extract switches. 
*/ + /* When we come out of this loop, the arguments will be */ + /* in argv[0] upwards and the switches used will be */ + /* represented by their equivalent elements in pswit[] */ + while ( --argc > 0 && **++argv == '-') + for (argsw = argv[0]+1; *argsw !='\0'; argsw++) + for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; ) + if ((toupper(*argsw)) == SWITCHES[i] ) { + invarg = 0; + pswit[i] = 1; + } + + pswit[PARANOID_SWITCH] ^= 1; /* Paranoid checking is turned OFF, not on, by its switch */ + + if (pswit[PARANOID_SWITCH]) { /* if running in paranoid mode */ + pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1; /* force typo checks as well */ + } /* v.20 removed s and p switches from paranoid mode */ + + pswit[LINE_END_SWITCH] ^= 1; /* Line-end checking is turned OFF, not on, by its switch */ + pswit[ECHO_SWITCH] ^= 1; /* V.21 Echoing is turned OFF, not on, by its switch */ + + if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */ + pswit[ECHO_SWITCH] = 0; + + /* Web uploads - for the moment, this is really just a placeholder */ + /* until we decide what processing we really want to do on web uploads */ + if (pswit[WEB_SWITCH]) { /* specific override for web uploads */ + pswit[ECHO_SWITCH] = 1; + pswit[SQUOTE_SWITCH] = 0; + pswit[TYPO_SWITCH] = 1; + pswit[QPARA_SWITCH] = 0; + pswit[PARANOID_SWITCH] = 1; + pswit[LINE_END_SWITCH] = 0; + pswit[OVERVIEW_SWITCH] = 0; + pswit[STDOUT_SWITCH] = 0; + pswit[HEADER_SWITCH] = 1; + pswit[VERBOSE_SWITCH] = 0; + pswit[MARKUP_SWITCH] = 0; + pswit[USERTYPO_SWITCH] = 0; + pswit[DP_SWITCH] = 0; + } + + + if (argc < MINARGS || argc > MAXARGS) { /* check number of args */ + proghelp(); + return(1); /* exit */ + } + + + /* read in the user-defined stealth scanno list */ + + if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */ + if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) { /* not in cwd. try gutcheck directory. 
*/ + strcpy(usertypo_file, running_from); + strcat(usertypo_file, USERTYPO_FILE); + if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) { /* we ain't got no user typo file! */ + printf(" --> I couldn't find gutcheck.typ -- proceeding without user typos.\n"); + } + } + + usertypo_count = 0; + if (usertypofile) { /* we managed to open a User Typo File! */ + if (pswit[USERTYPO_SWITCH]) { + while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) { + if (strlen(aline) > 1) { + if ((int)*aline > 33) { + s = malloc(strlen(aline)+1); + if (!s) { + fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n"); + exit(1); + } + strcpy(s, aline); + usertypo[usertypo_count] = s; + usertypo_count++; + if (usertypo_count >= MAX_USER_TYPOS) { + printf(" --> Only %d user-defined typos allowed: ignoring the rest\n"); + break; + } + } + } + } + } + fclose(usertypofile); + } + } + + + + + fprintf(stderr, "gutcheck: Check and report on an e-text\n"); + + cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long = + cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend = + cnt_spacend = 0; + + procfile(argv[0]); + + if (pswit[OVERVIEW_SWITCH]) { + printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n", + checked_linecnt, linecnt, linecnt - checked_linecnt); + printf(" --------------- Queries found --------------\n"); + if (cnt_long) printf(" Long lines: %5ld\n",cnt_long); + if (cnt_short) printf(" Short lines: %5ld\n",cnt_short); + if (cnt_lineend) printf(" Line-end problems: %5ld\n",cnt_lineend); + if (cnt_word) printf(" Common typos: %5ld\n",cnt_word); + if (cnt_dquot) printf(" Unmatched quotes: %5ld\n",cnt_dquot); + if (cnt_squot) printf(" Unmatched SingleQuotes: %5ld\n",cnt_squot); + if (cnt_brack) printf(" Unmatched brackets: %5ld\n",cnt_brack); + if (cnt_bin) printf(" Non-ASCII characters: %5ld\n",cnt_bin); + if (cnt_odd) printf(" Proofing characters: %5ld\n",cnt_odd); + if (cnt_punct) printf(" Punctuation & spacing 
queries: %5ld\n",cnt_punct); + if (cnt_dash) printf(" Non-standard dashes: %5ld\n",cnt_dash); + if (cnt_html) printf(" Possible HTML tags: %5ld\n",cnt_html); + printf("\n"); + printf(" TOTAL QUERIES %5ld\n", + cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long + + cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend); + } + + return(0); +} + + + +/* procfile - process one file */ + +void procfile(char *filename) +{ + + char *s, *t, *s1, laststart, *wordstart; + char inword[MAXWORDLEN], testword[MAXWORDLEN]; + char parastart[81]; /* first line of current para */ + FILE *infile; + long quot, squot, firstline, alphalen, totlen, binlen, + shortline, longline, verylongline, spacedash, emdash, + space_emdash, non_PG_space_emdash, PG_space_emdash, + footerline, dotcomma, start_para_line, astline, fslashline, + standalone_digit, hyphens, htmcount, endquote_count; + long spline, nspline; + signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower, + eNon_A, eTab, eTilde, eAst, eFSlash, eCarat; + signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma, + warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote; + unsigned int lastlen, lastblen; + signed int s_brack, c_brack, r_brack, c_unders; + signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar; + signed int isnewpara, vowel, consonant; + char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80], + unders_err[80]; + signed int qword_index, qperiod_index, isdup; + signed int enddash; + signed int Dutchcount, isDutch, Frenchcount, isFrench; + + + + + + laststart = CHAR_SPACE; + lastlen = lastblen = 0; + *dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err = + *unders_err = *prevline = 0; + linecnt = firstline = alphalen = totlen = binlen = + shortline = longline = spacedash = emdash = checked_linecnt = + space_emdash = non_PG_space_emdash = PG_space_emdash = + footerline = dotcomma = 
start_para_line = astline = fslashline = + standalone_digit = hyphens = htmcount = endquote_count = 0; + quot = squot = s_brack = c_brack = r_brack = c_unders = 0; + i = llen = isemptyline = isacro = isellipsis = istypo = 0; + warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma = + warn_ast = warn_fslash = warn_digit = warn_endquote = 0; + isnewpara = vowel = consonant = enddash = 0; + spline = nspline = 0; + qword_index = qperiod_index = isdup = 0; + *inword = *testword = 0; + open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0; + Dutchcount = isDutch = Frenchcount = isFrench = 0; + + + for (j = 0; j < MAX_QWORD; j++) { + dupcnt[j] = 0; + for (i = 0; i < MAX_QWORD_LENGTH; i++) + qword[i][j] = 0; + qperiod[i][j] = 0; + } + + + if ((infile = fopen(filename, "rb")) == NULL) { + if (pswit[STDOUT_SWITCH]) + fprintf(stdout, "gutcheck: cannot open %s\n", filename); + else + fprintf(stderr, "gutcheck: cannot open %s\n", filename); + exit(1); + } + + fprintf(stdout, "\n\nFile: %s\n\n", filename); + firstline = shortline = longline = verylongline = 0; + + + /*****************************************************/ + /* */ + /* Run a first pass - verify that it's a valid PG */ + /* file, decide whether to report some things that */ + /* occur many times in the text like long or short */ + /* lines, non-standard dashes, and other good stuff */ + /* I'll doubtless think of later. */ + /* */ + /*****************************************************/ + + /*****************************************************/ + /* V.24 Sigh. 
Yet Another Header Change */ + /*****************************************************/ + + while (fgets(aline, LINEBUFSIZE-1, infile)) { + while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0; + linecnt++; + if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) { + if (spline) + printf(" --> Duplicate header?\n"); + spline = linecnt + 1; /* first line of non-header text, that is */ + } + if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) { + if (nspline) + printf(" --> Duplicate header?\n"); + nspline = linecnt + 1; /* first line of non-header text, that is */ + } + if (spline || nspline) { + lowerit(aline); + if (strstr(aline, "end") && strstr(aline, "project gutenberg")) { + if (strstr(aline, "end") < strstr(aline, "project gutenberg")) { + if (footerline) { + if (!nspline) /* it's an old-form header - we can detect duplicates */ + printf(" --> Duplicate footer?\n"); + else + ; + } + else { + footerline = linecnt; + } + } + } + } + if (spline) firstline = spline; + if (nspline) firstline = nspline; /* override with new */ + + if (footerline) continue; /* 0.99+ don't count the boilerplate in the footer */ + + llen = strlen(aline); + totlen += llen; + for (i = 0; i < llen; i++) { + if ((unsigned char)aline[i] > 127) binlen++; + if (gcisalpha(aline[i])) alphalen++; + if (i > 0) + if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1])) + endquote_count++; + } + if (strlen(aline) > 2 + && lastlen > 2 && lastlen < SHORTEST_PG_LINE + && lastblen > 2 && lastblen > SHORTEST_PG_LINE + && laststart != CHAR_SPACE) + shortline++; + + if (*aline) /* fixed line below for 0.96 */ + if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++; + + if (strstr(aline, ".,")) dotcomma++; + /* 0.98 only count ast lines for ignoring purposes where there is */ + /* locase text on the line */ + if (strstr(aline, "*")) { + for (s = aline; *s; 
s++) + if (*s >='a' && *s <= 'z') + break; + if (*s) astline++; + } + if (strstr(aline, "/")) + fslashline++; + for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--); + if (aline[i] == '-' && aline[i-1] != '-') hyphens++; + + if (llen > LONGEST_PG_LINE) longline++; + if (llen > WAY_TOO_LONG) verylongline++; + + if (strstr(aline, "<") && strstr(aline, ">")) { + i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1); + if (i > 0) + htmcount++; + if (strstr(aline, "")) htmcount +=4; /* bonus marks! */ + } + + /* Check for spaced em-dashes */ + if (strstr(aline,"--")) { + emdash++; + if (*(strstr(aline, "--")-1) == CHAR_SPACE || + (*(strstr(aline, "--")+2) == CHAR_SPACE)) + space_emdash++; + if (*(strstr(aline, "--")-1) == CHAR_SPACE && + (*(strstr(aline, "--")+2) == CHAR_SPACE)) + non_PG_space_emdash++; /* count of em-dashes with spaces both sides */ + if (*(strstr(aline, "--")-1) != CHAR_SPACE && + (*(strstr(aline, "--")+2) != CHAR_SPACE)) + PG_space_emdash++; /* count of PG-type em-dashes with no spaces */ + } + + for (s = aline; *s;) { + s = getaword(s, inword); + if (!strcmp(inword, "hij") || !strcmp(inword, "niet")) + Dutchcount++; + if (!strcmp(inword, "dans") || !strcmp(inword, "avec")) + Frenchcount++; + if (!strcmp(inword, "0") || !strcmp(inword, "1")) + standalone_digit++; + } + + /* Check for spaced dashes */ + if (strstr(aline," -")) + if (*(strstr(aline, " -")+2) != '-') + spacedash++; + lastblen = lastlen; + lastlen = strlen(aline); + laststart = aline[0]; + + } + fclose(infile); + + + /* now, based on this quick view, make some snap decisions */ + if (cnt_spacend > 0) { + printf(" --> %ld lines in this file have white space at end\n", cnt_spacend); + } + + warn_dotcomma = 1; + if (dotcomma > 5) { + warn_dotcomma = 0; + printf(" --> %ld lines in this file contain '.,'. 
Not reporting them.\n", dotcomma); + } + + /* if more than 50 lines, or one-tenth, are short, don't bother reporting them */ + warn_short = 1; + if (shortline > 50 || shortline * 10 > linecnt) { + warn_short = 0; + printf(" --> %ld lines in this file are short. Not reporting short lines.\n", shortline); + } + + /* if more than 50 lines, or one-tenth, are long, don't bother reporting them */ + warn_long = 1; + if (longline > 50 || longline * 10 > linecnt) { + warn_long = 0; + printf(" --> %ld lines in this file are long. Not reporting long lines.\n", longline); + } + + /* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */ + warn_ast = 1; + if (astline > 10 ) { + warn_ast = 0; + printf(" --> %ld lines in this file contain asterisks. Not reporting them.\n", astline); + } + + /* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */ + warn_fslash = 1; + if (fslashline > 10 ) { + warn_fslash = 0; + printf(" --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline); + } + + /* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */ + warn_endquote = 1; + if (endquote_count > 20 ) { + warn_endquote = 0; + printf(" --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count); + } + + /* if more than 15 lines contain standalone digits, don't bother reporting them V.0.97 */ + warn_digit = 1; + if (standalone_digit > 10 ) { + warn_digit = 0; + printf(" --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit); + } + + /* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */ + warn_hyphen = 1; + if (hyphens > 20 ) { + warn_hyphen = 0; + printf(" --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens); + } + + if (htmcount > 20 && !pswit[MARKUP_SWITCH]) { + printf(" --> Looks like this is HTML. 
Switching HTML mode ON.\n"); + pswit[MARKUP_SWITCH] = 1; + } + + if (verylongline > 0) { + printf(" --> %ld lines in this file are VERY long!\n", verylongline); + } + + /* If there are more non-PG spaced dashes than PG em-dashes, */ + /* assume it's deliberate */ + /* Current PG guidelines say don't use them, but older texts do,*/ + /* and some people insist on them whatever the guidelines say. */ + /* V.20 removed requirement that PG_space_emdash be greater than*/ + /* ten before turning off warnings about spaced dashes. */ + warn_dash = 1; + if (spacedash + non_PG_space_emdash > PG_space_emdash) { + warn_dash = 0; + printf(" --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash); + } + + /* if more than a quarter of characters are hi-bit, bug out */ + warn_bin = 1; + if (binlen * 4 > totlen) { + printf(" --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n"); + exit(1); + } + if (alphalen * 4 < totlen) { + printf(" --> This file does not appear to be text. Terminating. Best of luck with it!\n"); + exit(1); + } + if ((binlen * 100 > totlen) || (binlen > 100)) { + printf(" --> There are a lot of foreign letters here. 
Not reporting them.\n"); + warn_bin = 0; + } + + /* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */ + isDutch = 0; + if (Dutchcount > 50) { + isDutch = 1; + printf(" --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n"); + } + + isFrench = 0; + if (Frenchcount > 50) { + isFrench = 1; + printf(" --> This looks like French - switching off some doublepunct.\n"); + } + + if (firstline && footerline) + printf(" The PG header and footer appear to be already on.\n"); + else { + if (firstline) + printf(" The PG header is on - no footer.\n"); + if (footerline) + printf(" The PG footer is on - no header.\n"); + } + printf("\n"); + + /* V.22 George Davis asked for an override switch to force it to list everything */ + if (pswit[VERBOSE_SWITCH]) { + warn_bin = 1; + warn_short = 1; + warn_dotcomma = 1; + warn_long = 1; + warn_dash = 1; + warn_digit = 1; + warn_ast = 1; + warn_fslash = 1; + warn_hyphen = 1; + warn_endquote = 1; + printf(" *** Verbose output is ON -- you asked for it! ***\n"); + } + + if (isDutch) + warn_dash = 0; /* Frank suggested turning it REALLY off for Dutch */ + + if ((infile = fopen(filename, "rb")) == NULL) { + if (pswit[STDOUT_SWITCH]) + fprintf(stdout, "gutcheck: cannot open %s\n", filename); + else + fprintf(stderr, "gutcheck: cannot open %s\n", filename); + exit(1); + } + + if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */ + printf(" --> I don't really know where this text starts. \n"); + printf(" There are no reference points.\n"); + printf(" I'm going to have to report the header and footer as well.\n"); + firstline=0; + } + + + + /*****************************************************/ + /* */ + /* Here we go with the main pass. Hold onto yer hat! 
*/ + /* */ + /*****************************************************/ + + /* Re-init some variables we've dirtied */ + quot = squot = linecnt = 0; + laststart = CHAR_SPACE; + lastlen = lastblen = 0; + + while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) { + linecnt++; + if (linecnt == 1) isnewpara = 1; + if (pswit[DP_SWITCH]) + if (!strncmp(aline, "-----File: ", 11)) + continue; // skip DP page separators completely + if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) { + if (pswit[HEADER_SWITCH]) { + if (!strncmp(aline, "Title:", 6)) + printf(" %s\n", aline); + if (!strncmp (aline, "Author:", 7)) + printf(" %s\n", aline); + if (!strncmp(aline, "Release Date:", 13)) + printf(" %s\n", aline); + if (!strncmp(aline, "Edition:", 8)) + printf(" %s\n\n", aline); + } + continue; /* skip through the header */ + } + checked_linecnt++; + s = aline; + isemptyline = 1; /* assume the line is empty until proven otherwise */ + + /* If we are in a state of unbalanced quotes, and this line */ + /* doesn't begin with a quote, output the stored error message */ + /* If the -P switch was used, print the warning even if the */ + /* new para starts with quotes */ + /* Version .20 - if the new paragraph does start with a quote, */ + /* but is indented, I was giving a spurious error. Need to */ + /* check the first _non-space_ character on the line rather */ + /* than the first character when deciding whether the para */ + /* starts with a quote. Using *t for this. 
*/ + t = s; + while (*t == ' ') t++; + if (*dquote_err) + if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) { + if (!pswit[OVERVIEW_SWITCH]) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); + printf(dquote_err); + } + else + cnt_dquot++; + } + if (*squote_err) { + if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) { + if (!pswit[OVERVIEW_SWITCH]) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); + printf(squote_err); + } + else + cnt_squot++; + } + squot = 0; + } + if (*rbrack_err) { + if (!pswit[OVERVIEW_SWITCH]) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); + printf(rbrack_err); + } + else + cnt_brack++; + } + if (*sbrack_err) { + if (!pswit[OVERVIEW_SWITCH]) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); + printf(sbrack_err); + } + else + cnt_brack++; + } + if (*cbrack_err) { + if (!pswit[OVERVIEW_SWITCH]) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); + printf(cbrack_err); + } + else + cnt_brack++; + } + if (*unders_err) { + if (!pswit[OVERVIEW_SWITCH]) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart); + printf(unders_err); + } + else + cnt_brack++; + } + + *dquote_err = *squote_err = *rbrack_err = *cbrack_err = + *sbrack_err = *unders_err = 0; + + + /* look along the line, accumulate the count of quotes, and see */ + /* if this is an empty line - i.e. a line with nothing on it */ + /* but spaces. */ + /* V .12 also if line has just spaces, * and/or - on it, don't */ + /* count it, since empty lines with asterisks or dashes to */ + /* separate sections are common. */ + /* V .15 new single-quote checking - has to be better than the */ + /* previous version, but how much better? fingers crossed! 
*/ + /* V .20 add period to * and - as characters on a separator line*/ + s = aline; + while (*s) { + if (*s == CHAR_DQUOTE) quot++; + if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE) + if (s == aline) { /* at start of line, it can only be an openquote */ + if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */ + open_single_quote++; + } + else + if (gcisalpha(*(s-1)) && gcisalpha(*(s+1))) + ; /* do nothing! - it's definitely an apostrophe, not a quote */ + else /* it's outside a word - let's check it out */ + if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */ + if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */ + open_single_quote++; + } + else { /* now - is it a closequote? */ + guessquote = 0; /* accumulate clues */ + if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */ + guessquote += 1; + if (*(s-1) == 's') { /* looks like a plural apostrophe */ + guessquote -= 3; + if (*(s+1) == CHAR_SPACE) /* bonus marks! */ + guessquote -= 2; + } + } + else /* it doesn't have a letter either side */ + if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1)))) + guessquote += 8; /* looks like a closequote */ + else + guessquote += 1; + if (open_single_quote > close_single_quote) + guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */ + else + guessquote -= 1; + if (guessquote >= 0) + close_single_quote++; + } + + if (*s != CHAR_SPACE + && *s != '-' + && *s != '.' 
+ && *s != CHAR_ASTERISK + && *s != 13 + && *s != 10) isemptyline = 0; /* ignore lines like * * * as spacers */ + if (*s == CHAR_UNDERSCORE) c_unders++; + if (*s == CHAR_OPEN_CBRACK) c_brack++; + if (*s == CHAR_CLOSE_CBRACK) c_brack--; + if (*s == CHAR_OPEN_RBRACK) r_brack++; + if (*s == CHAR_CLOSE_RBRACK) r_brack--; + if (*s == CHAR_OPEN_SBRACK) s_brack++; + if (*s == CHAR_CLOSE_SBRACK) s_brack--; + s++; + } + + if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph */ + start_para_line = linecnt; + strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */ + parastart[79] = 0; + dquotepar = squotepar = 0; /* restart the quote count 0.98 */ + s = aline; + while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++; /* V.97 fixed bug - overran line and gave false warning - rare */ + if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */ + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1); + else + cnt_punct++; + } + isnewpara = 0; /* Signal the end of new para processing */ + } + + /* Check for an em-dash broken at line end */ + if (enddash && *aline == '-') { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column 1 - Broken em-dash?\n", linecnt); + else + cnt_punct++; + } + enddash = 0; + for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--); + if (s >= aline && *s == '-') + enddash = 1; + + + /* Check for invalid or questionable characters in the line */ + /* Anything above 127 is invalid for plain ASCII, and */ + /* non-printable control characters should also be flagged. */ + /* Tabs should generally not be there. */ + /* Jan 06, in 0.99: Hm. For some strange reason, I either */ + /* never created or deleted the check for unprintable */ + /* control characters. 
They should be reported even if */ + /* warn_bin is on, I think, and in full. */ + + for (s = aline; *s; s++) { + i = (unsigned char) *s; + if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i); + else + cnt_bin++; + } + } + + if (warn_bin) { + eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0; /* don't repeat multiple warnings on one line */ + for (s = aline; *s; s++) { + if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) { + i = *s; /* annoying kludge for signed chars */ + if (i < 0) i += 256; + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + if (i > 127 && i < 160) + printf(" Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i); + else + printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i); + else + cnt_bin++; + eNon_A = 1; + } + if (!eTab && *s == CHAR_TAB) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1); + else + cnt_odd++; + eTab = 1; + } + if (!eTilde && *s == CHAR_TILDE) { /* often used by OCR software to indicate an unrecognizable character */ + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1); + else + cnt_odd++; + eTilde = 1; + } + if (!eCarat && *s == CHAR_CARAT) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1); + else + cnt_odd++; + eCarat = 1; + } + if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if 
(!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1); + else + cnt_odd++; + eFSlash = 1; + } + /* report asterisks only in paranoid mode, since they're often deliberate */ + if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1); + else + cnt_odd++; + eAst = 1; + } + } + } + + /* Check for line too long */ + if (warn_long) { + if (strlen(aline) > LONGEST_PG_LINE) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline)); + else + cnt_long++; + } + } + + /* Check for line too short. */ + /* This one is a bit trickier to implement: we don't want to */ + /* flag the last line of a paragraph for being short, so we */ + /* have to wait until we know that our current line is a */ + /* "normal" line, then report the _previous_ line if it was too */ + /* short. We also don't want to report indented lines like */ + /* chapter heads or formatted quotations. We therefore keep */ + /* lastlen as the length of the last line examined, and */ + /* lastblen as the length of the last but one, and try to */ + /* suppress unnecessary warnings by checking that both were of */ + /* "normal" length. We keep the first character of the last */ + /* line in laststart, and if it was a space, we assume that the */ + /* formatting is deliberate. I can't figure out a way to */ + /* distinguish something like a quoted verse left-aligned or */ + /* the header or footer of a letter from a paragraph of short */ + /* lines - maybe if I examined the whole paragraph, and if the */ + /* para has less than, say, 8 lines and if all lines are short, */ + /* then just assume it's OK? 
Need to look at some texts to see */ + /* how often a formula like this would get the right result. */ + /* V0.99 changed the tolerance for length to ignore from 2 to 1 */ + if (warn_short) { + if (strlen(aline) > 1 + && lastlen > 1 && lastlen < SHORTEST_PG_LINE + && lastblen > 1 && lastblen > SHORTEST_PG_LINE + && laststart != CHAR_SPACE) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline)); + else + cnt_short++; + } + } + lastblen = lastlen; + lastlen = strlen(aline); + laststart = aline[0]; + + /* look for punctuation at start of line */ + if (*aline && strchr(".?!,;:", aline[0])) { /* if it's punctuation */ + if (strncmp(". . .", aline, 5)) { /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */ + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt); + else + cnt_punct++; + } + } + + /* Check for spaced em-dashes */ + /* V.20 must check _all_ occurrences of "--" on the line */ + /* hence the loop - even if the first double-dash is OK */ + /* there may be another that's wrong later on. 
*/ + if (warn_dash) { + s = aline; + while (strstr(s,"--")) { + if (*(strstr(s, "--")-1) == CHAR_SPACE || + (*(strstr(s, "--")+2) == CHAR_SPACE)) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1); + else + cnt_dash++; + } + s = strstr(s,"--") + 2; + } + } + + /* Check for spaced dashes */ + if (warn_dash) + if (strstr(aline," -")) { + if (*(strstr(aline, " -")+2) != '-') { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1); + else + cnt_dash++; + } + } + else + if (strstr(aline,"- ")) { + if (*(strstr(aline, "- ")-1) != '-') { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1); + else + cnt_dash++; + } + } + + /* v 0.99 */ + /* Check for unmarked paragraphs indicated by separate speakers */ + /* May well be false positive: */ + /* "Bravo!" "Wonderful!" called the crowd. */ + /* but useful all the same. */ + s = wrk; + *s = 0; + if (strstr(aline, "\" \"")) s = strstr(aline, "\" \""); + if (strstr(aline, "\" \"")) s = strstr(aline, "\" \""); + if (*s) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1); + else + cnt_punct++; + } + + + + /* Check for "to he" and other easy he/be errors */ + /* This is a very inadequate effort on the he/be problem, */ + /* but the phrase "to he" is always an error, whereas "to */ + /* be" is quite common. I chuckle when it does catch one! */ + /* Similarly, '"Quiet!", be said.' is a non-be error */ + /* V .18 - "to he" is _not_ always an error!: */ + /* "Where they went to he couldn't say." */ + /* but I'm leaving it in anyway. 
*/ + /* V .20 Another false positive: */ + /* What would "Cinderella" be without the . . . */ + /* and another "If he wants to he can see for himself." */ + /* V .21 Added " is be " and " be is " and " be was " */ + /* V .99 Added jeebies code -- removed again. */ + /* Is jeebies code worth adding? Rare to see he/be */ + /* errors with modern OCR. Separate program? Yes! */ + /* jeebies does the job without cluttering up this. */ + /* We do get a few more queryable pairs from the */ + /* project though -- they're cheap to implement. */ + /* Also added a column number for guiguts. */ + + s = wrk; + *s = 0; + if (strstr(aline," to he ")) s = strstr(aline," to he "); + if (strstr(aline,"\" be ")) s = strstr(aline,"\" be "); + if (strstr(aline,"\", be ")) s = strstr(aline,"\", be "); + if (strstr(aline," is be ")) s = strstr(aline," is be "); + if (strstr(aline," be is ")) s = strstr(aline," be is "); + if (strstr(aline," was be ")) s = strstr(aline," was be "); + if (strstr(aline," be would ")) s = strstr(aline," be would "); + if (strstr(aline," be could ")) s = strstr(aline," be could "); + if (*s) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1); + else + cnt_word++; + } + + s = wrk; + *s = 0; + if (strstr(aline," i bad ")) s = strstr(aline," i bad "); + if (strstr(aline," you bad ")) s = strstr(aline," you bad "); + if (strstr(aline," he bad ")) s = strstr(aline," he bad "); + if (strstr(aline," she bad ")) s = strstr(aline," she bad "); + if (strstr(aline," they bad ")) s = strstr(aline," they bad "); + if (strstr(aline," a had ")) s = strstr(aline," a had "); + if (strstr(aline," the had ")) s = strstr(aline," the had "); + if (*s) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1); + else + cnt_word++; + } + + + /* V .97 Added 
", hut " Not too common, hut pretty certain */ + /* V.99 changed to add a column number for guiguts */ + s = wrk; + *s = 0; + if (strstr(aline,", hut ")) s = strstr(aline,", hut "); + if (strstr(aline,"; hut ")) s = strstr(aline,"; hut "); + if (*s) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1); + else + cnt_word++; + } + + /* Special case - angled bracket in front of "From" placed there by an MTA */ + /* when sending an e-mail. V .21 */ + if (strstr(aline, ">From")) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1); + else + cnt_punct++; + } + + /* V 0.98 Check for a single character line - often an overflow from bad wrapping. */ + if (*aline && !*(aline+1)) { + if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline)) + ; /* nothing - ignore numerals alone on a line. */ + else { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column 1 - Query single character line\n", linecnt); + else + cnt_punct++; + } + } + + /* V 0.98 Check for I" - often should be ! */ + if (strstr(aline, " I\"")) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline); + else + cnt_punct++; + } + + /* V 0.98 Check for period without a capital letter. Cut-down from gutspell */ + /* Only works when it happens on a single line. */ + + if (pswit[PARANOID_SWITCH]) + for (t = s = aline; strstr(t,". ");) { + t = strstr(t, ". 
"); + if (t == s) { + t++; + continue; /* start of line punctuation is handled elsewhere */ + } + if (!gcisalpha(*(t-1))) { + t++; + continue; + } + if (isDutch) { /* For Frank & Jeroen -- 's Middags case */ + if (*(t+2) == CHAR_SQUOTE && + *(t+3)>='a' && *(t+3)<='z' && + *(t+4) == CHAR_SPACE && + *(t+5)>='A' && *(t+5)<='Z') { + t++; + continue; + } + } + s1 = t+2; + while (*s1 && !gcisalpha(*s1) && !isdigit(*s1)) + s1++; + if (*s1 >= 'a' && *s1 <= 'z') { /* we have something to investigate */ + istypo = 1; + for (s1 = t - 1; s1 >= s && + (gcisalpha(*s1) || gcisdigit(*s1) || + (*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */ + s1++; + for (i = 0; *s1 && *s1 != '.'; s1++, i++) + testword[i] = *s1; + testword[i] = 0; + for (i = 0; *abbrev[i]; i++) + if (!strcmp(testword, abbrev[i])) + istypo = 0; +// if (*testword >= 'A' && *testword <= 'Z') +// istypo = 0; + if (gcisdigit(*testword)) istypo = 0; + if (!*(testword+1)) istypo = 0; + if (isroman(testword)) istypo = 0; + if (istypo) { + istypo = 0; + for (i = 0; testword[i]; i++) + if (strchr(vowels, testword[i])) + istypo = 1; + } + if (istypo) { + isdup = 0; + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH]) + for (i = 0; i < qperiod_index; i++) + if (!strcmp(testword, qperiod[i])) { + isdup = 1; + } + if (!isdup) { + if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) { + strcpy(qperiod[qperiod_index], testword); + qperiod_index++; + } + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1); + else + cnt_punct++; + } + } + } + t++; + } + + + if (pswit[TYPO_SWITCH]) { /* Should have put this condition in at the start of 0.99. Duh! 
*/ + /* Check for words usually not followed by punctuation 0.99 */ + for (s = aline; *s;) { + wordstart = s; + s = getaword(s, inword); + if (!*inword) continue; + lowerit(inword); + for (i = 0; *nocomma[i]; i++) + if (!strcmp(inword, nocomma[i])) { + if (*s == ',' || *s == ';' || *s == ':') { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword); + else + cnt_punct++; + } + } + for (i = 0; *noperiod[i]; i++) + if (!strcmp(inword, noperiod[i])) { + if (*s == '.' || *s == '!') { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword); + else + cnt_punct++; + } + } + } + } + + + + /* Check for commonly mistyped words, and digits like 0 for O in a word */ + for (s = aline; *s;) { + wordstart = s; + s = getaword(s, inword); + if (!*inword) continue; /* don't bother with empty lines */ + if (mixdigit(inword)) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword); + else + cnt_word++; + } + + /* put the word through a series of tests for likely typos and OCR errors */ + /* V.21 I had allowed lots of typo-checking even with the typo switch */ + /* turned off, but I really should disallow reporting of them when */ + /* the switch is off. Hence the "if" below. */ + if (pswit[TYPO_SWITCH]) { + istypo = 0; + strcpy(testword, inword); + alower = 0; + for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */ + if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1; + if (alower && testword[i] >= 'A' && testword[i] <= 'Z') { + /* we have an uppercase mid-word. 
However, there are common cases: */ + /* Mac and Mc like McGill */ + /* French contractions like l'Abbe */ + if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') || + (i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') || + (i > 0 && testword[i-1] == CHAR_SQUOTE)) + ; /* do nothing! */ + + else { /* V.97 - remove separate case of uppercase within word so that */ + /* names like VanAllen fall into qword_index and get reported only once */ + istypo = 1; + } + } + testword[i] = (char)tolower(testword[i]); + } + + /* check for certain unlikely two-letter combinations at word start and end */ + /* V.0.97 - this replaces individual hardcoded checks in previous versions */ + if (strlen(testword) > 1) { + for (i = 0; *nostart[i]; i++) + if (!strncmp(testword, nostart[i], 2)) + istypo = 1; + for (i = 0; *noend[i]; i++) + if (!strncmp(testword + strlen(testword) -2, noend[i], 2)) + istypo = 1; + } + + + /* ght is common, gbt never. Like that. */ + if (strstr(testword, "cb")) istypo = 1; + if (strstr(testword, "gbt")) istypo = 1; + if (strstr(testword, "pbt")) istypo = 1; + if (strstr(testword, "tbs")) istypo = 1; + if (strstr(testword, "mrn")) istypo = 1; + if (strstr(testword, "ahle")) istypo = 1; + if (strstr(testword, "ihle")) istypo = 1; + + /* "TBE" does happen - like HEARTBEAT - but uncommon. */ + /* Also "TBI" - frostbite, outbid - but uncommon. */ + /* Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals, */ + /* but these are covered in V.20. "ii" is a common scanno. */ + if (strstr(testword, "tbi")) istypo = 1; + if (strstr(testword, "tbe")) istypo = 1; + if (strstr(testword, "ii")) istypo = 1; + + /* check for no vowels or no consonants. */ + /* If none, flag a typo */ + if (!istypo && strlen(testword)>1) { + vowel = consonant = 0; + for (i = 0; testword[i]; i++) + if (testword[i] == 'y' || gcisdigit(testword[i])) { /* Yah, this is loose. 
*/ + vowel++; + consonant++; + } + else + if (strchr(vowels, testword[i])) vowel++; + else consonant++; + if (!vowel || !consonant) { + istypo = 1; + } + } + + /* now exclude the word from being reported if it's in */ + /* the okword list */ + for (i = 0; *okword[i]; i++) + if (!strcmp(testword, okword[i])) + istypo = 0; + + /* what looks like a typo may be a Roman numeral. Exclude these */ + if (istypo) + if (isroman(testword)) + istypo = 0; + + /* check the manual list of typos */ + if (!istypo) + for (i = 0; *typo[i]; i++) + if (!strcmp(testword, typo[i])) + istypo = 1; + + + /* V.21 - check lowercase s and l - special cases */ + /* V.98 - added "i" and "m" */ + /* V.99 - added "j" often a semi-colon gone wrong */ + /* - and "d" for a missing apostrophe - he d */ + /* - and "n" for "in" */ + if (!istypo && strlen(testword) == 1) + if (strchr("slmijdn", *inword)) + istypo = 1; + + + if (istypo) { + isdup = 0; + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH]) + for (i = 0; i < qword_index; i++) + if (!strcmp(testword, qword[i])) { + isdup = 1; + ++dupcnt[i]; + } + if (!isdup) { + if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) { + strcpy(qword[qword_index], testword); + qword_index++; + } + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) { + printf(" Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword); + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH]) + printf(" - not reporting duplicates"); + printf("\n"); + } + else + cnt_word++; + } + } + } /* end of typo-checking */ + + /* check the user's list of typos */ + if (!istypo) + if (usertypo_count) + for (i = 0; i < usertypo_count; i++) + if (!strcmp(testword, usertypo[i])) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword); + } + + + + if 
(pswit[PARANOID_SWITCH] && warn_digit) { /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/ + if (!strcmp(inword, "0") || !strcmp(inword, "1")) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword); + else + cnt_word++; + } + } + } + + /* look for added or missing spaces around punctuation and quotes */ + /* If there is a punctuation character like ! with no space on */ + /* either side, suspect a missing!space. If there are spaces on */ + /* both sides , assume a typo. If we see a double quote with no */ + /* space or punctuation on either side of it, assume unspaced */ + /* quotes "like"this. */ + llen = strlen(aline); + for (i = 1; i < llen; i++) { /* for each character in the line after the first */ + if (strchr(".?!,;:_", aline[i])) { /* if it's punctuation */ + isacro = 0; /* we need to suppress warnings for acronyms like M.D. */ + isellipsis = 0; /* we need to suppress warnings for ellipsis . . . */ + if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) || /* if there are letters on both sides of it or ... 
*/ + (gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */ + if (aline[i] == '.') { + if (i > 2) + if (aline[i-2] == '.') isacro = 1; + if (i + 2 < llen) + if (aline[i+2] == '.') isacro = 1; + } + if (!isacro) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Missing space?\n", linecnt, i+1); + else + cnt_punct++; + } + } + if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */ + if (aline[i] == '.') { + if (i > 2) + if (aline[i-2] == '.') isellipsis = 1; + if (i + 2 < llen) + if (aline[i+2] == '.') isellipsis = 1; + } + if (!isemptyline && !isellipsis) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1); + else + cnt_punct++; + } + } + } + } + + /* 0.98 -- split out the characters that CANNOT be preceded by space */ + llen = strlen(aline); + for (i = 1; i < llen; i++) { /* for each character in the line after the first */ + if (strchr("?!,;:", aline[i])) { /* if it's punctuation that _cannot_ have a space before it */ + if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */ + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1); + else + cnt_punct++; + } + } + } + + + /* 0.99 -- special case " .X" where X is any alpha. */ + /* This plugs a hole in the acronym code above. Inelegant, but maintainable. 
*/ + llen = strlen(aline); + for (i = 1; i < llen; i++) { /* for each character in the line after the first */ + if (aline[i] == '.') { /* if it's a period */ + if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */ + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1); + else + cnt_punct++; + } + } + } + + + + + /* v.21 breaking out the search for unspaced doublequotes */ + /* This is not as efficient, but it's more maintainable */ + /* V.97 added underscore to the list of characters not to query, */ + /* since underscores are commonly used as italics indicators. */ + /* V.98 Added slash as well, same reason. */ + for (i = 1; i < llen; i++) { /* for each character in the line after the first */ + if (aline[i] == CHAR_DQUOTE) { + if ((!strchr(" _-.'`,;:!/([{?}])", aline[i-1]) && + !strchr(" _-.'`,;:!/([{?}])", aline[i+1]) && + aline[i+1] != 0 + || (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Unspaced quotes?\n", linecnt, i+1); + else + cnt_punct++; + } + } + } + + + /* v.98 check parity of quotes */ + /* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". 
*/ + for (s = aline; *s; s++) { + if (*s == CHAR_DQUOTE) { + if (!(dquotepar = !dquotepar)) { /* parity even */ + if (!strchr("_-.'`/,;:!?)]} ", *(s+1))) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1); + else + cnt_punct++; + } + } + else { /* parity odd */ + if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$", *(s+1)) || !*(s+1)) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1); + else + cnt_punct++; + } + } + } + } + + if (*aline == CHAR_DQUOTE) { + if (strchr(",;:!?)]} ", aline[1])) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1); + else + cnt_punct++; + } + } + + if (pswit[SQUOTE_SWITCH]) + for (s = aline; *s; s++) { + if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE) + && ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) { + if (!(squotepar = !squotepar)) { /* parity even */ + if (!strchr("_-.'`/\",;:!?)]} ", *(s+1))) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1); + else + cnt_punct++; + } + } + else { /* parity odd */ + if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`", *(s+1)) || !*(s+1)) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1); + else + cnt_punct++; + } + } + } + } + + + /* v.20 also look for double punctuation like ,. or ,, */ + /* Thanks to DW for the suggestion! */ + /* I'm putting this in a separate loop for clarity */ + /* In books with references, ".," and ".;" are common */ + /* e.g. 
"etc., etc.," and vol. 1.; vol 3.; */ + /* OTOH, from my initial tests, there are also fairly */ + /* common errors. What to do? Make these cases paranoid? */ + /* V.21 ".," is the most common, so invented warn_dotcomma */ + /* to suppress detailed reporting if it occurs often */ + llen = strlen(aline); + for (i = 0; i < llen; i++) /* for each character in the line */ + if (strchr(".?!,;:", aline[i]) /* if it's punctuation */ + && (strchr(".?!,;:", aline[i+1])) + && aline[i] && aline[i+1]) /* followed by punctuation, it's a query, unless . . . */ + if ( + (aline[i] == aline[i+1] + && (aline[i] == '.' || aline[i] == '?' || aline[i] == '!')) + || (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',') + || (isFrench && !strncmp(aline+i, ",...", 4)) + || (isFrench && !strncmp(aline+i, "...,", 4)) + || (isFrench && !strncmp(aline+i, ";...", 4)) + || (isFrench && !strncmp(aline+i, "...;", 4)) + || (isFrench && !strncmp(aline+i, ":...", 4)) + || (isFrench && !strncmp(aline+i, "...:", 4)) + || (isFrench && !strncmp(aline+i, "!...", 4)) + || (isFrench && !strncmp(aline+i, "...!", 4)) + || (isFrench && !strncmp(aline+i, "?...", 4)) + || (isFrench && !strncmp(aline+i, "...?", 4)) + ) { + if ((isFrench && !strncmp(aline+i, ",...", 4)) /* could this BE any more awkward? */ + || (isFrench && !strncmp(aline+i, "...,", 4)) + || (isFrench && !strncmp(aline+i, ";...", 4)) + || (isFrench && !strncmp(aline+i, "...;", 4)) + || (isFrench && !strncmp(aline+i, ":...", 4)) + || (isFrench && !strncmp(aline+i, "...:", 4)) + || (isFrench && !strncmp(aline+i, "!...", 4)) + || (isFrench && !strncmp(aline+i, "...!", 4)) + || (isFrench && !strncmp(aline+i, "?...", 4)) + || (isFrench && !strncmp(aline+i, "...?", 4))) + i +=4; + ; /* do nothing for .. !! and ?? 
which can be legit */ + } + else { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Double punctuation?\n", linecnt, i+1); + else + cnt_punct++; + } + + /* v.21 breaking out the search for spaced doublequotes */ + /* This is not as efficient, but it's more maintainable */ + s = aline; + while (strstr(s," \" ")) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1)); + else + cnt_punct++; + s = strstr(s," \" ") + 2; + } + + /* v.20 also look for spaced singlequotes ' and ` */ + s = aline; + while (strstr(s," ' ")) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1)); + else + cnt_punct++; + s = strstr(s," ' ") + 2; + } + + s = aline; + while (strstr(s," ` ")) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1)); + else + cnt_punct++; + s = strstr(s," ` ") + 2; + } + + /* v.99 check special case of 'S instead of 's at end of word */ + s = aline + 1; + while (*s) { + if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z') { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2)); + else + cnt_punct++; + } + s++; + } + + + /* v.21 Now check special cases - start and end of line - */ + /* for single and double quotes. Start is sometimes [sic] */ + /* but better to query it anyway. 
*/ + /* While I'm here, check for dash at end of line */ + llen = strlen(aline); + if (llen > 1) { + if (aline[llen-1] == CHAR_DQUOTE || + aline[llen-1] == CHAR_SQUOTE || + aline[llen-1] == CHAR_OPEN_SQUOTE) + if (aline[llen-2] == CHAR_SPACE) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Spaced quote?\n", linecnt, llen); + else + cnt_punct++; + } + + /* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */ + /* Wrongspaced quotes test also catches it for " */ + if (aline[0] == CHAR_SQUOTE || + aline[0] == CHAR_OPEN_SQUOTE) + if (aline[1] == CHAR_SPACE) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column 1 - Spaced quote?\n", linecnt); + else + cnt_punct++; + } + /* dash at end of line may well be legit - paranoid mode only */ + /* and don't report em-dash at line-end */ + if (pswit[PARANOID_SWITCH] && warn_hyphen) { + for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--); + if (aline[i] == '-' && aline[i-1] != '-') { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Hyphen at end of line?\n", linecnt, i); + } + } + } + + /* v.21 also look for brackets surrounded by alpha */ + /* Brackets are often unspaced, but shouldn't be surrounded by alpha. */ + /* If so, suspect a scanno like "a]most" */ + llen = strlen(aline); + for (i = 1; i < llen-1; i++) { /* for each character in the line except 1st & last*/ + if (strchr("{[()]}", aline[i]) /* if it's a bracket */ + && gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Unspaced bracket?\n", linecnt, i); + else + cnt_punct++; + } + } + /* The "Cinderella" case, back in again! 
:-S Give it another shot */ + if (warn_endquote) { + llen = strlen(aline); + for (i = 1; i < llen; i++) { /* for each character in the line except 1st */ + if (aline[i] == CHAR_DQUOTE) + if (isalpha(aline[i-1])) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - endquote missing punctuation?\n", linecnt, i); + else + cnt_punct++; + } + } + } + + llen = strlen(aline); + + /* Check for */ + /* If there is a < in the line, followed at some point */ + /* by a > then we suspect HTML */ + if (strstr(aline, "<") && strstr(aline, ">")) { + i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1); + if (i > 0) { + strncpy(wrk, strstr(aline, "<"), i); + wrk[i] = 0; + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk); + else + cnt_html++; + } + } + + /* Check for &symbol; HTML */ + /* If there is a & in the line, followed at */ + /* some point by a ; then we suspect HTML */ + if (strstr(aline, "&") && strstr(aline, ";")) { + i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1); + for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++) + if (*s == CHAR_SPACE) i = 0; /* 0.99 don't report "Jones & Son;" */ + if (i > 0) { + strncpy(wrk, strstr(aline,"&"), i); + wrk[i] = 0; + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk); + else + cnt_html++; + } + } + + /* At end of paragraph, check for mismatched quotes. */ + /* We don't want to report an error immediately, since it is a */ + /* common convention to omit the quotes at end of paragraph if */ + /* the next paragraph is a continuation of the same speaker. 
*/ + /* Where this is the case, the next para should begin with a */ + /* quote, so we store the warning message and only display it */ + /* at the top of the next iteration if the new para doesn't */ + /* start with a quote. */ + /* The -p switch overrides this default, and warns of unclosed */ + /* quotes on _every_ paragraph, whether the next begins with a */ + /* quote or not. */ + /* Version .16 - only report mismatched single quotes if */ + /* an open_single_quotes was found. */ + + if (isemptyline) { /* end of para - add up the totals */ + if (quot % 2) + sprintf(dquote_err, " Line %ld - Mismatched quotes\n", linecnt); + if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) ) + sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt); + if (pswit[SQUOTE_SWITCH] && open_single_quote + && (open_single_quote != close_single_quote) + && (open_single_quote != close_single_quote +1) ) + squot = 1; /* flag it to be noted regardless of the first char of the next para */ + if (r_brack) + sprintf(rbrack_err, " Line %ld - Mismatched round brackets?\n", linecnt); + if (s_brack) + sprintf(sbrack_err, " Line %ld - Mismatched square brackets?\n", linecnt); + if (c_brack) + sprintf(cbrack_err, " Line %ld - Mismatched curly brackets?\n", linecnt); + if (c_unders % 2) + sprintf(unders_err, " Line %ld - Mismatched underscores?\n", linecnt); + quot = s_brack = c_brack = r_brack = c_unders = + open_single_quote = close_single_quote = 0; + isnewpara = 1; /* let the next iteration know that it's starting a new para */ + } + + /* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */ + /* by working back through prevline. DW. */ + /* Hmmm. Need to check this only for "normal" paras. */ + /* So what is a "normal" para? ouch! */ + /* Not normal if one-liner (chapter headings, etc.) 
*/ + /* Not normal if doesn't contain at least one locase letter */ + /* Not normal if starts with space */ + + /* 0.99 tighten up on para end checks. Disallow comma and */ + /* semi-colon. Check for legit para end before quotes. */ + if (isemptyline) { /* end of para */ + for (s = prevline, i = 0; *s && !i; s++) + if (gcisletter(*s)) + i = 1; /* use i to indicate the presence of a letter on the line */ + /* This next "if" is a problem. */ + /* If I say "start_para_line <= linecnt - 1", that includes one-line */ + /* "paragraphs" like chapter heads. Lotsa false positives. */ + /* If I say "start_para_line < linecnt - 1" it doesn't, but then it */ + /* misses genuine one-line paragraphs. */ + /* So what do I do? */ + if (i + && lastblen > 2 + && start_para_line < linecnt - 1 + && *prevline > CHAR_SPACE + ) { + for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--); + for ( ; i > 0; i--) { + if (gcisalpha(prevline[i])) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline)); + else + cnt_punct++; + break; + } + if (strchr("-.:!([{?}])", prevline[i])) + break; + } + } + } + strcpy(prevline, aline); + } + fclose (infile); + if (!pswit[OVERVIEW_SWITCH]) + for (i = 0; i < MAX_QWORD; i++) + if (dupcnt[i]) + printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s"); +} + + + +/* flgets - get one line from the input stream, checking for */ +/* the existence of exactly one CR/LF line-end per line. */ +/* Returns a pointer to the line. 
*/ + +char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt) +{ + char c; + int len, isCR, cint; + + *theline = 0; + len = isCR = 0; + c = cint = fgetc(thefile); + do { + if (cint == EOF) + return (NULL); + if (c == 10) /* either way, it's end of line */ + if (isCR) + break; + else { /* Error - a LF without a preceding CR */ + if (pswit[LINE_END_SWITCH]) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld - No CR?\n", lcnt); + else + cnt_lineend++; + } + break; + } + if (c == 13) { + if (isCR) { /* Error - two successive CRs */ + if (pswit[LINE_END_SWITCH]) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld - Two successive CRs?\n", lcnt); + else + cnt_lineend++; + } + } + isCR = 1; + } + else { + if (pswit[LINE_END_SWITCH] && isCR) { + if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - CR without LF?\n", lcnt, len+1); + else + cnt_lineend++; + } + theline[len] = c; + len++; + theline[len] = 0; + isCR = 0; + } + c = cint = fgetc(thefile); + } while(len < maxlen); + if (pswit[MARKUP_SWITCH]) + postprocess_for_HTML(theline); + if (pswit[DP_SWITCH]) + postprocess_for_DP(theline); + return(theline); +} + + + + +/* mixdigit - takes a "word" as a parameter, and checks whether it */ +/* contains a mixture of alpha and digits. Generally, this is an */ +/* error, but may not be for cases like 4th or L5 12s. 3d. */ +/* Returns 0 if no error found, 1 if error. */ + +int mixdigit(char *checkword) /* check for digits like 1 or 0 in words */ +{ + int wehaveadigit, wehavealetter, firstdigits, query, wl; + char *s; + + + wehaveadigit = wehavealetter = query = 0; + for (s = checkword; *s; s++) + if (gcisalpha(*s)) + wehavealetter = 1; + else + if (gcisdigit(*s)) + wehaveadigit = 1; + if (wehaveadigit && wehavealetter) { /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." 
*/ + query = 1; + wl = strlen(checkword); + for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++) + ; + /* digits, ending in st, rd, nd, th of either case */ + /* 0.99 donovan points out an error below. Turns out */ + /* I was using matchword like strcmp when the */ + /* return values are different! Duh. */ + if (firstdigits + 2 == wl && + (matchword(checkword + wl - 2, "st") + || matchword(checkword + wl - 2, "rd") + || matchword(checkword + wl - 2, "nd") + || matchword(checkword + wl - 2, "th")) + ) + query = 0; + if (firstdigits + 3 == wl && + (matchword(checkword + wl - 3, "sts") + || matchword(checkword + wl - 3, "rds") + || matchword(checkword + wl - 3, "nds") + || matchword(checkword + wl - 3, "ths")) + ) + query = 0; + if (firstdigits + 3 == wl && + (matchword(checkword + wl - 4, "stly") + || matchword(checkword + wl - 4, "rdly") + || matchword(checkword + wl - 4, "ndly") + || matchword(checkword + wl - 4, "thly")) + ) + query = 0; + + /* digits, ending in l, L, s or d */ + if (firstdigits + 1 == wl && + (checkword[wl-1] == 'l' + || checkword[wl-1] == 'L' + || checkword[wl-1] == 's' + || checkword[wl-1] == 'd')) + query = 0; + /* L at the start of a number, representing Britsh pounds, like L500 */ + /* This is cute. We know the current word is mixeddigit. If the first */ + /* letter is L, there must be at least one digit following. If both */ + /* digits and letters follow, we have a genuine error, else we have a */ + /* capital L followed by digits, and we accept that as a non-error. */ + if (checkword[0] == 'L') + if (!mixdigit(checkword+1)) + query = 0; + } + return (query); +} + + + + +/* getaword - extracts the first/next "word" from the line, and puts */ +/* it into "thisword". A word is defined as one English word unit */ +/* -- or at least that's what I'm trying for. */ +/* Returns a pointer to the position in the line where we will start */ +/* looking for the next word. 
*/
+
+char *getaword(char *fromline, char *thisword)
+{
+    int i, wordlen;
+    char *s;
+
+    wordlen = 0;
+    /* skip anything that is neither a digit nor a letter */
+    for ( ; !gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline ; fromline++ );
+
+    /* V .20 */
+    /* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35. */
+    /* Especially yucky is the case of L1,000 */
+    /* I hate this, and I see other ways, but I don't see that any is _better_.*/
+    /* This section looks for a pattern of characters including a digit */
+    /* followed by a comma or period followed by one or more digits. */
+    /* If found, it returns this whole pattern as a word; otherwise we discard */
+    /* the results and resume our normal programming. */
+    s = fromline;
+    for ( ; (gcisdigit(*s) || gcisalpha(*s) || *s == ',' || *s == '.') && wordlen < MAXWORDLEN ; s++ ) {
+        thisword[wordlen] = *s;
+        wordlen++;
+    }
+    thisword[wordlen] = 0;
+    for (i = 1; i < wordlen -1; i++) {
+        if (thisword[i] == '.' || thisword[i] == ',') {
+            /* need a digit on BOTH sides of the separator; the loop bound */
+            /* (i < wordlen - 1) keeps thisword[i+1] inside the copied word */
+            if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i+1])) { /* we have one of the damned things */
+                fromline = s;
+                return(fromline);
+            }
+        }
+    }
+
+    /* we didn't find a punctuated number - do the regular getword thing */
+    wordlen = 0;
+    for ( ; (gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
+        thisword[wordlen] = *fromline;
+        wordlen++;
+    }
+    thisword[wordlen] = 0;
+    return(fromline);
+}
+
+
+
+
+
+/* matchword - just a case-insensitive string matcher */
+/* yes, I know this is not efficient. I'll worry about */
+/* that when I have a clear idea where I'm going with it.*/
+
+int matchword(char *checkfor, char *thisword)
+{
+    unsigned int ismatch, i;
+
+    if (strlen(checkfor) != strlen(thisword)) return(0);
+
+    ismatch = 1; /* assume a match until we find a difference */
+    for (i = 0; i ='A' && *theline <='Z')
+        *theline += 32;
+}
+
+
+/* Is this word a Roman Numeral? */
+/* v 0.99 improved to be better. 
It still doesn't actually */
+/* validate that the number is a valid Roman Numeral -- for example */
+/* it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not*/
+/* what we're here to do. If it passes this, it LOOKS like a Roman */
+/* numeral. Anyway, the actual Romans were pretty tolerant of bad */
+/* arithmetic, or expressions thereof, except when it came to taxes.*/
+/* Allow any number of M, an optional D, an optional CM or CD, */
+/* any number of optional Cs, an optional XL or an optional XC, an */
+/* optional IX or IV, an optional V and any number of optional Is. */
+/* Good enough for jazz chords. */
+/* Only lower-case letters are matched. Returns 1 if the whole */
+/* string fits the pattern, 0 otherwise (including NULL or ""). */
+
+int isroman(char *t)
+{
+    if (!t || !*t) return (0);
+
+    /* Each step consumes one optional group; a NUL never equals a */
+    /* letter, so the loops stop at the end of the string unaided. */
+    while (*t == 'm') t++;
+    if (*t == 'd') t++;
+    if (*t == 'c' && *(t+1) == 'm') t+=2;
+    if (*t == 'c' && *(t+1) == 'd') t+=2;
+    while (*t == 'c') t++;
+    if (*t == 'x' && *(t+1) == 'l') t+=2;
+    if (*t == 'x' && *(t+1) == 'c') t+=2;
+    if (*t == 'l') t++;
+    while (*t == 'x') t++;
+    if (*t == 'i' && *(t+1) == 'x') t+=2;
+    if (*t == 'i' && *(t+1) == 'v') t+=2;
+    if (*t == 'v') t++;
+    while (*t == 'i') t++;
+    if (!*t) return (1);    /* everything was consumed: looks Roman */
+
+    return(0);
+}
+
+
+
+
+/* gcisalpha is a special version that is somewhat lenient on 8-bit texts. */
+/* If we use the standard isalpha() function, 8-bit accented characters break */
+/* words, so that tete with accented characters appears to be two words, "t" */
+/* and "t", with 8-bit characters between them. This causes over-reporting of */
+/* errors. gcisalpha() recognizes accented letters from the CP1252 (Windows) */
+/* and ISO-8859-1 character sets, which are the most common PG 8-bit types. 
*/
+
+int gcisalpha(unsigned char c)
+{
+    /* Returns 1 for ASCII letters, for byte values 140, 142, 156, 158 and 159, */
+    /* and for 192-255 except 208, 215, 222, 240, 247 and 254; otherwise 0. */
+    if (c >='a' && c <='z') return(1);
+    if (c >='A' && c <='Z') return(1);
+    if (c < 140) return(0);
+    if (c >=192 && c != 208 && c != 215 && c != 222 && c != 240 && c != 247 && c != 254) return(1);
+    if (c == 140 || c == 142 || c == 156 || c == 158 || c == 159) return (1);
+    return(0);
+}
+
+/* gcisdigit is a special version that doesn't get confused in 8-bit texts. */
+/* Returns 1 for '0'..'9', 0 for anything else. */
+int gcisdigit(unsigned char c)
+{
+    if (c >= '0' && c <='9') return(1);
+    return(0);
+}
+
+/* gcisletter is a special version that doesn't get confused in 8-bit texts. */
+/* Yeah, we're ISO-8859-1-specific. So sue me. */
+/* Returns 1 for ASCII letters and for every byte value >= 192, else 0. */
+int gcisletter(unsigned char c)
+{
+    if ((c >= 'A' && c <='Z') || (c >= 'a' && c <='z') || c >= 192) return(1);
+    return(0);
+}
+
+
+
+
+/* gcstrchr wraps strchr to return NULL if the character being searched for is zero */
+
+char *gcstrchr(char *s, char c)
+{
+    if (c == 0) return(NULL);
+    return(strchr(s,c));
+}
+
+/* postprocess_for_DP is derived from postprocess_for_HTML */
+/* It is invoked with the -d switch from flgets(). */
+/* It simply "removes" from the line a hard-coded set of common */
+/* DP-specific tags, so that the line passed to the main routine has*/
+/* been pre-cleaned of DP markup. */
+
+void postprocess_for_DP(char *theline)
+{
+
+    char *s, *t;
+    int i;
+
+    if (!*theline)
+        return;
+
+    /* DPmarkup is walked until its first empty-string entry */
+    for (i = 0; *DPmarkup[i]; i++) {
+        s = strstr(theline, DPmarkup[i]);
+        while (s) {
+            /* delete this occurrence by shifting the rest of the line left in place */
+            t = s + strlen(DPmarkup[i]);
+            while (*t) {
+                *s = *t;
+                t++; s++;
+            }
+            *s = 0;
+            /* search again from the start of the line for further occurrences */
+            s = strstr(theline, DPmarkup[i]);
+        }
+    }
+
+}
+
+
+/* postprocess_for_HTML is, at the moment (0.97), a very nasty */
+/* short-term fix for Charlz. Nasty, nasty, nasty. */
+/* It is invoked with the -m switch from flgets(). */
+/* It simply "removes" from the line a hard-coded set of common */
+/* HTML tags and "replaces" a hard-coded set of common HTML */
+/* entities, so that the line passed to the main routine has */
+/* been pre-cleaned of HTML. 
This is _so_ not the right way to */
+/* deal with HTML, but what Charlz needs now is not HTML handling */
+/* proper: just ignoring tags and some others. */
+/* To be revisited in future releases! */
+
+void postprocess_for_HTML(char *theline)
+{
+
+    /* strip recognized tags one at a time, then entities one at a time; */
+    /* each helper returns NULL once it has nothing more to change */
+    if (strstr(theline, "<") && strstr(theline, ">"))
+        while (losemarkup(theline))
+            ;
+    while (loseentities(theline))
+        ;
+}
+
+/* losemarkup - removes one tag from the line if the text after the */
+/* first '<' matches an entry in markup[]. Returns a pointer to where */
+/* the tag was, or NULL if nothing was removed. */
+char *losemarkup(char *theline)
+{
+    char *s, *t;
+    int i;
+
+    if (!*theline)
+        return(NULL);
+
+    s = strstr(theline, "<");
+    t = strstr(theline, ">");
+    if (!s || !t) return(NULL);
+    for (i = 0; *markup[i]; i++)
+        if (!tagcomp(s+1, markup[i])) {
+            if (!*(t+1)) {
+                /* the '>' is the last character: just cut the line at '<' */
+                *s = 0;
+                return(s);
+            }
+            else
+                if (t > s) {
+                    /* source and destination overlap, so strcpy is undefined here; */
+                    /* memmove copies the tail (including its NUL) safely */
+                    memmove(s, t+1, strlen(t+1) + 1);
+                    return(s);
+                }
+        }
+    /* it's an unrecognized tag */
+    return(NULL);
+}
+
+/* loseentities - replaces the first HTML entity found in the line */
+/* (named form first, then numeric form) with its text equivalent. */
+/* Returns theline if a replacement was made, NULL if none was made */
+/* or if memory could not be allocated. */
+char *loseentities(char *theline)
+{
+    int i;
+    char *s, *t;
+
+    if (!*theline)
+        return(NULL);
+
+    for (i = 0; *entities[i].htmlent; i++) {
+        s = strstr(theline, entities[i].htmlent);
+        if (s) {
+            /* t receives the text after the entity; strlen(s) bytes are enough */
+            /* because the (non-empty) entity itself is not copied */
+            t = malloc((size_t)strlen(s));
+            if (!t) return(NULL);
+            strcpy(t, s + strlen(entities[i].htmlent));
+            /* NOTE(review): assumes textent is never longer than the entity it */
+            /* replaces, otherwise the line grows in place - confirm against entities[] */
+            strcpy(s, entities[i].textent);
+            strcat(s, t);
+            free(t);
+            return(theline);
+        }
+    }
+
+    /* V0.97 Duh. Forgot to check the htmlnum member */
+    for (i = 0; *entities[i].htmlnum; i++) {
+        s = strstr(theline, entities[i].htmlnum);
+        if (s) {
+            t = malloc((size_t)strlen(s));
+            if (!t) return(NULL);
+            strcpy(t, s + strlen(entities[i].htmlnum));
+            strcpy(s, entities[i].textent);
+            strcat(s, t);
+            free(t);
+            return(theline);
+        }
+    }
+    return(NULL);
+}
+
+
+/* tagcomp - case-insensitive prefix comparison of the text after '<' */
+/* (an optional leading '/' is skipped) with basetag. Returns 0 if no */
+/* difference is found before either string ends, 1 otherwise. */
+int tagcomp(char *strin, char *basetag)
+{
+    char *s, *t;
+
+    s = basetag;
+    t = strin;
+    if (*t == '/') t++; /* ignore a slash */
+    while (*s && *t) {
+        if (tolower(*s) != tolower(*t)) return(1);
+        s++; t++;
+    }
+    /* OK, we have < followed by a valid tag start */
+    /* should I do something about length? */
+    /* this is messy. The length of an tag is */
+    /* limited, but a could go on for miles */
+    /* so I'd have to parse the tags . . . ugh. */
+    /* It isn't what Charlz needs now, so mark it */
+    /* as 'pending'. */
+    return(0);
+}
+
+void proghelp() /* explain program usage here */
+{
+    fputs("V. 0.991. Copyright 2000-2005 Jim Tinsley .\n",stderr);
+    fputs("Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n", stderr);
+    fputs("This is Free Software; you may redistribute it under certain conditions (GPL);\n", stderr);
+    fputs("read the file COPYING for details.\n\n", stderr);
+    fputs("Usage is: gutcheck [-setpxloyhud] filename\n",stderr);
+    fputs(" where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",stderr);
+    fputs(" -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",stderr);
+    fputs(" -o just displays overview without detail, -h echoes header fields\n",stderr);
+    fputs(" -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",stderr);
+    fputs(" -d ignores DP-specific markup,\n",stderr);
+    fputs(" -u uses a file gutcheck.typ to query user-defined possible typos\n",stderr);
+    fputs("Sample usage: gutcheck warpeace.txt \n",stderr);
+    fputs("\n",stderr);
+    fputs("Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n", stderr);
+    fputs("Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",stderr);
+    fputs("characters like accented letters, lines longer than 75 or shorter than 55,\n",stderr);
+    fputs("unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",stderr);
+    fputs("HTML tags, some likely typos. It is NOT a substitute for human judgement.\n",stderr);
+    fputs("\n",stderr);
+}
+
+
+
+/*********************************************************************
+ Revision History:
+
+ 04/22/01 Cleaned up some stuff and released .10
+
+ ---------------
+
+ 05/09/01 Added the typo list, added two extra cases of he/be error,
+ added -p switch, OPEN_SINGLE QUOTE char as .11
+
+ ---------------
+
+ 05/20/01 Increased the typo list,
+ added paranoid mode,
+ ANSIfied the code and added some casts
+ so the compiler wouldn't keep asking if I knew what I was doing,
+ fixed bug in l.s.d. condition (thanks, Dave!),
+ standardized spacing when echoing,
+ added letter-combo checking code to typo section,
+ added more h/b words to typo array.
+ Not too sure about putting letter combos outside of the TYPO conditions -
+ someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see.
+ Released as .12
+
+ ---------------
+
+ 06/01/01 Removed duplicate reporting of Tildes, asterisks, etc.
+ 06/10/01 Added flgets routine to help with platform-independent
+ detection of invalid line-ends. All PG text files should
+ have CR/LF (13/10) at end of line, regardless of system.
+ Gutcheck now validates this by default. (Thanks, Charles!)
+ Released as .13
+
+ ---------------
+
+ 06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.)
+ Released as .14
+
+ ---------------
+
+ 06/23/01 Fixed: 'No',he said. not being flagged.
+
+ Improved: better single-quotes checking:
+
+ Ignore singlequotes surrounded by alpha, like didn't. (was OK)
+
+ If a singlequote is at the END of a word AND the word ends in "s":
+ The dogs' tails wagged.
+ it's probably an apostrophe, but less commonly may be a closequote:
+ "These 'pack dogs' of yours look more like wolves."
+
+ If it's got punctuation before it and is followed by a space
+ or punctuation:
+ . . . was a problem,' he said
+ . . . was a problem,'"
+ it is probably (certainly?) a closequote. 
+ + If it's at start of paragraph, it's probably an openquote. + (but watch dialect) + + Words with ' at beginning and end are probably quoted: + "You have the word 'chivalry' frequently on your lips." + (Not specifically implemented) + V.18 I'm glad I didn't implement this, 'cos it jest ain't so + where the convention is to punctuate outside the quotes. + 'Come', he said, 'and join the party'. + + If it is followed by an alpha, and especially a capital: + 'Hello,' called he. + it is either an openquote or dialect. + + Dialect breaks ALL the rules: + A man's a man for a' that. + "Aye, but 'tis all in the pas' now." + "'Tis often the way," he said. + 'Ave a drink on me. + + This version looks to be an improvement, and produces + fewer false positives, but is still not perfect. The + 'pack dogs' case still fools it, and dialect is still + a problem. Oh, well, it's an improvement, and I have + a weighted structure in place for refining guesses at + closequotes. Maybe next time, I'll add a bit of logic + where if there is an open quote and one that was guessed + to be a possessive apostrophe after s, I'll re-guess it + to be a closequote. Let's see how this one flies, first. + + (Afterview: it's still crap. Needs much work, and a deeper insight.) + + Released as .15 + + TODO: More he/be checks. Can't be perfect - counterexamples: + I gave my son good advice: be married regardless of the world's opinion. + I gave my son good advice: he married regardless of the world's opinion. + + If by "primitive" be meant "crude", we can understand the sentence. + If by "primitive" he meant "crude", we can understand the sentence. + + No matter what be said, I must go on. + No matter what he said, I must go on. + + No value, however great, can be set upon them. + No value, however great, can he set upon them. 
+ + Real-Life one from a DP International Weekly Miscellany: + He wandered through the forest without fear, sleeping + much, for in sleep be had companionship--the Great + Spirit teaching him what he should know in dreams. + That one found by jeebies, and it turned out to be "he". + + + --------------- + + 07/01/01 Added -O option. + Improved singlequotes by reporting mismatched single quotes + only if an open_single_quotes was found. + + Released as .16 + + --------------- + + 08/27/01 Added -Y switch for Robert Rowe to allow his app to + catch the error output. + + Released as .17 + + --------------- + + 09/08/01 Added checking Capitals at start of paragraph, but not + checking them at start of sentence. + + TODO: Parse sentences out so can check reliably for start of + sentence. Need a whole different approach for that. + (Can't just rely on periods, since they are also + used for abbreviations, etc.) + + Added checking for all vowels or all consonants in a word. + + While I was in, I added "ii" checking and "tl" at start of word. + + Added echoing of first line of paragraph when reporting + mismatched quoted or brackets (thanks to David Widger for the + suggestion) + + Not querying L at start of a number (used for British pounds). + + The spelling changes are sort of half-done but released anyway + Skipped .18 because I had given out a couple of test versions + with that number. + + 09/25/01 Released as .19 + + --------------- + + TODO: + Use the logic from my new version of safewrap to stop querying + short lines like poems and TOCs. + Ignore non-standard ellipses like . . . or ... + + + --------------- + 10/01/01 Made any line over 80 a VERY long line (was 85). + Recognized openquotes on indented paragraphs as continuations + of the same speech. + Added "cf" to the okword list (how did I forget _that_?) and a few others. + Moved abbrev to okword and made it more general. 
+ Removed requirement that PG_space_emdash be greater than + ten before turning off warnings about spaced dashes. + Added period to list of characters that might constitute a separator line. + Now checking for double punctuation (Thanks, David!) + Now if two spaced em-dashes on a line, reports both. (DW) + Bug: Wasn't catching spaced punctuation at line-end since I + added flgets in version .13 - fixed. + Bug: Wasn't catching spaced singlequotes - fixed + Now reads punctuated numbers like 1,000 as a single word. + (Used to give "standalone 1" type queries) + Changed paranoid mode - not including s and p options. -ex is now quite usable. + Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed + Bug: Sometimes gave _next_ line number for queried word at end of line - fixed + + 10/22/01 Released as .20 + + --------------- + + Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!) + Reduced the number of hi-bit letters needed to stop reporting them + from 1/20 to 1/100 or 200 in total. + Added PG footer check. + Added the -h switch. + Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10 + Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23" + Added unspaced brackets check when surrounded by alpha. + Removed all typo reporting unless the typo switch is on. + Added gcisalpha to ease over-reporting of 8-bit queries. + ECHO_SWITCH is now ON by default! + PARANOID_SWITCH is now ON by default! + Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg) + Checking for standalone lowercase "l" + Checking for standalone lowercase "s" + Considering "is be" and "be is" "be was" "was be" as he/be errors + Looking at punct at end of para + + 01/20/02 Released as .21 + + --------------- + + Added VERBOSE_SWITCH to make it list everything. 
(George Davis) + + --------------- + + 02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have. + after which + This line caused a coredump on Solaris - fixed. + Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe + 03/09/02 Changed header recognition for another header change + Called it .24 + 03/29/02 Added qword[][] so I can suppress massive overreporting + of queried "words" like "FN", "Wm.", "th'", people's + initials, chemical formulae and suchlike in some texts. + Called it .25 + 04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed. + Added linecounts in overview mode. + Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done. + "m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that? + 07/07/02 Added GPL. + Added checking for broken em-dash at line-end (enddash) + Released as 0.95 + 08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo. + Released as 0.96 + 10/10/02 Suppressing some annoying multiple reports by default: + Standalone Ones, Asterisks, Square Brackets. + Digit 1 occurs often in many scientific texts. + Asterisk occurs often in multi-footnoted texts. + Mismatch Square Brackets occurs often in multi-para footnotes. + Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil. + . . . but it does more or less work for the main cases. + Removed uppercase within a word as a separate category so + that names like VanAllen get reported only once, like other + suspected typos. + 11/24/02 Fixed - -m switch wasn't looking at htmlnum in + loseentities (Thanks, Brett!) + Fixed bug which occasionally gave false warning of + paragraph starting with lowercase. + Added underscore as character not to query around doublequotes. + Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859" + . . . this is to help detect things like CP1252 characters. 
+ Released as 0.97 + + 12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell, + for doublequotes only. Replaces "Spaced quote", since it also covers that + case. + Added "warn_hyphen" to ease over-reporting of hyphens. + + 12/20/02 Added "extra period" checks. + Added single character line check + Added I" check - is usually an exclam + Released as 0.98 + + 1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile() + from when I was looking at ways to identify markup. Refuses to + open files for *nix users with upcase in the filemanes. Removed. + Fixed quickly and released as 0.981 + + 1/8/03 Added "arid" to the list of typos, slightly against my better + judgement, but the DP gang are all excited about it. :-) + Added a check for comma followed by capital letter, where + a period has OCRed into a comma. (DW). Not sure about this + either; we'll see. + Compiling for Win32 to allow longfilenames. + + 6/1/04 A messy test release for DW to include the "gutcheck.typ" + process. And the gutcheck.jee trials. Removed "arid" -- + it can go in gutcheck.typ + + Added checks for carats ^ and slants / but disabling slant + queries if more than 20 of them, because some people use them + for /italics/. Slants are commonly mistaken italic "I"s. + + Later: removed gutcheck.jee -- wrote jeebies instead. + +Random TODO: + Check brackets more closely, like quotes, so that it becomes + easy to find the error in long paragraphs full of brackets. + + + 11/4/04 Assorted cleanup. Fixed case where text started with an + unbalanced paragraph. + + 1/2/05 Has it really been that long? Added "nocomma", "noperiod" check. + Bits and pieces: improved isroman(). Added isletter(). + Other stuff I never noted before this. + + 7/3/05 Stuck in a quick start on DP-markup ignoring + at BillFlis's suggestion. + + 1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in? + Don't count footer for dotcomma etc. 
+ + +1 I +ail all +arc are +arid and +bad had +ball hall +band hand +bar her +bat but +be he +bead head +beads heads +bear hear +bit hit +bo be +boon been +borne home +bow how +bumbled humbled +car ear +carnage carriage +carne came +cast east +cat cut +cat eat +cheek check +clay day +coining coming +comer corner +die she +docs does +ease case +fail fall +fee he +haying having +ho he +ho who +hut but +is as +lie he +lime time +loth 10th +m in +modem modern +Ms his +ray away +ray my +ringer finger +ringers fingers +rioted noted +tho the +tie he +tie the +tier her +tight right +tile the +tiling thing +tip up +tram train +tune time +u " +wen well +yon you + +*********************************************************************/ + diff -r 218904410231 -r f600b0d1fc5d bookloupe/bookloupe.typ.in --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bookloupe/bookloupe.typ.in Fri Jan 27 10:30:16 2012 +0000 @@ -0,0 +1,54 @@ +11 +44 +ms +ail +alien +arc +arid +bar +bat +bo +borne +bow +bum +bumbled +carnage +carne +cither +coining +comer +cur +docs +eve +eves +gaming +gram +guru +hag +hare +haying +ho +lime +loth +m +modem +nave +ringer +ringers +riot +rioted +signer +snore +spam +tho +tier +tile +tiling +tram +tum +tune +u +vas +wag +wen +yon diff -r 218904410231 -r f600b0d1fc5d configure.ac --- a/configure.ac Fri Jan 27 00:28:11 2012 +0000 +++ b/configure.ac Fri Jan 27 10:30:16 2012 +0000 @@ -1,13 +1,13 @@ # -*- Autoconf -*- # Process this file with autoconf to produce a configure script. 
-AC_INIT([gutcheck],[1.50],[ali@juiblex.co.uk]) +AC_INIT([bookloupe],[1.50],[ali@juiblex.co.uk]) AC_PREREQ(2.59) AC_CONFIG_AUX_DIR([config]) -AC_CONFIG_SRCDIR([gutcheck/gutcheck.c]) +AC_CONFIG_SRCDIR([bookloupe/bookloupe.c]) AC_CONFIG_FILES([Makefile -gclib/Makefile -gutcheck/Makefile +bl/Makefile +bookloupe/Makefile test/Makefile test/harness/Makefile test/compatibility/Makefile diff -r 218904410231 -r f600b0d1fc5d doc/Makefile.am --- a/doc/Makefile.am Fri Jan 27 00:28:11 2012 +0000 +++ b/doc/Makefile.am Fri Jan 27 10:30:16 2012 +0000 @@ -1,3 +1,3 @@ -dist_pkgdata_DATA=gutcheck.txt gc-test.txt +dist_pkgdata_DATA=bookloupe.txt loupe-test.txt EXTRA_DIST=README-0.99 diff -r 218904410231 -r f600b0d1fc5d doc/bookloupe.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/bookloupe.txt Fri Jan 27 10:30:16 2012 +0000 @@ -0,0 +1,742 @@ + + + Gutcheck documentation + + +gutcheck: lists possible common formatting errors in a Project +Gutenberg candidate file. It is a command line program and can be used +under Win32 or Unix (gutcheck.c should compile anywhere; if it doesn't, +tell me). For Windows-only people, there is an appendix at the end +with brief instructions for running it. + + +Current version: 0.99. Users of 0.98 see end of file for changes. + +You should also have received the licence file COPYING, a README file, +gutcheck.c, the source code, and gutcheck.exe, a DOS executable, with +this file. + +This software is Copyright Jim Tinsley 2000-2005. + +Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING. +This is Free Software; you may redistribute it under certain conditions (GPL). + +See http://gutcheck.sourceforge.net for the latest version. 
+ + +Usage is: gutcheck [-setopxlywm] filename + where: + -s checks Single quotes + -e switches off Echoing of lines + -t checks Typos + -o produces an Overview only + -p sets strict quotes checking for Paragraphs + -x (paranoid) switches OFF typo checking and extra checks + -l turns off Line-end checks + -y sets error messages to stdout + -w is a special mode for web uploads (for future use) + -v (verbose) forces individual reporting of minor problems + -m interprets Markup of some common HTML tags and entities + -u warns about words in a user-defined typo file gutcheck.typ + -d ignores some DP-specific markup + +Running gutcheck without any parameters will display a brief help message. + +Sample usage: + + gutcheck warpeace.txt + + +More detail: + + Echoing lines (-e to switch off) + + You may find it convenient, when reviewing Gutcheck's + suggestions, to see the line that Gutcheck is questioning. + That way, you can often see at a glance whether it is + a real error that needs to be fixed, or a false positive + that should be in the text, but Gutcheck's limited + programming doesn't understand. + + By default, gutcheck echoes these lines, but if you don't + want to see the lines referred to, -e will switch it OFF. + + + Quotes (-s and -p switches) + + Gutcheck always looks for unbalanced doublequotes in a + paragraph. It is a common convention for writers not to + close quotes in a paragraph if the next paragraph opens + with quotes and is a continuation by the same speaker. + + Gutcheck therefore does not normally report unclosed quotes + if the next paragraph begins with a quote. If you need + to see all unclosed quotes, even where the next paragraph + begins with a quote, you should use the -p switch. + + Singlequotes (') are a problem, since the same character + is used for an apostrophe. 
I'm not sure that it is + possible to get 100% accuracy on singlequotes checking, + particularly since dialect, quite common in PG texts, + upsets the normal rules so badly. Consider the sentence: + 'Tis often said that a man's a man for a' that. + As humans, we recognize that both apostrophes are used + for contractions rather than quotes, but it isn't easy + to get a program to recognize that. + + Since Gutcheck makes too many mistakes when trying to match + singlequotes, it doesn't look for unbalanced singlequotes + unless you specify the -s switch. + + Consider these sentences, which illustrate the main cases: + + 'Tis often said that a fool and his money are soon parted. + + 'Becky's goin' home,' said Tom. + + The dogs' tails wagged in unison. + + Those 'pack dogs' of yours look more like wolves. + + + + Typos (-t switch) + + It's not Gutcheck's job to be a spelling checker, but it + does check for a list of common typos and OCR errors if you + use the -t switch. (The -x switch also turns typo checking on.) + + It also checks for character combinations, especially involving + h and b, which are often confused by OCR, that rarely or never + occur. For example, it queries "tbe" in a word. Now, "the" often + occurs, but "tbe" is very rare (heartbeat, hotbed), so I'm + playing the odds - a few false positives for many errors found. + Similarly with "ii", which is a very common OCR error. + + Gutcheck suppresses multiple reporting of the first 40 "typos" + found. This is to remove the annoyance of seeing something like + "FN" (footnote) or "LK" (initials) flagged as a typo 147 times + in a text. + + + Line-end checking (-l switch to disable) + + All PG texts should have a Carriage Return (CR - character 13) + and a Line Feed (LF - character 10) at end of each line, + regardless of what O/S you made them on. DOS/Windows, Unix + and Mac have different conventions, but the final text should + always use a CR/LF pair as its line terminator. 
+ + By default, Gutcheck verifies that every line does have + the correct terminator, but if you're on a work-in-progress + in Linux, you might want to convert the line-ends as a final + step, and not want to see thousands of errors every time you + run Gutcheck before that final step, so you can turn off + this checking with the -l switch. + + + Paranoid mode (-x switch to disable: Trust No One :-) + + -x switches OFF typo-checking, the -t flag, automatically + and some extra checks like standalone 1 and 0 queries. + + + Overview mode (-o switch) + + This mode just gives a count of queries found + instead of a detailed list. + + + Header quote (-h switch) + + If you use the -h switch, gutcheck will also display + the Title, Author, Release and Edition fields from the + PG header. This is useful mostly for the automated + checks we do on recently-posted texts. + + + Errors to stdout (-y switch) + + If you're just running gutcheck normally, you can ignore + this. It's only there for programs that provide a front + end to gutcheck. It makes error messages appear within + the output of gutcheck so that the front end knows whether + gutcheck ran OK. + + + Verbose reporting (-v switch) + + Normally, if gutcheck sees lots of long lines, short lines, + spaced dashes, non-ASCII characters or dot-commas ".," it + assumes these are features of the text, counts and summarizes + them at the top of its report, but does not list them + individually. If the -v switch is on, gutcheck will list them all. + + + Markup interpretation (-m switch) + + Normally, gutcheck flags anything it suspects of being HTML + markup as a possible error. When you use the -m switch, + however, it matches anything that looks like markup against + a short list of common HTML tags and entities. If the markup + is in that list, it either ignores the markup, in the case + of a tag, or "interprets" the markup as its nearest ASCII + equivalent, in the case of an entity. 
So, for example, using + this switch, gutcheck will "see" + + &ldquo;He went thataway!&rdquo; + + as + + "He went thataway!" + + and report accordingly. + + This switch does not, not, NOT check the validity of HTML; + it exists so that you can run gutcheck on most HTML texts + for PG, and get sane results. It does not support all tags. + It does not support all entities. When it sees a tag or entity + it does not recognize, it will query it as HTML just as if + you hadn't specified the -m switch. + + Gutcheck 0.99 will automatically switch on markup interpretation + if it sees a lot of tags that appear to be markup, so mostly, you + won't have to specify this. + + User-defined typos (-u switch) + + If you have a file named gutcheck.typ either in your current + working directory or in the directory from which you explicitly + invoked gutcheck, but not necessarily on your path, and if you + specify the -u switch, gutcheck will query any word specified + in that file. The file is simple: one word, in lower case, per + line. 999 lines are allowed for. Be careful not to put multiple + words onto a line, or leave any rubbish other than the word on + the line. You should have received a sample file gutcheck.typ + with this package. + + Ignore DP markup (-d switch) + + Distributed Proofreaders (http://www.pgdp.net) is currently + (2005) the main source of PG texts, and proofers there use + special conventions. This switch understands those conventions, + so that people can use gutcheck on files in process that still + haven't had the special conventions removed yet. The special + conventions supported in 0.99 are page-separators and + "<i>", "</i>", "/*", "*/", "/#", "#/", "/$", "$/". + + +You will probably only run gutcheck on a text once or maybe twice, +just prior to uploading; it usually finds a few formatting problems; +it also usually finds queries that aren't problems at all - it often +questions Tables of Contents for having short lines, for example. 
+These are called "false positives", and need a human to decide on +them. + +The text should be standard prose, and already close to PG normal +format (plain text, about 70 characters per line with blank lines +between paragraphs). + +Gutcheck merely draws your attention to things that might be errors. +It is NOT a substitute for human judgement. Formatting choices like +short lines may be for a reason that this program can't understand. + +Even the most careful human proofing can leave errors behind in a +text, and there are several automated checks you can do to help find +them. Of these, spellchecking (with _very_ careful human judgement) is +the most important and most useful. + +Gutcheck does perform some basic typo-checking if you ask it to, +but its focus is on formatting errors specific to PG texts - +mismatched quotes, non-ASCII characters, bad spacing, bad line +length, HTML tags perhaps left from a conversion, unbalanced +brackets. + +Suggestions for additional checks would be appreciated and duly +considered, but no guarantees that they will be implemented. + + + + + How do _I_ use it? + +Practically everyone I give gutcheck to asks me how _I_ use it. +Well, when I get a text for posting, say filename.txt, I run + + gutcheck -o filename.txt + +That gives me a quick idea what I'm dealing with. It'll tell +me what kind of problems gutcheck sees, and give me an idea +of how much more work needs to be done on the text. Keep in +mind that gutcheck doesn't do anything like a full spellcheck, +but when I see a text that has a lot of problems, I assume that +it probably needs a spellcheck too. + +Having got a feel for the ballpark, I run + + gutcheck filename.txt > jj + +where jj is my personal, all-purpose filename for temporary data +that doesn't need to be kept. Then I open filename.txt and jj in +a split-screen view in my editor, and work down the text, fixing +whatever needs fixing, and skipping whatever doesn't. 
If your +editor doesn't split-screen, you can get much the same effect by +opening your original file in your normal editor, and jj (or your +equivalent name) in something like Notepad, keeping both in view +at the same time. + +Twice a day, an automatic process looks at all recently-posted +texts, and emails Michael, me, and sometimes other people with +their gutcheck summaries. + + + + Future development of gutcheck + +Gutcheck has gone about as far as it can, given its current +structure. In order to add better singlequotes checking, +sentence checking, better he/be checking and other good stuff +that I'd like to see, I'll have to rewrite it from a different +angle - looking at the syntax instead of the lines. And I'll +probably get around to that sooner or later. + +Meantime, I'm just trying to get this version stabilized, so +please report any bugs you find. When it is stable, I'll run +up a Windows port for those timid souls who can't look a +command line in the eye. :-) + +And I've started work on gutspell, a companion to gutcheck +which will concentrate on spelling problems. PG spelling +problems are unusual, since the range of texts we cover is +so wide, and I'll be taking a somewhat unorthodox approach +to writing this spelling-checker _specifically_ for texts +containing a lot of dialect and uncommon words that have +probably already been spell-checked against a standard +modern dictionary. + + + + +Explanations of common gutcheck messages: + + --> 74 lines in this file have white space at end + + PG texts shouldn't have extra white space added at end of line. + Don't worry too much about this; they're not doing any harm, + and they'll be removed during posting anyway. + + + --> 348 lines in this file are short. Not reporting short lines. + --> 84 lines in this file are long. Not reporting long lines. + --> 8 lines in this file are VERY long! + + If there are a lot of long or short lines, Gutcheck won't list + them individually. 
The short lines version of this message + is commonly seen when gutchecking poetry and some plays, where + the normal line length is shorter than the standard for prose. + A "VERY long" line is one over 80 characters. You normally + shouldn't have any of these, but sometimes you may have to render + a table that must be that long, or some special preformatted + quotation that can't be broken. + + + --> There are 75 spaced dashes and em-dashes in this file. Not reporting them. + + The PG standard for an emdash--like these--is two minus signs + with no spaces before or after them. However, some older texts + used spaced dashes - like these -- and if there are very many + such spaced dashes in the file, gutcheck just draws your + attention to it and doesn't list them individually. + + + + Line 3020 - Non-ASCII character 233 + + Standard PG texts should use only ASCII characters with values + up to 127; however, non-English, accented characters can be + represented according to several different non-ASCII encoding + schemes, using values over 127. If you have a plain English text + with a few accented characters in words like cafe or tete-a-tete, + you should replace the accented characters with their unaccented + versions. The English pound sign is another commonly-seen + non-ASCII character. If you have enough non-ASCII characters in + your text that you feel removing them would degrade your text + unacceptably, you should probably consider doing an 8-bit text + as well as a plain-ASCII version. + + + + Line 1207 - Non-ISO-8859 character 156 + + Even in "8-bit" texts, there are distinctions between code sets. + The ISO-8859 family of 8-bit code sets is the most commonly used + in PG, and these sets do not define values in the range 128 through + 159 as printable characters. 
It's quite common for someone on a + Windows or Mac machine to use a non-ISO character inadvertently, + so this message warns that the character is not only not ASCII, + but also outside the ISO-8859 range. + + + + Line 46 - Tab character? + + Some editors and WPs will put in Tab characters (character 9) to + indicate indented text. You should not use these in a PG text, + because you can't be sure how they will appear on a reader's + screen. Find the Tab, and replace it with the appropriate number + of spaces. + + + Line 1327 - Tilde character? + + The tilde character (~) might be legitimately used, but it's the + character commonly used by OCR software to indicate a place where + it couldn't make out the letter, so gutcheck flags it. + + + + Line 1347 - Asterisk? + + Asterisks are reported only in paranoid mode (see -x). + Like tildes, they are often used to indicate errors, but they are + also legitimately used as line delimiters and footnote markers. + + + + Line 1451 - Long line 129 + + PG texts should have lines shorter than 76. There may be occasions + where you decide that you really have to go out to 79 characters, + but the sample above says that line 1451 is 129 characters long - + probably two lines run together. + + + + Line 1590 - Short line? + + PG texts should have lines longer than 54 characters. However, + there are special cases like poetry and tables of contents where + the lines _should_ be shorter. So treat Gutcheck warnings about + short lines carefully. Sometimes it's a genuine formatting + problem; sometimes the line really needs to be short. + + Hint: gutcheck will not flag lines as short if they are indented + - if they start with a space. I like to start inserted stanzas + and other such items indented with a couple of spaces so that + they stand out from the main text anyway. + + + + Line 1804 - Begins with punctuation? + + Lines should normally not begin with commas, periods and so on. + An exception is ellipses . . . 
which can happen at start of line. + + + + Line 1850 - Spaced em-dash? + + The PG standard for an em-dash--like these--is two minus signs + with no spaces before or after them. Gutcheck flags non-PG + em-dashes - like this one. Normally, you will replace it with a + PG-standard em-dash. + + + + Line 1904 - Query he/be error? + + Gutcheck makes a very minor effort to look for that scourge of all + proofreaders, "be" replacing "he" or vice-versa, and draws your + attention to it when it thinks it has found one. + + + + Line 2017 - Query digit in a1most + + The digit 1 is commonly OCRed for the letter l, the digit 0 for + the letter O, and so on. When gutcheck sees a mix of digits and + letters, it warns you. It may generate a false positive for + something like 7am. + + + + Line 2083 - Query standalone 0 + + In paranoid mode (see -x) only, gutcheck warns about the digit 0 + and the number 1 standing alone as a word. This can happen if the + OCR misreads the words O or I. + + + + Line 2115 - Query word whetber + + If you have switched typo-checking on, gutcheck looks for + potential typos, especially common h/b errors. It's not + infallible; it sometimes queries legit words, but it's + always worth taking a look. + + + + Line 2190 column 14 - Missing space? + + Omitting a space is a very common error,especially coming from + OCRed text,and can be hard for a human to spot. The commas in + the previous sentence illustrate the kind of thing I mean. + + + + Line 2240 column 48 - Spaced punctuation? + + The flip side of the "missing space" error , here , is when extra + spaces are added before punctuation . Some old texts appear to add + extra spaces around punctuation consistently, but this was a + typographical convention rather than the author's intent, and the + extra "spaces" should be removed when preparing a PG text. + + + + Line 2301 column 19 - Unspaced quotes? + + Another common spacing problem occurs in a phrase like "You wait + there,"he said. 
+ + + + Line 2385 column 27 - Wrongspaced quotes? + + As of version 0.98, gutcheck adds extra checks on whether a quote + seems to be a start or end quote, and queries those that appear to + be misplaced. This does give rise to false positives when quotes are + nested, for example: + + "And how," she asked, "will your "friends" help you now?" + + but these false positives are worth it because of the many cases + that this test catches, notably those like: + + "And how, "she said," will your friends help you now?" + + Sometimes a "wrongspaced quotes" query will arise because an earlier + quote in the paragraph was omitted, so if the place specified seems + to be OK, look back to see whether there's a problem in the preceding + lines. + + + + Line 2400 - HTML Tag?
+
+    Some PG texts have been converted from HTML, and not all of the
+    HTML tags have been removed.
+
+
+
+    Line 2402 - HTML symbol? &emdash;
+
+    Similarly, special HTML symbol characters can survive into PG
+    texts. Can occasionally produce amusing false positives like
+    . . . Marwick & Co were well known for it;
+
+
+
+    Line 2540 - Mismatched quotes
+
+    Another gutcheck mainstay - unclosed doublequotes in a paragraph.
+    See the discussion of quotes in the switches section near the
+    start of this file.
+    
+    Since the mismatch doesn't occur on any one line, gutcheck quotes
+    the line number of the first blank line following the paragraph,
+    since this is the point where it reconciles the count of quotes.
+    However, if gutcheck is echoing lines, that is, you haven't used
+    the -e switch, it will show the _first_ line of the paragraph, 
+    to help you find the place without using line numbers. The 
+    offending paragraph is therefore between the quoted line and 
+    the line number given.
+
+
+
+    Line 2587 - Mismatched single quotes
+
+    Only checked with the -s switch, since checking single quotes is 
+    not a very reliable process. Otherwise, the same logic as for 
+    doublequotes applies.
+
+
+
+    Line 2877 - Mismatched round brackets?
+
+    Also curly and square brackets. Texts with a lot of brackets, like
+    plays with bracketed stage instructions, may have mismatches.
+
+
+    Line 3150 - No CR?
+    Line 3204 - Two successive CRs?
+    Line 3281 position 75 - CR without LF?
+
+    These are the invalid line-end warnings. See the discussion of
+    line-end checking in the switches section near the start of this
+    file. If you see these, and your editor doesn't show anything
+    wrong, you should probably try deleting the characters just before
+    and after the line end, and the line-end itself, then retyping the
+    characters and the line-end.
+
+
+    Line 2940 - Paragraph starts with lower-case
+
+    A common error in an e-text is for an extra blank line
+
+    to be put in, like the blank line above, and this often
+    shows up as a new paragraph beginning with lower case.
+    Sometimes the blank line is deliberate, as when a 
+    quotation is inserted in a speech. Use your judgement.
+
+
+    Line 2987 - Extra period?
+
+    An extra period. is a. common problem in OCRed text. and usually
+    arises when a speck of dust on the page is mistaken for a period.
+    or. as occasionally happens. when a comma loses its tail.
+
+
+    Line 3012 column 12 - Double punctuation?
+
+    Double punctuation., like that,, is a common typo and
+    scanno. Some books have much legit double punctuation,
+    like etc., etc., but it's worth checking anyway.
+
+
+
+            *       *       *        *
+
+For Windows-only users who are unfamiliar with DOS:
+
+    If you're a Windows-only user, you need to save
+    gutcheck.exe into the folder (directory) where the
+    text file you want to check is. Let's say your
+    text file is in C:\GUT, then you should save
+    GUTCHECK.EXE into C:\GUT.
+
+    Now get to a DOS prompt. You can do this by
+    selecting the "Command Prompt" or "MS-DOS Prompt"
+    option that will be somewhere on your
+    Start/Programs menu.
+
+    Now get into the C:\GUT directory. 
+    You can do this using the CD (change directory) 
+    command, like this:
+        CD \GUT
+    and your prompt will change to 
+        C:\GUT>
+    so you know you're in the right place.
+
+    Now type
+        gutcheck yourfile.txt
+    and you'll see gutcheck's report.
+
+    By default, gutcheck prints its queries to screen.
+    If you want to create a file of them, to edit
+    against the text, you can use the greater-than
+    sign (>) to tell it to output the report to a
+    file. For example, if you want its report in a
+    file called QUERIES.LST, you could type
+    
+        gutcheck yourfile.txt > queries.lst
+
+    The queries.lst file will then contain the listing
+    of possible formatting errors, and you can
+    edit it alongside your text.
+
+    Whatever you do, DON'T make the filename after
+    the greater-than sign the name of a file already
+    on your disk that you want to keep, because
+    the greater-than sign will cause gutcheck to
+    replace any existing file of that name.
+
+    So, for example, if you have two Tolstoy files
+    that you want to check, called WARPEACE.TXT and 
+    ANNAK.TXT, make sure that neither of these names
+    is ever used following the greater-than sign.
+    To check these correctly, you might do:
+
+    gutcheck warpeace.txt >war.lst
+
+    and
+
+    gutcheck annak.txt > annak.lst
+
+    separately. Then you can look at war.lst and annak.lst
+    to see the gutcheck reports.
+
+            *       *       *        *
+
+
+For existing 0.98 users upgrading to 0.99:
+
+    If you run on old 16-bit DOS or Windows 3.x, I'm afraid
+    you're out of luck. I'm not saying it _can't_ be compiled
+    to run on 16-bit, but the executable with the package is
+    for Win32 only. *nix users won't notice the change at all.
+
+
+    There are two new switches: -u and -d. 
+          See above for full rundown.
+
+
+Here's a list of the new errors:
+
+    Line 1456 - Carat character?
+
+    I^ve found a few.
+
+
+    Line 1821 - Forward slash?
+
+    Common error for italicized "I", or so /'ve found.
+
+
+    Line 2139 - Query missing paragraph break?
+
+    "Come here, son." "Do I _have_ to go, dad?"
+    Like that. False positives in some texts. Sorry 'bout that,
+    but these are often errors.
+
+
+    Line 2200 - Query had/bad error?
+
+    Clear enough. Doesn't catch as many as I'd like it to,
+    but rarely gives false alarms.
+
+
+    Line 2268 - Query punctuation after the?
+
+    Some words, like "the", very rarely have punctuation
+    following them. Others, like "Mrs", usually have a
+    period, but never a comma. Occasional false positives.
+
+
+    Line 2380 - Query possible scanno arid
+
+    It found one of your user-defined typos when you
+    used the -u switch.
+
+
+    Line 2511 - Capital "S"?
+
+    Surprisingly common specific case, like: Jane'S 
+
+    
+    Line 3469 - endquote missing punctuation?
+
+    OK. This one can really cause a lot of false positives
+    in some books, but it switches itself off if it finds
+    more than 20 in a text, unless you force it to list them
+    all with the -v switch.
+    "Hey, dad" Johnny said, "can we go now?"
+    is a common punctuation-missing error.
+
+
+    Line 4266 - Mismatched underscores?
+
+    Like mismatched anything else!
+
+
diff -r 218904410231 -r f600b0d1fc5d doc/gc-test.txt
--- a/doc/gc-test.txt	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,64 +0,0 @@
-                            gutcheck test framework
-                            =======================
-
-Running existing testcases
---------------------------
-
-The test harness (the program that runs a test) is called gc-test. The various
-testcases are stored in multiple text files, typically with a .tst extension.
-
-To run a testcase when all of gutcheck, gc-test and the testcase file are
-in the current directory simply do something like:
-
-% gc-test missing-space.tst
-
-from a command prompt. Under MS-Windows, this is called a command window and
-the prompt will normally look slightly different, eg.,
-
-C:\DP> gc-test missing-space.tst
-
-To run all the tests in the current directory, do something like this:
-
-% gc-test *.tst
-
-If gutcheck is not in the current directory, then you can set an environment
-variable (GUTCHECK) to point at it. For example, on MS-Windows you might do:
-
-C:\DP> set GUTCHECK=C:\GUTCHECK\GUTCHECK.EXE
-C:\DP> gc-test *.tst
-
-Writing your own testcases
---------------------------
-
-Writing a new testcase is pretty painless. Most testcases follow this simple
-pattern:
-
-		┌──────────────────────────────────────────┐
-		│**************** INPUT ****************   │
-		│"Look!John, over there!"                  │
-		│**************** EXPECTED ****************│
-		│                                          │
-		│"Look!John, over there!"                  │
-		│    Line 1 column 6 - Missing space?      │
-		└──────────────────────────────────────────┘
-
-The sixteen asterisks in this example form what is known as the "flag". This
-flag must come before and after all tags (eg., INPUT and EXPECTED). In the
-unlikely event that you need sixteen asterisks at the start of line of text,
-then simply choose a different flag and use it throughout the file (flags
-can be any sequence of ASCII characters except control codes and space).
-
-Note that the header that gutcheck normally outputs is not included in the
-expected output. This avoids problems with not knowing beforehand the name
-of the file that gutcheck will be asked to look at (and saves typing!).
-gutcheck prints a blank line before each warning. These are not part of the
-header and so do need to be included.
-
-To test that gutcheck produces no output, you still need to include
-an EXPECTED tag, just with no text following it. If there is no EXPECTED
-tag, then gc-test will consider that no expectation exists and won't check
-the output at all.
-
-There is no support yet for non-ASCII testcases, embedded linefeeds,
-passing command line options to gutcheck or for testcases which are
-expected to fail.
diff -r 218904410231 -r f600b0d1fc5d doc/gutcheck.txt
--- a/doc/gutcheck.txt	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,742 +0,0 @@
-
-
-                            Gutcheck documentation
-
-
-gutcheck:  lists possible common formatting errors in a Project
-Gutenberg candidate file. It is a command line program and can be used
-under Win32 or Unix (gutcheck.c should compile anywhere; if it doesn't,
-tell me). For Windows-only people, there is an appendix at the end
-with brief instructions for running it.
-
-
-Current version: 0.99. Users of 0.98 see end of file for changes.
-
-You should also have received the licence file COPYING, a README file, 
-gutcheck.c, the source code, and gutcheck.exe, a DOS executable, with
-this file.
-
-This software is Copyright Jim Tinsley 2000-2005.
-
-Gutcheck comes wih ABSOLUTELY NO WARRANTY. For details, read the file COPYING.
-This is Free Software; you may redistribute it under certain conditions (GPL).
-
-See http://gutcheck.sourceforge.net for the latest version.
-
-
-Usage is: gutcheck [-setopxlywm] filename
-      where:
-      -s checks Single quotes 
-      -e switches off Echoing of lines 
-      -t checks Typos
-      -o produces an Overview only
-      -p sets strict quotes checking for Paragraphs
-      -x (paranoid) switches OFF typo checking and extra checks
-      -l turns off Line-end checks
-      -y sets error messages to stdout
-      -w is a special mode for web uploads (for future use)
-      -v (verbose) forces individual reporting of minor problems
-      -m interprets Markup of some common HTML tags and entities    
-      -u warns about words in a user-defined typo file gutcheck.typ 
-      -d ignores some DP-specific markup
-
-Running gutcheck without any parameters will display a brief help message.
-
-Sample usage: 
-
-    gutcheck warpeace.txt
-
-
-More detail:
-
-    Echoing lines (-e to switch off)
-
-      You may find it convenient, when reviewing Gutcheck's 
-      suggestions, to see the line that Gutcheck is questioning.
-      That way, you can often see at a glance whether it is
-      a real error that needs to be fixed, or a false positive
-      that should be in the text, but Gutcheck's limited
-      programming doesn't understand.
-
-      By default, gutcheck echoes these lines, but if you don't 
-      want to see the lines referred to, -e will switch it OFF.
-
-
-    Quotes (-s and -p switches)
-
-      Gutcheck always looks for unbalanced doublequotes in a 
-      paragraph. It is a common convention for writers not to
-      close quotes in a paragraph if the next paragraph opens
-      with quotes and is a continuation by the same speaker.
-
-      Gutcheck therefore does not normally report unclosed quotes 
-      if the next paragraph begins with a quote. If you need
-      to see all unclosed quotes, even where the next paragraph
-      begins with a quote, you should use the -p switch.
-
-      Singlequotes (') are a problem, since the same character
-      is used for an apostrophe. I'm not sure that it is 
-      possible to get 100% accuracy on singlequotes checking,
-      particularly since dialect, quite common in PG texts,
-      upsets the normal rules so badly. Consider the sentence:
-        'Tis often said that a man's a man for a' that.
-      As humans, we recognize that both apostrophes are used
-      for contractions rather than quotes, but it isn't easy 
-      to get a program to recognize that.
-
-      Since Gutcheck makes too many mistakes when trying to match
-      singlequotes, it doesn't look for unbalanced singlequotes
-      unless you specify the -s switch.
-
-      Consider these sentences, which illustrate the main cases:
-
-        'Tis often said that a fool and his money are soon parted.
-
-        'Becky's goin' home,' said Tom.
-
-        The dogs' tails wagged in unison.
-
-        Those 'pack dogs' of yours look more like wolves.
-
-
-
-    Typos (-t switch)
-
-      It's not Gutcheck's job to be a spelling checker, but it
-      does check for a list of common typos and OCR errors if you
-      use the -t switch. (The -x switch also turns typo checking on.)
-
-      It also checks for character combinations, especially involving
-      h and b, which are often confused by OCR, that rarely or never
-      occur. For example, it queries "tbe" in a word. Now, "the" often
-      occurs, but "tbe" is very rare (heartbeat, hotbed), so I'm
-      playing the odds - a few false positives for many errors found.
-      Similarly with "ii", which is a very common OCR error.
-
-      Gutcheck suppresses multiple reporting of the first 40 "typos"
-      found. This is to remove the annoyance of seeing something like
-      "FN" (footnote) or "LK" (initials) flagged as a typo 147 times
-      in a text. 
-
-
-    Line-end checking (-l switch to disable)
-
-      All PG texts should have a Carriage Return (CR - character 13)
-      and a Line Feed (LF - character 10) at end of each line,
-      regardless of what O/S you made them on. DOS/Windows, Unix
-      and Mac have different conventions, but the final text should
-      always use a CR/LF pair as its line terminator.
-
-      By default, Gutcheck verifies that every line does have
-      the correct terminator, but if you're on a work-in-progress
-      in Linux, you might want to convert the line-ends as a final
-      step, and not want to see thousands of errors every time you
-      run Gutcheck before that final step, so you can turn off 
-      this checking with the -l switch.
-
-
-    Paranoid mode (-x switch to disable: Trust No One :-)
-
-      -x switches OFF typo-checking, the -t flag, automatically
-      and some extra checks like standalone 1 and 0 queries.
-
-
-    Overview mode (-o switch)
-
-       This mode just gives a count of queries found
-       instead of a detailed list.
-
-
-    Header quote  (-h switch)
-
-       If you use the -h switch, gutcheck will also display
-       the Title, Author, Release and Edition fields from the
-       PG header. This is useful mostly for the automated
-       checks we do on recently-posted texts.
-
-
-    Errors to stdout (-y switch)
-
-       If you're just running gutcheck normally, you can ignore
-       this. It's only there for programs that provide a front
-       end to gutcheck. It makes error messages appear within
-       the output of gutcheck so that the front end knows whether
-       gutcheck ran OK.
-
-
-    Verbose reporting (-v switch)
-
-       Normally, if gutcheck sees lots of long lines, short lines,
-       spaced dashes, non-ASCII characters or dot-commas ".," it
-       assumes these are features of the text, counts and summarizes
-       them at the top of its report, but does not list them 
-       individually. If the -v switch is on, gutcheck will list them all.
-
-
-    Markup interpretation (-m switch)
-
-       Normally, gutcheck flags anything it suspects of being HTML
-       markup as a possible error. When you use the -m switch,
-       however, it matches anything that looks like markup against
-       a short list of common HTML tags and entities. If the markup
-       is in that list, it either ignores the markup, in the case
-       of a tag, or "interprets" the markup as its nearest ASCII 
-       equivalent, in the case of an entity. So, for example, using
-       this switch, gutcheck will "see"
-
-       &ldquo;He went thataway!&rdquo;
-
-       as
-
-       "He went thataway!"
-
-       and report accordingly.
-
-       This switch does not, not, NOT check the validity of HTML;
-       it exists so that you can run gutcheck on most HTML texts
-       for PG, and get sane results. It does not support all tags.
-       It does not support all entities. When it sees a tag or entity
-       it does not recognize, it will query it as HTML just as if
-       you hadn't specified the -m switch.
-
-       Gutcheck 0.99 will automatically switch on markup interpretation
-       if it sees a lot of tags that appear to be markup, so mostly, you
-       won't have to specify this.
-
-    User-defined typos (-u switch)
-
-        If you have a file named gutcheck.typ either in your current
-        working directory or in the directory from which you explicitly
-        invoked gutcheck, but not necessarily on your path, and if you
-        specify the -u switch, gutcheck will query any word specified 
-        in that file. The file is simple: one word, in lower case, per
-        line. 999 lines are allowed for. Be careful not to put multiple
-        words onto a line, or leave any rubbish other than the word on
-        the line. You should have received a sample file gutcheck.typ
-        with this package.
-
-    Ignore DP markup (-d switch)
-        
-        Distributed Proofreaders (http://www.pgdp.net) is currently
-        (2005) the main source of PG texts, and proofers there use
-        special conventions. This switch understands those conventions,
-        so that people can use gutcheck on files in process that still
-        haven't had the special conventions removed yet. The special
-        conventions supported in 0.99 are page-separators and
-        "<i>", "</i>", "/*", "*/", "/#", "#/", "/$", "$/".
-
-
-You will probably only run gutcheck on a text once or maybe twice,
-just prior to uploading; it usually finds a few formatting problems;
-it also usually finds queries that aren't problems at all - it often
-questions Tables of Contents for having short lines, for example.
-These are called "false positives", and need a human to decide on
-them.
-
-The text should be standard prose, and already close to PG normal
-format (plain text, about 70 characters per line with blank lines
-between paragraphs).
-
-Gutcheck merely draws your attention to things that might be errors.
-It is NOT a substitute for human judgement. Formatting choices like
-short lines may be for a reason that this program can't understand.
-
-Even the most careful human proofing can leave errors behind in a
-text, and there are several automated checks you can do to help find
-them. Of these, spellchecking (with _very_ careful human judgement) is
-the most important and most useful.
-
-Gutcheck does perform some basic typo-checking if you ask it to,
-but its focus is on formatting errors specific to PG texts - 
-mismatched quotes, non-ASCII characters, bad spacing, bad line
-length, HTML tags perhaps left from a conversion, unbalanced
-brackets.
-
-Suggestions for additional checks would be appreciated and duly 
-considered, but no guarantees that they will be implemented.
-
-
-
-
-                How do _I_ use it?
-
-Practically everyone I give gutcheck to asks me how _I_ use it.
-Well, when I get a text for posting, say filename.txt, I run
-
-    gutcheck -o filename.txt
-
-That gives me a quick idea what I'm dealing with. It'll tell
-me what kind of problems gutcheck sees, and give me an idea 
-of how much more work needs to be done on the text. Keep in 
-mind that gutcheck doesn't do anything like a full spellcheck,
-but when I see a text that has a lot of problems, I assume that
-it probably needs a spellcheck too.
-
-Having got a feel for the ballpark, I run
-
-    gutcheck filename.txt > jj
-
-where jj is my personal, all-purpose filename for temporary data
-that doesn't need to be kept. Then I open filename.txt and jj in
-a split-screen view in my editor, and work down the text, fixing
-whatever needs fixing, and skipping whatever doesn't. If your 
-editor doesn't split-screen, you can get much the same effect by 
-opening your original file in your normal editor, and jj (or your
-equivalent name) in something like Notepad, keeping both in view 
-at the same time.
-
-Twice a day, an automatic process looks at all recently-posted
-texts, and emails Michael, me, and sometimes other people with
-their gutcheck summaries.
-
-
-
-        Future development of gutcheck
-
-Gutcheck has gone about as far as it can, given its current
-structure. In order to add better singlequotes checking,
-sentence checking, better he/be checking and other good stuff
-that I'd like to see, I'll have to rewrite it from a different
-angle - looking at the syntax instead of the lines. And I'll
-probably get around to that sooner or later.
-
-Meantime, I'm just trying to get this version stabilized, so
-please report any bugs you find. When it is stable, I'll run
-up a Windows port for those timid souls who can't look a 
-command line in the eye. :-)
-
-And I've started work on gutspell, a companion to gutcheck
-which will concentrate on spelling problems. PG spelling
-problems are unusual, since the range of texts we cover is
-so wide, and I'll be taking a somewhat unorthodox approach
-to writing this spelling-checker _specifically_ for texts
-containing a lot of dialect and uncommon words that have
-probably already been spell-checked against a standard
-modern dictionary.
-
-
-
-
-Explanations of common gutcheck messages:
-
-    --> 74 lines in this file have white space at end
-
-    PG texts shouldn't have extra white space added at end of line.
-    Don't worry too much about this; they're not doing any harm,
-    and they'll be removed during posting anyway.
-
-
-    --> 348 lines in this file are short. Not reporting short lines.
-    --> 84 lines in this file are long. Not reporting long lines.
-    --> 8 lines in this file are VERY long!
-
-    If there are a lot of long or short lines, Gutcheck won't list
-    them individually. The short lines version of this message
-    is commonly seen when gutchecking poetry and some plays, where
-    the normal line length is shorter than the standard for prose.
-    A "VERY long" line is one over 80 characters.  You normally
-    shouldn't have any of these, but sometimes you may have to render
-    a table that must be that long, or some special preformatted
-    quotation that can't be broken.
-
-
-    --> There are 75 spaced dashes and em-dashes in this file. Not reporting them.
-
-    The PG standard for an emdash--like these--is two minus signs
-    with no spaces before or after them. However, some older texts
-    used spaced dashes - like these -- and if there are very many
-    such spaced dashes in the file, gutcheck just draws your
-    attention to it and doesn't list them individually.
-
-
-
-    Line 3020 - Non-ASCII character 233
-
-    Standard PG texts should use only ASCII characters with values
-    up to 127; however, non-English, accented characters can be 
-    represented according to several different non-ASCII encoding 
-    schemes, using values over 127. If you have a plain English text
-    with a few accented characters in words like cafe or tete-a-tete,
-    you should replace the accented characters with their unaccented 
-    versions. The English pound sign is another commonly-seen
-    non-ASCII character. If you have enough non-ASCII characters in
-    your text that you feel removing them would degrade your text
-    unacceptably, you should probably consider doing an 8-bit text
-    as well as a plain-ASCII version.
-
-
-
-    Line 1207 - Non-ISO-8859 character 156
-
-    Even in "8-bit" texts, there are distinctions between code sets.
-    The ISO-8859 family of 8-bit code sets is the most commonly used
-    in PG, and these sets do not define values in the range 128 through
-    159 as printable characters. It's quite common for someone on a
-    Windows or Mac machine to use a non-ISO character inadvertently,
-    so this message warns that the character is not only not ASCII,
-    but also outside the ISO-8859 range.
-
-
-
-    Line 46 - Tab character?
-
-    Some editors and WPs will put in Tab characters (character 9) to
-    indicate indented text. You should not use these in a PG text,
-    because you can't be sure how they will appear on a reader's
-    screen. Find the Tab, and replace it with the appropriate number
-    of spaces.
-
-
-    Line 1327 - Tilde character?
-
-    The tilde character (~) might be legitimately used, but it's the
-    character commonly used by OCR software to indicate a place where
-    it couldn't make out the letter, so gutcheck flags it.
-
-
-
-    Line 1347 - Asterisk?
-
-    Asterisks are reported only in paranoid mode (see -x). 
-    Like tildes, they are often used to indicate errors, but they are
-    also legitimately used as line delimiters and footnote markers.
-
-
-
-    Line 1451 - Long line 129
-
-    PG texts should have lines shorter than 76. There may be occasions
-    where you decide that you really have to go out to 79 characters,
-    but the sample above says that line 1451 is 129 characters long -
-    probably two lines run together.
-
-
-
-    Line 1590 - Short line?
-
-    PG texts should have lines longer than 54 characters. However,
-    there are special cases like poetry and tables of contents where
-    the lines _should_ be shorter. So treat Gutcheck warnings about
-    short lines carefully. Sometimes it's a genuine formatting
-    problem; sometimes the line really needs to be short.
-
-    Hint: gutcheck will not flag lines as short if they are indented
-    - if they start with a space. I like to start inserted stanzas
-    and other such items indented with a couple of spaces so that 
-    they stand out from the main text anyway.
-
-
-
-    Line 1804 - Begins with punctuation?
-
-    Lines should normally not begin with commas, periods and so on.
-    An exception is ellipses . . . which can happen at start of line.
-
-
-
-    Line 1850 - Spaced em-dash?
-
-    The PG standard for an em-dash--like these--is two minus signs
-    with no spaces before or after them. Gutcheck flags non-PG
-    em-dashes - like this one. Normally, you will replace it with a 
-    PG-standard em-dash.
-
-
-
-    Line 1904 - Query he/be error?
-
-    Gutcheck makes a very minor effort to look for that scourge of all
-    proofreaders, "be" replacing "he" or vice-versa, and draws your
-    attention to it when it thinks it has found one.
-
-
-
-    Line 2017 - Query digit in a1most
-
-    The digit 1 is commonly OCRed for the letter l, the digit 0 for
-    the letter O, and so on. When gutcheck sees a mix of digits and
-    letters, it warns you. It may generate a false positive for
-    something like 7am.
-
-
-
-    Line 2083 - Query standalone 0
-
-    In paranoid mode (see -x) only, gutcheck warns about the digit 0 
-    and the number 1 standing alone as a word. This can happen if the 
-    OCR misreads the words O or I.
-
-
-
-    Line 2115 - Query word whetber
-
-    If you have switched typo-checking on, gutcheck looks for
-    potential typos, especially common h/b errors. It's not
-    infallible; it sometimes queries legit words, but it's
-    always worth taking a look.
-
-
-
-    Line 2190 column 14 - Missing space?
-
-    Omitting a space is a very common error,especially coming from
-    OCRed text,and can be hard for a human to spot. The commas in
-    the previous sentence illustrate the kind of thing I mean.
-
-
-
-    Line 2240 column 48 - Spaced punctuation?
-
-    The flip side of the "missing space" error , here , is when extra
-    spaces are added before punctuation . Some old texts appear to add
-    extra spaces around punctuation consistently, but this was a
-    typographical convention rather than the author's intent, and the
-    extra "spaces" should be removed when preparing a PG text.
-
-
-
-    Line 2301 column 19 - Unspaced quotes?
-
-    Another common spacing problem occurs in a phrase like "You wait
-    there,"he said.
-
-
-
-    Line 2385 column 27 - Wrongspaced quotes?
-
-    As of version 0.98, gutcheck adds extra checks on whether a quote
-    seems to be a start or end quote, and queries those that appear to
-    be misplaced. This does give rise to false positives when quotes are
-    nested, for example:
-
-    "And how," she asked, "will your "friends" help you now?"
-
-    but these false positives are worth it because of the many cases
-    that this test catches, notably those like:
-
-    "And how, "she said," will your friends help you now?"
-
-    Sometimes a "wrongspaced quotes" query will arise because an earlier
-    quote in the paragraph was omitted, so if the place specified seems
-    to be OK, look back to see whether there's a problem in the preceding
-    lines.
-
-
-
-    Line 2400 - HTML Tag? 
-
-    Some PG texts have been converted from HTML, and not all of the
-    HTML tags have been removed.
-
-
-
-    Line 2402 - HTML symbol? &emdash;
-
-    Similarly, special HTML symbol characters can survive into PG
-    texts. Can occasionally produce amusing false positives like
-    . . . Marwick & Co were well known for it;
-
-
-
-    Line 2540 - Mismatched quotes
-
-    Another gutcheck mainstay - unclosed doublequotes in a paragraph.
-    See the discussion of quotes in the switches section near the
-    start of this file.
-    
-    Since the mismatch doesn't occur on any one line, gutcheck quotes
-    the line number of the first blank line following the paragraph,
-    since this is the point where it reconciles the count of quotes.
-    However, if gutcheck is echoing lines, that is, you haven't used
-    the -e switch, it will show the _first_ line of the paragraph, 
-    to help you find the place without using line numbers. The 
-    offending paragraph is therefore between the quoted line and 
-    the line number given.
-
-
-
-    Line 2587 - Mismatched single quotes
-
-    Only checked with the -s switch, since checking single quotes is 
-    not a very reliable process. Otherwise, the same logic as for 
-    doublequotes applies.
-
-
-
-    Line 2877 - Mismatched round brackets?
-
-    Also curly and square brackets. Texts with a lot of brackets, like
-    plays with bracketed stage instructions, may have mismatches.
-
-
-    Line 3150 - No CR?
-    Line 3204 - Two successive CRs?
-    Line 3281 position 75 - CR without LF?
-
-    These are the invalid line-end warnings. See the discussion of
-    line-end checking in the switches section near the start of this
-    file. If you see these, and your editor doesn't show anything
-    wrong, you should probably try deleting the characters just before
-    and after the line end, and the line-end itself, then retyping the
-    characters and the line-end.
-
-
-    Line 2940 - Paragraph starts with lower-case
-
-    A common error in an e-text is for an extra blank line
-
-    to be put in, like the blank line above, and this often
-    shows up as a new paragraph beginning with lower case.
-    Sometimes the blank line is deliberate, as when a 
-    quotation is inserted in a speech. Use your judgement.
-
-
-    Line 2987 - Extra period?
-
-    An extra period. is a. common problem in OCRed text. and usually
-    arises when a speck of dust on the page is mistaken for a period.
-    or. as occasionally happens. when a comma loses its tail.
-
-
-    Line 3012 column 12 - Double punctuation?
-
-    Double punctuation., like that,, is a common typo and
-    scanno. Some books have much legit double punctuation,
-    like etc., etc., but it's worth checking anyway.
-
-
-
-            *       *       *        *
-
-For Windows-only users who are unfamiliar with DOS:
-
-    If you're a Windows-only user, you need to save
-    gutcheck.exe into the folder (directory) where the
-    text file you want to check is. Let's say your
-    text file is in C:\GUT, then you should save
-    GUTCHECK.EXE into C:\GUT.
-
-    Now get to a DOS prompt. You can do this by
-    selecting the "Command Prompt" or "MS-DOS Prompt"
-    option that will be somewhere on your
-    Start/Programs menu.
-
-    Now get into the C:\GUT directory. 
-    You can do this using the CD (change directory) 
-    command, like this:
-        CD \GUT
-    and your prompt will change to 
-        C:\GUT>
-    so you know you're in the right place.
-
-    Now type
-        gutcheck yourfile.txt
-    and you'll see gutcheck's report
-
-    By default, gutcheck prints its queries to screen.
-    If you want to create a file of them, to edit
-    against the text, you can use the greater-than
-    sign (>) to tell it to output the report to a
-    file. For example, if you want its report in a
-    file called QUERIES.LST, you could type
-    
-        gutcheck yourfile.txt > queries.lst
-
-    The queries.lst file will then contain the listing
-    of possible formatting errors, and you can
-    edit it alongside your text.
-
-    Whatever you do, DON'T make the filename after
-    the greater-than sign the name of a file already
-    on your disk that you want to keep, because
-    the greater-than sign will cause gutcheck to
-    replace any existing file of that name.
-
-    So, for example, if you have two Tolstoy files
-    that you want to check, called WARPEACE.TXT and 
-    ANNAK.TXT, make sure that neither of these names
-    is ever used following the greater-than sign.
-    To check these correctly, you might do:
-
-    gutcheck warpeace.txt >war.lst
-
-    and
-
-    gutcheck annak.txt > annak.lst
-
-    separately. Then you can look at war.lst and annak.lst
-    to see the gutcheck reports.
-
-            *       *       *        *
-
-
-For existing 0.98 users upgrading to 0.99:
-
-    If you run on old 16-bit DOS or Windows 3.x, I'm afraid
-    you're out of luck. I'm not saying it _can't_ be compiled
-    to run on 16-bit, but the executable with the package is
-    for Win32 only. *nix users won't notice the change at all.
-
-
-    There are two new switches: -u and -d. 
-          See above for full rundown.
-
-
-Here's a list of the new errors:
-
-    Line 1456 - Carat character?
-
-    I^ve found a few.
-
-
-    Line 1821 - Forward slash?
-
-    Common error for italicized "I", or so /'ve found.
-
-
-    Line 2139 - Query missing paragraph break?
-
-    "Come here, son." "Do I _have_ to go, dad?"
-    Like that. False positives in some texts. Sorry 'bout that,
-    but these are often errors.
-
-
-    Line 2200 - Query had/bad error?
-
-    Clear enough. Doesn't catch as many as I'd like it to,
-    but rarely gives false alarms.
-
-
-    Line 2268 - Query punctuation after the?
-
-    Some words, like "the", very rarely have punctuation
-    following them. Others, like "Mrs", usually have a
-    period, but never a comma. Occasional false positives.
-
-
-    Line 2380 - Query possible scanno arid
-
-    It found one of your user-defined typos when you
-    used the -u switch.
-
-
-    Line 2511 - Capital "S"?
-
-    Surprisingly common specific case, like: Jane'S 
-
-    
-    Line 3469 - endquote missing punctuation?
-
-    OK. This one can really cause a lot of false positives
-    in some books, but it switches itself off if it finds
-    more than 20 in a text, unless you force it to list them
-    all with the -v switch.
-    "Hey, dad" Johnny said, "can we go now?"
-    is a common punctuation-missing error.
-
-
-    Line 4266 - Mismatched underscores?
-
-    Like mismatched anything else!
-
-
diff -r 218904410231 -r f600b0d1fc5d doc/loupe-test.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/loupe-test.txt	Fri Jan 27 10:30:16 2012 +0000
@@ -0,0 +1,68 @@
+                           bookloupe test framework
+                           ========================
+
+Running existing testcases
+--------------------------
+
+The test harness (the program that runs a test) is called loupe-test. The
+various testcases are stored in multiple text files, typically with a .tst
+extension.
+
+To run a testcase when all of bookloupe, loupe-test and the testcase file are
+in the current directory simply do something like:
+
+% loupe-test missing-space.tst
+
+from a command prompt. Under MS-Windows, this is called a command window and
+the prompt will normally look slightly different, e.g.,
+
+C:\DP> loupe-test missing-space.tst
+
+To run all the tests in the current directory, do something like this:
+
+% loupe-test *.tst
+
+If bookloupe is not in the current directory or you want to run the testsuite
+against gutcheck (the program that bookloupe is based on), then you can set an
+environment variable (BOOKLOUPE) to point at it. For example, on MS-Windows
+you might do:
+
+C:\DP> set BOOKLOUPE=C:\GUTCHECK\GUTCHECK.EXE
+C:\DP> loupe-test *.tst
+
+Writing your own testcases
+--------------------------
+
+Writing a new testcase is pretty painless. Most testcases follow this simple
+pattern:
+
+		┌──────────────────────────────────────────┐
+		│**************** INPUT ****************   │
+		│"Look!John, over there!"                  │
+		│**************** EXPECTED ****************│
+		│                                          │
+		│"Look!John, over there!"                  │
+		│    Line 1 column 6 - Missing space?      │
+		└──────────────────────────────────────────┘
+
+The sixteen asterisks in this example form what is known as the "flag". This
+flag must come before and after all tags (e.g., INPUT and EXPECTED). In the
+unlikely event that you need sixteen asterisks at the start of a line of text,
+then simply choose a different flag and use it throughout the file (flags
+can be any sequence of ASCII characters except control codes and space).
+
+Note that the header that bookloupe and gutcheck normally output is not
+included in the expected output. This avoids problems with not knowing
+beforehand the name of the file that bookloupe/gutcheck will be asked to
+look at (and saves typing!). bookloupe (and gutcheck) prints a blank line
+before each warning. These are not part of the header and so do need to
+be included.
+
+To test that bookloupe produces no output, you still need to include
+an EXPECTED tag, just with no text following it. If there is no EXPECTED
+tag, then loupe-test will consider that no expectation exists and won't
+check the output at all.
+
+There is no support yet for non-ASCII testcases, embedded linefeeds,
+passing command line options to bookloupe or for testcases which are
+expected to fail.
diff -r 218904410231 -r f600b0d1fc5d gclib/Makefile.am
--- a/gclib/Makefile.am	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-INCLUDES=-I$(top_srcdir)
-AM_CFLAGS=$(GLIB_CFLAGS)
-LIBS=$(GLIB_LIBS)
-
-noinst_LTLIBRARIES=libgc.la
-libgc_la_SOURCES=gclib.h textfileutils.c textfileutils.h spawn.c spawn.h
-if !HAVE_GLIB
-libgc_la_SOURCES+=macros.h types.h fileutils.c fileutils.h mem.c mem.h \
-  strfuncs.c strfuncs.h gcstring.c gcstring.h utils.c utils.h
-endif
diff -r 218904410231 -r f600b0d1fc5d gclib/fileutils.c
--- a/gclib/fileutils.c	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,46 +0,0 @@
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-/*
- * Read a file into memory (which should be freed with mem_free when no
- * longer required). Returns FALSE on error and outputs a suitable error
- * message to stderr.
- */
-boolean file_get_contents(const char *filename,char **contents,size_t *length)
-{
-    FILE *fp;
-    size_t n;
-    char *buffer;
-    String *string;
-    fp=fopen(filename,"rb");
-    if (!fp)
-    {
-	perror(filename);
-	return FALSE;
-    }
-    buffer=mem_new(char,1024);
-    string=string_new(NULL);
-    do
-    {
-	n=fread(buffer,1,1024,fp);
-	if (n<0)
-	{
-	    perror(filename);
-	    string_free(string,TRUE);
-	    mem_free(buffer);
-	    free(fp);
-	    return FALSE;
-	}
-	string_append_len(string,buffer,n);
-    } while(n);
-    mem_free(buffer);
-    if (length)
-	*length=string->len;
-    *contents=string_free(string,FALSE);
-    fclose(fp);
-    return TRUE;
-}
diff -r 218904410231 -r f600b0d1fc5d gclib/fileutils.h
--- a/gclib/fileutils.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-#ifndef GC_FILEUTILS_H
-#define GC_FILEUTILS_H
-
-#include 
-
-boolean file_get_contents(const char *filename,char **contents,size_t *length);
-
-#endif /* GC_FILEUTILS_H */
diff -r 218904410231 -r f600b0d1fc5d gclib/gclib.h
--- a/gclib/gclib.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-#if HAVE_GLIB
-
-#include 
-#define GC_DIR_SEPARATOR G_DIR_SEPARATOR
-#define GC_DIR_SEPARATOR_S G_DIR_SEPARATOR_S
-#define GC_IS_DIR_SEPARATOR(c) G_IS_DIR_SEPARATOR(c)
-#define boolean gboolean
-#define String GString
-#define mem_new0 g_new0
-#define mem_free g_free
-#define str_dup g_strdup
-#define str_ndup g_strndup
-#define path_get_basename g_path_get_basename
-#define file_get_contents(filename,contents,length) \
-  g_file_get_contents(filename,contents,length,NULL)
-#define string_new g_string_new
-#define string_append g_string_append
-#define string_append_len g_string_append_len
-#define string_append_c g_string_append_c
-#define string_free g_string_free
-#define string_set_size g_string_set_size
-
-#else	/* !HAVE_GLIB */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#endif	/* HAVE_GLIB */
-
-#include 
-#include 
diff -r 218904410231 -r f600b0d1fc5d gclib/gcstring.c
--- a/gclib/gcstring.c	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,90 +0,0 @@
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-/*
- * Strings which manage their own memory
- */
-
-String *string_new(const char *init)
-{
-    String *string=mem_new(String,1);
-    if (!init)
-	init="";
-    string->len=strlen(init);
-    string->alloc=string->len+1;
-    string->str=str_dup(init);
-    return string;
-}
-
-/*
- * Free a string and either return the contents (if free_segment is FALSE)
- * or free the contents as well and return NULL (if free_segment is TRUE).
- */
-char *string_free(String *string,boolean free_segment)
-{
-    char *retval;
-    if (free_segment)
-    {
-	mem_free(string->str);
-	retval=NULL;
-    }
-    else
-	retval=string->str;
-    mem_free(string);
-    return retval;
-}
-
-/*
- * Append a byte to string.
- */
-void string_append_c(String *string,char c)
-{
-    if (string->len+1==string->alloc)
-    {
-	string->alloc*=2;
-	string->str=mem_renew(char,string->str,string->alloc);
-    }
-    string->str[string->len++]=c;
-    string->str[string->len]='\0';
-}
-
-/*
- * Append len bytes from s to string. len may be passed as <0 if s is
- * a nul-terminated string of unknown length.
- */
-void string_append_len(String *string,const char *s,ssize_t len)
-{
-    if (len<0)
-	len=strlen(s);
-    if (string->len+len>=string->alloc)
-    {
-	while (string->len+len>=string->alloc)
-	    string->alloc*=2;
-	string->str=mem_renew(char,string->str,string->alloc);
-    }
-    memcpy(string->str+string->len,s,len);
-    string->len+=len;
-    string->str[string->len]='\0';
-}
-
-/*
- * Sets the length of a String. If the length is less than the current length,
- * the string will be truncated. If the length is greater than the current
- * length, the contents of the newly added area are undefined. (However, as
- * always, string->str[string->len] will be a nul byte.)
- */
-void string_set_size(String *string,size_t len)
-{
-    if (len>=string->alloc)
-    {
-	while (len>=string->alloc)
-	    string->alloc*=2;
-	string->str=mem_renew(char,string->str,string->alloc);
-    }
-    string->len=len;
-    string->str[string->len]='\0';
-}
diff -r 218904410231 -r f600b0d1fc5d gclib/gcstring.h
--- a/gclib/gcstring.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-#ifndef GC_STRING_H
-#define GC_STRING_H
-
-#include 
-#include 
-
-typedef struct {
-    char *str;
-    size_t alloc,len;
-} String;
-
-String *string_new(const char *init);
-char *string_free(String *string,boolean free_segment);
-void string_append_c(String *string,char c);
-void string_append_len(String *string,const char *s,ssize_t len);
-#define string_append(string,s)		string_append_len(string,s,-1)
-
-#endif /* GC_STRING_H */
diff -r 218904410231 -r f600b0d1fc5d gclib/macros.h
--- a/gclib/macros.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-#ifndef FALSE
-#define FALSE	0
-#endif
-
-#ifndef TRUE
-#define TRUE	(!FALSE)
-#endif
diff -r 218904410231 -r f600b0d1fc5d gclib/mem.c
--- a/gclib/mem.c	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-#include 
-#include 
-#include 
-#include 
-
-/*
- * A memory allocator that aborts on failure (so that the caller never
- * needs to handle out of memory, which we assume is very unlikely to
- * happen under normal circumstances on any modern machine).
- */
-void *mem_alloc(size_t nmemb,size_t size)
-{
-    void *ptr=malloc(nmemb*size);
-    if (!ptr)
-    {
-	fprintf(stderr,
-	  "Not enough memory to allocate %lu elements of %lu bytes.\n",
-	  (unsigned long)nmemb,(unsigned long)size);
-	abort();
-    }
-    return ptr;
-}
-
-/*
- * As mem_new, but new memory is cleared to zero.
- */
-void *mem_alloc0(size_t nmemb,size_t size)
-{
-    void *ptr=calloc(nmemb,size);
-    if (!ptr)
-    {
-	fprintf(stderr,
-	  "Not enough memory to allocate %lu elements of %lu bytes.\n",
-	  (unsigned long)nmemb,(unsigned long)size);
-	abort();
-    }
-    return ptr;
-}
-
-/*
- * Grow or shrink a memory block, aborting on failure.
- */
-void *mem_realloc(void *ptr,size_t nmemb,size_t size)
-{
-    ptr=realloc(ptr,nmemb*size);
-    if (!ptr)
-    {
-	fprintf(stderr,
-	  "Not enough memory to allocate %lu elements of %lu bytes.\n",
-	  (unsigned long)nmemb,(unsigned long)size);
-	abort();
-    }
-    return ptr;
-}
diff -r 218904410231 -r f600b0d1fc5d gclib/mem.h
--- a/gclib/mem.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-#ifndef GC_MEM_H
-#define GC_MEM_H
-
-void *mem_alloc(size_t nmemb,size_t size);
-void *mem_alloc0(size_t nmemb,size_t size);
-void *mem_realloc(void *ptr,size_t nmemb,size_t size);
-
-#define mem_new(type,n)		((type *)mem_alloc(n,sizeof(type)))
-#define mem_new0(type,n)	((type *)mem_alloc0(n,sizeof(type)))
-#define mem_renew(type,ptr,n)	((type *)mem_realloc(ptr,n,sizeof(type)))
-#define mem_free(ptr)		free(ptr)
-
-#endif /* GC_MEM_H */
diff -r 218904410231 -r f600b0d1fc5d gclib/spawn.c
--- a/gclib/spawn.c	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,84 +0,0 @@
-#include 
-#include 
-#ifndef WIN32
-#include 
-#endif
-#include 
-
-#define SPAWN_BUFSIZE	128
-
-boolean spawn_sync(char **argv,char **standard_output,int *exit_status)
-{
-/* Don't use g_spawn_sync on WIN32 for now to avoid needing the helper */
-#if HAVE_GLIB && !defined(WIN32)
-    char *standard_error;
-    GError *error=NULL;
-    gboolean retval;
-    GSpawnFlags flags=G_SPAWN_SEARCH_PATH;
-    if (!standard_output)
-	flags=G_SPAWN_STDOUT_TO_DEV_NULL;
-    retval=g_spawn_sync(NULL,argv,NULL,flags,NULL,NULL,standard_output,
-      &standard_error,exit_status,&error);
-    fputs(standard_error,stderr);
-    g_free(standard_error);
-    if (!retval)
-    {
-	fprintf(stderr,"%s\n",error->message);
-	g_error_free(error);
-    }
-    else if (exit_status)
-	*exit_status=WEXITSTATUS(*exit_status);
-    return retval;
-#else
-    FILE *fp;
-    int i,r;
-    size_t n,len;
-    String *command_line,*string;
-    command_line=string_new(NULL);
-    for(i=0;argv[i];i++)
-    {
-	if (i)
-	    string_append_c(command_line,' ');
-	string_append(command_line,argv[i]);
-    }
-    fp=popen(command_line->str,"r");
-    string_free(command_line,TRUE);
-    if (!fp)
-    {
-	perror(command_line->str);
-	return FALSE;
-    }
-    string=string_new(NULL);
-    do
-    {
-	len=string->len;
-	string_set_size(string,len+SPAWN_BUFSIZE);
-	n=fread(string->str+len,1,SPAWN_BUFSIZE,fp);
-	if (n<0)
-	{
-	    perror("fread");
-	    (void)pclose(fp);
-	    string_free(string,TRUE);
-	    return FALSE;
-	}
-	string_set_size(string,len+n);
-    } while(n);
-    r=pclose(fp);
-    if (r<0)
-    {
-	perror("pclose");
-	string_free(string,TRUE);
-	return FALSE;
-    }
-    else
-    {
-	if (exit_status)
-	    *exit_status=r;
-	if (standard_output)
-	    *standard_output=string_free(string,FALSE);
-	else
-	    string_free(string,TRUE);
-	return TRUE;
-    }
-#endif
-}
diff -r 218904410231 -r f600b0d1fc5d gclib/spawn.h
--- a/gclib/spawn.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-#ifndef GC_SPAWN_H
-#define GC_SPAWN_H
-
-#include 
-
-boolean spawn_sync(char **argv,char **standard_output,int *exit_status);
-
-#endif /* GC_SPAWN_H */
diff -r 218904410231 -r f600b0d1fc5d gclib/strfuncs.c
--- a/gclib/strfuncs.c	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-#include 
-#include 
-#include 
-#include 
-
-/*
- * Like strndup, but only returns NULL if str is NULL.
- * Note that this routine copies n bytes rather than n characters.
- */
-char *str_ndup(const char *str,size_t n)
-{
-    char *dup;
-    if (!str)
-	return NULL;
-    dup=mem_alloc0(n+1,1);
-    strncpy(dup,str,n);
-    return dup;
-}
-
-/*
- * Like strdup, but only returns NULL if str is NULL.
- */
-char *str_dup(const char *str)
-{
-    return str_ndup(str,strlen(str));
-}
diff -r 218904410231 -r f600b0d1fc5d gclib/strfuncs.h
--- a/gclib/strfuncs.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-#ifndef GC_STRFUNCS_H
-#define GC_STRFUNCS_H
-
-char *str_dup(const char *str);
-char *str_ndup(const char *str,size_t n);
-
-#endif /* GC_STRFUNCS_H */
diff -r 218904410231 -r f600b0d1fc5d gclib/textfileutils.c
--- a/gclib/textfileutils.c	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-#include 
-#include 
-#include 
-
-/*
- * Read a file into memory (which should be freed with mem_free when no
- * longer required). Returns NULL on error and outputs a suitable error
- * message to stderr.
- * DOS-style line endings are handled transparently even on platforms which
- * don't normally use this format.
- */
-boolean file_get_contents_text(const char *filename,char **contents,
-  size_t *length)
-{
-    int i;
-    char *raw;
-    size_t raw_length;
-    String *string;
-    if (!file_get_contents(filename,&raw,&raw_length))
-	return FALSE;
-    string=string_new(NULL);
-    for(i=0;ilen;
-    if (contents)
-	*contents=string_free(string,FALSE);
-    else
-	string_free(string,TRUE);
-    return TRUE;
-}
diff -r 218904410231 -r f600b0d1fc5d gclib/textfileutils.h
--- a/gclib/textfileutils.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-#ifndef GC_TEXTFILEUTILS_H
-#define GC_TEXTFILEUTILS_H
-
-#include 
-
-boolean file_get_contents_text(const char *filename,char **contents,
-  size_t *length);
-
-#endif /* GC_TEXTFILEUTILS_H */
diff -r 218904410231 -r f600b0d1fc5d gclib/types.h
--- a/gclib/types.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-#ifndef GC_TYPES_H
-#define GC_TYPES_H
-
-typedef int boolean;
-
-#endif	/* GC_TYPES_H */
diff -r 218904410231 -r f600b0d1fc5d gclib/utils.c
--- a/gclib/utils.c	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,46 +0,0 @@
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#define is_valid_drive(d)	((d)>='a' && (d)<='z' || (d)>='A' && (d)<='Z')
-
-/*
- * Gets the last component of the filename. If filename ends with a directory
- * separator it gets the component before the last slash. If filename consists
- * only of directory separators (and on Windows, possibly a drive letter), a
- * single separator is returned. If filename is empty, it gets ".".
- */
-char *path_get_basename(const char *filename)
-{
-    ssize_t base,last_nonslash;
-    size_t len;
-    char *retval;
-    if (*filename=='\0')
-        return str_dup(".");
-    last_nonslash=strlen(filename)-1;
-    while (last_nonslash>=0 && GC_IS_DIR_SEPARATOR(filename[last_nonslash]))
-	last_nonslash--;
-    if (last_nonslash<0)
-	/* string only containing slashes */
-    return str_dup(GC_DIR_SEPARATOR_S);
-#ifdef WIN32
-    if (last_nonslash==1 && is_valid_drive(filename[0]) && filename[1]==':')
-	/* string only containing slashes and a drive */
-	return str_dup(GC_DIR_SEPARATOR_S);
-#endif
-    base=last_nonslash;
-    while (base>=0 && !GC_IS_DIR_SEPARATOR(filename[base]))
-	base--;
-#ifdef WIN32
-    if (base==-1 && is_valid_drive(filename[0]) && filename[1] == ':')
-	  base=1;
-#endif
-    len=last_nonslash-base;
-    retval=mem_alloc(len+1,1);
-    memcpy(retval,filename+base+1,len);
-    retval[len]='\0';
-    return retval;
-}
diff -r 218904410231 -r f600b0d1fc5d gclib/utils.h
--- a/gclib/utils.h	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,16 +0,0 @@
-#ifndef GC_UTIL_H
-#define GC_UTIL_H
-
-#ifdef WIN32
-#define GC_DIR_SEPARATOR '\\'
-#define GC_DIR_SEPARATOR_S "\\"
-#define GC_IS_DIR_SEPARATOR(c) ((c)==GC_DIR_SEPARATOR || (c)=='/')
-#else
-#define GC_DIR_SEPARATOR '/'
-#define GC_DIR_SEPARATOR_S "/"
-#define GC_IS_DIR_SEPARATOR(c) ((c)==GC_DIR_SEPARATOR)
-#endif
-
-char *path_get_basename(const char *filename);
-
-#endif /* GC_UTIL_H */
diff -r 218904410231 -r f600b0d1fc5d gutcheck/Makefile.am
--- a/gutcheck/Makefile.am	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-bin_PROGRAMS=gutcheck
-pkgdata_DATA=gutcheck.typ
-
-gutcheck.typ:	gutcheck.typ.in
-	sed 's/$$/\r/' $< > $@
-
-EXTRA_DIST=gutcheck.typ.in
-CLEANFILES=gutcheck.typ
diff -r 218904410231 -r f600b0d1fc5d gutcheck/gutcheck.c
--- a/gutcheck/gutcheck.c	Fri Jan 27 00:28:11 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2982 +0,0 @@
-/*************************************************************************/
-/* gutcheck - check for assorted weirdnesses in a PG candidate text file */
-/*                                                                       */
-/* Version 0.991                                                         */
-/* Copyright 2000-2005 Jim Tinsley                   */
-/*                                                                       */
-/* This program is free software; you can redistribute it and/or modify  */
-/* it under the terms of the GNU General Public License as published by  */
-/* the Free Software Foundation; either version 2 of the License, or     */
-/* (at your option) any later version.                                   */
-/*                                                                       */
-/* This program is distributed in the hope that it will be useful,       */
-/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
-/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         */
-/* GNU General Public License for more details.                          */
-/*                                                                       */
-/* You should have received a copy of the GNU General Public License     */
-/* along with this program; if not, write to the                         */
-/*      Free Software Foundation, Inc.,                                  */
-/*      59 Temple Place,                                                 */
-/*      Suite 330,                                                       */
-/*      Boston, MA  02111-1307  USA                                      */
-/*                                                                       */
-/*                                                                       */
-/*                                                                       */
-/* Overview comments:                                                    */
-/*                                                                       */
-/* If you're reading this, you're either interested in how to detect     */
-/* formatting errors, or very very bored.                                */
-/*                                                                       */
-/* Gutcheck is a homebrew formatting checker specifically for            */
-/* spotting common formatting problems in a PG e-text. I typically       */
-/* run it once or twice on a file I'm about to submit; it usually        */
-/* finds a few formatting problems. It also usually finds lots of        */
-/* queries that aren't problems at all; it _really_ doesn't like         */
-/* the standard PG header, for example.  It's optimized for straight     */
-/* prose; poetry and non-fiction involving tables tend to trigger        */
-/* false alarms.                                                         */
-/*                                                                       */
-/* The code of gutcheck is not very interesting, but the experience      */
-/* of what constitutes a possible error may be, and the best way to      */
-/* illustrate that is by example.                                        */
-/*                                                                       */
-/*                                                                       */
-/* Here are some common typos found in PG texts that gutcheck            */
-/* will flag as errors:                                                  */
-/*                                                                       */
-/* "Look!John , over there!"                                             */
-/*                                                   */
-/* &so is this;                                                          */
-/* Margaret said: " Now you should start for school."                    */
-/* Margaret said: "Now you should start for school. (if end of para)     */
-/* The horse is said to he worth a lot.                                  */
-/* 0K - this'11 make you look close1y.                                   */
-/* "If you do. you'll regret it!"                                        */
-/*                                                                       */
-/* There are some complications . The extra space left around that       */
-/* period was an error . . . but that ellipsis wasn't.                   */
-/*                                                                       */
-/* The last line of a paragraph                                          */
-/* is usually short.                                                     */
-/*                                                                       */
-/* This period is an error.But the periods in a.m. aren't.               */
-/*                                                                       */
-/* Checks that are do-able but not (well) implemented are:               */
-/*        Single-quote chcking.                                          */
-/*          Despite 3 attempts at it, singlequote checking is still      */
-/*          crap in gutcheck. It may not be possible without analysis    */
-/*          of the whole paragraph.                                      */
-/*                                                                       */
-/*************************************************************************/
-
-
-#include 
-#include 
-#include 
-#include 
-
-#define MAXWORDLEN    80    /* max length of one word             */
-#define LINEBUFSIZE 2048    /* buffer size for an input line      */
-
-#define MAX_USER_TYPOS 1000
-#define USERTYPO_FILE "gutcheck.typ"
-
-#ifndef MAX_PATH
-#define MAX_PATH 16384
-#endif
-
-char aline[LINEBUFSIZE];
-char prevline[LINEBUFSIZE];
-
-                 /* Common typos. */
-char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad",
-                "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om",
-                "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
-                "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign",
-                "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere",
-                "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev",
-                "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk",
-                "owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
-                "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry",
-                "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
-                "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae",
-                "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
-                "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
-                "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera",
-                "yeras", "yersa", "yoiu", "youve", "ytou", "yuor",
-                /* added h/b words for version 12 - removed a few with "tbe" v.25 */
-                "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind", 
-                "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates", 
-                "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing", 
-                "helieve", "henefit", "hetter", "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh", 
-                "meanwbile", "memher", "memhers", "numher", "numhers", 
-                "perbaps", "prohlem", "puhlic", "witbout", 
-                /* and a few more for .18 */
-                "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo",
-                "heside", "chapteb", "chaptee", "se",
-                 ""};
-
-char *usertypo[MAX_USER_TYPOS];
-
-                 /* Common abbreviations and other OK words not to query as typos. */
-                 /* 0.99 last-minute - removed "ms"      */
-char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br",
-                  "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian",
-                  "hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten",
-                  ""};
-
-                 /* Common abbreviations that cause otherwise unexplained periods. */
-char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit",
-                  "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
-                  ""};
-                 /* Two-Letter combinations that rarely if ever start words, */
-                 /* but are common scannos or otherwise common letter        */
-                 /* combinations.                                            */
-char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl",
-                    "tn", "rn", "lt", "tj",
-                    "" };
-
-                 /* Two-Letter combinations that rarely if ever end words    */
-                 /* but are common scannos or otherwise common letter        */
-                 /* combinations                                             */
-char *noend[]   = { "cb", "gb", "pb", "sb", "tb", 
-                    "wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl",
-                    "iy",
-                    ""};
-
-char *markup[]  = { "a", "b", "big", "blockquote", "body", "br", "center", 
-                    "col", "div", "em", "font", "h1", "h2", "h3", "h4", 
-                    "h5", "h6", "head", "hr", "html", "i", "img", "li", 
-                    "meta", "ol", "p", "pre", "small", "span", "strong", 
-                    "sub", "sup", "table", "td", "tfoot", "thead", "title", 
-                    "tr", "tt", "u", "ul", 
-                    ""};
-
-char *DPmarkup[] = { "", "", "/*", "*/", "/#", "#/", "/$", "$/", "",
-                    ""}; /*  added .991 */
-
-char *nocomma[]  = { "the", "it's", "their", "an", "mrs", "a", "our", "that's",
-                     "its", "whose", "every", "i'll", "your", "my", 
-                     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd", 
-                     "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", 
-                     "i'm", "during", "let", "toward", "among",
-                     ""};
-
-
-char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or", 
-                     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether", 
-                     "i'll", "whose", "who", "because", "when", "let", "till", "very",
-                     "an", "among", "those", "into", "whom", "having", "thence",
-                     ""}; 
-
-
-char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";  /* Carlo's old suggestion, updated .991 */
-
-struct {
-    char *htmlent;
-    char *htmlnum;
-    char *textent;
-    } entities[] = { "&",           "&",        "&", 
-                     "<",            "<",        "<",
-                     ">",            ">",        ">",
-                     "°",           "°",       " degrees",
-                     "£",         "£",       "L",
-                     """,          """,        "\"",   /* -- quotation mark = APL quote, */
-                     "Œ",         "Œ",       "OE",  /* -- latin capital ligature OE, */
-                     "œ",         "œ",       "oe",  /* -- latin small ligature oe, U+0153 ISOlat2 --> */
-                     "Š",        "Š",       "S",  /* -- latin capital letter S with caron, */
-                     "š",        "š",       "s",  /* -- latin small letter s with caron, */
-                     "Ÿ",          "Ÿ",       "Y",  /* -- latin capital letter Y with diaeresis, */
-                     "ˆ",          "ˆ",       "",  /* -- modifier letter circumflex accent, */
-                     "˜",         "˜",       "~",  /* -- small tilde, U+02DC ISOdia --> */
-                     " ",          " ",      " ", /* -- en space, U+2002 ISOpub --> */
-                     " ",          " ",      " ", /* -- em space, U+2003 ISOpub --> */
-                     " ",        " ",      " ", /* -- thin space, U+2009 ISOpub --> */
-                     "–",         "–",      "-", /* -- en dash, U+2013 ISOpub --> */
-                     "—",         "—",      "--", /* -- em dash, U+2014 ISOpub --> */
-                     "‘",         "‘",      "'", /* -- left single quotation mark, */
-                     "’",         "’",      "'", /* -- right single quotation mark, */
-                     "‚",         "‚",      "'", /* -- single low-9 quotation mark, U+201A NEW --> */
-                     "“",         "“",      "\"", /* -- left double quotation mark, */
-                     "”",         "”",      "\"", /* -- right double quotation mark, */
-                     "„",         "„",      "\"", /* -- double low-9 quotation mark, U+201E NEW --> */
-                     "‹",        "‹",      "\"", /* -- single left-pointing angle quotation mark, */
-                     "›",        "›",      "\"", /* -- single right-pointing angle quotation mark, */
-                     " ",          " ",       " ", /* -- no-break space = non-breaking space, */
-                     "¡",         "¡",       "!", /* -- inverted exclamation mark, U+00A1 ISOnum --> */
-                     "¢",          "¢",       "c", /* -- cent sign, U+00A2 ISOnum --> */
-                     "£",         "£",       "L", /* -- pound sign, U+00A3 ISOnum --> */
-                     "¤",        "¤",       "$", /* -- currency sign, U+00A4 ISOnum --> */
-                     "¥",           "¥",       "Y", /* -- yen sign = yuan sign, U+00A5 ISOnum --> */
-                     "§",          "§",       "--", /* -- section sign, U+00A7 ISOnum --> */
-                     "¨",           "¨",       " ", /* -- diaeresis = spacing diaeresis, */
-                     "©",          "©",       "(C) ", /* -- copyright sign, U+00A9 ISOnum --> */
-                     "ª",          "ª",       " ", /* -- feminine ordinal indicator, U+00AA ISOnum --> */
-                     "«",         "«",       "\"", /* -- left-pointing double angle quotation mark */
-                     "­",           "­",       "-", /* -- soft hyphen = discretionary hyphen, */
-                     "®",           "®",       "(R) ", /* -- registered sign = registered trade mark sign, */
-                     "¯",          "¯",       " ", /* -- macron = spacing macron = overline */
-                     "°",           "°",       " degrees", /* -- degree sign, U+00B0 ISOnum --> */
-                     "±",        "±",       "+-", /* -- plus-minus sign = plus-or-minus sign, */
-                     "²",          "²",       "2", /* -- superscript two = superscript digit two */
-                     "³",          "³",       "3", /* -- superscript three = superscript digit three */
-                     "´",         "´",       " ", /* -- acute accent = spacing acute, */
-                     "µ",         "µ",       "m", /* -- micro sign, U+00B5 ISOnum --> */
-                     "¶",          "¶",       "--", /* -- pilcrow sign = paragraph sign, */
-                     "¸",         "¸",       " ", /* -- cedilla = spacing cedilla, U+00B8 ISOdia --> */
-                     "¹",          "¹",       "1", /* -- superscript one = superscript digit one, */
-                     "º",          "º",       " ", /* -- masculine ordinal indicator, */
-                     "»",         "»",       "\"", /* -- right-pointing double angle quotation mark */
-                     "¼",        "¼",       "1/4", /* -- vulgar fraction one quarter */
-                     "½",        "½",       "1/2", /* -- vulgar fraction one half */
-                     "¾",        "¾",       "3/4", /* -- vulgar fraction three quarters */
-                     "¿",        "¿",       "?", /* -- inverted question mark */
-                     "À",        "À",       "A", /* -- latin capital letter A with grave */
-                     "Á",        "Á",       "A", /* -- latin capital letter A with acute, */
-                     "Â",         "Â",       "A", /* -- latin capital letter A with circumflex, */
-                     "Ã",        "Ã",       "A", /* -- latin capital letter A with tilde, */
-                     "Ä",          "Ä",       "A", /* -- latin capital letter A with diaeresis, */
-                     "Å",         "Å",       "A", /* -- latin capital letter A with ring above */
-                     "Æ",         "Æ",       "AE", /* -- latin capital letter AE */
-                     "Ç",        "Ç",       "C", /* -- latin capital letter C with cedilla, */
-                     "È",        "È",       "E", /* -- latin capital letter E with grave, */
-                     "É",        "É",       "E", /* -- latin capital letter E with acute, */
-                     "Ê",         "Ê",       "E", /* -- latin capital letter E with circumflex, */
-                     "Ë",          "Ë",       "E", /* -- latin capital letter E with diaeresis, */
-                     "Ì",        "Ì",       "I", /* -- latin capital letter I with grave, */
-                     "Í",        "Í",       "I", /* -- latin capital letter I with acute, */
-                     "Î",         "Î",       "I", /* -- latin capital letter I with circumflex, */
-                     "Ï",          "Ï",       "I", /* -- latin capital letter I with diaeresis, */
-                     "Ð",           "Ð",       "E", /* -- latin capital letter ETH, U+00D0 ISOlat1 --> */
-                     "Ñ",        "Ñ",       "N", /* -- latin capital letter N with tilde, */
-                     "Ò",        "Ò",       "O", /* -- latin capital letter O with grave, */
-                     "Ó",        "Ó",       "O", /* -- latin capital letter O with acute, */
-                     "Ô",         "Ô",       "O", /* -- latin capital letter O with circumflex, */
-                     "Õ",        "Õ",       "O", /* -- latin capital letter O with tilde, */
-                     "Ö",          "Ö",       "O", /* -- latin capital letter O with diaeresis, */
-                     "×",         "×",       "*", /* -- multiplication sign, U+00D7 ISOnum --> */
-                     "Ø",        "Ø",       "O", /* -- latin capital letter O with stroke */
-                     "Ù",        "Ù",       "U", /* -- latin capital letter U with grave, */
-                     "Ú",        "Ú",       "U", /* -- latin capital letter U with acute, */
-                     "Û",         "Û",       "U", /* -- latin capital letter U with circumflex, */
-                     "Ü",          "Ü",       "U", /* -- latin capital letter U with diaeresis, */
-                     "Ý",        "Ý",       "Y", /* -- latin capital letter Y with acute, */
-                     "Þ",         "Þ",       "TH", /* -- latin capital letter THORN, */
-                     "ß",         "ß",       "sz", /* -- latin small letter sharp s = ess-zed, */
-                     "à",        "à",       "a", /* -- latin small letter a with grave */
-                     "á",        "á",       "a", /* -- latin small letter a with acute, */
-                     "â",         "â",       "a", /* -- latin small letter a with circumflex, */
-                     "ã",        "ã",       "a", /* -- latin small letter a with tilde, */
-                     "ä",          "ä",       "a", /* -- latin small letter a with diaeresis, */
-                     "å",         "å",       "a", /* -- latin small letter a with ring above */
-                     "æ",         "æ",       "ae", /* -- latin small letter ae */
-                     "ç",        "ç",       "c", /* -- latin small letter c with cedilla, */
-                     "è",        "è",       "e", /* -- latin small letter e with grave, */
-                     "é",        "é",       "e", /* -- latin small letter e with acute, */
-                     "ê",         "ê",       "e", /* -- latin small letter e with circumflex, */
-                     "ë",          "ë",       "e", /* -- latin small letter e with diaeresis, */
-                     "ì",        "ì",       "i", /* -- latin small letter i with grave, */
-                     "í",        "í",       "i", /* -- latin small letter i with acute, */
-                     "î",         "î",       "i", /* -- latin small letter i with circumflex, */
-                     "ï",          "ï",       "i", /* -- latin small letter i with diaeresis, */
-                     "ð",           "ð",       "eth", /* -- latin small letter eth, U+00F0 ISOlat1 --> */
-                     "ñ",        "ñ",       "n", /* -- latin small letter n with tilde, */
-                     "ò",        "ò",       "o", /* -- latin small letter o with grave, */
-                     "ó",        "ó",       "o", /* -- latin small letter o with acute, */
-                     "ô",         "ô",       "o", /* -- latin small letter o with circumflex, */
-                     "õ",        "õ",       "o", /* -- latin small letter o with tilde, */
-                     "ö",          "ö",       "o", /* -- latin small letter o with diaeresis, */
-                     "÷",        "÷",       "/", /* -- division sign, U+00F7 ISOnum --> */
-                     "ø",        "ø",       "o", /* -- latin small letter o with stroke, */
-                     "ù",        "ù",       "u", /* -- latin small letter u with grave, */
-                     "ú",        "ú",       "u", /* -- latin small letter u with acute, */
-                     "û",         "û",       "u", /* -- latin small letter u with circumflex, */
-                     "ü",          "ü",       "u", /* -- latin small letter u with diaeresis, */
-                     "ý",        "ý",       "y", /* -- latin small letter y with acute, */
-                     "þ",         "þ",       "th", /* -- latin small letter thorn, */
-                     "ÿ",          "ÿ",       "y", /* -- latin small letter y with diaeresis, */
-                      "", "" };
-                    
-/* ---- list of special characters ---- */
-#define CHAR_SPACE        32
-#define CHAR_TAB           9
-#define CHAR_LF           10
-#define CHAR_CR           13
-#define CHAR_DQUOTE       34
-#define CHAR_SQUOTE       39
-#define CHAR_OPEN_SQUOTE  96
-#define CHAR_TILDE       126
-#define CHAR_ASTERISK     42
-#define CHAR_FORESLASH    47
-#define CHAR_CARAT        94
-
-#define CHAR_UNDERSCORE    '_'
-#define CHAR_OPEN_CBRACK   '{'
-#define CHAR_CLOSE_CBRACK  '}'
-#define CHAR_OPEN_RBRACK   '('
-#define CHAR_CLOSE_RBRACK  ')'
-#define CHAR_OPEN_SBRACK   '['
-#define CHAR_CLOSE_SBRACK  ']'
-
-
-
-
-
-/* ---- longest and shortest normal PG line lengths ----*/
-#define LONGEST_PG_LINE   75
-#define WAY_TOO_LONG      80
-#define SHORTEST_PG_LINE  55
-
-#define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */
-                                  /*     D - ignore DP-specific markup     */
-                                  /*     E - echo queried line             */
-                                  /*     S - check single quotes           */
-                                  /*     T - check common typos            */
-                                  /*     P - require closure of quotes on  */
-                                  /*         every paragraph               */
-                                  /*     X - "Trust no one" :-) Paranoid!  */
-                                  /*         Queries everything            */
-                                  /*     L - line end checking defaults on */
-                                  /*         -L turns it off               */
-                                  /*     O - overview. Just shows counts.  */
-                                  /*     Y - puts errors to stdout         */
-                                  /*         instead of stderr             */
-                                  /*     H - Echoes header fields          */
-                                  /*     M - Ignore markup in < >          */
-                                  /*     U - Use file of User-defined Typos*/
-                                  /*     W - Defaults for use on Web upload*/
-                                  /*     V - Verbose - list EVERYTHING!    */
-#define SWITNO 14                 /* max number of switch parms            */
-                                  /*        - used for defining array-size */
-#define MINARGS   1               /* minimum no of args excl switches      */
-#define MAXARGS   1               /* maximum no of args excl switches      */
-
-int pswit[SWITNO];                /* program switches set by SWITCHES      */
-
-#define ECHO_SWITCH      0
-#define SQUOTE_SWITCH    1
-#define TYPO_SWITCH      2
-#define QPARA_SWITCH     3
-#define PARANOID_SWITCH  4
-#define LINE_END_SWITCH  5
-#define OVERVIEW_SWITCH  6
-#define STDOUT_SWITCH    7
-#define HEADER_SWITCH    8
-#define WEB_SWITCH       9
-#define VERBOSE_SWITCH   10
-#define MARKUP_SWITCH    11
-#define USERTYPO_SWITCH  12
-#define DP_SWITCH        13
-
-
-
-long cnt_dquot;       /* for overview mode, count of doublequote queries */
-long cnt_squot;       /* for overview mode, count of singlequote queries */
-long cnt_brack;       /* for overview mode, count of brackets queries */
-long cnt_bin;         /* for overview mode, count of non-ASCII queries */
-long cnt_odd;         /* for overview mode, count of odd character queries */
-long cnt_long;        /* for overview mode, count of long line errors */
-long cnt_short;       /* for overview mode, count of short line queries */
-long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */
-long cnt_dash;        /* for overview mode, count of dash-related queries */
-long cnt_word;        /* for overview mode, count of word queries */
-long cnt_html;        /* for overview mode, count of html queries */
-long cnt_lineend;     /* for overview mode, count of line-end queries */
-long cnt_spacend;     /* count of lines with space at end  V .21 */
-long linecnt;         /* count of total lines in the file */
-long checked_linecnt; /* count of lines actually gutchecked V .26 */
-
-void proghelp(void);
-void procfile(char *);
-
-#define LOW_THRESHOLD    0
-#define HIGH_THRESHOLD   1
-
-#define START 0
-#define END 1
-#define PREV 0
-#define NEXT 1
-#define FIRST_OF_PAIR 0
-#define SECOND_OF_PAIR 1
-
-#define MAX_WORDPAIR 1000
-
-char running_from[MAX_PATH];
-
-int mixdigit(char *);
-char *getaword(char *, char *);
-int matchword(char *, char *);
-char *flgets(char *, int, FILE *, long);
-void lowerit(char *);
-int gcisalpha(unsigned char);
-int gcisdigit(unsigned char);
-int gcisletter(unsigned char);
-char *gcstrchr(char *s, char c);
-void postprocess_for_HTML(char *);
-char *linehasmarkup(char *);
-char *losemarkup(char *);
-int tagcomp(char *, char *);
-char *loseentities(char *);
-int isroman(char *);
-int usertypo_count;
-void postprocess_for_DP(char *);
-
-char wrk[LINEBUFSIZE];
-
-/* This is disgustingly lazy, predefining max words & lengths,   */
-/* but now I'm out of 16-bit restrictions, what's a couple of K? */
-#define MAX_QWORD           50
-#define MAX_QWORD_LENGTH    40
-char qword[MAX_QWORD][MAX_QWORD_LENGTH];
-char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
-signed int dupcnt[MAX_QWORD];
-
-
-
-
-int main(int argc, char **argv)
-{
-    char *argsw, *s;
-    int i, switno, invarg;
-    char usertypo_file[MAX_PATH];
-    FILE *usertypofile;
-
-
-    if (strlen(argv[0]) < sizeof(running_from))
-        strcpy(running_from, argv[0]);  /* save the path to the executable gutcheck */
-
-    /* find out what directory we're running from */
-    for (s = running_from + strlen(running_from); *s != '/' && *s != '\\' && s >= running_from; s--)
-        *s = 0;
-
-
-    switno = strlen(SWITCHES);
-    for (i = switno ; --i >0 ; )
-        pswit[i] = 0;           /* initialise switches */
-
-    /* Standard loop to extract switches.                   */
-    /* When we come out of this loop, the arguments will be */
-    /* in argv[0] upwards and the switches used will be     */
-    /* represented by their equivalent elements in pswit[]  */
-    while ( --argc > 0 && **++argv == '-')
-        for (argsw = argv[0]+1; *argsw !='\0'; argsw++)
-            for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; )
-                if ((toupper(*argsw)) == SWITCHES[i] ) {
-                    invarg = 0;
-                    pswit[i] = 1;
-                    }
-
-    pswit[PARANOID_SWITCH] ^= 1;         /* Paranoid checking is turned OFF, not on, by its switch */
-
-    if (pswit[PARANOID_SWITCH]) {                         /* if running in paranoid mode */
-        pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1;      /* force typo checks as well   */
-        }                                                 /* v.20 removed s and p switches from paranoid mode */
-
-    pswit[LINE_END_SWITCH] ^= 1;         /* Line-end checking is turned OFF, not on, by its switch */
-    pswit[ECHO_SWITCH] ^= 1;             /* V.21 Echoing is turned OFF, not on, by its switch      */
-
-    if (pswit[OVERVIEW_SWITCH])       /* just print summary; don't echo */
-        pswit[ECHO_SWITCH] = 0;
-
-    /* Web uploads - for the moment, this is really just a placeholder     */
-    /* until we decide what processing we really want to do on web uploads */
-    if (pswit[WEB_SWITCH]) {          /* specific override for web uploads */
-        pswit[ECHO_SWITCH] =     1;
-        pswit[SQUOTE_SWITCH] =   0;
-        pswit[TYPO_SWITCH] =     1;
-        pswit[QPARA_SWITCH] =    0;
-        pswit[PARANOID_SWITCH] = 1;
-        pswit[LINE_END_SWITCH] = 0;
-        pswit[OVERVIEW_SWITCH] = 0;
-        pswit[STDOUT_SWITCH] =   0;
-        pswit[HEADER_SWITCH] =   1;
-        pswit[VERBOSE_SWITCH] =  0;
-        pswit[MARKUP_SWITCH] =   0;
-        pswit[USERTYPO_SWITCH] = 0;
-        pswit[DP_SWITCH] = 0;
-        }
-
-
-    if (argc < MINARGS || argc > MAXARGS) {  /* check number of args */
-        proghelp();
-        return(1);            /* exit */
-        }
-
-
-    /* read in the user-defined stealth scanno list */
-
-    if (pswit[USERTYPO_SWITCH]) {                    /* ... we were told we had one! */
-        if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) {   /* not in cwd. try gutcheck directory. */
-            strcpy(usertypo_file, running_from);
-            strcat(usertypo_file, USERTYPO_FILE);
-            if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) {  /* we ain't got no user typo file! */
-                printf("   --> I couldn't find gutcheck.typ -- proceeding without user typos.\n");
-                }
-            }
-
-        usertypo_count = 0;
-        if (usertypofile) {  /* we managed to open a User Typo File! */
-            if (pswit[USERTYPO_SWITCH]) {
-                while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) {
-                    if (strlen(aline) > 1) {
-                        if ((int)*aline > 33) {
-                            s = malloc(strlen(aline)+1);
-                            if (!s) {
-                                fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n");
-                                exit(1);
-                                }
-                            strcpy(s, aline);
-                            usertypo[usertypo_count] = s;
-                            usertypo_count++;
-                            if (usertypo_count >= MAX_USER_TYPOS) {
-                                printf("   --> Only %d user-defined typos allowed: ignoring the rest\n");
-                                break;
-                                }
-                            }
-                        }
-                    }
-                }
-            fclose(usertypofile);
-            }
-        }
-
-
-
-
-    fprintf(stderr, "gutcheck: Check and report on an e-text\n");
-
-    cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long =
-    cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend =
-    cnt_spacend = 0;
-
-    procfile(argv[0]);
-
-    if (pswit[OVERVIEW_SWITCH]) {
-                         printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
-                            checked_linecnt, linecnt, linecnt - checked_linecnt);
-                         printf("    --------------- Queries found --------------\n");
-        if (cnt_long)    printf("    Long lines:                             %5ld\n",cnt_long);
-        if (cnt_short)   printf("    Short lines:                            %5ld\n",cnt_short);
-        if (cnt_lineend) printf("    Line-end problems:                      %5ld\n",cnt_lineend);
-        if (cnt_word)    printf("    Common typos:                           %5ld\n",cnt_word);
-        if (cnt_dquot)   printf("    Unmatched quotes:                       %5ld\n",cnt_dquot);
-        if (cnt_squot)   printf("    Unmatched SingleQuotes:                 %5ld\n",cnt_squot);
-        if (cnt_brack)   printf("    Unmatched brackets:                     %5ld\n",cnt_brack);
-        if (cnt_bin)     printf("    Non-ASCII characters:                   %5ld\n",cnt_bin);
-        if (cnt_odd)     printf("    Proofing characters:                    %5ld\n",cnt_odd);
-        if (cnt_punct)   printf("    Punctuation & spacing queries:          %5ld\n",cnt_punct);
-        if (cnt_dash)    printf("    Non-standard dashes:                    %5ld\n",cnt_dash);
-        if (cnt_html)    printf("    Possible HTML tags:                     %5ld\n",cnt_html);
-        printf("\n");
-        printf("    TOTAL QUERIES                           %5ld\n",
-            cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long +
-            cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend);
-        }
-
-    return(0);
-}
-
-
-
-/* procfile - process one file */
-
-void procfile(char *filename)
-{
-
-    char *s, *t, *s1, laststart, *wordstart;
-    char inword[MAXWORDLEN], testword[MAXWORDLEN];
-    char parastart[81];     /* first line of current para */
-    FILE *infile;
-    long quot, squot, firstline, alphalen, totlen, binlen,
-         shortline, longline, verylongline, spacedash, emdash,
-         space_emdash, non_PG_space_emdash, PG_space_emdash,
-         footerline, dotcomma, start_para_line, astline, fslashline,
-         standalone_digit, hyphens, htmcount, endquote_count;
-    long spline, nspline;
-    signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower,
-         eNon_A, eTab, eTilde, eAst, eFSlash, eCarat;
-    signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma,
-         warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote;
-    unsigned int lastlen, lastblen;
-    signed int s_brack, c_brack, r_brack, c_unders;
-    signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar;
-    signed int isnewpara, vowel, consonant;
-    char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80],
-         unders_err[80];
-    signed int qword_index, qperiod_index, isdup;
-    signed int enddash;
-    signed int Dutchcount, isDutch, Frenchcount, isFrench;
-
-
-    
-
-
-    laststart = CHAR_SPACE;
-    lastlen = lastblen = 0;
-    *dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err =
-        *unders_err = *prevline = 0;
-    linecnt = firstline = alphalen = totlen = binlen =
-        shortline = longline = spacedash = emdash = checked_linecnt =
-        space_emdash = non_PG_space_emdash = PG_space_emdash =
-        footerline = dotcomma = start_para_line = astline = fslashline = 
-        standalone_digit = hyphens = htmcount = endquote_count = 0;
-    quot = squot = s_brack = c_brack = r_brack = c_unders = 0;
-    i = llen = isemptyline = isacro = isellipsis = istypo = 0;
-    warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma = 
-        warn_ast = warn_fslash = warn_digit = warn_endquote = 0;
-    isnewpara = vowel = consonant = enddash = 0;
-    spline = nspline = 0;
-    qword_index = qperiod_index = isdup = 0;
-    *inword = *testword = 0;
-    open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0;
-    Dutchcount = isDutch = Frenchcount = isFrench = 0;
-
-
-    for (j = 0; j < MAX_QWORD; j++) {
-        dupcnt[j] = 0;
-        for (i = 0; i < MAX_QWORD_LENGTH; i++)
-            qword[i][j] = 0;
-            qperiod[i][j] = 0;
-            }
-
-
-    if ((infile = fopen(filename, "rb")) == NULL) {
-        if (pswit[STDOUT_SWITCH])
-            fprintf(stdout, "gutcheck: cannot open %s\n", filename);
-        else
-            fprintf(stderr, "gutcheck: cannot open %s\n", filename);
-        exit(1);
-        }
-
-    fprintf(stdout, "\n\nFile: %s\n\n", filename);
-    firstline = shortline = longline = verylongline = 0;
-
-
-    /*****************************************************/
-    /*                                                   */
-    /*  Run a first pass - verify that it's a valid PG   */
-    /*  file, decide whether to report some things that  */
-    /*  occur many times in the text like long or short  */
-    /*  lines, non-standard dashes, and other good stuff */
-    /*  I'll doubtless think of later.                   */
-    /*                                                   */
-    /*****************************************************/
-
-    /*****************************************************/
-    /* V.24  Sigh. Yet Another Header Change             */
-    /*****************************************************/
-
-    while (fgets(aline, LINEBUFSIZE-1, infile)) {
-        while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0;
-        linecnt++;
-        if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) {
-            if (spline)
-                printf("   --> Duplicate header?\n");
-            spline = linecnt + 1;   /* first line of non-header text, that is */
-            }
-        if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) {
-            if (nspline)
-                printf("   --> Duplicate header?\n");
-            nspline = linecnt + 1;   /* first line of non-header text, that is */
-            }
-        if (spline || nspline) {
-            lowerit(aline);
-            if (strstr(aline, "end") && strstr(aline, "project gutenberg")) {
-                if (strstr(aline, "end") < strstr(aline, "project gutenberg")) {
-                    if (footerline) {
-                        if (!nspline) /* it's an old-form header - we can detect duplicates */
-                            printf("   --> Duplicate footer?\n");
-                        else 
-                            ;
-                        }
-                    else {
-                        footerline = linecnt;
-                        }
-                    }
-                }
-            }
-        if (spline) firstline = spline;
-        if (nspline) firstline = nspline;  /* override with new */
-
-        if (footerline) continue;    /* 0.99+ don't count the boilerplate in the footer */
-
-        llen = strlen(aline);
-        totlen += llen;
-        for (i = 0; i < llen; i++) {
-            if ((unsigned char)aline[i] > 127) binlen++;
-            if (gcisalpha(aline[i])) alphalen++;
-            if (i > 0)
-                if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1]))
-                    endquote_count++;
-            }
-        if (strlen(aline) > 2
-            && lastlen > 2 && lastlen < SHORTEST_PG_LINE
-            && lastblen > 2 && lastblen > SHORTEST_PG_LINE
-            && laststart != CHAR_SPACE)
-                shortline++;
-
-        if (*aline) /* fixed line below for 0.96 */
-            if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++;
-
-        if (strstr(aline, ".,")) dotcomma++;
-        /* 0.98 only count ast lines for ignoring purposes where there is */
-        /* locase text on the line */
-        if (strstr(aline, "*")) {
-            for (s = aline; *s; s++)
-                if (*s >='a' && *s <= 'z')
-                    break;
-             if (*s) astline++;
-             }
-        if (strstr(aline, "/"))
-            fslashline++;
-        for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
-        if (aline[i] == '-' && aline[i-1] != '-') hyphens++;
-
-        if (llen > LONGEST_PG_LINE) longline++;
-        if (llen > WAY_TOO_LONG) verylongline++;
-
-        if (strstr(aline, "<") && strstr(aline, ">")) {
-            i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
-            if (i > 0) 
-                htmcount++;
-            if (strstr(aline, "")) htmcount +=4; /* bonus marks! */
-            }
-
-        /* Check for spaced em-dashes */
-        if (strstr(aline,"--")) {
-            emdash++;
-            if (*(strstr(aline, "--")-1) == CHAR_SPACE ||
-               (*(strstr(aline, "--")+2) == CHAR_SPACE))
-                    space_emdash++;
-            if (*(strstr(aline, "--")-1) == CHAR_SPACE &&
-               (*(strstr(aline, "--")+2) == CHAR_SPACE))
-                    non_PG_space_emdash++;             /* count of em-dashes with spaces both sides */
-            if (*(strstr(aline, "--")-1) != CHAR_SPACE &&
-               (*(strstr(aline, "--")+2) != CHAR_SPACE))
-                    PG_space_emdash++;                 /* count of PG-type em-dashes with no spaces */
-            }
-
-        for (s = aline; *s;) {
-            s = getaword(s, inword);
-            if (!strcmp(inword, "hij") || !strcmp(inword, "niet")) 
-                Dutchcount++;
-            if (!strcmp(inword, "dans") || !strcmp(inword, "avec")) 
-                Frenchcount++;
-            if (!strcmp(inword, "0") || !strcmp(inword, "1")) 
-                standalone_digit++;
-            }
-
-        /* Check for spaced dashes */
-        if (strstr(aline," -"))
-            if (*(strstr(aline, " -")+2) != '-')
-                    spacedash++;
-        lastblen = lastlen;
-        lastlen = strlen(aline);
-        laststart = aline[0];
-
-        }
-    fclose(infile);
-
-
-    /* now, based on this quick view, make some snap decisions */
-    if (cnt_spacend > 0) {
-        printf("   --> %ld lines in this file have white space at end\n", cnt_spacend);
-        }
-
-    warn_dotcomma = 1;
-    if (dotcomma > 5) {
-        warn_dotcomma = 0;
-        printf("   --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma);
-        }
-
-    /* if more than 50 lines, or one-tenth, are short, don't bother reporting them */
-    warn_short = 1;
-    if (shortline > 50 || shortline * 10 > linecnt) {
-        warn_short = 0;
-        printf("   --> %ld lines in this file are short. Not reporting short lines.\n", shortline);
-        }
-
-    /* if more than 50 lines, or one-tenth, are long, don't bother reporting them */
-    warn_long = 1;
-    if (longline > 50 || longline * 10 > linecnt) {
-        warn_long = 0;
-        printf("   --> %ld lines in this file are long. Not reporting long lines.\n", longline);
-        }
-
-    /* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */
-    warn_ast = 1;
-    if (astline > 10 ) {
-        warn_ast = 0;
-        printf("   --> %ld lines in this file contain asterisks. Not reporting them.\n", astline);
-        }
-
-    /* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */
-    warn_fslash = 1;
-    if (fslashline > 10 ) {
-        warn_fslash = 0;
-        printf("   --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline);
-        }
-
-    /* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */
-    warn_endquote = 1;
-    if (endquote_count > 20 ) {
-        warn_endquote = 0;
-        printf("   --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count);
-        }
-
-    /* if more than 15 lines contain standalone digits, don't bother reporting them V.0.97 */
-    warn_digit = 1;
-    if (standalone_digit > 10 ) {
-        warn_digit = 0;
-        printf("   --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit);
-        }
-
-    /* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */
-    warn_hyphen = 1;
-    if (hyphens > 20 ) {
-        warn_hyphen = 0;
-        printf("   --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens);
-        }
-
-    if (htmcount > 20 && !pswit[MARKUP_SWITCH]) {
-        printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
-        pswit[MARKUP_SWITCH] = 1;
-        }
-        
-    if (verylongline > 0) {
-        printf("   --> %ld lines in this file are VERY long!\n", verylongline);
-        }
-
-    /* If there are more non-PG spaced dashes than PG em-dashes,    */
-    /* assume it's deliberate                                       */
-    /* Current PG guidelines say don't use them, but older texts do,*/
-    /* and some people insist on them whatever the guidelines say.  */
-    /* V.20 removed requirement that PG_space_emdash be greater than*/
-    /* ten before turning off warnings about spaced dashes.         */
-    warn_dash = 1;
-    if (spacedash + non_PG_space_emdash > PG_space_emdash) {
-        warn_dash = 0;
-        printf("   --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash);
-        }
-
-    /* if more than a quarter of characters are hi-bit, bug out */
-    warn_bin = 1;
-    if (binlen * 4 > totlen) {
-        printf("   --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n");
-        exit(1);
-        }
-    if (alphalen * 4 < totlen) {
-        printf("   --> This file does not appear to be text. Terminating. Best of luck with it!\n");
-        exit(1);
-        }
-    if ((binlen * 100 > totlen) || (binlen > 100)) {
-        printf("   --> There are a lot of foreign letters here. Not reporting them.\n");
-        warn_bin = 0;
-        }
-
-    /* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */
-    isDutch = 0;
-    if (Dutchcount > 50) {
-        isDutch = 1;
-        printf("   --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n");
-        }
-
-    isFrench = 0;
-    if (Frenchcount > 50) {
-        isFrench = 1;
-        printf("   --> This looks like French - switching off some doublepunct.\n");
-        }
-
-    if (firstline && footerline)
-        printf("    The PG header and footer appear to be already on.\n");
-    else {
-        if (firstline)
-            printf("    The PG header is on - no footer.\n");
-        if (footerline)
-            printf("    The PG footer is on - no header.\n");
-        }
-    printf("\n");
-
-    /* V.22 George Davis asked for an override switch to force it to list everything */
-    if (pswit[VERBOSE_SWITCH]) {
-        warn_bin = 1;
-        warn_short = 1;
-        warn_dotcomma = 1;
-        warn_long = 1;
-        warn_dash = 1;
-        warn_digit = 1;
-        warn_ast = 1;
-        warn_fslash = 1;
-        warn_hyphen = 1;
-        warn_endquote = 1;
-        printf("   *** Verbose output is ON -- you asked for it! ***\n");
-        }
-
-    if (isDutch)
-        warn_dash = 0;  /* Frank suggested turning it REALLY off for Dutch */
-
-    if ((infile = fopen(filename, "rb")) == NULL) {
-        if (pswit[STDOUT_SWITCH])
-            fprintf(stdout, "gutcheck: cannot open %s\n", filename);
-        else
-            fprintf(stderr, "gutcheck: cannot open %s\n", filename);
-        exit(1);
-        }
-
-    if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */
-        printf("   --> I don't really know where this text starts. \n");
-        printf("       There are no reference points.\n");
-        printf("       I'm going to have to report the header and footer as well.\n");
-        firstline=0;
-        }
-        
-
-
-    /*****************************************************/
-    /*                                                   */
-    /* Here we go with the main pass. Hold onto yer hat! */
-    /*                                                   */
-    /*****************************************************/
-
-    /* Re-init some variables we've dirtied */
-    quot = squot = linecnt = 0;
-    laststart = CHAR_SPACE;
-    lastlen = lastblen = 0;
-
-    while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) {
-        linecnt++;
-        if (linecnt == 1) isnewpara = 1;
-        if (pswit[DP_SWITCH])
-            if (!strncmp(aline, "-----File: ", 11))
-                continue;    // skip DP page separators completely
-        if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) {
-            if (pswit[HEADER_SWITCH]) {
-                if (!strncmp(aline, "Title:", 6))
-                    printf("    %s\n", aline);
-                if (!strncmp (aline, "Author:", 7))
-                    printf("    %s\n", aline);
-                if (!strncmp(aline, "Release Date:", 13))
-                    printf("    %s\n", aline);
-                if (!strncmp(aline, "Edition:", 8))
-                    printf("    %s\n\n", aline);
-                }
-            continue;                /* skip through the header */
-            }
-        checked_linecnt++;
-        s = aline;
-        isemptyline = 1;      /* assume the line is empty until proven otherwise */
-
-        /* If we are in a state of unbalanced quotes, and this line    */
-        /* doesn't begin with a quote, output the stored error message */
-        /* If the -P switch was used, print the warning even if the    */
-        /* new para starts with quotes                                 */
-        /* Version .20 - if the new paragraph does start with a quote, */
-        /* but is indented, I was giving a spurious error. Need to     */
-        /* check the first _non-space_ character on the line rather    */
-        /* than the first character when deciding whether the para     */
-        /* starts with a quote. Using *t for this.                     */
-        t = s;
-        while (*t == ' ') t++;
-        if (*dquote_err)
-            if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) {
-                if (!pswit[OVERVIEW_SWITCH]) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
-                    printf(dquote_err);
-                    }
-                else
-                    cnt_dquot++;
-            }
-        if (*squote_err) {
-            if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) {
-                if (!pswit[OVERVIEW_SWITCH]) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
-                    printf(squote_err);
-                    }
-                else
-                    cnt_squot++;
-                }
-            squot = 0;
-            }
-        if (*rbrack_err) {
-            if (!pswit[OVERVIEW_SWITCH]) {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
-                printf(rbrack_err);
-                }
-            else
-                cnt_brack++;
-            }
-        if (*sbrack_err) {
-            if (!pswit[OVERVIEW_SWITCH]) {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
-                printf(sbrack_err);
-                }
-            else
-                cnt_brack++;
-            }
-        if (*cbrack_err) {
-            if (!pswit[OVERVIEW_SWITCH]) {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
-                printf(cbrack_err);
-                }
-            else
-                cnt_brack++;
-            }
-        if (*unders_err) {
-            if (!pswit[OVERVIEW_SWITCH]) {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
-                printf(unders_err);
-                }
-            else
-                cnt_brack++;
-            }
-
-        *dquote_err = *squote_err = *rbrack_err = *cbrack_err = 
-            *sbrack_err = *unders_err = 0;
-
-
-        /* look along the line, accumulate the count of quotes, and see */
-        /* if this is an empty line - i.e. a line with nothing on it    */
-        /* but spaces.                                                  */
-        /* V .12 also if line has just spaces, * and/or - on it, don't  */
-        /* count it, since empty lines with asterisks or dashes to      */
-        /* separate sections are common.                                */
-        /* V .15 new single-quote checking - has to be better than the  */
-        /* previous version, but how much better? fingers crossed!      */
-        /* V .20 add period to * and - as characters on a separator line*/
-        s = aline;
-        while (*s) {
-            if (*s == CHAR_DQUOTE) quot++;
-            if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
-                if (s == aline) { /* at start of line, it can only be an openquote */
-                    if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */
-                        open_single_quote++;
-                    }
-                else
-                    if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
-                        ; /* do nothing! - it's definitely an apostrophe, not a quote */
-                    else        /* it's outside a word - let's check it out */
-                        if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */
-                            if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */
-                                open_single_quote++;
-                            }
-                        else { /* now - is it a closequote? */
-                            guessquote = 0;   /* accumulate clues */
-                            if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */
-                                guessquote += 1;
-                                if (*(s-1) == 's') { /* looks like a plural apostrophe */
-                                    guessquote -= 3;
-                                    if (*(s+1) == CHAR_SPACE)  /* bonus marks! */
-                                        guessquote -= 2;
-                                    }
-                                }
-                            else /* it doesn't have a letter either side */
-                                if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1))))
-                                    guessquote += 8; /* looks like a closequote */
-                                else
-                                    guessquote += 1;
-                            if (open_single_quote > close_single_quote)
-                                guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */
-                            else
-                                guessquote -= 1;
-                            if (guessquote >= 0)
-                                close_single_quote++;
-                            }
-
-            if (*s != CHAR_SPACE
-                && *s != '-'
-                && *s != '.'
-                && *s != CHAR_ASTERISK
-                && *s != 13
-                && *s != 10) isemptyline = 0;  /* ignore lines like  *  *  *  as spacers */
-            if (*s == CHAR_UNDERSCORE) c_unders++;
-            if (*s == CHAR_OPEN_CBRACK) c_brack++;
-            if (*s == CHAR_CLOSE_CBRACK) c_brack--;
-            if (*s == CHAR_OPEN_RBRACK) r_brack++;
-            if (*s == CHAR_CLOSE_RBRACK) r_brack--;
-            if (*s == CHAR_OPEN_SBRACK) s_brack++;
-            if (*s == CHAR_CLOSE_SBRACK) s_brack--;
-            s++;
-            }
-
-        if (isnewpara && !isemptyline) {   /* This line is the start of a new paragraph */
-            start_para_line = linecnt;
-            strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */
-            parastart[79] = 0;
-            dquotepar = squotepar = 0; /* restart the quote count 0.98 */
-            s = aline;
-            while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++;    /* V.97 fixed bug - overran line and gave false warning - rare */
-            if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1);
-                else
-                    cnt_punct++;
-                }
-            isnewpara = 0; /* Signal the end of new para processing */
-            }
-
-        /* Check for an em-dash broken at line end */
-        if (enddash && *aline == '-') {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column 1 - Broken em-dash?\n", linecnt);
-            else
-                cnt_punct++;
-            }
-        enddash = 0;
-        for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--);
-        if (s >= aline && *s == '-')
-            enddash = 1;
-            
-
-        /* Check for invalid or questionable characters in the line */
-        /* Anything above 127 is invalid for plain ASCII,  and      */
-        /* non-printable control characters should also be flagged. */
-        /* Tabs should generally not be there.                      */
-        /* Jan 06, in 0.99: Hm. For some strange reason, I either   */
-        /* never created or deleted the check for unprintable       */
-        /* control characters. They should be reported even if      */
-        /* warn_bin is on, I think, and in full.                    */
-
-        for (s = aline; *s; s++) {
-            i = (unsigned char) *s;
-            if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i);
-                else
-                    cnt_bin++;
-                }
-            }
-
-        if (warn_bin) {
-            eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0;  /* don't repeat multiple warnings on one line */
-            for (s = aline; *s; s++) {
-                if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) {
-                    i = *s;                           /* annoying kludge for signed chars */
-                    if (i < 0) i += 256;
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        if (i > 127 && i < 160)
-                            printf("    Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i);
-                        else
-                            printf("    Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i);
-                    else
-                        cnt_bin++;
-                    eNon_A = 1;
-                    }
-                if (!eTab && *s == CHAR_TAB) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1);
-                    else
-                        cnt_odd++;
-                    eTab = 1;
-                    }
-                if (!eTilde && *s == CHAR_TILDE) {  /* often used by OCR software to indicate an unrecognizable character */
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1);
-                    else
-                        cnt_odd++;
-                    eTilde = 1;
-                    }
-                if (!eCarat && *s == CHAR_CARAT) {  
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1);
-                    else
-                        cnt_odd++;
-                    eCarat = 1;
-                    }
-                if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) {  
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1);
-                    else
-                        cnt_odd++;
-                    eFSlash = 1;
-                    }
-                /* report asterisks only in paranoid mode, since they're often deliberate */
-                if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1);
-                    else
-                        cnt_odd++;
-                    eAst = 1;
-                    }
-                }
-            }
-
-        /* Check for line too long */
-        if (warn_long) {
-            if (strlen(aline) > LONGEST_PG_LINE) {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline));
-                else
-                    cnt_long++;
-                }
-            }
-
-        /* Check for line too short.                                     */
-        /* This one is a bit trickier to implement: we don't want to     */
-        /* flag the last line of a paragraph for being short, so we      */
-        /* have to wait until we know that our current line is a         */
-        /* "normal" line, then report the _previous_ line if it was too  */
-        /* short. We also don't want to report indented lines like       */
-        /* chapter heads or formatted quotations. We therefore keep      */
-        /* lastlen as the length of the last line examined, and          */
-        /* lastblen as the length of the last but one, and try to        */
-        /* suppress unnecessary warnings by checking that both were of   */
-        /* "normal" length. We keep the first character of the last      */
-        /* line in laststart, and if it was a space, we assume that the  */
-        /* formatting is deliberate. I can't figure out a way to         */
-        /* distinguish something like a quoted verse left-aligned or     */
-        /* the header or footer of a letter from a paragraph of short    */
-        /* lines - maybe if I examined the whole paragraph, and if the   */
-        /* para has less than, say, 8 lines and if all lines are short,  */
-        /* then just assume it's OK? Need to look at some texts to see   */
-        /* how often a formula like this would get the right result.     */
-        /* V0.99 changed the tolerance for length to ignore from 2 to 1  */
-        if (warn_short) {
-            if (strlen(aline) > 1
-                && lastlen > 1 && lastlen < SHORTEST_PG_LINE
-                && lastblen > 1 && lastblen > SHORTEST_PG_LINE
-                && laststart != CHAR_SPACE) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline));
-                    else
-                        cnt_short++;
-                    }
-            }
-        lastblen = lastlen;
-        lastlen = strlen(aline);
-        laststart = aline[0];
-
-        /* look for punctuation at start of line */
-        if  (*aline && strchr(".?!,;:",  aline[0]))  {            /* if it's punctuation */
-            if (strncmp(". . .", aline, 5)) {   /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column 1 - Begins with punctuation?\n", linecnt);
-                else
-                    cnt_punct++;
-                }
-            }
-
-        /* Check for spaced em-dashes                            */
-        /* V.20 must check _all_ occurrences of "--" on the line */
-        /* hence the loop - even if the first double-dash is OK  */
-        /* there may be another that's wrong later on.           */
-        if (warn_dash) {
-            s = aline;
-            while (strstr(s,"--")) {
-                if (*(strstr(s, "--")-1) == CHAR_SPACE ||
-                   (*(strstr(s, "--")+2) == CHAR_SPACE)) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1);
-                    else
-                        cnt_dash++;
-                    }
-                s = strstr(s,"--") + 2;
-                }
-            }
-
-        /* Check for spaced dashes */
-        if (warn_dash)
-            if (strstr(aline," -")) {
-                if (*(strstr(aline, " -")+2) != '-') {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1);
-                    else
-                        cnt_dash++;
-                    }
-                }
-            else
-                if (strstr(aline,"- ")) {
-                    if (*(strstr(aline, "- ")-1) != '-') {
-                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                        if (!pswit[OVERVIEW_SWITCH])
-                            printf("    Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1);
-                        else
-                            cnt_dash++;
-                        }
-                    }
-
-        /* v 0.99                                                       */
-        /* Check for unmarked paragraphs indicated by separate speakers */
-        /* May well be false positive:                                  */
-        /* "Bravo!" "Wonderful!" called the crowd.                      */
-        /* but useful all the same.                                     */
-        s = wrk;
-        *s = 0;
-        if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
-        if (strstr(aline, "\"  \"")) s = strstr(aline, "\"  \"");
-        if (*s) {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1);
-            else
-                cnt_punct++;
-            }
-
-
-
-        /* Check for "to he" and other easy he/be errors          */
-        /* This is a very inadequate effort on the he/be problem, */
-        /* but the phrase "to he" is always an error, whereas "to */
-        /* be" is quite common. I chuckle when it does catch one! */
-        /* Similarly, '"Quiet!", be said.' is a non-be error      */
-        /* V .18 - "to he" is _not_ always an error!:             */
-        /*           "Where they went to he couldn't say."        */
-        /* but I'm leaving it in anyway.                          */
-        /* V .20 Another false positive:                          */
-        /*       What would "Cinderella" be without the . . .     */
-        /* and another "If he wants to he can see for himself."   */
-        /* V .21 Added " is be " and " be is " and " be was "     */
-        /* V .99 Added jeebies code -- removed again.             */
-        /*       Is jeebies code worth adding? Rare to see he/be  */
-        /*       errors with modern OCR. Separate program? Yes!   */
-        /*       jeebies does the job without cluttering up this. */
-        /*       We do get a few more queryable pairs from the    */
-        /*       project though -- they're cheap to implement.    */
-        /*       Also added a column number for guiguts.          */
-
-        s = wrk;
-        *s = 0;
-        if (strstr(aline," to he ")) s = strstr(aline," to he ");
-        if (strstr(aline,"\" be ")) s = strstr(aline,"\" be ");
-        if (strstr(aline,"\", be ")) s = strstr(aline,"\", be ");
-        if (strstr(aline," is be ")) s = strstr(aline," is be ");
-        if (strstr(aline," be is ")) s = strstr(aline," be is ");
-        if (strstr(aline," was be ")) s = strstr(aline," was be ");
-        if (strstr(aline," be would ")) s = strstr(aline," be would ");
-        if (strstr(aline," be could ")) s = strstr(aline," be could ");
-        if (*s) {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1);
-            else
-                cnt_word++;
-            }
-
-        s = wrk;
-        *s = 0;
-        if (strstr(aline," i bad ")) s = strstr(aline," i bad ");
-        if (strstr(aline," you bad ")) s = strstr(aline," you bad ");
-        if (strstr(aline," he bad ")) s = strstr(aline," he bad ");
-        if (strstr(aline," she bad ")) s = strstr(aline," she bad ");
-        if (strstr(aline," they bad ")) s = strstr(aline," they bad ");
-        if (strstr(aline," a had ")) s = strstr(aline," a had ");
-        if (strstr(aline," the had ")) s = strstr(aline," the had ");
-        if (*s) {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1);
-            else
-                cnt_word++;
-            }
-
-
-        /* V .97 Added ", hut "  Not too common, hut pretty certain   */
-        /* V.99 changed to add a column number for guiguts            */
-        s = wrk;
-        *s = 0;
-        if (strstr(aline,", hut ")) s = strstr(aline,", hut ");
-        if (strstr(aline,"; hut ")) s = strstr(aline,"; hut ");
-        if (*s) {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1);
-            else
-                cnt_word++;
-            }
-
-        /* Special case - angled bracket in front of "From" placed there by an MTA */
-        /* when sending an e-mail.  V .21                                          */
-        if (strstr(aline, ">From")) {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1);
-            else
-                cnt_punct++;
-            }
-
-        /* V 0.98 Check for a single character line - often an overflow from bad wrapping. */
-        if (*aline && !*(aline+1)) {
-            if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline))
-                ; /* nothing - ignore numerals alone on a line. */
-            else {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column 1 - Query single character line\n", linecnt);
-                else
-                    cnt_punct++;
-                }
-            }
-
-        /* V 0.98 Check for I" - often should be ! */
-        if (strstr(aline, " I\"")) {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline);
-            else
-                cnt_punct++;
-            }
-
-        /* V 0.98 Check for period without a capital letter. Cut-down from gutspell */
-        /*        Only works when it happens on a single line.                      */
-
-        if (pswit[PARANOID_SWITCH])
-            for (t = s = aline; strstr(t,". ");) {
-                t = strstr(t, ". ");
-                if (t == s)  {
-                    t++;
-                    continue; /* start of line punctuation is handled elsewhere */
-                    }
-                if (!gcisalpha(*(t-1))) {
-                    t++;
-                    continue;
-                    }
-                if (isDutch) {  /* For Frank & Jeroen -- 's Middags case */
-                    if (*(t+2) == CHAR_SQUOTE &&
-                      *(t+3)>='a' && *(t+3)<='z' &&
-                      *(t+4) == CHAR_SPACE &&
-                      *(t+5)>='A' && *(t+5)<='Z') {
-                        t++;
-                        continue;
-                        }
-                      }
-                s1 = t+2;
-                while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
-                    s1++;
-                if (*s1 >= 'a' && *s1 <= 'z') {  /* we have something to investigate */
-                    istypo = 1;
-                    for (s1 = t - 1; s1 >= s && 
-                        (gcisalpha(*s1) || gcisdigit(*s1) || 
-                        (*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */
-                    s1++;
-                    for (i = 0; *s1 && *s1 != '.'; s1++, i++)
-                        testword[i] = *s1;
-                    testword[i] = 0;
-                    for (i = 0; *abbrev[i]; i++)
-                        if (!strcmp(testword, abbrev[i]))
-                            istypo = 0;
-//                    if (*testword >= 'A' && *testword <= 'Z') 
-//                        istypo = 0;
-                    if (gcisdigit(*testword)) istypo = 0;
-                    if (!*(testword+1)) istypo = 0;
-                    if (isroman(testword)) istypo = 0;
-                    if (istypo) {
-                        istypo = 0;
-                        for (i = 0; testword[i]; i++)
-                            if (strchr(vowels, testword[i]))
-                                istypo = 1;
-                        }
-                    if (istypo) {
-                        isdup = 0;
-                        if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
-                            for (i = 0; i < qperiod_index; i++)
-                                if (!strcmp(testword, qperiod[i])) {
-                                    isdup = 1;
-                                    }
-                        if (!isdup) {
-                            if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
-                                strcpy(qperiod[qperiod_index], testword);
-                                qperiod_index++;
-                                }
-                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                            if (!pswit[OVERVIEW_SWITCH])
-                                printf("    Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1);
-                            else
-                                cnt_punct++;
-                            }
-                        }
-                    }
-                t++;
-                }
-
-
-        if (pswit[TYPO_SWITCH]) {    /* Should have put this condition in at the start of 0.99. Duh! */
-            /* Check for words usually not followed by punctuation 0.99 */
-            for (s = aline; *s;) {
-                wordstart = s;
-                s = getaword(s, inword);
-                if (!*inword) continue;
-                lowerit(inword);
-                for (i = 0; *nocomma[i]; i++)
-                    if (!strcmp(inword, nocomma[i])) {
-                        if (*s == ',' || *s == ';' || *s == ':') {
-                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                            if (!pswit[OVERVIEW_SWITCH])
-                                printf("    Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
-                            else
-                                cnt_punct++;
-                            }
-                        }
-                for (i = 0; *noperiod[i]; i++)
-                    if (!strcmp(inword, noperiod[i])) {
-                        if (*s == '.' || *s == '!') {
-                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                            if (!pswit[OVERVIEW_SWITCH])
-                                printf("    Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
-                            else
-                                cnt_punct++;
-                            }
-                        }
-                }
-            }
-
-
-
-        /* Check for commonly mistyped words, and digits like 0 for O in a word */
-        for (s = aline; *s;) {
-            wordstart = s;
-            s = getaword(s, inword);
-            if (!*inword) continue; /* don't bother with empty lines */
-            if (mixdigit(inword)) {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword);
-                else
-                    cnt_word++;
-                }
-
-            /* put the word through a series of tests for likely typos and OCR errors */
-            /* V.21 I had allowed lots of typo-checking even with the typo switch     */
-            /* turned off, but I really should disallow reporting of them when        */
-            /* the switch is off. Hence the "if" below.                               */
-            if (pswit[TYPO_SWITCH]) {
-                istypo = 0;
-                strcpy(testword, inword);
-                alower = 0;
-                for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */
-                    if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1;
-                    if (alower && testword[i] >= 'A' && testword[i] <= 'Z') {
-                        /* we have an uppercase mid-word. However, there are common cases: */
-                        /*   Mac and Mc like McGill                                        */
-                        /*   French contractions like l'Abbe                               */
-                        if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') ||
-                            (i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') ||
-                            (i > 0 && testword[i-1] == CHAR_SQUOTE))
-                                ; /* do nothing! */
-
-                        else {  /* V.97 - remove separate case of uppercase within word so that         */
-                                /* names like VanAllen fall into qword_index and get reported only once */
-                            istypo = 1;
-                            }
-                        }
-                    testword[i] = (char)tolower(testword[i]);
-                    }
-
-                /* check for certain unlikely two-letter combinations at word start and end */
-                /* V.0.97 - this replaces individual hardcoded checks in previous versions */
-                if (strlen(testword) > 1) {
-                    for (i = 0; *nostart[i]; i++)
-                        if (!strncmp(testword, nostart[i], 2))
-                            istypo = 1;
-                    for (i = 0; *noend[i]; i++)
-                        if (!strncmp(testword + strlen(testword) -2, noend[i], 2))
-                            istypo = 1;
-                    }
-
-
-                /* ght is common, gbt never. Like that. */
-                if (strstr(testword, "cb")) istypo = 1;
-                if (strstr(testword, "gbt")) istypo = 1;
-                if (strstr(testword, "pbt")) istypo = 1;
-                if (strstr(testword, "tbs")) istypo = 1;
-                if (strstr(testword, "mrn")) istypo = 1;
-                if (strstr(testword, "ahle")) istypo = 1;
-                if (strstr(testword, "ihle")) istypo = 1;
-
-                /* "TBE" does happen - like HEARTBEAT - but uncommon.                    */
-                /*  Also "TBI" - frostbite, outbid - but uncommon.                       */
-                /*  Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals,       */
-                /*  but these are covered in V.20. "ii" is a common scanno.              */
-                if (strstr(testword, "tbi")) istypo = 1;
-                if (strstr(testword, "tbe")) istypo = 1;
-                if (strstr(testword, "ii")) istypo = 1;
-
-                /* check for no vowels or no consonants. */
-                /* If none, flag a typo                  */
-                if (!istypo && strlen(testword)>1) {
-                    vowel = consonant = 0;
-                    for (i = 0; testword[i]; i++)
-                        if (testword[i] == 'y' || gcisdigit(testword[i])) {  /* Yah, this is loose. */
-                            vowel++;
-                            consonant++;
-                            }
-                        else
-                            if  (strchr(vowels, testword[i])) vowel++;
-                            else consonant++;
-                    if (!vowel || !consonant) {
-                        istypo = 1;
-                        }
-                    }
-
-                /* now exclude the word from being reported if it's in */
-                /* the okword list                                     */
-                for (i = 0; *okword[i]; i++)
-                    if (!strcmp(testword, okword[i]))
-                        istypo = 0;
-
-                /* what looks like a typo may be a Roman numeral. Exclude these */
-                if (istypo)
-                    if (isroman(testword))
-                        istypo = 0;
-
-                /* check the manual list of typos */
-                if (!istypo)
-                    for (i = 0; *typo[i]; i++)
-                        if (!strcmp(testword, typo[i]))
-                            istypo = 1;
-
-
-                /* V.21 - check lowercase s and l - special cases */
-                /* V.98 - added "i" and "m"                       */
-                /* V.99 - added "j" often a semi-colon gone wrong */
-                /*      - and "d" for a missing apostrophe - he d */
-                /*      - and "n" for "in"                        */
-                if (!istypo && strlen(testword) == 1)
-                    if (strchr("slmijdn", *inword))
-                        istypo = 1;
-
-
-                if (istypo) {
-                    isdup = 0;
-                    if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
-                        for (i = 0; i < qword_index; i++)
-                            if (!strcmp(testword, qword[i])) {
-                                isdup = 1;
-                                ++dupcnt[i];
-                                }
-                    if (!isdup) {
-                        if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
-                            strcpy(qword[qword_index], testword);
-                            qword_index++;
-                            }
-                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                        if (!pswit[OVERVIEW_SWITCH]) {
-                            printf("    Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword);
-                            if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
-                                printf(" - not reporting duplicates");
-                            printf("\n");
-                            }
-                        else
-                            cnt_word++;
-                        }
-                    }
-                }        /* end of typo-checking */
-
-                /* check the user's list of typos */
-                if (!istypo)
-                    if (usertypo_count)
-                        for (i = 0; i < usertypo_count; i++)
-                            if (!strcmp(testword, usertypo[i])) {
-                                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                                if (!pswit[OVERVIEW_SWITCH])  
-                                    printf("    Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
-                                }
-
-
-
-            if (pswit[PARANOID_SWITCH] && warn_digit) {   /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/
-                if (!strcmp(inword, "0") || !strcmp(inword, "1")) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
-                    else
-                        cnt_word++;
-                    }
-                }
-            }
-
-        /* look for added or missing spaces around punctuation and quotes */
-        /* If there is a punctuation character like ! with no space on    */
-        /* either side, suspect a missing!space. If there are spaces on   */
-        /* both sides , assume a typo. If we see a double quote with no   */
-        /* space or punctuation on either side of it, assume unspaced     */
-        /* quotes "like"this.                                             */
-        llen = strlen(aline);
-        for (i = 1; i < llen; i++) {                               /* for each character in the line after the first */
-            if  (strchr(".?!,;:_", aline[i])) {                    /* if it's punctuation */
-                isacro = 0;                       /* we need to suppress warnings for acronyms like M.D. */
-                isellipsis = 0;                   /* we need to suppress warnings for ellipsis . . . */
-                if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) ||     /* if there are letters on both sides of it or ... */
-                   (gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */
-                    if (aline[i] == '.') {
-                        if (i > 2)
-                            if (aline[i-2] == '.') isacro = 1;
-                        if (i + 2 < llen)
-                            if (aline[i+2] == '.') isacro = 1;
-                        }
-                    if (!isacro) {
-                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                        if (!pswit[OVERVIEW_SWITCH])
-                            printf("    Line %ld column %d - Missing space?\n", linecnt, i+1);
-                        else
-                            cnt_punct++;
-                        }
-                    }
-                if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */
-                    if (aline[i] == '.') {
-                        if (i > 2)
-                            if (aline[i-2] == '.') isellipsis = 1;
-                        if (i + 2 < llen)
-                            if (aline[i+2] == '.') isellipsis = 1;
-                        }
-                    if (!isemptyline && !isellipsis) {
-                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                        if (!pswit[OVERVIEW_SWITCH])
-                            printf("    Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
-                        else
-                            cnt_punct++;
-                        }
-                    }
-                }
-            }
-
-        /* 0.98 -- split out the characters that CANNOT be preceded by space */
-        llen = strlen(aline);
-        for (i = 1; i < llen; i++) {                             /* for each character in the line after the first */
-            if  (strchr("?!,;:", aline[i])) {                    /* if it's punctuation that _cannot_ have a space before it */
-                if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
-                    else
-                        cnt_punct++;
-                    }
-                }
-            }
-
-
-        /* 0.99 -- special case " .X" where X is any alpha. */
-        /* This plugs a hole in the acronym code above. Inelegant, but maintainable. */
-        llen = strlen(aline);
-        for (i = 1; i < llen; i++) {             /* for each character in the line after the first */
-            if  (aline[i] == '.') {              /* if it's a period */
-                if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
-                    else
-                        cnt_punct++;
-                    }
-                }
-            }
-
-
-
-
-        /* v.21 breaking out the search for unspaced doublequotes        */
-        /* This is not as efficient, but it's more maintainable          */
-        /* V.97 added underscore to the list of characters not to query, */
-        /* since underscores are commonly used as italics indicators.    */
-        /* V.98 Added slash as well, same reason.                        */
-        for (i = 1; i < llen; i++) {                               /* for each character in the line after the first */
-            if (aline[i] == CHAR_DQUOTE) {
-                if ((!strchr(" _-.'`,;:!/([{?}])",  aline[i-1]) &&
-                     !strchr(" _-.'`,;:!/([{?}])",  aline[i+1]) &&
-                     aline[i+1] != 0
-                     || (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) {
-                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                        if (!pswit[OVERVIEW_SWITCH])
-                            printf("    Line %ld column %d - Unspaced quotes?\n", linecnt, i+1);
-                        else
-                            cnt_punct++;
-                        }
-                }
-            }
-
-
-        /* v.98 check parity of quotes                             */
-        /* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". */
-        for (s = aline; *s; s++) {
-            if (*s == CHAR_DQUOTE) {
-                if (!(dquotepar = !dquotepar)) {    /* parity even */
-                    if (!strchr("_-.'`/,;:!?)]} ",  *(s+1))) {
-                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                        if (!pswit[OVERVIEW_SWITCH])
-                            printf("    Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
-                        else
-                            cnt_punct++;
-                        }
-                    }
-                else {                              /* parity odd */
-                    if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$",  *(s+1)) || !*(s+1)) {
-                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                        if (!pswit[OVERVIEW_SWITCH])
-                            printf("    Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
-                        else
-                            cnt_punct++;
-                        }
-                    }
-                }
-            }
-
-            if (*aline == CHAR_DQUOTE) {
-                if (strchr(",;:!?)]} ", aline[1])) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
-                    else
-                        cnt_punct++;
-                    }
-                }
-
-        if (pswit[SQUOTE_SWITCH])
-            for (s = aline; *s; s++) {
-                if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
-                     && ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) {
-                    if (!(squotepar = !squotepar)) {    /* parity even */
-                        if (!strchr("_-.'`/\",;:!?)]} ",  *(s+1))) {
-                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                            if (!pswit[OVERVIEW_SWITCH])
-                                printf("    Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
-                            else
-                                cnt_punct++;
-                            }
-                        }
-                    else {                              /* parity odd */
-                        if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`",  *(s+1)) || !*(s+1)) {
-                            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                            if (!pswit[OVERVIEW_SWITCH])
-                                printf("    Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
-                            else
-                                cnt_punct++;
-                            }
-                        }
-                    }
-                }
-                    
-
-        /* v.20 also look for double punctuation like ,. or ,,     */
-        /* Thanks to DW for the suggestion!                        */
-        /* I'm putting this in a separate loop for clarity         */
-        /* In books with references, ".," and ".;" are common      */
-        /* e.g. "etc., etc.," and vol. 1.; vol 3.;                 */
-        /* OTOH, from my initial tests, there are also fairly      */
-        /* common errors. What to do? Make these cases paranoid?   */
-        /* V.21 ".," is the most common, so invented warn_dotcomma */
-        /* to suppress detailed reporting if it occurs often       */
-        llen = strlen(aline);
-        for (i = 0; i < llen; i++)                  /* for each character in the line */
-            if (strchr(".?!,;:", aline[i])          /* if it's punctuation */
-            && (strchr(".?!,;:", aline[i+1]))
-            && aline[i] && aline[i+1])      /* followed by punctuation, it's a query, unless . . . */
-                if (
-                  (aline[i] == aline[i+1]
-                  && (aline[i] == '.' || aline[i] == '?' || aline[i] == '!'))
-                  || (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',')
-                  || (isFrench && !strncmp(aline+i, ",...", 4))
-                  || (isFrench && !strncmp(aline+i, "...,", 4))
-                  || (isFrench && !strncmp(aline+i, ";...", 4))
-                  || (isFrench && !strncmp(aline+i, "...;", 4))
-                  || (isFrench && !strncmp(aline+i, ":...", 4))
-                  || (isFrench && !strncmp(aline+i, "...:", 4))
-                  || (isFrench && !strncmp(aline+i, "!...", 4))
-                  || (isFrench && !strncmp(aline+i, "...!", 4))
-                  || (isFrench && !strncmp(aline+i, "?...", 4))
-                  || (isFrench && !strncmp(aline+i, "...?", 4))
-                ) {
-                if ((isFrench && !strncmp(aline+i, ",...", 4))    /* could this BE any more awkward? */
-                  || (isFrench && !strncmp(aline+i, "...,", 4))
-                  || (isFrench && !strncmp(aline+i, ";...", 4))
-                  || (isFrench && !strncmp(aline+i, "...;", 4))
-                  || (isFrench && !strncmp(aline+i, ":...", 4))
-                  || (isFrench && !strncmp(aline+i, "...:", 4))
-                  || (isFrench && !strncmp(aline+i, "!...", 4))
-                  || (isFrench && !strncmp(aline+i, "...!", 4))
-                  || (isFrench && !strncmp(aline+i, "?...", 4))
-                  || (isFrench && !strncmp(aline+i, "...?", 4)))
-                    i +=4;
-                        ; /* do nothing for .. !! and ?? which can be legit */
-                    }
-                else {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Double punctuation?\n", linecnt, i+1);
-                    else
-                        cnt_punct++;
-                    }
-
-        /* v.21 breaking out the search for spaced doublequotes */
-        /* This is not as efficient, but it's more maintainable */
-        s = aline;
-        while (strstr(s," \" ")) {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1));
-            else
-                cnt_punct++;
-            s = strstr(s," \" ") + 2;
-            }
-
-        /* v.20 also look for spaced singlequotes ' and `  */
-        s = aline;
-        while (strstr(s," ' ")) {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1));
-            else
-                cnt_punct++;
-            s = strstr(s," ' ") + 2;
-            }
-
-        s = aline;
-        while (strstr(s," ` ")) {
-            if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-            if (!pswit[OVERVIEW_SWITCH])
-                printf("    Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1));
-            else
-                cnt_punct++;
-            s = strstr(s," ` ") + 2;
-            }
-
-        /* v.99 check special case of 'S instead of 's at end of word */
-        s = aline + 1;
-        while (*s) {
-            if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z')  {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2));
-                else
-                    cnt_punct++;
-                }
-            s++;
-            }
-
-
-        /* v.21 Now check special cases - start and end of line - */
-        /* for single and double quotes. Start is sometimes [sic] */
-        /* but better to query it anyway.                         */
-        /* While I'm here, check for dash at end of line          */
-        llen = strlen(aline);
-        if (llen > 1) {
-            if (aline[llen-1] == CHAR_DQUOTE ||
-                aline[llen-1] == CHAR_SQUOTE ||
-                aline[llen-1] == CHAR_OPEN_SQUOTE)
-                if (aline[llen-2] == CHAR_SPACE) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Spaced quote?\n", linecnt, llen);
-                    else
-                        cnt_punct++;
-                    }
-            
-            /* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */
-            /* Wrongspaced quotes test also catches it for "                     */
-            if (aline[0] == CHAR_SQUOTE ||
-                aline[0] == CHAR_OPEN_SQUOTE)
-                if (aline[1] == CHAR_SPACE) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column 1 - Spaced quote?\n", linecnt);
-                    else
-                        cnt_punct++;
-                    }
-            /* dash at end of line may well be legit - paranoid mode only */
-            /* and don't report em-dash at line-end                       */
-            if (pswit[PARANOID_SWITCH] && warn_hyphen) {
-                for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
-                if (aline[i] == '-' && aline[i-1] != '-') {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld column %d - Hyphen at end of line?\n", linecnt, i);
-                    }
-                }
-            }
-
-        /* v.21 also look for brackets surrounded by alpha                    */
-        /* Brackets are often unspaced, but shouldn't be surrounded by alpha. */
-        /* If so, suspect a scanno like "a]most"                              */
-        llen = strlen(aline);
-        for (i = 1; i < llen-1; i++) {           /* for each character in the line except 1st & last*/
-            if (strchr("{[()]}", aline[i])         /* if it's a bracket */
-                && gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column %d - Unspaced bracket?\n", linecnt, i);
-                else
-                    cnt_punct++;
-                }
-            }
-        /* The "Cinderella" case, back in again! :-S Give it another shot */
-        if (warn_endquote) {
-            llen = strlen(aline);
-            for (i = 1; i < llen; i++) {           /* for each character in the line except 1st */
-                if (aline[i] == CHAR_DQUOTE)
-                    if (isalpha(aline[i-1])) {
-                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                        if (!pswit[OVERVIEW_SWITCH])
-                            printf("    Line %ld column %d - endquote missing punctuation?\n", linecnt, i);
-                        else
-                            cnt_punct++;
-                        }
-                }
-            }
-
-        llen = strlen(aline);
-
-        /* Check for  */
-        /* If there is a < in the line, followed at some point  */
-        /* by a > then we suspect HTML                          */
-        if (strstr(aline, "<") && strstr(aline, ">")) {
-            i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
-            if (i > 0) {
-                strncpy(wrk, strstr(aline, "<"), i);
-                wrk[i] = 0;
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk);
-                else
-                    cnt_html++;
-                }
-            }
-
-        /* Check for &symbol; HTML                   */
-        /* If there is a & in the line, followed at  */
-        /* some point by a ; then we suspect HTML    */
-        if (strstr(aline, "&") && strstr(aline, ";")) {
-            i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1);
-            for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++)   
-                if (*s == CHAR_SPACE) i = 0;                /* 0.99 don't report "Jones & Son;" */
-            if (i > 0) {
-                strncpy(wrk, strstr(aline,"&"), i);
-                wrk[i] = 0;
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk);
-                else
-                    cnt_html++;
-                }
-            }
-
-        /* At end of paragraph, check for mismatched quotes.           */
-        /* We don't want to report an error immediately, since it is a */
-        /* common convention to omit the quotes at end of paragraph if */
-        /* the next paragraph is a continuation of the same speaker.   */
-        /* Where this is the case, the next para should begin with a   */
-        /* quote, so we store the warning message and only display it  */
-        /* at the top of the next iteration if the new para doesn't    */
-        /* start with a quote.                                         */
-        /* The -p switch overrides this default, and warns of unclosed */
-        /* quotes on _every_ paragraph, whether the next begins with a */
-        /* quote or not.                                               */
-        /* Version .16 - only report mismatched single quotes if       */
-        /* an open_single_quotes was found.                            */
-
-        if (isemptyline) {          /* end of para - add up the totals */
-            if (quot % 2)
-                sprintf(dquote_err, "    Line %ld - Mismatched quotes\n", linecnt);
-            if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) )
-                sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n", linecnt);
-            if (pswit[SQUOTE_SWITCH] && open_single_quote
-                                     && (open_single_quote != close_single_quote)
-                                     && (open_single_quote != close_single_quote +1) )
-                squot = 1;    /* flag it to be noted regardless of the first char of the next para */
-            if (r_brack)
-                sprintf(rbrack_err, "    Line %ld - Mismatched round brackets?\n", linecnt);
-            if (s_brack)
-                sprintf(sbrack_err, "    Line %ld - Mismatched square brackets?\n", linecnt);
-            if (c_brack)
-                sprintf(cbrack_err, "    Line %ld - Mismatched curly brackets?\n", linecnt);
-            if (c_unders % 2)
-                sprintf(unders_err, "    Line %ld - Mismatched underscores?\n", linecnt);
-            quot = s_brack = c_brack = r_brack = c_unders =
-                open_single_quote = close_single_quote = 0;
-            isnewpara = 1;     /* let the next iteration know that it's starting a new para */
-            }
-
-        /* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */
-        /*      by working back through prevline. DW.                      */
-        /* Hmmm. Need to check this only for "normal" paras.               */
-        /* So what is a "normal" para? ouch!                               */
-        /* Not normal if one-liner (chapter headings, etc.)                */
-        /* Not normal if doesn't contain at least one locase letter        */
-        /* Not normal if starts with space                                 */
-
-        /* 0.99 tighten up on para end checks. Disallow comma and */
-        /* semi-colon. Check for legit para end before quotes.    */
-        if (isemptyline) {          /* end of para */
-            for (s = prevline, i = 0; *s && !i; s++)
-                if (gcisletter(*s))
-                    i = 1;    /* use i to indicate the presence of a letter on the line */
-            /* This next "if" is a problem.                                             */
-            /* If I say "start_para_line <= linecnt - 1", that includes one-line        */
-            /* "paragraphs" like chapter heads. Lotsa false positives.                  */
-            /* If I say "start_para_line < linecnt - 1" it doesn't, but then it         */
-            /* misses genuine one-line paragraphs.                                      */
-            /* So what do I do? */
-            if (i
-                && lastblen > 2
-                && start_para_line < linecnt - 1
-                && *prevline > CHAR_SPACE
-                ) {
-                for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--);
-                for (  ; i > 0; i--) {
-                    if (gcisalpha(prevline[i])) {
-                        if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
-                        if (!pswit[OVERVIEW_SWITCH])
-                            printf("    Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline));
-                        else
-                            cnt_punct++;
-                        break;
-                        }
-                    if (strchr("-.:!([{?}])", prevline[i]))
-                        break;
-                    }
-                }
-            }
-        strcpy(prevline, aline);
-    }
-    fclose (infile);
-    if (!pswit[OVERVIEW_SWITCH])
-        for (i = 0; i < MAX_QWORD; i++)
-            if (dupcnt[i])
-                printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s");
-}
-
-
-
-/* flgets - get one line from the input stream, checking for   */
-/* the existence of exactly one CR/LF line-end per line.       */
-/* Returns a pointer to the line.                              */
-
-char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt)
-{
-    char c;
-    int len, isCR, cint;
-
-    *theline = 0;
-    len = isCR = 0;
-    c = cint = fgetc(thefile);
-    do {
-        if (cint == EOF)
-            return (NULL);
-        if (c == 10)  /* either way, it's end of line */
-            if (isCR)
-                break;
-            else {   /* Error - a LF without a preceding CR */
-                if (pswit[LINE_END_SWITCH]) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld - No CR?\n", lcnt);
-                    else
-                        cnt_lineend++;
-                    }
-                break;
-                }
-        if (c == 13) {
-            if (isCR) { /* Error - two successive CRs */
-                if (pswit[LINE_END_SWITCH]) {
-                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
-                    if (!pswit[OVERVIEW_SWITCH])
-                        printf("    Line %ld - Two successive CRs?\n", lcnt);
-                    else
-                        cnt_lineend++;
-                    }
-                }
-            isCR = 1;
-            }
-        else {
-            if (pswit[LINE_END_SWITCH] && isCR) {
-                if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
-                if (!pswit[OVERVIEW_SWITCH])
-                    printf("    Line %ld column %d - CR without LF?\n", lcnt, len+1);
-                else
-                    cnt_lineend++;
-                }
-             theline[len] = c;
-             len++;
-             theline[len] = 0;
-             isCR = 0;
-             }
-        c = cint = fgetc(thefile);
-    } while(len < maxlen);
-    if (pswit[MARKUP_SWITCH])  
-        postprocess_for_HTML(theline);
-    if (pswit[DP_SWITCH])  
-        postprocess_for_DP(theline);
-    return(theline);
-}
-
-
-
-
-/* mixdigit - takes a "word" as a parameter, and checks whether it   */
-/* contains a mixture of alpha and digits. Generally, this is an     */
-/* error, but may not be for cases like 4th or L5 12s. 3d.           */
-/* Returns 0 if no error found, 1 if error.                          */
-
-int mixdigit(char *checkword)   /* check for digits like 1 or 0 in words */
-{
-    int wehaveadigit, wehavealetter, firstdigits, query, wl;
-    char *s;
-
-
-    wehaveadigit = wehavealetter = query = 0;
-    for (s = checkword; *s; s++)
-        if (gcisalpha(*s))
-            wehavealetter = 1;
-        else
-            if (gcisdigit(*s))
-                wehaveadigit = 1;
-    if (wehaveadigit && wehavealetter) {         /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
-        query = 1;
-        wl = strlen(checkword);
-        for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++)
-            ;
-        /* digits, ending in st, rd, nd, th of either case */
-        /* 0.99 donovan points out an error below. Turns out */
-        /*      I was using matchword like strcmp when the   */
-        /*      return values are different! Duh.            */
-        if (firstdigits + 2 == wl &&
-              (matchword(checkword + wl - 2, "st")
-            || matchword(checkword + wl - 2, "rd")
-            || matchword(checkword + wl - 2, "nd")
-            || matchword(checkword + wl - 2, "th"))
-            )
-                query = 0;
-        if (firstdigits + 3 == wl &&
-              (matchword(checkword + wl - 3, "sts")
-            || matchword(checkword + wl - 3, "rds")
-            || matchword(checkword + wl - 3, "nds")
-            || matchword(checkword + wl - 3, "ths"))
-            )
-                query = 0;
-        if (firstdigits + 3 == wl &&
-              (matchword(checkword + wl - 4, "stly")
-            || matchword(checkword + wl - 4, "rdly")
-            || matchword(checkword + wl - 4, "ndly")
-            || matchword(checkword + wl - 4, "thly"))
-            )
-                query = 0;
-
-        /* digits, ending in l, L, s or d */
-        if (firstdigits + 1 == wl &&
-            (checkword[wl-1] == 'l'
-            || checkword[wl-1] == 'L'
-            || checkword[wl-1] == 's'
-            || checkword[wl-1] == 'd'))
-                query = 0;
-        /* L at the start of a number, representing Britsh pounds, like L500  */
-        /* This is cute. We know the current word is mixeddigit. If the first */
-        /* letter is L, there must be at least one digit following. If both   */
-        /* digits and letters follow, we have a genuine error, else we have a */
-        /* capital L followed by digits, and we accept that as a non-error.   */
-        if (checkword[0] == 'L')
-            if (!mixdigit(checkword+1))
-                query = 0;
-        }
-    return (query);
-}
-
-
-
-
-/* getaword - extracts the first/next "word" from the line, and puts */
-/* it into "thisword". A word is defined as one English word unit    */
-/* -- or at least that's what I'm trying for.                        */
-/* Returns a pointer to the position in the line where we will start */
-/* looking for the next word.                                        */
-
-char *getaword(char *fromline, char *thisword)
-{
-    int i, wordlen;
-    char *s;
-
-    wordlen = 0;
-    for ( ; !gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline ; fromline++ );
-
-    /* V .20                                                                   */
-    /* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35.  */
-    /* Especially yucky is the case of L1,000                                  */
-    /* I hate this, and I see other ways, but I don't see that any is _better_.*/
-    /* This section looks for a pattern of characters including a digit        */
-    /* followed by a comma or period followed by one or more digits.           */
-    /* If found, it returns this whole pattern as a word; otherwise we discard */
-    /* the results and resume our normal programming.                          */
-    s = fromline;
-    for (  ; (gcisdigit(*s) || gcisalpha(*s) || *s == ',' || *s == '.') && wordlen < MAXWORDLEN ; s++ ) {
-        thisword[wordlen] = *s;
-        wordlen++;
-        }
-    thisword[wordlen] = 0;
-    for (i = 1; i < wordlen -1; i++) {
-        if (thisword[i] == '.' || thisword[i] == ',') {
-            if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1])) {   /* we have one of the damned things */
-                fromline = s;
-                return(fromline);
-                }
-            }
-        }
-
-    /* we didn't find a punctuated number - do the regular getword thing */
-    wordlen = 0;
-    for (  ; (gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
-        thisword[wordlen] = *fromline;
-        wordlen++;
-        }
-    thisword[wordlen] = 0;
-    return(fromline);
-}
-
-
-
-
-
-/* matchword - just a case-insensitive string matcher    */
-/* yes, I know this is not efficient. I'll worry about   */
-/* that when I have a clear idea where I'm going with it.*/
-
-int matchword(char *checkfor, char *thisword)
-{
-    unsigned int ismatch, i;
-
-    if (strlen(checkfor) != strlen(thisword)) return(0);
-
-    ismatch = 1;     /* assume a match until we find a difference */
-    for (i = 0; i ='A' && *theline <='Z')
-            *theline += 32;
-}
-
-
-/* Is this word a Roman Numeral?                                    */
-/* v 0.99 improved to be better. It still doesn't actually          */
-/* validate that the number is a valid Roman Numeral -- for example */
-/* it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not*/
-/* what we're here to do. If it passes this, it LOOKS like a Roman  */
-/* numeral. Anyway, the actual Romans were pretty tolerant of bad   */
-/* arithmetic, or expressions thereof, except when it came to taxes.*/
-/* Allow any number of M, an optional D, an optional CM or CD,      */
-/* any number of optional Cs, an optional XL or an optional XC, an  */
-/* optional IX or IV, an optional V and any number of optional Is.  */
-/* Good enough for jazz chords.                                     */
-
-int isroman(char *t)
-{
-    char *s;
-
-    if (!t || !*t) return (0);
-
-    s = t;
-
-    while (*t == 'm' && *t ) t++;
-    if (*t == 'd') t++;
-    if (*t == 'c' && *(t+1) == 'm') t+=2;
-    if (*t == 'c' && *(t+1) == 'd') t+=2;
-    while (*t == 'c' && *t) t++;
-    if (*t == 'x' && *(t+1) == 'l') t+=2;
-    if (*t == 'x' && *(t+1) == 'c') t+=2;
-    if (*t == 'l') t++;
-    while (*t == 'x' && *t) t++;
-    if (*t == 'i' && *(t+1) == 'x') t+=2;
-    if (*t == 'i' && *(t+1) == 'v') t+=2;
-    if (*t == 'v') t++;
-    while (*t == 'i' && *t) t++;
-    if (!*t) return (1);
-
-    return(0);
-}
-
-
-
-
-/* gcisalpha is a special version that is somewhat lenient on 8-bit texts.     */
-/* If we use the standard isalpha() function, 8-bit accented characters break  */
-/* words, so that tete with accented characters appears to be two words, "t"   */
-/* and "t", with 8-bit characters between them. This causes over-reporting of  */
-/* errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)   */
-/* and ISO-8859-1 character sets, which are the most common PG 8-bit types.    */
-
-int gcisalpha(unsigned char c)
-{
-    if (c >='a' && c <='z') return(1);
-    if (c >='A' && c <='Z') return(1);
-    if (c < 140) return(0);
-    if (c >=192 && c != 208 && c != 215 && c != 222 && c != 240 && c != 247 && c != 254) return(1);
-    if (c == 140 || c == 142 || c == 156 || c == 158 || c == 159) return (1);
-    return(0);
-}
-
-/* gcisdigit is a special version that doesn't get confused in 8-bit texts.    */
-int gcisdigit(unsigned char c)
-{   
-    if (c >= '0' && c <='9') return(1);
-    return(0);
-}
-
-/* gcisletter is a special version that doesn't get confused in 8-bit texts.    */
-/* Yeah, we're ISO-8891-1-specific. So sue me.                                  */
-int gcisletter(unsigned char c)
-{   
-    if ((c >= 'A' && c <='Z') || (c >= 'a' && c <='z') || c >= 192) return(1);
-    return(0);
-}
-
-
-
-
-/* gcstrchr wraps strchr to return NULL if the character being searched for is zero */
-
-char *gcstrchr(char *s, char c)
-{
-    if (c == 0) return(NULL);
-    return(strchr(s,c));
-}
-
-/* postprocess_for_DP is derived from postprocess_for_HTML          */
-/* It is invoked with the -d switch from flgets().                  */
-/* It simply "removes" from the line a hard-coded set of common     */
-/* DP-specific tags, so that the line passed to the main routine has*/
-/* been pre-cleaned of DP markup.                                   */
-
-void postprocess_for_DP(char *theline)
-{
-
-    char *s, *t;
-    int i;
-
-    if (!*theline) 
-        return;
-
-    for (i = 0; *DPmarkup[i]; i++) {
-        s = strstr(theline, DPmarkup[i]);
-        while (s) {
-            t = s + strlen(DPmarkup[i]);
-            while (*t) {
-                *s = *t;
-                t++; s++;
-                }
-            *s = 0;
-            s = strstr(theline, DPmarkup[i]);
-            }
-        }
-
-}
-
-
-/* postprocess_for_HTML is, at the moment (0.97), a very nasty      */
-/* short-term fix for Charlz. Nasty, nasty, nasty.                  */
-/* It is invoked with the -m switch from flgets().                  */
-/* It simply "removes" from the line a hard-coded set of common     */
-/* HTML tags and "replaces" a hard-coded set of common HTML         */
-/* entities, so that the line passed to the main routine has        */
-/* been pre-cleaned of HTML. This is _so_ not the right way to      */
-/* deal with HTML, but what Charlz needs now is not HTML handling   */
-/* proper: just ignoring  tags and some others.                  */
-/* To be revisited in future releases!                              */
-
-void postprocess_for_HTML(char *theline)
-{
-
-    if (strstr(theline, "<") && strstr(theline, ">"))
-        while (losemarkup(theline))
-            ;
-    while (loseentities(theline))
-        ;
-}
-
-char *losemarkup(char *theline)
-{
-    char *s, *t;
-    int i;
-
-    if (!*theline) 
-        return(NULL);
-
-    s = strstr(theline, "<");
-    t = strstr(theline, ">");
-    if (!s || !t) return(NULL);
-    for (i = 0; *markup[i]; i++)
-        if (!tagcomp(s+1, markup[i])) {
-            if (!*(t+1)) {
-                *s = 0;
-                return(s);
-                }
-            else
-                if (t > s) {
-                    strcpy(s, t+1);
-                    return(s);
-                    }
-        }
-    /* it's an unrecognized  */
-    return(NULL);
-}
-
-char *loseentities(char *theline)
-{
-    int i;
-    char *s, *t;
-
-    if (!*theline) 
-        return(NULL);
-
-    for (i = 0; *entities[i].htmlent; i++) {
-        s = strstr(theline, entities[i].htmlent);
-        if (s) {
-            t = malloc((size_t)strlen(s));
-            if (!t) return(NULL);
-            strcpy(t, s + strlen(entities[i].htmlent));
-            strcpy(s, entities[i].textent);
-            strcat(s, t);
-            free(t);
-            return(theline);
-            }
-        }
-
-    /* V0.97 Duh. Forgot to check the htmlnum member */
-    for (i = 0; *entities[i].htmlnum; i++) {
-        s = strstr(theline, entities[i].htmlnum);
-        if (s) {
-            t = malloc((size_t)strlen(s));
-            if (!t) return(NULL);
-            strcpy(t, s + strlen(entities[i].htmlnum));
-            strcpy(s, entities[i].textent);
-            strcat(s, t);
-            free(t);
-            return(theline);
-            }
-        }
-    return(NULL);
-}
-
-
-int tagcomp(char *strin, char *basetag)
-{
-    char *s, *t;
-
-    s = basetag;
-    t  = strin;
-    if (*t == '/') t++; /* ignore a slash */
-    while (*s && *t) {
-        if (tolower(*s) != tolower(*t)) return(1);
-        s++; t++;
-        }
-    /* OK, we have < followed by a valid tag start  */
-    /* should I do something about length?          */
-    /* this is messy. The length of an  tag is   */
-    /* limited, but a 
could go on for miles */ - /* so I'd have to parse the tags . . . ugh. */ - /* It isn't what Charlz needs now, so mark it */ - /* as 'pending'. */ - return(0); -} - -void proghelp() /* explain program usage here */ -{ - fputs("V. 0.991. Copyright 2000-2005 Jim Tinsley .\n",stderr); - fputs("Gutcheck comes wih ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n", stderr); - fputs("This is Free Software; you may redistribute it under certain conditions (GPL);\n", stderr); - fputs("read the file COPYING for details.\n\n", stderr); - fputs("Usage is: gutcheck [-setpxloyhud] filename\n",stderr); - fputs(" where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",stderr); - fputs(" -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",stderr); - fputs(" -o just displays overview without detail, -h echoes header fields\n",stderr); - fputs(" -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",stderr); - fputs(" -d ignores DP-specific markup,\n",stderr); - fputs(" -u uses a file gutcheck.typ to query user-defined possible typos\n",stderr); - fputs("Sample usage: gutcheck warpeace.txt \n",stderr); - fputs("\n",stderr); - fputs("Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n", stderr); - fputs("Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",stderr); - fputs("characters like accented letters, lines longer than 75 or shorter than 55,\n",stderr); - fputs("unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",stderr); - fputs("HTML tags, some likely typos. 
It is NOT a substitute for human judgement.\n",stderr); - fputs("\n",stderr); -} - - - -/********************************************************************* - Revision History: - - 04/22/01 Cleaned up some stuff and released .10 - - --------------- - - 05/09/01 Added the typo list, added two extra cases of he/be error, - added -p switch, OPEN_SINGLE QUOTE char as .11 - - --------------- - - 05/20/01 Increased the typo list, - added paranoid mode, - ANSIfied the code and added some casts - so the compiler wouldn't keep asking if I knew what I was doing, - fixed bug in l.s.d. condition (thanks, Dave!), - standardized spacing when echoing, - added letter-combo checking code to typo section, - added more h/b words to typo array. - Not too sure about putting letter combos outside of the TYPO conditions - - someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see. - Released as .12 - - --------------- - - 06/01/01 Removed duplicate reporting of Tildes, asterisks, etc. - 06/10/01 Added flgets routine to help with platform-independent - detection of invalid line-ends. All PG text files should - have CR/LF (13/10) at end of line, regardless of system. - Gutcheck now validates this by default. (Thanks, Charles!) - Released as .13 - - --------------- - - 06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.) - Released as .14 - - --------------- - - 06/23/01 Fixed: 'No',he said. not being flagged. - - Improved: better single-quotes checking: - - Ignore singlequotes surrounded by alpha, like didn't. (was OK) - - If a singlequote is at the END of a word AND the word ends in "s": - The dogs' tails wagged. - it's probably an apostrophe, but less commonly may be a closequote: - "These 'pack dogs' of yours look more like wolves." - - If it's got punctuation before it and is followed by a space - or punctuation: - . . . was a problem,' he said - . . . was a problem,'" - it is probably (certainly?) a closequote. 
- - If it's at start of paragraph, it's probably an openquote. - (but watch dialect) - - Words with ' at beginning and end are probably quoted: - "You have the word 'chivalry' frequently on your lips." - (Not specifically implemented) - V.18 I'm glad I didn't implement this, 'cos it jest ain't so - where the convention is to punctuate outside the quotes. - 'Come', he said, 'and join the party'. - - If it is followed by an alpha, and especially a capital: - 'Hello,' called he. - it is either an openquote or dialect. - - Dialect breaks ALL the rules: - A man's a man for a' that. - "Aye, but 'tis all in the pas' now." - "'Tis often the way," he said. - 'Ave a drink on me. - - This version looks to be an improvement, and produces - fewer false positives, but is still not perfect. The - 'pack dogs' case still fools it, and dialect is still - a problem. Oh, well, it's an improvement, and I have - a weighted structure in place for refining guesses at - closequotes. Maybe next time, I'll add a bit of logic - where if there is an open quote and one that was guessed - to be a possessive apostrophe after s, I'll re-guess it - to be a closequote. Let's see how this one flies, first. - - (Afterview: it's still crap. Needs much work, and a deeper insight.) - - Released as .15 - - TODO: More he/be checks. Can't be perfect - counterexamples: - I gave my son good advice: be married regardless of the world's opinion. - I gave my son good advice: he married regardless of the world's opinion. - - If by "primitive" be meant "crude", we can understand the sentence. - If by "primitive" he meant "crude", we can understand the sentence. - - No matter what be said, I must go on. - No matter what he said, I must go on. - - No value, however great, can be set upon them. - No value, however great, can he set upon them. 
- - Real-Life one from a DP International Weekly Miscellany: - He wandered through the forest without fear, sleeping - much, for in sleep be had companionship--the Great - Spirit teaching him what he should know in dreams. - That one found by jeebies, and it turned out to be "he". - - - --------------- - - 07/01/01 Added -O option. - Improved singlequotes by reporting mismatched single quotes - only if an open_single_quotes was found. - - Released as .16 - - --------------- - - 08/27/01 Added -Y switch for Robert Rowe to allow his app to - catch the error output. - - Released as .17 - - --------------- - - 09/08/01 Added checking Capitals at start of paragraph, but not - checking them at start of sentence. - - TODO: Parse sentences out so can check reliably for start of - sentence. Need a whole different approach for that. - (Can't just rely on periods, since they are also - used for abbreviations, etc.) - - Added checking for all vowels or all consonants in a word. - - While I was in, I added "ii" checking and "tl" at start of word. - - Added echoing of first line of paragraph when reporting - mismatched quoted or brackets (thanks to David Widger for the - suggestion) - - Not querying L at start of a number (used for British pounds). - - The spelling changes are sort of half-done but released anyway - Skipped .18 because I had given out a couple of test versions - with that number. - - 09/25/01 Released as .19 - - --------------- - - TODO: - Use the logic from my new version of safewrap to stop querying - short lines like poems and TOCs. - Ignore non-standard ellipses like . . . or ... - - - --------------- - 10/01/01 Made any line over 80 a VERY long line (was 85). - Recognized openquotes on indented paragraphs as continuations - of the same speech. - Added "cf" to the okword list (how did I forget _that_?) and a few others. - Moved abbrev to okword and made it more general. 
- Removed requirement that PG_space_emdash be greater than - ten before turning off warnings about spaced dashes. - Added period to list of characters that might constitute a separator line. - Now checking for double punctuation (Thanks, David!) - Now if two spaced em-dashes on a line, reports both. (DW) - Bug: Wasn't catching spaced punctuation at line-end since I - added flgets in version .13 - fixed. - Bug: Wasn't catching spaced singlequotes - fixed - Now reads punctuated numbers like 1,000 as a single word. - (Used to give "standalone 1" type queries) - Changed paranoid mode - not including s and p options. -ex is now quite usable. - Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed - Bug: Sometimes gave _next_ line number for queried word at end of line - fixed - - 10/22/01 Released as .20 - - --------------- - - Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!) - Reduced the number of hi-bit letters needed to stop reporting them - from 1/20 to 1/100 or 200 in total. - Added PG footer check. - Added the -h switch. - Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10 - Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23" - Added unspaced brackets check when surrounded by alpha. - Removed all typo reporting unless the typo switch is on. - Added gcisalpha to ease over-reporting of 8-bit queries. - ECHO_SWITCH is now ON by default! - PARANOID_SWITCH is now ON by default! - Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg) - Checking for standalone lowercase "l" - Checking for standalone lowercase "s" - Considering "is be" and "be is" "be was" "was be" as he/be errors - Looking at punct at end of para - - 01/20/02 Released as .21 - - --------------- - - Added VERBOSE_SWITCH to make it list everything. 
(George Davis) - - --------------- - - 02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have. - after which - This line caused a coredump on Solaris - fixed. - Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe - 03/09/02 Changed header recognition for another header change - Called it .24 - 03/29/02 Added qword[][] so I can suppress massive overreporting - of queried "words" like "FN", "Wm.", "th'", people's - initials, chemical formulae and suchlike in some texts. - Called it .25 - 04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed. - Added linecounts in overview mode. - Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done. - "m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that? - 07/07/02 Added GPL. - Added checking for broken em-dash at line-end (enddash) - Released as 0.95 - 08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo. - Released as 0.96 - 10/10/02 Suppressing some annoying multiple reports by default: - Standalone Ones, Asterisks, Square Brackets. - Digit 1 occurs often in many scientific texts. - Asterisk occurs often in multi-footnoted texts. - Mismatch Square Brackets occurs often in multi-para footnotes. - Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil. - . . . but it does more or less work for the main cases. - Removed uppercase within a word as a separate category so - that names like VanAllen get reported only once, like other - suspected typos. - 11/24/02 Fixed - -m switch wasn't looking at htmlnum in - loseentities (Thanks, Brett!) - Fixed bug which occasionally gave false warning of - paragraph starting with lowercase. - Added underscore as character not to query around doublequotes. - Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859" - . . . this is to help detect things like CP1252 characters. 
- Released as 0.97 - - 12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell, - for doublequotes only. Replaces "Spaced quote", since it also covers that - case. - Added "warn_hyphen" to ease over-reporting of hyphens. - - 12/20/02 Added "extra period" checks. - Added single character line check - Added I" check - is usually an exclam - Released as 0.98 - - 1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile() - from when I was looking at ways to identify markup. Refuses to - open files for *nix users with upcase in the filemanes. Removed. - Fixed quickly and released as 0.981 - - 1/8/03 Added "arid" to the list of typos, slightly against my better - judgement, but the DP gang are all excited about it. :-) - Added a check for comma followed by capital letter, where - a period has OCRed into a comma. (DW). Not sure about this - either; we'll see. - Compiling for Win32 to allow longfilenames. - - 6/1/04 A messy test release for DW to include the "gutcheck.typ" - process. And the gutcheck.jee trials. Removed "arid" -- - it can go in gutcheck.typ - - Added checks for carats ^ and slants / but disabling slant - queries if more than 20 of them, because some people use them - for /italics/. Slants are commonly mistaken italic "I"s. - - Later: removed gutcheck.jee -- wrote jeebies instead. - -Random TODO: - Check brackets more closely, like quotes, so that it becomes - easy to find the error in long paragraphs full of brackets. - - - 11/4/04 Assorted cleanup. Fixed case where text started with an - unbalanced paragraph. - - 1/2/05 Has it really been that long? Added "nocomma", "noperiod" check. - Bits and pieces: improved isroman(). Added isletter(). - Other stuff I never noted before this. - - 7/3/05 Stuck in a quick start on DP-markup ignoring - at BillFlis's suggestion. - - 1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in? - Don't count footer for dotcomma etc. 
- - -1 I -ail all -arc are -arid and -bad had -ball hall -band hand -bar her -bat but -be he -bead head -beads heads -bear hear -bit hit -bo be -boon been -borne home -bow how -bumbled humbled -car ear -carnage carriage -carne came -cast east -cat cut -cat eat -cheek check -clay day -coining coming -comer corner -die she -docs does -ease case -fail fall -fee he -haying having -ho he -ho who -hut but -is as -lie he -lime time -loth 10th -m in -modem modern -Ms his -ray away -ray my -ringer finger -ringers fingers -rioted noted -tho the -tie he -tie the -tier her -tight right -tile the -tiling thing -tip up -tram train -tune time -u " -wen well -yon you - -*********************************************************************/ - diff -r 218904410231 -r f600b0d1fc5d gutcheck/gutcheck.typ.in --- a/gutcheck/gutcheck.typ.in Fri Jan 27 00:28:11 2012 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -11 -44 -ms -ail -alien -arc -arid -bar -bat -bo -borne -bow -bum -bumbled -carnage -carne -cither -coining -comer -cur -docs -eve -eves -gaming -gram -guru -hag -hare -haying -ho -lime -loth -m -modem -nave -ringer -ringers -riot -rioted -signer -snore -spam -tho -tier -tile -tiling -tram -tum -tune -u -vas -wag -wen -yon diff -r 218904410231 -r f600b0d1fc5d test/compatibility/Makefile.am --- a/test/compatibility/Makefile.am Fri Jan 27 00:28:11 2012 +0000 +++ b/test/compatibility/Makefile.am Fri Jan 27 10:30:16 2012 +0000 @@ -1,4 +1,4 @@ -TESTS_ENVIRONMENT=GUTCHECK=../../gutcheck/gutcheck ../harness/gc-test +TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test TESTS=missing-space.tst spaced-punctuation.tst html-tag.tst html-symbol.tst \ spaced-doublequote.tst mismatched-quotes.tst he-be.tst digits.tst \ extra-period.tst ellipsis.tst short-line.tst abbreviation.tst \ diff -r 218904410231 -r f600b0d1fc5d test/harness/Makefile.am --- a/test/harness/Makefile.am Fri Jan 27 00:28:11 2012 +0000 +++ b/test/harness/Makefile.am Fri Jan 27 10:30:16 
2012 +0000 @@ -1,8 +1,8 @@ INCLUDES=-I$(top_srcdir) -bin_PROGRAMS=gc-test +bin_PROGRAMS=loupe-test AM_CFLAGS=$(GLIB_CFLAGS) LIBS=$(GLIB_LIBS) -gc_test_SOURCES=gc-test.c testcase.c testcase.h testcaseio.c testcaseio.h \ - testcaseparser.c testcaseparser.h -gc_test_LDADD=../../gclib/libgc.la +loupe_test_SOURCES=loupe-test.c testcase.c testcase.h testcaseio.c \ + testcaseio.h testcaseparser.c testcaseparser.h +loupe_test_LDADD=../../bl/libbl.la diff -r 218904410231 -r f600b0d1fc5d test/harness/gc-test.c --- a/test/harness/gc-test.c Fri Jan 27 00:28:11 2012 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -#include -#include -#include -#include -#include "testcase.h" -#include "testcaseio.h" - -/* - * Returns FALSE if the test should be considered to have failed. - * (returns TRUE on pass or expected-fail). - */ -boolean run_test(const char *filename) -{ - Testcase *testcase; - boolean retval; - testcase=testcase_parse_file(filename); - if (!testcase) - return FALSE; - retval=testcase_run(testcase); - testcase_free(testcase); - return retval; -} - -int main(int argc,char **argv) -{ - int i; - boolean pass=TRUE; - for(i=1;i +#include +#include +#include +#include "testcase.h" +#include "testcaseio.h" + +/* + * Returns FALSE if the test should be considered to have failed. + * (returns TRUE on pass or expected-fail). + */ +boolean run_test(const char *filename) +{ + Testcase *testcase; + boolean retval; + testcase=testcase_parse_file(filename); + if (!testcase) + return FALSE; + retval=testcase_run(testcase); + testcase_free(testcase); + return retval; +} + +int main(int argc,char **argv) +{ + int i; + boolean pass=TRUE; + for(i=1;i #endif #include -#include +#include #include "testcase.h" #if !HAVE_MKSTEMP @@ -124,9 +124,9 @@ return FALSE; } close(fd); - command[0]=getenv("GUTCHECK"); + command[0]=getenv("BOOKLOUPE"); if (!command[0]) - command[0]="." GC_DIR_SEPARATOR_S "gutcheck"; + command[0]="." 
BL_DIR_SEPARATOR_S "bookloupe"; command[1]=input; command[2]=NULL; if (testcase->expected) @@ -157,7 +157,7 @@ fprintf(stderr,"%s: FAIL\n",testcase->basename); offset=common_prefix_length(output,expected->str); if (offset==header_len && !output[offset]) - fprintf(stderr,"Unexpected zero warnings from gutcheck.\n"); + fprintf(stderr,"Unexpected zero warnings from bookloupe.\n"); else { endp=strchr(output+offset,'\n'); @@ -171,7 +171,7 @@ else bol=report->str; col=offset-(bol-report->str); - fprintf(stderr,"Unexpected output from gutcheck:\n"); + fprintf(stderr,"Unexpected output from bookloupe:\n"); if (report->len>=header_len) fprintf(stderr,"%s\n%*s^\n",report->str+header_len,col,""); else @@ -185,7 +185,7 @@ string_free(expected,TRUE); mem_free(output); if (exit_status) - fprintf(stderr,"gutcheck exited with code %d\n",r); + fprintf(stderr,"bookloupe exited with code %d\n",r); if (!exit_status) fprintf(stderr,"%s: PASS\n",testcase->basename); return !exit_status; diff -r 218904410231 -r f600b0d1fc5d test/harness/testcaseio.c --- a/test/harness/testcaseio.c Fri Jan 27 00:28:11 2012 +0000 +++ b/test/harness/testcaseio.c Fri Jan 27 10:30:16 2012 +0000 @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include "testcaseparser.h" #include "testcaseio.h" diff -r 218904410231 -r f600b0d1fc5d test/harness/testcaseparser.c --- a/test/harness/testcaseparser.c Fri Jan 27 00:28:11 2012 +0000 +++ b/test/harness/testcaseparser.c Fri Jan 27 10:30:16 2012 +0000 @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include "testcaseparser.h" /* diff -r 218904410231 -r f600b0d1fc5d test/harness/testcaseparser.h --- a/test/harness/testcaseparser.h Fri Jan 27 00:28:11 2012 +0000 +++ b/test/harness/testcaseparser.h Fri Jan 27 10:30:16 2012 +0000 @@ -1,7 +1,7 @@ #ifndef TESTCASE_PARSER_H #define TESTCASE_PARSER_H -#include +#include typedef struct { char *filename;