1.1 --- a/.hgignore Fri Jan 27 00:28:11 2012 +0000
1.2 +++ b/.hgignore Fri Jan 27 10:30:16 2012 +0000
1.3 @@ -1,5 +1,5 @@
1.4 -gutcheck-.*\.tar\.gz
1.5 -gutcheck-.*/
1.6 +bookloupe-.*\.tar\.gz
1.7 +bookloupe-.*/
1.8 Makefile$
1.9 Makefile\.in
1.10 aclocal\.m4
1.11 @@ -17,6 +17,6 @@
1.12 .*\.la
1.13 .*\.lo
1.14 .*\.exe
1.15 -gutcheck/gutcheck\.typ
1.16 -gutcheck/gutcheck
1.17 -test/harness/gc-test
1.18 +bookloupe/bookloupe\.typ
1.19 +bookloupe/bookloupe
1.20 +test/harness/loupe-test
2.1 --- a/Makefile.am Fri Jan 27 00:28:11 2012 +0000
2.2 +++ b/Makefile.am Fri Jan 27 10:30:16 2012 +0000
2.3 @@ -1,1 +1,1 @@
2.4 -SUBDIRS=gclib gutcheck test doc
2.5 +SUBDIRS=bl bookloupe test doc
3.1 --- a/README Fri Jan 27 00:28:11 2012 +0000
3.2 +++ b/README Fri Jan 27 10:30:16 2012 +0000
3.3 @@ -1,10 +1,10 @@
3.4 - gutcheck
3.5 - ========
3.6 + bookloupe
3.7 + =========
3.8
3.9 General installation instructions can be found in INSTALL. The following
3.10 aim to give a quick overview and some help for specific systems. Documentation
3.11 -for gutcheck itself can be found in doc/gutcheck.txt and for the test
3.12 -framework in doc/gc-test.txt.
3.13 +for bookloupe itself can be found in doc/bookloupe.txt and for the test
3.14 +framework in doc/loupe-test.txt.
3.15
3.16 Linux
3.17 -----
3.18 @@ -43,12 +43,12 @@
3.19 % sudo yum install mingw32-gcc pkgconfig mingw32-glib2-static \
3.20 mingw32-gettext-static mingw32-iconv-static
3.21 % ./configure --host=i686-w64-mingw32 --disable-shared \
3.22 - --bindir=/gutcheck --datadir=/
3.23 + --bindir=/bookloupe --datadir=/
3.24 % make
3.25 % mkdir build
3.26 % make install DESTDIR=`pwd`/build
3.27
3.28 -The contents of the build/gutcheck directory can then be copied to a
3.29 +The contents of the build/bookloupe directory can then be copied to a
3.30 Microsoft Windows machine.
3.31
3.32 Depending on the version of mingw32-gcc you use, you may need to specify a
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/bl/Makefile.am Fri Jan 27 10:30:16 2012 +0000
4.3 @@ -0,0 +1,10 @@
4.4 +INCLUDES=-I$(top_srcdir)
4.5 +AM_CFLAGS=$(GLIB_CFLAGS)
4.6 +LIBS=$(GLIB_LIBS)
4.7 +# libbl is a noinst libtool library; the fallback sources below are added only when GLib is absent.
4.8 +noinst_LTLIBRARIES=libbl.la
4.9 +libbl_la_SOURCES=bl.h textfileutils.c textfileutils.h spawn.c spawn.h
4.10 +if !HAVE_GLIB
4.11 +libbl_la_SOURCES+=macros.h types.h fileutils.c fileutils.h mem.c mem.h \
4.12 + strfuncs.c strfuncs.h blstring.c blstring.h utils.c utils.h
4.13 +endif
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2 +++ b/bl/bl.h Fri Jan 27 10:30:16 2012 +0000
5.3 @@ -0,0 +1,36 @@
5.4 +#if HAVE_GLIB
5.5 +/* With GLib available, the bl names are mapped straight onto their GLib counterparts. */
5.6 +#include <glib.h>
5.7 +#define BL_DIR_SEPARATOR G_DIR_SEPARATOR
5.8 +#define BL_DIR_SEPARATOR_S G_DIR_SEPARATOR_S
5.9 +#define BL_IS_DIR_SEPARATOR(c) G_IS_DIR_SEPARATOR(c)
5.10 +#define boolean gboolean
5.11 +#define String GString
5.12 +#define mem_new0 g_new0
5.13 +#define mem_free g_free
5.14 +#define str_dup g_strdup
5.15 +#define str_ndup g_strndup
5.16 +#define path_get_basename g_path_get_basename
5.17 +#define file_get_contents(filename,contents,length) \
5.18 + g_file_get_contents(filename,contents,length,NULL) /* NULL is passed for the GError ** argument */
5.19 +#define string_new g_string_new
5.20 +#define string_append g_string_append
5.21 +#define string_append_len g_string_append_len
5.22 +#define string_append_c g_string_append_c
5.23 +#define string_free g_string_free
5.24 +#define string_set_size g_string_set_size
5.25 +/* NOTE(review): mem_new and mem_renew have no mapping above - confirm no GLib-build code needs them. */
5.26 +#else /* !HAVE_GLIB */
5.27 +/* Without GLib, the in-tree implementations of the same names are used instead. */
5.28 +#include <bl/macros.h>
5.29 +#include <bl/types.h>
5.30 +#include <bl/mem.h>
5.31 +#include <bl/fileutils.h>
5.32 +#include <bl/strfuncs.h>
5.33 +#include <bl/blstring.h>
5.34 +#include <bl/utils.h>
5.35 +
5.36 +#endif /* HAVE_GLIB */
5.37 +/* These two headers are included in both configurations. */
5.38 +#include <bl/textfileutils.h>
5.39 +#include <bl/spawn.h>
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
6.2 +++ b/bl/blstring.c Fri Jan 27 10:30:16 2012 +0000
6.3 @@ -0,0 +1,90 @@
6.4 +#include <stdlib.h>
6.5 +#include <string.h>
6.6 +#include <bl/blstring.h>
6.7 +#include <bl/types.h>
6.8 +#include <bl/mem.h>
6.9 +#include <bl/strfuncs.h>
6.10 +
6.11 +/*
6.12 + * Strings which manage their own memory
6.13 + */
6.14 +
6.15 +String *string_new(const char *init) /* new String holding a copy of init; a NULL init is treated as empty */
6.16 +{
6.17 + String *string=mem_new(String,1);
6.18 + if (!init)
6.19 + init="";
6.20 + string->len=strlen(init);
6.21 + string->alloc=string->len+1; /* one extra byte for the terminating nul */
6.22 + string->str=str_dup(init);
6.23 + return string;
6.24 +}
6.25 +
6.26 +/*
6.27 + * Free a string and either return the contents (if free_segment is FALSE)
6.28 + * or free the contents as well and return NULL (if free_segment is TRUE).
6.29 + */
6.30 +char *string_free(String *string,boolean free_segment)
6.31 +{
6.32 + char *retval;
6.33 + if (free_segment)
6.34 + {
6.35 + mem_free(string->str);
6.36 + retval=NULL;
6.37 + }
6.38 + else
6.39 + retval=string->str; /* the buffer outlives the String and is handed to the caller */
6.40 + mem_free(string); /* the String structure itself is released in both cases */
6.41 + return retval;
6.42 +}
6.43 +
6.44 +/*
6.45 + * Append a byte to string, doubling the buffer first when it is full.
6.46 + */
6.47 +void string_append_c(String *string,char c)
6.48 +{
6.49 + if (string->len+1==string->alloc) /* only room for the current nul: no space for c */
6.50 + {
6.51 + string->alloc*=2;
6.52 + string->str=mem_renew(char,string->str,string->alloc);
6.53 + }
6.54 + string->str[string->len++]=c;
6.55 + string->str[string->len]='\0'; /* keep the contents nul-terminated */
6.56 +}
6.57 +
6.58 +/*
6.59 + * Append len bytes from s to string. len may be passed as <0 if s is
6.60 + * a nul-terminated string of unknown length.
6.61 + */
6.62 +void string_append_len(String *string,const char *s,ssize_t len)
6.63 +{
6.64 + if (len<0)
6.65 + len=strlen(s);
6.66 + if (string->len+len>=string->alloc) /* >= because one byte is reserved for the nul */
6.67 + {
6.68 + while (string->len+len>=string->alloc)
6.69 + string->alloc*=2; /* double until the new contents plus nul fit */
6.70 + string->str=mem_renew(char,string->str,string->alloc);
6.71 + }
6.72 + memcpy(string->str+string->len,s,len);
6.73 + string->len+=len;
6.74 + string->str[string->len]='\0';
6.75 +}
6.76 +
6.77 +/*
6.78 + * Sets the length of a String. If the length is less than the current length,
6.79 + * the string will be truncated. If the length is greater than the current
6.80 + * length, the contents of the newly added area are undefined. (However, as
6.81 + * always, string->str[string->len] will be a nul byte.)
6.82 + */
6.83 +void string_set_size(String *string,size_t len)
6.84 +{
6.85 + if (len>=string->alloc) /* need len bytes plus the nul */
6.86 + {
6.87 + while (len>=string->alloc)
6.88 + string->alloc*=2;
6.89 + string->str=mem_renew(char,string->str,string->alloc);
6.90 + }
6.91 + string->len=len; /* the buffer is never shrunk, only the recorded length changes */
6.92 + string->str[string->len]='\0';
6.93 +}
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
7.2 +++ b/bl/blstring.h Fri Jan 27 10:30:16 2012 +0000
7.3 @@ -0,0 +1,18 @@
7.4 +#ifndef BL_STRING_H
7.5 +#define BL_STRING_H
7.6 +/* Growable byte strings that are always nul-terminated (bl.h maps String to GString when GLib is used). */
7.7 +#include <unistd.h>
7.8 +#include <bl/types.h>
7.9 +
7.10 +typedef struct {
7.11 + char *str; /* buffer; str[len] is always a nul byte */
7.12 + size_t alloc,len; /* bytes allocated for str; bytes in use, not counting the nul */
7.13 +} String;
7.14 +
7.15 +String *string_new(const char *init);
7.16 +char *string_free(String *string,boolean free_segment);
7.17 +void string_append_c(String *string,char c);
7.18 +void string_append_len(String *string,const char *s,ssize_t len);
7.19 +#define string_append(string,s) string_append_len(string,s,-1)
7.20 +void string_set_size(String *string,size_t len); /* declared so callers such as spawn.c see a prototype */
7.21 +#endif /* BL_STRING_H */
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
8.2 +++ b/bl/fileutils.c Fri Jan 27 10:30:16 2012 +0000
8.3 @@ -0,0 +1,46 @@
8.4 +#include <stdlib.h>
8.5 +#include <stdio.h>
8.6 +#include <bl/macros.h>
8.7 +#include <bl/mem.h>
8.8 +#include <bl/fileutils.h>
8.9 +#include <bl/blstring.h>
8.10 +
8.11 +/*
8.12 + * Read the whole of filename into a nul-terminated buffer stored in *contents
8.13 + * (free it with mem_free); if length is non-NULL it receives the byte count.
8.14 + * Returns FALSE, after printing a message to stderr, on open or read error.
8.15 + */
8.16 +boolean file_get_contents(const char *filename,char **contents,size_t *length)
8.17 +{
8.18 + FILE *fp;
8.19 + size_t n;
8.20 + char *buffer;
8.21 + String *string;
8.22 + fp=fopen(filename,"rb");
8.23 + if (!fp)
8.24 + {
8.25 + perror(filename);
8.26 + return FALSE;
8.27 + }
8.28 + buffer=mem_new(char,1024);
8.29 + string=string_new(NULL);
8.30 + do
8.31 + {
8.32 + n=fread(buffer,1,1024,fp);
8.33 + if (ferror(fp)) /* fread returns an unsigned count, so errors must be detected with ferror */
8.34 + {
8.35 + perror(filename);
8.36 + string_free(string,TRUE);
8.37 + mem_free(buffer);
8.38 + fclose(fp); /* a FILE * has to be released with fclose, never free */
8.39 + return FALSE;
8.40 + }
8.41 + string_append_len(string,buffer,n);
8.42 + } while(n); /* a zero-byte read without error means end of file */
8.43 + mem_free(buffer);
8.44 + if (length)
8.45 + *length=string->len;
8.46 + *contents=string_free(string,FALSE); /* keep the buffer, drop the String wrapper */
8.47 + fclose(fp);
8.48 + return TRUE;
8.49 +}
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
9.2 +++ b/bl/fileutils.h Fri Jan 27 10:30:16 2012 +0000
9.3 @@ -0,0 +1,8 @@
9.4 +#ifndef BL_FILEUTILS_H
9.5 +#define BL_FILEUTILS_H
9.6 +
9.7 +#include <bl/types.h>
9.8 +/* Reads filename into a newly allocated buffer stored in *contents; returns FALSE on error. */
9.9 +boolean file_get_contents(const char *filename,char **contents,size_t *length);
9.10 +
9.11 +#endif /* BL_FILEUTILS_H */
10.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
10.2 +++ b/bl/macros.h Fri Jan 27 10:30:16 2012 +0000
10.3 @@ -0,0 +1,7 @@
10.4 +#ifndef FALSE
10.5 +#define FALSE 0
10.6 +#endif
10.7 +/* Each constant is defined only if no earlier header has already supplied it. */
10.8 +#ifndef TRUE
10.9 +#define TRUE (!FALSE)
10.10 +#endif
11.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
11.2 +++ b/bl/mem.c Fri Jan 27 10:30:16 2012 +0000
11.3 @@ -0,0 +1,54 @@
11.4 +#include <stdlib.h>
11.5 +#include <stdio.h>
11.6 +#include <string.h>
11.7 +#include <bl/mem.h>
11.8 +
11.9 +/*
11.10 + * Allocate nmemb elements of size bytes each; the contents are uninitialised.
11.11 + * Aborts with a message on stderr if the memory cannot be obtained or if
11.12 + * nmemb*size would overflow size_t, so callers never have to handle failure.
11.13 + */
11.14 +void *mem_alloc(size_t nmemb,size_t size)
11.15 +{
11.16 + void *ptr=(size && nmemb>(size_t)-1/size)?NULL:malloc(nmemb*size);
11.17 + if (!ptr)
11.18 + {
11.19 + fprintf(stderr,
11.20 + "Not enough memory to allocate %lu elements of %lu bytes.\n",
11.21 + (unsigned long)nmemb,(unsigned long)size);
11.22 + abort();
11.23 + }
11.24 + return ptr;
11.25 +}
11.26 +
11.27 +/*
11.28 + * As mem_alloc, but the new memory is cleared to zero (it comes from calloc).
11.29 + */
11.30 +void *mem_alloc0(size_t nmemb,size_t size)
11.31 +{
11.32 + void *ptr=calloc(nmemb,size);
11.33 + if (!ptr)
11.34 + {
11.35 + fprintf(stderr,
11.36 + "Not enough memory to allocate %lu elements of %lu bytes.\n",
11.37 + (unsigned long)nmemb,(unsigned long)size);
11.38 + abort(); /* allocation failure is treated as fatal */
11.39 + }
11.40 + return ptr;
11.41 +}
11.42 +
11.43 +/*
11.44 + * Resize ptr to hold nmemb elements of size bytes; aborts on failure or if nmemb*size overflows.
11.45 + */
11.46 +void *mem_realloc(void *ptr,size_t nmemb,size_t size)
11.47 +{
11.48 + ptr=(size && nmemb>(size_t)-1/size)?NULL:realloc(ptr,nmemb*size);
11.49 + if (!ptr)
11.50 + {
11.51 + fprintf(stderr,
11.52 + "Not enough memory to allocate %lu elements of %lu bytes.\n",
11.53 + (unsigned long)nmemb,(unsigned long)size);
11.54 + abort(); /* the old block is not freed here because the process ends anyway */
11.55 + }
11.56 + return ptr;
11.57 +}
12.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
12.2 +++ b/bl/mem.h Fri Jan 27 10:30:16 2012 +0000
12.3 @@ -0,0 +1,13 @@
12.4 +#ifndef BL_MEM_H
12.5 +#define BL_MEM_H
12.6 +#include <stdlib.h> /* size_t for the prototypes below and free() for mem_free */
12.7 +void *mem_alloc(size_t nmemb,size_t size);
12.8 +void *mem_alloc0(size_t nmemb,size_t size);
12.9 +void *mem_realloc(void *ptr,size_t nmemb,size_t size);
12.10 +
12.11 +#define mem_new(type,n) ((type *)mem_alloc(n,sizeof(type))) /* n uninitialised objects of type */
12.12 +#define mem_new0(type,n) ((type *)mem_alloc0(n,sizeof(type))) /* n zero-filled objects of type */
12.13 +#define mem_renew(type,ptr,n) ((type *)mem_realloc(ptr,n,sizeof(type))) /* resize ptr to n objects */
12.14 +#define mem_free(ptr) free(ptr)
12.15 +
12.16 +#endif /* BL_MEM_H */
13.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
13.2 +++ b/bl/spawn.c Fri Jan 27 10:30:16 2012 +0000
13.3 @@ -0,0 +1,84 @@
13.4 +#include <stdlib.h>
13.5 +#include <stdio.h>
13.6 +#ifndef WIN32
13.7 +#include <sys/wait.h>
13.8 +#endif
13.9 +#include <bl/bl.h>
13.10 +#define SPAWN_BUFSIZE 128 /* bytes requested from the pipe per fread */
13.11 +/* Run the command in argv and wait for it to finish. Returns TRUE on success; when non-NULL, *standard_output receives
13.12 + * its standard output and *exit_status its status (WEXITSTATUS with GLib, the raw pclose() result otherwise). */
13.13 +boolean spawn_sync(char **argv,char **standard_output,int *exit_status)
13.14 +{
13.15 +/* Don't use g_spawn_sync on WIN32 for now to avoid needing the helper */
13.16 +#if HAVE_GLIB && !defined(WIN32)
13.17 + char *standard_error=NULL;
13.18 + GError *error=NULL;
13.19 + gboolean retval;
13.20 + GSpawnFlags flags=G_SPAWN_SEARCH_PATH;
13.21 + if (!standard_output)
13.22 + flags|=G_SPAWN_STDOUT_TO_DEV_NULL; /* add to, rather than replace, G_SPAWN_SEARCH_PATH */
13.23 + retval=g_spawn_sync(NULL,argv,NULL,flags,NULL,NULL,standard_output,
13.24 + &standard_error,exit_status,&error);
13.25 + if (standard_error) fputs(standard_error,stderr); /* may still be NULL if the spawn itself failed */
13.26 + g_free(standard_error);
13.27 + if (!retval)
13.28 + {
13.29 + fprintf(stderr,"%s\n",error->message);
13.30 + g_error_free(error);
13.31 + }
13.32 + else if (exit_status)
13.33 + *exit_status=WEXITSTATUS(*exit_status);
13.34 + return retval;
13.35 +#else
13.36 + FILE *fp;
13.37 + int i,r;
13.38 + size_t n,len;
13.39 + String *command_line,*string;
13.40 + command_line=string_new(NULL);
13.41 + for(i=0;argv[i];i++)
13.42 + {
13.43 + if (i)
13.44 + string_append_c(command_line,' ');
13.45 + string_append(command_line,argv[i]);
13.46 + }
13.47 + fp=popen(command_line->str,"r");
13.48 + if (!fp)
13.49 + perror(command_line->str); /* report while the text is still allocated */
13.50 + string_free(command_line,TRUE);
13.51 + if (!fp)
13.52 + return FALSE;
13.53 + /* NOTE: argv was joined with single spaces and no quoting, so the shell re-splits it. */
13.54 + string=string_new(NULL);
13.55 + do
13.56 + {
13.57 + len=string->len;
13.58 + string_set_size(string,len+SPAWN_BUFSIZE); /* make room for one more chunk */
13.59 + n=fread(string->str+len,1,SPAWN_BUFSIZE,fp);
13.60 + if (ferror(fp)) /* fread returns an unsigned count, so errors must be detected with ferror */
13.61 + {
13.62 + perror("fread");
13.63 + (void)pclose(fp);
13.64 + string_free(string,TRUE);
13.65 + return FALSE;
13.66 + }
13.67 + string_set_size(string,len+n); /* trim to the bytes actually read */
13.68 + } while(n);
13.69 + r=pclose(fp);
13.70 + if (r<0)
13.71 + {
13.72 + perror("pclose");
13.73 + string_free(string,TRUE);
13.74 + return FALSE;
13.75 + }
13.76 + else
13.77 + {
13.78 + if (exit_status)
13.79 + *exit_status=r;
13.80 + if (standard_output)
13.81 + *standard_output=string_free(string,FALSE);
13.82 + else
13.83 + string_free(string,TRUE);
13.84 + return TRUE;
13.85 + }
13.86 +#endif
13.87 +}
14.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
14.2 +++ b/bl/spawn.h Fri Jan 27 10:30:16 2012 +0000
14.3 @@ -0,0 +1,8 @@
14.4 +#ifndef BL_SPAWN_H
14.5 +#define BL_SPAWN_H
14.6 +
14.7 +#include <bl/bl.h>
14.8 +/* Runs the command in argv to completion; standard_output and exit_status may each be NULL. */
14.9 +boolean spawn_sync(char **argv,char **standard_output,int *exit_status);
14.10 +
14.11 +#endif /* BL_SPAWN_H */
15.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
15.2 +++ b/bl/strfuncs.c Fri Jan 27 10:30:16 2012 +0000
15.3 @@ -0,0 +1,26 @@
15.4 +#include <stdlib.h>
15.5 +#include <string.h>
15.6 +#include <bl/mem.h>
15.7 +#include <bl/strfuncs.h>
15.8 +
15.9 +/*
15.10 + * Like strndup, but only returns NULL if str is NULL.
15.11 + * Note that this routine copies n bytes rather than n characters.
15.12 + */
15.13 +char *str_ndup(const char *str,size_t n)
15.14 +{
15.15 + char *dup;
15.16 + if (!str)
15.17 + return NULL;
15.18 + dup=mem_alloc0(n+1,1); /* zero-filled, so the result is always nul-terminated */
15.19 + strncpy(dup,str,n); /* copies at most n bytes; a shorter str leaves the rest as nul bytes */
15.20 + return dup;
15.21 +}
15.22 +
15.23 +/*
15.24 + * Returns a newly allocated copy of str (free with mem_free), or NULL if str is NULL.
15.25 + */
15.26 +char *str_dup(const char *str)
15.27 +{
15.28 + return str?str_ndup(str,strlen(str)):NULL; /* strlen(NULL) is undefined, so test first */
15.29 +}
16.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
16.2 +++ b/bl/strfuncs.h Fri Jan 27 10:30:16 2012 +0000
16.3 @@ -0,0 +1,7 @@
16.4 +#ifndef BL_STRFUNCS_H
16.5 +#define BL_STRFUNCS_H
16.6 +/* Allocating string copies built on the aborting allocator from bl/mem.h. */
16.7 +char *str_dup(const char *str);
16.8 +char *str_ndup(const char *str,size_t n);
16.9 +
16.10 +#endif /* BL_STRFUNCS_H */
17.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
17.2 +++ b/bl/textfileutils.c Fri Jan 27 10:30:16 2012 +0000
17.3 @@ -0,0 +1,33 @@
17.4 +#include <stdlib.h>
17.5 +#include <stdio.h>
17.6 +#include <bl/bl.h>
17.7 +
17.8 +/*
17.9 + * Read a text file into memory: *contents receives the data (free it with
17.10 + * mem_free) and *length its size in bytes; either pointer may be NULL.
17.11 + * Returns FALSE if file_get_contents fails; any message comes from that call.
17.12 + * Every carriage return byte is dropped, so DOS-style line endings read the
17.13 + * same as Unix ones on any platform.
17.14 + */
17.15 +boolean file_get_contents_text(const char *filename,char **contents,
17.16 + size_t *length)
17.17 +{
17.18 + size_t i; /* same type as raw_length, so large files cannot overflow the index */
17.19 + char *raw;
17.20 + size_t raw_length;
17.21 + String *string;
17.22 + if (!file_get_contents(filename,&raw,&raw_length))
17.23 + return FALSE;
17.24 + string=string_new(NULL);
17.25 + for(i=0;i<raw_length;i++)
17.26 + if (raw[i]!='\r')
17.27 + string_append_c(string,raw[i]);
17.28 + mem_free(raw);
17.29 + if (length)
17.30 + *length=string->len;
17.31 + if (contents)
17.32 + *contents=string_free(string,FALSE); /* the caller takes ownership of the buffer */
17.33 + else
17.34 + string_free(string,TRUE);
17.35 + return TRUE;
17.36 +}
18.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
18.2 +++ b/bl/textfileutils.h Fri Jan 27 10:30:16 2012 +0000
18.3 @@ -0,0 +1,9 @@
18.4 +#ifndef BL_TEXTFILEUTILS_H
18.5 +#define BL_TEXTFILEUTILS_H
18.6 +
18.7 +#include <bl/bl.h>
18.8 +/* Like file_get_contents, but with every carriage return byte removed from the data. */
18.9 +boolean file_get_contents_text(const char *filename,char **contents,
18.10 + size_t *length);
18.11 +
18.12 +#endif /* BL_TEXTFILEUTILS_H */
19.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
19.2 +++ b/bl/types.h Fri Jan 27 10:30:16 2012 +0000
19.3 @@ -0,0 +1,6 @@
19.4 +#ifndef BL_TYPES_H
19.5 +#define BL_TYPES_H
19.6 +
19.7 +typedef int boolean; /* carries TRUE/FALSE; bl.h maps this name to gboolean in GLib builds */
19.8 +
19.9 +#endif /* BL_TYPES_H */
20.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
20.2 +++ b/bl/utils.c Fri Jan 27 10:30:16 2012 +0000
20.3 @@ -0,0 +1,46 @@
20.4 +#include <stdlib.h>
20.5 +#include <string.h>
20.6 +#include <unistd.h>
20.7 +#include <bl/mem.h>
20.8 +#include <bl/strfuncs.h>
20.9 +#include <bl/utils.h>
20.10 +
20.11 +#define is_valid_drive(d) (((d)>='a' && (d)<='z') || ((d)>='A' && (d)<='Z'))
20.12 +
20.13 +/*
20.14 + * Gets the last component of the filename. If filename ends with a directory
20.15 + * separator it gets the component before the last slash. If filename consists
20.16 + * only of directory separators (and on Windows, possibly a drive letter), a
20.17 + * single separator is returned. If filename is empty, it gets ".".
20.18 + */
20.19 +char *path_get_basename(const char *filename) /* result is newly allocated; filename is dereferenced, so must not be NULL */
20.20 +{
20.21 + ssize_t base,last_nonslash;
20.22 + size_t len;
20.23 + char *retval;
20.24 + if (*filename=='\0')
20.25 + return str_dup(".");
20.26 + last_nonslash=strlen(filename)-1; /* start from the final byte */
20.27 + while (last_nonslash>=0 && BL_IS_DIR_SEPARATOR(filename[last_nonslash]))
20.28 + last_nonslash--; /* skip trailing separators */
20.29 + if (last_nonslash<0)
20.30 + /* string only containing slashes */
20.31 + return str_dup(BL_DIR_SEPARATOR_S);
20.32 +#ifdef WIN32
20.33 + if (last_nonslash==1 && is_valid_drive(filename[0]) && filename[1]==':')
20.34 + /* string only containing slashes and a drive */
20.35 + return str_dup(BL_DIR_SEPARATOR_S);
20.36 +#endif
20.37 + base=last_nonslash;
20.38 + while (base>=0 && !BL_IS_DIR_SEPARATOR(filename[base]))
20.39 + base--; /* ends on the separator before the component, or -1 if there is none */
20.40 +#ifdef WIN32
20.41 + if (base==-1 && is_valid_drive(filename[0]) && filename[1] == ':')
20.42 + base=1; /* treat the ':' of a leading drive prefix as the separator */
20.43 +#endif
20.44 + len=last_nonslash-base; /* the component is filename[base+1..last_nonslash] */
20.45 + retval=mem_alloc(len+1,1);
20.46 + memcpy(retval,filename+base+1,len);
20.47 + retval[len]='\0';
20.48 + return retval;
20.49 +}
21.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
21.2 +++ b/bl/utils.h Fri Jan 27 10:30:16 2012 +0000
21.3 @@ -0,0 +1,16 @@
21.4 +#ifndef BL_UTIL_H
21.5 +#define BL_UTIL_H
21.6 +/* Directory separator definitions per platform, plus path_get_basename. */
21.7 +#ifdef WIN32
21.8 +#define BL_DIR_SEPARATOR '\\'
21.9 +#define BL_DIR_SEPARATOR_S "\\"
21.10 +#define BL_IS_DIR_SEPARATOR(c) ((c)==BL_DIR_SEPARATOR || (c)=='/') /* both slashes count on WIN32 */
21.11 +#else
21.12 +#define BL_DIR_SEPARATOR '/'
21.13 +#define BL_DIR_SEPARATOR_S "/"
21.14 +#define BL_IS_DIR_SEPARATOR(c) ((c)==BL_DIR_SEPARATOR)
21.15 +#endif
21.16 +
21.17 +char *path_get_basename(const char *filename);
21.18 +
21.19 +#endif /* BL_UTIL_H */
22.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
22.2 +++ b/bookloupe/Makefile.am Fri Jan 27 10:30:16 2012 +0000
22.3 @@ -0,0 +1,8 @@
22.4 +bin_PROGRAMS=bookloupe
22.5 +pkgdata_DATA=bookloupe.typ
22.6 +# bookloupe.typ is generated from bookloupe.typ.in with a carriage return appended to every line.
22.7 +bookloupe.typ: bookloupe.typ.in
22.8 + sed 's/$$/\r/' $< > $@
22.9 +
22.10 +EXTRA_DIST=bookloupe.typ.in
22.11 +CLEANFILES=bookloupe.typ
23.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
23.2 +++ b/bookloupe/bookloupe.c Fri Jan 27 10:30:16 2012 +0000
23.3 @@ -0,0 +1,2982 @@
23.4 +/*************************************************************************/
23.5 +/* gutcheck - check for assorted weirdnesses in a PG candidate text file */
23.6 +/* */
23.7 +/* Version 0.991 */
23.8 +/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
23.9 +/* */
23.10 +/* This program is free software; you can redistribute it and/or modify */
23.11 +/* it under the terms of the GNU General Public License as published by */
23.12 +/* the Free Software Foundation; either version 2 of the License, or */
23.13 +/* (at your option) any later version. */
23.14 +/* */
23.15 +/* This program is distributed in the hope that it will be useful, */
23.16 +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
23.17 +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
23.18 +/* GNU General Public License for more details. */
23.19 +/* */
23.20 +/* You should have received a copy of the GNU General Public License */
23.21 +/* along with this program; if not, write to the */
23.22 +/* Free Software Foundation, Inc., */
23.23 +/* 59 Temple Place, */
23.24 +/* Suite 330, */
23.25 +/* Boston, MA 02111-1307 USA */
23.26 +/* */
23.27 +/* */
23.28 +/* */
23.29 +/* Overview comments: */
23.30 +/* */
23.31 +/* If you're reading this, you're either interested in how to detect */
23.32 +/* formatting errors, or very very bored. */
23.33 +/* */
23.34 +/* Gutcheck is a homebrew formatting checker specifically for */
23.35 +/* spotting common formatting problems in a PG e-text. I typically */
23.36 +/* run it once or twice on a file I'm about to submit; it usually */
23.37 +/* finds a few formatting problems. It also usually finds lots of */
23.38 +/* queries that aren't problems at all; it _really_ doesn't like */
23.39 +/* the standard PG header, for example. It's optimized for straight */
23.40 +/* prose; poetry and non-fiction involving tables tend to trigger */
23.41 +/* false alarms. */
23.42 +/* */
23.43 +/* The code of gutcheck is not very interesting, but the experience */
23.44 +/* of what constitutes a possible error may be, and the best way to */
23.45 +/* illustrate that is by example. */
23.46 +/* */
23.47 +/* */
23.48 +/* Here are some common typos found in PG texts that gutcheck */
23.49 +/* will flag as errors: */
23.50 +/* */
23.51 +/* "Look!John , over there!" */
23.52 +/* <this is a HTML tag> */
23.53 +/* &so is this; */
23.54 +/* Margaret said: " Now you should start for school." */
23.55 +/* Margaret said: "Now you should start for school. (if end of para) */
23.56 +/* The horse is said to he worth a lot. */
23.57 +/* 0K - this'11 make you look close1y. */
23.58 +/* "If you do. you'll regret it!" */
23.59 +/* */
23.60 +/* There are some complications . The extra space left around that */
23.61 +/* period was an error . . . but that ellipsis wasn't. */
23.62 +/* */
23.63 +/* The last line of a paragraph */
23.64 +/* is usually short. */
23.65 +/* */
23.66 +/* This period is an error.But the periods in a.m. aren't. */
23.67 +/* */
23.68 +/* Checks that are do-able but not (well) implemented are: */
23.69 +/* Single-quote checking. */
23.70 +/* Despite 3 attempts at it, singlequote checking is still */
23.71 +/* crap in gutcheck. It may not be possible without analysis */
23.72 +/* of the whole paragraph. */
23.73 +/* */
23.74 +/*************************************************************************/
23.75 +
23.76 +
23.77 +#include <stdio.h>
23.78 +#include <stdlib.h>
23.79 +#include <string.h>
23.80 +#include <ctype.h>
23.81 +
23.82 +#define MAXWORDLEN 80 /* max length of one word */
23.83 +#define LINEBUFSIZE 2048 /* buffer size for an input line */
23.84 +
23.85 +#define MAX_USER_TYPOS 1000 /* number of slots in the usertypo[] table */
23.86 +#define USERTYPO_FILE "bookloupe.typ" /* matches the file built and installed by bookloupe/Makefile.am */
23.87 +
23.88 +#ifndef MAX_PATH
23.89 +#define MAX_PATH 16384 /* fallback when the platform headers do not define MAX_PATH */
23.90 +#endif
23.91 +
23.92 +char aline[LINEBUFSIZE];
23.93 +char prevline[LINEBUFSIZE];
23.94 +
23.95 + /* Common typos. */
23.96 +char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad",
23.97 + "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om",
23.98 + "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
23.99 + "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign",
23.100 + "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere",
23.101 + "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev",
23.102 + "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk",
23.103 + "owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
23.104 + "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry",
23.105 + "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
23.106 + "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae",
23.107 + "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
23.108 + "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
23.109 + "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera",
23.110 + "yeras", "yersa", "yoiu", "youve", "ytou", "yuor",
23.111 + /* added h/b words for version 12 - removed a few with "tbe" v.25 */
23.112 + "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind",
23.113 + "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates",
23.114 + "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing",
23.115 + "helieve", "henefit", "hetter", "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh",
23.116 + "meanwbile", "memher", "memhers", "numher", "numhers",
23.117 + "perbaps", "prohlem", "puhlic", "witbout",
23.118 + /* and a few more for .18 */
23.119 + "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo",
23.120 + "heside", "chapteb", "chaptee", "se",
23.121 + ""};
23.122 +
23.123 +char *usertypo[MAX_USER_TYPOS];
23.124 +
23.125 + /* Common abbreviations and other OK words not to query as typos. */
23.126 + /* 0.99 last-minute - removed "ms" */
23.127 +char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br",
23.128 + "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian",
23.129 + "hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten",
23.130 + ""};
23.131 +
23.132 + /* Common abbreviations that cause otherwise unexplained periods. */
23.133 +char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit",
23.134 + "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
23.135 + ""};
23.136 + /* Two-Letter combinations that rarely if ever start words, */
23.137 + /* but are common scannos or otherwise common letter */
23.138 + /* combinations. */
23.139 +char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl",
23.140 + "tn", "rn", "lt", "tj",
23.141 + "" };
23.142 +
23.143 + /* Two-Letter combinations that rarely if ever end words */
23.144 + /* but are common scannos or otherwise common letter */
23.145 + /* combinations */
23.146 +char *noend[] = { "cb", "gb", "pb", "sb", "tb",
23.147 + "wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl",
23.148 + "iy",
23.149 + ""};
23.150 + /* HTML element names; the list is terminated by an empty string. */
23.151 +char *markup[] = { "a", "b", "big", "blockquote", "body", "br", "center",
23.152 + "col", "div", "em", "font", "h1", "h2", "h3", "h4",
23.153 + "h5", "h6", "head", "hr", "html", "i", "img", "li",
23.154 + "meta", "ol", "p", "pre", "small", "span", "strong",
23.155 + "sub", "sup", "table", "td", "tfoot", "thead", "title",
23.156 + "tr", "tt", "u", "ul",
23.157 + ""};
23.158 + /* Literal markup strings, ""-terminated. NOTE(review): "DP" presumably means Distributed Proofreaders - confirm. */
23.159 +char *DPmarkup[] = { "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>",
23.160 + ""}; /* <tb> added .991 */
23.161 + /* Lower-case words, ""-terminated. NOTE(review): presumably words not expected before a comma - confirm against the check that uses it. */
23.162 +char *nocomma[] = { "the", "it's", "their", "an", "mrs", "a", "our", "that's",
23.163 + "its", "whose", "every", "i'll", "your", "my",
23.164 + "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd",
23.165 + "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
23.166 + "i'm", "during", "let", "toward", "among",
23.167 + ""};
23.168 +
23.169 + /* Lower-case words, ""-terminated. NOTE(review): presumably words not expected before a period - confirm against the check that uses it. */
23.170 +char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
23.171 + "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
23.172 + "i'll", "whose", "who", "because", "when", "let", "till", "very",
23.173 + "an", "among", "those", "into", "whom", "having", "thence",
23.174 + ""};
23.175 +
23.176 +
23.177 +char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* Carlo's old suggestion, updated .991 */
23.178 +
23.179 +struct {
23.180 + char *htmlent;
23.181 + char *htmlnum;
23.182 + char *textent;
23.183 + } entities[] = { "&", "&", "&",
23.184 + "<", "<", "<",
23.185 + ">", ">", ">",
23.186 + "°", "°", " degrees",
23.187 + "£", "£", "L",
23.188 + """, """, "\"", /* -- quotation mark = APL quote, */
23.189 + "Œ", "Œ", "OE", /* -- latin capital ligature OE, */
23.190 + "œ", "œ", "oe", /* -- latin small ligature oe, U+0153 ISOlat2 --> */
23.191 + "Š", "Š", "S", /* -- latin capital letter S with caron, */
23.192 + "š", "š", "s", /* -- latin small letter s with caron, */
23.193 + "Ÿ", "Ÿ", "Y", /* -- latin capital letter Y with diaeresis, */
23.194 + "ˆ", "ˆ", "", /* -- modifier letter circumflex accent, */
23.195 + "˜", "˜", "~", /* -- small tilde, U+02DC ISOdia --> */
23.196 + " ", " ", " ", /* -- en space, U+2002 ISOpub --> */
23.197 + " ", " ", " ", /* -- em space, U+2003 ISOpub --> */
23.198 + " ", " ", " ", /* -- thin space, U+2009 ISOpub --> */
23.199 + "–", "–", "-", /* -- en dash, U+2013 ISOpub --> */
23.200 + "—", "—", "--", /* -- em dash, U+2014 ISOpub --> */
23.201 + "‘", "‘", "'", /* -- left single quotation mark, */
23.202 + "’", "’", "'", /* -- right single quotation mark, */
23.203 + "‚", "‚", "'", /* -- single low-9 quotation mark, U+201A NEW --> */
23.204 + "“", "“", "\"", /* -- left double quotation mark, */
23.205 + "”", "”", "\"", /* -- right double quotation mark, */
23.206 + "„", "„", "\"", /* -- double low-9 quotation mark, U+201E NEW --> */
23.207 + "‹", "‹", "\"", /* -- single left-pointing angle quotation mark, */
23.208 + "›", "›", "\"", /* -- single right-pointing angle quotation mark, */
23.209 + " ", " ", " ", /* -- no-break space = non-breaking space, */
23.210 + "¡", "¡", "!", /* -- inverted exclamation mark, U+00A1 ISOnum --> */
23.211 + "¢", "¢", "c", /* -- cent sign, U+00A2 ISOnum --> */
23.212 + "£", "£", "L", /* -- pound sign, U+00A3 ISOnum --> */
23.213 + "¤", "¤", "$", /* -- currency sign, U+00A4 ISOnum --> */
23.214 + "¥", "¥", "Y", /* -- yen sign = yuan sign, U+00A5 ISOnum --> */
23.215 + "§", "§", "--", /* -- section sign, U+00A7 ISOnum --> */
23.216 + "¨", "¨", " ", /* -- diaeresis = spacing diaeresis, */
23.217 + "©", "©", "(C) ", /* -- copyright sign, U+00A9 ISOnum --> */
23.218 + "ª", "ª", " ", /* -- feminine ordinal indicator, U+00AA ISOnum --> */
23.219 + "«", "«", "\"", /* -- left-pointing double angle quotation mark */
23.220 + "­", "­", "-", /* -- soft hyphen = discretionary hyphen, */
23.221 + "®", "®", "(R) ", /* -- registered sign = registered trade mark sign, */
23.222 + "¯", "¯", " ", /* -- macron = spacing macron = overline */
23.223 + "°", "°", " degrees", /* -- degree sign, U+00B0 ISOnum --> */
23.224 + "±", "±", "+-", /* -- plus-minus sign = plus-or-minus sign, */
23.225 + "²", "²", "2", /* -- superscript two = superscript digit two */
23.226 + "³", "³", "3", /* -- superscript three = superscript digit three */
23.227 + "´", "´", " ", /* -- acute accent = spacing acute, */
23.228 + "µ", "µ", "m", /* -- micro sign, U+00B5 ISOnum --> */
23.229 + "¶", "¶", "--", /* -- pilcrow sign = paragraph sign, */
23.230 + "¸", "¸", " ", /* -- cedilla = spacing cedilla, U+00B8 ISOdia --> */
23.231 + "¹", "¹", "1", /* -- superscript one = superscript digit one, */
23.232 + "º", "º", " ", /* -- masculine ordinal indicator, */
23.233 + "»", "»", "\"", /* -- right-pointing double angle quotation mark */
23.234 + "¼", "¼", "1/4", /* -- vulgar fraction one quarter */
23.235 + "½", "½", "1/2", /* -- vulgar fraction one half */
23.236 + "¾", "¾", "3/4", /* -- vulgar fraction three quarters */
23.237 + "¿", "¿", "?", /* -- inverted question mark */
23.238 + "À", "À", "A", /* -- latin capital letter A with grave */
23.239 + "Á", "Á", "A", /* -- latin capital letter A with acute, */
23.240 + "Â", "Â", "A", /* -- latin capital letter A with circumflex, */
23.241 + "Ã", "Ã", "A", /* -- latin capital letter A with tilde, */
23.242 + "Ä", "Ä", "A", /* -- latin capital letter A with diaeresis, */
23.243 + "Å", "Å", "A", /* -- latin capital letter A with ring above */
23.244 + "Æ", "Æ", "AE", /* -- latin capital letter AE */
23.245 + "Ç", "Ç", "C", /* -- latin capital letter C with cedilla, */
23.246 + "È", "È", "E", /* -- latin capital letter E with grave, */
23.247 + "É", "É", "E", /* -- latin capital letter E with acute, */
23.248 + "Ê", "Ê", "E", /* -- latin capital letter E with circumflex, */
23.249 + "Ë", "Ë", "E", /* -- latin capital letter E with diaeresis, */
23.250 + "Ì", "Ì", "I", /* -- latin capital letter I with grave, */
23.251 + "Í", "Í", "I", /* -- latin capital letter I with acute, */
23.252 + "Î", "Î", "I", /* -- latin capital letter I with circumflex, */
23.253 + "Ï", "Ï", "I", /* -- latin capital letter I with diaeresis, */
23.254 + "Ð", "Ð", "E", /* -- latin capital letter ETH, U+00D0 ISOlat1 --> */
23.255 + "Ñ", "Ñ", "N", /* -- latin capital letter N with tilde, */
23.256 + "Ò", "Ò", "O", /* -- latin capital letter O with grave, */
23.257 + "Ó", "Ó", "O", /* -- latin capital letter O with acute, */
23.258 + "Ô", "Ô", "O", /* -- latin capital letter O with circumflex, */
23.259 + "Õ", "Õ", "O", /* -- latin capital letter O with tilde, */
23.260 + "Ö", "Ö", "O", /* -- latin capital letter O with diaeresis, */
23.261 + "×", "×", "*", /* -- multiplication sign, U+00D7 ISOnum --> */
23.262 + "Ø", "Ø", "O", /* -- latin capital letter O with stroke */
23.263 + "Ù", "Ù", "U", /* -- latin capital letter U with grave, */
23.264 + "Ú", "Ú", "U", /* -- latin capital letter U with acute, */
23.265 + "Û", "Û", "U", /* -- latin capital letter U with circumflex, */
23.266 + "Ü", "Ü", "U", /* -- latin capital letter U with diaeresis, */
23.267 + "Ý", "Ý", "Y", /* -- latin capital letter Y with acute, */
23.268 + "Þ", "Þ", "TH", /* -- latin capital letter THORN, */
23.269 + "ß", "ß", "sz", /* -- latin small letter sharp s = ess-zed, */
23.270 + "à", "à", "a", /* -- latin small letter a with grave */
23.271 + "á", "á", "a", /* -- latin small letter a with acute, */
23.272 + "â", "â", "a", /* -- latin small letter a with circumflex, */
23.273 + "ã", "ã", "a", /* -- latin small letter a with tilde, */
23.274 + "ä", "ä", "a", /* -- latin small letter a with diaeresis, */
23.275 + "å", "å", "a", /* -- latin small letter a with ring above */
23.276 + "æ", "æ", "ae", /* -- latin small letter ae */
23.277 + "ç", "ç", "c", /* -- latin small letter c with cedilla, */
23.278 + "è", "è", "e", /* -- latin small letter e with grave, */
23.279 + "é", "é", "e", /* -- latin small letter e with acute, */
23.280 + "ê", "ê", "e", /* -- latin small letter e with circumflex, */
23.281 + "ë", "ë", "e", /* -- latin small letter e with diaeresis, */
23.282 + "ì", "ì", "i", /* -- latin small letter i with grave, */
23.283 + "í", "í", "i", /* -- latin small letter i with acute, */
23.284 + "î", "î", "i", /* -- latin small letter i with circumflex, */
23.285 + "ï", "ï", "i", /* -- latin small letter i with diaeresis, */
23.286 + "ð", "ð", "eth", /* -- latin small letter eth, U+00F0 ISOlat1 --> */
23.287 + "ñ", "ñ", "n", /* -- latin small letter n with tilde, */
23.288 + "ò", "ò", "o", /* -- latin small letter o with grave, */
23.289 + "ó", "ó", "o", /* -- latin small letter o with acute, */
23.290 + "ô", "ô", "o", /* -- latin small letter o with circumflex, */
23.291 + "õ", "õ", "o", /* -- latin small letter o with tilde, */
23.292 + "ö", "ö", "o", /* -- latin small letter o with diaeresis, */
23.293 + "÷", "÷", "/", /* -- division sign, U+00F7 ISOnum --> */
23.294 + "ø", "ø", "o", /* -- latin small letter o with stroke, */
23.295 + "ù", "ù", "u", /* -- latin small letter u with grave, */
23.296 + "ú", "ú", "u", /* -- latin small letter u with acute, */
23.297 + "û", "û", "u", /* -- latin small letter u with circumflex, */
23.298 + "ü", "ü", "u", /* -- latin small letter u with diaeresis, */
23.299 + "ý", "ý", "y", /* -- latin small letter y with acute, */
23.300 + "þ", "þ", "th", /* -- latin small letter thorn, */
23.301 + "ÿ", "ÿ", "y", /* -- latin small letter y with diaeresis, */
23.302 + "", "" };
23.303 +
23.304 +/* ---- list of special characters (numeric values are ASCII codes) ---- */
23.305 +#define CHAR_SPACE 32 /* ' ' */
23.306 +#define CHAR_TAB 9 /* '\t' */
23.307 +#define CHAR_LF 10 /* '\n' */
23.308 +#define CHAR_CR 13 /* '\r' */
23.309 +#define CHAR_DQUOTE 34 /* double quote */
23.310 +#define CHAR_SQUOTE 39 /* single quote (apostrophe) */
23.311 +#define CHAR_OPEN_SQUOTE 96 /* backtick, used as an opening single quote */
23.312 +#define CHAR_TILDE 126 /* '~' */
23.313 +#define CHAR_ASTERISK 42 /* asterisk */
23.314 +#define CHAR_FORESLASH 47 /* forward slash */
23.315 +#define CHAR_CARAT 94 /* '^' (caret) */
23.316 +
23.317 +#define CHAR_UNDERSCORE '_'
23.318 +#define CHAR_OPEN_CBRACK '{'
23.319 +#define CHAR_CLOSE_CBRACK '}'
23.320 +#define CHAR_OPEN_RBRACK '('
23.321 +#define CHAR_CLOSE_RBRACK ')'
23.322 +#define CHAR_OPEN_SBRACK '['
23.323 +#define CHAR_CLOSE_SBRACK ']'
23.324 +
23.325 +
23.326 +
23.327 +
23.328 +
23.329 +/* ---- longest and shortest normal PG line lengths ----*/
23.330 +#define LONGEST_PG_LINE 75 /* lines longer than this are counted as long */
23.331 +#define WAY_TOO_LONG 80 /* lines longer than this are counted as VERY long */
23.332 +#define SHORTEST_PG_LINE 55
23.333 +
23.334 +#define SWITCHES "ESTPXLOYHWVMUD" /* switch letters; position = index in pswit[] */
23.335 + /* D - ignore DP-specific markup */
23.336 + /* E - echo queried line; on by */
23.337 + /* default, -E turns it off */
23.338 + /* S - check single quotes */
23.339 + /* T - check common typos */
23.340 + /* P - require closure of quotes on */
23.341 + /* every paragraph */
23.342 + /* X - "Trust no one" :-) Paranoid! */
23.343 + /* on by default, -X turns it off */
23.344 + /* L - line end checking defaults on */
23.345 + /* -L turns it off */
23.346 + /* O - overview. Just shows counts. */
23.347 + /* Y - puts errors to stdout */
23.348 + /* instead of stderr */
23.349 + /* H - Echoes header fields */
23.350 + /* M - Ignore markup in < > */
23.351 + /* U - Use file of User-defined Typos*/
23.352 + /* W - Defaults for use on Web upload*/
23.353 +#define SWITNO 14 /* number of switch letters in SWITCHES */
23.354 + /* - used for defining array-size */
23.355 +#define MINARGS 1 /* minimum no of args excl switches */
23.356 +#define MAXARGS 1 /* maximum no of args excl switches */
23.357 +
23.358 +int pswit[SWITNO]; /* program switches set by SWITCHES */
23.359 +
23.360 +#define ECHO_SWITCH 0 /* 'E' */
23.361 +#define SQUOTE_SWITCH 1 /* 'S' */
23.362 +#define TYPO_SWITCH 2 /* 'T' */
23.363 +#define QPARA_SWITCH 3 /* 'P' */
23.364 +#define PARANOID_SWITCH 4 /* 'X' */
23.365 +#define LINE_END_SWITCH 5 /* 'L' */
23.366 +#define OVERVIEW_SWITCH 6 /* 'O' */
23.367 +#define STDOUT_SWITCH 7 /* 'Y' */
23.368 +#define HEADER_SWITCH 8 /* 'H' */
23.369 +#define WEB_SWITCH 9 /* 'W' */
23.370 +#define VERBOSE_SWITCH 10 /* 'V' - Verbose - list EVERYTHING! */
23.371 +#define MARKUP_SWITCH 11 /* 'M' */
23.372 +#define USERTYPO_SWITCH 12 /* 'U' */
23.373 +#define DP_SWITCH 13 /* 'D' */
23.374 +
23.375 +
23.376 +
23.377 +long cnt_dquot; /* for overview mode, count of doublequote queries */
23.378 +long cnt_squot; /* for overview mode, count of singlequote queries */
23.379 +long cnt_brack; /* for overview mode, count of brackets queries */
23.380 +long cnt_bin; /* for overview mode, count of non-ASCII queries */
23.381 +long cnt_odd; /* for overview mode, count of odd character queries */
23.382 +long cnt_long; /* for overview mode, count of long line errors */
23.383 +long cnt_short; /* for overview mode, count of short line queries */
23.384 +long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
23.385 +long cnt_dash; /* for overview mode, count of dash-related queries */
23.386 +long cnt_word; /* for overview mode, count of word queries */
23.387 +long cnt_html; /* for overview mode, count of html queries */
23.388 +long cnt_lineend; /* for overview mode, count of line-end queries */
23.389 +long cnt_spacend; /* count of lines with space at end V .21 */
23.390 +long linecnt; /* count of total lines in the file */
23.391 +long checked_linecnt; /* count of lines actually gutchecked V .26 */
23.392 +
23.393 +void proghelp(void); /* called by main when the argument count is wrong */
23.394 +void procfile(char *); /* process one file, given its name */
23.395 +
23.396 +#define LOW_THRESHOLD 0
23.397 +#define HIGH_THRESHOLD 1
23.398 +
23.399 +#define START 0
23.400 +#define END 1
23.401 +#define PREV 0
23.402 +#define NEXT 1
23.403 +#define FIRST_OF_PAIR 0
23.404 +#define SECOND_OF_PAIR 1
23.405 +
23.406 +#define MAX_WORDPAIR 1000
23.407 +
23.408 +char running_from[MAX_PATH]; /* directory part of argv[0], filled in by main */
23.409 +
23.410 +int mixdigit(char *);
23.411 +char *getaword(char *, char *);
23.412 +int matchword(char *, char *);
23.413 +char *flgets(char *, int, FILE *, long);
23.414 +void lowerit(char *);
23.415 +int gcisalpha(unsigned char);
23.416 +int gcisdigit(unsigned char);
23.417 +int gcisletter(unsigned char);
23.418 +char *gcstrchr(char *s, char c);
23.419 +void postprocess_for_HTML(char *);
23.420 +char *linehasmarkup(char *);
23.421 +char *losemarkup(char *);
23.422 +int tagcomp(char *, char *);
23.423 +char *loseentities(char *);
23.424 +int isroman(char *);
23.425 +int usertypo_count; /* number of user typos main has stored in usertypo[] */
23.426 +void postprocess_for_DP(char *);
23.427 +
23.428 +char wrk[LINEBUFSIZE];
23.429 +
23.430 +/* This is disgustingly lazy, predefining max words & lengths, */
23.431 +/* but now I'm out of 16-bit restrictions, what's a couple of K? */
23.432 +#define MAX_QWORD 50
23.433 +#define MAX_QWORD_LENGTH 40
23.434 +char qword[MAX_QWORD][MAX_QWORD_LENGTH]; /* MAX_QWORD entries of up to MAX_QWORD_LENGTH chars */
23.435 +char qperiod[MAX_QWORD][MAX_QWORD_LENGTH]; /* same shape as qword */
23.436 +signed int dupcnt[MAX_QWORD]; /* NOTE(review): presumably one count per qword entry - confirm */
23.438 +
23.439 +/* main - parse the command-line switches into pswit[], optionally load the user typo */
23.440 +/* file, then check the one file named; returns 0, or 1 (after proghelp) on bad arg count */
23.441 +int main(int argc, char **argv)
23.442 +{
23.443 + char *argsw, *s;
23.444 + int i, switno, invarg;
23.445 + char usertypo_file[MAX_PATH];
23.446 + FILE *usertypofile;
23.447 +
23.448 +
23.449 + if (strlen(argv[0]) < sizeof(running_from))
23.450 + strcpy(running_from, argv[0]); /* save the path to the executable gutcheck */
23.451 +
23.452 + /* find out what directory we're running from: blank everything after the last path separator, never reading before the buffer */
23.453 + for (s = running_from + strlen(running_from); s > running_from && s[-1] != '/' && s[-1] != '\\'; s--)
23.454 + s[-1] = 0;
23.455 +
23.456 +
23.457 + switno = strlen(SWITCHES);
23.458 + for (i = switno ; --i >= 0 ; )
23.459 + pswit[i] = 0; /* initialise switches, pswit[0] included */
23.460 +
23.461 + /* Standard loop to extract switches. */
23.462 + /* When we come out of this loop, the arguments will be */
23.463 + /* in argv[0] upwards and the switches used will be */
23.464 + /* represented by their equivalent elements in pswit[] */
23.465 + while ( --argc > 0 && **++argv == '-')
23.466 + for (argsw = argv[0]+1; *argsw !='\0'; argsw++)
23.467 + for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; )
23.468 + if ((toupper((unsigned char)*argsw)) == SWITCHES[i] ) { /* cast: toupper is undefined for negative char values */
23.469 + invarg = 0;
23.470 + pswit[i] = 1;
23.471 + }
23.472 +
23.473 + pswit[PARANOID_SWITCH] ^= 1; /* Paranoid checking is turned OFF, not on, by its switch */
23.474 +
23.475 + if (pswit[PARANOID_SWITCH]) { /* if running in paranoid mode */
23.476 + pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1; /* typo checks default ON here; -T turns them off */
23.477 + } /* v.20 removed s and p switches from paranoid mode */
23.478 +
23.479 + pswit[LINE_END_SWITCH] ^= 1; /* Line-end checking is turned OFF, not on, by its switch */
23.480 + pswit[ECHO_SWITCH] ^= 1; /* V.21 Echoing is turned OFF, not on, by its switch */
23.481 +
23.482 + if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */
23.483 + pswit[ECHO_SWITCH] = 0;
23.484 +
23.485 + /* Web uploads - for the moment, this is really just a placeholder */
23.486 + /* until we decide what processing we really want to do on web uploads */
23.487 + if (pswit[WEB_SWITCH]) { /* specific override for web uploads */
23.488 + pswit[ECHO_SWITCH] = 1;
23.489 + pswit[SQUOTE_SWITCH] = 0;
23.490 + pswit[TYPO_SWITCH] = 1;
23.491 + pswit[QPARA_SWITCH] = 0;
23.492 + pswit[PARANOID_SWITCH] = 1;
23.493 + pswit[LINE_END_SWITCH] = 0;
23.494 + pswit[OVERVIEW_SWITCH] = 0;
23.495 + pswit[STDOUT_SWITCH] = 0;
23.496 + pswit[HEADER_SWITCH] = 1;
23.497 + pswit[VERBOSE_SWITCH] = 0;
23.498 + pswit[MARKUP_SWITCH] = 0;
23.499 + pswit[USERTYPO_SWITCH] = 0;
23.500 + pswit[DP_SWITCH] = 0;
23.501 + }
23.502 +
23.503 +
23.504 + if (argc < MINARGS || argc > MAXARGS) { /* check number of args */
23.505 + proghelp();
23.506 + return(1); /* exit */
23.507 + }
23.508 +
23.509 +
23.510 + /* read in the user-defined stealth scanno list */
23.511 +
23.512 + if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */
23.513 + if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) { /* not in cwd. try gutcheck directory. */
23.514 + *usertypo_file = 0; /* left empty (so the fopen below fails) if the combined path would not fit */
23.515 + if (strlen(running_from) + strlen(USERTYPO_FILE) < sizeof(usertypo_file)) { strcpy(usertypo_file, running_from); strcat(usertypo_file, USERTYPO_FILE); }
23.516 + if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) { /* we ain't got no user typo file! */
23.517 + printf(" --> I couldn't find gutcheck.typ -- proceeding without user typos.\n");
23.518 + }
23.519 + }
23.520 +
23.521 + usertypo_count = 0;
23.522 + if (usertypofile) { /* we managed to open a User Typo File! */
23.523 + if (pswit[USERTYPO_SWITCH]) {
23.524 + while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) {
23.525 + if (strlen(aline) > 1) {
23.526 + if ((int)*aline > 33) {
23.527 + s = malloc(strlen(aline)+1);
23.528 + if (!s) {
23.529 + fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n");
23.530 + exit(1);
23.531 + }
23.532 + strcpy(s, aline);
23.533 + usertypo[usertypo_count] = s;
23.534 + usertypo_count++;
23.535 + if (usertypo_count >= MAX_USER_TYPOS) {
23.536 + printf(" --> Only %d user-defined typos allowed: ignoring the rest\n", (int)MAX_USER_TYPOS);
23.537 + break;
23.538 + }
23.539 + }
23.540 + }
23.541 + }
23.542 + }
23.543 + fclose(usertypofile);
23.544 + }
23.545 + }
23.546 +
23.547 +
23.548 +
23.549 +
23.550 + fprintf(stderr, "gutcheck: Check and report on an e-text\n");
23.551 +
23.552 + cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long =
23.553 + cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend =
23.554 + cnt_spacend = 0;
23.555 +
23.556 + procfile(argv[0]);
23.557 +
23.558 + if (pswit[OVERVIEW_SWITCH]) {
23.559 + printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
23.560 + checked_linecnt, linecnt, linecnt - checked_linecnt);
23.561 + printf(" --------------- Queries found --------------\n");
23.562 + if (cnt_long) printf(" Long lines: %5ld\n",cnt_long);
23.563 + if (cnt_short) printf(" Short lines: %5ld\n",cnt_short);
23.564 + if (cnt_lineend) printf(" Line-end problems: %5ld\n",cnt_lineend);
23.565 + if (cnt_word) printf(" Common typos: %5ld\n",cnt_word);
23.566 + if (cnt_dquot) printf(" Unmatched quotes: %5ld\n",cnt_dquot);
23.567 + if (cnt_squot) printf(" Unmatched SingleQuotes: %5ld\n",cnt_squot);
23.568 + if (cnt_brack) printf(" Unmatched brackets: %5ld\n",cnt_brack);
23.569 + if (cnt_bin) printf(" Non-ASCII characters: %5ld\n",cnt_bin);
23.570 + if (cnt_odd) printf(" Proofing characters: %5ld\n",cnt_odd);
23.571 + if (cnt_punct) printf(" Punctuation & spacing queries: %5ld\n",cnt_punct);
23.572 + if (cnt_dash) printf(" Non-standard dashes: %5ld\n",cnt_dash);
23.573 + if (cnt_html) printf(" Possible HTML tags: %5ld\n",cnt_html);
23.574 + printf("\n");
23.575 + printf(" TOTAL QUERIES %5ld\n",
23.576 + cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long +
23.577 + cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend);
23.578 + }
23.579 +
23.580 + return(0);
23.581 +}
23.582 +
23.583 +
23.584 +
23.585 +/* procfile - process one file */
23.586 +
23.587 +void procfile(char *filename)
23.588 +{
23.589 +
23.590 + char *s, *t, *s1, laststart, *wordstart;
23.591 + char inword[MAXWORDLEN], testword[MAXWORDLEN];
23.592 + char parastart[81]; /* first line of current para */
23.593 + FILE *infile;
23.594 + long quot, squot, firstline, alphalen, totlen, binlen,
23.595 + shortline, longline, verylongline, spacedash, emdash,
23.596 + space_emdash, non_PG_space_emdash, PG_space_emdash,
23.597 + footerline, dotcomma, start_para_line, astline, fslashline,
23.598 + standalone_digit, hyphens, htmcount, endquote_count;
23.599 + long spline, nspline;
23.600 + signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower,
23.601 + eNon_A, eTab, eTilde, eAst, eFSlash, eCarat;
23.602 + signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma,
23.603 + warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote;
23.604 + unsigned int lastlen, lastblen;
23.605 + signed int s_brack, c_brack, r_brack, c_unders;
23.606 + signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar;
23.607 + signed int isnewpara, vowel, consonant;
23.608 + char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80],
23.609 + unders_err[80];
23.610 + signed int qword_index, qperiod_index, isdup;
23.611 + signed int enddash;
23.612 + signed int Dutchcount, isDutch, Frenchcount, isFrench;
23.613 +
23.614 +
23.615 +
23.616 +
23.617 +
23.618 + laststart = CHAR_SPACE;
23.619 + lastlen = lastblen = 0;
23.620 + *dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err =
23.621 + *unders_err = *prevline = 0;
23.622 + linecnt = firstline = alphalen = totlen = binlen =
23.623 + shortline = longline = spacedash = emdash = checked_linecnt =
23.624 + space_emdash = non_PG_space_emdash = PG_space_emdash =
23.625 + footerline = dotcomma = start_para_line = astline = fslashline =
23.626 + standalone_digit = hyphens = htmcount = endquote_count = 0;
23.627 + quot = squot = s_brack = c_brack = r_brack = c_unders = 0;
23.628 + i = llen = isemptyline = isacro = isellipsis = istypo = 0;
23.629 + warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma =
23.630 + warn_ast = warn_fslash = warn_digit = warn_endquote = 0;
23.631 + isnewpara = vowel = consonant = enddash = 0;
23.632 + spline = nspline = 0;
23.633 + qword_index = qperiod_index = isdup = 0;
23.634 + *inword = *testword = 0;
23.635 + open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0;
23.636 + Dutchcount = isDutch = Frenchcount = isFrench = 0;
23.637 +
23.638 +
23.639 + for (j = 0; j < MAX_QWORD; j++) {
23.640 + dupcnt[j] = 0;
23.641 + for (i = 0; i < MAX_QWORD_LENGTH; i++)
23.642 + qword[i][j] = 0;
23.643 + qperiod[i][j] = 0;
23.644 + }
23.645 +
23.646 +
23.647 + if ((infile = fopen(filename, "rb")) == NULL) {
23.648 + if (pswit[STDOUT_SWITCH])
23.649 + fprintf(stdout, "gutcheck: cannot open %s\n", filename);
23.650 + else
23.651 + fprintf(stderr, "gutcheck: cannot open %s\n", filename);
23.652 + exit(1);
23.653 + }
23.654 +
23.655 + fprintf(stdout, "\n\nFile: %s\n\n", filename);
23.656 + firstline = shortline = longline = verylongline = 0;
23.657 +
23.658 +
23.659 + /*****************************************************/
23.660 + /* */
23.661 + /* Run a first pass - verify that it's a valid PG */
23.662 + /* file, decide whether to report some things that */
23.663 + /* occur many times in the text like long or short */
23.664 + /* lines, non-standard dashes, and other good stuff */
23.665 + /* I'll doubtless think of later. */
23.666 + /* */
23.667 + /*****************************************************/
23.668 +
23.669 + /*****************************************************/
23.670 + /* V.24 Sigh. Yet Another Header Change */
23.671 + /*****************************************************/
23.672 +
23.673 + while (fgets(aline, LINEBUFSIZE-1, infile)) {
23.674 + while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0;
23.675 + linecnt++;
23.676 + if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) {
23.677 + if (spline)
23.678 + printf(" --> Duplicate header?\n");
23.679 + spline = linecnt + 1; /* first line of non-header text, that is */
23.680 + }
23.681 + if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) {
23.682 + if (nspline)
23.683 + printf(" --> Duplicate header?\n");
23.684 + nspline = linecnt + 1; /* first line of non-header text, that is */
23.685 + }
23.686 + if (spline || nspline) {
23.687 + lowerit(aline);
23.688 + if (strstr(aline, "end") && strstr(aline, "project gutenberg")) {
23.689 + if (strstr(aline, "end") < strstr(aline, "project gutenberg")) {
23.690 + if (footerline) {
23.691 + if (!nspline) /* it's an old-form header - we can detect duplicates */
23.692 + printf(" --> Duplicate footer?\n");
23.693 + else
23.694 + ;
23.695 + }
23.696 + else {
23.697 + footerline = linecnt;
23.698 + }
23.699 + }
23.700 + }
23.701 + }
23.702 + if (spline) firstline = spline;
23.703 + if (nspline) firstline = nspline; /* override with new */
23.704 +
23.705 + if (footerline) continue; /* 0.99+ don't count the boilerplate in the footer */
23.706 +
23.707 + llen = strlen(aline);
23.708 + totlen += llen;
23.709 + for (i = 0; i < llen; i++) {
23.710 + if ((unsigned char)aline[i] > 127) binlen++;
23.711 + if (gcisalpha(aline[i])) alphalen++;
23.712 + if (i > 0)
23.713 + if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1]))
23.714 + endquote_count++;
23.715 + }
23.716 + if (strlen(aline) > 2
23.717 + && lastlen > 2 && lastlen < SHORTEST_PG_LINE
23.718 + && lastblen > 2 && lastblen > SHORTEST_PG_LINE
23.719 + && laststart != CHAR_SPACE)
23.720 + shortline++;
23.721 +
23.722 + if (*aline) /* fixed line below for 0.96 */
23.723 + if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++;
23.724 +
23.725 + if (strstr(aline, ".,")) dotcomma++;
23.726 + /* 0.98 only count ast lines for ignoring purposes where there is */
23.727 + /* locase text on the line */
23.728 + if (strstr(aline, "*")) {
23.729 + for (s = aline; *s; s++)
23.730 + if (*s >='a' && *s <= 'z')
23.731 + break;
23.732 + if (*s) astline++;
23.733 + }
23.734 + if (strstr(aline, "/"))
23.735 + fslashline++;
23.736 + for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
23.737 + if (aline[i] == '-' && aline[i-1] != '-') hyphens++;
23.738 +
23.739 + if (llen > LONGEST_PG_LINE) longline++;
23.740 + if (llen > WAY_TOO_LONG) verylongline++;
23.741 +
23.742 + if (strstr(aline, "<") && strstr(aline, ">")) {
23.743 + i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
23.744 + if (i > 0)
23.745 + htmcount++;
23.746 + if (strstr(aline, "<i>")) htmcount +=4; /* bonus marks! */
23.747 + }
23.748 +
23.749 + /* Check for spaced em-dashes */
23.750 + if (strstr(aline,"--")) {
23.751 + emdash++;
23.752 + if (*(strstr(aline, "--")-1) == CHAR_SPACE ||
23.753 + (*(strstr(aline, "--")+2) == CHAR_SPACE))
23.754 + space_emdash++;
23.755 + if (*(strstr(aline, "--")-1) == CHAR_SPACE &&
23.756 + (*(strstr(aline, "--")+2) == CHAR_SPACE))
23.757 + non_PG_space_emdash++; /* count of em-dashes with spaces both sides */
23.758 + if (*(strstr(aline, "--")-1) != CHAR_SPACE &&
23.759 + (*(strstr(aline, "--")+2) != CHAR_SPACE))
23.760 + PG_space_emdash++; /* count of PG-type em-dashes with no spaces */
23.761 + }
23.762 +
23.763 + for (s = aline; *s;) {
23.764 + s = getaword(s, inword);
23.765 + if (!strcmp(inword, "hij") || !strcmp(inword, "niet"))
23.766 + Dutchcount++;
23.767 + if (!strcmp(inword, "dans") || !strcmp(inword, "avec"))
23.768 + Frenchcount++;
23.769 + if (!strcmp(inword, "0") || !strcmp(inword, "1"))
23.770 + standalone_digit++;
23.771 + }
23.772 +
23.773 + /* Check for spaced dashes */
23.774 + if (strstr(aline," -"))
23.775 + if (*(strstr(aline, " -")+2) != '-')
23.776 + spacedash++;
23.777 + lastblen = lastlen;
23.778 + lastlen = strlen(aline);
23.779 + laststart = aline[0];
23.780 +
23.781 + }
23.782 + fclose(infile);
23.783 +
23.784 +
23.785 + /* now, based on this quick view, make some snap decisions */
23.786 + if (cnt_spacend > 0) {
23.787 + printf(" --> %ld lines in this file have white space at end\n", cnt_spacend);
23.788 + }
23.789 +
23.790 + warn_dotcomma = 1;
23.791 + if (dotcomma > 5) {
23.792 + warn_dotcomma = 0;
23.793 + printf(" --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma);
23.794 + }
23.795 +
23.796 + /* if more than 50 lines, or one-tenth, are short, don't bother reporting them */
23.797 + warn_short = 1;
23.798 + if (shortline > 50 || shortline * 10 > linecnt) {
23.799 + warn_short = 0;
23.800 + printf(" --> %ld lines in this file are short. Not reporting short lines.\n", shortline);
23.801 + }
23.802 +
23.803 + /* if more than 50 lines, or one-tenth, are long, don't bother reporting them */
23.804 + warn_long = 1;
23.805 + if (longline > 50 || longline * 10 > linecnt) {
23.806 + warn_long = 0;
23.807 + printf(" --> %ld lines in this file are long. Not reporting long lines.\n", longline);
23.808 + }
23.809 +
23.810 + /* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */
23.811 + warn_ast = 1;
23.812 + if (astline > 10 ) {
23.813 + warn_ast = 0;
23.814 + printf(" --> %ld lines in this file contain asterisks. Not reporting them.\n", astline);
23.815 + }
23.816 +
23.817 + /* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */
23.818 + warn_fslash = 1;
23.819 + if (fslashline > 10 ) {
23.820 + warn_fslash = 0;
23.821 + printf(" --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline);
23.822 + }
23.823 +
23.824 + /* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */
23.825 + warn_endquote = 1;
23.826 + if (endquote_count > 20 ) {
23.827 + warn_endquote = 0;
23.828 + printf(" --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count);
23.829 + }
23.830 +
23.831 + /* if more than 10 lines contain standalone digits, don't bother reporting them V.0.97 */
23.832 + warn_digit = 1;
23.833 + if (standalone_digit > 10 ) {
23.834 + warn_digit = 0;
23.835 + printf(" --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit);
23.836 + }
23.837 +
23.838 + /* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */
23.839 + warn_hyphen = 1;
23.840 + if (hyphens > 20 ) {
23.841 + warn_hyphen = 0;
23.842 + printf(" --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens);
23.843 + }
23.844 +
23.845 + if (htmcount > 20 && !pswit[MARKUP_SWITCH]) {
23.846 + printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
23.847 + pswit[MARKUP_SWITCH] = 1;
23.848 + }
23.849 +
23.850 + if (verylongline > 0) {
23.851 + printf(" --> %ld lines in this file are VERY long!\n", verylongline);
23.852 + }
23.853 +
23.854 + /* If there are more non-PG spaced dashes than PG em-dashes, */
23.855 + /* assume it's deliberate */
23.856 + /* Current PG guidelines say don't use them, but older texts do,*/
23.857 + /* and some people insist on them whatever the guidelines say. */
23.858 + /* V.20 removed requirement that PG_space_emdash be greater than*/
23.859 + /* ten before turning off warnings about spaced dashes. */
23.860 + warn_dash = 1;
23.861 + if (spacedash + non_PG_space_emdash > PG_space_emdash) {
23.862 + warn_dash = 0;
23.863 + printf(" --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash);
23.864 + }
23.865 +
23.866 + /* if more than a quarter of characters are hi-bit, bug out */
23.867 + warn_bin = 1;
23.868 + if (binlen * 4 > totlen) {
23.869 + printf(" --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n");
23.870 + exit(1);
23.871 + }
23.872 + if (alphalen * 4 < totlen) {
23.873 + printf(" --> This file does not appear to be text. Terminating. Best of luck with it!\n");
23.874 + exit(1);
23.875 + }
23.876 + if ((binlen * 100 > totlen) || (binlen > 100)) {
23.877 + printf(" --> There are a lot of foreign letters here. Not reporting them.\n");
23.878 + warn_bin = 0;
23.879 + }
23.880 +
23.881 + /* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */
23.882 + isDutch = 0;
23.883 + if (Dutchcount > 50) {
23.884 + isDutch = 1;
23.885 + printf(" --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n");
23.886 + }
23.887 +
23.888 + isFrench = 0;
23.889 + if (Frenchcount > 50) {
23.890 + isFrench = 1;
23.891 + printf(" --> This looks like French - switching off some doublepunct.\n");
23.892 + }
23.893 +
23.894 + if (firstline && footerline)
23.895 + printf(" The PG header and footer appear to be already on.\n");
23.896 + else {
23.897 + if (firstline)
23.898 + printf(" The PG header is on - no footer.\n");
23.899 + if (footerline)
23.900 + printf(" The PG footer is on - no header.\n");
23.901 + }
23.902 + printf("\n");
23.903 +
23.904 + /* V.22 George Davis asked for an override switch to force it to list everything */
23.905 + if (pswit[VERBOSE_SWITCH]) {
23.906 + warn_bin = 1;
23.907 + warn_short = 1;
23.908 + warn_dotcomma = 1;
23.909 + warn_long = 1;
23.910 + warn_dash = 1;
23.911 + warn_digit = 1;
23.912 + warn_ast = 1;
23.913 + warn_fslash = 1;
23.914 + warn_hyphen = 1;
23.915 + warn_endquote = 1;
23.916 + printf(" *** Verbose output is ON -- you asked for it! ***\n");
23.917 + }
23.918 +
23.919 + if (isDutch)
23.920 + warn_dash = 0; /* Frank suggested turning it REALLY off for Dutch */
23.921 +
23.922 + if ((infile = fopen(filename, "rb")) == NULL) {
23.923 + if (pswit[STDOUT_SWITCH])
23.924 + fprintf(stdout, "gutcheck: cannot open %s\n", filename);
23.925 + else
23.926 + fprintf(stderr, "gutcheck: cannot open %s\n", filename);
23.927 + exit(1);
23.928 + }
23.929 +
23.930 + if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */
23.931 + printf(" --> I don't really know where this text starts. \n");
23.932 + printf(" There are no reference points.\n");
23.933 + printf(" I'm going to have to report the header and footer as well.\n");
23.934 + firstline=0;
23.935 + }
23.936 +
23.937 +
23.938 +
23.939 + /*****************************************************/
23.940 + /* */
23.941 + /* Here we go with the main pass. Hold onto yer hat! */
23.942 + /* */
23.943 + /*****************************************************/
23.944 +
23.945 + /* Re-init some variables we've dirtied */
23.946 + quot = squot = linecnt = 0;
23.947 + laststart = CHAR_SPACE;
23.948 + lastlen = lastblen = 0;
23.949 +
23.950 + while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) {
23.951 + linecnt++;
23.952 + if (linecnt == 1) isnewpara = 1;
23.953 + if (pswit[DP_SWITCH])
23.954 + if (!strncmp(aline, "-----File: ", 11))
23.955 + continue; // skip DP page separators completely
23.956 + if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) {
23.957 + if (pswit[HEADER_SWITCH]) {
23.958 + if (!strncmp(aline, "Title:", 6))
23.959 + printf(" %s\n", aline);
23.960 + if (!strncmp (aline, "Author:", 7))
23.961 + printf(" %s\n", aline);
23.962 + if (!strncmp(aline, "Release Date:", 13))
23.963 + printf(" %s\n", aline);
23.964 + if (!strncmp(aline, "Edition:", 8))
23.965 + printf(" %s\n\n", aline);
23.966 + }
23.967 + continue; /* skip through the header */
23.968 + }
23.969 + checked_linecnt++;
23.970 + s = aline;
23.971 + isemptyline = 1; /* assume the line is empty until proven otherwise */
23.972 +
23.973 + /* If we are in a state of unbalanced quotes, and this line */
23.974 + /* doesn't begin with a quote, output the stored error message */
23.975 + /* If the -P switch was used, print the warning even if the */
23.976 + /* new para starts with quotes */
23.977 + /* Version .20 - if the new paragraph does start with a quote, */
23.978 + /* but is indented, I was giving a spurious error. Need to */
23.979 + /* check the first _non-space_ character on the line rather */
23.980 + /* than the first character when deciding whether the para */
23.981 + /* starts with a quote. Using *t for this. */
23.982 + t = s;
23.983 + while (*t == ' ') t++;
23.984 + if (*dquote_err)
23.985 + if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) {
23.986 + if (!pswit[OVERVIEW_SWITCH]) {
23.987 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
23.988 + printf(dquote_err);
23.989 + }
23.990 + else
23.991 + cnt_dquot++;
23.992 + }
23.993 + if (*squote_err) {
23.994 + if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) {
23.995 + if (!pswit[OVERVIEW_SWITCH]) {
23.996 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
23.997 + printf(squote_err);
23.998 + }
23.999 + else
23.1000 + cnt_squot++;
23.1001 + }
23.1002 + squot = 0;
23.1003 + }
23.1004 + if (*rbrack_err) {
23.1005 + if (!pswit[OVERVIEW_SWITCH]) {
23.1006 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
23.1007 + printf(rbrack_err);
23.1008 + }
23.1009 + else
23.1010 + cnt_brack++;
23.1011 + }
23.1012 + if (*sbrack_err) {
23.1013 + if (!pswit[OVERVIEW_SWITCH]) {
23.1014 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
23.1015 + printf(sbrack_err);
23.1016 + }
23.1017 + else
23.1018 + cnt_brack++;
23.1019 + }
23.1020 + if (*cbrack_err) {
23.1021 + if (!pswit[OVERVIEW_SWITCH]) {
23.1022 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
23.1023 + printf(cbrack_err);
23.1024 + }
23.1025 + else
23.1026 + cnt_brack++;
23.1027 + }
23.1028 + if (*unders_err) {
23.1029 + if (!pswit[OVERVIEW_SWITCH]) {
23.1030 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
23.1031 + printf(unders_err);
23.1032 + }
23.1033 + else
23.1034 + cnt_brack++;
23.1035 + }
23.1036 +
23.1037 + *dquote_err = *squote_err = *rbrack_err = *cbrack_err =
23.1038 + *sbrack_err = *unders_err = 0;
23.1039 +
23.1040 +
23.1041 + /* look along the line, accumulate the count of quotes, and see */
23.1042 + /* if this is an empty line - i.e. a line with nothing on it */
23.1043 + /* but spaces. */
23.1044 + /* V .12 also if line has just spaces, * and/or - on it, don't */
23.1045 + /* count it, since empty lines with asterisks or dashes to */
23.1046 + /* separate sections are common. */
23.1047 + /* V .15 new single-quote checking - has to be better than the */
23.1048 + /* previous version, but how much better? fingers crossed! */
23.1049 + /* V .20 add period to * and - as characters on a separator line*/
23.1050 + s = aline;
23.1051 + while (*s) {
23.1052 + if (*s == CHAR_DQUOTE) quot++;
23.1053 + if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
23.1054 + if (s == aline) { /* at start of line, it can only be an openquote */
23.1055 + if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */
23.1056 + open_single_quote++;
23.1057 + }
23.1058 + else
23.1059 + if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
23.1060 + ; /* do nothing! - it's definitely an apostrophe, not a quote */
23.1061 + else /* it's outside a word - let's check it out */
23.1062 + if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */
23.1063 + if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */
23.1064 + open_single_quote++;
23.1065 + }
23.1066 + else { /* now - is it a closequote? */
23.1067 + guessquote = 0; /* accumulate clues */
23.1068 + if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */
23.1069 + guessquote += 1;
23.1070 + if (*(s-1) == 's') { /* looks like a plural apostrophe */
23.1071 + guessquote -= 3;
23.1072 + if (*(s+1) == CHAR_SPACE) /* bonus marks! */
23.1073 + guessquote -= 2;
23.1074 + }
23.1075 + }
23.1076 + else /* it doesn't have a letter either side */
23.1077 + if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1))))
23.1078 + guessquote += 8; /* looks like a closequote */
23.1079 + else
23.1080 + guessquote += 1;
23.1081 + if (open_single_quote > close_single_quote)
23.1082 + guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */
23.1083 + else
23.1084 + guessquote -= 1;
23.1085 + if (guessquote >= 0)
23.1086 + close_single_quote++;
23.1087 + }
23.1088 +
23.1089 + if (*s != CHAR_SPACE
23.1090 + && *s != '-'
23.1091 + && *s != '.'
23.1092 + && *s != CHAR_ASTERISK
23.1093 + && *s != 13
23.1094 + && *s != 10) isemptyline = 0; /* ignore lines like * * * as spacers */
23.1095 + if (*s == CHAR_UNDERSCORE) c_unders++;
23.1096 + if (*s == CHAR_OPEN_CBRACK) c_brack++;
23.1097 + if (*s == CHAR_CLOSE_CBRACK) c_brack--;
23.1098 + if (*s == CHAR_OPEN_RBRACK) r_brack++;
23.1099 + if (*s == CHAR_CLOSE_RBRACK) r_brack--;
23.1100 + if (*s == CHAR_OPEN_SBRACK) s_brack++;
23.1101 + if (*s == CHAR_CLOSE_SBRACK) s_brack--;
23.1102 + s++;
23.1103 + }
23.1104 +
23.1105 + if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph */
23.1106 + start_para_line = linecnt;
23.1107 + strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */
23.1108 + parastart[79] = 0;
23.1109 + dquotepar = squotepar = 0; /* restart the quote count 0.98 */
23.1110 + s = aline;
23.1111 + while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++; /* V.97 fixed bug - overran line and gave false warning - rare */
23.1112 + if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */
23.1113 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1114 + if (!pswit[OVERVIEW_SWITCH])
23.1115 + printf(" Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1);
23.1116 + else
23.1117 + cnt_punct++;
23.1118 + }
23.1119 + isnewpara = 0; /* Signal the end of new para processing */
23.1120 + }
23.1121 +
23.1122 + /* Check for an em-dash broken at line end */
23.1123 + if (enddash && *aline == '-') {
23.1124 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1125 + if (!pswit[OVERVIEW_SWITCH])
23.1126 + printf(" Line %ld column 1 - Broken em-dash?\n", linecnt);
23.1127 + else
23.1128 + cnt_punct++;
23.1129 + }
23.1130 + enddash = 0;
23.1131 + for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--);
23.1132 + if (s >= aline && *s == '-')
23.1133 + enddash = 1;
23.1134 +
23.1135 +
23.1136 + /* Check for invalid or questionable characters in the line */
23.1137 + /* Anything above 127 is invalid for plain ASCII, and */
23.1138 + /* non-printable control characters should also be flagged. */
23.1139 + /* Tabs should generally not be there. */
23.1140 + /* Jan 06, in 0.99: Hm. For some strange reason, I either */
23.1141 + /* never created or deleted the check for unprintable */
23.1142 + /* control characters. They should be reported even if */
23.1143 + /* warn_bin is on, I think, and in full. */
23.1144 +
23.1145 + for (s = aline; *s; s++) {
23.1146 + i = (unsigned char) *s;
23.1147 + if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) {
23.1148 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1149 + if (!pswit[OVERVIEW_SWITCH])
23.1150 + printf(" Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i);
23.1151 + else
23.1152 + cnt_bin++;
23.1153 + }
23.1154 + }
23.1155 +
23.1156 + if (warn_bin) {
23.1157 + eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0; /* don't repeat multiple warnings on one line */
23.1158 + for (s = aline; *s; s++) {
23.1159 + if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) {
23.1160 + i = *s; /* annoying kludge for signed chars */
23.1161 + if (i < 0) i += 256;
23.1162 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1163 + if (!pswit[OVERVIEW_SWITCH])
23.1164 + if (i > 127 && i < 160)
23.1165 + printf(" Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i);
23.1166 + else
23.1167 + printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i);
23.1168 + else
23.1169 + cnt_bin++;
23.1170 + eNon_A = 1;
23.1171 + }
23.1172 + if (!eTab && *s == CHAR_TAB) {
23.1173 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1174 + if (!pswit[OVERVIEW_SWITCH])
23.1175 + printf(" Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1);
23.1176 + else
23.1177 + cnt_odd++;
23.1178 + eTab = 1;
23.1179 + }
23.1180 + if (!eTilde && *s == CHAR_TILDE) { /* often used by OCR software to indicate an unrecognizable character */
23.1181 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1182 + if (!pswit[OVERVIEW_SWITCH])
23.1183 + printf(" Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1);
23.1184 + else
23.1185 + cnt_odd++;
23.1186 + eTilde = 1;
23.1187 + }
23.1188 + if (!eCarat && *s == CHAR_CARAT) {
23.1189 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1190 + if (!pswit[OVERVIEW_SWITCH])
23.1191 + printf(" Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1);
23.1192 + else
23.1193 + cnt_odd++;
23.1194 + eCarat = 1;
23.1195 + }
23.1196 + if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) {
23.1197 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1198 + if (!pswit[OVERVIEW_SWITCH])
23.1199 + printf(" Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1);
23.1200 + else
23.1201 + cnt_odd++;
23.1202 + eFSlash = 1;
23.1203 + }
23.1204 + /* report asterisks only in paranoid mode, since they're often deliberate */
23.1205 + if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) {
23.1206 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1207 + if (!pswit[OVERVIEW_SWITCH])
23.1208 + printf(" Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1);
23.1209 + else
23.1210 + cnt_odd++;
23.1211 + eAst = 1;
23.1212 + }
23.1213 + }
23.1214 + }
23.1215 +
23.1216 + /* Check for line too long */
23.1217 + if (warn_long) {
23.1218 + if (strlen(aline) > LONGEST_PG_LINE) {
23.1219 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1220 + if (!pswit[OVERVIEW_SWITCH])
23.1221 + printf(" Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline));
23.1222 + else
23.1223 + cnt_long++;
23.1224 + }
23.1225 + }
23.1226 +
23.1227 + /* Check for line too short. */
23.1228 + /* This one is a bit trickier to implement: we don't want to */
23.1229 + /* flag the last line of a paragraph for being short, so we */
23.1230 + /* have to wait until we know that our current line is a */
23.1231 + /* "normal" line, then report the _previous_ line if it was too */
23.1232 + /* short. We also don't want to report indented lines like */
23.1233 + /* chapter heads or formatted quotations. We therefore keep */
23.1234 + /* lastlen as the length of the last line examined, and */
23.1235 + /* lastblen as the length of the last but one, and try to */
23.1236 + /* suppress unnecessary warnings by checking that both were of */
23.1237 + /* "normal" length. We keep the first character of the last */
23.1238 + /* line in laststart, and if it was a space, we assume that the */
23.1239 + /* formatting is deliberate. I can't figure out a way to */
23.1240 + /* distinguish something like a quoted verse left-aligned or */
23.1241 + /* the header or footer of a letter from a paragraph of short */
23.1242 + /* lines - maybe if I examined the whole paragraph, and if the */
23.1243 + /* para has less than, say, 8 lines and if all lines are short, */
23.1244 + /* then just assume it's OK? Need to look at some texts to see */
23.1245 + /* how often a formula like this would get the right result. */
23.1246 + /* V0.99 changed the tolerance for length to ignore from 2 to 1 */
23.1247 + if (warn_short) {
23.1248 + if (strlen(aline) > 1
23.1249 + && lastlen > 1 && lastlen < SHORTEST_PG_LINE
23.1250 + && lastblen > 1 && lastblen > SHORTEST_PG_LINE
23.1251 + && laststart != CHAR_SPACE) {
23.1252 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
23.1253 + if (!pswit[OVERVIEW_SWITCH])
23.1254 + printf(" Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline));
23.1255 + else
23.1256 + cnt_short++;
23.1257 + }
23.1258 + }
23.1259 + lastblen = lastlen;
23.1260 + lastlen = strlen(aline);
23.1261 + laststart = aline[0];
23.1262 +
23.1263 + /* look for punctuation at start of line */
23.1264 + if (*aline && strchr(".?!,;:", aline[0])) { /* if it's punctuation */
23.1265 + if (strncmp(". . .", aline, 5)) { /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */
23.1266 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1267 + if (!pswit[OVERVIEW_SWITCH])
23.1268 + printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt);
23.1269 + else
23.1270 + cnt_punct++;
23.1271 + }
23.1272 + }
23.1273 +
23.1274 + /* Check for spaced em-dashes */
23.1275 + /* V.20 must check _all_ occurrences of "--" on the line */
23.1276 + /* hence the loop - even if the first double-dash is OK */
23.1277 + /* there may be another that's wrong later on. */
23.1278 + if (warn_dash) {
23.1279 + s = aline;
23.1280 + while (strstr(s,"--")) {
23.1281 + if (*(strstr(s, "--")-1) == CHAR_SPACE ||
23.1282 + (*(strstr(s, "--")+2) == CHAR_SPACE)) {
23.1283 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1284 + if (!pswit[OVERVIEW_SWITCH])
23.1285 + printf(" Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1);
23.1286 + else
23.1287 + cnt_dash++;
23.1288 + }
23.1289 + s = strstr(s,"--") + 2;
23.1290 + }
23.1291 + }
23.1292 +
23.1293 + /* Check for spaced dashes */
23.1294 + if (warn_dash)
23.1295 + if (strstr(aline," -")) {
23.1296 + if (*(strstr(aline, " -")+2) != '-') {
23.1297 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1298 + if (!pswit[OVERVIEW_SWITCH])
23.1299 + printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1);
23.1300 + else
23.1301 + cnt_dash++;
23.1302 + }
23.1303 + }
23.1304 + else
23.1305 + if (strstr(aline,"- ")) {
23.1306 + if (*(strstr(aline, "- ")-1) != '-') {
23.1307 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1308 + if (!pswit[OVERVIEW_SWITCH])
23.1309 + printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1);
23.1310 + else
23.1311 + cnt_dash++;
23.1312 + }
23.1313 + }
23.1314 +
23.1315 + /* v 0.99 */
23.1316 + /* Check for unmarked paragraphs indicated by separate speakers */
23.1317 + /* May well be false positive: */
23.1318 + /* "Bravo!" "Wonderful!" called the crowd. */
23.1319 + /* but useful all the same. */
23.1320 + s = wrk;
23.1321 + *s = 0;
23.1322 + if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
23.1323 + if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
23.1324 + if (*s) {
23.1325 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1326 + if (!pswit[OVERVIEW_SWITCH])
23.1327 + printf(" Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1);
23.1328 + else
23.1329 + cnt_punct++;
23.1330 + }
23.1331 +
23.1332 +
23.1333 +
23.1334 + /* Check for "to he" and other easy he/be errors */
23.1335 + /* This is a very inadequate effort on the he/be problem, */
23.1336 + /* but the phrase "to he" is always an error, whereas "to */
23.1337 + /* be" is quite common. I chuckle when it does catch one! */
23.1338 + /* Similarly, '"Quiet!", be said.' is a non-be error */
23.1339 + /* V .18 - "to he" is _not_ always an error!: */
23.1340 + /* "Where they went to he couldn't say." */
23.1341 + /* but I'm leaving it in anyway. */
23.1342 + /* V .20 Another false positive: */
23.1343 + /* What would "Cinderella" be without the . . . */
23.1344 + /* and another "If he wants to he can see for himself." */
23.1345 + /* V .21 Added " is be " and " be is " and " be was " */
23.1346 + /* V .99 Added jeebies code -- removed again. */
23.1347 + /* Is jeebies code worth adding? Rare to see he/be */
23.1348 + /* errors with modern OCR. Separate program? Yes! */
23.1349 + /* jeebies does the job without cluttering up this. */
23.1350 + /* We do get a few more queryable pairs from the */
23.1351 + /* project though -- they're cheap to implement. */
23.1352 + /* Also added a column number for guiguts. */
23.1353 +
23.1354 + s = wrk;
23.1355 + *s = 0;
23.1356 + if (strstr(aline," to he ")) s = strstr(aline," to he ");
23.1357 + if (strstr(aline,"\" be ")) s = strstr(aline,"\" be ");
23.1358 + if (strstr(aline,"\", be ")) s = strstr(aline,"\", be ");
23.1359 + if (strstr(aline," is be ")) s = strstr(aline," is be ");
23.1360 + if (strstr(aline," be is ")) s = strstr(aline," be is ");
23.1361 + if (strstr(aline," was be ")) s = strstr(aline," was be ");
23.1362 + if (strstr(aline," be would ")) s = strstr(aline," be would ");
23.1363 + if (strstr(aline," be could ")) s = strstr(aline," be could ");
23.1364 + if (*s) {
23.1365 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1366 + if (!pswit[OVERVIEW_SWITCH])
23.1367 + printf(" Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1);
23.1368 + else
23.1369 + cnt_word++;
23.1370 + }
23.1371 +
23.1372 + s = wrk;
23.1373 + *s = 0;
23.1374 + if (strstr(aline," i bad ")) s = strstr(aline," i bad ");
23.1375 + if (strstr(aline," you bad ")) s = strstr(aline," you bad ");
23.1376 + if (strstr(aline," he bad ")) s = strstr(aline," he bad ");
23.1377 + if (strstr(aline," she bad ")) s = strstr(aline," she bad ");
23.1378 + if (strstr(aline," they bad ")) s = strstr(aline," they bad ");
23.1379 + if (strstr(aline," a had ")) s = strstr(aline," a had ");
23.1380 + if (strstr(aline," the had ")) s = strstr(aline," the had ");
23.1381 + if (*s) {
23.1382 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1383 + if (!pswit[OVERVIEW_SWITCH])
23.1384 + printf(" Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1);
23.1385 + else
23.1386 + cnt_word++;
23.1387 + }
23.1388 +
23.1389 +
23.1390 + /* V .97 Added ", hut " Not too common, hut pretty certain */
23.1391 + /* V.99 changed to add a column number for guiguts */
23.1392 + s = wrk;
23.1393 + *s = 0;
23.1394 + if (strstr(aline,", hut ")) s = strstr(aline,", hut ");
23.1395 + if (strstr(aline,"; hut ")) s = strstr(aline,"; hut ");
23.1396 + if (*s) {
23.1397 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1398 + if (!pswit[OVERVIEW_SWITCH])
23.1399 + printf(" Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1);
23.1400 + else
23.1401 + cnt_word++;
23.1402 + }
23.1403 +
23.1404 + /* Special case - angled bracket in front of "From" placed there by an MTA */
23.1405 + /* when sending an e-mail. V .21 */
23.1406 + if (strstr(aline, ">From")) {
23.1407 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1408 + if (!pswit[OVERVIEW_SWITCH])
23.1409 + printf(" Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1);
23.1410 + else
23.1411 + cnt_punct++;
23.1412 + }
23.1413 +
23.1414 + /* V 0.98 Check for a single character line - often an overflow from bad wrapping. */
23.1415 + if (*aline && !*(aline+1)) {
23.1416 + if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline))
23.1417 + ; /* nothing - ignore numerals alone on a line. */
23.1418 + else {
23.1419 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1420 + if (!pswit[OVERVIEW_SWITCH])
23.1421 + printf(" Line %ld column 1 - Query single character line\n", linecnt);
23.1422 + else
23.1423 + cnt_punct++;
23.1424 + }
23.1425 + }
23.1426 +
23.1427 + /* V 0.98 Check for I" - often should be ! */
23.1428 + if (strstr(aline, " I\"")) {
23.1429 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1430 + if (!pswit[OVERVIEW_SWITCH])
23.1431 + printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline);
23.1432 + else
23.1433 + cnt_punct++;
23.1434 + }
23.1435 +
23.1436 + /* V 0.98 Check for period without a capital letter. Cut-down from gutspell */
23.1437 + /* Only works when it happens on a single line. */
23.1438 +
23.1439 + if (pswit[PARANOID_SWITCH])
23.1440 + for (t = s = aline; strstr(t,". ");) {
23.1441 + t = strstr(t, ". ");
23.1442 + if (t == s) {
23.1443 + t++;
23.1444 + continue; /* start of line punctuation is handled elsewhere */
23.1445 + }
23.1446 + if (!gcisalpha(*(t-1))) {
23.1447 + t++;
23.1448 + continue;
23.1449 + }
23.1450 + if (isDutch) { /* For Frank & Jeroen -- 's Middags case */
23.1451 + if (*(t+2) == CHAR_SQUOTE &&
23.1452 + *(t+3)>='a' && *(t+3)<='z' &&
23.1453 + *(t+4) == CHAR_SPACE &&
23.1454 + *(t+5)>='A' && *(t+5)<='Z') {
23.1455 + t++;
23.1456 + continue;
23.1457 + }
23.1458 + }
23.1459 + s1 = t+2;
23.1460 + while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
23.1461 + s1++;
23.1462 + if (*s1 >= 'a' && *s1 <= 'z') { /* we have something to investigate */
23.1463 + istypo = 1;
23.1464 + for (s1 = t - 1; s1 >= s &&
23.1465 + (gcisalpha(*s1) || gcisdigit(*s1) ||
23.1466 + (*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */
23.1467 + s1++;
23.1468 + for (i = 0; *s1 && *s1 != '.'; s1++, i++)
23.1469 + testword[i] = *s1;
23.1470 + testword[i] = 0;
23.1471 + for (i = 0; *abbrev[i]; i++)
23.1472 + if (!strcmp(testword, abbrev[i]))
23.1473 + istypo = 0;
23.1474 +// if (*testword >= 'A' && *testword <= 'Z')
23.1475 +// istypo = 0;
23.1476 + if (gcisdigit(*testword)) istypo = 0;
23.1477 + if (!*(testword+1)) istypo = 0;
23.1478 + if (isroman(testword)) istypo = 0;
23.1479 + if (istypo) {
23.1480 + istypo = 0;
23.1481 + for (i = 0; testword[i]; i++)
23.1482 + if (strchr(vowels, testword[i]))
23.1483 + istypo = 1;
23.1484 + }
23.1485 + if (istypo) {
23.1486 + isdup = 0;
23.1487 + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
23.1488 + for (i = 0; i < qperiod_index; i++)
23.1489 + if (!strcmp(testword, qperiod[i])) {
23.1490 + isdup = 1;
23.1491 + }
23.1492 + if (!isdup) {
23.1493 + if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
23.1494 + strcpy(qperiod[qperiod_index], testword);
23.1495 + qperiod_index++;
23.1496 + }
23.1497 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1498 + if (!pswit[OVERVIEW_SWITCH])
23.1499 + printf(" Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1);
23.1500 + else
23.1501 + cnt_punct++;
23.1502 + }
23.1503 + }
23.1504 + }
23.1505 + t++;
23.1506 + }
23.1507 +
23.1508 +
23.1509 + if (pswit[TYPO_SWITCH]) { /* Should have put this condition in at the start of 0.99. Duh! */
23.1510 + /* Check for words usually not followed by punctuation 0.99 */
23.1511 + for (s = aline; *s;) {
23.1512 + wordstart = s;
23.1513 + s = getaword(s, inword);
23.1514 + if (!*inword) continue;
23.1515 + lowerit(inword);
23.1516 + for (i = 0; *nocomma[i]; i++)
23.1517 + if (!strcmp(inword, nocomma[i])) {
23.1518 + if (*s == ',' || *s == ';' || *s == ':') {
23.1519 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1520 + if (!pswit[OVERVIEW_SWITCH])
23.1521 + printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
23.1522 + else
23.1523 + cnt_punct++;
23.1524 + }
23.1525 + }
23.1526 + for (i = 0; *noperiod[i]; i++)
23.1527 + if (!strcmp(inword, noperiod[i])) {
23.1528 + if (*s == '.' || *s == '!') {
23.1529 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1530 + if (!pswit[OVERVIEW_SWITCH])
23.1531 + printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
23.1532 + else
23.1533 + cnt_punct++;
23.1534 + }
23.1535 + }
23.1536 + }
23.1537 + }
23.1538 +
23.1539 +
23.1540 +
23.1541 + /* Check for commonly mistyped words, and digits like 0 for O in a word */
23.1542 + for (s = aline; *s;) {
23.1543 + wordstart = s;
23.1544 + s = getaword(s, inword);
23.1545 + if (!*inword) continue; /* don't bother with empty lines */
23.1546 + if (mixdigit(inword)) {
23.1547 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1548 + if (!pswit[OVERVIEW_SWITCH])
23.1549 + printf(" Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword);
23.1550 + else
23.1551 + cnt_word++;
23.1552 + }
23.1553 +
23.1554 + /* put the word through a series of tests for likely typos and OCR errors */
23.1555 + /* V.21 I had allowed lots of typo-checking even with the typo switch */
23.1556 + /* turned off, but I really should disallow reporting of them when */
23.1557 + /* the switch is off. Hence the "if" below. */
23.1558 + if (pswit[TYPO_SWITCH]) {
23.1559 + istypo = 0;
23.1560 + strcpy(testword, inword);
23.1561 + alower = 0;
23.1562 + for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */
23.1563 + if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1;
23.1564 + if (alower && testword[i] >= 'A' && testword[i] <= 'Z') {
23.1565 + /* we have an uppercase mid-word. However, there are common cases: */
23.1566 + /* Mac and Mc like McGill */
23.1567 + /* French contractions like l'Abbe */
23.1568 + if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') ||
23.1569 + (i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') ||
23.1570 + (i > 0 && testword[i-1] == CHAR_SQUOTE))
23.1571 + ; /* do nothing! */
23.1572 +
23.1573 + else { /* V.97 - remove separate case of uppercase within word so that */
23.1574 + /* names like VanAllen fall into qword_index and get reported only once */
23.1575 + istypo = 1;
23.1576 + }
23.1577 + }
23.1578 + testword[i] = (char)tolower(testword[i]);
23.1579 + }
23.1580 +
23.1581 + /* check for certain unlikely two-letter combinations at word start and end */
23.1582 + /* V.0.97 - this replaces individual hardcoded checks in previous versions */
23.1583 + if (strlen(testword) > 1) {
23.1584 + for (i = 0; *nostart[i]; i++)
23.1585 + if (!strncmp(testword, nostart[i], 2))
23.1586 + istypo = 1;
23.1587 + for (i = 0; *noend[i]; i++)
23.1588 + if (!strncmp(testword + strlen(testword) -2, noend[i], 2))
23.1589 + istypo = 1;
23.1590 + }
23.1591 +
23.1592 +
23.1593 + /* ght is common, gbt never. Like that. */
23.1594 + if (strstr(testword, "cb")) istypo = 1;
23.1595 + if (strstr(testword, "gbt")) istypo = 1;
23.1596 + if (strstr(testword, "pbt")) istypo = 1;
23.1597 + if (strstr(testword, "tbs")) istypo = 1;
23.1598 + if (strstr(testword, "mrn")) istypo = 1;
23.1599 + if (strstr(testword, "ahle")) istypo = 1;
23.1600 + if (strstr(testword, "ihle")) istypo = 1;
23.1601 +
23.1602 + /* "TBE" does happen - like HEARTBEAT - but uncommon. */
23.1603 + /* Also "TBI" - frostbite, outbid - but uncommon. */
23.1604 + /* Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals, */
23.1605 + /* but these are covered in V.20. "ii" is a common scanno. */
23.1606 + if (strstr(testword, "tbi")) istypo = 1;
23.1607 + if (strstr(testword, "tbe")) istypo = 1;
23.1608 + if (strstr(testword, "ii")) istypo = 1;
23.1609 +
23.1610 + /* check for no vowels or no consonants. */
23.1611 + /* If none, flag a typo */
23.1612 + if (!istypo && strlen(testword)>1) {
23.1613 + vowel = consonant = 0;
23.1614 + for (i = 0; testword[i]; i++)
23.1615 + if (testword[i] == 'y' || gcisdigit(testword[i])) { /* Yah, this is loose. */
23.1616 + vowel++;
23.1617 + consonant++;
23.1618 + }
23.1619 + else
23.1620 + if (strchr(vowels, testword[i])) vowel++;
23.1621 + else consonant++;
23.1622 + if (!vowel || !consonant) {
23.1623 + istypo = 1;
23.1624 + }
23.1625 + }
23.1626 +
23.1627 + /* now exclude the word from being reported if it's in */
23.1628 + /* the okword list */
23.1629 + for (i = 0; *okword[i]; i++)
23.1630 + if (!strcmp(testword, okword[i]))
23.1631 + istypo = 0;
23.1632 +
23.1633 + /* what looks like a typo may be a Roman numeral. Exclude these */
23.1634 + if (istypo)
23.1635 + if (isroman(testword))
23.1636 + istypo = 0;
23.1637 +
23.1638 + /* check the manual list of typos */
23.1639 + if (!istypo)
23.1640 + for (i = 0; *typo[i]; i++)
23.1641 + if (!strcmp(testword, typo[i]))
23.1642 + istypo = 1;
23.1643 +
23.1644 +
23.1645 + /* V.21 - check lowercase s and l - special cases */
23.1646 + /* V.98 - added "i" and "m" */
23.1647 + /* V.99 - added "j" often a semi-colon gone wrong */
23.1648 + /* - and "d" for a missing apostrophe - he d */
23.1649 + /* - and "n" for "in" */
23.1650 + if (!istypo && strlen(testword) == 1)
23.1651 + if (strchr("slmijdn", *inword))
23.1652 + istypo = 1;
23.1653 +
23.1654 +
23.1655 + if (istypo) {
23.1656 + isdup = 0;
23.1657 + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
23.1658 + for (i = 0; i < qword_index; i++)
23.1659 + if (!strcmp(testword, qword[i])) {
23.1660 + isdup = 1;
23.1661 + ++dupcnt[i];
23.1662 + }
23.1663 + if (!isdup) {
23.1664 + if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
23.1665 + strcpy(qword[qword_index], testword);
23.1666 + qword_index++;
23.1667 + }
23.1668 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1669 + if (!pswit[OVERVIEW_SWITCH]) {
23.1670 + printf(" Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword);
23.1671 + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
23.1672 + printf(" - not reporting duplicates");
23.1673 + printf("\n");
23.1674 + }
23.1675 + else
23.1676 + cnt_word++;
23.1677 + }
23.1678 + }
23.1679 + } /* end of typo-checking */
23.1680 +
23.1681 + /* check the user's list of typos */
23.1682 + if (!istypo)
23.1683 + if (usertypo_count)
23.1684 + for (i = 0; i < usertypo_count; i++)
23.1685 + if (!strcmp(testword, usertypo[i])) {
23.1686 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1687 + if (!pswit[OVERVIEW_SWITCH])
23.1688 + printf(" Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
23.1689 + }
23.1690 +
23.1691 +
23.1692 +
23.1693 + if (pswit[PARANOID_SWITCH] && warn_digit) { /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/
23.1694 + if (!strcmp(inword, "0") || !strcmp(inword, "1")) {
23.1695 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1696 + if (!pswit[OVERVIEW_SWITCH])
23.1697 + printf(" Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
23.1698 + else
23.1699 + cnt_word++;
23.1700 + }
23.1701 + }
23.1702 + }
23.1703 +
23.1704 + /* look for added or missing spaces around punctuation and quotes */
23.1705 + /* If there is a punctuation character like ! with no space on */
23.1706 + /* either side, suspect a missing!space. If there are spaces on */
23.1707 + /* both sides , assume a typo. If we see a double quote with no */
23.1708 + /* space or punctuation on either side of it, assume unspaced */
23.1709 + /* quotes "like"this. */
23.1710 + llen = strlen(aline);
23.1711 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
23.1712 + if (strchr(".?!,;:_", aline[i])) { /* if it's punctuation */
23.1713 + isacro = 0; /* we need to suppress warnings for acronyms like M.D. */
23.1714 + isellipsis = 0; /* we need to suppress warnings for ellipsis . . . */
23.1715 + if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) || /* if there are letters on both sides of it or ... */
23.1716 + (gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */
23.1717 + if (aline[i] == '.') {
23.1718 + if (i > 2)
23.1719 + if (aline[i-2] == '.') isacro = 1;
23.1720 + if (i + 2 < llen)
23.1721 + if (aline[i+2] == '.') isacro = 1;
23.1722 + }
23.1723 + if (!isacro) {
23.1724 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1725 + if (!pswit[OVERVIEW_SWITCH])
23.1726 + printf(" Line %ld column %d - Missing space?\n", linecnt, i+1);
23.1727 + else
23.1728 + cnt_punct++;
23.1729 + }
23.1730 + }
23.1731 + if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */
23.1732 + if (aline[i] == '.') {
23.1733 + if (i > 2)
23.1734 + if (aline[i-2] == '.') isellipsis = 1;
23.1735 + if (i + 2 < llen)
23.1736 + if (aline[i+2] == '.') isellipsis = 1;
23.1737 + }
23.1738 + if (!isemptyline && !isellipsis) {
23.1739 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1740 + if (!pswit[OVERVIEW_SWITCH])
23.1741 + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
23.1742 + else
23.1743 + cnt_punct++;
23.1744 + }
23.1745 + }
23.1746 + }
23.1747 + }
23.1748 +
23.1749 + /* 0.98 -- split out the characters that CANNOT be preceded by space */
23.1750 + llen = strlen(aline);
23.1751 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
23.1752 + if (strchr("?!,;:", aline[i])) { /* if it's punctuation that _cannot_ have a space before it */
23.1753 + if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */
23.1754 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1755 + if (!pswit[OVERVIEW_SWITCH])
23.1756 + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
23.1757 + else
23.1758 + cnt_punct++;
23.1759 + }
23.1760 + }
23.1761 + }
23.1762 +
23.1763 +
23.1764 + /* 0.99 -- special case " .X" where X is any alpha. */
23.1765 + /* This plugs a hole in the acronym code above. Inelegant, but maintainable. */
23.1766 + llen = strlen(aline);
23.1767 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
23.1768 + if (aline[i] == '.') { /* if it's a period */
23.1769 + if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */
23.1770 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1771 + if (!pswit[OVERVIEW_SWITCH])
23.1772 + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
23.1773 + else
23.1774 + cnt_punct++;
23.1775 + }
23.1776 + }
23.1777 + }
23.1778 +
23.1779 +
23.1780 +
23.1781 +
23.1782 + /* v.21 breaking out the search for unspaced doublequotes */
23.1783 + /* This is not as efficient, but it's more maintainable */
23.1784 + /* V.97 added underscore to the list of characters not to query, */
23.1785 + /* since underscores are commonly used as italics indicators. */
23.1786 + /* V.98 Added slash as well, same reason. */
23.1787 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
23.1788 + if (aline[i] == CHAR_DQUOTE) {
23.1789 + if ((!strchr(" _-.'`,;:!/([{?}])", aline[i-1]) &&
23.1790 + !strchr(" _-.'`,;:!/([{?}])", aline[i+1]) &&
23.1791 + aline[i+1] != 0
23.1792 + || (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) {
23.1793 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1794 + if (!pswit[OVERVIEW_SWITCH])
23.1795 + printf(" Line %ld column %d - Unspaced quotes?\n", linecnt, i+1);
23.1796 + else
23.1797 + cnt_punct++;
23.1798 + }
23.1799 + }
23.1800 + }
23.1801 +
23.1802 +
23.1803 + /* v.98 check parity of quotes */
23.1804 + /* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". */
23.1805 + for (s = aline; *s; s++) {
23.1806 + if (*s == CHAR_DQUOTE) {
23.1807 + if (!(dquotepar = !dquotepar)) { /* parity even */
23.1808 + if (!strchr("_-.'`/,;:!?)]} ", *(s+1))) {
23.1809 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1810 + if (!pswit[OVERVIEW_SWITCH])
23.1811 + printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
23.1812 + else
23.1813 + cnt_punct++;
23.1814 + }
23.1815 + }
23.1816 + else { /* parity odd */
23.1817 + if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$", *(s+1)) || !*(s+1)) {
23.1818 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1819 + if (!pswit[OVERVIEW_SWITCH])
23.1820 + printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
23.1821 + else
23.1822 + cnt_punct++;
23.1823 + }
23.1824 + }
23.1825 + }
23.1826 + }
23.1827 +
23.1828 + if (*aline == CHAR_DQUOTE) {
23.1829 + if (strchr(",;:!?)]} ", aline[1])) {
23.1830 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1831 + if (!pswit[OVERVIEW_SWITCH])
23.1832 + printf(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
23.1833 + else
23.1834 + cnt_punct++;
23.1835 + }
23.1836 + }
23.1837 +
23.1838 + if (pswit[SQUOTE_SWITCH])
23.1839 + for (s = aline; *s; s++) {
23.1840 + if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
23.1841 + && ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) {
23.1842 + if (!(squotepar = !squotepar)) { /* parity even */
23.1843 + if (!strchr("_-.'`/\",;:!?)]} ", *(s+1))) {
23.1844 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1845 + if (!pswit[OVERVIEW_SWITCH])
23.1846 + printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
23.1847 + else
23.1848 + cnt_punct++;
23.1849 + }
23.1850 + }
23.1851 + else { /* parity odd */
23.1852 + if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`", *(s+1)) || !*(s+1)) {
23.1853 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1854 + if (!pswit[OVERVIEW_SWITCH])
23.1855 + printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
23.1856 + else
23.1857 + cnt_punct++;
23.1858 + }
23.1859 + }
23.1860 + }
23.1861 + }
23.1862 +
23.1863 +
23.1864 + /* v.20 also look for double punctuation like ,. or ,, */
23.1865 + /* Thanks to DW for the suggestion! */
23.1866 + /* I'm putting this in a separate loop for clarity */
23.1867 + /* In books with references, ".," and ".;" are common */
23.1868 + /* e.g. "etc., etc.," and vol. 1.; vol 3.; */
23.1869 + /* OTOH, from my initial tests, there are also fairly */
23.1870 + /* common errors. What to do? Make these cases paranoid? */
23.1871 + /* V.21 ".," is the most common, so invented warn_dotcomma */
23.1872 + /* to suppress detailed reporting if it occurs often */
23.1873 + llen = strlen(aline);
23.1874 + for (i = 0; i < llen; i++) /* for each character in the line */
23.1875 + if (strchr(".?!,;:", aline[i]) /* if it's punctuation */
23.1876 + && (strchr(".?!,;:", aline[i+1]))
23.1877 + && aline[i] && aline[i+1]) /* followed by punctuation, it's a query, unless . . . */
23.1878 + if (
23.1879 + (aline[i] == aline[i+1]
23.1880 + && (aline[i] == '.' || aline[i] == '?' || aline[i] == '!'))
23.1881 + || (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',')
23.1882 + || (isFrench && !strncmp(aline+i, ",...", 4))
23.1883 + || (isFrench && !strncmp(aline+i, "...,", 4))
23.1884 + || (isFrench && !strncmp(aline+i, ";...", 4))
23.1885 + || (isFrench && !strncmp(aline+i, "...;", 4))
23.1886 + || (isFrench && !strncmp(aline+i, ":...", 4))
23.1887 + || (isFrench && !strncmp(aline+i, "...:", 4))
23.1888 + || (isFrench && !strncmp(aline+i, "!...", 4))
23.1889 + || (isFrench && !strncmp(aline+i, "...!", 4))
23.1890 + || (isFrench && !strncmp(aline+i, "?...", 4))
23.1891 + || (isFrench && !strncmp(aline+i, "...?", 4))
23.1892 + ) {
23.1893 + if ((isFrench && !strncmp(aline+i, ",...", 4)) /* could this BE any more awkward? */
23.1894 + || (isFrench && !strncmp(aline+i, "...,", 4))
23.1895 + || (isFrench && !strncmp(aline+i, ";...", 4))
23.1896 + || (isFrench && !strncmp(aline+i, "...;", 4))
23.1897 + || (isFrench && !strncmp(aline+i, ":...", 4))
23.1898 + || (isFrench && !strncmp(aline+i, "...:", 4))
23.1899 + || (isFrench && !strncmp(aline+i, "!...", 4))
23.1900 + || (isFrench && !strncmp(aline+i, "...!", 4))
23.1901 + || (isFrench && !strncmp(aline+i, "?...", 4))
23.1902 + || (isFrench && !strncmp(aline+i, "...?", 4)))
23.1903 + i +=4;
23.1904 + ; /* do nothing for .. !! and ?? which can be legit */
23.1905 + }
23.1906 + else {
23.1907 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1908 + if (!pswit[OVERVIEW_SWITCH])
23.1909 + printf(" Line %ld column %d - Double punctuation?\n", linecnt, i+1);
23.1910 + else
23.1911 + cnt_punct++;
23.1912 + }
23.1913 +
23.1914 + /* v.21 breaking out the search for spaced doublequotes */
23.1915 + /* This is not as efficient, but it's more maintainable */
23.1916 + s = aline;
23.1917 + while (strstr(s," \" ")) {
23.1918 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1919 + if (!pswit[OVERVIEW_SWITCH])
23.1920 + printf(" Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1));
23.1921 + else
23.1922 + cnt_punct++;
23.1923 + s = strstr(s," \" ") + 2;
23.1924 + }
23.1925 +
23.1926 + /* v.20 also look for spaced singlequotes ' and ` */
23.1927 + s = aline;
23.1928 + while (strstr(s," ' ")) {
23.1929 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1930 + if (!pswit[OVERVIEW_SWITCH])
23.1931 + printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1));
23.1932 + else
23.1933 + cnt_punct++;
23.1934 + s = strstr(s," ' ") + 2;
23.1935 + }
23.1936 +
23.1937 + s = aline;
23.1938 + while (strstr(s," ` ")) {
23.1939 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1940 + if (!pswit[OVERVIEW_SWITCH])
23.1941 + printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1));
23.1942 + else
23.1943 + cnt_punct++;
23.1944 + s = strstr(s," ` ") + 2;
23.1945 + }
23.1946 +
23.1947 + /* v.99 check special case of 'S instead of 's at end of word */
23.1948 + s = aline + 1;
23.1949 + while (*s) {
23.1950 + if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z') {
23.1951 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1952 + if (!pswit[OVERVIEW_SWITCH])
23.1953 + printf(" Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2));
23.1954 + else
23.1955 + cnt_punct++;
23.1956 + }
23.1957 + s++;
23.1958 + }
23.1959 +
23.1960 +
23.1961 + /* v.21 Now check special cases - start and end of line - */
23.1962 + /* for single and double quotes. Start is sometimes [sic] */
23.1963 + /* but better to query it anyway. */
23.1964 + /* While I'm here, check for dash at end of line */
23.1965 + llen = strlen(aline);
23.1966 + if (llen > 1) {
23.1967 + if (aline[llen-1] == CHAR_DQUOTE ||
23.1968 + aline[llen-1] == CHAR_SQUOTE ||
23.1969 + aline[llen-1] == CHAR_OPEN_SQUOTE)
23.1970 + if (aline[llen-2] == CHAR_SPACE) {
23.1971 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1972 + if (!pswit[OVERVIEW_SWITCH])
23.1973 + printf(" Line %ld column %d - Spaced quote?\n", linecnt, llen);
23.1974 + else
23.1975 + cnt_punct++;
23.1976 + }
23.1977 +
23.1978 + /* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */
23.1979 + /* Wrongspaced quotes test also catches it for " */
23.1980 + if (aline[0] == CHAR_SQUOTE ||
23.1981 + aline[0] == CHAR_OPEN_SQUOTE)
23.1982 + if (aline[1] == CHAR_SPACE) {
23.1983 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1984 + if (!pswit[OVERVIEW_SWITCH])
23.1985 + printf(" Line %ld column 1 - Spaced quote?\n", linecnt);
23.1986 + else
23.1987 + cnt_punct++;
23.1988 + }
23.1989 + /* dash at end of line may well be legit - paranoid mode only */
23.1990 + /* and don't report em-dash at line-end */
23.1991 + if (pswit[PARANOID_SWITCH] && warn_hyphen) {
23.1992 + for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
23.1993 + if (aline[i] == '-' && aline[i-1] != '-') {
23.1994 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.1995 + if (!pswit[OVERVIEW_SWITCH])
23.1996 + printf(" Line %ld column %d - Hyphen at end of line?\n", linecnt, i);
23.1997 + }
23.1998 + }
23.1999 + }
23.2000 +
23.2001 + /* v.21 also look for brackets surrounded by alpha */
23.2002 + /* Brackets are often unspaced, but shouldn't be surrounded by alpha. */
23.2003 + /* If so, suspect a scanno like "a]most" */
23.2004 + llen = strlen(aline);
23.2005 + for (i = 1; i < llen-1; i++) { /* for each character in the line except 1st & last*/
23.2006 + if (strchr("{[()]}", aline[i]) /* if it's a bracket */
23.2007 + && gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) {
23.2008 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.2009 + if (!pswit[OVERVIEW_SWITCH])
23.2010 + printf(" Line %ld column %d - Unspaced bracket?\n", linecnt, i);
23.2011 + else
23.2012 + cnt_punct++;
23.2013 + }
23.2014 + }
23.2015 + /* The "Cinderella" case, back in again! :-S Give it another shot */
23.2016 + if (warn_endquote) {
23.2017 + llen = strlen(aline);
23.2018 + for (i = 1; i < llen; i++) { /* for each character in the line except 1st */
23.2019 + if (aline[i] == CHAR_DQUOTE)
23.2020 + if (isalpha(aline[i-1])) {
23.2021 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.2022 + if (!pswit[OVERVIEW_SWITCH])
23.2023 + printf(" Line %ld column %d - endquote missing punctuation?\n", linecnt, i);
23.2024 + else
23.2025 + cnt_punct++;
23.2026 + }
23.2027 + }
23.2028 + }
23.2029 +
23.2030 + llen = strlen(aline);
23.2031 +
23.2032 + /* Check for <HTML TAG> */
23.2033 + /* If there is a < in the line, followed at some point */
23.2034 + /* by a > then we suspect HTML */
23.2035 + if (strstr(aline, "<") && strstr(aline, ">")) {
23.2036 + i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
23.2037 + if (i > 0) {
23.2038 + strncpy(wrk, strstr(aline, "<"), i);
23.2039 + wrk[i] = 0;
23.2040 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.2041 + if (!pswit[OVERVIEW_SWITCH])
23.2042 + printf(" Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk);
23.2043 + else
23.2044 + cnt_html++;
23.2045 + }
23.2046 + }
23.2047 +
23.2048 + /* Check for &symbol; HTML */
23.2049 + /* If there is a & in the line, followed at */
23.2050 + /* some point by a ; then we suspect HTML */
23.2051 + if (strstr(aline, "&") && strstr(aline, ";")) {
23.2052 + i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1);
23.2053 + for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++)
23.2054 + if (*s == CHAR_SPACE) i = 0; /* 0.99 don't report "Jones & Son;" */
23.2055 + if (i > 0) {
23.2056 + strncpy(wrk, strstr(aline,"&"), i);
23.2057 + wrk[i] = 0;
23.2058 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
23.2059 + if (!pswit[OVERVIEW_SWITCH])
23.2060 + printf(" Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk);
23.2061 + else
23.2062 + cnt_html++;
23.2063 + }
23.2064 + }
23.2065 +
23.2066 + /* At end of paragraph, check for mismatched quotes. */
23.2067 + /* We don't want to report an error immediately, since it is a */
23.2068 + /* common convention to omit the quotes at end of paragraph if */
23.2069 + /* the next paragraph is a continuation of the same speaker. */
23.2070 + /* Where this is the case, the next para should begin with a */
23.2071 + /* quote, so we store the warning message and only display it */
23.2072 + /* at the top of the next iteration if the new para doesn't */
23.2073 + /* start with a quote. */
23.2074 + /* The -p switch overrides this default, and warns of unclosed */
23.2075 + /* quotes on _every_ paragraph, whether the next begins with a */
23.2076 + /* quote or not. */
23.2077 + /* Version .16 - only report mismatched single quotes if */
23.2078 + /* an open_single_quotes was found. */
23.2079 +
23.2080 + if (isemptyline) { /* end of para - add up the totals */
23.2081 + if (quot % 2)
23.2082 + sprintf(dquote_err, " Line %ld - Mismatched quotes\n", linecnt);
23.2083 + if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) )
23.2084 + sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt);
23.2085 + if (pswit[SQUOTE_SWITCH] && open_single_quote
23.2086 + && (open_single_quote != close_single_quote)
23.2087 + && (open_single_quote != close_single_quote +1) )
23.2088 + squot = 1; /* flag it to be noted regardless of the first char of the next para */
23.2089 + if (r_brack)
23.2090 + sprintf(rbrack_err, " Line %ld - Mismatched round brackets?\n", linecnt);
23.2091 + if (s_brack)
23.2092 + sprintf(sbrack_err, " Line %ld - Mismatched square brackets?\n", linecnt);
23.2093 + if (c_brack)
23.2094 + sprintf(cbrack_err, " Line %ld - Mismatched curly brackets?\n", linecnt);
23.2095 + if (c_unders % 2)
23.2096 + sprintf(unders_err, " Line %ld - Mismatched underscores?\n", linecnt);
23.2097 + quot = s_brack = c_brack = r_brack = c_unders =
23.2098 + open_single_quote = close_single_quote = 0;
23.2099 + isnewpara = 1; /* let the next iteration know that it's starting a new para */
23.2100 + }
23.2101 +
23.2102 + /* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */
23.2103 + /* by working back through prevline. DW. */
23.2104 + /* Hmmm. Need to check this only for "normal" paras. */
23.2105 + /* So what is a "normal" para? ouch! */
23.2106 + /* Not normal if one-liner (chapter headings, etc.) */
23.2107 + /* Not normal if doesn't contain at least one locase letter */
23.2108 + /* Not normal if starts with space */
23.2109 +
23.2110 + /* 0.99 tighten up on para end checks. Disallow comma and */
23.2111 + /* semi-colon. Check for legit para end before quotes. */
23.2112 + if (isemptyline) { /* end of para */
23.2113 + for (s = prevline, i = 0; *s && !i; s++)
23.2114 + if (gcisletter(*s))
23.2115 + i = 1; /* use i to indicate the presence of a letter on the line */
23.2116 + /* This next "if" is a problem. */
23.2117 + /* If I say "start_para_line <= linecnt - 1", that includes one-line */
23.2118 + /* "paragraphs" like chapter heads. Lotsa false positives. */
23.2119 + /* If I say "start_para_line < linecnt - 1" it doesn't, but then it */
23.2120 + /* misses genuine one-line paragraphs. */
23.2121 + /* So what do I do? */
23.2122 + if (i
23.2123 + && lastblen > 2
23.2124 + && start_para_line < linecnt - 1
23.2125 + && *prevline > CHAR_SPACE
23.2126 + ) {
23.2127 + for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--);
23.2128 + for ( ; i > 0; i--) {
23.2129 + if (gcisalpha(prevline[i])) {
23.2130 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
23.2131 + if (!pswit[OVERVIEW_SWITCH])
23.2132 + printf(" Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline));
23.2133 + else
23.2134 + cnt_punct++;
23.2135 + break;
23.2136 + }
23.2137 + if (strchr("-.:!([{?}])", prevline[i]))
23.2138 + break;
23.2139 + }
23.2140 + }
23.2141 + }
23.2142 + strcpy(prevline, aline);
23.2143 + }
23.2144 + fclose (infile);
23.2145 + if (!pswit[OVERVIEW_SWITCH])
23.2146 + for (i = 0; i < MAX_QWORD; i++)
23.2147 + if (dupcnt[i])
23.2148 + printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s");
23.2149 +}
23.2150 +
23.2151 +
23.2152 +
23.2153 +/* flgets - get one line from the input stream into theline, checking */
23.2154 +/* for the existence of exactly one CR/LF line-end per line. At most maxlen characters are stored, followed by a NUL, so theline must hold maxlen+1 bytes; lcnt is used only to label the messages. */
23.2155 +/* Returns a pointer to the line, or NULL as soon as EOF is read (characters of an unterminated final line stay in theline, but NULL is still returned). */
23.2156 +
23.2157 +char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt)
23.2158 +{
23.2159 + char c;
23.2160 + int len, isCR, cint; /* cint keeps fgetc()'s int result so that EOF can be told apart from a data byte */
23.2161 +
23.2162 + *theline = 0;
23.2163 + len = isCR = 0;
23.2164 + do {
23.2165 + c = cint = fgetc(thefile); /* fetched at the top of the loop, so no character is read and then thrown away when the loop stops at maxlen */
23.2166 + if (cint == EOF)
23.2167 + return (NULL);
23.2168 + if (c == 10) /* either way, it's end of line */
23.2169 + if (isCR)
23.2170 + break;
23.2171 + else { /* Error - a LF without a preceding CR */
23.2172 + if (pswit[LINE_END_SWITCH]) {
23.2173 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
23.2174 + if (!pswit[OVERVIEW_SWITCH])
23.2175 + printf(" Line %ld - No CR?\n", lcnt);
23.2176 + else
23.2177 + cnt_lineend++;
23.2178 + }
23.2179 + break;
23.2180 + }
23.2181 + if (c == 13) {
23.2182 + if (isCR) { /* Error - two successive CRs */
23.2183 + if (pswit[LINE_END_SWITCH]) {
23.2184 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
23.2185 + if (!pswit[OVERVIEW_SWITCH])
23.2186 + printf(" Line %ld - Two successive CRs?\n", lcnt);
23.2187 + else
23.2188 + cnt_lineend++;
23.2189 + }
23.2190 + }
23.2191 + isCR = 1;
23.2192 + }
23.2193 + else {
23.2194 + if (pswit[LINE_END_SWITCH] && isCR) {
23.2195 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
23.2196 + if (!pswit[OVERVIEW_SWITCH])
23.2197 + printf(" Line %ld column %d - CR without LF?\n", lcnt, len+1);
23.2198 + else
23.2199 + cnt_lineend++;
23.2200 + }
23.2201 + theline[len] = c;
23.2202 + len++;
23.2203 + theline[len] = 0;
23.2204 + isCR = 0;
23.2205 + }
23.2206 + /* the next character is fetched at the top of the loop */
23.2207 + } while(len < maxlen);
23.2208 + if (pswit[MARKUP_SWITCH])
23.2209 + postprocess_for_HTML(theline);
23.2210 + if (pswit[DP_SWITCH])
23.2211 + postprocess_for_DP(theline);
23.2212 + return(theline);
23.2213 +}
23.2214 +
23.2215 +
23.2216 +
23.2217 +
23.2218 +/* mixdigit - takes a "word" as a parameter, and checks whether it */
23.2219 +/* contains a mixture of letters and digits. Generally, this is an */
23.2220 +/* error, but not for ordinals (4th, 21sts, 1stly), old money */
23.2221 +/* (12l. 3s. 11d.) or pounds (L500). Returns 0 if no error found, 1 if error. */
23.2222 +
23.2223 +int mixdigit(char *checkword) /* check for digits like 1 or 0 in words */
23.2224 +{
23.2225 + int wehaveadigit, wehavealetter, firstdigits, query, wl;
23.2226 + char *s;
23.2227 +
23.2228 +
23.2229 + wehaveadigit = wehavealetter = query = 0;
23.2230 + for (s = checkword; *s; s++)
23.2231 + if (gcisalpha(*s))
23.2232 + wehavealetter = 1;
23.2233 + else
23.2234 + if (gcisdigit(*s))
23.2235 + wehaveadigit = 1;
23.2236 + if (wehaveadigit && wehavealetter) { /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
23.2237 + query = 1;
23.2238 + wl = strlen(checkword);
23.2239 + for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++) /* count the leading digits */
23.2240 + ;
23.2241 + /* digits, ending in st, rd, nd, th of either case */
23.2242 + /* 0.99 donovan points out an error below. Turns out */
23.2243 + /* I was using matchword like strcmp when the */
23.2244 + /* return values are different! Duh. */
23.2245 + if (firstdigits + 2 == wl &&
23.2246 + (matchword(checkword + wl - 2, "st")
23.2247 + || matchword(checkword + wl - 2, "rd")
23.2248 + || matchword(checkword + wl - 2, "nd")
23.2249 + || matchword(checkword + wl - 2, "th"))
23.2250 + )
23.2251 + query = 0;
23.2252 + if (firstdigits + 3 == wl &&
23.2253 + (matchword(checkword + wl - 3, "sts")
23.2254 + || matchword(checkword + wl - 3, "rds")
23.2255 + || matchword(checkword + wl - 3, "nds")
23.2256 + || matchword(checkword + wl - 3, "ths"))
23.2257 + )
23.2258 + query = 0;
23.2259 + if (firstdigits + 4 == wl && /* the suffix is four letters long, so exactly wl - 4 leading digits are required; this also keeps checkword + wl - 4 inside the word */
23.2260 + (matchword(checkword + wl - 4, "stly")
23.2261 + || matchword(checkword + wl - 4, "rdly")
23.2262 + || matchword(checkword + wl - 4, "ndly")
23.2263 + || matchword(checkword + wl - 4, "thly"))
23.2264 + )
23.2265 + query = 0;
23.2266 +
23.2267 + /* digits, ending in l, L, s or d */
23.2268 + if (firstdigits + 1 == wl &&
23.2269 + (checkword[wl-1] == 'l'
23.2270 + || checkword[wl-1] == 'L'
23.2271 + || checkword[wl-1] == 's'
23.2272 + || checkword[wl-1] == 'd'))
23.2273 + query = 0;
23.2274 + /* L at the start of a number, representing British pounds, like L500 */
23.2275 + /* This is cute. We know the current word is mixeddigit. If the first */
23.2276 + /* letter is L, there must be at least one digit following. If both */
23.2277 + /* digits and letters follow, we have a genuine error, else we have a */
23.2278 + /* capital L followed by digits, and we accept that as a non-error. */
23.2279 + if (checkword[0] == 'L')
23.2280 + if (!mixdigit(checkword+1))
23.2281 + query = 0;
23.2282 + }
23.2283 + return (query);
23.2284 +}
23.2285 +
23.2286 +
23.2287 +
23.2288 +
23.2289 +/* getaword - extracts the first/next "word" from the line, and puts */
23.2290 +/* it into "thisword". A word is defined as one English word unit */
23.2291 +/* -- or at least that's what I'm trying for. At most MAXWORDLEN characters are copied, plus a NUL, so thisword must hold MAXWORDLEN+1 bytes. */
23.2292 +/* Returns a pointer to the position in the line where we will start */
23.2293 +/* looking for the next word. */
23.2294 +
23.2295 +char *getaword(char *fromline, char *thisword)
23.2296 +{
23.2297 + int i, wordlen;
23.2298 + char *s;
23.2299 +
23.2300 + wordlen = 0;
23.2301 + for ( ; !gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline ; fromline++ ); /* skip anything that cannot start a word */
23.2302 +
23.2303 + /* V .20 */
23.2304 + /* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35. */
23.2305 + /* Especially yucky is the case of L1,000 */
23.2306 + /* I hate this, and I see other ways, but I don't see that any is _better_.*/
23.2307 + /* This section looks for a pattern of characters including a digit */
23.2308 + /* followed by a comma or period followed by one or more digits. */
23.2309 + /* If found, it returns this whole pattern as a word; otherwise we discard */
23.2310 + /* the results and resume our normal programming. */
23.2311 + s = fromline;
23.2312 + for ( ; (gcisdigit(*s) || gcisalpha(*s) || *s == ',' || *s == '.') && wordlen < MAXWORDLEN ; s++ ) {
23.2313 + thisword[wordlen] = *s;
23.2314 + wordlen++;
23.2315 + }
23.2316 + thisword[wordlen] = 0;
23.2317 + for (i = 1; i < wordlen -1; i++) { /* interior characters only, so i-1 and i+1 are both inside the word */
23.2318 + if (thisword[i] == '.' || thisword[i] == ',') {
23.2319 + if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i+1])) { /* a digit on BOTH sides of the separator: we have one of the damned things */
23.2320 + fromline = s;
23.2321 + return(fromline);
23.2322 + }
23.2323 + }
23.2324 + }
23.2325 +
23.2326 + /* we didn't find a punctuated number - do the regular getword thing */
23.2327 + wordlen = 0;
23.2328 + for ( ; (gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
23.2329 + thisword[wordlen] = *fromline;
23.2330 + wordlen++;
23.2331 + }
23.2332 + thisword[wordlen] = 0;
23.2333 + return(fromline);
23.2334 +}
23.2335 +
23.2336 +
23.2337 +
23.2338 +
23.2339 +
23.2340 +/* matchword - just a case-insensitive string matcher. */
23.2341 +/* Returns 1 if the two strings have the same length and the same */
23.2342 +/* characters ignoring case, 0 otherwise (note: not strcmp's convention). */
23.2343 +
23.2344 +int matchword(char *checkfor, char *thisword)
23.2345 +{
23.2346 + unsigned int ismatch, i;
23.2347 +
23.2348 + if (strlen(checkfor) != strlen(thisword)) return(0);
23.2349 +
23.2350 + ismatch = 1; /* assume a match until we find a difference */
23.2351 + for (i = 0; checkfor[i] && ismatch; i++) /* lengths are equal, so one terminator test covers both strings; stop at the first difference */
23.2352 + if (toupper((unsigned char)checkfor[i]) != toupper((unsigned char)thisword[i])) /* <ctype.h> needs unsigned char values; plain char may be negative for 8-bit text */
23.2353 + ismatch = 0;
23.2354 + return (ismatch);
23.2355 +}
23.2356 +
23.2357 +
23.2358 +
23.2359 +
23.2360 +
23.2361 +/* lowerit - convert the line to lower case, in place. Only the */
23.2362 +/* letters A-Z are changed; every other byte, hi-bit characters */
23.2363 +/* included, is left exactly as it was. strlwr is not available */
23.2364 +/* on all platforms and tolower is not trusted with hi-bit bytes.*/
23.2365 +
23.2366 +void lowerit(char *theline)
23.2367 +{
23.2368 + char *p = theline;
23.2369 + for ( ; *p != '\0'; p++)
23.2370 + if ('A' <= *p && *p <= 'Z') *p += 'a' - 'A'; /* the two cases sit a fixed distance apart */
23.2371 +}
23.2372 +
23.2373 +
23.2374 +/* isroman - does this word LOOK like a Roman numeral? */
23.2375 +/* Only lowercase letters are recognised. The word is not validated */
23.2376 +/* as a well-formed numeral (mxxxxxxxxxx passes); it merely has to */
23.2377 +/* be consumed completely by the following sequence, every part of */
23.2378 +/* which is optional: */
23.2379 +/* any number of m, one d, cm, cd, any number of c, */
23.2380 +/* xl, xc, one l, any number of x, */
23.2381 +/* ix, iv, one v, any number of i. */
23.2382 +/* Returns 1 if the whole word was consumed, 0 otherwise, and 0 */
23.2383 +/* for a NULL or empty word. */
23.2384 +/* Good enough for jazz chords. */
23.2385 +
23.2386 +int isroman(char *t)
23.2387 +{
23.2388 + const char *p = t; /* cursor over the candidate numeral */
23.2389 +
23.2390 + if (p == NULL || *p == '\0') return (0);
23.2391 +
23.2392 + /* each step below consumes its pattern only if it is next in the word */
23.2393 +
23.2394 + while (*p == 'm') p++;
23.2395 + if (*p == 'd') p++;
23.2396 + if (p[0] == 'c' && p[1] == 'm') p += 2;
23.2397 + if (p[0] == 'c' && p[1] == 'd') p += 2;
23.2398 + while (*p == 'c') p++;
23.2399 + if (p[0] == 'x' && p[1] == 'l') p += 2;
23.2400 + if (p[0] == 'x' && p[1] == 'c') p += 2;
23.2401 + if (*p == 'l') p++;
23.2402 + while (*p == 'x') p++;
23.2403 + if (p[0] == 'i' && p[1] == 'x') p += 2;
23.2404 + if (p[0] == 'i' && p[1] == 'v') p += 2;
23.2405 + if (*p == 'v') p++;
23.2406 + while (*p == 'i') p++;
23.2407 + /* it looks like a numeral only if nothing is left over */
23.2408 +
23.2409 + return(*p == '\0');
23.2410 +}
23.2411 +
23.2412 +
23.2413 +
23.2414 +
23.2415 +/* gcisalpha - a letter test that is lenient on 8-bit texts. With */
23.2416 +/* the standard isalpha(), 8-bit accented characters break words, so */
23.2417 +/* that tete with accents appears to be two words, "t" and "t", and */
23.2418 +/* errors are over-reported. gcisalpha() also accepts the accented */
23.2419 +/* letters of the CP1252 (Windows) and ISO-8859-1 character sets, */
23.2420 +/* the most common PG 8-bit types. Returns 1 for a letter, else 0. */
23.2421 +
23.2422 +int gcisalpha(unsigned char c)
23.2423 +{
23.2424 + if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) return(1); /* unaccented letters */
23.2425 + switch (c) {
23.2426 + case 140: case 142: case 156: case 158: case 159: return(1); /* the only values below 192 accepted as accented letters */
23.2427 + case 208: case 215: case 222: case 240: case 247: case 254: return(0); /* the only values from 192 up that are rejected */
23.2428 + default: return(c >= 192);
23.2429 + }
23.2430 +}
23.2431 +
23.2432 +/* gcisdigit - digit test that looks only at the range '0'..'9', so it doesn't get confused in 8-bit texts. Returns 1 or 0. */
23.2433 +int gcisdigit(unsigned char c)
23.2434 +{
23.2435 + /* a plain range comparison on the byte value */
23.2436 + return(c >= '0' && c <= '9');
23.2437 +}
23.2438 +
23.2439 +/* gcisletter - letter test for 8-bit texts: A-Z, a-z and every byte */
23.2440 +/* value from 192 upwards count as letters (ISO-8859-1-specific). */
23.2441 +int gcisletter(unsigned char c)
23.2442 +{
23.2443 + if (c >= 192) return(1); /* unlike gcisalpha(), no value in this range is excluded */
23.2444 + return((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'));
23.2445 +}
23.2446 +
23.2447 +
23.2448 +
23.2449 +
23.2450 +/* gcstrchr - strchr that never matches the terminator: returns NULL when c is zero, otherwise whatever strchr(s, c) returns */
23.2451 +
23.2452 +char *gcstrchr(char *s, char c)
23.2453 +{
23.2454 + /* strchr counts the terminating NUL as part of the string, so zero is ruled out here */
23.2455 + return(c != 0 ? strchr(s, c) : NULL);
23.2456 +}
23.2457 +
23.2458 +/* postprocess_for_DP is derived from postprocess_for_HTML. */
23.2459 +/* flgets() invokes it when the -d switch is set. It deletes from */
23.2460 +/* the line, in place, every occurrence of each string listed in */
23.2461 +/* DPmarkup[], so that the line passed to the main routine has */
23.2462 +/* been pre-cleaned of DP markup. An empty line is left alone. */
23.2463 +
23.2464 +void postprocess_for_DP(char *theline)
23.2465 +{
23.2466 +
23.2467 + char *hit, *rest;
23.2468 + int k;
23.2469 +
23.2470 + if (*theline == '\0')
23.2471 + return;
23.2472 +
23.2473 + for (k = 0; *DPmarkup[k]; k++) { /* the table ends at an empty string */
23.2474 + /* Delete every occurrence of this tag. The search restarts from the */
23.2475 + /* beginning of the line after each deletion, so an occurrence that */
23.2476 + /* is only formed once the text closes up is caught as well. */
23.2477 + hit = strstr(theline, DPmarkup[k]);
23.2478 + while (hit != NULL) {
23.2479 + rest = hit + strlen(DPmarkup[k]);
23.2480 + /* slide the tail, terminator included, down over the tag */
23.2481 + memmove(hit, rest, strlen(rest) + 1);
23.2482 + hit = strstr(theline, DPmarkup[k]);
23.2483 + }
23.2484 + }
23.2485 +
23.2486 +}
23.2487 +
23.2488 +
23.2489 +/* postprocess_for_HTML is, at the moment (0.97), a very nasty */
23.2490 +/* short-term fix for Charlz. Nasty, nasty, nasty. */
23.2491 +/* flgets() invokes it when the -m switch is set. */
23.2492 +/* If the line contains both a < and a >, losemarkup() is called */
23.2493 +/* until it finds nothing more to remove; after that, */
23.2494 +/* loseentities() is called until it finds nothing more to */
23.2495 +/* replace. The line is edited in place. This is _so_ not the */
23.2496 +/* right way to deal with HTML, but what Charlz needs now is not */
23.2497 +/* HTML handling proper: just ignoring <i> tags and some others. */
23.2498 +/* To be revisited in future releases! */
23.2499 +
23.2500 +void postprocess_for_HTML(char *theline)
23.2501 +{
23.2502 + int tagged = strstr(theline, "<") != NULL && strstr(theline, ">") != NULL; /* tested once, before anything is removed */
23.2503 +
23.2504 + while (tagged && losemarkup(theline) != NULL)
23.2505 + continue; /* keep going for as long as losemarkup() reports a removal */
23.2506 + while (loseentities(theline) != NULL)
23.2507 + continue; /* keep going for as long as loseentities() reports a replacement */
23.2508 +}
23.2509 +/* losemarkup - remove from theline the tag that starts at its first '<', provided tagcomp() accepts the text after the '<' for one of the entries in markup[]. Returns a pointer to where the tag was, or NULL if nothing was removed. */
23.2510 +char *losemarkup(char *theline)
23.2511 +{
23.2512 + char *s, *t;
23.2513 + int i;
23.2514 +
23.2515 + if (!*theline)
23.2516 + return(NULL);
23.2517 +
23.2518 + s = strstr(theline, "<");
23.2519 + t = s ? strstr(s, ">") : NULL; /* the '>' that closes this tag; one that comes before the '<' is of no use */
23.2520 + if (!s || !t) return(NULL);
23.2521 + for (i = 0; *markup[i]; i++) /* the table ends at an empty string */
23.2522 + if (!tagcomp(s+1, markup[i])) {
23.2523 + if (!*(t+1)) { /* the tag runs to the end of the line: just cut the line short */
23.2524 + *s = 0;
23.2525 + return(s);
23.2526 + }
23.2527 + else
23.2528 + if (t > s) {
23.2529 + memmove(s, t+1, strlen(t+1) + 1); /* source and destination overlap, which strcpy does not allow */
23.2530 + return(s);
23.2531 + }
23.2532 + }
23.2533 + /* it's an unrecognized <xxx> */
23.2534 + return(NULL);
23.2535 +}
23.2536 +
23.2537 +char *loseentities(char *theline)
23.2538 +{
23.2539 + int i;
23.2540 + char *s, *t;
23.2541 +
23.2542 + if (!*theline)
23.2543 + return(NULL);
23.2544 +
23.2545 + for (i = 0; *entities[i].htmlent; i++) {
23.2546 + s = strstr(theline, entities[i].htmlent);
23.2547 + if (s) {
23.2548 + t = malloc((size_t)strlen(s));
23.2549 + if (!t) return(NULL);
23.2550 + strcpy(t, s + strlen(entities[i].htmlent));
23.2551 + strcpy(s, entities[i].textent);
23.2552 + strcat(s, t);
23.2553 + free(t);
23.2554 + return(theline);
23.2555 + }
23.2556 + }
23.2557 +
23.2558 + /* V0.97 Duh. Forgot to check the htmlnum member */
23.2559 + for (i = 0; *entities[i].htmlnum; i++) {
23.2560 + s = strstr(theline, entities[i].htmlnum);
23.2561 + if (s) {
23.2562 + t = malloc((size_t)strlen(s));
23.2563 + if (!t) return(NULL);
23.2564 + strcpy(t, s + strlen(entities[i].htmlnum));
23.2565 + strcpy(s, entities[i].textent);
23.2566 + strcat(s, t);
23.2567 + free(t);
23.2568 + return(theline);
23.2569 + }
23.2570 + }
23.2571 + return(NULL);
23.2572 +}
23.2573 +
23.2574 +
/* tagcomp compares the text following a '<' with a known tag name,  */
/* ignoring case.                                                    */
/*                                                                   */
/*   strin   - text immediately after the '<'; one leading '/' is    */
/*             skipped so closing tags match too                     */
/*   basetag - tag name to compare against                           */
/*                                                                   */
/* Returns 0 on a match, 1 on the first differing character.         */
/* The comparison stops as soon as EITHER string ends, so it is a    */
/* prefix test: "i" matches "img ...", and a strin that ends before  */
/* basetag does also counts as a match.                              */
int tagcomp(char *strin, char *basetag)
{
    char *s, *t;

    s = basetag;
    t = strin;
    if (*t == '/') t++; /* ignore a slash */
    while (*s && *t) {
        /* <ctype.h> functions need a value representable as         */
        /* unsigned char; a plain char holding a high-bit character  */
        /* may be negative, which is undefined for tolower().        */
        if (tolower((unsigned char)*s) != tolower((unsigned char)*t))
            return(1);
        s++; t++;
    }
    /* OK, we have < followed by a valid tag start */
    /* should I do something about length? */
    /* this is messy. The length of an <i> tag is */
    /* limited, but a <table> could go on for miles */
    /* so I'd have to parse the tags . . . ugh. */
    /* It isn't what Charlz needs now, so mark it */
    /* as 'pending'. */
    return(0);
}
23.2595 +
/* proghelp writes the version/licence banner, the usage summary and */
/* a short description of the program to stderr.                     */
/* NOTE(review): the bracketed switch list in the "Usage is:" line   */
/* omits -v and -m although both are described just below it --      */
/* confirm against the option parser before changing it.             */
void proghelp()                  /* explain program usage here */
{
    fputs("V. 0.991. Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
    /* "wih" in this line was a typo in the user-visible text */
    fputs("Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n", stderr);
    fputs("This is Free Software; you may redistribute it under certain conditions (GPL);\n", stderr);
    fputs("read the file COPYING for details.\n\n", stderr);
    fputs("Usage is: gutcheck [-setpxloyhud] filename\n",stderr);
    fputs(" where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",stderr);
    fputs(" -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",stderr);
    fputs(" -o just displays overview without detail, -h echoes header fields\n",stderr);
    fputs(" -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",stderr);
    fputs(" -d ignores DP-specific markup,\n",stderr);
    fputs(" -u uses a file gutcheck.typ to query user-defined possible typos\n",stderr);
    fputs("Sample usage: gutcheck warpeace.txt \n",stderr);
    fputs("\n",stderr);
    fputs("Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n", stderr);
    fputs("Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",stderr);
    fputs("characters like accented letters, lines longer than 75 or shorter than 55,\n",stderr);
    fputs("unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",stderr);
    fputs("HTML tags, some likely typos. It is NOT a substitute for human judgement.\n",stderr);
    fputs("\n",stderr);
}
23.2618 +
23.2619 +
23.2620 +
23.2621 +/*********************************************************************
23.2622 + Revision History:
23.2623 +
23.2624 + 04/22/01 Cleaned up some stuff and released .10
23.2625 +
23.2626 + ---------------
23.2627 +
23.2628 + 05/09/01 Added the typo list, added two extra cases of he/be error,
23.2629 + added -p switch, OPEN_SINGLE QUOTE char as .11
23.2630 +
23.2631 + ---------------
23.2632 +
23.2633 + 05/20/01 Increased the typo list,
23.2634 + added paranoid mode,
23.2635 + ANSIfied the code and added some casts
23.2636 + so the compiler wouldn't keep asking if I knew what I was doing,
23.2637 + fixed bug in l.s.d. condition (thanks, Dave!),
23.2638 + standardized spacing when echoing,
23.2639 + added letter-combo checking code to typo section,
23.2640 + added more h/b words to typo array.
23.2641 + Not too sure about putting letter combos outside of the TYPO conditions -
23.2642 + someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see.
23.2643 + Released as .12
23.2644 +
23.2645 + ---------------
23.2646 +
23.2647 + 06/01/01 Removed duplicate reporting of Tildes, asterisks, etc.
23.2648 + 06/10/01 Added flgets routine to help with platform-independent
23.2649 + detection of invalid line-ends. All PG text files should
23.2650 + have CR/LF (13/10) at end of line, regardless of system.
23.2651 + Gutcheck now validates this by default. (Thanks, Charles!)
23.2652 + Released as .13
23.2653 +
23.2654 + ---------------
23.2655 +
23.2656 + 06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.)
23.2657 + Released as .14
23.2658 +
23.2659 + ---------------
23.2660 +
23.2661 + 06/23/01 Fixed: 'No',he said. not being flagged.
23.2662 +
23.2663 + Improved: better single-quotes checking:
23.2664 +
23.2665 + Ignore singlequotes surrounded by alpha, like didn't. (was OK)
23.2666 +
23.2667 + If a singlequote is at the END of a word AND the word ends in "s":
23.2668 + The dogs' tails wagged.
23.2669 + it's probably an apostrophe, but less commonly may be a closequote:
23.2670 + "These 'pack dogs' of yours look more like wolves."
23.2671 +
23.2672 + If it's got punctuation before it and is followed by a space
23.2673 + or punctuation:
23.2674 + . . . was a problem,' he said
23.2675 + . . . was a problem,'"
23.2676 + it is probably (certainly?) a closequote.
23.2677 +
23.2678 + If it's at start of paragraph, it's probably an openquote.
23.2679 + (but watch dialect)
23.2680 +
23.2681 + Words with ' at beginning and end are probably quoted:
23.2682 + "You have the word 'chivalry' frequently on your lips."
23.2683 + (Not specifically implemented)
23.2684 + V.18 I'm glad I didn't implement this, 'cos it jest ain't so
23.2685 + where the convention is to punctuate outside the quotes.
23.2686 + 'Come', he said, 'and join the party'.
23.2687 +
23.2688 + If it is followed by an alpha, and especially a capital:
23.2689 + 'Hello,' called he.
23.2690 + it is either an openquote or dialect.
23.2691 +
23.2692 + Dialect breaks ALL the rules:
23.2693 + A man's a man for a' that.
23.2694 + "Aye, but 'tis all in the pas' now."
23.2695 + "'Tis often the way," he said.
23.2696 + 'Ave a drink on me.
23.2697 +
23.2698 + This version looks to be an improvement, and produces
23.2699 + fewer false positives, but is still not perfect. The
23.2700 + 'pack dogs' case still fools it, and dialect is still
23.2701 + a problem. Oh, well, it's an improvement, and I have
23.2702 + a weighted structure in place for refining guesses at
23.2703 + closequotes. Maybe next time, I'll add a bit of logic
23.2704 + where if there is an open quote and one that was guessed
23.2705 + to be a possessive apostrophe after s, I'll re-guess it
23.2706 + to be a closequote. Let's see how this one flies, first.
23.2707 +
23.2708 + (Afterview: it's still crap. Needs much work, and a deeper insight.)
23.2709 +
23.2710 + Released as .15
23.2711 +
23.2712 + TODO: More he/be checks. Can't be perfect - counterexamples:
23.2713 + I gave my son good advice: be married regardless of the world's opinion.
23.2714 + I gave my son good advice: he married regardless of the world's opinion.
23.2715 +
23.2716 + If by "primitive" be meant "crude", we can understand the sentence.
23.2717 + If by "primitive" he meant "crude", we can understand the sentence.
23.2718 +
23.2719 + No matter what be said, I must go on.
23.2720 + No matter what he said, I must go on.
23.2721 +
23.2722 + No value, however great, can be set upon them.
23.2723 + No value, however great, can he set upon them.
23.2724 +
23.2725 + Real-Life one from a DP International Weekly Miscellany:
23.2726 + He wandered through the forest without fear, sleeping
23.2727 + much, for in sleep be had companionship--the Great
23.2728 + Spirit teaching him what he should know in dreams.
23.2729 + That one found by jeebies, and it turned out to be "he".
23.2730 +
23.2731 +
23.2732 + ---------------
23.2733 +
23.2734 + 07/01/01 Added -O option.
23.2735 + Improved singlequotes by reporting mismatched single quotes
23.2736 + only if an open_single_quotes was found.
23.2737 +
23.2738 + Released as .16
23.2739 +
23.2740 + ---------------
23.2741 +
23.2742 + 08/27/01 Added -Y switch for Robert Rowe to allow his app to
23.2743 + catch the error output.
23.2744 +
23.2745 + Released as .17
23.2746 +
23.2747 + ---------------
23.2748 +
23.2749 + 09/08/01 Added checking Capitals at start of paragraph, but not
23.2750 + checking them at start of sentence.
23.2751 +
23.2752 + TODO: Parse sentences out so can check reliably for start of
23.2753 + sentence. Need a whole different approach for that.
23.2754 + (Can't just rely on periods, since they are also
23.2755 + used for abbreviations, etc.)
23.2756 +
23.2757 + Added checking for all vowels or all consonants in a word.
23.2758 +
23.2759 + While I was in, I added "ii" checking and "tl" at start of word.
23.2760 +
23.2761 + Added echoing of first line of paragraph when reporting
23.2762 + mismatched quotes or brackets (thanks to David Widger for the
23.2763 + suggestion)
23.2764 +
23.2765 + Not querying L at start of a number (used for British pounds).
23.2766 +
23.2767 + The spelling changes are sort of half-done but released anyway
23.2768 + Skipped .18 because I had given out a couple of test versions
23.2769 + with that number.
23.2770 +
23.2771 + 09/25/01 Released as .19
23.2772 +
23.2773 + ---------------
23.2774 +
23.2775 + TODO:
23.2776 + Use the logic from my new version of safewrap to stop querying
23.2777 + short lines like poems and TOCs.
23.2778 + Ignore non-standard ellipses like . . . or ...
23.2779 +
23.2780 +
23.2781 + ---------------
23.2782 + 10/01/01 Made any line over 80 a VERY long line (was 85).
23.2783 + Recognized openquotes on indented paragraphs as continuations
23.2784 + of the same speech.
23.2785 + Added "cf" to the okword list (how did I forget _that_?) and a few others.
23.2786 + Moved abbrev to okword and made it more general.
23.2787 + Removed requirement that PG_space_emdash be greater than
23.2788 + ten before turning off warnings about spaced dashes.
23.2789 + Added period to list of characters that might constitute a separator line.
23.2790 + Now checking for double punctuation (Thanks, David!)
23.2791 + Now if two spaced em-dashes on a line, reports both. (DW)
23.2792 + Bug: Wasn't catching spaced punctuation at line-end since I
23.2793 + added flgets in version .13 - fixed.
23.2794 + Bug: Wasn't catching spaced singlequotes - fixed
23.2795 + Now reads punctuated numbers like 1,000 as a single word.
23.2796 + (Used to give "standalone 1" type queries)
23.2797 + Changed paranoid mode - not including s and p options. -ex is now quite usable.
23.2798 + Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed
23.2799 + Bug: Sometimes gave _next_ line number for queried word at end of line - fixed
23.2800 +
23.2801 + 10/22/01 Released as .20
23.2802 +
23.2803 + ---------------
23.2804 +
23.2805 + Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!)
23.2806 + Reduced the number of hi-bit letters needed to stop reporting them
23.2807 + from 1/20 to 1/100 or 200 in total.
23.2808 + Added PG footer check.
23.2809 + Added the -h switch.
23.2810 + Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10
23.2811 + Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23"
23.2812 + Added unspaced brackets check when surrounded by alpha.
23.2813 + Removed all typo reporting unless the typo switch is on.
23.2814 + Added gcisalpha to ease over-reporting of 8-bit queries.
23.2815 + ECHO_SWITCH is now ON by default!
23.2816 + PARANOID_SWITCH is now ON by default!
23.2817 + Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg)
23.2818 + Checking for standalone lowercase "l"
23.2819 + Checking for standalone lowercase "s"
23.2820 + Considering "is be" and "be is" "be was" "was be" as he/be errors
23.2821 + Looking at punct at end of para
23.2822 +
23.2823 + 01/20/02 Released as .21
23.2824 +
23.2825 + ---------------
23.2826 +
23.2827 + Added VERBOSE_SWITCH to make it list everything. (George Davis)
23.2828 +
23.2829 + ---------------
23.2830 +
23.2831 + 02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have.
23.2832 + after which
23.2833 + This line caused a coredump on Solaris - fixed.
23.2834 + Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe
23.2835 + 03/09/02 Changed header recognition for another header change
23.2836 + Called it .24
23.2837 + 03/29/02 Added qword[][] so I can suppress massive overreporting
23.2838 + of queried "words" like "FN", "Wm.", "th'", people's
23.2839 + initials, chemical formulae and suchlike in some texts.
23.2840 + Called it .25
23.2841 + 04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed.
23.2842 + Added linecounts in overview mode.
23.2843 + Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done.
23.2844 + "m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that?
23.2845 + 07/07/02 Added GPL.
23.2846 + Added checking for broken em-dash at line-end (enddash)
23.2847 + Released as 0.95
23.2848 + 08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo.
23.2849 + Released as 0.96
23.2850 + 10/10/02 Suppressing some annoying multiple reports by default:
23.2851 + Standalone Ones, Asterisks, Square Brackets.
23.2852 + Digit 1 occurs often in many scientific texts.
23.2853 + Asterisk occurs often in multi-footnoted texts.
23.2854 + Mismatch Square Brackets occurs often in multi-para footnotes.
23.2855 + Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil.
23.2856 + . . . but it does more or less work for the main cases.
23.2857 + Removed uppercase within a word as a separate category so
23.2858 + that names like VanAllen get reported only once, like other
23.2859 + suspected typos.
23.2860 + 11/24/02 Fixed - -m switch wasn't looking at htmlnum in
23.2861 + loseentities (Thanks, Brett!)
23.2862 + Fixed bug which occasionally gave false warning of
23.2863 + paragraph starting with lowercase.
23.2864 + Added underscore as character not to query around doublequotes.
23.2865 + Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859"
23.2866 + . . . this is to help detect things like CP1252 characters.
23.2867 + Released as 0.97
23.2868 +
23.2869 + 12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell,
23.2870 + for doublequotes only. Replaces "Spaced quote", since it also covers that
23.2871 + case.
23.2872 + Added "warn_hyphen" to ease over-reporting of hyphens.
23.2873 +
23.2874 + 12/20/02 Added "extra period" checks.
23.2875 + Added single character line check
23.2876 + Added I" check - is usually an exclam
23.2877 + Released as 0.98
23.2878 +
23.2879 + 1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile()
23.2880 + from when I was looking at ways to identify markup. Refuses to
23.2881 + open files for *nix users with upcase in the filenames. Removed.
23.2882 + Fixed quickly and released as 0.981
23.2883 +
23.2884 + 1/8/03 Added "arid" to the list of typos, slightly against my better
23.2885 + judgement, but the DP gang are all excited about it. :-)
23.2886 + Added a check for comma followed by capital letter, where
23.2887 + a period has OCRed into a comma. (DW). Not sure about this
23.2888 + either; we'll see.
23.2889 + Compiling for Win32 to allow longfilenames.
23.2890 +
23.2891 + 6/1/04 A messy test release for DW to include the "gutcheck.typ"
23.2892 + process. And the gutcheck.jee trials. Removed "arid" --
23.2893 + it can go in gutcheck.typ
23.2894 +
23.2895 + Added checks for carats ^ and slants / but disabling slant
23.2896 + queries if more than 20 of them, because some people use them
23.2897 + for /italics/. Slants are commonly mistaken italic "I"s.
23.2898 +
23.2899 + Later: removed gutcheck.jee -- wrote jeebies instead.
23.2900 +
23.2901 +Random TODO:
23.2902 + Check brackets more closely, like quotes, so that it becomes
23.2903 + easy to find the error in long paragraphs full of brackets.
23.2904 +
23.2905 +
23.2906 + 11/4/04 Assorted cleanup. Fixed case where text started with an
23.2907 + unbalanced paragraph.
23.2908 +
23.2909 + 1/2/05 Has it really been that long? Added "nocomma", "noperiod" check.
23.2910 + Bits and pieces: improved isroman(). Added isletter().
23.2911 + Other stuff I never noted before this.
23.2912 +
23.2913 + 7/3/05 Stuck in a quick start on DP-markup ignoring
23.2914 + at BillFlis's suggestion.
23.2915 +
23.2916 + 1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in?
23.2917 + Don't count footer for dotcomma etc.
23.2918 +
23.2919 +
23.2920 +1 I
23.2921 +ail all
23.2922 +arc are
23.2923 +arid and
23.2924 +bad had
23.2925 +ball hall
23.2926 +band hand
23.2927 +bar her
23.2928 +bat but
23.2929 +be he
23.2930 +bead head
23.2931 +beads heads
23.2932 +bear hear
23.2933 +bit hit
23.2934 +bo be
23.2935 +boon been
23.2936 +borne home
23.2937 +bow how
23.2938 +bumbled humbled
23.2939 +car ear
23.2940 +carnage carriage
23.2941 +carne came
23.2942 +cast east
23.2943 +cat cut
23.2944 +cat eat
23.2945 +cheek check
23.2946 +clay day
23.2947 +coining coming
23.2948 +comer corner
23.2949 +die she
23.2950 +docs does
23.2951 +ease case
23.2952 +fail fall
23.2953 +fee he
23.2954 +haying having
23.2955 +ho he
23.2956 +ho who
23.2957 +hut but
23.2958 +is as
23.2959 +lie he
23.2960 +lime time
23.2961 +loth 10th
23.2962 +m in
23.2963 +modem modern
23.2964 +Ms his
23.2965 +ray away
23.2966 +ray my
23.2967 +ringer finger
23.2968 +ringers fingers
23.2969 +rioted noted
23.2970 +tho the
23.2971 +tie he
23.2972 +tie the
23.2973 +tier her
23.2974 +tight right
23.2975 +tile the
23.2976 +tiling thing
23.2977 +tip up
23.2978 +tram train
23.2979 +tune time
23.2980 +u "
23.2981 +wen well
23.2982 +yon you
23.2983 +
23.2984 +*********************************************************************/
23.2985 +
24.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
24.2 +++ b/bookloupe/bookloupe.typ.in Fri Jan 27 10:30:16 2012 +0000
24.3 @@ -0,0 +1,54 @@
24.4 +11
24.5 +44
24.6 +ms
24.7 +ail
24.8 +alien
24.9 +arc
24.10 +arid
24.11 +bar
24.12 +bat
24.13 +bo
24.14 +borne
24.15 +bow
24.16 +bum
24.17 +bumbled
24.18 +carnage
24.19 +carne
24.20 +cither
24.21 +coining
24.22 +comer
24.23 +cur
24.24 +docs
24.25 +eve
24.26 +eves
24.27 +gaming
24.28 +gram
24.29 +guru
24.30 +hag
24.31 +hare
24.32 +haying
24.33 +ho
24.34 +lime
24.35 +loth
24.36 +m
24.37 +modem
24.38 +nave
24.39 +ringer
24.40 +ringers
24.41 +riot
24.42 +rioted
24.43 +signer
24.44 +snore
24.45 +spam
24.46 +tho
24.47 +tier
24.48 +tile
24.49 +tiling
24.50 +tram
24.51 +tum
24.52 +tune
24.53 +u
24.54 +vas
24.55 +wag
24.56 +wen
24.57 +yon
25.1 --- a/configure.ac Fri Jan 27 00:28:11 2012 +0000
25.2 +++ b/configure.ac Fri Jan 27 10:30:16 2012 +0000
25.3 @@ -1,13 +1,13 @@
25.4 # -*- Autoconf -*-
25.5 # Process this file with autoconf to produce a configure script.
25.6
25.7 -AC_INIT([gutcheck],[1.50],[ali@juiblex.co.uk])
25.8 +AC_INIT([bookloupe],[1.50],[ali@juiblex.co.uk])
25.9 AC_PREREQ(2.59)
25.10 AC_CONFIG_AUX_DIR([config])
25.11 -AC_CONFIG_SRCDIR([gutcheck/gutcheck.c])
25.12 +AC_CONFIG_SRCDIR([bookloupe/bookloupe.c])
25.13 AC_CONFIG_FILES([Makefile
25.14 -gclib/Makefile
25.15 -gutcheck/Makefile
25.16 +bl/Makefile
25.17 +bookloupe/Makefile
25.18 test/Makefile
25.19 test/harness/Makefile
25.20 test/compatibility/Makefile
26.1 --- a/doc/Makefile.am Fri Jan 27 00:28:11 2012 +0000
26.2 +++ b/doc/Makefile.am Fri Jan 27 10:30:16 2012 +0000
26.3 @@ -1,3 +1,3 @@
26.4 -dist_pkgdata_DATA=gutcheck.txt gc-test.txt
26.5 +dist_pkgdata_DATA=bookloupe.txt loupe-test.txt
26.6
26.7 EXTRA_DIST=README-0.99
27.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
27.2 +++ b/doc/bookloupe.txt Fri Jan 27 10:30:16 2012 +0000
27.3 @@ -0,0 +1,742 @@
27.4 +
27.5 +
27.6 + Gutcheck documentation
27.7 +
27.8 +
27.9 +gutcheck: lists possible common formatting errors in a Project
27.10 +Gutenberg candidate file. It is a command line program and can be used
27.11 +under Win32 or Unix (gutcheck.c should compile anywhere; if it doesn't,
27.12 +tell me). For Windows-only people, there is an appendix at the end
27.13 +with brief instructions for running it.
27.14 +
27.15 +
27.16 +Current version: 0.99. Users of 0.98 see end of file for changes.
27.17 +
27.18 +You should also have received the licence file COPYING, a README file,
27.19 +gutcheck.c, the source code, and gutcheck.exe, a DOS executable, with
27.20 +this file.
27.21 +
27.22 +This software is Copyright Jim Tinsley 2000-2005.
27.23 +
27.24 +Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING.
27.25 +This is Free Software; you may redistribute it under certain conditions (GPL).
27.26 +
27.27 +See http://gutcheck.sourceforge.net for the latest version.
27.28 +
27.29 +
27.30 +Usage is: gutcheck [-setopxlywm] filename
27.31 + where:
27.32 + -s checks Single quotes
27.33 + -e switches off Echoing of lines
27.34 + -t checks Typos
27.35 + -o produces an Overview only
27.36 + -p sets strict quotes checking for Paragraphs
27.37 + -x (paranoid) switches OFF typo checking and extra checks
27.38 + -l turns off Line-end checks
27.39 + -y sets error messages to stdout
27.40 + -w is a special mode for web uploads (for future use)
27.41 + -v (verbose) forces individual reporting of minor problems
27.42 + -m interprets Markup of some common HTML tags and entities
27.43 + -u warns about words in a user-defined typo file gutcheck.typ
27.44 + -d ignores some DP-specific markup
27.45 +
27.46 +Running gutcheck without any parameters will display a brief help message.
27.47 +
27.48 +Sample usage:
27.49 +
27.50 + gutcheck warpeace.txt
27.51 +
27.52 +
27.53 +More detail:
27.54 +
27.55 + Echoing lines (-e to switch off)
27.56 +
27.57 + You may find it convenient, when reviewing Gutcheck's
27.58 + suggestions, to see the line that Gutcheck is questioning.
27.59 + That way, you can often see at a glance whether it is
27.60 + a real error that needs to be fixed, or a false positive
27.61 + that should be in the text, but Gutcheck's limited
27.62 + programming doesn't understand.
27.63 +
27.64 + By default, gutcheck echoes these lines, but if you don't
27.65 + want to see the lines referred to, -e will switch it OFF.
27.66 +
27.67 +
27.68 + Quotes (-s and -p switches)
27.69 +
27.70 + Gutcheck always looks for unbalanced doublequotes in a
27.71 + paragraph. It is a common convention for writers not to
27.72 + close quotes in a paragraph if the next paragraph opens
27.73 + with quotes and is a continuation by the same speaker.
27.74 +
27.75 + Gutcheck therefore does not normally report unclosed quotes
27.76 + if the next paragraph begins with a quote. If you need
27.77 + to see all unclosed quotes, even where the next paragraph
27.78 + begins with a quote, you should use the -p switch.
27.79 +
27.80 + Singlequotes (') are a problem, since the same character
27.81 + is used for an apostrophe. I'm not sure that it is
27.82 + possible to get 100% accuracy on singlequotes checking,
27.83 + particularly since dialect, quite common in PG texts,
27.84 + upsets the normal rules so badly. Consider the sentence:
27.85 + 'Tis often said that a man's a man for a' that.
27.86 + As humans, we recognize that both apostrophes are used
27.87 + for contractions rather than quotes, but it isn't easy
27.88 + to get a program to recognize that.
27.89 +
27.90 + Since Gutcheck makes too many mistakes when trying to match
27.91 + singlequotes, it doesn't look for unbalanced singlequotes
27.92 + unless you specify the -s switch.
27.93 +
27.94 + Consider these sentences, which illustrate the main cases:
27.95 +
27.96 + 'Tis often said that a fool and his money are soon parted.
27.97 +
27.98 + 'Becky's goin' home,' said Tom.
27.99 +
27.100 + The dogs' tails wagged in unison.
27.101 +
27.102 + Those 'pack dogs' of yours look more like wolves.
27.103 +
27.104 +
27.105 +
27.106 + Typos (-t switch)
27.107 +
27.108 + It's not Gutcheck's job to be a spelling checker, but it
27.109 + does check for a list of common typos and OCR errors if you
27.110 + use the -t switch. (The -x switch turns typo checking off.)
27.111 +
27.112 + It also checks for character combinations, especially involving
27.113 + h and b, which are often confused by OCR, that rarely or never
27.114 + occur. For example, it queries "tbe" in a word. Now, "the" often
27.115 + occurs, but "tbe" is very rare (heartbeat, hotbed), so I'm
27.116 + playing the odds - a few false positives for many errors found.
27.117 + Similarly with "ii", which is a very common OCR error.
27.118 +
27.119 + Gutcheck suppresses multiple reporting of the first 40 "typos"
27.120 + found. This is to remove the annoyance of seeing something like
27.121 + "FN" (footnote) or "LK" (initials) flagged as a typo 147 times
27.122 + in a text.
27.123 +
27.124 +
27.125 + Line-end checking (-l switch to disable)
27.126 +
27.127 + All PG texts should have a Carriage Return (CR - character 13)
27.128 + and a Line Feed (LF - character 10) at end of each line,
27.129 + regardless of what O/S you made them on. DOS/Windows, Unix
27.130 + and Mac have different conventions, but the final text should
27.131 + always use a CR/LF pair as its line terminator.
27.132 +
27.133 + By default, Gutcheck verifies that every line does have
27.134 + the correct terminator, but if you're on a work-in-progress
27.135 + in Linux, you might want to convert the line-ends as a final
27.136 + step, and not want to see thousands of errors every time you
27.137 + run Gutcheck before that final step, so you can turn off
27.138 + this checking with the -l switch.
27.139 +
27.140 +
27.141 + Paranoid mode (-x switch to disable: Trust No One :-)
27.142 +
27.143 + -x switches OFF typo-checking, the -t flag, automatically
27.144 + and some extra checks like standalone 1 and 0 queries.
27.145 +
27.146 +
27.147 + Overview mode (-o switch)
27.148 +
27.149 + This mode just gives a count of queries found
27.150 + instead of a detailed list.
27.151 +
27.152 +
27.153 + Header quote (-h switch)
27.154 +
27.155 + If you use the -h switch, gutcheck will also display
27.156 + the Title, Author, Release and Edition fields from the
27.157 + PG header. This is useful mostly for the automated
27.158 + checks we do on recently-posted texts.
27.159 +
27.160 +
27.161 + Errors to stdout (-y switch)
27.162 +
27.163 + If you're just running gutcheck normally, you can ignore
27.164 + this. It's only there for programs that provide a front
27.165 + end to gutcheck. It makes error messages appear within
27.166 + the output of gutcheck so that the front end knows whether
27.167 + gutcheck ran OK.
27.168 +
27.169 +
27.170 + Verbose reporting (-v switch)
27.171 +
27.172 + Normally, if gutcheck sees lots of long lines, short lines,
27.173 + spaced dashes, non-ASCII characters or dot-commas ".," it
27.174 + assumes these are features of the text, counts and summarizes
27.175 + them at the top of its report, but does not list them
27.176 + individually. If the -v switch is on, gutcheck will list them all.
27.177 +
27.178 +
27.179 + Markup interpretation (-m switch)
27.180 +
27.181 + Normally, gutcheck flags anything it suspects of being HTML
27.182 + markup as a possible error. When you use the -m switch,
27.183 + however, it matches anything that looks like markup against
27.184 + a short list of common HTML tags and entities. If the markup
27.185 + is in that list, it either ignores the markup, in the case
27.186 + of a tag, or "interprets" the markup as its nearest ASCII
27.187 + equivalent, in the case of an entity. So, for example, using
27.188 + this switch, gutcheck will "see"
27.189 +
27.190 + “He went <i>thataway!</i>”
27.191 +
27.192 + as
27.193 +
27.194 + "He went thataway!"
27.195 +
27.196 + and report accordingly.
27.197 +
27.198 + This switch does not, not, NOT check the validity of HTML;
27.199 + it exists so that you can run gutcheck on most HTML texts
27.200 + for PG, and get sane results. It does not support all tags.
27.201 + It does not support all entities. When it sees a tag or entity
27.202 + it does not recognize, it will query it as HTML just as if
27.203 + you hadn't specified the -m switch.
27.204 +
27.205 + Gutcheck 0.99 will automatically switch on markup interpretation
27.206 + if it sees a lot of tags that appear to be markup, so mostly, you
27.207 + won't have to specify this.
27.208 +
27.209 + User-defined typos (-u switch)
27.210 +
27.211 + If you have a file named gutcheck.typ either in your current
27.212 + working directory or in the directory from which you explicitly
27.213 + invoked gutcheck, but not necessarily on your path, and if you
27.214 + specify the -u switch, gutcheck will query any word specified
27.215 + in that file. The file is simple: one word, in lower case, per
27.216 + line. 999 lines are allowed for. Be careful not to put multiple
27.217 + words onto a line, or leave any rubbish other than the word on
27.218 + the line. You should have received a sample file gutcheck.typ
27.219 + with this package.
27.220 +
27.221 + Ignore DP markup (-d switch)
27.222 +
27.223 + Distributed Proofreaders (http://www.pgdp.net) is currently
27.224 + (2005) the main source of PG texts, and proofers there use
27.225 + special conventions. This switch understands those conventions,
27.226 + so that people can use gutcheck on files in process that still
27.227 + haven't had the special conventions removed yet. The special
27.228 + conventions supported in 0.99 are page-separators and
27.229 + "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/".
27.230 +
27.231 +
27.232 +You will probably only run gutcheck on a text once or maybe twice,
27.233 +just prior to uploading; it usually finds a few formatting problems;
27.234 +it also usually finds queries that aren't problems at all - it often
27.235 +questions Tables of Contents for having short lines, for example.
27.236 +These are called "false positives", and need a human to decide on
27.237 +them.
27.238 +
27.239 +The text should be standard prose, and already close to PG normal
27.240 +format (plain text, about 70 characters per line with blank lines
27.241 +between paragraphs).
27.242 +
27.243 +Gutcheck merely draws your attention to things that might be errors.
27.244 +It is NOT a substitute for human judgement. Formatting choices like
27.245 +short lines may be for a reason that this program can't understand.
27.246 +
27.247 +Even the most careful human proofing can leave errors behind in a
27.248 +text, and there are several automated checks you can do to help find
27.249 +them. Of these, spellchecking (with _very_ careful human judgement) is
27.250 +the most important and most useful.
27.251 +
27.252 +Gutcheck does perform some basic typo-checking if you ask it to,
27.253 +but its focus is on formatting errors specific to PG texts -
27.254 +mismatched quotes, non-ASCII characters, bad spacing, bad line
27.255 +length, HTML tags perhaps left from a conversion, unbalanced
27.256 +brackets.
27.257 +
27.258 +Suggestions for additional checks would be appreciated and duly
27.259 +considered, but no guarantees that they will be implemented.
27.260 +
27.261 +
27.262 +
27.263 +
27.264 + How do _I_ use it?
27.265 +
27.266 +Practically everyone I give gutcheck to asks me how _I_ use it.
27.267 +Well, when I get a text for posting, say filename.txt, I run
27.268 +
27.269 + gutcheck -o filename.txt
27.270 +
27.271 +That gives me a quick idea what I'm dealing with. It'll tell
27.272 +me what kind of problems gutcheck sees, and give me an idea
27.273 +of how much more work needs to be done on the text. Keep in
27.274 +mind that gutcheck doesn't do anything like a full spellcheck,
27.275 +but when I see a text that has a lot of problems, I assume that
27.276 +it probably needs a spellcheck too.
27.277 +
27.278 +Having got a feel for the ballpark, I run
27.279 +
27.280 + gutcheck filename.txt > jj
27.281 +
27.282 +where jj is my personal, all-purpose filename for temporary data
27.283 +that doesn't need to be kept. Then I open filename.txt and jj in
27.284 +a split-screen view in my editor, and work down the text, fixing
27.285 +whatever needs fixing, and skipping whatever doesn't. If your
27.286 +editor doesn't split-screen, you can get much the same effect by
27.287 +opening your original file in your normal editor, and jj (or your
27.288 +equivalent name) in something like Notepad, keeping both in view
27.289 +at the same time.
27.290 +
27.291 +Twice a day, an automatic process looks at all recently-posted
27.292 +texts, and emails Michael, me, and sometimes other people with
27.293 +their gutcheck summaries.
27.294 +
27.295 +
27.296 +
27.297 + Future development of gutcheck
27.298 +
27.299 +Gutcheck has gone about as far as it can, given its current
27.300 +structure. In order to add better singlequotes checking,
27.301 +sentence checking, better he/be checking and other good stuff
27.302 +that I'd like to see, I'll have to rewrite it from a different
27.303 +angle - looking at the syntax instead of the lines. And I'll
27.304 +probably get around to that sooner or later.
27.305 +
27.306 +Meantime, I'm just trying to get this version stabilized, so
27.307 +please report any bugs you find. When it is stable, I'll run
27.308 +up a Windows port for those timid souls who can't look a
27.309 +command line in the eye. :-)
27.310 +
27.311 +And I've started work on gutspell, a companion to gutcheck
27.312 +which will concentrate on spelling problems. PG spelling
27.313 +problems are unusual, since the range of texts we cover is
27.314 +so wide, and I'll be taking a somewhat unorthodox approach
27.315 +to writing this spelling-checker _specifically_ for texts
27.316 +containing a lot of dialect and uncommon words that have
27.317 +probably already been spell-checked against a standard
27.318 +modern dictionary.
27.319 +
27.320 +
27.321 +
27.322 +
27.323 +Explanations of common gutcheck messages:
27.324 +
27.325 + --> 74 lines in this file have white space at end
27.326 +
27.327 + PG texts shouldn't have extra white space added at end of line.
27.328 + Don't worry too much about this; they're not doing any harm,
27.329 + and they'll be removed during posting anyway.
27.330 +
27.331 +
27.332 + --> 348 lines in this file are short. Not reporting short lines.
27.333 + --> 84 lines in this file are long. Not reporting long lines.
27.334 + --> 8 lines in this file are VERY long!
27.335 +
27.336 + If there are a lot of long or short lines, Gutcheck won't list
27.337 + them individually. The short lines version of this message
27.338 + is commonly seen when gutchecking poetry and some plays, where
27.339 + the normal line length is shorter than the standard for prose.
27.340 + A "VERY long" line is one over 80 characters. You normally
27.341 + shouldn't have any of these, but sometimes you may have to render
27.342 + a table that must be that long, or some special preformatted
27.343 + quotation that can't be broken.
27.344 +
27.345 +
27.346 + --> There are 75 spaced dashes and em-dashes in this file. Not reporting them.
27.347 +
27.348 + The PG standard for an emdash--like these--is two minus signs
27.349 + with no spaces before or after them. However, some older texts
27.350 + used spaced dashes - like these -- and if there are very many
27.351 + such spaced dashes in the file, gutcheck just draws your
27.352 + attention to it and doesn't list them individually.
27.353 +
27.354 +
27.355 +
27.356 + Line 3020 - Non-ASCII character 233
27.357 +
27.358 + Standard PG texts should use only ASCII characters with values
27.359 + up to 127; however, non-English, accented characters can be
27.360 + represented according to several different non-ASCII encoding
27.361 + schemes, using values over 127. If you have a plain English text
27.362 + with a few accented characters in words like cafe or tete-a-tete,
27.363 + you should replace the accented characters with their unaccented
27.364 + versions. The English pound sign is another commonly-seen
27.365 + non-ASCII character. If you have enough non-ASCII characters in
27.366 + your text that you feel removing them would degrade your text
27.367 + unacceptably, you should probably consider doing an 8-bit text
27.368 + as well as a plain-ASCII version.
27.369 +
27.370 +
27.371 +
27.372 + Line 1207 - Non-ISO-8859 character 156
27.373 +
27.374 + Even in "8-bit" texts, there are distinctions between code sets.
27.375 + The ISO-8859 family of 8-bit code sets is the most commonly used
27.376 + in PG, and these sets do not define values in the range 128 through
27.377 + 159 as printable characters. It's quite common for someone on a
27.378 + Windows or Mac machine to use a non-ISO character inadvertently,
27.379 + so this message warns that the character is not only not ASCII,
27.380 + but also outside the ISO-8859 range.
27.381 +
27.382 +
27.383 +
27.384 + Line 46 - Tab character?
27.385 +
27.386 + Some editors and WPs will put in Tab characters (character 9) to
27.387 + indicate indented text. You should not use these in a PG text,
27.388 + because you can't be sure how they will appear on a reader's
27.389 + screen. Find the Tab, and replace it with the appropriate number
27.390 + of spaces.
27.391 +
27.392 +
27.393 + Line 1327 - Tilde character?
27.394 +
27.395 + The tilde character (~) might be legitimately used, but it's the
27.396 + character commonly used by OCR software to indicate a place where
27.397 + it couldn't make out the letter, so gutcheck flags it.
27.398 +
27.399 +
27.400 +
27.401 + Line 1347 - Asterisk?
27.402 +
27.403 + Asterisks are reported only in paranoid mode (see -x).
27.404 + Like tildes, they are often used to indicate errors, but they are
27.405 + also legitimately used as line delimiters and footnote markers.
27.406 +
27.407 +
27.408 +
27.409 + Line 1451 - Long line 129
27.410 +
27.411 + PG texts should have lines shorter than 76. There may be occasions
27.412 + where you decide that you really have to go out to 79 characters,
27.413 + but the sample above says that line 1451 is 129 characters long -
27.414 + probably two lines run together.
27.415 +
27.416 +
27.417 +
27.418 + Line 1590 - Short line?
27.419 +
27.420 + PG texts should have lines longer than 54 characters. However,
27.421 + there are special cases like poetry and tables of contents where
27.422 + the lines _should_ be shorter. So treat Gutcheck warnings about
27.423 + short lines carefully. Sometimes it's a genuine formatting
27.424 + problem; sometimes the line really needs to be short.
27.425 +
27.426 + Hint: gutcheck will not flag lines as short if they are indented
27.427 + - if they start with a space. I like to start inserted stanzas
27.428 + and other such items indented with a couple of spaces so that
27.429 + they stand out from the main text anyway.
27.430 +
27.431 +
27.432 +
27.433 + Line 1804 - Begins with punctuation?
27.434 +
27.435 + Lines should normally not begin with commas, periods and so on.
27.436 + An exception is ellipses . . . which can happen at start of line.
27.437 +
27.438 +
27.439 +
27.440 + Line 1850 - Spaced em-dash?
27.441 +
27.442 + The PG standard for an em-dash--like these--is two minus signs
27.443 + with no spaces before or after them. Gutcheck flags non-PG
27.444 + em-dashes - like this one. Normally, you will replace it with a
27.445 + PG-standard em-dash.
27.446 +
27.447 +
27.448 +
27.449 + Line 1904 - Query he/be error?
27.450 +
27.451 + Gutcheck makes a very minor effort to look for that scourge of all
27.452 + proofreaders, "be" replacing "he" or vice-versa, and draws your
27.453 + attention to it when it thinks it has found one.
27.454 +
27.455 +
27.456 +
27.457 + Line 2017 - Query digit in a1most
27.458 +
27.459 + The digit 1 is commonly OCRed for the letter l, the digit 0 for
27.460 + the letter O, and so on. When gutcheck sees a mix of digits and
27.461 + letters, it warns you. It may generate a false positive for
27.462 + something like 7am.
27.463 +
27.464 +
27.465 +
27.466 + Line 2083 - Query standalone 0
27.467 +
27.468 + In paranoid mode (see -x) only, gutcheck warns about the digit 0
27.469 + and the number 1 standing alone as a word. This can happen if the
27.470 + OCR misreads the words O or I.
27.471 +
27.472 +
27.473 +
27.474 + Line 2115 - Query word whetber
27.475 +
27.476 + If you have switched typo-checking on, gutcheck looks for
27.477 + potential typos, especially common h/b errors. It's not
27.478 + infallible; it sometimes queries legit words, but it's
27.479 + always worth taking a look.
27.480 +
27.481 +
27.482 +
27.483 + Line 2190 column 14 - Missing space?
27.484 +
27.485 + Omitting a space is a very common error,especially coming from
27.486 + OCRed text,and can be hard for a human to spot. The commas in
27.487 + the previous sentence illustrate the kind of thing I mean.
27.488 +
27.489 +
27.490 +
27.491 + Line 2240 column 48 - Spaced punctuation?
27.492 +
27.493 + The flip side of the "missing space" error , here , is when extra
27.494 + spaces are added before punctuation . Some old texts appear to add
27.495 + extra spaces around punctuation consistently, but this was a
27.496 + typographical convention rather than the author's intent, and the
27.497 + extra "spaces" should be removed when preparing a PG text.
27.498 +
27.499 +
27.500 +
27.501 + Line 2301 column 19 - Unspaced quotes?
27.502 +
27.503 + Another common spacing problem occurs in a phrase like "You wait
27.504 + there,"he said.
27.505 +
27.506 +
27.507 +
27.508 + Line 2385 column 27 - Wrongspaced quotes?
27.509 +
27.510 + As of version 0.98, gutcheck adds extra checks on whether a quote
27.511 + seems to be a start or end quote, and queries those that appear to
27.512 + be misplaced. This does give rise to false positives when quotes are
27.513 + nested, for example:
27.514 +
27.515 + "And how," she asked, "will your "friends" help you now?"
27.516 +
27.517 + but these false positives are worth it because of the many cases
27.518 + that this test catches, notably those like:
27.519 +
27.520 + "And how, "she said," will your friends help you now?"
27.521 +
27.522 + Sometimes a "wrongspaced quotes" query will arise because an earlier
27.523 + quote in the paragraph was omitted, so if the place specified seems
27.524 + to be OK, look back to see whether there's a problem in the preceding
27.525 + lines.
27.526 +
27.527 +
27.528 +
27.529 + Line 2400 - HTML Tag? <PRE>
27.530 +
27.531 + Some PG texts have been converted from HTML, and not all of the
27.532 + HTML tags have been removed.
27.533 +
27.534 +
27.535 +
27.536 + Line 2402 - HTML symbol? &emdash;
27.537 +
27.538 + Similarly, special HTML symbol characters can survive into PG
27.539 + texts. Can occasionally produce amusing false positives like
27.540 + . . . Marwick & Co were well known for it;
27.541 +
27.542 +
27.543 +
27.544 + Line 2540 - Mismatched quotes
27.545 +
27.546 + Another gutcheck mainstay - unclosed doublequotes in a paragraph.
27.547 + See the discussion of quotes in the switches section near the
27.548 + start of this file.
27.549 +
27.550 + Since the mismatch doesn't occur on any one line, gutcheck quotes
27.551 + the line number of the first blank line following the paragraph,
27.552 + since this is the point where it reconciles the count of quotes.
27.553 + However, if gutcheck is echoing lines, that is, you haven't used
27.554 + the -e switch, it will show the _first_ line of the paragraph,
27.555 + to help you find the place without using line numbers. The
27.556 + offending paragraph is therefore between the quoted line and
27.557 + the line number given.
27.558 +
27.559 +
27.560 +
27.561 + Line 2587 - Mismatched single quotes
27.562 +
27.563 + Only checked with the -s switch, since checking single quotes is
27.564 + not a very reliable process. Otherwise, the same logic as for
27.565 + doublequotes applies.
27.566 +
27.567 +
27.568 +
27.569 + Line 2877 - Mismatched round brackets?
27.570 +
27.571 + Also curly and square brackets. Texts with a lot of brackets, like
27.572 + plays with bracketed stage instructions, may have mismatches.
27.573 +
27.574 +
27.575 + Line 3150 - No CR?
27.576 + Line 3204 - Two successive CRs?
27.577 + Line 3281 position 75 - CR without LF?
27.578 +
27.579 + These are the invalid line-end warnings. See the discussion of
27.580 + line-end checking in the switches section near the start of this
27.581 + file. If you see these, and your editor doesn't show anything
27.582 + wrong, you should probably try deleting the characters just before
27.583 + and after the line end, and the line-end itself, then retyping the
27.584 + characters and the line-end.
27.585 +
27.586 +
27.587 + Line 2940 - Paragraph starts with lower-case
27.588 +
27.589 + A common error in an e-text is for an extra blank line
27.590 +
27.591 + to be put in, like the blank line above, and this often
27.592 + shows up as a new paragraph beginning with lower case.
27.593 + Sometimes the blank line is deliberate, as when a
27.594 + quotation is inserted in a speech. Use your judgement.
27.595 +
27.596 +
27.597 + Line 2987 - Extra period?
27.598 +
27.599 + An extra period. is a. common problem in OCRed text. and usually
27.600 + arises when a speck of dust on the page is mistaken for a period.
27.601 + or. as occasionally happens. when a comma loses its tail.
27.602 +
27.603 +
27.604 + Line 3012 column 12 - Double punctuation?
27.605 +
27.606 + Double punctuation., like that,, is a common typo and
27.607 + scanno. Some books have much legit double punctuation,
27.608 + like etc., etc., but it's worth checking anyway.
27.609 +
27.610 +
27.611 +
27.612 + * * * *
27.613 +
27.614 +For Windows-only users who are unfamiliar with DOS:
27.615 +
27.616 + If you're a Windows-only user, you need to save
27.617 + gutcheck.exe into the folder (directory) where the
27.618 + text file you want to check is. Let's say your
27.619 + text file is in C:\GUT, then you should save
27.620 + GUTCHECK.EXE into C:\GUT.
27.621 +
27.622 + Now get to a DOS prompt. You can do this by
27.623 + selecting the "Command Prompt" or "MS-DOS Prompt"
27.624 + option that will be somewhere on your
27.625 + Start/Programs menu.
27.626 +
27.627 + Now get into the C:\GUT directory.
27.628 + You can do this using the CD (change directory)
27.629 + command, like this:
27.630 + CD \GUT
27.631 + and your prompt will change to
27.632 + C:\GUT>
27.633 + so you know you're in the right place.
27.634 +
27.635 + Now type
27.636 + gutcheck yourfile.txt
27.637 + and you'll see gutcheck's report.
27.638 +
27.639 + By default, gutcheck prints its queries to screen.
27.640 + If you want to create a file of them, to edit
27.641 + against the text, you can use the greater-than
27.642 + sign (>) to tell it to output the report to a
27.643 + file. For example, if you want its report in a
27.644 + file called QUERIES.LST, you could type
27.645 +
27.646 + gutcheck yourfile.txt > queries.lst
27.647 +
27.648 + The queries.lst file will then contain the listing
27.649 + of possible formatting errors, and you can
27.650 + edit it alongside your text.
27.651 +
27.652 + Whatever you do, DON'T make the filename after
27.653 + the greater-than sign the name of a file already
27.654 + on your disk that you want to keep, because
27.655 + the greater-than sign will cause gutcheck to
27.656 + replace any existing file of that name.
27.657 +
27.658 + So, for example, if you have two Tolstoy files
27.659 + that you want to check, called WARPEACE.TXT and
27.660 + ANNAK.TXT, make sure that neither of these names
27.661 + is ever used following the greater-than sign.
27.662 + To check these correctly, you might do:
27.663 +
27.664 + gutcheck warpeace.txt >war.lst
27.665 +
27.666 + and
27.667 +
27.668 + gutcheck annak.txt > annak.lst
27.669 +
27.670 + separately. Then you can look at war.lst and annak.lst
27.671 + to see the gutcheck reports.
27.672 +
27.673 + * * * *
27.674 +
27.675 +
27.676 +For existing 0.98 users upgrading to 0.99:
27.677 +
27.678 + If you run on old 16-bit DOS or Windows 3.x, I'm afraid
27.679 + you're out of luck. I'm not saying it _can't_ be compiled
27.680 + to run on 16-bit, but the executable with the package is
27.681 + for Win32 only. *nix users won't notice the change at all.
27.682 +
27.683 +
27.684 + There are two new switches: -u and -d.
27.685 + See above for full rundown.
27.686 +
27.687 +
27.688 +Here's a list of the new errors:
27.689 +
27.690 + Line 1456 - Carat character?
27.691 +
27.692 + I^ve found a few.
27.693 +
27.694 +
27.695 + Line 1821 - Forward slash?
27.696 +
27.697 + Common error for italicized "I", or so /'ve found.
27.698 +
27.699 +
27.700 + Line 2139 - Query missing paragraph break?
27.701 +
27.702 + "Come here, son." "Do I _have_ to go, dad?"
27.703 + Like that. False positives in some texts. Sorry 'bout that,
27.704 + but these are often errors.
27.705 +
27.706 +
27.707 + Line 2200 - Query had/bad error?
27.708 +
27.709 + Clear enough. Doesn't catch as many as I'd like it to,
27.710 + but rarely gives false alarms.
27.711 +
27.712 +
27.713 + Line 2268 - Query punctuation after the?
27.714 +
27.715 + Some words, like "the", very rarely have punctuation
27.716 + following them. Others, like "Mrs", usually have a
27.717 + period, but never a comma. Occasional false positives.
27.718 +
27.719 +
27.720 + Line 2380 - Query possible scanno arid
27.721 +
27.722 + It found one of your user-defined typos when you
27.723 + used the -u switch.
27.724 +
27.725 +
27.726 + Line 2511 - Capital "S"?
27.727 +
27.728 + Surprisingly common specific case, like: Jane'S
27.729 +
27.730 +
27.731 + Line 3469 - endquote missing punctuation?
27.732 +
27.733 + OK. This one can really cause a lot of false positives
27.734 + in some books, but it switches itself off if it finds
27.735 + more than 20 in a text, unless you force it to list them
27.736 + all with the -v switch.
27.737 + "Hey, dad" Johnny said, "can we go now?"
27.738 + is a common punctuation-missing error.
27.739 +
27.740 +
27.741 + Line 4266 - Mismatched underscores?
27.742 +
27.743 + Like mismatched anything else!
27.744 +
27.745 +
28.1 --- a/doc/gc-test.txt Fri Jan 27 00:28:11 2012 +0000
28.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
28.3 @@ -1,64 +0,0 @@
28.4 - gutcheck test framework
28.5 - =======================
28.6 -
28.7 -Running existing testcases
28.8 ---------------------------
28.9 -
28.10 -The test harness (the program that runs a test) is called gc-test. The various
28.11 -testcases are stored in multiple text files, typically with a .tst extension.
28.12 -
28.13 -To run a testcase when all of gutcheck, gc-test and the testcase file are
28.14 -in the current directory simply do something like:
28.15 -
28.16 -% gc-test missing-space.tst
28.17 -
28.18 -from a command prompt. Under MS-Windows, this is called a command window and
28.19 -the prompt will normally look slightly different, eg.,
28.20 -
28.21 -C:\DP> gc-test missing-space.tst
28.22 -
28.23 -To run all the tests in the current directory, do something like this:
28.24 -
28.25 -% gc-test *.tst
28.26 -
28.27 -If gutcheck is not in the current directory, then you can set an environment
28.28 -variable (GUTCHECK) to point at it. For example, on MS-Windows you might do:
28.29 -
28.30 -C:\DP> set GUTCHECK=C:\GUTCHECK\GUTCHECK.EXE
28.31 -C:\DP> gc-test *.tst
28.32 -
28.33 -Writing your own testcases
28.34 ---------------------------
28.35 -
28.36 -Writing a new testcase is pretty painless. Most testcases follow this simple
28.37 -pattern:
28.38 -
28.39 - ┌──────────────────────────────────────────┐
28.40 - │**************** INPUT **************** │
28.41 - │"Look!John, over there!" │
28.42 - │**************** EXPECTED ****************│
28.43 - │ │
28.44 - │"Look!John, over there!" │
28.45 - │ Line 1 column 6 - Missing space? │
28.46 - └──────────────────────────────────────────┘
28.47 -
28.48 -The sixteen asterisks in this example form what is known as the "flag". This
28.49 -flag must come before and after all tags (eg., INPUT and EXPECTED). In the
28.50 -unlikely event that you need sixteen asterisks at the start of line of text,
28.51 -then simply choose a different flag and use it throughout the file (flags
28.52 -can be any sequence of ASCII characters except control codes and space).
28.53 -
28.54 -Note that the header that gutcheck normally outputs is not included in the
28.55 -expected output. This avoids problems with not knowing beforehand the name
28.56 -of the file that gutcheck will be asked to look at (and saves typing!).
28.57 -gutcheck prints a blank line before each warning. These are not part of the
28.58 -header and so do need to be included.
28.59 -
28.60 -To test that gutcheck produces no output, you still need to include
28.61 -an EXPECTED tag, just with no text following it. If there is no EXPECTED
28.62 -tag, then gc-test will consider that no expectation exists and won't check
28.63 -the output at all.
28.64 -
28.65 -There is no support yet for non-ASCII testcases, embedded linefeeds,
28.66 -passing command line options to gutcheck or for testcases which are
28.67 -expected to fail.
29.1 --- a/doc/gutcheck.txt Fri Jan 27 00:28:11 2012 +0000
29.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
29.3 @@ -1,742 +0,0 @@
29.4 -
29.5 -
29.6 - Gutcheck documentation
29.7 -
29.8 -
29.9 -gutcheck: lists possible common formatting errors in a Project
29.10 -Gutenberg candidate file. It is a command line program and can be used
29.11 -under Win32 or Unix (gutcheck.c should compile anywhere; if it doesn't,
29.12 -tell me). For Windows-only people, there is an appendix at the end
29.13 -with brief instructions for running it.
29.14 -
29.15 -
29.16 -Current version: 0.99. Users of 0.98 see end of file for changes.
29.17 -
29.18 -You should also have received the licence file COPYING, a README file,
29.19 -gutcheck.c, the source code, and gutcheck.exe, a DOS executable, with
29.20 -this file.
29.21 -
29.22 -This software is Copyright Jim Tinsley 2000-2005.
29.23 -
29.24 -Gutcheck comes wih ABSOLUTELY NO WARRANTY. For details, read the file COPYING.
29.25 -This is Free Software; you may redistribute it under certain conditions (GPL).
29.26 -
29.27 -See http://gutcheck.sourceforge.net for the latest version.
29.28 -
29.29 -
29.30 -Usage is: gutcheck [-setopxlywm] filename
29.31 - where:
29.32 - -s checks Single quotes
29.33 - -e switches off Echoing of lines
29.34 - -t checks Typos
29.35 - -o produces an Overview only
29.36 - -p sets strict quotes checking for Paragraphs
29.37 - -x (paranoid) switches OFF typo checking and extra checks
29.38 - -l turns off Line-end checks
29.39 - -y sets error messages to stdout
29.40 - -w is a special mode for web uploads (for future use)
29.41 - -v (verbose) forces individual reporting of minor problems
29.42 - -m interprets Markup of some common HTML tags and entities
29.43 - -u warns about words in a user-defined typo file gutcheck.typ
29.44 - -d ignores some DP-specific markup
29.45 -
29.46 -Running gutcheck without any parameters will display a brief help message.
29.47 -
29.48 -Sample usage:
29.49 -
29.50 - gutcheck warpeace.txt
29.51 -
29.52 -
29.53 -More detail:
29.54 -
29.55 - Echoing lines (-e to switch off)
29.56 -
29.57 - You may find it convenient, when reviewing Gutcheck's
29.58 - suggestions, to see the line that Gutcheck is questioning.
29.59 - That way, you can often see at a glance whether it is
29.60 - a real error that needs to be fixed, or a false positive
29.61 - that should be in the text, but Gutcheck's limited
29.62 - programming doesn't understand.
29.63 -
29.64 - By default, gutcheck echoes these lines, but if you don't
29.65 - want to see the lines referred to, -e will switch it OFF.
29.66 -
29.67 -
29.68 - Quotes (-s and -p switches)
29.69 -
29.70 - Gutcheck always looks for unbalanced doublequotes in a
29.71 - paragraph. It is a common convention for writers not to
29.72 - close quotes in a paragraph if the next paragraph opens
29.73 - with quotes and is a continuation by the same speaker.
29.74 -
29.75 - Gutcheck therefore does not normally report unclosed quotes
29.76 - if the next paragraph begins with a quote. If you need
29.77 - to see all unclosed quotes, even where the next paragraph
29.78 - begins with a quote, you should use the -p switch.
29.79 -
29.80 - Singlequotes (') are a problem, since the same character
29.81 - is used for an apostrophe. I'm not sure that it is
29.82 - possible to get 100% accuracy on singlequotes checking,
29.83 - particularly since dialect, quite common in PG texts,
29.84 - upsets the normal rules so badly. Consider the sentence:
29.85 - 'Tis often said that a man's a man for a' that.
29.86 - As humans, we recognize that both apostrophes are used
29.87 - for contractions rather than quotes, but it isn't easy
29.88 - to get a program to recognize that.
29.89 -
29.90 - Since Gutcheck makes too many mistakes when trying to match
29.91 - singlequotes, it doesn't look for unbalanced singlequotes
29.92 - unless you specify the -s switch.
29.93 -
29.94 - Consider these sentences, which illustrate the main cases:
29.95 -
29.96 - 'Tis often said that a fool and his money are soon parted.
29.97 -
29.98 - 'Becky's goin' home,' said Tom.
29.99 -
29.100 - The dogs' tails wagged in unison.
29.101 -
29.102 - Those 'pack dogs' of yours look more like wolves.
29.103 -
29.104 -
29.105 -
29.106 - Typos (-t switch)
29.107 -
29.108 - It's not Gutcheck's job to be a spelling checker, but it
29.109 - does check for a list of common typos and OCR errors if you
29.110 - use the -t switch. (The -x switch also turns typo checking on.)
29.111 -
29.112 - It also checks for character combinations, especially involving
29.113 - h and b, which are often confused by OCR, that rarely or never
29.114 - occur. For example, it queries "tbe" in a word. Now, "the" often
29.115 - occurs, but "tbe" is very rare (heartbeat, hotbed), so I'm
29.116 - playing the odds - a few false positives for many errors found.
29.117 - Similarly with "ii", which is a very common OCR error.
29.118 -
29.119 - Gutcheck suppresses multiple reporting of the first 40 "typos"
29.120 - found. This is to remove the annoyance of seeing something like
29.121 - "FN" (footnote) or "LK" (initials) flagged as a typo 147 times
29.122 - in a text.
29.123 -
29.124 -
29.125 - Line-end checking (-l switch to disable)
29.126 -
29.127 - All PG texts should have a Carriage Return (CR - character 13)
29.128 - and a Line Feed (LF - character 10) at end of each line,
29.129 - regardless of what O/S you made them on. DOS/Windows, Unix
29.130 - and Mac have different conventions, but the final text should
29.131 - always use a CR/LF pair as its line terminator.
29.132 -
29.133 - By default, Gutcheck verifies that every line does have
29.134 - the correct terminator, but if you're on a work-in-progress
29.135 - in Linux, you might want to convert the line-ends as a final
29.136 - step, and not want to see thousands of errors every time you
29.137 - run Gutcheck before that final step, so you can turn off
29.138 - this checking with the -l switch.
29.139 -
29.140 -
29.141 - Paranoid mode (-x switch to disable: Trust No One :-)
29.142 -
29.143 - -x switches OFF typo-checking, the -t flag, automatically
29.144 - and some extra checks like standalone 1 and 0 queries.
29.145 -
29.146 -
29.147 - Overview mode (-o switch)
29.148 -
29.149 - This mode just gives a count of queries found
29.150 - instead of a detailed list.
29.151 -
29.152 -
29.153 - Header quote (-h switch)
29.154 -
29.155 - If you use the -h switch, gutcheck will also display
29.156 - the Title, Author, Release and Edition fields from the
29.157 - PG header. This is useful mostly for the automated
29.158 - checks we do on recently-posted texts.
29.159 -
29.160 -
29.161 - Errors to stdout (-y switch)
29.162 -
29.163 - If you're just running gutcheck normally, you can ignore
29.164 - this. It's only there for programs that provide a front
29.165 - end to gutcheck. It makes error messages appear within
29.166 - the output of gutcheck so that the front end knows whether
29.167 - gutcheck ran OK.
29.168 -
29.169 -
29.170 - Verbose reporting (-v switch)
29.171 -
29.172 - Normally, if gutcheck sees lots of long lines, short lines,
29.173 - spaced dashes, non-ASCII characters or dot-commas ".," it
29.174 - assumes these are features of the text, counts and summarizes
29.175 - them at the top of its report, but does not list them
29.176 - individually. If the -v switch is on, gutcheck will list them all.
29.177 -
29.178 -
29.179 - Markup interpretation (-m switch)
29.180 -
29.181 - Normally, gutcheck flags anything it suspects of being HTML
29.182 - markup as a possible error. When you use the -m switch,
29.183 - however, it matches anything that looks like markup against
29.184 - a short list of common HTML tags and entities. If the markup
29.185 - is in that list, it either ignores the markup, in the case
29.186 - of a tag, or "interprets" the markup as its nearest ASCII
29.187 - equivalent, in the case of an entity. So, for example, using
29.188 - this switch, gutcheck will "see"
29.189 -
29.190 - &ldquo;He went <i>thataway!</i>&rdquo;
29.191 -
29.192 - as
29.193 -
29.194 - "He went thataway!"
29.195 -
29.196 - and report accordingly.
29.197 -
29.198 - This switch does not, not, NOT check the validity of HTML;
29.199 - it exists so that you can run gutcheck on most HTML texts
29.200 - for PG, and get sane results. It does not support all tags.
29.201 - It does not support all entities. When it sees a tag or entity
29.202 - it does not recognize, it will query it as HTML just as if
29.203 - you hadn't specified the -m switch.
29.204 -
29.205 - Gutcheck 0.99 will automatically switch on markup interpretation
29.206 - if it sees a lot of tags that appear to be markup, so mostly, you
29.207 - won't have to specify this.
29.208 -
29.209 - User-defined typos (-u switch)
29.210 -
29.211 - If you have a file named gutcheck.typ either in your current
29.212 - working directory or in the directory from which you explicitly
29.213 - invoked gutcheck, but not necessarily on your path, and if you
29.214 - specify the -u switch, gutcheck will query any word specified
29.215 - in that file. The file is simple: one word, in lower case, per
29.216 - line. 999 lines are allowed for. Be careful not to put multiple
29.217 - words onto a line, or leave any rubbish other than the word on
29.218 - the line. You should have received a sample file gutcheck.typ
29.219 - with this package.
29.220 -
29.221 - Ignore DP markup (-d switch)
29.222 -
29.223 - Distributed Proofreaders (http://www.pgdp.net) is currently
29.224 - (2005) the main source of PG texts, and proofers there use
29.225 - special conventions. This switch understands those conventions,
29.226 - so that people can use gutcheck on files in process that still
29.227 - haven't had the special conventions removed yet. The special
29.228 - conventions supported in 0.99 are page-separators and
29.229 - "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/".
29.230 -
29.231 -
29.232 -You will probably only run gutcheck on a text once or maybe twice,
29.233 -just prior to uploading; it usually finds a few formatting problems;
29.234 -it also usually finds queries that aren't problems at all - it often
29.235 -questions Tables of Contents for having short lines, for example.
29.236 -These are called "false positives", and need a human to decide on
29.237 -them.
29.238 -
29.239 -The text should be standard prose, and already close to PG normal
29.240 -format (plain text, about 70 characters per line with blank lines
29.241 -between paragraphs).
29.242 -
29.243 -Gutcheck merely draws your attention to things that might be errors.
29.244 -It is NOT a substitute for human judgement. Formatting choices like
29.245 -short lines may be for a reason that this program can't understand.
29.246 -
29.247 -Even the most careful human proofing can leave errors behind in a
29.248 -text, and there are several automated checks you can do to help find
29.249 -them. Of these, spellchecking (with _very_ careful human judgement) is
29.250 -the most important and most useful.
29.251 -
29.252 -Gutcheck does perform some basic typo-checking if you ask it to,
29.253 -but its focus is on formatting errors specific to PG texts -
29.254 -mismatched quotes, non-ASCII characters, bad spacing, bad line
29.255 -length, HTML tags perhaps left from a conversion, unbalanced
29.256 -brackets.
29.257 -
29.258 -Suggestions for additional checks would be appreciated and duly
29.259 -considered, but no guarantees that they will be implemented.
29.260 -
29.261 -
29.262 -
29.263 -
29.264 - How do _I_ use it?
29.265 -
29.266 -Practically everyone I give gutcheck to asks me how _I_ use it.
29.267 -Well, when I get a text for posting, say filename.txt, I run
29.268 -
29.269 - gutcheck -o filename.txt
29.270 -
29.271 -That gives me a quick idea what I'm dealing with. It'll tell
29.272 -me what kind of problems gutcheck sees, and give me an idea
29.273 -of how much more work needs to be done on the text. Keep in
29.274 -mind that gutcheck doesn't do anything like a full spellcheck,
29.275 -but when I see a text that has a lot of problems, I assume that
29.276 -it probably needs a spellcheck too.
29.277 -
29.278 -Having got a feel for the ballpark, I run
29.279 -
29.280 - gutcheck filename.txt > jj
29.281 -
29.282 -where jj is my personal, all-purpose filename for temporary data
29.283 -that doesn't need to be kept. Then I open filename.txt and jj in
29.284 -a split-screen view in my editor, and work down the text, fixing
29.285 -whatever needs fixing, and skipping whatever doesn't. If your
29.286 -editor doesn't split-screen, you can get much the same effect by
29.287 -opening your original file in your normal editor, and jj (or your
29.288 -equivalent name) in something like Notepad, keeping both in view
29.289 -at the same time.
29.290 -
29.291 -Twice a day, an automatic process looks at all recently-posted
29.292 -texts, and emails Michael, me, and sometimes other people with
29.293 -their gutcheck summaries.
29.294 -
29.295 -
29.296 -
29.297 - Future development of gutcheck
29.298 -
29.299 -Gutcheck has gone about as far as it can, given its current
29.300 -structure. In order to add better singlequotes checking,
29.301 -sentence checking, better he/be checking and other good stuff
29.302 -that I'd like to see, I'll have to rewrite it from a different
29.303 -angle - looking at the syntax instead of the lines. And I'll
29.304 -probably get around to that sooner or later.
29.305 -
29.306 -Meantime, I'm just trying to get this version stabilized, so
29.307 -please report any bugs you find. When it is stable, I'll run
29.308 -up a Windows port for those timid souls who can't look a
29.309 -command line in the eye. :-)
29.310 -
29.311 -And I've started work on gutspell, a companion to gutcheck
29.312 -which will concentrate on spelling problems. PG spelling
29.313 -problems are unusual, since the range of texts we cover is
29.314 -so wide, and I'll be taking a somewhat unorthodox approach
29.315 -to writing this spelling-checker _specifically_ for texts
29.316 -containing a lot of dialect and uncommon words that have
29.317 -probably already been spell-checked against a standard
29.318 -modern dictionary.
29.319 -
29.320 -
29.321 -
29.322 -
29.323 -Explanations of common gutcheck messages:
29.324 -
29.325 - --> 74 lines in this file have white space at end
29.326 -
29.327 - PG texts shouldn't have extra white space added at end of line.
29.328 - Don't worry too much about this; they're not doing any harm,
29.329 - and they'll be removed during posting anyway.
29.330 -
29.331 -
29.332 - --> 348 lines in this file are short. Not reporting short lines.
29.333 - --> 84 lines in this file are long. Not reporting long lines.
29.334 - --> 8 lines in this file are VERY long!
29.335 -
29.336 - If there are a lot of long or short lines, Gutcheck won't list
29.337 - them individually. The short lines version of this message
29.338 - is commonly seen when gutchecking poetry and some plays, where
29.339 - the normal line length is shorter than the standard for prose.
29.340 - A "VERY long" line is one over 80 characters. You normally
29.341 - shouldn't have any of these, but sometimes you may have to render
29.342 - a table that must be that long, or some special preformatted
29.343 - quotation that can't be broken.
29.344 -
29.345 -
29.346 - --> There are 75 spaced dashes and em-dashes in this file. Not reporting them.
29.347 -
29.348 - The PG standard for an emdash--like these--is two minus signs
29.349 - with no spaces before or after them. However, some older texts
29.350 - used spaced dashes - like these -- and if there are very many
29.351 - such spaced dashes in the file, gutcheck just draws your
29.352 - attention to it and doesn't list them individually.
29.353 -
29.354 -
29.355 -
29.356 - Line 3020 - Non-ASCII character 233
29.357 -
29.358 - Standard PG texts should use only ASCII characters with values
29.359 - up to 127; however, non-English, accented characters can be
29.360 - represented according to several different non-ASCII encoding
29.361 - schemes, using values over 127. If you have a plain English text
29.362 - with a few accented characters in words like cafe or tete-a-tete,
29.363 - you should replace the accented characters with their unaccented
29.364 - versions. The English pound sign is another commonly-seen
29.365 - non-ASCII character. If you have enough non-ASCII characters in
29.366 - your text that you feel removing them would degrade your text
29.367 - unacceptably, you should probably consider doing an 8-bit text
29.368 - as well as a plain-ASCII version.
29.369 -
29.370 -
29.371 -
29.372 - Line 1207 - Non-ISO-8859 character 156
29.373 -
29.374 - Even in "8-bit" texts, there are distinctions between code sets.
29.375 - The ISO-8859 family of 8-bit code sets is the most commonly used
29.376 - in PG, and these sets do not define values in the range 128 through
29.377 - 159 as printable characters. It's quite common for someone on a
29.378 - Windows or Mac machine to use a non-ISO character inadvertently,
29.379 - so this message warns that the character is not only not ASCII,
29.380 - but also outside the ISO-8859 range.
29.381 -
29.382 -
29.383 -
29.384 - Line 46 - Tab character?
29.385 -
29.386 - Some editors and WPs will put in Tab characters (character 9) to
29.387 - indicate indented text. You should not use these in a PG text,
29.388 - because you can't be sure how they will appear on a reader's
29.389 - screen. Find the Tab, and replace it with the appropriate number
29.390 - of spaces.
29.391 -
29.392 -
29.393 - Line 1327 - Tilde character?
29.394 -
29.395 - The tilde character (~) might be legitimately used, but it's the
29.396 - character commonly used by OCR software to indicate a place where
29.397 - it couldn't make out the letter, so gutcheck flags it.
29.398 -
29.399 -
29.400 -
29.401 - Line 1347 - Asterisk?
29.402 -
29.403 - Asterisks are reported only in paranoid mode (see -x).
29.404 - Like tildes, they are often used to indicate errors, but they are
29.405 - also legitimately used as line delimiters and footnote markers.
29.406 -
29.407 -
29.408 -
29.409 - Line 1451 - Long line 129
29.410 -
29.411 - PG texts should have lines shorter than 76. There may be occasions
29.412 - where you decide that you really have to go out to 79 characters,
29.413 - but the sample above says that line 1451 is 129 characters long -
29.414 - probably two lines run together.
29.415 -
29.416 -
29.417 -
29.418 - Line 1590 - Short line?
29.419 -
29.420 - PG texts should have lines longer than 54 characters. However,
29.421 - there are special cases like poetry and tables of contents where
29.422 - the lines _should_ be shorter. So treat Gutcheck warnings about
29.423 - short lines carefully. Sometimes it's a genuine formatting
29.424 - problem; sometimes the line really needs to be short.
29.425 -
29.426 - Hint: gutcheck will not flag lines as short if they are indented
29.427 - - if they start with a space. I like to start inserted stanzas
29.428 - and other such items indented with a couple of spaces so that
29.429 - they stand out from the main text anyway.
29.430 -
29.431 -
29.432 -
29.433 - Line 1804 - Begins with punctuation?
29.434 -
29.435 - Lines should normally not begin with commas, periods and so on.
29.436 - An exception is ellipses . . . which can happen at start of line.
29.437 -
29.438 -
29.439 -
29.440 - Line 1850 - Spaced em-dash?
29.441 -
29.442 - The PG standard for an em-dash--like these--is two minus signs
29.443 - with no spaces before or after them. Gutcheck flags non-PG
29.444 - em-dashes - like this one. Normally, you will replace it with a
29.445 - PG-standard em-dash.
29.446 -
29.447 -
29.448 -
29.449 - Line 1904 - Query he/be error?
29.450 -
29.451 - Gutcheck makes a very minor effort to look for that scourge of all
29.452 - proofreaders, "be" replacing "he" or vice-versa, and draws your
29.453 - attention to it when it thinks it has found one.
29.454 -
29.455 -
29.456 -
29.457 - Line 2017 - Query digit in a1most
29.458 -
29.459 - The digit 1 is commonly OCRed for the letter l, the digit 0 for
29.460 - the letter O, and so on. When gutcheck sees a mix of digits and
29.461 - letters, it warns you. It may generate a false positive for
29.462 - something like 7am.
29.463 -
29.464 -
29.465 -
29.466 - Line 2083 - Query standalone 0
29.467 -
29.468 - In paranoid mode (see -x) only, gutcheck warns about the digit 0
29.469 - and the number 1 standing alone as a word. This can happen if the
29.470 - OCR misreads the words O or I.
29.471 -
29.472 -
29.473 -
29.474 - Line 2115 - Query word whetber
29.475 -
29.476 - If you have switched typo-checking on, gutcheck looks for
29.477 - potential typos, especially common h/b errors. It's not
29.478 - infallible; it sometimes queries legit words, but it's
29.479 - always worth taking a look.
29.480 -
29.481 -
29.482 -
29.483 - Line 2190 column 14 - Missing space?
29.484 -
29.485 - Omitting a space is a very common error,especially coming from
29.486 - OCRed text,and can be hard for a human to spot. The commas in
29.487 - the previous sentence illustrate the kind of thing I mean.
29.488 -
29.489 -
29.490 -
29.491 - Line 2240 column 48 - Spaced punctuation?
29.492 -
29.493 - The flip side of the "missing space" error , here , is when extra
29.494 - spaces are added before punctuation . Some old texts appear to add
29.495 - extra spaces around punctuation consistently, but this was a
29.496 - typographical convention rather than the author's intent, and the
29.497 - extra "spaces" should be removed when preparing a PG text.
29.498 -
29.499 -
29.500 -
29.501 - Line 2301 column 19 - Unspaced quotes?
29.502 -
29.503 - Another common spacing problem occurs in a phrase like "You wait
29.504 - there,"he said.
29.505 -
29.506 -
29.507 -
29.508 - Line 2385 column 27 - Wrongspaced quotes?
29.509 -
29.510 - As of version 0.98, gutcheck adds extra checks on whether a quote
29.511 - seems to be a start or end quote, and queries those that appear to
29.512 - be misplaced. This does give rise to false positives when quotes are
29.513 - nested, for example:
29.514 -
29.515 - "And how," she asked, "will your "friends" help you now?"
29.516 -
29.517 - but these false positives are worth it because of the many cases
29.518 - that this test catches, notably those like:
29.519 -
29.520 - "And how, "she said," will your friends help you now?"
29.521 -
29.522 - Sometimes a "wrongspaced quotes" query will arise because an earlier
29.523 - quote in the paragraph was omitted, so if the place specified seems
29.524 - to be OK, look back to see whether there's a problem in the preceding
29.525 - lines.
29.526 -
29.527 -
29.528 -
29.529 - Line 2400 - HTML Tag? <PRE>
29.530 -
29.531 - Some PG texts have been converted from HTML, and not all of the
29.532 - HTML tags have been removed.
29.533 -
29.534 -
29.535 -
29.536 - Line 2402 - HTML symbol? &emdash;
29.537 -
29.538 - Similarly, special HTML symbol characters can survive into PG
29.539 - texts. Can occasionally produce amusing false positives like
29.540 - . . . Marwick & Co were well known for it;
29.541 -
29.542 -
29.543 -
29.544 - Line 2540 - Mismatched quotes
29.545 -
29.546 - Another gutcheck mainstay - unclosed doublequotes in a paragraph.
29.547 - See the discussion of quotes in the switches section near the
29.548 - start of this file.
29.549 -
29.550 - Since the mismatch doesn't occur on any one line, gutcheck quotes
29.551 - the line number of the first blank line following the paragraph,
29.552 - since this is the point where it reconciles the count of quotes.
29.553 - However, if gutcheck is echoing lines, that is, you haven't used
29.554 - the -e switch, it will show the _first_ line of the paragraph,
29.555 - to help you find the place without using line numbers. The
29.556 - offending paragraph is therefore between the quoted line and
29.557 - the line number given.
29.558 -
29.559 -
29.560 -
29.561 - Line 2587 - Mismatched single quotes
29.562 -
29.563 - Only checked with the -s switch, since checking single quotes is
29.564 - not a very reliable process. Otherwise, the same logic as for
29.565 - doublequotes applies.
29.566 -
29.567 -
29.568 -
29.569 - Line 2877 - Mismatched round brackets?
29.570 -
29.571 - Also curly and square brackets. Texts with a lot of brackets, like
29.572 - plays with bracketed stage instructions, may have mismatches.
29.573 -
29.574 -
29.575 - Line 3150 - No CR?
29.576 - Line 3204 - Two successive CRs?
29.577 - Line 3281 position 75 - CR without LF?
29.578 -
29.579 - These are the invalid line-end warnings. See the discussion of
29.580 - line-end checking in the switches section near the start of this
29.581 - file. If you see these, and your editor doesn't show anything
29.582 - wrong, you should probably try deleting the characters just before
29.583 - and after the line end, and the line-end itself, then retyping the
29.584 - characters and the line-end.
29.585 -
29.586 -
29.587 - Line 2940 - Paragraph starts with lower-case
29.588 -
29.589 - A common error in an e-text is for an extra blank line
29.590 -
29.591 - to be put in, like the blank line above, and this often
29.592 - shows up as a new paragraph beginning with lower case.
29.593 - Sometimes the blank line is deliberate, as when a
29.594 - quotation is inserted in a speech. Use your judgement.
29.595 -
29.596 -
29.597 - Line 2987 - Extra period?
29.598 -
29.599 - An extra period. is a. common problem in OCRed text. and usually
29.600 - arises when a speck of dust on the page is mistaken for a period.
29.601 - or. as occasionally happens. when a comma loses its tail.
29.602 -
29.603 -
29.604 - Line 3012 column 12 - Double punctuation?
29.605 -
29.606 - Double punctuation., like that,, is a common typo and
29.607 - scanno. Some books have much legit double punctuation,
29.608 - like etc., etc., but it's worth checking anyway.
29.609 -
29.610 -
29.611 -
29.612 - * * * *
29.613 -
29.614 -For Windows-only users who are unfamiliar with DOS:
29.615 -
29.616 - If you're a Windows-only user, you need to save
29.617 - gutcheck.exe into the folder (directory) where the
29.618 - text file you want to check is. Let's say your
29.619 - text file is in C:\GUT, then you should save
29.620 - GUTCHECK.EXE into C:\GUT.
29.621 -
29.622 - Now get to a DOS prompt. You can do this by
29.623 - selecting the "Command Prompt" or "MS-DOS Prompt"
29.624 - option that will be somewhere on your
29.625 - Start/Programs menu.
29.626 -
29.627 - Now get into the C:\GUT directory.
29.628 - You can do this using the CD (change directory)
29.629 - command, like this:
29.630 - CD \GUT
29.631 - and your prompt will change to
29.632 - C:\GUT>
29.633 - so you know you're in the right place.
29.634 -
29.635 - Now type
29.636 - gutcheck yourfile.txt
29.637 - and you'll see gutcheck's report
29.638 -
29.639 - By default, gutcheck prints its queries to screen.
29.640 - If you want to create a file of them, to edit
29.641 - against the text, you can use the greater-than
29.642 - sign (>) to tell it to output the report to a
29.643 - file. For example, if you want its report in a
29.644 - file called QUERIES.LST, you could type
29.645 -
29.646 - gutcheck yourfile.txt > queries.lst
29.647 -
29.648 - The queries.lst file will then contain the listing
29.649 - of possible formatting errors, and you can
29.650 - edit it alongside your text.
29.651 -
29.652 - Whatever you do, DON'T make the filename after
29.653 - the greater-than sign the name of a file already
29.654 - on your disk that you want to keep, because
29.655 - the greater-than sign will cause gutcheck to
29.656 - replace any existing file of that name.
29.657 -
29.658 - So, for example, if you have two Tolstoy files
29.659 - that you want to check, called WARPEACE.TXT and
29.660 - ANNAK.TXT, make sure that neither of these names
29.661 - is ever used following the greater-than sign.
29.662 - To check these correctly, you might do:
29.663 -
29.664 - gutcheck warpeace.txt >war.lst
29.665 -
29.666 - and
29.667 -
29.668 - gutcheck annak.txt > annak.lst
29.669 -
29.670 - separately. Then you can look at war.lst and annak.lst
29.671 - to see the gutcheck reports.
29.672 -
29.673 - * * * *
29.674 -
29.675 -
29.676 -For existing 0.98 users upgrading to 0.99:
29.677 -
29.678 - If you run on old 16-bit DOS or Windows 3.x, I'm afraid
29.679 - you're out of luck. I'm not saying it _can't_ be compiled
29.680 - to run on 16-bit, but the executable with the package is
29.681 - for Win32 only. *nix users won't notice the change at all.
29.682 -
29.683 -
29.684 - There are two new switches: -u and -d.
29.685 - See above for full rundown.
29.686 -
29.687 -
29.688 -Here's a list of the new errors:
29.689 -
29.690 - Line 1456 - Carat character?
29.691 -
29.692 - I^ve found a few.
29.693 -
29.694 -
29.695 - Line 1821 - Forward slash?
29.696 -
29.697 - Common error for italicized "I", or so /'ve found.
29.698 -
29.699 -
29.700 - Line 2139 - Query missing paragraph break?
29.701 -
29.702 - "Come here, son." "Do I _have_ to go, dad?"
29.703 - Like that. False positives in some texts. Sorry 'bout that,
29.704 - but these are often errors.
29.705 -
29.706 -
29.707 - Line 2200 - Query had/bad error?
29.708 -
29.709 - Clear enough. Doesn't catch as many as I'd like it to,
29.710 - but rarely gives false alarms.
29.711 -
29.712 -
29.713 - Line 2268 - Query punctuation after the?
29.714 -
29.715 - Some words, like "the", very rarely have punctuation
29.716 - following them. Others, like "Mrs", usually have a
29.717 - period, but never a comma. Occasional false positives.
29.718 -
29.719 -
29.720 - Line 2380 - Query possible scanno arid
29.721 -
29.722 - It found one of your user-defined typos when you
29.723 - used the -u switch.
29.724 -
29.725 -
29.726 - Line 2511 - Capital "S"?
29.727 -
29.728 - Surprisingly common specific case, like: Jane'S
29.729 -
29.730 -
29.731 - Line 3469 - endquote missing punctuation?
29.732 -
29.733 - OK. This one can really cause a lot of false positives
29.734 - in some books, but it switches itself off if it finds
29.735 - more than 20 in a text, unless you force it to list them
29.736 - all with the -v switch.
29.737 - "Hey, dad" Johnny said, "can we go now?"
29.738 - is a common punctuation-missing error.
29.739 -
29.740 -
29.741 - Line 4266 - Mismatched underscores?
29.742 -
29.743 - Like mismatched anything else!
29.744 -
29.745 -
30.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
30.2 +++ b/doc/loupe-test.txt Fri Jan 27 10:30:16 2012 +0000
30.3 @@ -0,0 +1,68 @@
30.4 + bookloupe test framework
30.5 + ========================
30.6 +
30.7 +Running existing testcases
30.8 +--------------------------
30.9 +
30.10 +The test harness (the program that runs a test) is called loupe-test. The
30.11 +various testcases are stored in multiple text files, typically with a .tst
30.12 +extension.
30.13 +
30.14 +To run a testcase when all of bookloupe, loupe-test and the testcase file are
30.15 +in the current directory simply do something like:
30.16 +
30.17 +% loupe-test missing-space.tst
30.18 +
30.19 +from a command prompt. Under MS-Windows, this is called a command window and
30.20 +the prompt will normally look slightly different, e.g.,
30.21 +
30.22 +C:\DP> loupe-test missing-space.tst
30.23 +
30.24 +To run all the tests in the current directory, do something like this:
30.25 +
30.26 +% loupe-test *.tst
30.27 +
30.28 +If bookloupe is not in the current directory or you want to run the testsuite
30.29 +against gutcheck (the program that bookloupe is based on), then you can set an
30.30 +environment variable (BOOKLOUPE) to point at it. For example, on MS-Windows
30.31 +you might do:
30.32 +
30.33 +C:\DP> set BOOKLOUPE=C:\GUTCHECK\GUTCHECK.EXE
30.34 +C:\DP> loupe-test *.tst
30.35 +
30.36 +Writing your own testcases
30.37 +--------------------------
30.38 +
30.39 +Writing a new testcase is pretty painless. Most testcases follow this simple
30.40 +pattern:
30.41 +
30.42 + ┌──────────────────────────────────────────┐
30.43 + │**************** INPUT **************** │
30.44 + │"Look!John, over there!" │
30.45 + │**************** EXPECTED ****************│
30.46 + │ │
30.47 + │"Look!John, over there!" │
30.48 + │ Line 1 column 6 - Missing space? │
30.49 + └──────────────────────────────────────────┘
30.50 +
30.51 +The sixteen asterisks in this example form what is known as the "flag". This
30.52 +flag must come before and after all tags (e.g., INPUT and EXPECTED). In the
30.53 +unlikely event that you need sixteen asterisks at the start of a line of text,
30.54 +then simply choose a different flag and use it throughout the file (flags
30.55 +can be any sequence of ASCII characters except control codes and space).
30.56 +
30.57 +Note that the header that bookloupe and gutcheck normally output is not
30.58 +included in the expected output. This avoids problems with not knowing
30.59 +beforehand the name of the file that bookloupe/gutcheck will be asked to
30.60 +look at (and saves typing!). bookloupe (and gutcheck) prints a blank line
30.61 +before each warning. These are not part of the header and so do need to
30.62 +be included.
30.63 +
30.64 +To test that bookloupe produces no output, you still need to include
30.65 +an EXPECTED tag, just with no text following it. If there is no EXPECTED
30.66 +tag, then loupe-test will consider that no expectation exists and won't
30.67 +check the output at all.
30.68 +
30.69 +There is no support yet for non-ASCII testcases, embedded linefeeds,
30.70 +passing command line options to bookloupe or for testcases which are
30.71 +expected to fail.
31.1 --- a/gclib/Makefile.am Fri Jan 27 00:28:11 2012 +0000
31.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
31.3 @@ -1,10 +0,0 @@
31.4 -INCLUDES=-I$(top_srcdir)
31.5 -AM_CFLAGS=$(GLIB_CFLAGS)
31.6 -LIBS=$(GLIB_LIBS)
31.7 -
31.8 -noinst_LTLIBRARIES=libgc.la
31.9 -libgc_la_SOURCES=gclib.h textfileutils.c textfileutils.h spawn.c spawn.h
31.10 -if !HAVE_GLIB
31.11 -libgc_la_SOURCES+=macros.h types.h fileutils.c fileutils.h mem.c mem.h \
31.12 - strfuncs.c strfuncs.h gcstring.c gcstring.h utils.c utils.h
31.13 -endif
32.1 --- a/gclib/fileutils.c Fri Jan 27 00:28:11 2012 +0000
32.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
32.3 @@ -1,46 +0,0 @@
32.4 -#include <stdlib.h>
32.5 -#include <stdio.h>
32.6 -#include <gclib/macros.h>
32.7 -#include <gclib/mem.h>
32.8 -#include <gclib/fileutils.h>
32.9 -#include <gclib/gcstring.h>
32.10 -
32.11 -/*
32.12 - * Read a file into memory (which should be freed with mem_free when no
32.13 - * longer required). Returns FALSE on error and outputs a suitable error
32.14 - * message to stderr.
32.15 - */
32.16 -boolean file_get_contents(const char *filename,char **contents,size_t *length)
32.17 -{
32.18 - FILE *fp;
32.19 - size_t n;
32.20 - char *buffer;
32.21 - String *string;
32.22 - fp=fopen(filename,"rb");
32.23 - if (!fp)
32.24 - {
32.25 - perror(filename);
32.26 - return FALSE;
32.27 - }
32.28 - buffer=mem_new(char,1024);
32.29 - string=string_new(NULL);
32.30 - do
32.31 - {
32.32 - n=fread(buffer,1,1024,fp);
32.33 - if (n<0)
32.34 - {
32.35 - perror(filename);
32.36 - string_free(string,TRUE);
32.37 - mem_free(buffer);
32.38 - free(fp);
32.39 - return FALSE;
32.40 - }
32.41 - string_append_len(string,buffer,n);
32.42 - } while(n);
32.43 - mem_free(buffer);
32.44 - if (length)
32.45 - *length=string->len;
32.46 - *contents=string_free(string,FALSE);
32.47 - fclose(fp);
32.48 - return TRUE;
32.49 -}
33.1 --- a/gclib/fileutils.h Fri Jan 27 00:28:11 2012 +0000
33.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
33.3 @@ -1,8 +0,0 @@
33.4 -#ifndef GC_FILEUTILS_H
33.5 -#define GC_FILEUTILS_H
33.6 -
33.7 -#include <gclib/types.h>
33.8 -
33.9 -boolean file_get_contents(const char *filename,char **contents,size_t *length);
33.10 -
33.11 -#endif /* GC_FILEUTILS_H */
34.1 --- a/gclib/gclib.h Fri Jan 27 00:28:11 2012 +0000
34.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
34.3 @@ -1,36 +0,0 @@
34.4 -#if HAVE_GLIB
34.5 -
34.6 -#include <glib.h>
34.7 -#define GC_DIR_SEPARATOR G_DIR_SEPARATOR
34.8 -#define GC_DIR_SEPARATOR_S G_DIR_SEPARATOR_S
34.9 -#define GC_IS_DIR_SEPARATOR(c) G_IS_DIR_SEPARATOR(c)
34.10 -#define boolean gboolean
34.11 -#define String GString
34.12 -#define mem_new0 g_new0
34.13 -#define mem_free g_free
34.14 -#define str_dup g_strdup
34.15 -#define str_ndup g_strndup
34.16 -#define path_get_basename g_path_get_basename
34.17 -#define file_get_contents(filename,contents,length) \
34.18 - g_file_get_contents(filename,contents,length,NULL)
34.19 -#define string_new g_string_new
34.20 -#define string_append g_string_append
34.21 -#define string_append_len g_string_append_len
34.22 -#define string_append_c g_string_append_c
34.23 -#define string_free g_string_free
34.24 -#define string_set_size g_string_set_size
34.25 -
34.26 -#else /* !HAVE_GLIB */
34.27 -
34.28 -#include <gclib/macros.h>
34.29 -#include <gclib/types.h>
34.30 -#include <gclib/mem.h>
34.31 -#include <gclib/fileutils.h>
34.32 -#include <gclib/strfuncs.h>
34.33 -#include <gclib/gcstring.h>
34.34 -#include <gclib/utils.h>
34.35 -
34.36 -#endif /* HAVE_GLIB */
34.37 -
34.38 -#include <gclib/textfileutils.h>
34.39 -#include <gclib/spawn.h>
35.1 --- a/gclib/gcstring.c Fri Jan 27 00:28:11 2012 +0000
35.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
35.3 @@ -1,90 +0,0 @@
35.4 -#include <stdlib.h>
35.5 -#include <string.h>
35.6 -#include <gclib/gcstring.h>
35.7 -#include <gclib/types.h>
35.8 -#include <gclib/mem.h>
35.9 -#include <gclib/strfuncs.h>
35.10 -
35.11 -/*
35.12 - * Strings which manage their own memory
35.13 - */
35.14 -
35.15 -String *string_new(const char *init)
35.16 -{
35.17 - String *string=mem_new(String,1);
35.18 - if (!init)
35.19 - init="";
35.20 - string->len=strlen(init);
35.21 - string->alloc=string->len+1;
35.22 - string->str=str_dup(init);
35.23 - return string;
35.24 -}
35.25 -
35.26 -/*
35.27 - * Free a string and either return the contents (if free_segment is FALSE)
35.28 - * or free the contents as well and return NULL (if free_segment is TRUE).
35.29 - */
35.30 -char *string_free(String *string,boolean free_segment)
35.31 -{
35.32 - char *retval;
35.33 - if (free_segment)
35.34 - {
35.35 - mem_free(string->str);
35.36 - retval=NULL;
35.37 - }
35.38 - else
35.39 - retval=string->str;
35.40 - mem_free(string);
35.41 - return retval;
35.42 -}
35.43 -
35.44 -/*
35.45 - * Append a byte to string.
35.46 - */
35.47 -void string_append_c(String *string,char c)
35.48 -{
35.49 - if (string->len+1==string->alloc)
35.50 - {
35.51 - string->alloc*=2;
35.52 - string->str=mem_renew(char,string->str,string->alloc);
35.53 - }
35.54 - string->str[string->len++]=c;
35.55 - string->str[string->len]='\0';
35.56 -}
35.57 -
35.58 -/*
35.59 - * Append len bytes from s to string. len may be passed as <0 if s is
35.60 - * a nul-terminated string of unknown length.
35.61 - */
35.62 -void string_append_len(String *string,const char *s,ssize_t len)
35.63 -{
35.64 - if (len<0)
35.65 - len=strlen(s);
35.66 - if (string->len+len>=string->alloc)
35.67 - {
35.68 - while (string->len+len>=string->alloc)
35.69 - string->alloc*=2;
35.70 - string->str=mem_renew(char,string->str,string->alloc);
35.71 - }
35.72 - memcpy(string->str+string->len,s,len);
35.73 - string->len+=len;
35.74 - string->str[string->len]='\0';
35.75 -}
35.76 -
35.77 -/*
35.78 - * Sets the length of a String. If the length is less than the current length,
35.79 - * the string will be truncated. If the length is greater than the current
35.80 - * length, the contents of the newly added area are undefined. (However, as
35.81 - * always, string->str[string->len] will be a nul byte.)
35.82 - */
35.83 -void string_set_size(String *string,size_t len)
35.84 -{
35.85 - if (len>=string->alloc)
35.86 - {
35.87 - while (len>=string->alloc)
35.88 - string->alloc*=2;
35.89 - string->str=mem_renew(char,string->str,string->alloc);
35.90 - }
35.91 - string->len=len;
35.92 - string->str[string->len]='\0';
35.93 -}
36.1 --- a/gclib/gcstring.h Fri Jan 27 00:28:11 2012 +0000
36.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
36.3 @@ -1,18 +0,0 @@
36.4 -#ifndef GC_STRING_H
36.5 -#define GC_STRING_H
36.6 -
36.7 -#include <unistd.h>
36.8 -#include <gclib/types.h>
36.9 -
36.10 -typedef struct {
36.11 - char *str;
36.12 - size_t alloc,len;
36.13 -} String;
36.14 -
36.15 -String *string_new(const char *init);
36.16 -char *string_free(String *string,boolean free_segment);
36.17 -void string_append_c(String *string,char c);
36.18 -void string_append_len(String *string,const char *s,ssize_t len);
36.19 -#define string_append(string,s) string_append_len(string,s,-1)
36.20 -
36.21 -#endif /* GC_STRING_H */
37.1 --- a/gclib/macros.h Fri Jan 27 00:28:11 2012 +0000
37.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
37.3 @@ -1,7 +0,0 @@
37.4 -#ifndef FALSE
37.5 -#define FALSE 0
37.6 -#endif
37.7 -
37.8 -#ifndef TRUE
37.9 -#define TRUE (!FALSE)
37.10 -#endif
38.1 --- a/gclib/mem.c Fri Jan 27 00:28:11 2012 +0000
38.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
38.3 @@ -1,54 +0,0 @@
38.4 -#include <stdlib.h>
38.5 -#include <stdio.h>
38.6 -#include <string.h>
38.7 -#include <gclib/mem.h>
38.8 -
38.9 -/*
38.10 - * A memory allocator that aborts on failure (so that the caller never
38.11 - * needs to handle out of memory, which we assume is very unlikely to
38.12 - * happen under normal circumstances on any modern machine).
38.13 - */
38.14 -void *mem_alloc(size_t nmemb,size_t size)
38.15 -{
38.16 - void *ptr=malloc(nmemb*size);
38.17 - if (!ptr)
38.18 - {
38.19 - fprintf(stderr,
38.20 - "Not enough memory to allocate %lu elements of %lu bytes.\n",
38.21 - (unsigned long)nmemb,(unsigned long)size);
38.22 - abort();
38.23 - }
38.24 - return ptr;
38.25 -}
38.26 -
38.27 -/*
38.28 - * As mem_new, but new memory is cleared to zero.
38.29 - */
38.30 -void *mem_alloc0(size_t nmemb,size_t size)
38.31 -{
38.32 - void *ptr=calloc(nmemb,size);
38.33 - if (!ptr)
38.34 - {
38.35 - fprintf(stderr,
38.36 - "Not enough memory to allocate %lu elements of %lu bytes.\n",
38.37 - (unsigned long)nmemb,(unsigned long)size);
38.38 - abort();
38.39 - }
38.40 - return ptr;
38.41 -}
38.42 -
38.43 -/*
38.44 - * Grow or shrink a memory block, aborting on failure.
38.45 - */
38.46 -void *mem_realloc(void *ptr,size_t nmemb,size_t size)
38.47 -{
38.48 - ptr=realloc(ptr,nmemb*size);
38.49 - if (!ptr)
38.50 - {
38.51 - fprintf(stderr,
38.52 - "Not enough memory to allocate %lu elements of %lu bytes.\n",
38.53 - (unsigned long)nmemb,(unsigned long)size);
38.54 - abort();
38.55 - }
38.56 - return ptr;
38.57 -}
39.1 --- a/gclib/mem.h Fri Jan 27 00:28:11 2012 +0000
39.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
39.3 @@ -1,13 +0,0 @@
39.4 -#ifndef GC_MEM_H
39.5 -#define GC_MEM_H
39.6 -
39.7 -void *mem_alloc(size_t nmemb,size_t size);
39.8 -void *mem_alloc0(size_t nmemb,size_t size);
39.9 -void *mem_realloc(void *ptr,size_t nmemb,size_t size);
39.10 -
39.11 -#define mem_new(type,n) ((type *)mem_alloc(n,sizeof(type)))
39.12 -#define mem_new0(type,n) ((type *)mem_alloc0(n,sizeof(type)))
39.13 -#define mem_renew(type,ptr,n) ((type *)mem_realloc(ptr,n,sizeof(type)))
39.14 -#define mem_free(ptr) free(ptr)
39.15 -
39.16 -#endif /* GC_MEM_H */
40.1 --- a/gclib/spawn.c Fri Jan 27 00:28:11 2012 +0000
40.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
40.3 @@ -1,84 +0,0 @@
40.4 -#include <stdlib.h>
40.5 -#include <stdio.h>
40.6 -#ifndef WIN32
40.7 -#include <sys/wait.h>
40.8 -#endif
40.9 -#include <gclib/gclib.h>
40.10 -
40.11 -#define SPAWN_BUFSIZE 128
40.12 -
40.13 -boolean spawn_sync(char **argv,char **standard_output,int *exit_status)
40.14 -{
40.15 -/* Don't use g_spawn_sync on WIN32 for now to avoid needing the helper */
40.16 -#if HAVE_GLIB && !defined(WIN32)
40.17 - char *standard_error;
40.18 - GError *error=NULL;
40.19 - gboolean retval;
40.20 - GSpawnFlags flags=G_SPAWN_SEARCH_PATH;
40.21 - if (!standard_output)
40.22 - flags=G_SPAWN_STDOUT_TO_DEV_NULL;
40.23 - retval=g_spawn_sync(NULL,argv,NULL,flags,NULL,NULL,standard_output,
40.24 - &standard_error,exit_status,&error);
40.25 - fputs(standard_error,stderr);
40.26 - g_free(standard_error);
40.27 - if (!retval)
40.28 - {
40.29 - fprintf(stderr,"%s\n",error->message);
40.30 - g_error_free(error);
40.31 - }
40.32 - else if (exit_status)
40.33 - *exit_status=WEXITSTATUS(*exit_status);
40.34 - return retval;
40.35 -#else
40.36 - FILE *fp;
40.37 - int i,r;
40.38 - size_t n,len;
40.39 - String *command_line,*string;
40.40 - command_line=string_new(NULL);
40.41 - for(i=0;argv[i];i++)
40.42 - {
40.43 - if (i)
40.44 - string_append_c(command_line,' ');
40.45 - string_append(command_line,argv[i]);
40.46 - }
40.47 - fp=popen(command_line->str,"r");
40.48 - string_free(command_line,TRUE);
40.49 - if (!fp)
40.50 - {
40.51 - perror(command_line->str);
40.52 - return FALSE;
40.53 - }
40.54 - string=string_new(NULL);
40.55 - do
40.56 - {
40.57 - len=string->len;
40.58 - string_set_size(string,len+SPAWN_BUFSIZE);
40.59 - n=fread(string->str+len,1,SPAWN_BUFSIZE,fp);
40.60 - if (n<0)
40.61 - {
40.62 - perror("fread");
40.63 - (void)pclose(fp);
40.64 - string_free(string,TRUE);
40.65 - return FALSE;
40.66 - }
40.67 - string_set_size(string,len+n);
40.68 - } while(n);
40.69 - r=pclose(fp);
40.70 - if (r<0)
40.71 - {
40.72 - perror("pclose");
40.73 - string_free(string,TRUE);
40.74 - return FALSE;
40.75 - }
40.76 - else
40.77 - {
40.78 - if (exit_status)
40.79 - *exit_status=r;
40.80 - if (standard_output)
40.81 - *standard_output=string_free(string,FALSE);
40.82 - else
40.83 - string_free(string,TRUE);
40.84 - return TRUE;
40.85 - }
40.86 -#endif
40.87 -}
41.1 --- a/gclib/spawn.h Fri Jan 27 00:28:11 2012 +0000
41.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
41.3 @@ -1,8 +0,0 @@
41.4 -#ifndef GC_SPAWN_H
41.5 -#define GC_SPAWN_H
41.6 -
41.7 -#include <gclib/gclib.h>
41.8 -
41.9 -boolean spawn_sync(char **argv,char **standard_output,int *exit_status);
41.10 -
41.11 -#endif /* GC_SPAWN_H */
42.1 --- a/gclib/strfuncs.c Fri Jan 27 00:28:11 2012 +0000
42.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
42.3 @@ -1,26 +0,0 @@
42.4 -#include <stdlib.h>
42.5 -#include <string.h>
42.6 -#include <gclib/mem.h>
42.7 -#include <gclib/strfuncs.h>
42.8 -
42.9 -/*
42.10 - * Like strndup, but only returns NULL if str is NULL.
42.11 - * Note that this routine copies n bytes rather than n characters.
42.12 - */
42.13 -char *str_ndup(const char *str,size_t n)
42.14 -{
42.15 - char *dup;
42.16 - if (!str)
42.17 - return NULL;
42.18 - dup=mem_alloc0(n+1,1);
42.19 - strncpy(dup,str,n);
42.20 - return dup;
42.21 -}
42.22 -
42.23 -/*
42.24 - * Like strdup, but only returns NULL if str is NULL.
42.25 - */
42.26 -char *str_dup(const char *str)
42.27 -{
42.28 - return str_ndup(str,strlen(str));
42.29 -}
43.1 --- a/gclib/strfuncs.h Fri Jan 27 00:28:11 2012 +0000
43.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
43.3 @@ -1,7 +0,0 @@
43.4 -#ifndef GC_STRFUNCS_H
43.5 -#define GC_STRFUNCS_H
43.6 -
43.7 -char *str_dup(const char *str);
43.8 -char *str_ndup(const char *str,size_t n);
43.9 -
43.10 -#endif /* GC_STRFUNCS_H */
44.1 --- a/gclib/textfileutils.c Fri Jan 27 00:28:11 2012 +0000
44.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
44.3 @@ -1,33 +0,0 @@
44.4 -#include <stdlib.h>
44.5 -#include <stdio.h>
44.6 -#include <gclib/gclib.h>
44.7 -
44.8 -/*
44.9 - * Read a file into memory (which should be freed with mem_free when no
44.10 - * longer required). Returns NULL on error and outputs a suitable error
44.11 - * message to stderr.
44.12 - * DOS-style line endings are handled transparently even on platforms which
44.13 - * don't normally use this format.
44.14 - */
44.15 -boolean file_get_contents_text(const char *filename,char **contents,
44.16 - size_t *length)
44.17 -{
44.18 - int i;
44.19 - char *raw;
44.20 - size_t raw_length;
44.21 - String *string;
44.22 - if (!file_get_contents(filename,&raw,&raw_length))
44.23 - return FALSE;
44.24 - string=string_new(NULL);
44.25 - for(i=0;i<raw_length;i++)
44.26 - if (raw[i]!='\r')
44.27 - string_append_c(string,raw[i]);
44.28 - mem_free(raw);
44.29 - if (length)
44.30 - *length=string->len;
44.31 - if (contents)
44.32 - *contents=string_free(string,FALSE);
44.33 - else
44.34 - string_free(string,TRUE);
44.35 - return TRUE;
44.36 -}
45.1 --- a/gclib/textfileutils.h Fri Jan 27 00:28:11 2012 +0000
45.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
45.3 @@ -1,9 +0,0 @@
45.4 -#ifndef GC_TEXTFILEUTILS_H
45.5 -#define GC_TEXTFILEUTILS_H
45.6 -
45.7 -#include <gclib/gclib.h>
45.8 -
45.9 -boolean file_get_contents_text(const char *filename,char **contents,
45.10 - size_t *length);
45.11 -
45.12 -#endif /* GC_TEXTFILEUTILS_H */
46.1 --- a/gclib/types.h Fri Jan 27 00:28:11 2012 +0000
46.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
46.3 @@ -1,6 +0,0 @@
46.4 -#ifndef GC_TYPES_H
46.5 -#define GC_TYPES_H
46.6 -
46.7 -typedef int boolean;
46.8 -
46.9 -#endif /* GC_TYPES_H */
47.1 --- a/gclib/utils.c Fri Jan 27 00:28:11 2012 +0000
47.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
47.3 @@ -1,46 +0,0 @@
47.4 -#include <stdlib.h>
47.5 -#include <string.h>
47.6 -#include <unistd.h>
47.7 -#include <gclib/mem.h>
47.8 -#include <gclib/strfuncs.h>
47.9 -#include <gclib/utils.h>
47.10 -
47.11 -#define is_valid_drive(d) ((d)>='a' && (d)<='z' || (d)>='A' && (d)<='Z')
47.12 -
47.13 -/*
47.14 - * Gets the last component of the filename. If filename ends with a directory
47.15 - * separator it gets the component before the last slash. If filename consists
47.16 - * only of directory separators (and on Windows, possibly a drive letter), a
47.17 - * single separator is returned. If filename is empty, it gets ".".
47.18 - */
47.19 -char *path_get_basename(const char *filename)
47.20 -{
47.21 - ssize_t base,last_nonslash;
47.22 - size_t len;
47.23 - char *retval;
47.24 - if (*filename=='\0')
47.25 - return str_dup(".");
47.26 - last_nonslash=strlen(filename)-1;
47.27 - while (last_nonslash>=0 && GC_IS_DIR_SEPARATOR(filename[last_nonslash]))
47.28 - last_nonslash--;
47.29 - if (last_nonslash<0)
47.30 - /* string only containing slashes */
47.31 - return str_dup(GC_DIR_SEPARATOR_S);
47.32 -#ifdef WIN32
47.33 - if (last_nonslash==1 && is_valid_drive(filename[0]) && filename[1]==':')
47.34 - /* string only containing slashes and a drive */
47.35 - return str_dup(GC_DIR_SEPARATOR_S);
47.36 -#endif
47.37 - base=last_nonslash;
47.38 - while (base>=0 && !GC_IS_DIR_SEPARATOR(filename[base]))
47.39 - base--;
47.40 -#ifdef WIN32
47.41 - if (base==-1 && is_valid_drive(filename[0]) && filename[1] == ':')
47.42 - base=1;
47.43 -#endif
47.44 - len=last_nonslash-base;
47.45 - retval=mem_alloc(len+1,1);
47.46 - memcpy(retval,filename+base+1,len);
47.47 - retval[len]='\0';
47.48 - return retval;
47.49 -}
48.1 --- a/gclib/utils.h Fri Jan 27 00:28:11 2012 +0000
48.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
48.3 @@ -1,16 +0,0 @@
48.4 -#ifndef GC_UTIL_H
48.5 -#define GC_UTIL_H
48.6 -
48.7 -#ifdef WIN32
48.8 -#define GC_DIR_SEPARATOR '\\'
48.9 -#define GC_DIR_SEPARATOR_S "\\"
48.10 -#define GC_IS_DIR_SEPARATOR(c) ((c)==GC_DIR_SEPARATOR || (c)=='/')
48.11 -#else
48.12 -#define GC_DIR_SEPARATOR '/'
48.13 -#define GC_DIR_SEPARATOR_S "/"
48.14 -#define GC_IS_DIR_SEPARATOR(c) ((c)==GC_DIR_SEPARATOR)
48.15 -#endif
48.16 -
48.17 -char *path_get_basename(const char *filename);
48.18 -
48.19 -#endif /* GC_UTIL_H */
49.1 --- a/gutcheck/Makefile.am Fri Jan 27 00:28:11 2012 +0000
49.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
49.3 @@ -1,8 +0,0 @@
49.4 -bin_PROGRAMS=gutcheck
49.5 -pkgdata_DATA=gutcheck.typ
49.6 -
49.7 -gutcheck.typ: gutcheck.typ.in
49.8 - sed 's/$$/\r/' $< > $@
49.9 -
49.10 -EXTRA_DIST=gutcheck.typ.in
49.11 -CLEANFILES=gutcheck.typ
50.1 --- a/gutcheck/gutcheck.c Fri Jan 27 00:28:11 2012 +0000
50.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
50.3 @@ -1,2982 +0,0 @@
50.4 -/*************************************************************************/
50.5 -/* gutcheck - check for assorted weirdnesses in a PG candidate text file */
50.6 -/* */
50.7 -/* Version 0.991 */
50.8 -/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
50.9 -/* */
50.10 -/* This program is free software; you can redistribute it and/or modify */
50.11 -/* it under the terms of the GNU General Public License as published by */
50.12 -/* the Free Software Foundation; either version 2 of the License, or */
50.13 -/* (at your option) any later version. */
50.14 -/* */
50.15 -/* This program is distributed in the hope that it will be useful, */
50.16 -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
50.17 -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
50.18 -/* GNU General Public License for more details. */
50.19 -/* */
50.20 -/* You should have received a copy of the GNU General Public License */
50.21 -/* along with this program; if not, write to the */
50.22 -/* Free Software Foundation, Inc., */
50.23 -/* 59 Temple Place, */
50.24 -/* Suite 330, */
50.25 -/* Boston, MA 02111-1307 USA */
50.26 -/* */
50.27 -/* */
50.28 -/* */
50.29 -/* Overview comments: */
50.30 -/* */
50.31 -/* If you're reading this, you're either interested in how to detect */
50.32 -/* formatting errors, or very very bored. */
50.33 -/* */
50.34 -/* Gutcheck is a homebrew formatting checker specifically for */
50.35 -/* spotting common formatting problems in a PG e-text. I typically */
50.36 -/* run it once or twice on a file I'm about to submit; it usually */
50.37 -/* finds a few formatting problems. It also usually finds lots of */
50.38 -/* queries that aren't problems at all; it _really_ doesn't like */
50.39 -/* the standard PG header, for example. It's optimized for straight */
50.40 -/* prose; poetry and non-fiction involving tables tend to trigger */
50.41 -/* false alarms. */
50.42 -/* */
50.43 -/* The code of gutcheck is not very interesting, but the experience */
50.44 -/* of what constitutes a possible error may be, and the best way to */
50.45 -/* illustrate that is by example. */
50.46 -/* */
50.47 -/* */
50.48 -/* Here are some common typos found in PG texts that gutcheck */
50.49 -/* will flag as errors: */
50.50 -/* */
50.51 -/* "Look!John , over there!" */
50.52 -/* <this is a HTML tag> */
50.53 -/* &so is this; */
50.54 -/* Margaret said: " Now you should start for school." */
50.55 -/* Margaret said: "Now you should start for school. (if end of para) */
50.56 -/* The horse is said to he worth a lot. */
50.57 -/* 0K - this'11 make you look close1y. */
50.58 -/* "If you do. you'll regret it!" */
50.59 -/* */
50.60 -/* There are some complications . The extra space left around that */
50.61 -/* period was an error . . . but that ellipsis wasn't. */
50.62 -/* */
50.63 -/* The last line of a paragraph */
50.64 -/* is usually short. */
50.65 -/* */
50.66 -/* This period is an error.But the periods in a.m. aren't. */
50.67 -/* */
50.68 -/* Checks that are do-able but not (well) implemented are: */
50.69 -/* Single-quote chcking. */
50.70 -/* Despite 3 attempts at it, singlequote checking is still */
50.71 -/* crap in gutcheck. It may not be possible without analysis */
50.72 -/* of the whole paragraph. */
50.73 -/* */
50.74 -/*************************************************************************/
50.75 -
50.76 -
50.77 -#include <stdio.h>
50.78 -#include <stdlib.h>
50.79 -#include <string.h>
50.80 -#include <ctype.h>
50.81 -
50.82 -#define MAXWORDLEN 80 /* max length of one word */
50.83 -#define LINEBUFSIZE 2048 /* buffer size for an input line */
50.84 -
50.85 -#define MAX_USER_TYPOS 1000
50.86 -#define USERTYPO_FILE "gutcheck.typ"
50.87 -
50.88 -#ifndef MAX_PATH
50.89 -#define MAX_PATH 16384
50.90 -#endif
50.91 -
50.92 -char aline[LINEBUFSIZE];
50.93 -char prevline[LINEBUFSIZE];
50.94 -
50.95 - /* Common typos. */
50.96 -char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad",
50.97 - "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om",
50.98 - "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
50.99 - "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign",
50.100 - "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere",
50.101 - "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev",
50.102 - "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk",
50.103 - "owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50.104 - "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry",
50.105 - "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
50.106 - "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae",
50.107 - "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
50.108 - "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
50.109 - "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera",
50.110 - "yeras", "yersa", "yoiu", "youve", "ytou", "yuor",
50.111 - /* added h/b words for version 12 - removed a few with "tbe" v.25 */
50.112 - "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind",
50.113 - "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates",
50.114 - "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing",
50.115 - "helieve", "henefit", "hetter", "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh",
50.116 - "meanwbile", "memher", "memhers", "numher", "numhers",
50.117 - "perbaps", "prohlem", "puhlic", "witbout",
50.118 - /* and a few more for .18 */
50.119 - "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo",
50.120 - "heside", "chapteb", "chaptee", "se",
50.121 - ""};
50.122 -
50.123 -char *usertypo[MAX_USER_TYPOS];
50.124 -
50.125 - /* Common abbreviations and other OK words not to query as typos. */
50.126 - /* 0.99 last-minute - removed "ms" */
50.127 -char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br",
50.128 - "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian",
50.129 - "hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten",
50.130 - ""};
50.131 -
50.132 - /* Common abbreviations that cause otherwise unexplained periods. */
50.133 -char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit",
50.134 - "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
50.135 - ""};
50.136 - /* Two-Letter combinations that rarely if ever start words, */
50.137 - /* but are common scannos or otherwise common letter */
50.138 - /* combinations. */
50.139 -char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl",
50.140 - "tn", "rn", "lt", "tj",
50.141 - "" };
50.142 -
50.143 - /* Two-Letter combinations that rarely if ever end words */
50.144 - /* but are common scannos or otherwise common letter */
50.145 - /* combinations */
50.146 -char *noend[] = { "cb", "gb", "pb", "sb", "tb",
50.147 - "wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl",
50.148 - "iy",
50.149 - ""};
50.150 -
50.151 -char *markup[] = { "a", "b", "big", "blockquote", "body", "br", "center",
50.152 - "col", "div", "em", "font", "h1", "h2", "h3", "h4",
50.153 - "h5", "h6", "head", "hr", "html", "i", "img", "li",
50.154 - "meta", "ol", "p", "pre", "small", "span", "strong",
50.155 - "sub", "sup", "table", "td", "tfoot", "thead", "title",
50.156 - "tr", "tt", "u", "ul",
50.157 - ""};
50.158 -
50.159 -char *DPmarkup[] = { "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>",
50.160 - ""}; /* <tb> added .991 */
50.161 -
50.162 -char *nocomma[] = { "the", "it's", "their", "an", "mrs", "a", "our", "that's",
50.163 - "its", "whose", "every", "i'll", "your", "my",
50.164 - "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd",
50.165 - "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
50.166 - "i'm", "during", "let", "toward", "among",
50.167 - ""};
50.168 -
50.169 -
50.170 -char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
50.171 - "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
50.172 - "i'll", "whose", "who", "because", "when", "let", "till", "very",
50.173 - "an", "among", "those", "into", "whom", "having", "thence",
50.174 - ""};
50.175 -
50.176 -
50.177 -char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* Carlo's old suggestion, updated .991 */
50.178 -
50.179 -struct {
50.180 - char *htmlent;
50.181 - char *htmlnum;
50.182 - char *textent;
50.183 - } entities[] = { "&", "&", "&",
50.184 - "<", "<", "<",
50.185 - ">", ">", ">",
50.186 - "°", "°", " degrees",
50.187 - "£", "£", "L",
50.188 - """, """, "\"", /* -- quotation mark = APL quote, */
50.189 - "Œ", "Œ", "OE", /* -- latin capital ligature OE, */
50.190 - "œ", "œ", "oe", /* -- latin small ligature oe, U+0153 ISOlat2 --> */
50.191 - "Š", "Š", "S", /* -- latin capital letter S with caron, */
50.192 - "š", "š", "s", /* -- latin small letter s with caron, */
50.193 - "Ÿ", "Ÿ", "Y", /* -- latin capital letter Y with diaeresis, */
50.194 - "ˆ", "ˆ", "", /* -- modifier letter circumflex accent, */
50.195 - "˜", "˜", "~", /* -- small tilde, U+02DC ISOdia --> */
50.196 - " ", " ", " ", /* -- en space, U+2002 ISOpub --> */
50.197 - " ", " ", " ", /* -- em space, U+2003 ISOpub --> */
50.198 - " ", " ", " ", /* -- thin space, U+2009 ISOpub --> */
50.199 - "–", "–", "-", /* -- en dash, U+2013 ISOpub --> */
50.200 - "—", "—", "--", /* -- em dash, U+2014 ISOpub --> */
50.201 - "‘", "‘", "'", /* -- left single quotation mark, */
50.202 - "’", "’", "'", /* -- right single quotation mark, */
50.203 - "‚", "‚", "'", /* -- single low-9 quotation mark, U+201A NEW --> */
50.204 - "“", "“", "\"", /* -- left double quotation mark, */
50.205 - "”", "”", "\"", /* -- right double quotation mark, */
50.206 - "„", "„", "\"", /* -- double low-9 quotation mark, U+201E NEW --> */
50.207 - "‹", "‹", "\"", /* -- single left-pointing angle quotation mark, */
50.208 - "›", "›", "\"", /* -- single right-pointing angle quotation mark, */
50.209 - " ", " ", " ", /* -- no-break space = non-breaking space, */
50.210 - "¡", "¡", "!", /* -- inverted exclamation mark, U+00A1 ISOnum --> */
50.211 - "¢", "¢", "c", /* -- cent sign, U+00A2 ISOnum --> */
50.212 - "£", "£", "L", /* -- pound sign, U+00A3 ISOnum --> */
50.213 - "¤", "¤", "$", /* -- currency sign, U+00A4 ISOnum --> */
50.214 - "¥", "¥", "Y", /* -- yen sign = yuan sign, U+00A5 ISOnum --> */
50.215 - "§", "§", "--", /* -- section sign, U+00A7 ISOnum --> */
50.216 - "¨", "¨", " ", /* -- diaeresis = spacing diaeresis, */
50.217 - "©", "©", "(C) ", /* -- copyright sign, U+00A9 ISOnum --> */
50.218 - "ª", "ª", " ", /* -- feminine ordinal indicator, U+00AA ISOnum --> */
50.219 - "«", "«", "\"", /* -- left-pointing double angle quotation mark */
50.220 - "­", "­", "-", /* -- soft hyphen = discretionary hyphen, */
50.221 - "®", "®", "(R) ", /* -- registered sign = registered trade mark sign, */
50.222 - "¯", "¯", " ", /* -- macron = spacing macron = overline */
50.223 - "°", "°", " degrees", /* -- degree sign, U+00B0 ISOnum --> */
50.224 - "±", "±", "+-", /* -- plus-minus sign = plus-or-minus sign, */
50.225 - "²", "²", "2", /* -- superscript two = superscript digit two */
50.226 - "³", "³", "3", /* -- superscript three = superscript digit three */
50.227 - "´", "´", " ", /* -- acute accent = spacing acute, */
50.228 - "µ", "µ", "m", /* -- micro sign, U+00B5 ISOnum --> */
50.229 - "¶", "¶", "--", /* -- pilcrow sign = paragraph sign, */
50.230 - "¸", "¸", " ", /* -- cedilla = spacing cedilla, U+00B8 ISOdia --> */
50.231 - "¹", "¹", "1", /* -- superscript one = superscript digit one, */
50.232 - "º", "º", " ", /* -- masculine ordinal indicator, */
50.233 - "»", "»", "\"", /* -- right-pointing double angle quotation mark */
50.234 - "¼", "¼", "1/4", /* -- vulgar fraction one quarter */
50.235 - "½", "½", "1/2", /* -- vulgar fraction one half */
50.236 - "¾", "¾", "3/4", /* -- vulgar fraction three quarters */
50.237 - "¿", "¿", "?", /* -- inverted question mark */
50.238 - "À", "À", "A", /* -- latin capital letter A with grave */
50.239 - "Á", "Á", "A", /* -- latin capital letter A with acute, */
50.240 - "Â", "Â", "A", /* -- latin capital letter A with circumflex, */
50.241 - "Ã", "Ã", "A", /* -- latin capital letter A with tilde, */
50.242 - "Ä", "Ä", "A", /* -- latin capital letter A with diaeresis, */
50.243 - "Å", "Å", "A", /* -- latin capital letter A with ring above */
50.244 - "Æ", "Æ", "AE", /* -- latin capital letter AE */
50.245 - "Ç", "Ç", "C", /* -- latin capital letter C with cedilla, */
50.246 - "È", "È", "E", /* -- latin capital letter E with grave, */
50.247 - "É", "É", "E", /* -- latin capital letter E with acute, */
50.248 - "Ê", "Ê", "E", /* -- latin capital letter E with circumflex, */
50.249 - "Ë", "Ë", "E", /* -- latin capital letter E with diaeresis, */
50.250 - "Ì", "Ì", "I", /* -- latin capital letter I with grave, */
50.251 - "Í", "Í", "I", /* -- latin capital letter I with acute, */
50.252 - "Î", "Î", "I", /* -- latin capital letter I with circumflex, */
50.253 - "Ï", "Ï", "I", /* -- latin capital letter I with diaeresis, */
50.254 - "Ð", "Ð", "E", /* -- latin capital letter ETH, U+00D0 ISOlat1 --> */
50.255 - "Ñ", "Ñ", "N", /* -- latin capital letter N with tilde, */
50.256 - "Ò", "Ò", "O", /* -- latin capital letter O with grave, */
50.257 - "Ó", "Ó", "O", /* -- latin capital letter O with acute, */
50.258 - "Ô", "Ô", "O", /* -- latin capital letter O with circumflex, */
50.259 - "Õ", "Õ", "O", /* -- latin capital letter O with tilde, */
50.260 - "Ö", "Ö", "O", /* -- latin capital letter O with diaeresis, */
50.261 - "×", "×", "*", /* -- multiplication sign, U+00D7 ISOnum --> */
50.262 - "Ø", "Ø", "O", /* -- latin capital letter O with stroke */
50.263 - "Ù", "Ù", "U", /* -- latin capital letter U with grave, */
50.264 - "Ú", "Ú", "U", /* -- latin capital letter U with acute, */
50.265 - "Û", "Û", "U", /* -- latin capital letter U with circumflex, */
50.266 - "Ü", "Ü", "U", /* -- latin capital letter U with diaeresis, */
50.267 - "Ý", "Ý", "Y", /* -- latin capital letter Y with acute, */
50.268 - "Þ", "Þ", "TH", /* -- latin capital letter THORN, */
50.269 - "ß", "ß", "sz", /* -- latin small letter sharp s = ess-zed, */
50.270 - "à", "à", "a", /* -- latin small letter a with grave */
50.271 - "á", "á", "a", /* -- latin small letter a with acute, */
50.272 - "â", "â", "a", /* -- latin small letter a with circumflex, */
50.273 - "ã", "ã", "a", /* -- latin small letter a with tilde, */
50.274 - "ä", "ä", "a", /* -- latin small letter a with diaeresis, */
50.275 - "å", "å", "a", /* -- latin small letter a with ring above */
50.276 - "æ", "æ", "ae", /* -- latin small letter ae */
50.277 - "ç", "ç", "c", /* -- latin small letter c with cedilla, */
50.278 - "è", "è", "e", /* -- latin small letter e with grave, */
50.279 - "é", "é", "e", /* -- latin small letter e with acute, */
50.280 - "ê", "ê", "e", /* -- latin small letter e with circumflex, */
50.281 - "ë", "ë", "e", /* -- latin small letter e with diaeresis, */
50.282 - "ì", "ì", "i", /* -- latin small letter i with grave, */
50.283 - "í", "í", "i", /* -- latin small letter i with acute, */
50.284 - "î", "î", "i", /* -- latin small letter i with circumflex, */
50.285 - "ï", "ï", "i", /* -- latin small letter i with diaeresis, */
50.286 - "ð", "ð", "eth", /* -- latin small letter eth, U+00F0 ISOlat1 --> */
50.287 - "ñ", "ñ", "n", /* -- latin small letter n with tilde, */
50.288 - "ò", "ò", "o", /* -- latin small letter o with grave, */
50.289 - "ó", "ó", "o", /* -- latin small letter o with acute, */
50.290 - "ô", "ô", "o", /* -- latin small letter o with circumflex, */
50.291 - "õ", "õ", "o", /* -- latin small letter o with tilde, */
50.292 - "ö", "ö", "o", /* -- latin small letter o with diaeresis, */
50.293 - "÷", "÷", "/", /* -- division sign, U+00F7 ISOnum --> */
50.294 - "ø", "ø", "o", /* -- latin small letter o with stroke, */
50.295 - "ù", "ù", "u", /* -- latin small letter u with grave, */
50.296 - "ú", "ú", "u", /* -- latin small letter u with acute, */
50.297 - "û", "û", "u", /* -- latin small letter u with circumflex, */
50.298 - "ü", "ü", "u", /* -- latin small letter u with diaeresis, */
50.299 - "ý", "ý", "y", /* -- latin small letter y with acute, */
50.300 - "þ", "þ", "th", /* -- latin small letter thorn, */
50.301 - "ÿ", "ÿ", "y", /* -- latin small letter y with diaeresis, */
50.302 - "", "" };
50.303 -
50.304 -/* ---- list of special characters ---- */
50.305 -#define CHAR_SPACE 32
50.306 -#define CHAR_TAB 9
50.307 -#define CHAR_LF 10
50.308 -#define CHAR_CR 13
50.309 -#define CHAR_DQUOTE 34
50.310 -#define CHAR_SQUOTE 39
50.311 -#define CHAR_OPEN_SQUOTE 96
50.312 -#define CHAR_TILDE 126
50.313 -#define CHAR_ASTERISK 42
50.314 -#define CHAR_FORESLASH 47
50.315 -#define CHAR_CARAT 94
50.316 -
50.317 -#define CHAR_UNDERSCORE '_'
50.318 -#define CHAR_OPEN_CBRACK '{'
50.319 -#define CHAR_CLOSE_CBRACK '}'
50.320 -#define CHAR_OPEN_RBRACK '('
50.321 -#define CHAR_CLOSE_RBRACK ')'
50.322 -#define CHAR_OPEN_SBRACK '['
50.323 -#define CHAR_CLOSE_SBRACK ']'
50.324 -
50.325 -
50.326 -
50.327 -
50.328 -
50.329 -/* ---- longest and shortest normal PG line lengths ----*/
50.330 -#define LONGEST_PG_LINE 75
50.331 -#define WAY_TOO_LONG 80
50.332 -#define SHORTEST_PG_LINE 55
50.333 -
50.334 -#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
50.335 - /* D - ignore DP-specific markup */
50.336 - /* E - echo queried line */
50.337 - /* S - check single quotes */
50.338 - /* T - check common typos */
50.339 - /* P - require closure of quotes on */
50.340 - /* every paragraph */
50.341 - /* X - "Trust no one" :-) Paranoid! */
50.342 - /* Queries everything */
50.343 - /* L - line end checking defaults on */
50.344 - /* -L turns it off */
50.345 - /* O - overview. Just shows counts. */
50.346 - /* Y - puts errors to stdout */
50.347 - /* instead of stderr */
50.348 - /* H - Echoes header fields */
50.349 - /* M - Ignore markup in < > */
50.350 - /* U - Use file of User-defined Typos*/
50.351 - /* W - Defaults for use on Web upload*/
50.352 - /* V - Verbose - list EVERYTHING! */
50.353 -#define SWITNO 14 /* max number of switch parms */
50.354 - /* - used for defining array-size */
50.355 -#define MINARGS 1 /* minimum no of args excl switches */
50.356 -#define MAXARGS 1 /* maximum no of args excl switches */
50.357 -
50.358 -int pswit[SWITNO]; /* program switches set by SWITCHES */
50.359 -
50.360 -#define ECHO_SWITCH 0
50.361 -#define SQUOTE_SWITCH 1
50.362 -#define TYPO_SWITCH 2
50.363 -#define QPARA_SWITCH 3
50.364 -#define PARANOID_SWITCH 4
50.365 -#define LINE_END_SWITCH 5
50.366 -#define OVERVIEW_SWITCH 6
50.367 -#define STDOUT_SWITCH 7
50.368 -#define HEADER_SWITCH 8
50.369 -#define WEB_SWITCH 9
50.370 -#define VERBOSE_SWITCH 10
50.371 -#define MARKUP_SWITCH 11
50.372 -#define USERTYPO_SWITCH 12
50.373 -#define DP_SWITCH 13
50.374 -
50.375 -
50.376 -
50.377 -long cnt_dquot; /* for overview mode, count of doublequote queries */
50.378 -long cnt_squot; /* for overview mode, count of singlequote queries */
50.379 -long cnt_brack; /* for overview mode, count of brackets queries */
50.380 -long cnt_bin; /* for overview mode, count of non-ASCII queries */
50.381 -long cnt_odd; /* for overview mode, count of odd character queries */
50.382 -long cnt_long; /* for overview mode, count of long line errors */
50.383 -long cnt_short; /* for overview mode, count of short line queries */
50.384 -long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
50.385 -long cnt_dash; /* for overview mode, count of dash-related queries */
50.386 -long cnt_word; /* for overview mode, count of word queries */
50.387 -long cnt_html; /* for overview mode, count of html queries */
50.388 -long cnt_lineend; /* for overview mode, count of line-end queries */
50.389 -long cnt_spacend; /* count of lines with space at end V .21 */
50.390 -long linecnt; /* count of total lines in the file */
50.391 -long checked_linecnt; /* count of lines actually gutchecked V .26 */
50.392 -
50.393 -void proghelp(void);
50.394 -void procfile(char *);
50.395 -
50.396 -#define LOW_THRESHOLD 0
50.397 -#define HIGH_THRESHOLD 1
50.398 -
50.399 -#define START 0
50.400 -#define END 1
50.401 -#define PREV 0
50.402 -#define NEXT 1
50.403 -#define FIRST_OF_PAIR 0
50.404 -#define SECOND_OF_PAIR 1
50.405 -
50.406 -#define MAX_WORDPAIR 1000
50.407 -
50.408 -char running_from[MAX_PATH];
50.409 -
50.410 -int mixdigit(char *);
50.411 -char *getaword(char *, char *);
50.412 -int matchword(char *, char *);
50.413 -char *flgets(char *, int, FILE *, long);
50.414 -void lowerit(char *);
50.415 -int gcisalpha(unsigned char);
50.416 -int gcisdigit(unsigned char);
50.417 -int gcisletter(unsigned char);
50.418 -char *gcstrchr(char *s, char c);
50.419 -void postprocess_for_HTML(char *);
50.420 -char *linehasmarkup(char *);
50.421 -char *losemarkup(char *);
50.422 -int tagcomp(char *, char *);
50.423 -char *loseentities(char *);
50.424 -int isroman(char *);
50.425 -int usertypo_count;
50.426 -void postprocess_for_DP(char *);
50.427 -
50.428 -char wrk[LINEBUFSIZE];
50.429 -
50.430 -/* This is disgustingly lazy, predefining max words & lengths, */
50.431 -/* but now I'm out of 16-bit restrictions, what's a couple of K? */
50.432 -#define MAX_QWORD 50
50.433 -#define MAX_QWORD_LENGTH 40
50.434 -char qword[MAX_QWORD][MAX_QWORD_LENGTH];
50.435 -char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
50.436 -signed int dupcnt[MAX_QWORD];
50.437 -
50.438 -
50.439 -
50.440 -
50.441 -int main(int argc, char **argv)
50.442 -{
50.443 - char *argsw, *s;
50.444 - int i, switno, invarg;
50.445 - char usertypo_file[MAX_PATH];
50.446 - FILE *usertypofile;
50.447 -
50.448 -
50.449 - if (strlen(argv[0]) < sizeof(running_from))
50.450 - strcpy(running_from, argv[0]); /* save the path to the executable gutcheck */
50.451 -
50.452 - /* find out what directory we're running from */
50.453 - for (s = running_from + strlen(running_from); *s != '/' && *s != '\\' && s >= running_from; s--)
50.454 - *s = 0;
50.455 -
50.456 -
50.457 - switno = strlen(SWITCHES);
50.458 - for (i = switno ; --i >0 ; )
50.459 - pswit[i] = 0; /* initialise switches */
50.460 -
50.461 - /* Standard loop to extract switches. */
50.462 - /* When we come out of this loop, the arguments will be */
50.463 - /* in argv[0] upwards and the switches used will be */
50.464 - /* represented by their equivalent elements in pswit[] */
50.465 - while ( --argc > 0 && **++argv == '-')
50.466 - for (argsw = argv[0]+1; *argsw !='\0'; argsw++)
50.467 - for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; )
50.468 - if ((toupper(*argsw)) == SWITCHES[i] ) {
50.469 - invarg = 0;
50.470 - pswit[i] = 1;
50.471 - }
50.472 -
50.473 - pswit[PARANOID_SWITCH] ^= 1; /* Paranoid checking is turned OFF, not on, by its switch */
50.474 -
50.475 - if (pswit[PARANOID_SWITCH]) { /* if running in paranoid mode */
50.476 - pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1; /* force typo checks as well */
50.477 - } /* v.20 removed s and p switches from paranoid mode */
50.478 -
50.479 - pswit[LINE_END_SWITCH] ^= 1; /* Line-end checking is turned OFF, not on, by its switch */
50.480 - pswit[ECHO_SWITCH] ^= 1; /* V.21 Echoing is turned OFF, not on, by its switch */
50.481 -
50.482 - if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */
50.483 - pswit[ECHO_SWITCH] = 0;
50.484 -
50.485 - /* Web uploads - for the moment, this is really just a placeholder */
50.486 - /* until we decide what processing we really want to do on web uploads */
50.487 - if (pswit[WEB_SWITCH]) { /* specific override for web uploads */
50.488 - pswit[ECHO_SWITCH] = 1;
50.489 - pswit[SQUOTE_SWITCH] = 0;
50.490 - pswit[TYPO_SWITCH] = 1;
50.491 - pswit[QPARA_SWITCH] = 0;
50.492 - pswit[PARANOID_SWITCH] = 1;
50.493 - pswit[LINE_END_SWITCH] = 0;
50.494 - pswit[OVERVIEW_SWITCH] = 0;
50.495 - pswit[STDOUT_SWITCH] = 0;
50.496 - pswit[HEADER_SWITCH] = 1;
50.497 - pswit[VERBOSE_SWITCH] = 0;
50.498 - pswit[MARKUP_SWITCH] = 0;
50.499 - pswit[USERTYPO_SWITCH] = 0;
50.500 - pswit[DP_SWITCH] = 0;
50.501 - }
50.502 -
50.503 -
50.504 - if (argc < MINARGS || argc > MAXARGS) { /* check number of args */
50.505 - proghelp();
50.506 - return(1); /* exit */
50.507 - }
50.508 -
50.509 -
50.510 - /* read in the user-defined stealth scanno list */
50.511 -
50.512 - if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */
50.513 - if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) { /* not in cwd. try gutcheck directory. */
50.514 - strcpy(usertypo_file, running_from);
50.515 - strcat(usertypo_file, USERTYPO_FILE);
50.516 - if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) { /* we ain't got no user typo file! */
50.517 - printf(" --> I couldn't find gutcheck.typ -- proceeding without user typos.\n");
50.518 - }
50.519 - }
50.520 -
50.521 - usertypo_count = 0;
50.522 - if (usertypofile) { /* we managed to open a User Typo File! */
50.523 - if (pswit[USERTYPO_SWITCH]) {
50.524 - while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) {
50.525 - if (strlen(aline) > 1) {
50.526 - if ((int)*aline > 33) {
50.527 - s = malloc(strlen(aline)+1);
50.528 - if (!s) {
50.529 - fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n");
50.530 - exit(1);
50.531 - }
50.532 - strcpy(s, aline);
50.533 - usertypo[usertypo_count] = s;
50.534 - usertypo_count++;
50.535 - if (usertypo_count >= MAX_USER_TYPOS) {
50.536 - printf(" --> Only %d user-defined typos allowed: ignoring the rest\n");
50.537 - break;
50.538 - }
50.539 - }
50.540 - }
50.541 - }
50.542 - }
50.543 - fclose(usertypofile);
50.544 - }
50.545 - }
50.546 -
50.547 -
50.548 -
50.549 -
50.550 - fprintf(stderr, "gutcheck: Check and report on an e-text\n");
50.551 -
50.552 - cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long =
50.553 - cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend =
50.554 - cnt_spacend = 0;
50.555 -
50.556 - procfile(argv[0]);
50.557 -
50.558 - if (pswit[OVERVIEW_SWITCH]) {
50.559 - printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
50.560 - checked_linecnt, linecnt, linecnt - checked_linecnt);
50.561 - printf(" --------------- Queries found --------------\n");
50.562 - if (cnt_long) printf(" Long lines: %5ld\n",cnt_long);
50.563 - if (cnt_short) printf(" Short lines: %5ld\n",cnt_short);
50.564 - if (cnt_lineend) printf(" Line-end problems: %5ld\n",cnt_lineend);
50.565 - if (cnt_word) printf(" Common typos: %5ld\n",cnt_word);
50.566 - if (cnt_dquot) printf(" Unmatched quotes: %5ld\n",cnt_dquot);
50.567 - if (cnt_squot) printf(" Unmatched SingleQuotes: %5ld\n",cnt_squot);
50.568 - if (cnt_brack) printf(" Unmatched brackets: %5ld\n",cnt_brack);
50.569 - if (cnt_bin) printf(" Non-ASCII characters: %5ld\n",cnt_bin);
50.570 - if (cnt_odd) printf(" Proofing characters: %5ld\n",cnt_odd);
50.571 - if (cnt_punct) printf(" Punctuation & spacing queries: %5ld\n",cnt_punct);
50.572 - if (cnt_dash) printf(" Non-standard dashes: %5ld\n",cnt_dash);
50.573 - if (cnt_html) printf(" Possible HTML tags: %5ld\n",cnt_html);
50.574 - printf("\n");
50.575 - printf(" TOTAL QUERIES %5ld\n",
50.576 - cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long +
50.577 - cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend);
50.578 - }
50.579 -
50.580 - return(0);
50.581 -}
50.582 -
50.583 -
50.584 -
50.585 -/* procfile - process one file */
50.586 -
50.587 -void procfile(char *filename)
50.588 -{
50.589 -
50.590 - char *s, *t, *s1, laststart, *wordstart;
50.591 - char inword[MAXWORDLEN], testword[MAXWORDLEN];
50.592 - char parastart[81]; /* first line of current para */
50.593 - FILE *infile;
50.594 - long quot, squot, firstline, alphalen, totlen, binlen,
50.595 - shortline, longline, verylongline, spacedash, emdash,
50.596 - space_emdash, non_PG_space_emdash, PG_space_emdash,
50.597 - footerline, dotcomma, start_para_line, astline, fslashline,
50.598 - standalone_digit, hyphens, htmcount, endquote_count;
50.599 - long spline, nspline;
50.600 - signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower,
50.601 - eNon_A, eTab, eTilde, eAst, eFSlash, eCarat;
50.602 - signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma,
50.603 - warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote;
50.604 - unsigned int lastlen, lastblen;
50.605 - signed int s_brack, c_brack, r_brack, c_unders;
50.606 - signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar;
50.607 - signed int isnewpara, vowel, consonant;
50.608 - char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80],
50.609 - unders_err[80];
50.610 - signed int qword_index, qperiod_index, isdup;
50.611 - signed int enddash;
50.612 - signed int Dutchcount, isDutch, Frenchcount, isFrench;
50.613 -
50.614 -
50.615 -
50.616 -
50.617 -
50.618 - laststart = CHAR_SPACE;
50.619 - lastlen = lastblen = 0;
50.620 - *dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err =
50.621 - *unders_err = *prevline = 0;
50.622 - linecnt = firstline = alphalen = totlen = binlen =
50.623 - shortline = longline = spacedash = emdash = checked_linecnt =
50.624 - space_emdash = non_PG_space_emdash = PG_space_emdash =
50.625 - footerline = dotcomma = start_para_line = astline = fslashline =
50.626 - standalone_digit = hyphens = htmcount = endquote_count = 0;
50.627 - quot = squot = s_brack = c_brack = r_brack = c_unders = 0;
50.628 - i = llen = isemptyline = isacro = isellipsis = istypo = 0;
50.629 - warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma =
50.630 - warn_ast = warn_fslash = warn_digit = warn_endquote = 0;
50.631 - isnewpara = vowel = consonant = enddash = 0;
50.632 - spline = nspline = 0;
50.633 - qword_index = qperiod_index = isdup = 0;
50.634 - *inword = *testword = 0;
50.635 - open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0;
50.636 - Dutchcount = isDutch = Frenchcount = isFrench = 0;
50.637 -
50.638 -
50.639 - for (j = 0; j < MAX_QWORD; j++) {
50.640 - dupcnt[j] = 0;
50.641 - for (i = 0; i < MAX_QWORD_LENGTH; i++)
50.642 - qword[i][j] = 0;
50.643 - qperiod[i][j] = 0;
50.644 - }
50.645 -
50.646 -
50.647 - if ((infile = fopen(filename, "rb")) == NULL) {
50.648 - if (pswit[STDOUT_SWITCH])
50.649 - fprintf(stdout, "gutcheck: cannot open %s\n", filename);
50.650 - else
50.651 - fprintf(stderr, "gutcheck: cannot open %s\n", filename);
50.652 - exit(1);
50.653 - }
50.654 -
50.655 - fprintf(stdout, "\n\nFile: %s\n\n", filename);
50.656 - firstline = shortline = longline = verylongline = 0;
50.657 -
50.658 -
50.659 - /*****************************************************/
50.660 - /* */
50.661 - /* Run a first pass - verify that it's a valid PG */
50.662 - /* file, decide whether to report some things that */
50.663 - /* occur many times in the text like long or short */
50.664 - /* lines, non-standard dashes, and other good stuff */
50.665 - /* I'll doubtless think of later. */
50.666 - /* */
50.667 - /*****************************************************/
50.668 -
50.669 - /*****************************************************/
50.670 - /* V.24 Sigh. Yet Another Header Change */
50.671 - /*****************************************************/
50.672 -
50.673 - while (fgets(aline, LINEBUFSIZE-1, infile)) {
50.674 - while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0;
50.675 - linecnt++;
50.676 - if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) {
50.677 - if (spline)
50.678 - printf(" --> Duplicate header?\n");
50.679 - spline = linecnt + 1; /* first line of non-header text, that is */
50.680 - }
50.681 - if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) {
50.682 - if (nspline)
50.683 - printf(" --> Duplicate header?\n");
50.684 - nspline = linecnt + 1; /* first line of non-header text, that is */
50.685 - }
50.686 - if (spline || nspline) {
50.687 - lowerit(aline);
50.688 - if (strstr(aline, "end") && strstr(aline, "project gutenberg")) {
50.689 - if (strstr(aline, "end") < strstr(aline, "project gutenberg")) {
50.690 - if (footerline) {
50.691 - if (!nspline) /* it's an old-form header - we can detect duplicates */
50.692 - printf(" --> Duplicate footer?\n");
50.693 - else
50.694 - ;
50.695 - }
50.696 - else {
50.697 - footerline = linecnt;
50.698 - }
50.699 - }
50.700 - }
50.701 - }
50.702 - if (spline) firstline = spline;
50.703 - if (nspline) firstline = nspline; /* override with new */
50.704 -
50.705 - if (footerline) continue; /* 0.99+ don't count the boilerplate in the footer */
50.706 -
50.707 - llen = strlen(aline);
50.708 - totlen += llen;
50.709 - for (i = 0; i < llen; i++) {
50.710 - if ((unsigned char)aline[i] > 127) binlen++;
50.711 - if (gcisalpha(aline[i])) alphalen++;
50.712 - if (i > 0)
50.713 - if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1]))
50.714 - endquote_count++;
50.715 - }
50.716 - if (strlen(aline) > 2
50.717 - && lastlen > 2 && lastlen < SHORTEST_PG_LINE
50.718 - && lastblen > 2 && lastblen > SHORTEST_PG_LINE
50.719 - && laststart != CHAR_SPACE)
50.720 - shortline++;
50.721 -
50.722 - if (*aline) /* fixed line below for 0.96 */
50.723 - if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++;
50.724 -
50.725 - if (strstr(aline, ".,")) dotcomma++;
50.726 - /* 0.98 only count ast lines for ignoring purposes where there is */
50.727 - /* locase text on the line */
50.728 - if (strstr(aline, "*")) {
50.729 - for (s = aline; *s; s++)
50.730 - if (*s >='a' && *s <= 'z')
50.731 - break;
50.732 - if (*s) astline++;
50.733 - }
50.734 - if (strstr(aline, "/"))
50.735 - fslashline++;
50.736 - for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
50.737 - if (aline[i] == '-' && aline[i-1] != '-') hyphens++;
50.738 -
50.739 - if (llen > LONGEST_PG_LINE) longline++;
50.740 - if (llen > WAY_TOO_LONG) verylongline++;
50.741 -
50.742 - if (strstr(aline, "<") && strstr(aline, ">")) {
50.743 - i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
50.744 - if (i > 0)
50.745 - htmcount++;
50.746 - if (strstr(aline, "<i>")) htmcount +=4; /* bonus marks! */
50.747 - }
50.748 -
50.749 - /* Check for spaced em-dashes */
50.750 - if (strstr(aline,"--")) {
50.751 - emdash++;
50.752 - if (*(strstr(aline, "--")-1) == CHAR_SPACE ||
50.753 - (*(strstr(aline, "--")+2) == CHAR_SPACE))
50.754 - space_emdash++;
50.755 - if (*(strstr(aline, "--")-1) == CHAR_SPACE &&
50.756 - (*(strstr(aline, "--")+2) == CHAR_SPACE))
50.757 - non_PG_space_emdash++; /* count of em-dashes with spaces both sides */
50.758 - if (*(strstr(aline, "--")-1) != CHAR_SPACE &&
50.759 - (*(strstr(aline, "--")+2) != CHAR_SPACE))
50.760 - PG_space_emdash++; /* count of PG-type em-dashes with no spaces */
50.761 - }
50.762 -
50.763 - for (s = aline; *s;) {
50.764 - s = getaword(s, inword);
50.765 - if (!strcmp(inword, "hij") || !strcmp(inword, "niet"))
50.766 - Dutchcount++;
50.767 - if (!strcmp(inword, "dans") || !strcmp(inword, "avec"))
50.768 - Frenchcount++;
50.769 - if (!strcmp(inword, "0") || !strcmp(inword, "1"))
50.770 - standalone_digit++;
50.771 - }
50.772 -
50.773 - /* Check for spaced dashes */
50.774 - if (strstr(aline," -"))
50.775 - if (*(strstr(aline, " -")+2) != '-')
50.776 - spacedash++;
50.777 - lastblen = lastlen;
50.778 - lastlen = strlen(aline);
50.779 - laststart = aline[0];
50.780 -
50.781 - }
50.782 - fclose(infile);
50.783 -
50.784 -
50.785 - /* now, based on this quick view, make some snap decisions */
50.786 - if (cnt_spacend > 0) {
50.787 - printf(" --> %ld lines in this file have white space at end\n", cnt_spacend);
50.788 - }
50.789 -
50.790 - warn_dotcomma = 1;
50.791 - if (dotcomma > 5) {
50.792 - warn_dotcomma = 0;
50.793 - printf(" --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma);
50.794 - }
50.795 -
50.796 - /* if more than 50 lines, or one-tenth, are short, don't bother reporting them */
50.797 - warn_short = 1;
50.798 - if (shortline > 50 || shortline * 10 > linecnt) {
50.799 - warn_short = 0;
50.800 - printf(" --> %ld lines in this file are short. Not reporting short lines.\n", shortline);
50.801 - }
50.802 -
50.803 - /* if more than 50 lines, or one-tenth, are long, don't bother reporting them */
50.804 - warn_long = 1;
50.805 - if (longline > 50 || longline * 10 > linecnt) {
50.806 - warn_long = 0;
50.807 - printf(" --> %ld lines in this file are long. Not reporting long lines.\n", longline);
50.808 - }
50.809 -
50.810 - /* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */
50.811 - warn_ast = 1;
50.812 - if (astline > 10 ) {
50.813 - warn_ast = 0;
50.814 - printf(" --> %ld lines in this file contain asterisks. Not reporting them.\n", astline);
50.815 - }
50.816 -
50.817 - /* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */
50.818 - warn_fslash = 1;
50.819 - if (fslashline > 10 ) {
50.820 - warn_fslash = 0;
50.821 - printf(" --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline);
50.822 - }
50.823 -
50.824 - /* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */
50.825 - warn_endquote = 1;
50.826 - if (endquote_count > 20 ) {
50.827 - warn_endquote = 0;
50.828 - printf(" --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count);
50.829 - }
50.830 -
50.831 - /* if more than 15 lines contain standalone digits, don't bother reporting them V.0.97 */
50.832 - warn_digit = 1;
50.833 - if (standalone_digit > 10 ) {
50.834 - warn_digit = 0;
50.835 - printf(" --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit);
50.836 - }
50.837 -
50.838 - /* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */
50.839 - warn_hyphen = 1;
50.840 - if (hyphens > 20 ) {
50.841 - warn_hyphen = 0;
50.842 - printf(" --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens);
50.843 - }
50.844 -
50.845 - if (htmcount > 20 && !pswit[MARKUP_SWITCH]) {
50.846 - printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
50.847 - pswit[MARKUP_SWITCH] = 1;
50.848 - }
50.849 -
50.850 - if (verylongline > 0) {
50.851 - printf(" --> %ld lines in this file are VERY long!\n", verylongline);
50.852 - }
50.853 -
50.854 - /* If there are more non-PG spaced dashes than PG em-dashes, */
50.855 - /* assume it's deliberate */
50.856 - /* Current PG guidelines say don't use them, but older texts do,*/
50.857 - /* and some people insist on them whatever the guidelines say. */
50.858 - /* V.20 removed requirement that PG_space_emdash be greater than*/
50.859 - /* ten before turning off warnings about spaced dashes. */
50.860 - warn_dash = 1;
50.861 - if (spacedash + non_PG_space_emdash > PG_space_emdash) {
50.862 - warn_dash = 0;
50.863 - printf(" --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash);
50.864 - }
50.865 -
50.866 - /* if more than a quarter of characters are hi-bit, bug out */
50.867 - warn_bin = 1;
50.868 - if (binlen * 4 > totlen) {
50.869 - printf(" --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n");
50.870 - exit(1);
50.871 - }
50.872 - if (alphalen * 4 < totlen) {
50.873 - printf(" --> This file does not appear to be text. Terminating. Best of luck with it!\n");
50.874 - exit(1);
50.875 - }
50.876 - if ((binlen * 100 > totlen) || (binlen > 100)) {
50.877 - printf(" --> There are a lot of foreign letters here. Not reporting them.\n");
50.878 - warn_bin = 0;
50.879 - }
50.880 -
50.881 - /* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */
50.882 - isDutch = 0;
50.883 - if (Dutchcount > 50) {
50.884 - isDutch = 1;
50.885 - printf(" --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n");
50.886 - }
50.887 -
50.888 - isFrench = 0;
50.889 - if (Frenchcount > 50) {
50.890 - isFrench = 1;
50.891 - printf(" --> This looks like French - switching off some doublepunct.\n");
50.892 - }
50.893 -
50.894 - if (firstline && footerline)
50.895 - printf(" The PG header and footer appear to be already on.\n");
50.896 - else {
50.897 - if (firstline)
50.898 - printf(" The PG header is on - no footer.\n");
50.899 - if (footerline)
50.900 - printf(" The PG footer is on - no header.\n");
50.901 - }
50.902 - printf("\n");
50.903 -
50.904 - /* V.22 George Davis asked for an override switch to force it to list everything */
50.905 - if (pswit[VERBOSE_SWITCH]) {
50.906 - warn_bin = 1;
50.907 - warn_short = 1;
50.908 - warn_dotcomma = 1;
50.909 - warn_long = 1;
50.910 - warn_dash = 1;
50.911 - warn_digit = 1;
50.912 - warn_ast = 1;
50.913 - warn_fslash = 1;
50.914 - warn_hyphen = 1;
50.915 - warn_endquote = 1;
50.916 - printf(" *** Verbose output is ON -- you asked for it! ***\n");
50.917 - }
50.918 -
50.919 - if (isDutch)
50.920 - warn_dash = 0; /* Frank suggested turning it REALLY off for Dutch */
50.921 -
50.922 - if ((infile = fopen(filename, "rb")) == NULL) {
50.923 - if (pswit[STDOUT_SWITCH])
50.924 - fprintf(stdout, "gutcheck: cannot open %s\n", filename);
50.925 - else
50.926 - fprintf(stderr, "gutcheck: cannot open %s\n", filename);
50.927 - exit(1);
50.928 - }
50.929 -
50.930 - if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */
50.931 - printf(" --> I don't really know where this text starts. \n");
50.932 - printf(" There are no reference points.\n");
50.933 - printf(" I'm going to have to report the header and footer as well.\n");
50.934 - firstline=0;
50.935 - }
50.936 -
50.937 -
50.938 -
50.939 - /*****************************************************/
50.940 - /* */
50.941 - /* Here we go with the main pass. Hold onto yer hat! */
50.942 - /* */
50.943 - /*****************************************************/
50.944 -
50.945 - /* Re-init some variables we've dirtied */
50.946 - quot = squot = linecnt = 0;
50.947 - laststart = CHAR_SPACE;
50.948 - lastlen = lastblen = 0;
50.949 -
50.950 - while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) {
50.951 - linecnt++;
50.952 - if (linecnt == 1) isnewpara = 1;
50.953 - if (pswit[DP_SWITCH])
50.954 - if (!strncmp(aline, "-----File: ", 11))
50.955 - continue; // skip DP page separators completely
50.956 - if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) {
50.957 - if (pswit[HEADER_SWITCH]) {
50.958 - if (!strncmp(aline, "Title:", 6))
50.959 - printf(" %s\n", aline);
50.960 - if (!strncmp (aline, "Author:", 7))
50.961 - printf(" %s\n", aline);
50.962 - if (!strncmp(aline, "Release Date:", 13))
50.963 - printf(" %s\n", aline);
50.964 - if (!strncmp(aline, "Edition:", 8))
50.965 - printf(" %s\n\n", aline);
50.966 - }
50.967 - continue; /* skip through the header */
50.968 - }
50.969 - checked_linecnt++;
50.970 - s = aline;
50.971 - isemptyline = 1; /* assume the line is empty until proven otherwise */
50.972 -
50.973 - /* If we are in a state of unbalanced quotes, and this line */
50.974 - /* doesn't begin with a quote, output the stored error message */
50.975 - /* If the -P switch was used, print the warning even if the */
50.976 - /* new para starts with quotes */
50.977 - /* Version .20 - if the new paragraph does start with a quote, */
50.978 - /* but is indented, I was giving a spurious error. Need to */
50.979 - /* check the first _non-space_ character on the line rather */
50.980 - /* than the first character when deciding whether the para */
50.981 - /* starts with a quote. Using *t for this. */
50.982 - t = s;
50.983 - while (*t == ' ') t++;
50.984 - if (*dquote_err)
50.985 - if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) {
50.986 - if (!pswit[OVERVIEW_SWITCH]) {
50.987 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
50.988 - printf(dquote_err);
50.989 - }
50.990 - else
50.991 - cnt_dquot++;
50.992 - }
50.993 - if (*squote_err) {
50.994 - if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) {
50.995 - if (!pswit[OVERVIEW_SWITCH]) {
50.996 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
50.997 - printf(squote_err);
50.998 - }
50.999 - else
50.1000 - cnt_squot++;
50.1001 - }
50.1002 - squot = 0;
50.1003 - }
50.1004 - if (*rbrack_err) {
50.1005 - if (!pswit[OVERVIEW_SWITCH]) {
50.1006 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
50.1007 - printf(rbrack_err);
50.1008 - }
50.1009 - else
50.1010 - cnt_brack++;
50.1011 - }
50.1012 - if (*sbrack_err) {
50.1013 - if (!pswit[OVERVIEW_SWITCH]) {
50.1014 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
50.1015 - printf(sbrack_err);
50.1016 - }
50.1017 - else
50.1018 - cnt_brack++;
50.1019 - }
50.1020 - if (*cbrack_err) {
50.1021 - if (!pswit[OVERVIEW_SWITCH]) {
50.1022 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
50.1023 - printf(cbrack_err);
50.1024 - }
50.1025 - else
50.1026 - cnt_brack++;
50.1027 - }
50.1028 - if (*unders_err) {
50.1029 - if (!pswit[OVERVIEW_SWITCH]) {
50.1030 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
50.1031 - printf(unders_err);
50.1032 - }
50.1033 - else
50.1034 - cnt_brack++;
50.1035 - }
50.1036 -
50.1037 - *dquote_err = *squote_err = *rbrack_err = *cbrack_err =
50.1038 - *sbrack_err = *unders_err = 0;
50.1039 -
50.1040 -
50.1041 - /* look along the line, accumulate the count of quotes, and see */
50.1042 - /* if this is an empty line - i.e. a line with nothing on it */
50.1043 - /* but spaces. */
50.1044 - /* V .12 also if line has just spaces, * and/or - on it, don't */
50.1045 - /* count it, since empty lines with asterisks or dashes to */
50.1046 - /* separate sections are common. */
50.1047 - /* V .15 new single-quote checking - has to be better than the */
50.1048 - /* previous version, but how much better? fingers crossed! */
50.1049 - /* V .20 add period to * and - as characters on a separator line*/
50.1050 - s = aline;
50.1051 - while (*s) {
50.1052 - if (*s == CHAR_DQUOTE) quot++;
50.1053 - if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
50.1054 - if (s == aline) { /* at start of line, it can only be an openquote */
50.1055 - if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */
50.1056 - open_single_quote++;
50.1057 - }
50.1058 - else
50.1059 - if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
50.1060 - ; /* do nothing! - it's definitely an apostrophe, not a quote */
50.1061 - else /* it's outside a word - let's check it out */
50.1062 - if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */
50.1063 - if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */
50.1064 - open_single_quote++;
50.1065 - }
50.1066 - else { /* now - is it a closequote? */
50.1067 - guessquote = 0; /* accumulate clues */
50.1068 - if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */
50.1069 - guessquote += 1;
50.1070 - if (*(s-1) == 's') { /* looks like a plural apostrophe */
50.1071 - guessquote -= 3;
50.1072 - if (*(s+1) == CHAR_SPACE) /* bonus marks! */
50.1073 - guessquote -= 2;
50.1074 - }
50.1075 - }
50.1076 - else /* it doesn't have a letter either side */
50.1077 - if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1))))
50.1078 - guessquote += 8; /* looks like a closequote */
50.1079 - else
50.1080 - guessquote += 1;
50.1081 - if (open_single_quote > close_single_quote)
50.1082 - guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */
50.1083 - else
50.1084 - guessquote -= 1;
50.1085 - if (guessquote >= 0)
50.1086 - close_single_quote++;
50.1087 - }
50.1088 -
50.1089 - if (*s != CHAR_SPACE
50.1090 - && *s != '-'
50.1091 - && *s != '.'
50.1092 - && *s != CHAR_ASTERISK
50.1093 - && *s != 13
50.1094 - && *s != 10) isemptyline = 0; /* ignore lines like * * * as spacers */
50.1095 - if (*s == CHAR_UNDERSCORE) c_unders++;
50.1096 - if (*s == CHAR_OPEN_CBRACK) c_brack++;
50.1097 - if (*s == CHAR_CLOSE_CBRACK) c_brack--;
50.1098 - if (*s == CHAR_OPEN_RBRACK) r_brack++;
50.1099 - if (*s == CHAR_CLOSE_RBRACK) r_brack--;
50.1100 - if (*s == CHAR_OPEN_SBRACK) s_brack++;
50.1101 - if (*s == CHAR_CLOSE_SBRACK) s_brack--;
50.1102 - s++;
50.1103 - }
50.1104 -
50.1105 - if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph */
50.1106 - start_para_line = linecnt;
50.1107 - strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */
50.1108 - parastart[79] = 0;
50.1109 - dquotepar = squotepar = 0; /* restart the quote count 0.98 */
50.1110 - s = aline;
50.1111 - while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++; /* V.97 fixed bug - overran line and gave false warning - rare */
50.1112 - if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */
50.1113 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1114 - if (!pswit[OVERVIEW_SWITCH])
50.1115 - printf(" Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1);
50.1116 - else
50.1117 - cnt_punct++;
50.1118 - }
50.1119 - isnewpara = 0; /* Signal the end of new para processing */
50.1120 - }
50.1121 -
50.1122 - /* Check for an em-dash broken at line end */
50.1123 - if (enddash && *aline == '-') {
50.1124 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1125 - if (!pswit[OVERVIEW_SWITCH])
50.1126 - printf(" Line %ld column 1 - Broken em-dash?\n", linecnt);
50.1127 - else
50.1128 - cnt_punct++;
50.1129 - }
50.1130 - enddash = 0;
50.1131 - for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--);
50.1132 - if (s >= aline && *s == '-')
50.1133 - enddash = 1;
50.1134 -
50.1135 -
50.1136 - /* Check for invalid or questionable characters in the line */
50.1137 - /* Anything above 127 is invalid for plain ASCII, and */
50.1138 - /* non-printable control characters should also be flagged. */
50.1139 - /* Tabs should generally not be there. */
50.1140 - /* Jan 06, in 0.99: Hm. For some strange reason, I either */
50.1141 - /* never created or deleted the check for unprintable */
50.1142 - /* control characters. They should be reported even if */
50.1143 - /* warn_bin is on, I think, and in full. */
50.1144 -
50.1145 - for (s = aline; *s; s++) {
50.1146 - i = (unsigned char) *s;
50.1147 - if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) {
50.1148 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1149 - if (!pswit[OVERVIEW_SWITCH])
50.1150 - printf(" Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i);
50.1151 - else
50.1152 - cnt_bin++;
50.1153 - }
50.1154 - }
50.1155 -
50.1156 - if (warn_bin) {
50.1157 - eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0; /* don't repeat multiple warnings on one line */
50.1158 - for (s = aline; *s; s++) {
50.1159 - if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) {
50.1160 - i = *s; /* annoying kludge for signed chars */
50.1161 - if (i < 0) i += 256;
50.1162 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1163 - if (!pswit[OVERVIEW_SWITCH])
50.1164 - if (i > 127 && i < 160)
50.1165 - printf(" Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i);
50.1166 - else
50.1167 - printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i);
50.1168 - else
50.1169 - cnt_bin++;
50.1170 - eNon_A = 1;
50.1171 - }
50.1172 - if (!eTab && *s == CHAR_TAB) {
50.1173 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1174 - if (!pswit[OVERVIEW_SWITCH])
50.1175 - printf(" Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1);
50.1176 - else
50.1177 - cnt_odd++;
50.1178 - eTab = 1;
50.1179 - }
50.1180 - if (!eTilde && *s == CHAR_TILDE) { /* often used by OCR software to indicate an unrecognizable character */
50.1181 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1182 - if (!pswit[OVERVIEW_SWITCH])
50.1183 - printf(" Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1);
50.1184 - else
50.1185 - cnt_odd++;
50.1186 - eTilde = 1;
50.1187 - }
50.1188 - if (!eCarat && *s == CHAR_CARAT) {
50.1189 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1190 - if (!pswit[OVERVIEW_SWITCH])
50.1191 - printf(" Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1);
50.1192 - else
50.1193 - cnt_odd++;
50.1194 - eCarat = 1;
50.1195 - }
50.1196 - if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) {
50.1197 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1198 - if (!pswit[OVERVIEW_SWITCH])
50.1199 - printf(" Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1);
50.1200 - else
50.1201 - cnt_odd++;
50.1202 - eFSlash = 1;
50.1203 - }
50.1204 - /* report asterisks only in paranoid mode, since they're often deliberate */
50.1205 - if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) {
50.1206 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1207 - if (!pswit[OVERVIEW_SWITCH])
50.1208 - printf(" Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1);
50.1209 - else
50.1210 - cnt_odd++;
50.1211 - eAst = 1;
50.1212 - }
50.1213 - }
50.1214 - }
50.1215 -
50.1216 - /* Check for line too long */
50.1217 - if (warn_long) {
50.1218 - if (strlen(aline) > LONGEST_PG_LINE) {
50.1219 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1220 - if (!pswit[OVERVIEW_SWITCH])
50.1221 - printf(" Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline));
50.1222 - else
50.1223 - cnt_long++;
50.1224 - }
50.1225 - }
50.1226 -
50.1227 - /* Check for line too short. */
50.1228 - /* This one is a bit trickier to implement: we don't want to */
50.1229 - /* flag the last line of a paragraph for being short, so we */
50.1230 - /* have to wait until we know that our current line is a */
50.1231 - /* "normal" line, then report the _previous_ line if it was too */
50.1232 - /* short. We also don't want to report indented lines like */
50.1233 - /* chapter heads or formatted quotations. We therefore keep */
50.1234 - /* lastlen as the length of the last line examined, and */
50.1235 - /* lastblen as the length of the last but one, and try to */
50.1236 - /* suppress unnecessary warnings by checking that both were of */
50.1237 - /* "normal" length. We keep the first character of the last */
50.1238 - /* line in laststart, and if it was a space, we assume that the */
50.1239 - /* formatting is deliberate. I can't figure out a way to */
50.1240 - /* distinguish something like a quoted verse left-aligned or */
50.1241 - /* the header or footer of a letter from a paragraph of short */
50.1242 - /* lines - maybe if I examined the whole paragraph, and if the */
50.1243 - /* para has less than, say, 8 lines and if all lines are short, */
50.1244 - /* then just assume it's OK? Need to look at some texts to see */
50.1245 - /* how often a formula like this would get the right result. */
50.1246 - /* V0.99 changed the tolerance for length to ignore from 2 to 1 */
50.1247 - if (warn_short) {
50.1248 - if (strlen(aline) > 1
50.1249 - && lastlen > 1 && lastlen < SHORTEST_PG_LINE
50.1250 - && lastblen > 1 && lastblen > SHORTEST_PG_LINE
50.1251 - && laststart != CHAR_SPACE) {
50.1252 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
50.1253 - if (!pswit[OVERVIEW_SWITCH])
50.1254 - printf(" Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline));
50.1255 - else
50.1256 - cnt_short++;
50.1257 - }
50.1258 - }
50.1259 - lastblen = lastlen;
50.1260 - lastlen = strlen(aline);
50.1261 - laststart = aline[0];
50.1262 -
50.1263 - /* look for punctuation at start of line */
50.1264 - if (*aline && strchr(".?!,;:", aline[0])) { /* if it's punctuation */
50.1265 - if (strncmp(". . .", aline, 5)) { /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */
50.1266 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1267 - if (!pswit[OVERVIEW_SWITCH])
50.1268 - printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt);
50.1269 - else
50.1270 - cnt_punct++;
50.1271 - }
50.1272 - }
50.1273 -
50.1274 - /* Check for spaced em-dashes */
50.1275 - /* V.20 must check _all_ occurrences of "--" on the line */
50.1276 - /* hence the loop - even if the first double-dash is OK */
50.1277 - /* there may be another that's wrong later on. */
50.1278 - if (warn_dash) {
50.1279 - s = aline;
50.1280 - while (strstr(s,"--")) {
50.1281 - if (*(strstr(s, "--")-1) == CHAR_SPACE ||
50.1282 - (*(strstr(s, "--")+2) == CHAR_SPACE)) {
50.1283 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1284 - if (!pswit[OVERVIEW_SWITCH])
50.1285 - printf(" Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1);
50.1286 - else
50.1287 - cnt_dash++;
50.1288 - }
50.1289 - s = strstr(s,"--") + 2;
50.1290 - }
50.1291 - }
50.1292 -
50.1293 - /* Check for spaced dashes */
50.1294 - if (warn_dash)
50.1295 - if (strstr(aline," -")) {
50.1296 - if (*(strstr(aline, " -")+2) != '-') {
50.1297 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1298 - if (!pswit[OVERVIEW_SWITCH])
50.1299 - printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1);
50.1300 - else
50.1301 - cnt_dash++;
50.1302 - }
50.1303 - }
50.1304 - else
50.1305 - if (strstr(aline,"- ")) {
50.1306 - if (*(strstr(aline, "- ")-1) != '-') {
50.1307 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1308 - if (!pswit[OVERVIEW_SWITCH])
50.1309 - printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1);
50.1310 - else
50.1311 - cnt_dash++;
50.1312 - }
50.1313 - }
50.1314 -
50.1315 - /* v 0.99 */
50.1316 - /* Check for unmarked paragraphs indicated by separate speakers */
50.1317 - /* May well be false positive: */
50.1318 - /* "Bravo!" "Wonderful!" called the crowd. */
50.1319 - /* but useful all the same. */
50.1320 - s = wrk;
50.1321 - *s = 0;
50.1322 - if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
50.1323 - if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
50.1324 - if (*s) {
50.1325 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1326 - if (!pswit[OVERVIEW_SWITCH])
50.1327 - printf(" Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1);
50.1328 - else
50.1329 - cnt_punct++;
50.1330 - }
50.1331 -
50.1332 -
50.1333 -
50.1334 - /* Check for "to he" and other easy he/be errors */
50.1335 - /* This is a very inadequate effort on the he/be problem, */
50.1336 - /* but the phrase "to he" is always an error, whereas "to */
50.1337 - /* be" is quite common. I chuckle when it does catch one! */
50.1338 - /* Similarly, '"Quiet!", be said.' is a non-be error */
50.1339 - /* V .18 - "to he" is _not_ always an error!: */
50.1340 - /* "Where they went to he couldn't say." */
50.1341 - /* but I'm leaving it in anyway. */
50.1342 - /* V .20 Another false positive: */
50.1343 - /* What would "Cinderella" be without the . . . */
50.1344 - /* and another "If he wants to he can see for himself." */
50.1345 - /* V .21 Added " is be " and " be is " and " be was " */
50.1346 - /* V .99 Added jeebies code -- removed again. */
50.1347 - /* Is jeebies code worth adding? Rare to see he/be */
50.1348 - /* errors with modern OCR. Separate program? Yes! */
50.1349 - /* jeebies does the job without cluttering up this. */
50.1350 - /* We do get a few more queryable pairs from the */
50.1351 - /* project though -- they're cheap to implement. */
50.1352 - /* Also added a column number for guiguts. */
50.1353 -
50.1354 - s = wrk;
50.1355 - *s = 0;
50.1356 - if (strstr(aline," to he ")) s = strstr(aline," to he ");
50.1357 - if (strstr(aline,"\" be ")) s = strstr(aline,"\" be ");
50.1358 - if (strstr(aline,"\", be ")) s = strstr(aline,"\", be ");
50.1359 - if (strstr(aline," is be ")) s = strstr(aline," is be ");
50.1360 - if (strstr(aline," be is ")) s = strstr(aline," be is ");
50.1361 - if (strstr(aline," was be ")) s = strstr(aline," was be ");
50.1362 - if (strstr(aline," be would ")) s = strstr(aline," be would ");
50.1363 - if (strstr(aline," be could ")) s = strstr(aline," be could ");
50.1364 - if (*s) {
50.1365 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1366 - if (!pswit[OVERVIEW_SWITCH])
50.1367 - printf(" Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1);
50.1368 - else
50.1369 - cnt_word++;
50.1370 - }
50.1371 -
50.1372 - s = wrk;
50.1373 - *s = 0;
50.1374 - if (strstr(aline," i bad ")) s = strstr(aline," i bad ");
50.1375 - if (strstr(aline," you bad ")) s = strstr(aline," you bad ");
50.1376 - if (strstr(aline," he bad ")) s = strstr(aline," he bad ");
50.1377 - if (strstr(aline," she bad ")) s = strstr(aline," she bad ");
50.1378 - if (strstr(aline," they bad ")) s = strstr(aline," they bad ");
50.1379 - if (strstr(aline," a had ")) s = strstr(aline," a had ");
50.1380 - if (strstr(aline," the had ")) s = strstr(aline," the had ");
50.1381 - if (*s) {
50.1382 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1383 - if (!pswit[OVERVIEW_SWITCH])
50.1384 - printf(" Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1);
50.1385 - else
50.1386 - cnt_word++;
50.1387 - }
50.1388 -
50.1389 -
50.1390 - /* V .97 Added ", hut " Not too common, hut pretty certain */
50.1391 - /* V.99 changed to add a column number for guiguts */
50.1392 - s = wrk;
50.1393 - *s = 0;
50.1394 - if (strstr(aline,", hut ")) s = strstr(aline,", hut ");
50.1395 - if (strstr(aline,"; hut ")) s = strstr(aline,"; hut ");
50.1396 - if (*s) {
50.1397 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1398 - if (!pswit[OVERVIEW_SWITCH])
50.1399 - printf(" Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1);
50.1400 - else
50.1401 - cnt_word++;
50.1402 - }
50.1403 -
50.1404 - /* Special case - angled bracket in front of "From" placed there by an MTA */
50.1405 - /* when sending an e-mail. V .21 */
50.1406 - if (strstr(aline, ">From")) {
50.1407 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1408 - if (!pswit[OVERVIEW_SWITCH])
50.1409 - printf(" Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1);
50.1410 - else
50.1411 - cnt_punct++;
50.1412 - }
50.1413 -
50.1414 - /* V 0.98 Check for a single character line - often an overflow from bad wrapping. */
50.1415 - if (*aline && !*(aline+1)) {
50.1416 - if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline))
50.1417 - ; /* nothing - ignore numerals alone on a line. */
50.1418 - else {
50.1419 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1420 - if (!pswit[OVERVIEW_SWITCH])
50.1421 - printf(" Line %ld column 1 - Query single character line\n", linecnt);
50.1422 - else
50.1423 - cnt_punct++;
50.1424 - }
50.1425 - }
50.1426 -
50.1427 - /* V 0.98 Check for I" - often should be ! */
50.1428 - if (strstr(aline, " I\"")) {
50.1429 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1430 - if (!pswit[OVERVIEW_SWITCH])
50.1431 - printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline);
50.1432 - else
50.1433 - cnt_punct++;
50.1434 - }
50.1435 -
50.1436 - /* V 0.98 Check for period without a capital letter. Cut-down from gutspell */
50.1437 - /* Only works when it happens on a single line. */
50.1438 -
50.1439 - if (pswit[PARANOID_SWITCH])
50.1440 - for (t = s = aline; strstr(t,". ");) {
50.1441 - t = strstr(t, ". ");
50.1442 - if (t == s) {
50.1443 - t++;
50.1444 - continue; /* start of line punctuation is handled elsewhere */
50.1445 - }
50.1446 - if (!gcisalpha(*(t-1))) {
50.1447 - t++;
50.1448 - continue;
50.1449 - }
50.1450 - if (isDutch) { /* For Frank & Jeroen -- 's Middags case */
50.1451 - if (*(t+2) == CHAR_SQUOTE &&
50.1452 - *(t+3)>='a' && *(t+3)<='z' &&
50.1453 - *(t+4) == CHAR_SPACE &&
50.1454 - *(t+5)>='A' && *(t+5)<='Z') {
50.1455 - t++;
50.1456 - continue;
50.1457 - }
50.1458 - }
50.1459 - s1 = t+2;
50.1460 - while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
50.1461 - s1++;
50.1462 - if (*s1 >= 'a' && *s1 <= 'z') { /* we have something to investigate */
50.1463 - istypo = 1;
50.1464 - for (s1 = t - 1; s1 >= s &&
50.1465 - (gcisalpha(*s1) || gcisdigit(*s1) ||
50.1466 - (*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */
50.1467 - s1++;
50.1468 - for (i = 0; *s1 && *s1 != '.'; s1++, i++)
50.1469 - testword[i] = *s1;
50.1470 - testword[i] = 0;
50.1471 - for (i = 0; *abbrev[i]; i++)
50.1472 - if (!strcmp(testword, abbrev[i]))
50.1473 - istypo = 0;
50.1474 -// if (*testword >= 'A' && *testword <= 'Z')
50.1475 -// istypo = 0;
50.1476 - if (gcisdigit(*testword)) istypo = 0;
50.1477 - if (!*(testword+1)) istypo = 0;
50.1478 - if (isroman(testword)) istypo = 0;
50.1479 - if (istypo) {
50.1480 - istypo = 0;
50.1481 - for (i = 0; testword[i]; i++)
50.1482 - if (strchr(vowels, testword[i]))
50.1483 - istypo = 1;
50.1484 - }
50.1485 - if (istypo) {
50.1486 - isdup = 0;
50.1487 - if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
50.1488 - for (i = 0; i < qperiod_index; i++)
50.1489 - if (!strcmp(testword, qperiod[i])) {
50.1490 - isdup = 1;
50.1491 - }
50.1492 - if (!isdup) {
50.1493 - if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
50.1494 - strcpy(qperiod[qperiod_index], testword);
50.1495 - qperiod_index++;
50.1496 - }
50.1497 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1498 - if (!pswit[OVERVIEW_SWITCH])
50.1499 - printf(" Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1);
50.1500 - else
50.1501 - cnt_punct++;
50.1502 - }
50.1503 - }
50.1504 - }
50.1505 - t++;
50.1506 - }
50.1507 -
50.1508 -
50.1509 - if (pswit[TYPO_SWITCH]) { /* Should have put this condition in at the start of 0.99. Duh! */
50.1510 - /* Check for words usually not followed by punctuation 0.99 */
50.1511 - for (s = aline; *s;) {
50.1512 - wordstart = s;
50.1513 - s = getaword(s, inword);
50.1514 - if (!*inword) continue;
50.1515 - lowerit(inword);
50.1516 - for (i = 0; *nocomma[i]; i++)
50.1517 - if (!strcmp(inword, nocomma[i])) {
50.1518 - if (*s == ',' || *s == ';' || *s == ':') {
50.1519 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1520 - if (!pswit[OVERVIEW_SWITCH])
50.1521 - printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
50.1522 - else
50.1523 - cnt_punct++;
50.1524 - }
50.1525 - }
50.1526 - for (i = 0; *noperiod[i]; i++)
50.1527 - if (!strcmp(inword, noperiod[i])) {
50.1528 - if (*s == '.' || *s == '!') {
50.1529 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1530 - if (!pswit[OVERVIEW_SWITCH])
50.1531 - printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
50.1532 - else
50.1533 - cnt_punct++;
50.1534 - }
50.1535 - }
50.1536 - }
50.1537 - }
50.1538 -
50.1539 -
50.1540 -
50.1541 - /* Check for commonly mistyped words, and digits like 0 for O in a word */
50.1542 - for (s = aline; *s;) {
50.1543 - wordstart = s;
50.1544 - s = getaword(s, inword);
50.1545 - if (!*inword) continue; /* don't bother with empty lines */
50.1546 - if (mixdigit(inword)) {
50.1547 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1548 - if (!pswit[OVERVIEW_SWITCH])
50.1549 - printf(" Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword);
50.1550 - else
50.1551 - cnt_word++;
50.1552 - }
50.1553 -
50.1554 - /* put the word through a series of tests for likely typos and OCR errors */
50.1555 - /* V.21 I had allowed lots of typo-checking even with the typo switch */
50.1556 - /* turned off, but I really should disallow reporting of them when */
50.1557 - /* the switch is off. Hence the "if" below. */
50.1558 - if (pswit[TYPO_SWITCH]) {
50.1559 - istypo = 0;
50.1560 - strcpy(testword, inword);
50.1561 - alower = 0;
50.1562 - for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */
50.1563 - if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1;
50.1564 - if (alower && testword[i] >= 'A' && testword[i] <= 'Z') {
50.1565 - /* we have an uppercase mid-word. However, there are common cases: */
50.1566 - /* Mac and Mc like McGill */
50.1567 - /* French contractions like l'Abbe */
50.1568 - if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') ||
50.1569 - (i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') ||
50.1570 - (i > 0 && testword[i-1] == CHAR_SQUOTE))
50.1571 - ; /* do nothing! */
50.1572 -
50.1573 - else { /* V.97 - remove separate case of uppercase within word so that */
50.1574 - /* names like VanAllen fall into qword_index and get reported only once */
50.1575 - istypo = 1;
50.1576 - }
50.1577 - }
50.1578 - testword[i] = (char)tolower(testword[i]);
50.1579 - }
50.1580 -
50.1581 - /* check for certain unlikely two-letter combinations at word start and end */
50.1582 - /* V.0.97 - this replaces individual hardcoded checks in previous versions */
50.1583 - if (strlen(testword) > 1) {
50.1584 - for (i = 0; *nostart[i]; i++)
50.1585 - if (!strncmp(testword, nostart[i], 2))
50.1586 - istypo = 1;
50.1587 - for (i = 0; *noend[i]; i++)
50.1588 - if (!strncmp(testword + strlen(testword) -2, noend[i], 2))
50.1589 - istypo = 1;
50.1590 - }
50.1591 -
50.1592 -
50.1593 - /* ght is common, gbt never. Like that. */
50.1594 - if (strstr(testword, "cb")) istypo = 1;
50.1595 - if (strstr(testword, "gbt")) istypo = 1;
50.1596 - if (strstr(testword, "pbt")) istypo = 1;
50.1597 - if (strstr(testword, "tbs")) istypo = 1;
50.1598 - if (strstr(testword, "mrn")) istypo = 1;
50.1599 - if (strstr(testword, "ahle")) istypo = 1;
50.1600 - if (strstr(testword, "ihle")) istypo = 1;
50.1601 -
50.1602 - /* "TBE" does happen - like HEARTBEAT - but uncommon. */
50.1603 - /* Also "TBI" - frostbite, outbid - but uncommon. */
50.1604 - /* Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals, */
50.1605 - /* but these are covered in V.20. "ii" is a common scanno. */
50.1606 - if (strstr(testword, "tbi")) istypo = 1;
50.1607 - if (strstr(testword, "tbe")) istypo = 1;
50.1608 - if (strstr(testword, "ii")) istypo = 1;
50.1609 -
50.1610 - /* check for no vowels or no consonants. */
50.1611 - /* If none, flag a typo */
50.1612 - if (!istypo && strlen(testword)>1) {
50.1613 - vowel = consonant = 0;
50.1614 - for (i = 0; testword[i]; i++)
50.1615 - if (testword[i] == 'y' || gcisdigit(testword[i])) { /* Yah, this is loose. */
50.1616 - vowel++;
50.1617 - consonant++;
50.1618 - }
50.1619 - else
50.1620 - if (strchr(vowels, testword[i])) vowel++;
50.1621 - else consonant++;
50.1622 - if (!vowel || !consonant) {
50.1623 - istypo = 1;
50.1624 - }
50.1625 - }
50.1626 -
50.1627 - /* now exclude the word from being reported if it's in */
50.1628 - /* the okword list */
50.1629 - for (i = 0; *okword[i]; i++)
50.1630 - if (!strcmp(testword, okword[i]))
50.1631 - istypo = 0;
50.1632 -
50.1633 - /* what looks like a typo may be a Roman numeral. Exclude these */
50.1634 - if (istypo)
50.1635 - if (isroman(testword))
50.1636 - istypo = 0;
50.1637 -
50.1638 - /* check the manual list of typos */
50.1639 - if (!istypo)
50.1640 - for (i = 0; *typo[i]; i++)
50.1641 - if (!strcmp(testword, typo[i]))
50.1642 - istypo = 1;
50.1643 -
50.1644 -
50.1645 - /* V.21 - check lowercase s and l - special cases */
50.1646 - /* V.98 - added "i" and "m" */
50.1647 - /* V.99 - added "j" often a semi-colon gone wrong */
50.1648 - /* - and "d" for a missing apostrophe - he d */
50.1649 - /* - and "n" for "in" */
50.1650 - if (!istypo && strlen(testword) == 1)
50.1651 - if (strchr("slmijdn", *inword))
50.1652 - istypo = 1;
50.1653 -
50.1654 -
50.1655 - if (istypo) {
50.1656 - isdup = 0;
50.1657 - if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
50.1658 - for (i = 0; i < qword_index; i++)
50.1659 - if (!strcmp(testword, qword[i])) {
50.1660 - isdup = 1;
50.1661 - ++dupcnt[i];
50.1662 - }
50.1663 - if (!isdup) {
50.1664 - if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
50.1665 - strcpy(qword[qword_index], testword);
50.1666 - qword_index++;
50.1667 - }
50.1668 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1669 - if (!pswit[OVERVIEW_SWITCH]) {
50.1670 - printf(" Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword);
50.1671 - if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
50.1672 - printf(" - not reporting duplicates");
50.1673 - printf("\n");
50.1674 - }
50.1675 - else
50.1676 - cnt_word++;
50.1677 - }
50.1678 - }
50.1679 - } /* end of typo-checking */
50.1680 -
50.1681 - /* check the user's list of typos */
50.1682 - if (!istypo)
50.1683 - if (usertypo_count)
50.1684 - for (i = 0; i < usertypo_count; i++)
50.1685 - if (!strcmp(testword, usertypo[i])) {
50.1686 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1687 - if (!pswit[OVERVIEW_SWITCH])
50.1688 - printf(" Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
50.1689 - }
50.1690 -
50.1691 -
50.1692 -
50.1693 - if (pswit[PARANOID_SWITCH] && warn_digit) { /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/
50.1694 - if (!strcmp(inword, "0") || !strcmp(inword, "1")) {
50.1695 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1696 - if (!pswit[OVERVIEW_SWITCH])
50.1697 - printf(" Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
50.1698 - else
50.1699 - cnt_word++;
50.1700 - }
50.1701 - }
50.1702 - }
50.1703 -
50.1704 - /* look for added or missing spaces around punctuation and quotes */
50.1705 - /* If there is a punctuation character like ! with no space on */
50.1706 - /* either side, suspect a missing!space. If there are spaces on */
50.1707 - /* both sides , assume a typo. If we see a double quote with no */
50.1708 - /* space or punctuation on either side of it, assume unspaced */
50.1709 - /* quotes "like"this. */
50.1710 - llen = strlen(aline);
50.1711 - for (i = 1; i < llen; i++) { /* for each character in the line after the first */
50.1712 - if (strchr(".?!,;:_", aline[i])) { /* if it's punctuation */
50.1713 - isacro = 0; /* we need to suppress warnings for acronyms like M.D. */
50.1714 - isellipsis = 0; /* we need to suppress warnings for ellipsis . . . */
50.1715 - if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) || /* if there are letters on both sides of it or ... */
50.1716 - (gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */
50.1717 - if (aline[i] == '.') {
50.1718 - if (i > 2)
50.1719 - if (aline[i-2] == '.') isacro = 1;
50.1720 - if (i + 2 < llen)
50.1721 - if (aline[i+2] == '.') isacro = 1;
50.1722 - }
50.1723 - if (!isacro) {
50.1724 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1725 - if (!pswit[OVERVIEW_SWITCH])
50.1726 - printf(" Line %ld column %d - Missing space?\n", linecnt, i+1);
50.1727 - else
50.1728 - cnt_punct++;
50.1729 - }
50.1730 - }
50.1731 - if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */
50.1732 - if (aline[i] == '.') {
50.1733 - if (i > 2)
50.1734 - if (aline[i-2] == '.') isellipsis = 1;
50.1735 - if (i + 2 < llen)
50.1736 - if (aline[i+2] == '.') isellipsis = 1;
50.1737 - }
50.1738 - if (!isemptyline && !isellipsis) {
50.1739 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1740 - if (!pswit[OVERVIEW_SWITCH])
50.1741 - printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
50.1742 - else
50.1743 - cnt_punct++;
50.1744 - }
50.1745 - }
50.1746 - }
50.1747 - }
50.1748 -
50.1749 - /* 0.98 -- split out the characters that CANNOT be preceded by space */
50.1750 - llen = strlen(aline);
50.1751 - for (i = 1; i < llen; i++) { /* for each character in the line after the first */
50.1752 - if (strchr("?!,;:", aline[i])) { /* if it's punctuation that _cannot_ have a space before it */
50.1753 - if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */
50.1754 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1755 - if (!pswit[OVERVIEW_SWITCH])
50.1756 - printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
50.1757 - else
50.1758 - cnt_punct++;
50.1759 - }
50.1760 - }
50.1761 - }
50.1762 -
50.1763 -
50.1764 - /* 0.99 -- special case " .X" where X is any alpha. */
50.1765 - /* This plugs a hole in the acronym code above. Inelegant, but maintainable. */
50.1766 - llen = strlen(aline);
50.1767 - for (i = 1; i < llen; i++) { /* for each character in the line after the first */
50.1768 - if (aline[i] == '.') { /* if it's a period */
50.1769 - if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */
50.1770 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1771 - if (!pswit[OVERVIEW_SWITCH])
50.1772 - printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
50.1773 - else
50.1774 - cnt_punct++;
50.1775 - }
50.1776 - }
50.1777 - }
50.1778 -
50.1779 -
50.1780 -
50.1781 -
50.1782 - /* v.21 breaking out the search for unspaced doublequotes */
50.1783 - /* This is not as efficient, but it's more maintainable */
50.1784 - /* V.97 added underscore to the list of characters not to query, */
50.1785 - /* since underscores are commonly used as italics indicators. */
50.1786 - /* V.98 Added slash as well, same reason. */
50.1787 - for (i = 1; i < llen; i++) { /* for each character in the line after the first */
50.1788 - if (aline[i] == CHAR_DQUOTE) {
50.1789 - if ((!strchr(" _-.'`,;:!/([{?}])", aline[i-1]) &&
50.1790 - !strchr(" _-.'`,;:!/([{?}])", aline[i+1]) &&
50.1791 - aline[i+1] != 0
50.1792 - || (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) {
50.1793 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1794 - if (!pswit[OVERVIEW_SWITCH])
50.1795 - printf(" Line %ld column %d - Unspaced quotes?\n", linecnt, i+1);
50.1796 - else
50.1797 - cnt_punct++;
50.1798 - }
50.1799 - }
50.1800 - }
50.1801 -
50.1802 -
50.1803 - /* v.98 check parity of quotes */
50.1804 - /* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". */
50.1805 - for (s = aline; *s; s++) {
50.1806 - if (*s == CHAR_DQUOTE) {
50.1807 - if (!(dquotepar = !dquotepar)) { /* parity even */
50.1808 - if (!strchr("_-.'`/,;:!?)]} ", *(s+1))) {
50.1809 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1810 - if (!pswit[OVERVIEW_SWITCH])
50.1811 - printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
50.1812 - else
50.1813 - cnt_punct++;
50.1814 - }
50.1815 - }
50.1816 - else { /* parity odd */
50.1817 - if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$", *(s+1)) || !*(s+1)) {
50.1818 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1819 - if (!pswit[OVERVIEW_SWITCH])
50.1820 - printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
50.1821 - else
50.1822 - cnt_punct++;
50.1823 - }
50.1824 - }
50.1825 - }
50.1826 - }
50.1827 -
50.1828 - if (*aline == CHAR_DQUOTE) {
50.1829 - if (strchr(",;:!?)]} ", aline[1])) {
50.1830 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1831 - if (!pswit[OVERVIEW_SWITCH])
50.1832 - printf(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
50.1833 - else
50.1834 - cnt_punct++;
50.1835 - }
50.1836 - }
50.1837 -
50.1838 - if (pswit[SQUOTE_SWITCH])
50.1839 - for (s = aline; *s; s++) {
50.1840 - if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
50.1841 - && ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) {
50.1842 - if (!(squotepar = !squotepar)) { /* parity even */
50.1843 - if (!strchr("_-.'`/\",;:!?)]} ", *(s+1))) {
50.1844 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1845 - if (!pswit[OVERVIEW_SWITCH])
50.1846 - printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
50.1847 - else
50.1848 - cnt_punct++;
50.1849 - }
50.1850 - }
50.1851 - else { /* parity odd */
50.1852 - if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`", *(s+1)) || !*(s+1)) {
50.1853 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1854 - if (!pswit[OVERVIEW_SWITCH])
50.1855 - printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
50.1856 - else
50.1857 - cnt_punct++;
50.1858 - }
50.1859 - }
50.1860 - }
50.1861 - }
50.1862 -
50.1863 -
50.1864 - /* v.20 also look for double punctuation like ,. or ,, */
50.1865 - /* Thanks to DW for the suggestion! */
50.1866 - /* I'm putting this in a separate loop for clarity */
50.1867 - /* In books with references, ".," and ".;" are common */
50.1868 - /* e.g. "etc., etc.," and vol. 1.; vol 3.; */
50.1869 - /* OTOH, from my initial tests, there are also fairly */
50.1870 - /* common errors. What to do? Make these cases paranoid? */
50.1871 - /* V.21 ".," is the most common, so invented warn_dotcomma */
50.1872 - /* to suppress detailed reporting if it occurs often */
50.1873 - llen = strlen(aline);
50.1874 - for (i = 0; i < llen; i++) /* for each character in the line */
50.1875 - if (strchr(".?!,;:", aline[i]) /* if it's punctuation */
50.1876 - && (strchr(".?!,;:", aline[i+1]))
50.1877 - && aline[i] && aline[i+1]) /* followed by punctuation, it's a query, unless . . . */
50.1878 - if (
50.1879 - (aline[i] == aline[i+1]
50.1880 - && (aline[i] == '.' || aline[i] == '?' || aline[i] == '!'))
50.1881 - || (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',')
50.1882 - || (isFrench && !strncmp(aline+i, ",...", 4))
50.1883 - || (isFrench && !strncmp(aline+i, "...,", 4))
50.1884 - || (isFrench && !strncmp(aline+i, ";...", 4))
50.1885 - || (isFrench && !strncmp(aline+i, "...;", 4))
50.1886 - || (isFrench && !strncmp(aline+i, ":...", 4))
50.1887 - || (isFrench && !strncmp(aline+i, "...:", 4))
50.1888 - || (isFrench && !strncmp(aline+i, "!...", 4))
50.1889 - || (isFrench && !strncmp(aline+i, "...!", 4))
50.1890 - || (isFrench && !strncmp(aline+i, "?...", 4))
50.1891 - || (isFrench && !strncmp(aline+i, "...?", 4))
50.1892 - ) {
50.1893 - if ((isFrench && !strncmp(aline+i, ",...", 4)) /* could this BE any more awkward? */
50.1894 - || (isFrench && !strncmp(aline+i, "...,", 4))
50.1895 - || (isFrench && !strncmp(aline+i, ";...", 4))
50.1896 - || (isFrench && !strncmp(aline+i, "...;", 4))
50.1897 - || (isFrench && !strncmp(aline+i, ":...", 4))
50.1898 - || (isFrench && !strncmp(aline+i, "...:", 4))
50.1899 - || (isFrench && !strncmp(aline+i, "!...", 4))
50.1900 - || (isFrench && !strncmp(aline+i, "...!", 4))
50.1901 - || (isFrench && !strncmp(aline+i, "?...", 4))
50.1902 - || (isFrench && !strncmp(aline+i, "...?", 4)))
50.1903 - i +=4;
50.1904 - ; /* do nothing for .. !! and ?? which can be legit */
50.1905 - }
50.1906 - else {
50.1907 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1908 - if (!pswit[OVERVIEW_SWITCH])
50.1909 - printf(" Line %ld column %d - Double punctuation?\n", linecnt, i+1);
50.1910 - else
50.1911 - cnt_punct++;
50.1912 - }
50.1913 -
50.1914 - /* v.21 breaking out the search for spaced doublequotes */
50.1915 - /* This is not as efficient, but it's more maintainable */
50.1916 - s = aline;
50.1917 - while (strstr(s," \" ")) {
50.1918 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1919 - if (!pswit[OVERVIEW_SWITCH])
50.1920 - printf(" Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1));
50.1921 - else
50.1922 - cnt_punct++;
50.1923 - s = strstr(s," \" ") + 2;
50.1924 - }
50.1925 -
50.1926 - /* v.20 also look for spaced singlequotes ' and ` */
50.1927 - s = aline;
50.1928 - while (strstr(s," ' ")) {
50.1929 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1930 - if (!pswit[OVERVIEW_SWITCH])
50.1931 - printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1));
50.1932 - else
50.1933 - cnt_punct++;
50.1934 - s = strstr(s," ' ") + 2;
50.1935 - }
50.1936 -
50.1937 - s = aline;
50.1938 - while (strstr(s," ` ")) {
50.1939 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1940 - if (!pswit[OVERVIEW_SWITCH])
50.1941 - printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1));
50.1942 - else
50.1943 - cnt_punct++;
50.1944 - s = strstr(s," ` ") + 2;
50.1945 - }
50.1946 -
50.1947 - /* v.99 check special case of 'S instead of 's at end of word */
50.1948 - s = aline + 1;
50.1949 - while (*s) {
50.1950 - if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z') {
50.1951 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1952 - if (!pswit[OVERVIEW_SWITCH])
50.1953 - printf(" Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2));
50.1954 - else
50.1955 - cnt_punct++;
50.1956 - }
50.1957 - s++;
50.1958 - }
50.1959 -
50.1960 -
50.1961 - /* v.21 Now check special cases - start and end of line - */
50.1962 - /* for single and double quotes. Start is sometimes [sic] */
50.1963 - /* but better to query it anyway. */
50.1964 - /* While I'm here, check for dash at end of line */
50.1965 - llen = strlen(aline);
50.1966 - if (llen > 1) {
50.1967 - if (aline[llen-1] == CHAR_DQUOTE ||
50.1968 - aline[llen-1] == CHAR_SQUOTE ||
50.1969 - aline[llen-1] == CHAR_OPEN_SQUOTE)
50.1970 - if (aline[llen-2] == CHAR_SPACE) {
50.1971 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1972 - if (!pswit[OVERVIEW_SWITCH])
50.1973 - printf(" Line %ld column %d - Spaced quote?\n", linecnt, llen);
50.1974 - else
50.1975 - cnt_punct++;
50.1976 - }
50.1977 -
50.1978 - /* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */
50.1979 - /* Wrongspaced quotes test also catches it for " */
50.1980 - if (aline[0] == CHAR_SQUOTE ||
50.1981 - aline[0] == CHAR_OPEN_SQUOTE)
50.1982 - if (aline[1] == CHAR_SPACE) {
50.1983 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1984 - if (!pswit[OVERVIEW_SWITCH])
50.1985 - printf(" Line %ld column 1 - Spaced quote?\n", linecnt);
50.1986 - else
50.1987 - cnt_punct++;
50.1988 - }
50.1989 - /* dash at end of line may well be legit - paranoid mode only */
50.1990 - /* and don't report em-dash at line-end */
50.1991 - if (pswit[PARANOID_SWITCH] && warn_hyphen) {
50.1992 - for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
50.1993 - if (aline[i] == '-' && aline[i-1] != '-') {
50.1994 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.1995 - if (!pswit[OVERVIEW_SWITCH])
50.1996 - printf(" Line %ld column %d - Hyphen at end of line?\n", linecnt, i);
50.1997 - }
50.1998 - }
50.1999 - }
50.2000 -
50.2001 - /* v.21 also look for brackets surrounded by alpha */
50.2002 - /* Brackets are often unspaced, but shouldn't be surrounded by alpha. */
50.2003 - /* If so, suspect a scanno like "a]most" */
50.2004 - llen = strlen(aline);
50.2005 - for (i = 1; i < llen-1; i++) { /* for each character in the line except 1st & last*/
50.2006 - if (strchr("{[()]}", aline[i]) /* if it's a bracket */
50.2007 - && gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) {
50.2008 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.2009 - if (!pswit[OVERVIEW_SWITCH])
50.2010 - printf(" Line %ld column %d - Unspaced bracket?\n", linecnt, i);
50.2011 - else
50.2012 - cnt_punct++;
50.2013 - }
50.2014 - }
50.2015 - /* The "Cinderella" case, back in again! :-S Give it another shot */
50.2016 - if (warn_endquote) {
50.2017 - llen = strlen(aline);
50.2018 - for (i = 1; i < llen; i++) { /* for each character in the line except 1st */
50.2019 - if (aline[i] == CHAR_DQUOTE)
50.2020 - if (isalpha(aline[i-1])) {
50.2021 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.2022 - if (!pswit[OVERVIEW_SWITCH])
50.2023 - printf(" Line %ld column %d - endquote missing punctuation?\n", linecnt, i);
50.2024 - else
50.2025 - cnt_punct++;
50.2026 - }
50.2027 - }
50.2028 - }
50.2029 -
50.2030 - llen = strlen(aline);
50.2031 -
50.2032 - /* Check for <HTML TAG> */
50.2033 - /* If there is a < in the line, followed at some point */
50.2034 - /* by a > then we suspect HTML */
50.2035 - if (strstr(aline, "<") && strstr(aline, ">")) {
50.2036 - i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
50.2037 - if (i > 0) {
50.2038 - strncpy(wrk, strstr(aline, "<"), i);
50.2039 - wrk[i] = 0;
50.2040 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.2041 - if (!pswit[OVERVIEW_SWITCH])
50.2042 - printf(" Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk);
50.2043 - else
50.2044 - cnt_html++;
50.2045 - }
50.2046 - }
50.2047 -
50.2048 - /* Check for &symbol; HTML */
50.2049 - /* If there is a & in the line, followed at */
50.2050 - /* some point by a ; then we suspect HTML */
50.2051 - if (strstr(aline, "&") && strstr(aline, ";")) {
50.2052 - i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1);
50.2053 - for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++)
50.2054 - if (*s == CHAR_SPACE) i = 0; /* 0.99 don't report "Jones & Son;" */
50.2055 - if (i > 0) {
50.2056 - strncpy(wrk, strstr(aline,"&"), i);
50.2057 - wrk[i] = 0;
50.2058 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
50.2059 - if (!pswit[OVERVIEW_SWITCH])
50.2060 - printf(" Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk);
50.2061 - else
50.2062 - cnt_html++;
50.2063 - }
50.2064 - }
50.2065 -
50.2066 - /* At end of paragraph, check for mismatched quotes. */
50.2067 - /* We don't want to report an error immediately, since it is a */
50.2068 - /* common convention to omit the quotes at end of paragraph if */
50.2069 - /* the next paragraph is a continuation of the same speaker. */
50.2070 - /* Where this is the case, the next para should begin with a */
50.2071 - /* quote, so we store the warning message and only display it */
50.2072 - /* at the top of the next iteration if the new para doesn't */
50.2073 - /* start with a quote. */
50.2074 - /* The -p switch overrides this default, and warns of unclosed */
50.2075 - /* quotes on _every_ paragraph, whether the next begins with a */
50.2076 - /* quote or not. */
50.2077 - /* Version .16 - only report mismatched single quotes if */
50.2078 - /* an open_single_quotes was found. */
50.2079 -
50.2080 - if (isemptyline) { /* end of para - add up the totals */
50.2081 - if (quot % 2)
50.2082 - sprintf(dquote_err, " Line %ld - Mismatched quotes\n", linecnt);
50.2083 - if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) )
50.2084 - sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt);
50.2085 - if (pswit[SQUOTE_SWITCH] && open_single_quote
50.2086 - && (open_single_quote != close_single_quote)
50.2087 - && (open_single_quote != close_single_quote +1) )
50.2088 - squot = 1; /* flag it to be noted regardless of the first char of the next para */
50.2089 - if (r_brack)
50.2090 - sprintf(rbrack_err, " Line %ld - Mismatched round brackets?\n", linecnt);
50.2091 - if (s_brack)
50.2092 - sprintf(sbrack_err, " Line %ld - Mismatched square brackets?\n", linecnt);
50.2093 - if (c_brack)
50.2094 - sprintf(cbrack_err, " Line %ld - Mismatched curly brackets?\n", linecnt);
50.2095 - if (c_unders % 2)
50.2096 - sprintf(unders_err, " Line %ld - Mismatched underscores?\n", linecnt);
50.2097 - quot = s_brack = c_brack = r_brack = c_unders =
50.2098 - open_single_quote = close_single_quote = 0;
50.2099 - isnewpara = 1; /* let the next iteration know that it's starting a new para */
50.2100 - }
50.2101 -
50.2102 - /* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */
50.2103 - /* by working back through prevline. DW. */
50.2104 - /* Hmmm. Need to check this only for "normal" paras. */
50.2105 - /* So what is a "normal" para? ouch! */
50.2106 - /* Not normal if one-liner (chapter headings, etc.) */
50.2107 - /* Not normal if doesn't contain at least one locase letter */
50.2108 - /* Not normal if starts with space */
50.2109 -
50.2110 - /* 0.99 tighten up on para end checks. Disallow comma and */
50.2111 - /* semi-colon. Check for legit para end before quotes. */
50.2112 - if (isemptyline) { /* end of para */
50.2113 - for (s = prevline, i = 0; *s && !i; s++)
50.2114 - if (gcisletter(*s))
50.2115 - i = 1; /* use i to indicate the presence of a letter on the line */
50.2116 - /* This next "if" is a problem. */
50.2117 - /* If I say "start_para_line <= linecnt - 1", that includes one-line */
50.2118 - /* "paragraphs" like chapter heads. Lotsa false positives. */
50.2119 - /* If I say "start_para_line < linecnt - 1" it doesn't, but then it */
50.2120 - /* misses genuine one-line paragraphs. */
50.2121 - /* So what do I do? */
50.2122 - if (i
50.2123 - && lastblen > 2
50.2124 - && start_para_line < linecnt - 1
50.2125 - && *prevline > CHAR_SPACE
50.2126 - ) {
50.2127 - for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--);
50.2128 - for ( ; i > 0; i--) {
50.2129 - if (gcisalpha(prevline[i])) {
50.2130 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
50.2131 - if (!pswit[OVERVIEW_SWITCH])
50.2132 - printf(" Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline));
50.2133 - else
50.2134 - cnt_punct++;
50.2135 - break;
50.2136 - }
50.2137 - if (strchr("-.:!([{?}])", prevline[i]))
50.2138 - break;
50.2139 - }
50.2140 - }
50.2141 - }
50.2142 - strcpy(prevline, aline);
50.2143 - }
50.2144 - fclose (infile);
50.2145 - if (!pswit[OVERVIEW_SWITCH])
50.2146 - for (i = 0; i < MAX_QWORD; i++)
50.2147 - if (dupcnt[i])
50.2148 - printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s");
50.2149 -}
50.2150 -
50.2151 -
50.2152 -
50.2153 -/* flgets - get one line from the input stream, checking for */
50.2154 -/* the existence of exactly one CR/LF line-end per line. */
50.2155 -/* Returns a pointer to the line. */
50.2156 -
50.2157 -char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt)
50.2158 -{
50.2159 - char c;
50.2160 - int len, isCR, cint;
50.2161 -
50.2162 - *theline = 0;
50.2163 - len = isCR = 0;
50.2164 - c = cint = fgetc(thefile);
50.2165 - do {
50.2166 - if (cint == EOF)
50.2167 - return (NULL);
50.2168 - if (c == 10) /* either way, it's end of line */
50.2169 - if (isCR)
50.2170 - break;
50.2171 - else { /* Error - a LF without a preceding CR */
50.2172 - if (pswit[LINE_END_SWITCH]) {
50.2173 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
50.2174 - if (!pswit[OVERVIEW_SWITCH])
50.2175 - printf(" Line %ld - No CR?\n", lcnt);
50.2176 - else
50.2177 - cnt_lineend++;
50.2178 - }
50.2179 - break;
50.2180 - }
50.2181 - if (c == 13) {
50.2182 - if (isCR) { /* Error - two successive CRs */
50.2183 - if (pswit[LINE_END_SWITCH]) {
50.2184 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
50.2185 - if (!pswit[OVERVIEW_SWITCH])
50.2186 - printf(" Line %ld - Two successive CRs?\n", lcnt);
50.2187 - else
50.2188 - cnt_lineend++;
50.2189 - }
50.2190 - }
50.2191 - isCR = 1;
50.2192 - }
50.2193 - else {
50.2194 - if (pswit[LINE_END_SWITCH] && isCR) {
50.2195 - if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
50.2196 - if (!pswit[OVERVIEW_SWITCH])
50.2197 - printf(" Line %ld column %d - CR without LF?\n", lcnt, len+1);
50.2198 - else
50.2199 - cnt_lineend++;
50.2200 - }
50.2201 - theline[len] = c;
50.2202 - len++;
50.2203 - theline[len] = 0;
50.2204 - isCR = 0;
50.2205 - }
50.2206 - c = cint = fgetc(thefile);
50.2207 - } while(len < maxlen);
50.2208 - if (pswit[MARKUP_SWITCH])
50.2209 - postprocess_for_HTML(theline);
50.2210 - if (pswit[DP_SWITCH])
50.2211 - postprocess_for_DP(theline);
50.2212 - return(theline);
50.2213 -}
50.2214 -
50.2215 -
50.2216 -
50.2217 -
50.2218 -/* mixdigit - takes a "word" as a parameter, and checks whether it */
50.2219 -/* contains a mixture of alpha and digits. Generally, this is an */
50.2220 -/* error, but may not be for cases like 4th or L5 12s. 3d. */
50.2221 -/* Returns 0 if no error found, 1 if error. */
50.2222 -
50.2223 -int mixdigit(char *checkword) /* check for digits like 1 or 0 in words */
50.2224 -{
50.2225 - int wehaveadigit, wehavealetter, firstdigits, query, wl;
50.2226 - char *s;
50.2227 -
50.2228 -
50.2229 - wehaveadigit = wehavealetter = query = 0;
50.2230 - for (s = checkword; *s; s++)
50.2231 - if (gcisalpha(*s))
50.2232 - wehavealetter = 1;
50.2233 - else
50.2234 - if (gcisdigit(*s))
50.2235 - wehaveadigit = 1;
50.2236 - if (wehaveadigit && wehavealetter) { /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
50.2237 - query = 1;
50.2238 - wl = strlen(checkword);
50.2239 - for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++)
50.2240 - ;
50.2241 - /* digits, ending in st, rd, nd, th of either case */
50.2242 - /* 0.99 donovan points out an error below. Turns out */
50.2243 - /* I was using matchword like strcmp when the */
50.2244 - /* return values are different! Duh. */
50.2245 - if (firstdigits + 2 == wl &&
50.2246 - (matchword(checkword + wl - 2, "st")
50.2247 - || matchword(checkword + wl - 2, "rd")
50.2248 - || matchword(checkword + wl - 2, "nd")
50.2249 - || matchword(checkword + wl - 2, "th"))
50.2250 - )
50.2251 - query = 0;
50.2252 - if (firstdigits + 3 == wl &&
50.2253 - (matchword(checkword + wl - 3, "sts")
50.2254 - || matchword(checkword + wl - 3, "rds")
50.2255 - || matchword(checkword + wl - 3, "nds")
50.2256 - || matchword(checkword + wl - 3, "ths"))
50.2257 - )
50.2258 - query = 0;
50.2259 - if (firstdigits + 3 == wl &&
50.2260 - (matchword(checkword + wl - 4, "stly")
50.2261 - || matchword(checkword + wl - 4, "rdly")
50.2262 - || matchword(checkword + wl - 4, "ndly")
50.2263 - || matchword(checkword + wl - 4, "thly"))
50.2264 - )
50.2265 - query = 0;
50.2266 -
50.2267 - /* digits, ending in l, L, s or d */
50.2268 - if (firstdigits + 1 == wl &&
50.2269 - (checkword[wl-1] == 'l'
50.2270 - || checkword[wl-1] == 'L'
50.2271 - || checkword[wl-1] == 's'
50.2272 - || checkword[wl-1] == 'd'))
50.2273 - query = 0;
50.2274 - /* L at the start of a number, representing Britsh pounds, like L500 */
50.2275 - /* This is cute. We know the current word is mixeddigit. If the first */
50.2276 - /* letter is L, there must be at least one digit following. If both */
50.2277 - /* digits and letters follow, we have a genuine error, else we have a */
50.2278 - /* capital L followed by digits, and we accept that as a non-error. */
50.2279 - if (checkword[0] == 'L')
50.2280 - if (!mixdigit(checkword+1))
50.2281 - query = 0;
50.2282 - }
50.2283 - return (query);
50.2284 -}
50.2285 -
50.2286 -
50.2287 -
50.2288 -
50.2289 -/* getaword - extracts the first/next "word" from the line, and puts */
50.2290 -/* it into "thisword". A word is defined as one English word unit */
50.2291 -/* -- or at least that's what I'm trying for. */
50.2292 -/* Returns a pointer to the position in the line where we will start */
50.2293 -/* looking for the next word. */
50.2294 -
50.2295 -char *getaword(char *fromline, char *thisword)
50.2296 -{
50.2297 - int i, wordlen;
50.2298 - char *s;
50.2299 -
50.2300 - wordlen = 0;
50.2301 - for ( ; !gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline ; fromline++ );
50.2302 -
50.2303 - /* V .20 */
50.2304 - /* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35. */
50.2305 - /* Especially yucky is the case of L1,000 */
50.2306 - /* I hate this, and I see other ways, but I don't see that any is _better_.*/
50.2307 - /* This section looks for a pattern of characters including a digit */
50.2308 - /* followed by a comma or period followed by one or more digits. */
50.2309 - /* If found, it returns this whole pattern as a word; otherwise we discard */
50.2310 - /* the results and resume our normal programming. */
50.2311 - s = fromline;
50.2312 - for ( ; (gcisdigit(*s) || gcisalpha(*s) || *s == ',' || *s == '.') && wordlen < MAXWORDLEN ; s++ ) {
50.2313 - thisword[wordlen] = *s;
50.2314 - wordlen++;
50.2315 - }
50.2316 - thisword[wordlen] = 0;
50.2317 - for (i = 1; i < wordlen -1; i++) {
50.2318 - if (thisword[i] == '.' || thisword[i] == ',') {
50.2319 - if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1])) { /* we have one of the damned things */
50.2320 - fromline = s;
50.2321 - return(fromline);
50.2322 - }
50.2323 - }
50.2324 - }
50.2325 -
50.2326 - /* we didn't find a punctuated number - do the regular getword thing */
50.2327 - wordlen = 0;
50.2328 - for ( ; (gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
50.2329 - thisword[wordlen] = *fromline;
50.2330 - wordlen++;
50.2331 - }
50.2332 - thisword[wordlen] = 0;
50.2333 - return(fromline);
50.2334 -}
50.2335 -
50.2336 -
50.2337 -
50.2338 -
50.2339 -
50.2340 -/* matchword - just a case-insensitive string matcher */
50.2341 -/* yes, I know this is not efficient. I'll worry about */
50.2342 -/* that when I have a clear idea where I'm going with it.*/
50.2343 -
50.2344 -int matchword(char *checkfor, char *thisword)
50.2345 -{
50.2346 - unsigned int ismatch, i;
50.2347 -
50.2348 - if (strlen(checkfor) != strlen(thisword)) return(0);
50.2349 -
50.2350 - ismatch = 1; /* assume a match until we find a difference */
50.2351 - for (i = 0; i <strlen(checkfor); i++)
50.2352 - if (toupper(checkfor[i]) != toupper(thisword[i]))
50.2353 - ismatch = 0;
50.2354 - return (ismatch);
50.2355 -}
50.2356 -
50.2357 -
50.2358 -
50.2359 -
50.2360 -
50.2361 -/* lowerit - lowercase the line. Yes, strlwr does the same job, */
50.2362 -/* but not on all platforms, and I'm a bit paranoid about what */
50.2363 -/* some implementations of tolower might do to hi-bit characters,*/
50.2364 -/* which shouldn't matter, but better safe than sorry. */
50.2365 -
50.2366 -void lowerit(char *theline)
50.2367 -{
50.2368 - for ( ; *theline; theline++)
50.2369 - if (*theline >='A' && *theline <='Z')
50.2370 - *theline += 32;
50.2371 -}
50.2372 -
50.2373 -
50.2374 -/* Is this word a Roman Numeral? */
50.2375 -/* v 0.99 improved to be better. It still doesn't actually */
50.2376 -/* validate that the number is a valid Roman Numeral -- for example */
50.2377 -/* it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not*/
50.2378 -/* what we're here to do. If it passes this, it LOOKS like a Roman */
50.2379 -/* numeral. Anyway, the actual Romans were pretty tolerant of bad */
50.2380 -/* arithmetic, or expressions thereof, except when it came to taxes.*/
50.2381 -/* Allow any number of M, an optional D, an optional CM or CD, */
50.2382 -/* any number of optional Cs, an optional XL or an optional XC, an */
50.2383 -/* optional IX or IV, an optional V and any number of optional Is. */
50.2384 -/* Good enough for jazz chords. */
50.2385 -
50.2386 -int isroman(char *t)
50.2387 -{
50.2388 - char *s;
50.2389 -
50.2390 - if (!t || !*t) return (0);
50.2391 -
50.2392 - s = t;
50.2393 -
50.2394 - while (*t == 'm' && *t ) t++;
50.2395 - if (*t == 'd') t++;
50.2396 - if (*t == 'c' && *(t+1) == 'm') t+=2;
50.2397 - if (*t == 'c' && *(t+1) == 'd') t+=2;
50.2398 - while (*t == 'c' && *t) t++;
50.2399 - if (*t == 'x' && *(t+1) == 'l') t+=2;
50.2400 - if (*t == 'x' && *(t+1) == 'c') t+=2;
50.2401 - if (*t == 'l') t++;
50.2402 - while (*t == 'x' && *t) t++;
50.2403 - if (*t == 'i' && *(t+1) == 'x') t+=2;
50.2404 - if (*t == 'i' && *(t+1) == 'v') t+=2;
50.2405 - if (*t == 'v') t++;
50.2406 - while (*t == 'i' && *t) t++;
50.2407 - if (!*t) return (1);
50.2408 -
50.2409 - return(0);
50.2410 -}
50.2411 -
50.2412 -
50.2413 -
50.2414 -
50.2415 -/* gcisalpha is a special version that is somewhat lenient on 8-bit texts. */
50.2416 -/* If we use the standard isalpha() function, 8-bit accented characters break */
50.2417 -/* words, so that tete with accented characters appears to be two words, "t" */
50.2418 -/* and "t", with 8-bit characters between them. This causes over-reporting of */
50.2419 -/* errors. gcisalpha() recognizes accented letters from the CP1252 (Windows) */
50.2420 -/* and ISO-8859-1 character sets, which are the most common PG 8-bit types. */
50.2421 -
50.2422 -int gcisalpha(unsigned char c)
50.2423 -{
50.2424 - if (c >='a' && c <='z') return(1);
50.2425 - if (c >='A' && c <='Z') return(1);
50.2426 - if (c < 140) return(0);
50.2427 - if (c >=192 && c != 208 && c != 215 && c != 222 && c != 240 && c != 247 && c != 254) return(1);
50.2428 - if (c == 140 || c == 142 || c == 156 || c == 158 || c == 159) return (1);
50.2429 - return(0);
50.2430 -}
50.2431 -
50.2432 -/* gcisdigit is a special version that doesn't get confused in 8-bit texts. */
50.2433 -int gcisdigit(unsigned char c)
50.2434 -{
50.2435 - if (c >= '0' && c <='9') return(1);
50.2436 - return(0);
50.2437 -}
50.2438 -
50.2439 -/* gcisletter is a special version that doesn't get confused in 8-bit texts. */
50.2440 -/* Yeah, we're ISO-8891-1-specific. So sue me. */
50.2441 -int gcisletter(unsigned char c)
50.2442 -{
50.2443 - if ((c >= 'A' && c <='Z') || (c >= 'a' && c <='z') || c >= 192) return(1);
50.2444 - return(0);
50.2445 -}
50.2446 -
50.2447 -
50.2448 -
50.2449 -
50.2450 -/* gcstrchr wraps strchr to return NULL if the character being searched for is zero */
50.2451 -
50.2452 -char *gcstrchr(char *s, char c)
50.2453 -{
50.2454 - if (c == 0) return(NULL);
50.2455 - return(strchr(s,c));
50.2456 -}
50.2457 -
50.2458 -/* postprocess_for_DP is derived from postprocess_for_HTML */
50.2459 -/* It is invoked with the -d switch from flgets(). */
50.2460 -/* It simply "removes" from the line a hard-coded set of common */
50.2461 -/* DP-specific tags, so that the line passed to the main routine has*/
50.2462 -/* been pre-cleaned of DP markup. */
50.2463 -
50.2464 -void postprocess_for_DP(char *theline)
50.2465 -{
50.2466 -
50.2467 - char *s, *t;
50.2468 - int i;
50.2469 -
50.2470 - if (!*theline)
50.2471 - return;
50.2472 -
50.2473 - for (i = 0; *DPmarkup[i]; i++) {
50.2474 - s = strstr(theline, DPmarkup[i]);
50.2475 - while (s) {
50.2476 - t = s + strlen(DPmarkup[i]);
50.2477 - while (*t) {
50.2478 - *s = *t;
50.2479 - t++; s++;
50.2480 - }
50.2481 - *s = 0;
50.2482 - s = strstr(theline, DPmarkup[i]);
50.2483 - }
50.2484 - }
50.2485 -
50.2486 -}
50.2487 -
50.2488 -
50.2489 -/* postprocess_for_HTML is, at the moment (0.97), a very nasty */
50.2490 -/* short-term fix for Charlz. Nasty, nasty, nasty. */
50.2491 -/* It is invoked with the -m switch from flgets(). */
50.2492 -/* It simply "removes" from the line a hard-coded set of common */
50.2493 -/* HTML tags and "replaces" a hard-coded set of common HTML */
50.2494 -/* entities, so that the line passed to the main routine has */
50.2495 -/* been pre-cleaned of HTML. This is _so_ not the right way to */
50.2496 -/* deal with HTML, but what Charlz needs now is not HTML handling */
50.2497 -/* proper: just ignoring <i> tags and some others. */
50.2498 -/* To be revisited in future releases! */
50.2499 -
50.2500 -void postprocess_for_HTML(char *theline)
50.2501 -{
50.2502 -
50.2503 - if (strstr(theline, "<") && strstr(theline, ">"))
50.2504 - while (losemarkup(theline))
50.2505 - ;
50.2506 - while (loseentities(theline))
50.2507 - ;
50.2508 -}
50.2509 -
50.2510 -char *losemarkup(char *theline)
50.2511 -{
50.2512 - char *s, *t;
50.2513 - int i;
50.2514 -
50.2515 - if (!*theline)
50.2516 - return(NULL);
50.2517 -
50.2518 - s = strstr(theline, "<");
50.2519 - t = strstr(theline, ">");
50.2520 - if (!s || !t) return(NULL);
50.2521 - for (i = 0; *markup[i]; i++)
50.2522 - if (!tagcomp(s+1, markup[i])) {
50.2523 - if (!*(t+1)) {
50.2524 - *s = 0;
50.2525 - return(s);
50.2526 - }
50.2527 - else
50.2528 - if (t > s) {
50.2529 - strcpy(s, t+1);
50.2530 - return(s);
50.2531 - }
50.2532 - }
50.2533 - /* it's an unrecognized <xxx> */
50.2534 - return(NULL);
50.2535 -}
50.2536 -
50.2537 -char *loseentities(char *theline)
50.2538 -{
50.2539 - int i;
50.2540 - char *s, *t;
50.2541 -
50.2542 - if (!*theline)
50.2543 - return(NULL);
50.2544 -
50.2545 - for (i = 0; *entities[i].htmlent; i++) {
50.2546 - s = strstr(theline, entities[i].htmlent);
50.2547 - if (s) {
50.2548 - t = malloc((size_t)strlen(s));
50.2549 - if (!t) return(NULL);
50.2550 - strcpy(t, s + strlen(entities[i].htmlent));
50.2551 - strcpy(s, entities[i].textent);
50.2552 - strcat(s, t);
50.2553 - free(t);
50.2554 - return(theline);
50.2555 - }
50.2556 - }
50.2557 -
50.2558 - /* V0.97 Duh. Forgot to check the htmlnum member */
50.2559 - for (i = 0; *entities[i].htmlnum; i++) {
50.2560 - s = strstr(theline, entities[i].htmlnum);
50.2561 - if (s) {
50.2562 - t = malloc((size_t)strlen(s));
50.2563 - if (!t) return(NULL);
50.2564 - strcpy(t, s + strlen(entities[i].htmlnum));
50.2565 - strcpy(s, entities[i].textent);
50.2566 - strcat(s, t);
50.2567 - free(t);
50.2568 - return(theline);
50.2569 - }
50.2570 - }
50.2571 - return(NULL);
50.2572 -}
50.2573 -
50.2574 -
50.2575 -int tagcomp(char *strin, char *basetag)
50.2576 -{
50.2577 - char *s, *t;
50.2578 -
50.2579 - s = basetag;
50.2580 - t = strin;
50.2581 - if (*t == '/') t++; /* ignore a slash */
50.2582 - while (*s && *t) {
50.2583 - if (tolower(*s) != tolower(*t)) return(1);
50.2584 - s++; t++;
50.2585 - }
50.2586 - /* OK, we have < followed by a valid tag start */
50.2587 - /* should I do something about length? */
50.2588 - /* this is messy. The length of an <i> tag is */
50.2589 - /* limited, but a <table> could go on for miles */
50.2590 - /* so I'd have to parse the tags . . . ugh. */
50.2591 - /* It isn't what Charlz needs now, so mark it */
50.2592 - /* as 'pending'. */
50.2593 - return(0);
50.2594 -}
50.2595 -
50.2596 -void proghelp() /* explain program usage here */
50.2597 -{
50.2598 - fputs("V. 0.991. Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
50.2599 - fputs("Gutcheck comes wih ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n", stderr);
50.2600 - fputs("This is Free Software; you may redistribute it under certain conditions (GPL);\n", stderr);
50.2601 - fputs("read the file COPYING for details.\n\n", stderr);
50.2602 - fputs("Usage is: gutcheck [-setpxloyhud] filename\n",stderr);
50.2603 - fputs(" where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",stderr);
50.2604 - fputs(" -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",stderr);
50.2605 - fputs(" -o just displays overview without detail, -h echoes header fields\n",stderr);
50.2606 - fputs(" -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",stderr);
50.2607 - fputs(" -d ignores DP-specific markup,\n",stderr);
50.2608 - fputs(" -u uses a file gutcheck.typ to query user-defined possible typos\n",stderr);
50.2609 - fputs("Sample usage: gutcheck warpeace.txt \n",stderr);
50.2610 - fputs("\n",stderr);
50.2611 - fputs("Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n", stderr);
50.2612 - fputs("Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",stderr);
50.2613 - fputs("characters like accented letters, lines longer than 75 or shorter than 55,\n",stderr);
50.2614 - fputs("unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",stderr);
50.2615 - fputs("HTML tags, some likely typos. It is NOT a substitute for human judgement.\n",stderr);
50.2616 - fputs("\n",stderr);
50.2617 -}
50.2618 -
50.2619 -
50.2620 -
50.2621 -/*********************************************************************
50.2622 - Revision History:
50.2623 -
50.2624 - 04/22/01 Cleaned up some stuff and released .10
50.2625 -
50.2626 - ---------------
50.2627 -
50.2628 - 05/09/01 Added the typo list, added two extra cases of he/be error,
50.2629 - added -p switch, OPEN_SINGLE QUOTE char as .11
50.2630 -
50.2631 - ---------------
50.2632 -
50.2633 - 05/20/01 Increased the typo list,
50.2634 - added paranoid mode,
50.2635 - ANSIfied the code and added some casts
50.2636 - so the compiler wouldn't keep asking if I knew what I was doing,
50.2637 - fixed bug in l.s.d. condition (thanks, Dave!),
50.2638 - standardized spacing when echoing,
50.2639 - added letter-combo checking code to typo section,
50.2640 - added more h/b words to typo array.
50.2641 - Not too sure about putting letter combos outside of the TYPO conditions -
50.2642 - someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see.
50.2643 - Released as .12
50.2644 -
50.2645 - ---------------
50.2646 -
50.2647 - 06/01/01 Removed duplicate reporting of Tildes, asterisks, etc.
50.2648 - 06/10/01 Added flgets routine to help with platform-independent
50.2649 - detection of invalid line-ends. All PG text files should
50.2650 - have CR/LF (13/10) at end of line, regardless of system.
50.2651 - Gutcheck now validates this by default. (Thanks, Charles!)
50.2652 - Released as .13
50.2653 -
50.2654 - ---------------
50.2655 -
50.2656 - 06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.)
50.2657 - Released as .14
50.2658 -
50.2659 - ---------------
50.2660 -
50.2661 - 06/23/01 Fixed: 'No',he said. not being flagged.
50.2662 -
50.2663 - Improved: better single-quotes checking:
50.2664 -
50.2665 - Ignore singlequotes surrounded by alpha, like didn't. (was OK)
50.2666 -
50.2667 - If a singlequote is at the END of a word AND the word ends in "s":
50.2668 - The dogs' tails wagged.
50.2669 - it's probably an apostrophe, but less commonly may be a closequote:
50.2670 - "These 'pack dogs' of yours look more like wolves."
50.2671 -
50.2672 - If it's got punctuation before it and is followed by a space
50.2673 - or punctuation:
50.2674 - . . . was a problem,' he said
50.2675 - . . . was a problem,'"
50.2676 - it is probably (certainly?) a closequote.
50.2677 -
50.2678 - If it's at start of paragraph, it's probably an openquote.
50.2679 - (but watch dialect)
50.2680 -
50.2681 - Words with ' at beginning and end are probably quoted:
50.2682 - "You have the word 'chivalry' frequently on your lips."
50.2683 - (Not specifically implemented)
50.2684 - V.18 I'm glad I didn't implement this, 'cos it jest ain't so
50.2685 - where the convention is to punctuate outside the quotes.
50.2686 - 'Come', he said, 'and join the party'.
50.2687 -
50.2688 - If it is followed by an alpha, and especially a capital:
50.2689 - 'Hello,' called he.
50.2690 - it is either an openquote or dialect.
50.2691 -
50.2692 - Dialect breaks ALL the rules:
50.2693 - A man's a man for a' that.
50.2694 - "Aye, but 'tis all in the pas' now."
50.2695 - "'Tis often the way," he said.
50.2696 - 'Ave a drink on me.
50.2697 -
50.2698 - This version looks to be an improvement, and produces
50.2699 - fewer false positives, but is still not perfect. The
50.2700 - 'pack dogs' case still fools it, and dialect is still
50.2701 - a problem. Oh, well, it's an improvement, and I have
50.2702 - a weighted structure in place for refining guesses at
50.2703 - closequotes. Maybe next time, I'll add a bit of logic
50.2704 - where if there is an open quote and one that was guessed
50.2705 - to be a possessive apostrophe after s, I'll re-guess it
50.2706 - to be a closequote. Let's see how this one flies, first.
50.2707 -
50.2708 - (Afterview: it's still crap. Needs much work, and a deeper insight.)
50.2709 -
50.2710 - Released as .15
50.2711 -
50.2712 - TODO: More he/be checks. Can't be perfect - counterexamples:
50.2713 - I gave my son good advice: be married regardless of the world's opinion.
50.2714 - I gave my son good advice: he married regardless of the world's opinion.
50.2715 -
50.2716 - If by "primitive" be meant "crude", we can understand the sentence.
50.2717 - If by "primitive" he meant "crude", we can understand the sentence.
50.2718 -
50.2719 - No matter what be said, I must go on.
50.2720 - No matter what he said, I must go on.
50.2721 -
50.2722 - No value, however great, can be set upon them.
50.2723 - No value, however great, can he set upon them.
50.2724 -
50.2725 - Real-Life one from a DP International Weekly Miscellany:
50.2726 - He wandered through the forest without fear, sleeping
50.2727 - much, for in sleep be had companionship--the Great
50.2728 - Spirit teaching him what he should know in dreams.
50.2729 - That one found by jeebies, and it turned out to be "he".
50.2730 -
50.2731 -
50.2732 - ---------------
50.2733 -
50.2734 - 07/01/01 Added -O option.
50.2735 - Improved singlequotes by reporting mismatched single quotes
50.2736 - only if an open_single_quotes was found.
50.2737 -
50.2738 - Released as .16
50.2739 -
50.2740 - ---------------
50.2741 -
50.2742 - 08/27/01 Added -Y switch for Robert Rowe to allow his app to
50.2743 - catch the error output.
50.2744 -
50.2745 - Released as .17
50.2746 -
50.2747 - ---------------
50.2748 -
50.2749 - 09/08/01 Added checking Capitals at start of paragraph, but not
50.2750 - checking them at start of sentence.
50.2751 -
50.2752 - TODO: Parse sentences out so can check reliably for start of
50.2753 - sentence. Need a whole different approach for that.
50.2754 - (Can't just rely on periods, since they are also
50.2755 - used for abbreviations, etc.)
50.2756 -
50.2757 - Added checking for all vowels or all consonants in a word.
50.2758 -
50.2759 - While I was in, I added "ii" checking and "tl" at start of word.
50.2760 -
50.2761 - Added echoing of first line of paragraph when reporting
50.2762 - mismatched quoted or brackets (thanks to David Widger for the
50.2763 - suggestion)
50.2764 -
50.2765 - Not querying L at start of a number (used for British pounds).
50.2766 -
50.2767 - The spelling changes are sort of half-done but released anyway
50.2768 - Skipped .18 because I had given out a couple of test versions
50.2769 - with that number.
50.2770 -
50.2771 - 09/25/01 Released as .19
50.2772 -
50.2773 - ---------------
50.2774 -
50.2775 - TODO:
50.2776 - Use the logic from my new version of safewrap to stop querying
50.2777 - short lines like poems and TOCs.
50.2778 - Ignore non-standard ellipses like . . . or ...
50.2779 -
50.2780 -
50.2781 - ---------------
50.2782 - 10/01/01 Made any line over 80 a VERY long line (was 85).
50.2783 - Recognized openquotes on indented paragraphs as continuations
50.2784 - of the same speech.
50.2785 - Added "cf" to the okword list (how did I forget _that_?) and a few others.
50.2786 - Moved abbrev to okword and made it more general.
50.2787 - Removed requirement that PG_space_emdash be greater than
50.2788 - ten before turning off warnings about spaced dashes.
50.2789 - Added period to list of characters that might constitute a separator line.
50.2790 - Now checking for double punctuation (Thanks, David!)
50.2791 - Now if two spaced em-dashes on a line, reports both. (DW)
50.2792 - Bug: Wasn't catching spaced punctuation at line-end since I
50.2793 - added flgets in version .13 - fixed.
50.2794 - Bug: Wasn't catching spaced singlequotes - fixed
50.2795 - Now reads punctuated numbers like 1,000 as a single word.
50.2796 - (Used to give "standalone 1" type queries)
50.2797 - Changed paranoid mode - not including s and p options. -ex is now quite usable.
50.2798 - Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed
50.2799 - Bug: Sometimes gave _next_ line number for queried word at end of line - fixed
50.2800 -
50.2801 - 10/22/01 Released as .20
50.2802 -
50.2803 - ---------------
50.2804 -
50.2805 - Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!)
50.2806 - Reduced the number of hi-bit letters needed to stop reporting them
50.2807 - from 1/20 to 1/100 or 200 in total.
50.2808 - Added PG footer check.
50.2809 - Added the -h switch.
50.2810 - Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10
50.2811 - Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23"
50.2812 - Added unspaced brackets check when surrounded by alpha.
50.2813 - Removed all typo reporting unless the typo switch is on.
50.2814 - Added gcisalpha to ease over-reporting of 8-bit queries.
50.2815 - ECHO_SWITCH is now ON by default!
50.2816 - PARANOID_SWITCH is now ON by default!
50.2817 - Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg)
50.2818 - Checking for standalone lowercase "l"
50.2819 - Checking for standalone lowercase "s"
50.2820 - Considering "is be" and "be is" "be was" "was be" as he/be errors
50.2821 - Looking at punct at end of para
50.2822 -
50.2823 - 01/20/02 Released as .21
50.2824 -
50.2825 - ---------------
50.2826 -
50.2827 - Added VERBOSE_SWITCH to make it list everything. (George Davis)
50.2828 -
50.2829 - ---------------
50.2830 -
50.2831 - 02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have.
50.2832 - after which
50.2833 - This line caused a coredump on Solaris - fixed.
50.2834 - Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe
50.2835 - 03/09/02 Changed header recognition for another header change
50.2836 - Called it .24
50.2837 - 03/29/02 Added qword[][] so I can suppress massive overreporting
50.2838 - of queried "words" like "FN", "Wm.", "th'", people's
50.2839 - initials, chemical formulae and suchlike in some texts.
50.2840 - Called it .25
50.2841 - 04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed.
50.2842 - Added linecounts in overview mode.
50.2843 - Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done.
50.2844 - "m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that?
50.2845 - 07/07/02 Added GPL.
50.2846 - Added checking for broken em-dash at line-end (enddash)
50.2847 - Released as 0.95
50.2848 - 08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo.
50.2849 - Released as 0.96
50.2850 - 10/10/02 Suppressing some annoying multiple reports by default:
50.2851 - Standalone Ones, Asterisks, Square Brackets.
50.2852 - Digit 1 occurs often in many scientific texts.
50.2853 - Asterisk occurs often in multi-footnoted texts.
50.2854 - Mismatch Square Brackets occurs often in multi-para footnotes.
50.2855 - Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil.
50.2856 - . . . but it does more or less work for the main cases.
50.2857 - Removed uppercase within a word as a separate category so
50.2858 - that names like VanAllen get reported only once, like other
50.2859 - suspected typos.
50.2860 - 11/24/02 Fixed - -m switch wasn't looking at htmlnum in
50.2861 - loseentities (Thanks, Brett!)
50.2862 - Fixed bug which occasionally gave false warning of
50.2863 - paragraph starting with lowercase.
50.2864 - Added underscore as character not to query around doublequotes.
50.2865 - Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859"
50.2866 - . . . this is to help detect things like CP1252 characters.
50.2867 - Released as 0.97
50.2868 -
50.2869 - 12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell,
50.2870 - for doublequotes only. Replaces "Spaced quote", since it also covers that
50.2871 - case.
50.2872 - Added "warn_hyphen" to ease over-reporting of hyphens.
50.2873 -
50.2874 - 12/20/02 Added "extra period" checks.
50.2875 - Added single character line check
50.2876 - Added I" check - is usually an exclam
50.2877 - Released as 0.98
50.2878 -
50.2879 - 1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile()
50.2880 - from when I was looking at ways to identify markup. Refuses to
50.2881 - open files for *nix users with upcase in the filemanes. Removed.
50.2882 - Fixed quickly and released as 0.981
50.2883 -
50.2884 - 1/8/03 Added "arid" to the list of typos, slightly against my better
50.2885 - judgement, but the DP gang are all excited about it. :-)
50.2886 - Added a check for comma followed by capital letter, where
50.2887 - a period has OCRed into a comma. (DW). Not sure about this
50.2888 - either; we'll see.
50.2889 - Compiling for Win32 to allow longfilenames.
50.2890 -
50.2891 - 6/1/04 A messy test release for DW to include the "gutcheck.typ"
50.2892 - process. And the gutcheck.jee trials. Removed "arid" --
50.2893 - it can go in gutcheck.typ
50.2894 -
50.2895 - Added checks for carats ^ and slants / but disabling slant
50.2896 - queries if more than 20 of them, because some people use them
50.2897 - for /italics/. Slants are commonly mistaken italic "I"s.
50.2898 -
50.2899 - Later: removed gutcheck.jee -- wrote jeebies instead.
50.2900 -
50.2901 -Random TODO:
50.2902 - Check brackets more closely, like quotes, so that it becomes
50.2903 - easy to find the error in long paragraphs full of brackets.
50.2904 -
50.2905 -
50.2906 - 11/4/04 Assorted cleanup. Fixed case where text started with an
50.2907 - unbalanced paragraph.
50.2908 -
50.2909 - 1/2/05 Has it really been that long? Added "nocomma", "noperiod" check.
50.2910 - Bits and pieces: improved isroman(). Added isletter().
50.2911 - Other stuff I never noted before this.
50.2912 -
50.2913 - 7/3/05 Stuck in a quick start on DP-markup ignoring
50.2914 - at BillFlis's suggestion.
50.2915 -
50.2916 - 1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in?
50.2917 - Don't count footer for dotcomma etc.
50.2918 -
50.2919 -
50.2920 -1 I
50.2921 -ail all
50.2922 -arc are
50.2923 -arid and
50.2924 -bad had
50.2925 -ball hall
50.2926 -band hand
50.2927 -bar her
50.2928 -bat but
50.2929 -be he
50.2930 -bead head
50.2931 -beads heads
50.2932 -bear hear
50.2933 -bit hit
50.2934 -bo be
50.2935 -boon been
50.2936 -borne home
50.2937 -bow how
50.2938 -bumbled humbled
50.2939 -car ear
50.2940 -carnage carriage
50.2941 -carne came
50.2942 -cast east
50.2943 -cat cut
50.2944 -cat eat
50.2945 -cheek check
50.2946 -clay day
50.2947 -coining coming
50.2948 -comer corner
50.2949 -die she
50.2950 -docs does
50.2951 -ease case
50.2952 -fail fall
50.2953 -fee he
50.2954 -haying having
50.2955 -ho he
50.2956 -ho who
50.2957 -hut but
50.2958 -is as
50.2959 -lie he
50.2960 -lime time
50.2961 -loth 10th
50.2962 -m in
50.2963 -modem modern
50.2964 -Ms his
50.2965 -ray away
50.2966 -ray my
50.2967 -ringer finger
50.2968 -ringers fingers
50.2969 -rioted noted
50.2970 -tho the
50.2971 -tie he
50.2972 -tie the
50.2973 -tier her
50.2974 -tight right
50.2975 -tile the
50.2976 -tiling thing
50.2977 -tip up
50.2978 -tram train
50.2979 -tune time
50.2980 -u "
50.2981 -wen well
50.2982 -yon you
50.2983 -
50.2984 -*********************************************************************/
50.2985 -
51.1 --- a/gutcheck/gutcheck.typ.in Fri Jan 27 00:28:11 2012 +0000
51.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
51.3 @@ -1,54 +0,0 @@
51.4 -11
51.5 -44
51.6 -ms
51.7 -ail
51.8 -alien
51.9 -arc
51.10 -arid
51.11 -bar
51.12 -bat
51.13 -bo
51.14 -borne
51.15 -bow
51.16 -bum
51.17 -bumbled
51.18 -carnage
51.19 -carne
51.20 -cither
51.21 -coining
51.22 -comer
51.23 -cur
51.24 -docs
51.25 -eve
51.26 -eves
51.27 -gaming
51.28 -gram
51.29 -guru
51.30 -hag
51.31 -hare
51.32 -haying
51.33 -ho
51.34 -lime
51.35 -loth
51.36 -m
51.37 -modem
51.38 -nave
51.39 -ringer
51.40 -ringers
51.41 -riot
51.42 -rioted
51.43 -signer
51.44 -snore
51.45 -spam
51.46 -tho
51.47 -tier
51.48 -tile
51.49 -tiling
51.50 -tram
51.51 -tum
51.52 -tune
51.53 -u
51.54 -vas
51.55 -wag
51.56 -wen
51.57 -yon
52.1 --- a/test/compatibility/Makefile.am Fri Jan 27 00:28:11 2012 +0000
52.2 +++ b/test/compatibility/Makefile.am Fri Jan 27 10:30:16 2012 +0000
52.3 @@ -1,4 +1,4 @@
52.4 -TESTS_ENVIRONMENT=GUTCHECK=../../gutcheck/gutcheck ../harness/gc-test
52.5 +TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test
52.6 TESTS=missing-space.tst spaced-punctuation.tst html-tag.tst html-symbol.tst \
52.7 spaced-doublequote.tst mismatched-quotes.tst he-be.tst digits.tst \
52.8 extra-period.tst ellipsis.tst short-line.tst abbreviation.tst \
53.1 --- a/test/harness/Makefile.am Fri Jan 27 00:28:11 2012 +0000
53.2 +++ b/test/harness/Makefile.am Fri Jan 27 10:30:16 2012 +0000
53.3 @@ -1,8 +1,8 @@
53.4 INCLUDES=-I$(top_srcdir)
53.5 -bin_PROGRAMS=gc-test
53.6 +bin_PROGRAMS=loupe-test
53.7 AM_CFLAGS=$(GLIB_CFLAGS)
53.8 LIBS=$(GLIB_LIBS)
53.9
53.10 -gc_test_SOURCES=gc-test.c testcase.c testcase.h testcaseio.c testcaseio.h \
53.11 - testcaseparser.c testcaseparser.h
53.12 -gc_test_LDADD=../../gclib/libgc.la
53.13 +loupe_test_SOURCES=loupe-test.c testcase.c testcase.h testcaseio.c \
53.14 + testcaseio.h testcaseparser.c testcaseparser.h
53.15 +loupe_test_LDADD=../../bl/libbl.la
54.1 --- a/test/harness/gc-test.c Fri Jan 27 00:28:11 2012 +0000
54.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
54.3 @@ -1,31 +0,0 @@
54.4 -#include <stdlib.h>
54.5 -#include <stdio.h>
54.6 -#include <string.h>
54.7 -#include <gclib/gclib.h>
54.8 -#include "testcase.h"
54.9 -#include "testcaseio.h"
54.10 -
54.11 -/*
54.12 - * Returns FALSE if the test should be considered to have failed.
54.13 - * (returns TRUE on pass or expected-fail).
54.14 - */
54.15 -boolean run_test(const char *filename)
54.16 -{
54.17 - Testcase *testcase;
54.18 - boolean retval;
54.19 - testcase=testcase_parse_file(filename);
54.20 - if (!testcase)
54.21 - return FALSE;
54.22 - retval=testcase_run(testcase);
54.23 - testcase_free(testcase);
54.24 - return retval;
54.25 -}
54.26 -
54.27 -int main(int argc,char **argv)
54.28 -{
54.29 - int i;
54.30 - boolean pass=TRUE;
54.31 - for(i=1;i<argc;i++)
54.32 - pass&=run_test(argv[i]);
54.33 - return pass?0:1;
54.34 -}
55.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
55.2 +++ b/test/harness/loupe-test.c Fri Jan 27 10:30:16 2012 +0000
55.3 @@ -0,0 +1,31 @@
55.4 +#include <stdlib.h>
55.5 +#include <stdio.h>
55.6 +#include <string.h>
55.7 +#include <bl/bl.h>
55.8 +#include "testcase.h"
55.9 +#include "testcaseio.h"
55.10 +
55.11 +/*
55.12 + * Returns FALSE if the test should be considered to have failed.
55.13 + * (returns TRUE on pass or expected-fail).
55.14 + */
55.15 +boolean run_test(const char *filename)
55.16 +{
55.17 + Testcase *testcase;
55.18 + boolean retval;
55.19 + testcase=testcase_parse_file(filename);
55.20 + if (!testcase)
55.21 + return FALSE;
55.22 + retval=testcase_run(testcase);
55.23 + testcase_free(testcase);
55.24 + return retval;
55.25 +}
55.26 +
55.27 +int main(int argc,char **argv)
55.28 +{
55.29 + int i;
55.30 + boolean pass=TRUE;
55.31 + for(i=1;i<argc;i++)
55.32 + pass&=run_test(argv[i]);
55.33 + return pass?0:1;
55.34 +}
56.1 --- a/test/harness/testcase.c Fri Jan 27 00:28:11 2012 +0000
56.2 +++ b/test/harness/testcase.c Fri Jan 27 10:30:16 2012 +0000
56.3 @@ -7,7 +7,7 @@
56.4 #include <io.h>
56.5 #endif
56.6 #include <fcntl.h>
56.7 -#include <gclib/gclib.h>
56.8 +#include <bl/bl.h>
56.9 #include "testcase.h"
56.10
56.11 #if !HAVE_MKSTEMP
56.12 @@ -124,9 +124,9 @@
56.13 return FALSE;
56.14 }
56.15 close(fd);
56.16 - command[0]=getenv("GUTCHECK");
56.17 + command[0]=getenv("BOOKLOUPE");
56.18 if (!command[0])
56.19 - command[0]="." GC_DIR_SEPARATOR_S "gutcheck";
56.20 + command[0]="." BL_DIR_SEPARATOR_S "bookloupe";
56.21 command[1]=input;
56.22 command[2]=NULL;
56.23 if (testcase->expected)
56.24 @@ -157,7 +157,7 @@
56.25 fprintf(stderr,"%s: FAIL\n",testcase->basename);
56.26 offset=common_prefix_length(output,expected->str);
56.27 if (offset==header_len && !output[offset])
56.28 - fprintf(stderr,"Unexpected zero warnings from gutcheck.\n");
56.29 + fprintf(stderr,"Unexpected zero warnings from bookloupe.\n");
56.30 else
56.31 {
56.32 endp=strchr(output+offset,'\n');
56.33 @@ -171,7 +171,7 @@
56.34 else
56.35 bol=report->str;
56.36 col=offset-(bol-report->str);
56.37 - fprintf(stderr,"Unexpected output from gutcheck:\n");
56.38 + fprintf(stderr,"Unexpected output from bookloupe:\n");
56.39 if (report->len>=header_len)
56.40 fprintf(stderr,"%s\n%*s^\n",report->str+header_len,col,"");
56.41 else
56.42 @@ -185,7 +185,7 @@
56.43 string_free(expected,TRUE);
56.44 mem_free(output);
56.45 if (exit_status)
56.46 - fprintf(stderr,"gutcheck exited with code %d\n",r);
56.47 + fprintf(stderr,"bookloupe exited with code %d\n",r);
56.48 if (!exit_status)
56.49 fprintf(stderr,"%s: PASS\n",testcase->basename);
56.50 return !exit_status;
57.1 --- a/test/harness/testcaseio.c Fri Jan 27 00:28:11 2012 +0000
57.2 +++ b/test/harness/testcaseio.c Fri Jan 27 10:30:16 2012 +0000
57.3 @@ -1,7 +1,7 @@
57.4 #include <stdlib.h>
57.5 #include <stdio.h>
57.6 #include <string.h>
57.7 -#include <gclib/gclib.h>
57.8 +#include <bl/bl.h>
57.9 #include "testcaseparser.h"
57.10 #include "testcaseio.h"
57.11
58.1 --- a/test/harness/testcaseparser.c Fri Jan 27 00:28:11 2012 +0000
58.2 +++ b/test/harness/testcaseparser.c Fri Jan 27 10:30:16 2012 +0000
58.3 @@ -2,7 +2,7 @@
58.4 #include <stdio.h>
58.5 #include <string.h>
58.6 #include <ctype.h>
58.7 -#include <gclib/gclib.h>
58.8 +#include <bl/bl.h>
58.9 #include "testcaseparser.h"
58.10
58.11 /*
59.1 --- a/test/harness/testcaseparser.h Fri Jan 27 00:28:11 2012 +0000
59.2 +++ b/test/harness/testcaseparser.h Fri Jan 27 10:30:16 2012 +0000
59.3 @@ -1,7 +1,7 @@
59.4 #ifndef TESTCASE_PARSER_H
59.5 #define TESTCASE_PARSER_H
59.6
59.7 -#include <gclib/gclib.h>
59.8 +#include <bl/bl.h>
59.9
59.10 typedef struct {
59.11 char *filename;