bookloupe: bookloupe/bookloupe.c@f44c530f80da

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "bookloupe.h"

    31 #include "counters.h"

    32 #include "pending.h"

    33 #include "HTMLentities.h"

    35 gchar *prevline;

    37 /* Common typos. */

    38 char *typo[] = {

    39     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    40     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    41     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    42     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    43     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    44     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    45     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    46     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    47     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    48     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    49     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    50     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    51     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    52     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    53     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    54     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    55     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    56     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    57     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    58     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    59     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    60     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    61     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    62     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    63     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    64     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    65     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    66     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    67     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    68     "se", ""

    69 };

    71 GTree *usertypo;

    73 /* Common abbreviations and other OK words not to query as typos. */

    74 char *okword[] = {

    75     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    76     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    77     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    78     "outbid", "outbids", "frostbite", "frostbitten", ""

    79 };

    81 /* Common abbreviations that cause otherwise unexplained periods. */

    82 char *abbrev[] = {

    83     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    84     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    85 };

    87 /*

    88  * Two-Letter combinations that rarely if ever start words,

    89  * but are common scannos or otherwise common letter combinations.

    90  */

    91 char *nostart[] = {

    92     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    93 };

    95 /*

    96  * Two-Letter combinations that rarely if ever end words,

    97  * but are common scannos or otherwise common letter combinations.

    98  */

    99 char *noend[] = {

   100     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   101     "sw", "gr", "sl", "cl", "iy", ""

   102 };

   104 char *markup[] = {

   105     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   106     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   107     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   108     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   109 };

   111 char *DPmarkup[] = {

   112     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   113 };

   115 char *nocomma[] = {

   116     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   117     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   118     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   119     "during", "let", "toward", "among", ""

   120 };

   122 char *noperiod[] = {

   123     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   124     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   125     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   126     "among", "those", "into", "whom", "having", "thence", ""

   127 };

   129 gboolean pswit[SWITNO];  /* program switches */

   131 static GOptionEntry options[]={

   132     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   133       "Ignore DP-specific markup", NULL },

   134     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   135       "Don't echo queried line", NULL },

   136     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   137       "Check single quotes", NULL },

   138     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   139       "Check common typos", NULL },

   140     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   141       "Require closure of quotes on every paragraph", NULL },

   142     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   143       "Disable paranoid querying of everything", NULL },

   144     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   145       "Disable line end checking", NULL },

   146     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   147       "Overview: just show counts", NULL },

   148     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   149       "Output errors to stdout instead of stderr", NULL },

   150     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   151       "Echo header fields", NULL },

   152     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   153       "Ignore markup in < >", NULL },

   154     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   155       "Use file of user-defined typos", NULL },

   156     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   157       "Defaults for use on www upload", NULL },

   158     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   159       "Verbose - list everything", NULL },

   160     { NULL }

   161 };

   163 long cnt_quote;		/* for overview mode, count of quote queries */

   164 long cnt_brack;		/* for overview mode, count of brackets queries */

   165 long cnt_bin;		/* for overview mode, count of non-ASCII queries */

   166 long cnt_odd;		/* for overview mode, count of odd character queries */

   167 long cnt_long;		/* for overview mode, count of long line errors */

   168 long cnt_short;		/* for overview mode, count of short line queries */

   169 long cnt_punct;		/* for overview mode,

   170 			   count of punctuation and spacing queries */

   171 long cnt_dash;		/* for overview mode, count of dash-related queries */

   172 long cnt_word;		/* for overview mode, count of word queries */

   173 long cnt_html;		/* for overview mode, count of html queries */

   174 long cnt_lineend;	/* for overview mode, count of line-end queries */

   175 long cnt_spacend;	/* count of lines with space at end */

   176 long linecnt;		/* count of total lines in the file */

   177 long checked_linecnt;	/* count of lines actually checked */

   179 void proghelp(GOptionContext *context);

   180 void procfile(const char *);

   182 gchar *running_from;

   184 gboolean mixdigit(const char *);

   185 gchar *getaword(const char **);

   186 char *flgets(char **,long,int);

   187 void postprocess_for_HTML(char *);

   188 char *linehasmarkup(char *);

   189 char *losemarkup(char *);

   190 gboolean tagcomp(const char *,const char *);

   191 void loseentities(char *);

   192 gboolean isroman(const char *);

   193 void postprocess_for_DP(char *);

   194 void print_as_windows_1252(const char *string);

   195 void print_as_utf_8(const char *string);

   197 GTree *qword,*qperiod;

   199 #ifdef __WIN32__

   200 UINT saved_cp;

   201 #endif

   203 void parse_options(int *argc,char ***argv)

   204 {

   205     GError *err=NULL;

   206     GOptionContext *context;

   207     context=g_option_context_new(

   208       "file - looks for errors in Project Gutenberg(TM) etexts");

   209     g_option_context_add_main_entries(context,options,NULL);

   210     if (!g_option_context_parse(context,argc,argv,&err))

   211     {

   212 	g_printerr("Bookloupe: %s\n",err->message);

   213 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   214 	exit(1);

   215     }

   216     /* Paranoid checking is turned OFF, not on, by its switch */

   217     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   218     if (pswit[PARANOID_SWITCH])

   219 	/* if running in paranoid mode, typo checks default to enabled */

   220 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   221     /* Line-end checking is turned OFF, not on, by its switch */

   222     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];

   223     /* Echoing is turned OFF, not on, by its switch */

   224     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];

   225     if (pswit[OVERVIEW_SWITCH])

   226 	/* just print summary; don't echo */

   227 	pswit[ECHO_SWITCH]=FALSE;

   228     /*

   229      * Web uploads - for the moment, this is really just a placeholder

   230      * until we decide what processing we really want to do on web uploads

   231      */

   232     if (pswit[WEB_SWITCH])

   233     {

   234 	/* specific override for web uploads */

   235 	pswit[ECHO_SWITCH]=TRUE;

   236 	pswit[SQUOTE_SWITCH]=FALSE;

   237 	pswit[TYPO_SWITCH]=TRUE;

   238 	pswit[QPARA_SWITCH]=FALSE;

   239 	pswit[PARANOID_SWITCH]=TRUE;

   240 	pswit[LINE_END_SWITCH]=FALSE;

   241 	pswit[OVERVIEW_SWITCH]=FALSE;

   242 	pswit[STDOUT_SWITCH]=FALSE;

   243 	pswit[HEADER_SWITCH]=TRUE;

   244 	pswit[VERBOSE_SWITCH]=FALSE;

   245 	pswit[MARKUP_SWITCH]=FALSE;

   246 	pswit[USERTYPO_SWITCH]=FALSE;

   247 	pswit[DP_SWITCH]=FALSE;

   248     }

   249     if (*argc<2)

   250     {

   251 	proghelp(context);

   252 	exit(1);

   253     }

   254     g_option_context_free(context);

   255 }

   257 /*

   258  * read_user_scannos:

   259  *

   260  * Read in the user-defined stealth scanno list.

   261  */

   262 void read_user_scannos(void)

   263 {

   264     GError *err=NULL;

   265     gchar *usertypo_file;

   266     gboolean okay;

   267     int i;

   268     gsize len,nb;

   269     gchar *contents,*utf8,**lines;

   270     usertypo_file=g_strdup("bookloupe.typ");

   271     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   272     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   273     {

   274 	g_clear_error(&err);

   275 	g_free(usertypo_file);

   276 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   277 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   278     }

   279     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   280     {

   281 	g_clear_error(&err);

   282 	g_free(usertypo_file);

   283 	usertypo_file=g_strdup("gutcheck.typ");

   284 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   285     }

   286     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   287     {

   288 	g_clear_error(&err);

   289 	g_free(usertypo_file);

   290 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   291 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   292     }

   293     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   294     {

   295 	g_free(usertypo_file);

   296 	g_print("   --> I couldn't find bookloupe.typ "

   297 	  "-- proceeding without user typos.\n");

   298 	return;

   299     }

   300     else if (!okay)

   301     {

   302 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   303 	g_free(usertypo_file);

   304 	g_clear_error(&err);

   305 	exit(1);

   306     }

   307     if (g_utf8_validate(contents,len,NULL))

   308 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   309     else

   310 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   311     g_free(contents);

   312     lines=g_strsplit_set(utf8,"\r\n",0);

   313     g_free(utf8);

   314     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   315     for (i=0;lines[i];i++)

   316 	if (*(unsigned char *)lines[i]>'!')

   317 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   318 	else

   319 	    g_free(lines[i]);

   320     g_free(lines);

   321 }

   323 /*

   324  * read_etext:

   325  *

   326  * Read an etext returning a newly allocated string containing the file

   327  * contents or NULL on error.

   328  */

   329 gchar *read_etext(const char *filename,GError **err)

   330 {

   331     GError *tmp_err=NULL;

   332     gchar *contents,*utf8;

   333     gsize len,bytes_read,bytes_written;

   334     int i,line,col;

   335     if (!g_file_get_contents(filename,&contents,&len,err))

   336 	return NULL;

   337     if (g_utf8_validate(contents,len,NULL))

   338     {

   339 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   340 	g_set_print_handler(print_as_utf_8);

   341 #ifdef __WIN32__

   342 	SetConsoleOutputCP(CP_UTF8);

   343 #endif

   344     }

   345     else

   346     {

   347 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   348 	  &bytes_written,&tmp_err);

   349 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   350 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   351 	{

   352 	    line=col=1;

   353 	    for(i=0;i<bytes_read;i++)

   354 		if (contents[i]=='\n')

   355 		{

   356 		    line++;

   357 		    col=1;

   358 		}

   359 		else if (contents[i]!='\r')

   360 		    col++;

   361 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   362 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   363 	      "valid Windows-1252 character",

   364 	      ((unsigned char *)contents)[bytes_read],line,col);

   365 	}

   366 	else if (tmp_err)

   367 	    g_propagate_error(err,tmp_err);

   368 	g_set_print_handler(print_as_windows_1252);

   369 #ifdef __WIN32__

   370 	SetConsoleOutputCP(1252);

   371 #endif

   372     }

   373     g_free(contents);

   374     return utf8;

   375 }

   377 void cleanup_on_exit(void)

   378 {

   379 #ifdef __WIN32__

   380     SetConsoleOutputCP(saved_cp);

   381 #endif

   382 }

   384 int main(int argc,char **argv)

   385 {

   386 #ifdef __WIN32__

   387     atexit(cleanup_on_exit);

   388     saved_cp=GetConsoleOutputCP();

   389 #endif

   390     running_from=g_path_get_dirname(argv[0]);

   391     parse_options(&argc,&argv);

   392     if (pswit[USERTYPO_SWITCH])

   393 	read_user_scannos();

   394     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   395     procfile(argv[1]);

   396     if (pswit[OVERVIEW_SWITCH])

   397     {

   398 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   399 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   400 	g_print("    --------------- Queries found --------------\n");

   401 	if (cnt_long)

   402 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   403 	if (cnt_short)

   404 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   405 	if (cnt_lineend)

   406 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   407 	if (cnt_word)

   408 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   409 	if (cnt_quote)

   410 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);

   411 	if (cnt_brack)

   412 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   413 	if (cnt_bin)

   414 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   415 	if (cnt_odd)

   416 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   417 	if (cnt_punct)

   418 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   419 	if (cnt_dash)

   420 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   421 	if (cnt_html)

   422 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   423 	g_print("\n");

   424 	g_print("    TOTAL QUERIES		  %14ld\n",

   425 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+

   426 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);

   427     }

   428     g_free(running_from);

   429     if (usertypo)

   430 	g_tree_unref(usertypo);

   431     return 0;

   432 }

   434 void count_dashes(const char *line,const char *dash,

   435   struct dash_results *results)

   436 {

   437     int i;

   438     gchar **tokens;

   439     gunichar pc,nc;

   440     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;

   441     if (!*line)

   442 	return;

   443     tokens=g_strsplit(line,dash,0);

   444     if (tokens[1])

   445 	results->base++;

   446     for(i=1;tokens[i];i++)

   447     {

   448 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));

   449 	nc=g_utf8_get_char(tokens[i]);

   450 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))

   451 	    spaced=TRUE;

   452 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))

   453 	    spaced2=TRUE;

   454 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))

   455 	    unspaced=TRUE;

   456     }

   457     if (spaced)

   458 	results->space++;

   459     if (spaced2)

   460 	/* count of lines with em-dashes with spaces both sides */

   461 	results->non_PG_space++;

   462     if (unspaced)

   463 	/* count of lines with PG-type em-dashes with no spaces */

   464 	results->PG_space++;

   465     g_strfreev(tokens);

   466 }

   468 /*

   469  * first_pass:

   470  *

   471  * Run a first pass - verify that it's a valid PG

   472  * file, decide whether to report some things that

   473  * occur many times in the text like long or short

   474  * lines, non-standard dashes, etc.

   475  */

   476 struct first_pass_results *first_pass(const char *etext)

   477 {

   478     gunichar laststart=CHAR_SPACE;

   479     const char *s;

   480     gchar *lc_line;

   481     int i,j,lbytes,llen;

   482     gchar **lines;

   483     unsigned int lastlen=0,lastblen=0;

   484     long spline=0,nspline=0;

   485     static struct first_pass_results results={0};

   486     struct dash_results tmp_dash_results;

   487     gchar *inword;

   488     QuoteClass qc;

   489     lines=g_strsplit(etext,"\n",0);

   490     if (!lines[0])

   491     {

   492 	/* An empty etext has no terminators */

   493 	results.newlines=DOS_NEWLINES;

   494     }

   495     else if (!lines[1])

   496     {

   497 	/*

   498 	 * If there are no LFs, we don't have UNIX-style

   499 	 * terminators, but we might have OS9-style ones.

   500 	 */

   501 	results.newlines=OS9_NEWLINES;

   502 	g_strfreev(lines);

   503 	lines=g_strsplit(etext,"\r",0);

   504 	if (!lines[0] || !lines[1])

   505 	    /* Looks like we don't have any terminators at all */

   506 	    results.newlines=DOS_NEWLINES;

   507     }

   508     else

   509     {

   510 	/* We might have UNIX-style terminators */

   511 	results.newlines=UNIX_NEWLINES;

   512     }

   513     for (j=0;lines[j];j++)

   514     {

   515 	lbytes=strlen(lines[j]);

   516 	if (lbytes>0 && lines[j][lbytes-1]=='\r')

   517 	{

   518 	    results.newlines=DOS_NEWLINES;

   519 	    do

   520 	    {

   521 		lines[j][--lbytes]='\0';

   522 	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');

   523 	}

   524 	llen=g_utf8_strlen(lines[j],lbytes);

   525 	linecnt++;

   526 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&

   527 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))

   528 	{

   529 	    if (spline)

   530 		g_print("   --> Duplicate header?\n");

   531 	    spline=linecnt+1;   /* first line of non-header text, that is */

   532 	}

   533 	if (!strncmp(lines[j],"*** START",9) &&

   534 	  strstr(lines[j],"PROJECT GUTENBERG"))

   535 	{

   536 	    if (nspline)

   537 		g_print("   --> Duplicate header?\n");

   538 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   539 	}

   540 	if (spline || nspline)

   541 	{

   542 	    lc_line=g_utf8_strdown(lines[j],lbytes);

   543 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))

   544 	    {

   545 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))

   546 		{

   547 		    if (results.footerline)

   548 		    {

   549 			/* it's an old-form header - we can detect duplicates */

   550 			if (!nspline)

   551 			    g_print("   --> Duplicate footer?\n");

   552 		    }

   553 		    else

   554 			results.footerline=linecnt;

   555 		}

   556 	    }

   557 	    g_free(lc_line);

   558 	}

   559 	if (spline)

   560 	    results.firstline=spline;

   561 	if (nspline)

   562 	    results.firstline=nspline;  /* override with new */

   563 	if (results.footerline)

   564 	    continue;    /* don't count the boilerplate in the footer */

   565 	results.totlen+=llen;

   566 	for (s=lines[j];*s;s=g_utf8_next_char(s))

   567 	{

   568 	    if (g_utf8_get_char(s)>127)

   569 		results.binlen++;

   570 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

   571 		results.alphalen++;

   572 	    if (s>lines[j])

   573 	    {

   574 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))

   575 		    qc=QUOTE_CLASS(g_utf8_get_char(s));

   576 		else

   577 		    qc=INVALID_QUOTE;

   578 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&

   579 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))

   580 		    results.endquote_count++;

   581 	    }

   582 	}

   583 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&

   584 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   585 	    results.shortline++;

   586 	if (lbytes>0 &&

   587 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)

   588 	    cnt_spacend++;

   589 	if (strstr(lines[j],".,"))

   590 	    results.dotcomma++;

   591 	/* only count ast lines for ignoring purposes where there is */

   592 	/* locase text on the line */

   593 	if (strchr(lines[j],'*'))

   594 	{

   595 	    for (s=lines[j];*s;s=g_utf8_next_char(s))

   596 		if (g_unichar_islower(g_utf8_get_char(s)))

   597 		    break;

   598 	    if (*s)

   599 		results.astline++;

   600 	}

   601 	if (strchr(lines[j],'/'))

   602 	    results.fslashline++;

   603 	if (lbytes>0)

   604 	{

   605 	    for (s=g_utf8_prev_char(lines[j]+lbytes);

   606 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;

   607 	      s=g_utf8_prev_char(s))

   608 		;

   609 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&

   610 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

   611 		results.hyphens++;

   612 	}

   613 	if (llen>LONGEST_PG_LINE)

   614 	    results.longline++;

   615 	if (llen>WAY_TOO_LONG)

   616 	    results.verylongline++;

   617 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))

   618 	{

   619 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);

   620 	    if (i>0)

   621 		results.htmcount++;

   622 	    if (strstr(lines[j],"<i>"))

   623 		results.htmcount+=4; /* bonus marks! */

   624 	}

   625 	/* Check for spaced em-dashes */

   626 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));

   627 	count_dashes(lines[j],"--",&tmp_dash_results);

   628 	count_dashes(lines[j],"—",&tmp_dash_results);

   629 	if (tmp_dash_results.base)

   630 	    results.emdash.base++;

   631 	if (tmp_dash_results.non_PG_space)

   632 	    results.emdash.non_PG_space++;

   633 	if (tmp_dash_results.PG_space)

   634 	    results.emdash.PG_space++;

   635 	for (s=lines[j];*s;)

   636 	{

   637 	    inword=getaword(&s);

   638 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   639 		results.Dutchcount++;

   640 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   641 		results.Frenchcount++;

   642 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   643 		results.standalone_digit++;

   644 	    g_free(inword);

   645 	}

   646 	/* Check for spaced dashes */

   647 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')

   648 	    results.spacedash++;

   649 	lastblen=lastlen;

   650 	lastlen=llen;

   651 	laststart=lines[j][0];

   652     }

   653     g_strfreev(lines);

   654     return &results;

   655 }

   657 /*

   658  * report_first_pass:

   659  *

   660  * Make some snap decisions based on the first pass results.

   661  */

   662 struct warnings *report_first_pass(struct first_pass_results *results)

   663 {

   664     static struct warnings warnings={0};

   665     warnings.newlines=results->newlines;

   666     if (warnings.newlines==UNIX_NEWLINES)

   667 	g_print("   --> No lines in this file have a CR. Not reporting them. "

   668 	  "Project Gutenberg requires that all lineends be CR-LF.\n");

   669     else if (warnings.newlines==OS9_NEWLINES)

   670 	g_print("   --> No lines in this file have a LF. Not reporting them. "

   671 	  "Project Gutenberg requires that all lineends be CR-LF.\n");

   672     if (cnt_spacend>0)

   673 	g_print("   --> %ld lines in this file have white space at end\n",

   674 	  cnt_spacend);

   675     warnings.dotcomma=1;

   676     if (results->dotcomma>5)

   677     {

   678 	warnings.dotcomma=0;

   679 	g_print("   --> %ld lines in this file contain '.,'. "

   680 	  "Not reporting them.\n",results->dotcomma);

   681     }

   682     /*

   683      * If more than 50 lines, or one-tenth, are short,

   684      * don't bother reporting them.

   685      */

   686     warnings.shortline=1;

   687     if (results->shortline>50 || results->shortline*10>linecnt)

   688     {

   689 	warnings.shortline=0;

   690 	g_print("   --> %ld lines in this file are short. "

   691 	  "Not reporting short lines.\n",results->shortline);

   692     }

   693     /*

   694      * If more than 50 lines, or one-tenth, are long,

   695      * don't bother reporting them.

   696      */

   697     warnings.longline=1;

   698     if (results->longline>50 || results->longline*10>linecnt)

   699     {

   700 	warnings.longline=0;

   701 	g_print("   --> %ld lines in this file are long. "

   702 	  "Not reporting long lines.\n",results->longline);

   703     }

   704     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   705     warnings.ast=1;

   706     if (results->astline>10)

   707     {

   708 	warnings.ast=0;

   709 	g_print("   --> %ld lines in this file contain asterisks. "

   710 	  "Not reporting them.\n",results->astline);

   711     }

   712     /*

   713      * If more than 10 lines contain forward slashes,

   714      * don't bother reporting them.

   715      */

   716     warnings.fslash=1;

   717     if (results->fslashline>10)

   718     {

   719 	warnings.fslash=0;

   720 	g_print("   --> %ld lines in this file contain forward slashes. "

   721 	  "Not reporting them.\n",results->fslashline);

   722     }

   723     /*

   724      * If more than 20 lines contain unpunctuated endquotes,

   725      * don't bother reporting them.

   726      */

   727     warnings.endquote=1;

   728     if (results->endquote_count>20)

   729     {

   730 	warnings.endquote=0;

   731 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

   732 	  "Not reporting them.\n",results->endquote_count);

   733     }

   734     /*

   735      * If more than 10 lines contain standalone digits,

   736      * don't bother reporting them.

   737      */

   738     warnings.digit=1;

   739     if (results->standalone_digit>10)

   740     {

   741 	warnings.digit=0;

   742 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

   743 	  "Not reporting them.\n",results->standalone_digit);

   744     }

   745     /*

   746      * If more than 20 lines contain hyphens at end,

   747      * don't bother reporting them.

   748      */

   749     warnings.hyphen=1;

   750     if (results->hyphens>20)

   751     {

   752 	warnings.hyphen=0;

   753 	g_print("   --> %ld lines in this file have hyphens at end. "

   754 	  "Not reporting them.\n",results->hyphens);

   755     }

   756     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   757     {

   758 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   759 	pswit[MARKUP_SWITCH]=1;

   760     }

   761     if (results->verylongline>0)

   762 	g_print("   --> %ld lines in this file are VERY long!\n",

   763 	  results->verylongline);

   764     /*

   765      * If there are more non-PG spaced dashes than PG em-dashes,

   766      * assume it's deliberate.

   767      * Current PG guidelines say don't use them, but older texts do,

   768      * and some people insist on them whatever the guidelines say.

   769      */

   770     warnings.dash=1;

   771     if (results->spacedash+results->emdash.non_PG_space>

   772       results->emdash.PG_space)

   773     {

   774 	warnings.dash=0;

   775 	g_print("   --> There are %ld spaced dashes and em-dashes. "

   776 	  "Not reporting them.\n",

   777 	  results->spacedash+results->emdash.non_PG_space);

   778     }

   779     /* If more than a quarter of characters are hi-bit, bug out. */

   780     warnings.bin=1;

   781     if (results->binlen*4>results->totlen)

   782     {

   783 	g_print("   --> This file does not appear to be ASCII. "

   784 	  "Terminating. Best of luck with it!\n");

   785 	exit(1);

   786     }

   787     if (results->alphalen*4<results->totlen)

   788     {

   789 	g_print("   --> This file does not appear to be text. "

   790 	  "Terminating. Best of luck with it!\n");

   791 	exit(1);

   792     }

   793     if (results->binlen*100>results->totlen || results->binlen>100)

   794     {

   795 	g_print("   --> There are a lot of foreign letters here. "

   796 	  "Not reporting them.\n");

   797 	warnings.bin=0;

   798     }

   799     warnings.isDutch=FALSE;

   800     if (results->Dutchcount>50)

   801     {

   802 	warnings.isDutch=TRUE;

   803 	g_print("   --> This looks like Dutch - "

   804 	  "switching off dashes and warnings for 's Middags case.\n");

   805     }

   806     warnings.isFrench=FALSE;

   807     if (results->Frenchcount>50)

   808     {

   809 	warnings.isFrench=TRUE;

   810 	g_print("   --> This looks like French - "

   811 	  "switching off some doublepunct.\n");

   812     }

   813     if (results->firstline && results->footerline)

   814 	g_print("    The PG header and footer appear to be already on.\n");

   815     else

   816     {

   817 	if (results->firstline)

   818 	    g_print("    The PG header is on - no footer.\n");

   819 	if (results->footerline)

   820 	    g_print("    The PG footer is on - no header.\n");

   821     }

   822     g_print("\n");

   823     if (pswit[VERBOSE_SWITCH])

   824     {

   825 	warnings.bin=1;

   826 	warnings.shortline=1;

   827 	warnings.dotcomma=1;

   828 	warnings.longline=1;

   829 	warnings.dash=1;

   830 	warnings.digit=1;

   831 	warnings.ast=1;

   832 	warnings.fslash=1;

   833 	warnings.hyphen=1;

   834 	warnings.endquote=1;

   835 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

   836     }

   837     if (warnings.isDutch)

   838 	warnings.dash=0;

   839     if (results->footerline>0 && results->firstline>0 &&

   840       results->footerline>results->firstline &&

   841       results->footerline-results->firstline<100)

   842     {

   843 	g_print("   --> I don't really know where this text starts. \n");

   844 	g_print("       There are no reference points.\n");

   845 	g_print("       I'm going to have to report the header and footer "

   846 	  "as well.\n");

   847 	results->firstline=0;

   848     }

   849     return &warnings;

   850 }

   852 /*

   853  * analyse_quotes:

   854  *

   855  * Look along the line, accumulate the count of quotes, and see

   856  * if this is an empty line - i.e. a line with nothing on it

   857  * but spaces.

   858  * If line has just spaces, period, * and/or - on it, don't

   859  * count it, since empty lines with asterisks or dashes to

   860  * separate sections are common.

   861  *

   862  * Returns: TRUE if the line is empty.

   863  */

   864 gboolean analyse_quotes(const char *aline,struct counters *counters)

   865 {

   866     int guessquote=0;

   867     /* assume the line is empty until proven otherwise */

   868     gboolean isemptyline=TRUE;

   869     const char *s=aline,*sprev,*snext;

   870     gunichar c;

   871     sprev=NULL;

   872     GError *tmp_err=NULL;

   873     while (*s)

   874     {

   875 	snext=g_utf8_next_char(s);

   876 	c=g_utf8_get_char(s);

   877 	if (CHAR_IS_DQUOTE(c))

   878 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);

   879 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])

   880 	{

   881 	    if (s==aline)

   882 	    {

   883 		/*

   884 		 * At start of line, it can only be a quotation mark.

   885 		 * Hardcode a very common exception!

   886 		 */

   887 		if (!g_str_has_prefix(snext,"tis") &&

   888 		  !g_str_has_prefix(snext,"Tis"))

   889 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

   890 	    }

   891 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

   892 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   893 		/* Do nothing! it's definitely an apostrophe, not a quote */

   894 		;

   895 	    /* it's outside a word - let's check it out */

   896 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||

   897 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   898 	    {

   899 		/* certainly looks like a quotation mark */

   900 		if (!g_str_has_prefix(snext,"tis") &&

   901 		  !g_str_has_prefix(snext,"Tis"))

   902 		    /* hardcode a very common exception! */

   903 		{

   904 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))

   905 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

   906 		    else

   907 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);

   908 		}

   909 	    }

   910 	    else

   911 	    {

   912 		/* now - is it a quotation mark? */

   913 		guessquote=0;   /* accumulate clues */

   914 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

   915 		{

   916 		    /* it follows a letter - could be either */

   917 		    guessquote++;

   918 		    if (g_utf8_get_char(sprev)=='s')

   919 		    {

   920 			/* looks like a plural apostrophe */

   921 			guessquote-=3;

   922 			if (g_utf8_get_char(snext)==CHAR_SPACE)

   923 			    /* bonus marks! */

   924 			    guessquote-=2;

   925 		    }

   926 		    if (innermost_quote_matches(counters,c))

   927 			/*

   928 			 * Give it the benefit of some doubt,

   929 			 * if a squote is already open.

   930 			 */

   931 			guessquote++;

   932 		    else

   933 			guessquote--;

   934 		    if (guessquote>=0)

   935 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);

   936 		}

   937 		else

   938 		    /* no adjacent letter - it must be a quote of some kind */

   939 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

   940 	    }

   941 	}

   942 	if (tmp_err)

   943 	{

   944 	    if (pswit[ECHO_SWITCH])

   945 		g_print("\n%s\n",aline);

   946 	    if (!pswit[OVERVIEW_SWITCH])

   947 		g_print("    Line %ld column %ld - %s\n",

   948 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);

   949 	    g_clear_error(&tmp_err);

   950 	}

   951 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

   952 	  c!='\r' && c!='\n')

   953 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

   954 	if (c==CHAR_UNDERSCORE)

   955 	    counters->c_unders++;

   956 	if (c==CHAR_OPEN_SBRACK)

   957 	{

   958 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&

   959 	      !matching_difference(counters,c) && s==aline &&

   960 	      g_str_has_prefix(s,"[Illustration:"))

   961 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);

   962 	    else

   963 		increment_matching(counters,c,TRUE);

   964 	}

   965 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)

   966 	    increment_matching(counters,c,TRUE);

   967 	if (c==CHAR_CLOSE_SBRACK)

   968 	{

   969 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&

   970 	      !matching_difference(counters,c) && !*snext)

   971 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);

   972 	    else

   973 		increment_matching(counters,c,FALSE);

   974 	}

   975 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)

   976 	    increment_matching(counters,c,FALSE);

   977 	sprev=s;

   978 	s=snext;

   979     }

   980     return isemptyline;

   981 }

   983 /*

   984  * check_for_control_characters:

   985  *

   986  * Check for invalid or questionable characters in the line

   987  * Anything above 127 is invalid for plain ASCII, and

   988  * non-printable control characters should also be flagged.

   989  * Tabs should generally not be there.

   990  */

   991 void check_for_control_characters(const char *aline)

   992 {

   993     gunichar c;

   994     const char *s;

   995     for (s=aline;*s;s=g_utf8_next_char(s))

   996     {

   997 	c=g_utf8_get_char(s);

   998 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

   999 	{

  1000 	    if (pswit[ECHO_SWITCH])

  1001 		g_print("\n%s\n",aline);

  1002 	    if (!pswit[OVERVIEW_SWITCH])

  1003 		g_print("    Line %ld column %ld - Control character %u\n",

  1004 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

  1005 	    else

  1006 		cnt_bin++;

  1007 	}

  1008     }

  1009 }

  1011 /*

  1012  * check_for_odd_characters:

  1013  *

  1014  * Check for binary and other odd characters.

  1015  */

  1016 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1017   gboolean isemptyline)

  1018 {

  1019     /* Don't repeat multiple warnings on one line. */

  1020     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;

  1021     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

  1022     const char *s;

  1023     gunichar c;

  1024     for (s=aline;*s;s=g_utf8_next_char(s))

  1025     {

  1026 	c=g_utf8_get_char(s);

  1027 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

  1028 	{

  1029 	    if (pswit[ECHO_SWITCH])

  1030 		g_print("\n%s\n",aline);

  1031 	    if (!pswit[OVERVIEW_SWITCH])

  1032 		if (c>127 && c<160 || c>255)

  1033 		    g_print("    Line %ld column %ld - "

  1034 		      "Non-ISO-8859 character %u\n",

  1035 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1036 		else

  1037 		    g_print("    Line %ld column %ld - "

  1038 		      "Non-ASCII character %u\n",

  1039 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1040 	    else

  1041 		cnt_bin++;

  1042 	    eNon_A=TRUE;

  1043 	}

  1044 	if (!eTab && c==CHAR_TAB)

  1045 	{

  1046 	    if (pswit[ECHO_SWITCH])

  1047 		g_print("\n%s\n",aline);

  1048 	    if (!pswit[OVERVIEW_SWITCH])

  1049 		g_print("    Line %ld column %ld - Tab character?\n",

  1050 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1051 	    else

  1052 		cnt_odd++;

  1053 	    eTab=TRUE;

  1054 	}

  1055 	if (!eTilde && c==CHAR_TILDE)

  1056 	{

  1057 	    /*

  1058 	     * Often used by OCR software to indicate an

  1059 	     * unrecognizable character.

  1060 	     */

  1061 	    if (pswit[ECHO_SWITCH])

  1062 		g_print("\n%s\n",aline);

  1063 	    if (!pswit[OVERVIEW_SWITCH])

  1064 		g_print("    Line %ld column %ld - Tilde character?\n",

  1065 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1066 	    else

  1067 		cnt_odd++;

  1068 	    eTilde=TRUE;

  1069 	}

  1070 	if (!eCarat && c==CHAR_CARAT)

  1071 	{

  1072 	    if (pswit[ECHO_SWITCH])

  1073 		g_print("\n%s\n",aline);

  1074 	    if (!pswit[OVERVIEW_SWITCH])

  1075 		g_print("    Line %ld column %ld - Carat character?\n",

  1076 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1077 	    else

  1078 		cnt_odd++;

  1079 	    eCarat=TRUE;

  1080 	}

  1081 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

  1082 	{

  1083 	    if (pswit[ECHO_SWITCH])

  1084 		g_print("\n%s\n",aline);

  1085 	    if (!pswit[OVERVIEW_SWITCH])

  1086 		g_print("    Line %ld column %ld - Forward slash?\n",

  1087 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1088 	    else

  1089 		cnt_odd++;

  1090 	    eFSlash=TRUE;

  1091 	}

  1092 	/*

  1093 	 * Report asterisks only in paranoid mode,

  1094 	 * since they're often deliberate.

  1095 	 */

  1096 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1097 	  c==CHAR_ASTERISK)

  1098 	{

  1099 	    if (pswit[ECHO_SWITCH])

  1100 		g_print("\n%s\n",aline);

  1101 	    if (!pswit[OVERVIEW_SWITCH])

  1102 		g_print("    Line %ld column %ld - Asterisk?\n",

  1103 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1104 	    else

  1105 		cnt_odd++;

  1106 	    eAst=TRUE;

  1107 	}

  1108     }

  1109 }

  1111 /*

  1112  * check_for_long_line:

  1113  *

  1114  * Check for line too long.

  1115  */

  1116 void check_for_long_line(const char *aline)

  1117 {

  1118     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1119     {

  1120 	if (pswit[ECHO_SWITCH])

  1121 	    g_print("\n%s\n",aline);

  1122 	if (!pswit[OVERVIEW_SWITCH])

  1123 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1124 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1125 	else

  1126 	    cnt_long++;

  1127     }

  1128 }

  1130 /*

  1131  * check_for_short_line:

  1132  *

  1133  * Check for line too short.

  1134  *

  1135  * This one is a bit trickier to implement: we don't want to

  1136  * flag the last line of a paragraph for being short, so we

  1137  * have to wait until we know that our current line is a

  1138  * "normal" line, then report the _previous_ line if it was too

  1139  * short. We also don't want to report indented lines like

  1140  * chapter heads or formatted quotations. We therefore keep

  1141  * last->len as the length of the last line examined, and

  1142  * last->blen as the length of the last but one, and try to

  1143  * suppress unnecessary warnings by checking that both were of

  1144  * "normal" length. We keep the first character of the last

  1145  * line in last->start, and if it was a space, we assume that

  1146  * the formatting is deliberate. I can't figure out a way to

  1147  * distinguish something like a quoted verse left-aligned or

  1148  * the header or footer of a letter from a paragraph of short

  1149  * lines - maybe if I examined the whole paragraph, and if the

  1150  * para has less than, say, 8 lines and if all lines are short,

  1151  * then just assume it's OK? Need to look at some texts to see

  1152  * how often a formula like this would get the right result.

  1153  */

  1154 void check_for_short_line(const char *aline,const struct line_properties *last)

  1155 {

  1156     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1157       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1158       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1159     {

  1160 	if (pswit[ECHO_SWITCH])

  1161 	    g_print("\n%s\n",prevline);

  1162 	if (!pswit[OVERVIEW_SWITCH])

  1163 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1164 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1165 	else

  1166 	    cnt_short++;

  1167     }

  1168 }

  1170 /*

  1171  * check_for_starting_punctuation:

  1172  *

  1173  * Look for punctuation other than full ellipses at start of line.

  1174  */

  1175 void check_for_starting_punctuation(const char *aline)

  1176 {

  1177     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1178       !g_str_has_prefix(aline,". . ."))

  1179     {

  1180 	if (pswit[ECHO_SWITCH])

  1181 	    g_print("\n%s\n",aline);

  1182 	if (!pswit[OVERVIEW_SWITCH])

  1183 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1184 	      linecnt);

  1185 	else

  1186 	    cnt_punct++;

  1187     }

  1188 }

  1190 /*

  1191  * str_emdash:

  1192  *

  1193  * Find the first em-dash, return a pointer to it and set <next> to the

  1194  * character following the dash.

  1195  */

  1196 char *str_emdash(const char *s,const char **next)

  1197 {

  1198     const char *s1,*s2;

  1199     s1=strstr(s,"--");

  1200     s2=strstr(s,"—");

  1201     if (!s1)

  1202     {

  1203 	if (s2)

  1204 	    *next=g_utf8_next_char(s2);

  1205 	return (char *)s2;

  1206     }

  1207     else if (!s2)

  1208     {

  1209 	*next=g_utf8_next_char(g_utf8_next_char(s1));

  1210 	return (char *)s1;

  1211     }

  1212     else if (s1<s2)

  1213     {

  1214 	*next=g_utf8_next_char(g_utf8_next_char(s1));

  1215 	return (char *)s1;

  1216     }

  1217     else

  1218     {

  1219 	*next=g_utf8_next_char(s2);

  1220 	return (char *)s2;

  1221     }

  1222 }

  1224 /*

  1225  * check_for_spaced_emdash:

  1226  *

  1227  * Check for spaced em-dashes.

  1228  *

  1229  * We must check _all_ occurrences of em-dashes on the line

  1230  * hence the loop - even if the first dash is OK

  1231  * there may be another that's wrong later on.

  1232  */

  1233 void check_for_spaced_emdash(const char *aline)

  1234 {

  1235     const char *s,*t,*next;

  1236     for (s=aline;t=str_emdash(s,&next);s=next)

  1237     {

  1238 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1239 	  g_utf8_get_char(next)==CHAR_SPACE)

  1240 	{

  1241 	    if (pswit[ECHO_SWITCH])

  1242 		g_print("\n%s\n",aline);

  1243 	    if (!pswit[OVERVIEW_SWITCH])

  1244 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1245 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1246 	    else

  1247 		cnt_dash++;

  1248 	}

  1249     }

  1250 }

  1252 /*

  1253  * check_for_spaced_dash:

  1254  *

  1255  * Check for spaced dashes.

  1256  */

  1257 void check_for_spaced_dash(const char *aline)

  1258 {

  1259     const char *s;

  1260     if ((s=strstr(aline," -")))

  1261     {

  1262 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1263 	{

  1264 	    if (pswit[ECHO_SWITCH])

  1265 		g_print("\n%s\n",aline);

  1266 	    if (!pswit[OVERVIEW_SWITCH])

  1267 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1268 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1269 	    else

  1270 		cnt_dash++;

  1271 	}

  1272     }

  1273     else if ((s=strstr(aline,"- ")))

  1274     {

  1275 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1276 	{

  1277 	    if (pswit[ECHO_SWITCH])

  1278 		g_print("\n%s\n",aline);

  1279 	    if (!pswit[OVERVIEW_SWITCH])

  1280 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1281 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1282 	    else

  1283 		cnt_dash++;

  1284 	}

  1285     }

  1286 }

  1288 /*

  1289  * check_for_unmarked_paragraphs:

  1290  *

  1291  * Check for unmarked paragraphs indicated by separate speakers.

  1292  *

  1293  * May well be false positive:

  1294  * "Bravo!" "Wonderful!" called the crowd.

  1295  * but useful all the same.

  1296  */

  1297 void check_for_unmarked_paragraphs(const char *aline)

  1298 {

  1299     const char *s;

  1300     s=strstr(aline,"\"  \"");

  1301     if (!s)

  1302 	s=strstr(aline,"\" \"");

  1303     if (s)

  1304     {

  1305 	if (pswit[ECHO_SWITCH])

  1306 	    g_print("\n%s\n",aline);

  1307 	if (!pswit[OVERVIEW_SWITCH])

  1308 	    g_print("    Line %ld column %ld - "

  1309 	      "Query missing paragraph break?\n",

  1310 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1311 	else

  1312 	    cnt_punct++;

  1313     }

  1314 }

  1316 /*

  1317  * check_for_jeebies:

  1318  *

  1319  * Check for "to he" and other easy h/b errors.

  1320  *

  1321  * This is a very inadequate effort on the h/b problem,

  1322  * but the phrase "to he" is always an error, whereas "to

  1323  * be" is quite common.

  1324  * Similarly, '"Quiet!", be said.' is a non-be error

  1325  * "to he" is _not_ always an error!:

  1326  *       "Where they went to he couldn't say."

  1327  * Another false positive:

  1328  *       What would "Cinderella" be without the . . .

  1329  * and another: "If he wants to he can see for himself."

  1330  */

  1331 void check_for_jeebies(const char *aline)

  1332 {

  1333     const char *s;

  1334     s=strstr(aline," be could ");

  1335     if (!s)

  1336 	s=strstr(aline," be would ");

  1337     if (!s)

  1338 	s=strstr(aline," was be ");

  1339     if (!s)

  1340 	s=strstr(aline," be is ");

  1341     if (!s)

  1342 	s=strstr(aline," is be ");

  1343     if (!s)

  1344 	s=strstr(aline,"\", be ");

  1345     if (!s)

  1346 	s=strstr(aline,"\" be ");

  1347     if (!s)

  1348 	s=strstr(aline,"\" be ");

  1349     if (!s)

  1350 	s=strstr(aline," to he ");

  1351     if (s)

  1352     {

  1353 	if (pswit[ECHO_SWITCH])

  1354 	    g_print("\n%s\n",aline);

  1355 	if (!pswit[OVERVIEW_SWITCH])

  1356 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1357 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1358 	else

  1359 	    cnt_word++;

  1360     }

  1361     s=strstr(aline," the had ");

  1362     if (!s)

  1363 	s=strstr(aline," a had ");

  1364     if (!s)

  1365 	s=strstr(aline," they bad ");

  1366     if (!s)

  1367 	s=strstr(aline," she bad ");

  1368     if (!s)

  1369 	s=strstr(aline," he bad ");

  1370     if (!s)

  1371 	s=strstr(aline," you bad ");

  1372     if (!s)

  1373 	s=strstr(aline," i bad ");

  1374     if (s)

  1375     {

  1376 	if (pswit[ECHO_SWITCH])

  1377 	    g_print("\n%s\n",aline);

  1378 	if (!pswit[OVERVIEW_SWITCH])

  1379 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1380 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1381 	else

  1382 	    cnt_word++;

  1383     }

  1384     s=strstr(aline,"; hut ");

  1385     if (!s)

  1386 	s=strstr(aline,", hut ");

  1387     if (s)

  1388     {

  1389 	if (pswit[ECHO_SWITCH])

  1390 	    g_print("\n%s\n",aline);

  1391 	if (!pswit[OVERVIEW_SWITCH])

  1392 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1393 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1394 	else

  1395 	    cnt_word++;

  1396     }

  1397 }

  1399 /*

  1400  * check_for_mta_from:

  1401  *

  1402  * Special case - angled bracket in front of "From" placed there by an

  1403  * MTA when sending an e-mail.

  1404  */

  1405 void check_for_mta_from(const char *aline)

  1406 {

  1407     const char *s;

  1408     s=strstr(aline,">From");

  1409     if (s)

  1410     {

  1411 	if (pswit[ECHO_SWITCH])

  1412 	    g_print("\n%s\n",aline);

  1413 	if (!pswit[OVERVIEW_SWITCH])

  1414 	    g_print("    Line %ld column %ld - "

  1415 	      "Query angled bracket with From\n",

  1416 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1417 	else

  1418 	    cnt_punct++;

  1419     }

  1420 }

  1422 /*

  1423  * check_for_orphan_character:

  1424  *

  1425  * Check for a single character line -

  1426  * often an overflow from bad wrapping.

  1427  */

  1428 void check_for_orphan_character(const char *aline)

  1429 {

  1430     gunichar c;

  1431     c=g_utf8_get_char(aline);

  1432     if (c && !*g_utf8_next_char(aline))

  1433     {

  1434 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1435 	    ; /* Nothing - ignore numerals alone on a line. */

  1436 	else

  1437 	{

  1438 	    if (pswit[ECHO_SWITCH])

  1439 		g_print("\n%s\n",aline);

  1440 	    if (!pswit[OVERVIEW_SWITCH])

  1441 		g_print("    Line %ld column 1 - Query single character line\n",

  1442 		  linecnt);

  1443 	    else

  1444 		cnt_punct++;

  1445 	}

  1446     }

  1447 }

  1449 /*

  1450  * check_for_pling_scanno:

  1451  *

  1452  * Check for I" - often should be !

  1453  */

  1454 void check_for_pling_scanno(const char *aline)

  1455 {

  1456     const char *s;

  1457     s=strstr(aline," I\"");

  1458     if (s)

  1459     {

  1460 	if (pswit[ECHO_SWITCH])

  1461 	    g_print("\n%s\n",aline);

  1462 	if (!pswit[OVERVIEW_SWITCH])

  1463 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1464 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1465 	else

  1466 	    cnt_punct++;

  1467     }

  1468 }

  1470 /*

  1471  * check_for_extra_period:

  1472  *

  1473  * Check for period without a capital letter. Cut-down from gutspell.

  1474  * Only works when it happens on a single line.

  1475  */

  1476 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1477 {

  1478     const char *s,*t,*s1,*sprev;

  1479     int i;

  1480     gsize len;

  1481     gboolean istypo;

  1482     gchar *testword;

  1483     gunichar c,nc,pc,*decomposition;

  1484     if (pswit[PARANOID_SWITCH])

  1485     {

  1486 	for (t=aline;t=strstr(t,". ");)

  1487 	{

  1488 	    if (t==aline)

  1489 	    {

  1490 		t=g_utf8_next_char(t);

  1491 		/* start of line punctuation is handled elsewhere */

  1492 		continue;

  1493 	    }

  1494 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1495 	    {

  1496 		t=g_utf8_next_char(t);

  1497 		continue;

  1498 	    }

  1499 	    if (warnings->isDutch)

  1500 	    {

  1501 		/* For Frank & Jeroen -- 's Middags case */

  1502 		gunichar c2,c3,c4,c5;

  1503 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1504 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1505 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1506 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1507 		if (CHAR_IS_APOSTROPHE(c2) &&

  1508 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&

  1509 		  g_unichar_isupper(c5))

  1510 		{

  1511 		    t=g_utf8_next_char(t);

  1512 		    continue;

  1513 		}

  1514 	    }

  1515 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1516 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1517 	      !g_unichar_isdigit(g_utf8_get_char(s1)))

  1518 		s1=g_utf8_next_char(s1);

  1519 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1520 	    {

  1521 		/* we have something to investigate */

  1522 		istypo=TRUE;

  1523 		/* so let's go back and find out */

  1524 		nc=g_utf8_get_char(t);

  1525 		s1=g_utf8_prev_char(t);

  1526 		c=g_utf8_get_char(s1);

  1527 		sprev=g_utf8_prev_char(s1);

  1528 		pc=g_utf8_get_char(sprev);

  1529 		while (s1>=aline &&

  1530 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||

  1531 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&

  1532 		  g_unichar_isalpha(nc)))

  1533 		{

  1534 		    nc=c;

  1535 		    s1=sprev;

  1536 		    c=pc;

  1537 		    sprev=g_utf8_prev_char(s1);

  1538 		    pc=g_utf8_get_char(sprev);

  1539 		}

  1540 		s1=g_utf8_next_char(s1);

  1541 		s=strchr(s1,'.');

  1542 		if (s)

  1543 		    testword=g_strndup(s1,s-s1);

  1544 		else

  1545 		    testword=g_strdup(s1);

  1546 		for (i=0;*abbrev[i];i++)

  1547 		    if (!strcmp(testword,abbrev[i]))

  1548 			istypo=FALSE;

  1549 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1550 		    istypo=FALSE;

  1551 		if (!*g_utf8_next_char(testword))

  1552 		    istypo=FALSE;

  1553 		if (isroman(testword))

  1554 		    istypo=FALSE;

  1555 		if (istypo)

  1556 		{

  1557 		    istypo=FALSE;

  1558 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1559 		    {

  1560 			decomposition=g_unicode_canonical_decomposition(

  1561 			  g_utf8_get_char(s),&len);

  1562 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1563 			    istypo=TRUE;

  1564 			g_free(decomposition);

  1565 		    }

  1566 		}

  1567 		if (istypo &&

  1568 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1569 		{

  1570 		    g_tree_insert(qperiod,g_strdup(testword),

  1571 		      GINT_TO_POINTER(1));

  1572 		    if (pswit[ECHO_SWITCH])

  1573 			g_print("\n%s\n",aline);

  1574 		    if (!pswit[OVERVIEW_SWITCH])

  1575 			g_print("    Line %ld column %ld - Extra period?\n",

  1576 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1577 		    else

  1578 			cnt_punct++;

  1579 		}

  1580 		g_free(testword);

  1581 	    }

  1582 	    t=g_utf8_next_char(t);

  1583 	}

  1584     }

  1585 }

  1587 /*

  1588  * check_for_following_punctuation:

  1589  *

  1590  * Check for words usually not followed by punctuation.

  1591  */

  1592 void check_for_following_punctuation(const char *aline)

  1593 {

  1594     int i;

  1595     const char *s,*wordstart;

  1596     gunichar c;

  1597     gchar *inword,*t;

  1598     if (pswit[TYPO_SWITCH])

  1599     {

  1600 	for (s=aline;*s;)

  1601 	{

  1602 	    wordstart=s;

  1603 	    t=getaword(&s);

  1604 	    if (!*t)

  1605 	    {

  1606 		g_free(t);

  1607 		continue;

  1608 	    }

  1609 	    inword=g_utf8_strdown(t,-1);

  1610 	    g_free(t);

  1611 	    for (i=0;*nocomma[i];i++)

  1612 		if (!strcmp(inword,nocomma[i]))

  1613 		{

  1614 		    c=g_utf8_get_char(s);

  1615 		    if (c==',' || c==';' || c==':')

  1616 		    {

  1617 			if (pswit[ECHO_SWITCH])

  1618 			    g_print("\n%s\n",aline);

  1619 			if (!pswit[OVERVIEW_SWITCH])

  1620 			    g_print("    Line %ld column %ld - "

  1621 			      "Query punctuation after %s?\n",

  1622 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1623 			      inword);

  1624 			else

  1625 			    cnt_punct++;

  1626 		    }

  1627 		}

  1628 	    for (i=0;*noperiod[i];i++)

  1629 		if (!strcmp(inword,noperiod[i]))

  1630 		{

  1631 		    c=g_utf8_get_char(s);

  1632 		    if (c=='.' || c=='!')

  1633 		    {

  1634 			if (pswit[ECHO_SWITCH])

  1635 			    g_print("\n%s\n",aline);

  1636 			if (!pswit[OVERVIEW_SWITCH])

  1637 			    g_print("    Line %ld column %ld - "

  1638 			      "Query punctuation after %s?\n",

  1639 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1640 			      inword);

  1641 			else

  1642 			    cnt_punct++;

  1643 		    }

  1644 		}

  1645 	    g_free(inword);

  1646 	}

  1647     }

  1648 }

  1650 /*

  1651  * check_for_typos:

  1652  *

  1653  * Check for commonly mistyped words,

  1654  * and digits like 0 for O in a word.

  1655  */

  1656 void check_for_typos(const char *aline,struct warnings *warnings)

  1657 {

  1658     const char *s,*t,*nt,*wordstart;

  1659     gchar *inword;

  1660     gunichar *decomposition;

  1661     gchar *testword;

  1662     int i,vowel,consonant,*dupcnt;

  1663     gboolean isdup,istypo,alower;

  1664     gunichar c,pc;

  1665     long offset,len;

  1666     gsize decomposition_len;

  1667     for (s=aline;*s;)

  1668     {

  1669 	wordstart=s;

  1670 	inword=getaword(&s);

  1671 	if (!*inword)

  1672 	{

  1673 	    g_free(inword);

  1674 	    continue; /* don't bother with empty lines */

  1675 	}

  1676 	if (mixdigit(inword))

  1677 	{

  1678 	    if (pswit[ECHO_SWITCH])

  1679 		g_print("\n%s\n",aline);

  1680 	    if (!pswit[OVERVIEW_SWITCH])

  1681 		g_print("    Line %ld column %ld - Query digit in %s\n",

  1682 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  1683 	    else

  1684 		cnt_word++;

  1685 	}

  1686 	/*

  1687 	 * Put the word through a series of tests for likely typos and OCR

  1688 	 * errors.

  1689 	 */

  1690 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1691 	{

  1692 	    istypo=FALSE;

  1693 	    alower=FALSE;

  1694 	    for (t=inword;*t;t=g_utf8_next_char(t))

  1695 	    {

  1696 		c=g_utf8_get_char(t);

  1697 		nt=g_utf8_next_char(t);

  1698 		/* lowercase for testing */

  1699 		if (g_unichar_islower(c))

  1700 		    alower=TRUE;

  1701 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  1702 		{

  1703 		    /*

  1704 		     * We have an uppercase mid-word. However, there are

  1705 		     * common cases:

  1706 		     *   Mac and Mc like McGill

  1707 		     *   French contractions like l'Abbe

  1708 		     */

  1709 		    offset=g_utf8_pointer_to_offset(inword,t);

  1710 		    if (offset>0)

  1711 			pc=g_utf8_get_char(g_utf8_prev_char(t));

  1712 		    else

  1713 			pc='\0';

  1714 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  1715 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  1716 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  1717 		      CHAR_IS_APOSTROPHE(pc))

  1718 			; /* do nothing! */

  1719 		    else

  1720 			istypo=TRUE;

  1721 		}

  1722 	    }

  1723 	    testword=g_utf8_casefold(inword,-1);

  1724 	}

  1725 	if (pswit[TYPO_SWITCH])

  1726 	{

  1727 	    /*

  1728 	     * Check for certain unlikely two-letter combinations at word

  1729 	     * start and end.

  1730 	     */

  1731 	    len=g_utf8_strlen(testword,-1);

  1732 	    if (len>1)

  1733 	    {

  1734 		for (i=0;*nostart[i];i++)

  1735 		    if (g_str_has_prefix(testword,nostart[i]))

  1736 			istypo=TRUE;

  1737 		for (i=0;*noend[i];i++)

  1738 		    if (g_str_has_suffix(testword,noend[i]))

  1739 			istypo=TRUE;

  1740 	    }

  1741 	    /* ght is common, gbt never. Like that. */

  1742 	    if (strstr(testword,"cb"))

  1743 		istypo=TRUE;

  1744 	    if (strstr(testword,"gbt"))

  1745 		istypo=TRUE;

  1746 	    if (strstr(testword,"pbt"))

  1747 		istypo=TRUE;

  1748 	    if (strstr(testword,"tbs"))

  1749 		istypo=TRUE;

  1750 	    if (strstr(testword,"mrn"))

  1751 		istypo=TRUE;

  1752 	    if (strstr(testword,"ahle"))

  1753 		istypo=TRUE;

  1754 	    if (strstr(testword,"ihle"))

  1755 		istypo=TRUE;

  1756 	    /*

  1757 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1758 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1759 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1760 	     * numerals, but "ii" is a common scanno.

  1761 	     */

  1762 	    if (strstr(testword,"tbi"))

  1763 		istypo=TRUE;

  1764 	    if (strstr(testword,"tbe"))

  1765 		istypo=TRUE;

  1766 	    if (strstr(testword,"ii"))

  1767 		istypo=TRUE;

  1768 	    /*

  1769 	     * Check for no vowels or no consonants.

  1770 	     * If none, flag a typo.

  1771 	     */

  1772 	    if (!istypo && len>1)

  1773 	    {

  1774 		vowel=consonant=0;

  1775 		for (t=testword;*t;t=g_utf8_next_char(t))

  1776 		{

  1777 		    c=g_utf8_get_char(t);

  1778 		    decomposition=

  1779 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  1780 		    if (c=='y' || g_unichar_isdigit(c))

  1781 		    {

  1782 			/* Yah, this is loose. */

  1783 			vowel++;

  1784 			consonant++;

  1785 		    }

  1786 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1787 			vowel++;

  1788 		    else

  1789 			consonant++;

  1790 		    g_free(decomposition);

  1791 		}

  1792 		if (!vowel || !consonant)

  1793 		    istypo=TRUE;

  1794 	    }

  1795 	    /*

  1796 	     * Now exclude the word from being reported if it's in

  1797 	     * the okword list.

  1798 	     */

  1799 	    for (i=0;*okword[i];i++)

  1800 		if (!strcmp(testword,okword[i]))

  1801 		    istypo=FALSE;

  1802 	    /*

  1803 	     * What looks like a typo may be a Roman numeral.

  1804 	     * Exclude these.

  1805 	     */

  1806 	    if (istypo && isroman(testword))

  1807 		istypo=FALSE;

  1808 	    /* Check the manual list of typos. */

  1809 	    if (!istypo)

  1810 		for (i=0;*typo[i];i++)

  1811 		    if (!strcmp(testword,typo[i]))

  1812 			istypo=TRUE;

  1813 	    /*

  1814 	     * Check lowercase s, l, i and m - special cases.

  1815 	     *   "j" - often a semi-colon gone wrong.

  1816 	     *   "d" for a missing apostrophe - he d

  1817 	     *   "n" for "in"

  1818 	     */

  1819 	    if (!istypo && len==1 &&

  1820 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  1821 		istypo=TRUE;

  1822 	    if (istypo)

  1823 	    {

  1824 		dupcnt=g_tree_lookup(qword,testword);

  1825 		if (dupcnt)

  1826 		{

  1827 		    (*dupcnt)++;

  1828 		    isdup=!pswit[VERBOSE_SWITCH];

  1829 		}

  1830 		else

  1831 		{

  1832 		    dupcnt=g_new0(int,1);

  1833 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  1834 		    isdup=FALSE;

  1835 		}

  1836 		if (!isdup)

  1837 		{

  1838 		    if (pswit[ECHO_SWITCH])

  1839 			g_print("\n%s\n",aline);

  1840 		    if (!pswit[OVERVIEW_SWITCH])

  1841 		    {

  1842 			g_print("    Line %ld column %ld - Query word %s",

  1843 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  1844 			  inword);

  1845 			if (!pswit[VERBOSE_SWITCH])

  1846 			    g_print(" - not reporting duplicates");

  1847 			g_print("\n");

  1848 		    }

  1849 		    else

  1850 			cnt_word++;

  1851 		}

  1852 	    }

  1853 	}

  1854 	/* check the user's list of typos */

  1855 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  1856 	{

  1857 	    if (pswit[ECHO_SWITCH])

  1858 		g_print("\n%s\n",aline);

  1859 	    if (!pswit[OVERVIEW_SWITCH])

  1860 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  1861 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  1862 	}

  1863 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1864 	    g_free(testword);

  1865 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1866 	{

  1867 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1868 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1869 	    {

  1870 		if (pswit[ECHO_SWITCH])

  1871 		    g_print("\n%s\n",aline);

  1872 		if (!pswit[OVERVIEW_SWITCH])

  1873 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  1874 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  1875 		      inword);

  1876 		else

  1877 		    cnt_word++;

  1878 	    }

  1879 	}

  1880 	g_free(inword);

  1881     }

  1882 }

  1884 /*

  1885  * check_for_misspaced_punctuation:

  1886  *

  1887  * Look for added or missing spaces around punctuation and quotes.

  1888  * If there is a punctuation character like ! with no space on

  1889  * either side, suspect a missing!space. If there are spaces on

  1890  * both sides , assume a typo. If we see a double quote with no

  1891  * space or punctuation on either side of it, assume unspaced

  1892  * quotes "like"this.

  1893  */

  1894 void check_for_misspaced_punctuation(const char *aline,

  1895   struct parities *parities,gboolean isemptyline)

  1896 {

  1897     gboolean isacro,isellipsis;

  1898     const char *s;

  1899     gunichar c,nc,pc,n2c;

  1900     int parity;

  1901     c=g_utf8_get_char(aline);

  1902     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1903     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1904     {

  1905 	pc=c;

  1906 	c=nc;

  1907 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1908 	/* For each character in the line after the first. */

  1909 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  1910 	{

  1911 	    /* we need to suppress warnings for acronyms like M.D. */

  1912 	    isacro=FALSE;

  1913 	    /* we need to suppress warnings for ellipsis . . . */

  1914 	    isellipsis=FALSE;

  1915 	    /*

  1916 	     * If there are letters on both sides of it or

  1917 	     * if it's strict punctuation followed by an alpha.

  1918 	     */

  1919 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  1920 	      g_utf8_strchr("?!,;:",-1,c)))

  1921 	    {

  1922 		if (c=='.')

  1923 		{

  1924 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1925 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1926 			isacro=TRUE;

  1927 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1928 		    if (nc && n2c=='.')

  1929 			isacro=TRUE;

  1930 		}

  1931 		if (!isacro)

  1932 		{

  1933 		    if (pswit[ECHO_SWITCH])

  1934 			g_print("\n%s\n",aline);

  1935 		    if (!pswit[OVERVIEW_SWITCH])

  1936 			g_print("    Line %ld column %ld - Missing space?\n",

  1937 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1938 		    else

  1939 			cnt_punct++;

  1940 		}

  1941 	    }

  1942 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  1943 	    {

  1944 		/*

  1945 		 * If there are spaces on both sides,

  1946 		 * or space before and end of line.

  1947 		 */

  1948 		if (c=='.')

  1949 		{

  1950 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1951 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1952 			isellipsis=TRUE;

  1953 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1954 		    if (nc && n2c=='.')

  1955 			isellipsis=TRUE;

  1956 		}

  1957 		if (!isemptyline && !isellipsis)

  1958 		{

  1959 		    if (pswit[ECHO_SWITCH])

  1960 			g_print("\n%s\n",aline);

  1961 		    if (!pswit[OVERVIEW_SWITCH])

  1962 			g_print("    Line %ld column %ld - "

  1963 			  "Spaced punctuation?\n",linecnt,

  1964 			  g_utf8_pointer_to_offset(aline,s)+1);

  1965 		    else

  1966 			cnt_punct++;

  1967 		}

  1968 	    }

  1969 	}

  1970     }

  1971     /* Split out the characters that CANNOT be preceded by space. */

  1972     c=g_utf8_get_char(aline);

  1973     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1974     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1975     {

  1976 	pc=c;

  1977 	c=nc;

  1978 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1979 	/* for each character in the line after the first */

  1980 	if (g_utf8_strchr("?!,;:",-1,c))

  1981 	{

  1982 	    /* if it's punctuation that _cannot_ have a space before it */

  1983 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  1984 	    {

  1985 		/*

  1986 		 * If nc DOES == space,

  1987 		 * it was already reported just above.

  1988 		 */

  1989 		if (pswit[ECHO_SWITCH])

  1990 		    g_print("\n%s\n",aline);

  1991 		if (!pswit[OVERVIEW_SWITCH])

  1992 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1993 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1994 		else

  1995 		    cnt_punct++;

  1996 	    }

  1997 	}

  1998     }

  1999     /*

  2000      * Special case " .X" where X is any alpha.

  2001      * This plugs a hole in the acronym code above.

  2002      * Inelegant, but maintainable.

  2003      */

  2004     c=g_utf8_get_char(aline);

  2005     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2006     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2007     {

  2008 	pc=c;

  2009 	c=nc;

  2010 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2011 	/* for each character in the line after the first */

  2012 	if (c=='.')

  2013 	{

  2014 	    /* if it's a period */

  2015 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  2016 	    {

  2017 		/*

  2018 		 * If the period follows a space and

  2019 		 * is followed by a letter.

  2020 		 */

  2021 		if (pswit[ECHO_SWITCH])

  2022 		    g_print("\n%s\n",aline);

  2023 		if (!pswit[OVERVIEW_SWITCH])

  2024 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2025 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2026 		else

  2027 		    cnt_punct++;

  2028 	    }

  2029 	}

  2030     }

  2031     c=g_utf8_get_char(aline);

  2032     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2033     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2034     {

  2035 	pc=c;

  2036 	c=nc;

  2037 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2038 	/* for each character in the line after the first */

  2039 	if (CHAR_IS_DQUOTE(c))

  2040 	{

  2041 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  2042 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  2043 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  2044 	    {

  2045 		if (pswit[ECHO_SWITCH])

  2046 		    g_print("\n%s\n",aline);

  2047 		if (!pswit[OVERVIEW_SWITCH])

  2048 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  2049 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2050 		else

  2051 		    cnt_punct++;

  2052 	    }

  2053 	}

  2054     }

  2055     /* Check parity of quotes. */

  2056     nc=g_utf8_get_char(aline);

  2057     for (s=aline;*s;s=g_utf8_next_char(s))

  2058     {

  2059 	c=nc;

  2060 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2061 	if (CHAR_IS_DQUOTE(c))

  2062 	{

  2063 	    if (c==CHAR_DQUOTE)

  2064 	    {

  2065 		parities->dquote=!parities->dquote;

  2066 		parity=parities->dquote;

  2067 	    }

  2068 	    else if (c==CHAR_LD_QUOTE)

  2069 		parity=1;

  2070 	    else

  2071 		parity=0;

  2072 	    if (!parity)

  2073 	    {

  2074 		/* parity even */

  2075 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))

  2076 		{

  2077 		    if (pswit[ECHO_SWITCH])

  2078 			g_print("\n%s\n",aline);

  2079 		    if (!pswit[OVERVIEW_SWITCH])

  2080 			g_print("    Line %ld column %ld - "

  2081 			  "Wrongspaced quotes?\n",

  2082 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2083 		    else

  2084 			cnt_punct++;

  2085 		}

  2086 	    }

  2087 	    else

  2088 	    {

  2089 		/* parity odd */

  2090 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&

  2091 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)

  2092 		{

  2093 		    if (pswit[ECHO_SWITCH])

  2094 			g_print("\n%s\n",aline);

  2095 		    if (!pswit[OVERVIEW_SWITCH])

  2096 			g_print("    Line %ld column %ld - "

  2097 			  "Wrongspaced quotes?\n",

  2098 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2099 		    else

  2100 			cnt_punct++;

  2101 		}

  2102 	    }

  2103 	}

  2104     }

  2105     c=g_utf8_get_char(aline);

  2106     if (CHAR_IS_DQUOTE(c))

  2107     {

  2108 	if (g_utf8_strchr(",;:!?)]} ",-1,

  2109 	  g_utf8_get_char(g_utf8_next_char(aline))))

  2110 	{

  2111 	    if (pswit[ECHO_SWITCH])

  2112 		g_print("\n%s\n",aline);

  2113 	    if (!pswit[OVERVIEW_SWITCH])

  2114 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  2115 		  linecnt);

  2116 	    else

  2117 		cnt_punct++;

  2118 	}

  2119     }

  2120     if (pswit[SQUOTE_SWITCH])

  2121     {

  2122 	nc=g_utf8_get_char(aline);

  2123 	for (s=aline;*s;s=g_utf8_next_char(s))

  2124 	{

  2125 	    c=nc;

  2126 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  2127 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&

  2128 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2129 	      !g_unichar_isalpha(nc)))

  2130 	    {

  2131 		parities->squote=!parities->squote;

  2132 		if (!parities->squote)

  2133 		{

  2134 		    /* parity even */

  2135 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2136 		    {

  2137 			if (pswit[ECHO_SWITCH])

  2138 			    g_print("\n%s\n",aline);

  2139 			if (!pswit[OVERVIEW_SWITCH])

  2140 			    g_print("    Line %ld column %ld - "

  2141 			      "Wrongspaced singlequotes?\n",

  2142 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2143 			else

  2144 			    cnt_punct++;

  2145 		    }

  2146 		}

  2147 		else

  2148 		{

  2149 		    /* parity odd */

  2150 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&

  2151 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2152 		    {

  2153 			if (pswit[ECHO_SWITCH])

  2154 			    g_print("\n%s\n",aline);

  2155 			if (!pswit[OVERVIEW_SWITCH])

  2156 			    g_print("    Line %ld column %ld - "

  2157 			      "Wrongspaced singlequotes?\n",

  2158 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2159 			else

  2160 			    cnt_punct++;

  2161 		    }

  2162 		}

  2163 	    }

  2164 	}

  2165     }

  2166 }

  2168 /*

  2169  * check_for_double_punctuation:

  2170  *

  2171  * Look for double punctuation like ,. or ,,

  2172  * Thanks to DW for the suggestion!

  2173  * In books with references, ".," and ".;" are common

  2174  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2175  * OTOH, from my initial tests, there are also fairly

  2176  * common errors. What to do? Make these cases paranoid?

  2177  * ".," is the most common, so warnings->dotcomma is used

  2178  * to suppress detailed reporting if it occurs often.

  2179  */

  2180 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2181 {

  2182     const char *s;

  2183     gunichar c,nc;

  2184     nc=g_utf8_get_char(aline);

  2185     for (s=aline;*s;s=g_utf8_next_char(s))

  2186     {

  2187 	c=nc;

  2188 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2189 	/* for each punctuation character in the line */

  2190 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2191 	  g_utf8_strchr(".?!,;:",-1,nc))

  2192 	{

  2193 	    /* followed by punctuation, it's a query, unless . . . */

  2194 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||

  2195 	      !warnings->dotcomma && c=='.' && nc==',' ||

  2196 	      warnings->isFrench && g_str_has_prefix(s,",...") ||

  2197 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2198 	      warnings->isFrench && g_str_has_prefix(s,";...") ||

  2199 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2200 	      warnings->isFrench && g_str_has_prefix(s,":...") ||

  2201 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2202 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2203 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2204 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2205 	      warnings->isFrench && g_str_has_prefix(s,"...?"))

  2206 	    {

  2207 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||

  2208 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2209 		  warnings->isFrench && g_str_has_prefix(s,";...") ||

  2210 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2211 		  warnings->isFrench && g_str_has_prefix(s,":...") ||

  2212 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2213 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2214 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2215 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2216 		  warnings->isFrench && g_str_has_prefix(s,"...?"))

  2217 		{

  2218 		    s+=4;

  2219 		    nc=g_utf8_get_char(g_utf8_next_char(s));

  2220 		}

  2221 		; /* do nothing for .. !! and ?? which can be legit */

  2222 	    }

  2223 	    else

  2224 	    {

  2225 		if (pswit[ECHO_SWITCH])

  2226 		    g_print("\n%s\n",aline);

  2227 		if (!pswit[OVERVIEW_SWITCH])

  2228 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2229 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2230 		else

  2231 		    cnt_punct++;

  2232 	    }

  2233 	}

  2234     }

  2235 }

  2237 /*

  2238  * check_for_spaced_quotes:

  2239  */

  2240 void check_for_spaced_quotes(const char *aline)

  2241 {

  2242     int i;

  2243     const char *s,*t;

  2244     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,

  2245       CHAR_RS_QUOTE};

  2246     GString *pattern;

  2247     s=aline;

  2248     while ((t=strstr(s," \" ")))

  2249     {

  2250 	if (pswit[ECHO_SWITCH])

  2251 	    g_print("\n%s\n",aline);

  2252 	if (!pswit[OVERVIEW_SWITCH])

  2253 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2254 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2255 	else

  2256 	    cnt_punct++;

  2257 	s=g_utf8_next_char(g_utf8_next_char(t));

  2258     }

  2259     pattern=g_string_new(NULL);

  2260     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)

  2261     {

  2262 	g_string_assign(pattern," ");

  2263 	g_string_append_unichar(pattern,single_quotes[i]);

  2264 	g_string_append_c(pattern,' ');

  2265 	s=aline;

  2266 	while ((t=strstr(s,pattern->str)))

  2267 	{

  2268 	    if (pswit[ECHO_SWITCH])

  2269 		g_print("\n%s\n",aline);

  2270 	    if (!pswit[OVERVIEW_SWITCH])

  2271 		g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2272 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2273 	    else

  2274 		cnt_punct++;

  2275 	    s=g_utf8_next_char(g_utf8_next_char(t));

  2276 	}

  2277     }

  2278     g_string_free(pattern,TRUE);

  2279 }

  2281 /*

  2282  * check_for_miscased_genative:

  2283  *

  2284  * Check special case of 'S instead of 's at end of word.

  2285  */

  2286 void check_for_miscased_genative(const char *aline)

  2287 {

  2288     const char *s;

  2289     gunichar c,nc,pc;

  2290     if (!*aline)

  2291 	return;

  2292     c=g_utf8_get_char(aline);

  2293     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2294     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2295     {

  2296 	pc=c;

  2297 	c=nc;

  2298 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2299 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))

  2300 	{

  2301 	    if (pswit[ECHO_SWITCH])

  2302 		g_print("\n%s\n",aline);

  2303 	    if (!pswit[OVERVIEW_SWITCH])

  2304 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2305 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2306 	    else

  2307 		cnt_punct++;

  2308 	}

  2309     }

  2310 }

  2312 /*

  2313  * check_end_of_line:

  2314  *

  2315  * Now check special cases - start and end of line -

  2316  * for single and double quotes. Start is sometimes [sic]

  2317  * but better to query it anyway.

  2318  * While we're here, check for dash at end of line.

  2319  */

  2320 void check_end_of_line(const char *aline,struct warnings *warnings)

  2321 {

  2322     int lbytes;

  2323     const char *s;

  2324     gunichar c1,c2;

  2325     lbytes=strlen(aline);

  2326     if (g_utf8_strlen(aline,lbytes)>1)

  2327     {

  2328 	s=g_utf8_prev_char(aline+lbytes);

  2329 	c1=g_utf8_get_char(s);

  2330 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2331 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)

  2332 	{

  2333 	    if (pswit[ECHO_SWITCH])

  2334 		g_print("\n%s\n",aline);

  2335 	    if (!pswit[OVERVIEW_SWITCH])

  2336 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2337 		  g_utf8_strlen(aline,lbytes));

  2338 	    else

  2339 		cnt_punct++;

  2340 	}

  2341 	c1=g_utf8_get_char(aline);

  2342 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2343 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)

  2344 	{

  2345 	    if (pswit[ECHO_SWITCH])

  2346 		g_print("\n%s\n",aline);

  2347 	    if (!pswit[OVERVIEW_SWITCH])

  2348 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2349 	    else

  2350 		cnt_punct++;

  2351 	}

  2352 	/*

  2353 	 * Dash at end of line may well be legit - paranoid mode only

  2354 	 * and don't report em-dash at line-end.

  2355 	 */

  2356 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2357 	{

  2358 	    for (s=g_utf8_prev_char(aline+lbytes);

  2359 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2360 		;

  2361 	    if (g_utf8_get_char(s)=='-' &&

  2362 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2363 	    {

  2364 		if (pswit[ECHO_SWITCH])

  2365 		    g_print("\n%s\n",aline);

  2366 		if (!pswit[OVERVIEW_SWITCH])

  2367 		    g_print("    Line %ld column %ld - "

  2368 		      "Hyphen at end of line?\n",

  2369 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2370 	    }

  2371 	}

  2372     }

  2373 }

  2375 /*

  2376  * check_for_unspaced_bracket:

  2377  *

  2378  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2379  * If so, suspect a scanno like "a]most".

  2380  */

  2381 void check_for_unspaced_bracket(const char *aline)

  2382 {

  2383     const char *s;

  2384     gunichar c,nc,pc;

  2385     c=g_utf8_get_char(aline);

  2386     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2387     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2388     {

  2389 	pc=c;

  2390 	c=nc;

  2391 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2392 	if (!nc)

  2393 	    break;

  2394 	/* for each bracket character in the line except 1st & last */

  2395 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2396 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2397 	{

  2398 	    if (pswit[ECHO_SWITCH])

  2399 		g_print("\n%s\n",aline);

  2400 	    if (!pswit[OVERVIEW_SWITCH])

  2401 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2402 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2403 	    else

  2404 		cnt_punct++;

  2405 	}

  2406     }

  2407 }

  2409 /*

  2410  * check_for_unpunctuated_endquote:

  2411  */

  2412 void check_for_unpunctuated_endquote(const char *aline)

  2413 {

  2414     const char *s;

  2415     gunichar c,nc,pc;

  2416     QuoteClass qc;

  2417     c=g_utf8_get_char(aline);

  2418     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2419     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2420     {

  2421 	pc=c;

  2422 	c=nc;

  2423 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;

  2424 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2425 	/* for each character in the line except 1st */

  2426 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))

  2427 	{

  2428 	    if (pswit[ECHO_SWITCH])

  2429 		g_print("\n%s\n",aline);

  2430 	    if (!pswit[OVERVIEW_SWITCH])

  2431 		g_print("    Line %ld column %ld - "

  2432 		  "endquote missing punctuation?\n",

  2433 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2434 	    else

  2435 		cnt_punct++;

  2436 	}

  2437     }

  2438 }

  2440 /*

  2441  * check_for_html_tag:

  2442  *

  2443  * Check for <HTML TAG>.

  2444  *

  2445  * If there is a < in the line, followed at some point

  2446  * by a > then we suspect HTML.

  2447  */

  2448 void check_for_html_tag(const char *aline)

  2449 {

  2450     const char *open,*close;

  2451     gchar *tag;

  2452     open=strchr(aline,'<');

  2453     if (open)

  2454     {

  2455 	close=strchr(g_utf8_next_char(open),'>');

  2456 	if (close)

  2457 	{

  2458 	    if (pswit[ECHO_SWITCH])

  2459 		g_print("\n%s\n",aline);

  2460 	    if (!pswit[OVERVIEW_SWITCH])

  2461 	    {

  2462 		tag=g_strndup(open,close-open+1);

  2463 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2464 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2465 		g_free(tag);

  2466 	    }

  2467 	    else

  2468 		cnt_html++;

  2469 	}

  2470     }

  2471 }

  2473 /*

  2474  * check_for_html_entity:

  2475  *

  2476  * Check for &symbol; HTML.

  2477  *

  2478  * If there is a & in the line, followed at

  2479  * some point by a ; then we suspect HTML.

  2480  */

  2481 void check_for_html_entity(const char *aline)

  2482 {

  2483     const char *s,*amp,*scolon;

  2484     gchar *entity;

  2485     amp=strchr(aline,'&');

  2486     if (amp)

  2487     {

  2488 	scolon=strchr(amp,';');

  2489 	if (scolon)

  2490 	{

  2491 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2492 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2493 		    break;		/* Don't report "Jones & Son;" */

  2494 	    if (s>=scolon)

  2495 	    {

  2496 		if (pswit[ECHO_SWITCH])

  2497 		    g_print("\n%s\n",aline);

  2498 		if (!pswit[OVERVIEW_SWITCH])

  2499 		{

  2500 		    entity=g_strndup(amp,scolon-amp+1);

  2501 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2502 		      linecnt,(int)(amp-aline)+1,entity);

  2503 		    g_free(entity);

  2504 		}

  2505 		else

  2506 		    cnt_html++;

  2507 	    }

  2508 	}

  2509     }

  2510 }

  2512 /*

  2513  * check_for_omitted_punctuation:

  2514  *

  2515  * Check for omitted punctuation at end of paragraph by working back

  2516  * through prevline. DW.

  2517  * Need to check this only for "normal" paras.

  2518  * So what is a "normal" para?

  2519  *    Not normal if one-liner (chapter headings, etc.)

  2520  *    Not normal if doesn't contain at least one locase letter

  2521  *    Not normal if starts with space

  2522  */

  2523 void check_for_omitted_punctuation(const char *prevline,

  2524   struct line_properties *last,int start_para_line)

  2525 {

  2526     gboolean letter_on_line=FALSE;

  2527     const char *s;

  2528     gunichar c;

  2529     gboolean closing_quote;

  2530     for (s=prevline;*s;s=g_utf8_next_char(s))

  2531 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2532 	{

  2533 	    letter_on_line=TRUE;

  2534 	    break;

  2535 	}

  2536     /*

  2537      * This next "if" is a problem.

  2538      * If we say "start_para_line <= linecnt - 1", that includes

  2539      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2540      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2541      * misses genuine one-line paragraphs.

  2542      */

  2543     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2544       g_utf8_get_char(prevline)>CHAR_SPACE)

  2545     {

  2546 	s=prevline+strlen(prevline);

  2547 	do

  2548 	{

  2549 	    s=g_utf8_prev_char(s);

  2550 	    c=g_utf8_get_char(s);

  2551 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)

  2552 		closing_quote=TRUE;

  2553 	    else

  2554 		closing_quote=FALSE;

  2555 	} while (closing_quote && s>prevline);

  2556 	for (;s>prevline;s=g_utf8_prev_char(s))

  2557 	{

  2558 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2559 	    {

  2560 		if (pswit[ECHO_SWITCH])

  2561 		    g_print("\n%s\n",prevline);

  2562 		if (!pswit[OVERVIEW_SWITCH])

  2563 		    g_print("    Line %ld column %ld - "

  2564 		      "No punctuation at para end?\n",

  2565 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2566 		else

  2567 		    cnt_punct++;

  2568 		break;

  2569 	    }

  2570 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))

  2571 		break;

  2572 	}

  2573     }

  2574 }

  2576 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2577 {

  2578     const char *word=key;

  2579     int *dupcnt=value;

  2580     if (*dupcnt)

  2581 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  2582 	  word,*dupcnt);

  2583     return FALSE;

  2584 }

  2586 void print_as_windows_1252(const char *string)

  2587 {

  2588     gsize inbytes,outbytes;

  2589     gchar *buf,*bp;

  2590     static GIConv converter=(GIConv)-1;

  2591     if (!string)

  2592     {

  2593 	if (converter!=(GIConv)-1)

  2594 	    g_iconv_close(converter);

  2595 	converter=(GIConv)-1;

  2596 	return;

  2597     }

  2598     if (converter==(GIConv)-1)

  2599 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  2600     if (converter!=(GIConv)-1)

  2601     {

  2602 	inbytes=outbytes=strlen(string);

  2603 	bp=buf=g_malloc(outbytes+1);

  2604 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  2605 	*bp='\0';

  2606 	fputs(buf,stdout);

  2607 	g_free(buf);

  2608     }

  2609     else

  2610 	fputs(string,stdout);

  2611 }

  2613 void print_as_utf_8(const char *string)

  2614 {

  2615     fputs(string,stdout);

  2616 }

  2618 /*

  2619  * procfile:

  2620  *

  2621  * Process one file.

  2622  */

  2623 void procfile(const char *filename)

  2624 {

  2625     const char *s;

  2626     gchar *parastart=NULL;	/* first line of current para */

  2627     gchar *etext,*aline;

  2628     gchar *etext_ptr;

  2629     GError *err=NULL;

  2630     struct first_pass_results *first_pass_results;

  2631     struct warnings *warnings;

  2632     struct counters counters={0};

  2633     struct line_properties last={0};

  2634     struct parities parities={0};

  2635     struct pending pending={0};

  2636     gboolean isemptyline;

  2637     long start_para_line=0;

  2638     gboolean isnewpara=FALSE,enddash=FALSE;

  2639     last.start=CHAR_SPACE;

  2640     linecnt=checked_linecnt=0;

  2641     etext=read_etext(filename,&err);

  2642     if (!etext)

  2643     {

  2644 	if (pswit[STDOUT_SWITCH])

  2645 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  2646 	else

  2647 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  2648 	exit(1);

  2649     }

  2650     g_print("\n\nFile: %s\n\n",filename);

  2651     first_pass_results=first_pass(etext);

  2652     warnings=report_first_pass(first_pass_results);

  2653     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  2654     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  2655     /*

  2656      * Here we go with the main pass. Hold onto yer hat!

  2657      */

  2658     linecnt=0;

  2659     etext_ptr=etext;

  2660     while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))

  2661     {

  2662 	linecnt++;

  2663 	if (linecnt==1)

  2664 	    isnewpara=TRUE;

  2665 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  2666 	    continue;    // skip DP page separators completely

  2667 	if (linecnt<first_pass_results->firstline ||

  2668 	  (first_pass_results->footerline>0 &&

  2669 	  linecnt>first_pass_results->footerline))

  2670 	{

  2671 	    if (pswit[HEADER_SWITCH])

  2672 	    {

  2673 		if (g_str_has_prefix(aline,"Title:"))

  2674 		    g_print("    %s\n",aline);

  2675 		if (g_str_has_prefix(aline,"Author:"))

  2676 		    g_print("    %s\n",aline);

  2677 		if (g_str_has_prefix(aline,"Release Date:"))

  2678 		    g_print("    %s\n",aline);

  2679 		if (g_str_has_prefix(aline,"Edition:"))

  2680 		    g_print("    %s\n\n",aline);

  2681 	    }

  2682 	    continue;		/* skip through the header */

  2683 	}

  2684 	checked_linecnt++;

  2685 	print_pending(aline,parastart,&pending);

  2686 	isemptyline=analyse_quotes(aline,&counters);

  2687 	if (isnewpara && !isemptyline)

  2688 	{

  2689 	    /* This line is the start of a new paragraph. */

  2690 	    start_para_line=linecnt;

  2691 	    /* Capture its first line in case we want to report it later. */

  2692 	    g_free(parastart);

  2693 	    parastart=g_strdup(aline);

  2694 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2695 	    s=aline;

  2696 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  2697 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  2698 		s=g_utf8_next_char(s);

  2699 	    if (g_unichar_islower(g_utf8_get_char(s)))

  2700 	    {

  2701 		/* and its first letter is lowercase */

  2702 		if (pswit[ECHO_SWITCH])

  2703 		    g_print("\n%s\n",aline);

  2704 		if (!pswit[OVERVIEW_SWITCH])

  2705 		    g_print("    Line %ld column %ld - "

  2706 		      "Paragraph starts with lower-case\n",

  2707 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2708 		else

  2709 		    cnt_punct++;

  2710 	    }

  2711 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  2712 	}

  2713 	/* Check for an em-dash broken at line end. */

  2714 	if (enddash && g_utf8_get_char(aline)=='-')

  2715 	{

  2716 	    if (pswit[ECHO_SWITCH])

  2717 		g_print("\n%s\n",aline);

  2718 	    if (!pswit[OVERVIEW_SWITCH])

  2719 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2720 	    else

  2721 		cnt_punct++;

  2722 	}

  2723 	enddash=FALSE;

  2724 	for (s=g_utf8_prev_char(aline+strlen(aline));

  2725 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  2726 	    ;

  2727 	if (s>=aline && g_utf8_get_char(s)=='-')

  2728 	    enddash=TRUE;

  2729 	check_for_control_characters(aline);

  2730 	if (warnings->bin)

  2731 	    check_for_odd_characters(aline,warnings,isemptyline);

  2732 	if (warnings->longline)

  2733 	    check_for_long_line(aline);

  2734 	if (warnings->shortline)

  2735 	    check_for_short_line(aline,&last);

  2736 	last.blen=last.len;

  2737 	last.len=g_utf8_strlen(aline,-1);

  2738 	last.start=g_utf8_get_char(aline);

  2739 	check_for_starting_punctuation(aline);

  2740 	if (warnings->dash)

  2741 	{

  2742 	    check_for_spaced_emdash(aline);

  2743 	    check_for_spaced_dash(aline);

  2744 	}

  2745 	check_for_unmarked_paragraphs(aline);

  2746 	check_for_jeebies(aline);

  2747 	check_for_mta_from(aline);

  2748 	check_for_orphan_character(aline);

  2749 	check_for_pling_scanno(aline);

  2750 	check_for_extra_period(aline,warnings);

  2751 	check_for_following_punctuation(aline);

  2752 	check_for_typos(aline,warnings);

  2753 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2754 	check_for_double_punctuation(aline,warnings);

  2755 	check_for_spaced_quotes(aline);

  2756 	check_for_miscased_genative(aline);

  2757 	check_end_of_line(aline,warnings);

  2758 	check_for_unspaced_bracket(aline);

  2759 	if (warnings->endquote)

  2760 	    check_for_unpunctuated_endquote(aline);

  2761 	check_for_html_tag(aline);

  2762 	check_for_html_entity(aline);

  2763 	if (isemptyline)

  2764 	{

  2765 	    check_for_mismatched_quotes(&counters,&pending);

  2766 	    counters_reset(&counters);

  2767 	    /* let the next iteration know that it's starting a new para */

  2768 	    isnewpara=TRUE;

  2769 	    if (prevline)

  2770 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  2771 	}

  2772 	g_free(prevline);

  2773 	prevline=g_strdup(aline);

  2774     }

  2775     linecnt++;

  2776     check_for_mismatched_quotes(&counters,&pending);

  2777     print_pending(NULL,parastart,&pending);

  2778     reset_pending(&pending);

  2779     if (prevline)

  2780     {

  2781 	g_free(prevline);

  2782 	prevline=NULL;

  2783     }

  2784     g_free(parastart);

  2785     g_free(prevline);

  2786     g_free(etext);

  2787     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  2788 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  2789     g_tree_unref(qword);

  2790     g_tree_unref(qperiod);

  2791     counters_destroy(&counters);

  2792     g_set_print_handler(NULL);

  2793     print_as_windows_1252(NULL);

  2794     if (pswit[MARKUP_SWITCH])

  2795 	loseentities(NULL);

  2796 }

  2798 /*

  2799  * flgets:

  2800  *

  2801  * Get one line from the input text. The setting of newlines has the following

  2802  * effect:

  2803  *

  2804  * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.

  2805  *

  2806  * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as

  2807  *		 the newline character.

  2808  *

  2809  * UNIX_NEWLINES: Check for the presence of CRs.

  2810  *

  2811  * In all cases, check that the last line is correctly terminated.

  2812  *

  2813  * Returns: a pointer to the line.

  2814  */

  2815 char *flgets(char **etext,long lcnt,int newlines)

  2816 {

  2817     gunichar c;

  2818     gboolean isCR=FALSE;

  2819     char *theline=*etext;

  2820     char *eos=theline;

  2821     gchar *s;

  2822     for (;;)

  2823     {

  2824 	c=g_utf8_get_char(*etext);

  2825 	if (!c)

  2826 	{

  2827 	    if (*etext==theline)

  2828 		return NULL;

  2829 	    else if (pswit[LINE_END_SWITCH])

  2830 	    {

  2831 		if (pswit[ECHO_SWITCH])

  2832 		{

  2833 		    s=g_strndup(theline,eos-theline);

  2834 		    g_print("\n%s\n",s);

  2835 		    g_free(s);

  2836 		}

  2837 		if (!pswit[OVERVIEW_SWITCH])

  2838 		{

  2839 		    if (newlines==OS9_NEWLINES)

  2840 			g_print("    Line %ld - No CR?\n",lcnt);

  2841 		    else

  2842 		    {

  2843 			/* There may, or may not, have been a CR */

  2844 			g_print("    Line %ld - No LF?\n",lcnt);

  2845 		    }

  2846 		}

  2847 		else

  2848 		    cnt_lineend++;

  2849 	    }

  2850 	    break;

  2851 	}

  2852 	*etext=g_utf8_next_char(*etext);

  2853 	/* either way, it's end of line */

  2854 	if (c=='\n')

  2855 	{

  2856 	    if (newlines==DOS_NEWLINES && !isCR)

  2857 	    {

  2858 		/* Error - a LF without a preceding CR */

  2859 		if (pswit[LINE_END_SWITCH])

  2860 		{

  2861 		    if (pswit[ECHO_SWITCH])

  2862 		    {

  2863 			s=g_strndup(theline,eos-theline);

  2864 			g_print("\n%s\n",s);

  2865 			g_free(s);

  2866 		    }

  2867 		    if (!pswit[OVERVIEW_SWITCH])

  2868 			g_print("    Line %ld - No CR?\n",lcnt);

  2869 		    else

  2870 			cnt_lineend++;

  2871 		}

  2872 	    }

  2873 	    break;

  2874 	}

  2875 	if (c=='\r')

  2876 	{

  2877 	    if (newlines==OS9_NEWLINES)

  2878 		break;

  2879 	    if (isCR || newlines==UNIX_NEWLINES)

  2880 	    {

  2881 		if (pswit[LINE_END_SWITCH])

  2882 		{

  2883 		    if (pswit[ECHO_SWITCH])

  2884 		    {

  2885 			s=g_strndup(theline,eos-theline);

  2886 			g_print("\n%s\n",s);

  2887 			g_free(s);

  2888 		    }

  2889 		    if (!pswit[OVERVIEW_SWITCH])

  2890 		    {

  2891 			if (newlines==UNIX_NEWLINES)

  2892 			    g_print("    Line %ld column %ld - Embedded CR?\n",

  2893 			      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  2894 			else

  2895 			    g_print("    Line %ld - Two successive CRs?\n",

  2896 			      lcnt);

  2897 		    }

  2898 		    else

  2899 			cnt_lineend++;

  2900 		}

  2901 		if (newlines==UNIX_NEWLINES)

  2902 		    *eos=' ';

  2903 	    }

  2904 	    if (newlines==DOS_NEWLINES)

  2905 		isCR=TRUE;

  2906 	}

  2907 	else

  2908 	{

  2909 	    if (pswit[LINE_END_SWITCH] && isCR)

  2910 	    {

  2911 		if (pswit[ECHO_SWITCH])

  2912 		{

  2913 		    s=g_strndup(theline,eos-theline);

  2914 		    g_print("\n%s\n",s);

  2915 		    g_free(s);

  2916 		}

  2917 		if (!pswit[OVERVIEW_SWITCH])

  2918 		    g_print("    Line %ld column %ld - CR without LF?\n",

  2919 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  2920 		else

  2921 		    cnt_lineend++;

  2922 		*eos=' ';

  2923 	    }

  2924 	    isCR=FALSE;

  2925 	    eos=g_utf8_next_char(eos);

  2926 	}

  2927     }

  2928     *eos='\0';

  2929     if (pswit[MARKUP_SWITCH])

  2930 	postprocess_for_HTML(theline);

  2931     if (pswit[DP_SWITCH])

  2932 	postprocess_for_DP(theline);

  2933     return theline;

  2934 }

  2936 /*

  2937  * mixdigit:

  2938  *

  2939  * Takes a "word" as a parameter, and checks whether it

  2940  * contains a mixture of alpha and digits. Generally, this is an

  2941  * error, but may not be for cases like 4th or L5 12s. 3d.

  2942  *

  2943  * Returns: TRUE iff an is error found.

  2944  */

  2945 gboolean mixdigit(const char *checkword)

  2946 {

  2947     gboolean wehaveadigit,wehavealetter,query;

  2948     const char *s,*nondigit;

  2949     wehaveadigit=wehavealetter=query=FALSE;

  2950     for (s=checkword;*s;s=g_utf8_next_char(s))

  2951 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2952 	    wehavealetter=TRUE;

  2953 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  2954 	    wehaveadigit=TRUE;

  2955     if (wehaveadigit && wehavealetter)

  2956     {

  2957 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2958 	query=TRUE;

  2959 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  2960 	  nondigit=g_utf8_next_char(nondigit))

  2961 	    ;

  2962 	/* digits, ending in st, rd, nd, th of either case */

  2963 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  2964 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  2965 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  2966 	  !g_ascii_strcasecmp(nondigit,"th"))

  2967 	    query=FALSE;

  2968 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  2969 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  2970 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  2971 	  !g_ascii_strcasecmp(nondigit,"ths"))

  2972 	    query=FALSE;

  2973 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  2974 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  2975 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  2976 	  !g_ascii_strcasecmp(nondigit,"thly"))

  2977 	    query=FALSE;

  2978 	/* digits, ending in l, L, s or d */

  2979 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  2980 	  !strcmp(nondigit,"d"))

  2981 	    query=FALSE;

  2982 	/*

  2983 	 * L at the start of a number, representing Britsh pounds, like L500.

  2984 	 * This is cute. We know the current word is mixed digit. If the first

  2985 	 * letter is L, there must be at least one digit following. If both

  2986 	 * digits and letters follow, we have a genuine error, else we have a

  2987 	 * capital L followed by digits, and we accept that as a non-error.

  2988 	 */

  2989 	if (g_utf8_get_char(checkword)=='L' &&

  2990 	  !mixdigit(g_utf8_next_char(checkword)))

  2991 	    query=FALSE;

  2992     }

  2993     return query;

  2994 }

  2996 /*

  2997  * getaword:

  2998  *

  2999  * Extracts the first/next "word" from the line, and returns it.

  3000  * A word is defined as one English word unit--or at least that's the aim.

  3001  * "ptr" is advanced to the position in the line where we will start

  3002  * looking for the next word.

  3003  *

  3004  * Returns: A newly-allocated string.

  3005  */

  3006 gchar *getaword(const char **ptr)

  3007 {

  3008     const char *s,*t;

  3009     GString *word;

  3010     gunichar c,pc;

  3011     word=g_string_new(NULL);

  3012     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  3013       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  3014       **ptr;*ptr=g_utf8_next_char(*ptr))

  3015     {

  3016 	/* Handle exceptions for footnote markers like [1] */

  3017 	if (g_utf8_get_char(*ptr)=='[')

  3018 	{

  3019 	    g_string_append_c(word,'[');

  3020 	    s=g_utf8_next_char(*ptr);

  3021 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))

  3022 		g_string_append_unichar(word,g_utf8_get_char(s));

  3023 	    if (g_utf8_get_char(s)==']')

  3024 	    {

  3025 		g_string_append_c(word,']');

  3026 		*ptr=g_utf8_next_char(s);

  3027 		return g_string_free(word,FALSE);

  3028 	    }

  3029 	    else

  3030 		g_string_truncate(word,0);

  3031 	}

  3032     }

  3033     /*

  3034      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  3035      * Especially yucky is the case of L1,000

  3036      * This section looks for a pattern of characters including a digit

  3037      * followed by a comma or period followed by one or more digits.

  3038      * If found, it returns this whole pattern as a word; otherwise we discard

  3039      * the results and resume our normal programming.

  3040      */

  3041     s=*ptr;

  3042     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  3043       g_unichar_isalpha(g_utf8_get_char(s)) ||

  3044       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  3045 	g_string_append_unichar(word,g_utf8_get_char(s));

  3046     if (word->len)

  3047     {

  3048 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))

  3049 	{

  3050 	    c=g_utf8_get_char(t);

  3051 	    pc=g_utf8_get_char(g_utf8_prev_char(t));

  3052 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  3053 	    {

  3054 		*ptr=s;

  3055 		return g_string_free(word,FALSE);

  3056 	    }

  3057 	}

  3058     }

  3059     /* we didn't find a punctuated number - do the regular getword thing */

  3060     g_string_truncate(word,0);

  3061     c=g_utf8_get_char(*ptr);

  3062     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);

  3063       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))

  3064 	g_string_append_unichar(word,c);

  3065     return g_string_free(word,FALSE);

  3066 }

  3068 /*

  3069  * isroman:

  3070  *

  3071  * Is this word a Roman Numeral?

  3072  *

  3073  * It doesn't actually validate that the number is a valid Roman Numeral--for

  3074  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  3075  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  3076  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  3077  * expressions thereof, except when it came to taxes. Allow any number of M,

  3078  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  3079  * XL or an optional XC, an optional IX or IV, an optional V and any number

  3080  * of optional Is.

  3081  */

  3082 gboolean isroman(const char *t)

  3083 {

  3084     const char *s;

  3085     if (!t || !*t)

  3086 	return FALSE;

  3087     s=t;

  3088     while (g_utf8_get_char(t)=='m' && *t)

  3089 	t++;

  3090     if (g_utf8_get_char(t)=='d')

  3091 	t++;

  3092     if (g_str_has_prefix(t,"cm"))

  3093 	t+=2;

  3094     if (g_str_has_prefix(t,"cd"))

  3095 	t+=2;

  3096     while (g_utf8_get_char(t)=='c' && *t)

  3097 	t++;

  3098     if (g_str_has_prefix(t,"xl"))

  3099 	t+=2;

  3100     if (g_str_has_prefix(t,"xc"))

  3101 	t+=2;

  3102     if (g_utf8_get_char(t)=='l')

  3103 	t++;

  3104     while (g_utf8_get_char(t)=='x' && *t)

  3105 	t++;

  3106     if (g_str_has_prefix(t,"ix"))

  3107 	t+=2;

  3108     if (g_str_has_prefix(t,"iv"))

  3109 	t+=2;

  3110     if (g_utf8_get_char(t)=='v')

  3111 	t++;

  3112     while (g_utf8_get_char(t)=='i' && *t)

  3113 	t++;

  3114     return !*t;

  3115 }

  3117 /*

  3118  * postprocess_for_DP:

  3119  *

  3120  * Invoked with the -d switch from flgets().

  3121  * It simply "removes" from the line a hard-coded set of common

  3122  * DP-specific tags, so that the line passed to the main routine has

  3123  * been pre-cleaned of DP markup.

  3124  */

  3125 void postprocess_for_DP(char *theline)

  3126 {

  3127     char *s,*t;

  3128     int i;

  3129     if (!*theline)

  3130 	return;

  3131     for (i=0;*DPmarkup[i];i++)

  3132 	while ((s=strstr(theline,DPmarkup[i])))

  3133 	{

  3134 	    t=s+strlen(DPmarkup[i]);

  3135 	    memmove(s,t,strlen(t)+1);

  3136 	}

  3137 }

  3139 /*

  3140  * postprocess_for_HTML:

  3141  *

  3142  * Invoked with the -m switch from flgets().

  3143  * It simply "removes" from the line a hard-coded set of common

  3144  * HTML tags and "replaces" a hard-coded set of common HTML

  3145  * entities, so that the line passed to the main routine has

  3146  * been pre-cleaned of HTML.

  3147  */

  3148 void postprocess_for_HTML(char *theline)

  3149 {

  3150     while (losemarkup(theline))

  3151 	;

  3152     loseentities(theline);

  3153 }

  3155 char *losemarkup(char *theline)

  3156 {

  3157     char *s,*t;

  3158     int i;

  3159     s=strchr(theline,'<');

  3160     t=s?strchr(s,'>'):NULL;

  3161     if (!s || !t)

  3162 	return NULL;

  3163     for (i=0;*markup[i];i++)

  3164 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  3165 	{

  3166 	    t=g_utf8_next_char(t);

  3167 	    memmove(s,t,strlen(t)+1);

  3168 	    return s;

  3169 	}

  3170     /* It's an unrecognized <xxx>. */

  3171     return NULL;

  3172 }

  3174 void loseentities(char *theline)

  3175 {

  3176     int i;

  3177     gsize nb;

  3178     char *amp,*scolon;

  3179     gchar *s,*t;

  3180     gunichar c;

  3181     GTree *entities=NULL;

  3182     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3183     if (!theline)

  3184     {

  3185 	if (entities)

  3186 	    g_tree_destroy(entities);

  3187 	entities=NULL;

  3188 	if (translit!=(GIConv)-1)

  3189 	    g_iconv_close(translit);

  3190 	translit=(GIConv)-1;

  3191 	if (to_utf8!=(GIConv)-1)

  3192 	    g_iconv_close(to_utf8);

  3193 	to_utf8=(GIConv)-1;

  3194 	return;

  3195     }

  3196     if (!*theline)

  3197 	return;

  3198     if (!entities)

  3199     {

  3200 	entities=g_tree_new((GCompareFunc)strcmp);

  3201 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3202 	    g_tree_insert(entities,HTMLentities[i].name,

  3203 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3204     }

  3205     if (translit==(GIConv)-1)

  3206 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3207     if (to_utf8==(GIConv)-1)

  3208 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3209     while((amp=strchr(theline,'&')))

  3210     {

  3211 	scolon=strchr(amp,';');

  3212 	if (scolon)

  3213 	{

  3214 	    if (amp[1]=='#')

  3215 	    {

  3216 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3217 		    c=strtol(amp+2,NULL,10);

  3218 		else if (amp[2]=='x' &&

  3219 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3220 		    c=strtol(amp+3,NULL,16);

  3221 	    }

  3222 	    else

  3223 	    {

  3224 		s=g_strndup(amp+1,scolon-(amp+1));

  3225 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3226 		g_free(s);

  3227 	    }

  3228 	}

  3229 	else

  3230 	    c=0;

  3231 	if (c)

  3232 	{

  3233 	    theline=amp;

  3234 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3235 		theline+=g_unichar_to_utf8(c,theline);

  3236 	    else

  3237 	    {

  3238 		s=g_malloc(6);

  3239 		nb=g_unichar_to_utf8(c,s);

  3240 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3241 		g_free(s);

  3242 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3243 		g_free(t);

  3244 		memcpy(theline,s,nb);

  3245 		g_free(s);

  3246 		theline+=nb;

  3247 	    }

  3248 	    memmove(theline,g_utf8_next_char(scolon),

  3249 	      strlen(g_utf8_next_char(scolon))+1);

  3250 	}

  3251 	else

  3252 	    theline=g_utf8_next_char(amp);

  3253     }

  3254 }

  3256 gboolean tagcomp(const char *strin,const char *basetag)

  3257 {

  3258     gboolean retval;

  3259     gchar *s,*t;

  3260     if (g_utf8_get_char(strin)=='/')

  3261 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3262     else

  3263 	t=g_utf8_casefold(strin,-1);

  3264     s=g_utf8_casefold(basetag,-1);

  3265     retval=g_str_has_prefix(t,s);

  3266     g_free(s);

  3267     g_free(t);

  3268     return retval;

  3269 }

  3271 void proghelp(GOptionContext *context)

  3272 {

  3273     gchar *help;

  3274     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3275     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3276     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3277     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3278       "For details, read the file COPYING.\n",stderr);

  3279     fputs("This is Free Software; "

  3280       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3281     fputs("read the file COPYING for details.\n\n",stderr);

  3282     help=g_option_context_get_help(context,TRUE,NULL);

  3283     fputs(help,stderr);

  3284     g_free(help);

  3285     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3286     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3287       "non-ASCII\n",stderr);

  3288     fputs("characters like accented letters, "

  3289       "lines longer than 75 or shorter than 55,\n",stderr);

  3290     fputs("unbalanced quotes or brackets, "

  3291       "a variety of badly formatted punctuation, \n",stderr);

  3292     fputs("HTML tags, some likely typos. "

  3293       "It is NOT a substitute for human judgement.\n",stderr);

  3294     fputs("\n",stderr);

  3295 }

author	ali <ali@juiblex.co.uk>
	Sat Oct 26 18:47:33 2013 +0100 (2013-10-26)
changeset 101	f44c530f80da
parent 100	ad92d11d59b8
child 102	ff0aa9b1397a
permissions	-rw-r--r--