bookloupe-testing: bookloupe/bookloupe.c@f3c293593d44

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "bookloupe.h"

    31 #include "counters.h"

    32 #include "pending.h"

    33 #include "HTMLentities.h"

    35 gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */

    36 GIConv charset_validator=(GIConv)-1;

    38 gchar *prevline;

    /*
     * Common typos.
     *
     * Lower-case words that are queried whenever they appear in the text.
     * The list is terminated by an empty string.
     */
    char *typo[] = {
        "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
        "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
        "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
        "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
        "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
        "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
        "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
        "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
        "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
        "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
        "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
        "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
        "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
        "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
        "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
        "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
        "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
        "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
        "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
        "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
        "ytou", "yuor",
        /* look-alike letter substitutions - presumably OCR scannos */
        "abead", "ahle", "ahout", "ahove", "altbough", "balf",
        "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
        "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
        "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
        "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
        "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
        "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
        "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
        "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
        "se",
        ""      /* end of list */
    };

    74 GTree *usertypo;

    /*
     * Words that must not be queried as typos: common abbreviations and a
     * few other OK words. The list is terminated by an empty string.
     */
    char *okword[] = {
        "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr",
        "hmm", "h'm", "hmmm", "rd", "sh", "br", "pp", "hm",
        "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
        "pompeii", "hawaii", "hawaiian", "hotbed", "heartbeat", "heartbeats",
        "outbid", "outbids", "frostbite", "frostbitten",
        ""      /* end of list */
    };

    /*
     * Abbreviations that account for a period which would otherwise be
     * unexplained. The list is terminated by an empty string.
     */
    char *abbrev[] = {
        "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc",
        "op", "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
        ""      /* end of list */
    };

    /*
     * Letter pairs that hardly ever begin a word, yet turn up as scannos
     * or as otherwise common letter combinations.
     * The list is terminated by an empty string.
     */
    char *nostart[] = {
        "hr", "hl", "cb", "sb", "tb", "wb",
        "tl", "tn", "rn", "lt", "tj",
        ""      /* end of list */
    };

    /*
     * Letter pairs that hardly ever finish a word, yet turn up as scannos
     * or as otherwise common letter combinations.
     * The list is terminated by an empty string.
     */
    char *noend[] = {
        "cb", "gb", "pb", "sb", "tb", "wh",
        "fr", "br", "qu", "tw", "gl", "fl",
        "sw", "gr", "sl", "cl", "iy",
        ""      /* end of list */
    };

   /*
    * HTML element names - presumably the tags that the markup checks treat
    * as known (confirm against the users of this table).
    * The list is terminated by an empty string.
    */
   char *markup[] = {
       "a", "b", "big", "blockquote", "body", "br", "center", "col", "div",
       "em", "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html",
       "i", "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong",
       "sub", "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u",
       "ul",
       ""      /* end of list */
   };

   /*
    * Markup strings specific to DP (see the --dp option, "Ignore
    * DP-specific markup"). The list is terminated by an empty string.
    */
   char *DPmarkup[] = {
       "<sc>", "</sc>",
       "/*", "*/",
       "/#", "#/",
       "/$", "$/",
       "<tb>",
       ""      /* end of list */
   };

   /*
    * NOTE(review): judging by the name, words that are not expected to be
    * followed by a comma - confirm against the code that consults it.
    * The list is terminated by an empty string.
    */
   char *nocomma[] = {
       "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its",
       "whose", "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs",
       "ft", "pm", "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs",
       "ltd", "i'm", "during", "let", "toward", "among",
       ""      /* end of list */
   };

   /*
    * NOTE(review): judging by the name, words that are not expected to be
    * followed by a period - confirm against the code that consults it.
    * The list is terminated by an empty string.
    */
   char *noperiod[] = {
       "every", "i'm", "during", "that's", "their", "your", "our", "my",
       "or", "and", "but", "as", "if", "the", "its", "it's", "until", "than",
       "whether", "i'll", "whose", "who", "because", "when", "let", "till",
       "very", "an", "among", "those", "into", "whom", "having", "thence",
       ""      /* end of list */
   };

   132 gboolean pswit[SWITNO];  /* program switches */

   133 gchar *opt_charset;

   135 static GOptionEntry options[]={

   136     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   137       "Ignore DP-specific markup", NULL },

   138     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   139       "Don't echo queried line", NULL },

   140     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   141       "Check single quotes", NULL },

   142     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   143       "Check common typos", NULL },

   144     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   145       "Require closure of quotes on every paragraph", NULL },

   146     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   147       "Disable paranoid querying of everything", NULL },

   148     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   149       "Disable line end checking", NULL },

   150     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   151       "Overview: just show counts", NULL },

   152     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   153       "Output errors to stdout instead of stderr", NULL },

   154     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   155       "Echo header fields", NULL },

   156     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   157       "Ignore markup in < >", NULL },

   158     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   159       "Use file of user-defined typos", NULL },

   160     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   161       "Defaults for use on www upload", NULL },

   162     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   163       "Verbose - list everything", NULL },

   164     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,

   165       "Set of characters valid for this ebook", "NAME" },

   166     { NULL }

   167 };

   /* Query counts, totalled by category for overview mode. */
   long cnt_quote;          /* quote queries */
   long cnt_brack;          /* brackets queries */
   long cnt_bin;            /* non-ASCII queries */
   long cnt_odd;            /* odd character queries */
   long cnt_long;           /* long line errors */
   long cnt_short;          /* short line queries */
   long cnt_punct;          /* punctuation and spacing queries */
   long cnt_dash;           /* dash-related queries */
   long cnt_word;           /* word queries */
   long cnt_html;           /* html queries */
   long cnt_lineend;        /* line-end queries */

   /* Line counts. */
   long cnt_spacend;        /* lines with space at end */
   long linecnt;            /* total lines in the file */
   long checked_linecnt;    /* lines actually checked */

   185 void proghelp(GOptionContext *context);

   186 void procfile(const char *);

   188 gchar *running_from;

   190 gboolean mixdigit(const char *);

   191 gchar *getaword(const char **);

   192 char *flgets(char **,long);

   193 void postprocess_for_HTML(char *);

   194 char *linehasmarkup(char *);

   195 char *losemarkup(char *);

   196 gboolean tagcomp(const char *,const char *);

   197 void loseentities(char *);

   198 gboolean isroman(const char *);

   199 void postprocess_for_DP(char *);

   200 void print_as_windows_1252(const char *string);

   201 void print_as_utf_8(const char *string);

   203 GTree *qword,*qperiod;

   205 #ifdef __WIN32__

   206 UINT saved_cp;

   207 #endif

   209 gboolean set_charset(const char *name,GError **err)

   210 {

   211     /* The various UNICODE encodings all share the same character set. */

   212     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",

   213       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",

   214       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",

   215       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",

   216       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };

   217     int i;

   218     if (charset)

   219 	g_free(charset);

   220     if (charset_validator==(GIConv)-1)

   221 	g_iconv_close(charset_validator);

   222     if (!name || !g_strcasecmp(name,"auto"))

   223     {

   224 	charset=NULL;

   225 	charset_validator=(GIConv)-1;

   226 	return TRUE;

   227     }

   228     else

   229 	charset=g_strdup(name);

   230     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)

   231 	if (!g_strcasecmp(charset,unicode_aliases[i]))

   232 	{

   233 	    g_free(charset);

   234 	    charset=g_strdup("UTF-8");

   235 	    break;

   236 	}

   237     if (!strcmp(charset,"UTF-8"))

   238 	charset_validator=(GIConv)-1;

   239     else

   240     {

   241 	charset_validator=g_iconv_open(charset,"UTF-8");

   242 	if (charset_validator==(GIConv)-1)

   243 	{

   244 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,

   245 	      "Unknown character set \"%s\"",charset);

   246 	    return FALSE;

   247 	}

   248     }

   249     return TRUE;

   250 }

   252 void parse_options(int *argc,char ***argv)

   253 {

   254     GError *err=NULL;

   255     GOptionContext *context;

   256     context=g_option_context_new(

   257       "file - looks for errors in Project Gutenberg(TM) etexts");

   258     g_option_context_add_main_entries(context,options,NULL);

   259     if (!g_option_context_parse(context,argc,argv,&err))

   260     {

   261 	g_printerr("Bookloupe: %s\n",err->message);

   262 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   263 	exit(1);

   264     }

   265     /* Paranoid checking is turned OFF, not on, by its switch */

   266     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   267     if (pswit[PARANOID_SWITCH])

   268 	/* if running in paranoid mode, typo checks default to enabled */

   269 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   270     /* Line-end checking is turned OFF, not on, by its switch */

   271     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];

   272     /* Echoing is turned OFF, not on, by its switch */

   273     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];

   274     if (pswit[OVERVIEW_SWITCH])

   275 	/* just print summary; don't echo */

   276 	pswit[ECHO_SWITCH]=FALSE;

   277     /*

   278      * Web uploads - for the moment, this is really just a placeholder

   279      * until we decide what processing we really want to do on web uploads

   280      */

   281     if (pswit[WEB_SWITCH])

   282     {

   283 	/* specific override for web uploads */

   284 	pswit[ECHO_SWITCH]=TRUE;

   285 	pswit[SQUOTE_SWITCH]=FALSE;

   286 	pswit[TYPO_SWITCH]=TRUE;

   287 	pswit[QPARA_SWITCH]=FALSE;

   288 	pswit[PARANOID_SWITCH]=TRUE;

   289 	pswit[LINE_END_SWITCH]=FALSE;

   290 	pswit[OVERVIEW_SWITCH]=FALSE;

   291 	pswit[STDOUT_SWITCH]=FALSE;

   292 	pswit[HEADER_SWITCH]=TRUE;

   293 	pswit[VERBOSE_SWITCH]=FALSE;

   294 	pswit[MARKUP_SWITCH]=FALSE;

   295 	pswit[USERTYPO_SWITCH]=FALSE;

   296 	pswit[DP_SWITCH]=FALSE;

   297     }

   298     if (opt_charset && !set_charset(opt_charset,&err))

   299     {

   300 	g_printerr("%s\n",err->message);

   301 	exit(1);

   302     }

   303     g_free(opt_charset);

   304     opt_charset=NULL;

   305     if (*argc<2)

   306     {

   307 	proghelp(context);

   308 	exit(1);

   309     }

   310     g_option_context_free(context);

   311 }

   313 /*

   314  * read_user_scannos:

   315  *

   316  * Read in the user-defined stealth scanno list.

   317  */

   318 void read_user_scannos(void)

   319 {

   320     GError *err=NULL;

   321     gchar *usertypo_file;

   322     gboolean okay;

   323     int i;

   324     gsize len,nb;

   325     gchar *contents,*utf8,**lines;

   326     usertypo_file=g_strdup("bookloupe.typ");

   327     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   328     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   329     {

   330 	g_clear_error(&err);

   331 	g_free(usertypo_file);

   332 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   333 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   334     }

   335     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   336     {

   337 	g_clear_error(&err);

   338 	g_free(usertypo_file);

   339 	usertypo_file=g_strdup("gutcheck.typ");

   340 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   341     }

   342     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   343     {

   344 	g_clear_error(&err);

   345 	g_free(usertypo_file);

   346 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   347 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   348     }

   349     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   350     {

   351 	g_free(usertypo_file);

   352 	g_print("   --> I couldn't find bookloupe.typ "

   353 	  "-- proceeding without user typos.\n");

   354 	return;

   355     }

   356     else if (!okay)

   357     {

   358 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   359 	g_free(usertypo_file);

   360 	g_clear_error(&err);

   361 	exit(1);

   362     }

   363     if (g_utf8_validate(contents,len,NULL))

   364     {

   365 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   366 	if (!charset)

   367 	    (void)set_charset("UNICODE",NULL);

   368     }

   369     else

   370 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   371     g_free(contents);

   372     lines=g_strsplit_set(utf8,"\r\n",0);

   373     g_free(utf8);

   374     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   375     for (i=0;lines[i];i++)

   376 	if (*(unsigned char *)lines[i]>'!')

   377 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   378 	else

   379 	    g_free(lines[i]);

   380     g_free(lines);

   381 }

   383 /*

   384  * read_etext:

   385  *

   386  * Read an etext returning a newly allocated string containing the file

   387  * contents or NULL on error.

   388  */

   389 gchar *read_etext(const char *filename,GError **err)

   390 {

   391     GError *tmp_err=NULL;

   392     gchar *contents,*utf8;

   393     gsize len,bytes_read,bytes_written;

   394     int i,line,col;

   395     if (!g_file_get_contents(filename,&contents,&len,err))

   396 	return NULL;

   397     if (g_utf8_validate(contents,len,NULL))

   398     {

   399 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   400 	g_set_print_handler(print_as_utf_8);

   401 #ifdef __WIN32__

   402 	SetConsoleOutputCP(CP_UTF8);

   403 #endif

   404     }

   405     else

   406     {

   407 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   408 	  &bytes_written,&tmp_err);

   409 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   410 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   411 	{

   412 	    line=col=1;

   413 	    for(i=0;i<bytes_read;i++)

   414 		if (contents[i]=='\n')

   415 		{

   416 		    line++;

   417 		    col=1;

   418 		}

   419 		else if (contents[i]!='\r')

   420 		    col++;

   421 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   422 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   423 	      "valid Windows-1252 character",

   424 	      ((unsigned char *)contents)[bytes_read],line,col);

   425 	}

   426 	else if (tmp_err)

   427 	    g_propagate_error(err,tmp_err);

   428 	g_set_print_handler(print_as_windows_1252);

   429 #ifdef __WIN32__

   430 	SetConsoleOutputCP(1252);

   431 #endif

   432     }

   433     g_free(contents);

   434     return utf8;

   435 }

   /*
    * cleanup_on_exit:
    *
    * Exit handler (registered with atexit() by main). On Windows it puts
    * back the console output code page saved in saved_cp; on other
    * platforms it does nothing.
    */
   void cleanup_on_exit(void)
   {
   #ifdef __WIN32__
       SetConsoleOutputCP(saved_cp);
   #endif
   }

   444 int main(int argc,char **argv)

   445 {

   446 #ifdef __WIN32__

   447     atexit(cleanup_on_exit);

   448     saved_cp=GetConsoleOutputCP();

   449 #endif

   450     running_from=g_path_get_dirname(argv[0]);

   451     parse_options(&argc,&argv);

   452     if (pswit[USERTYPO_SWITCH])

   453 	read_user_scannos();

   454     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   455     procfile(argv[1]);

   456     if (pswit[OVERVIEW_SWITCH])

   457     {

   458 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   459 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   460 	g_print("    --------------- Queries found --------------\n");

   461 	if (cnt_long)

   462 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   463 	if (cnt_short)

   464 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   465 	if (cnt_lineend)

   466 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   467 	if (cnt_word)

   468 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   469 	if (cnt_quote)

   470 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);

   471 	if (cnt_brack)

   472 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   473 	if (cnt_bin)

   474 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   475 	if (cnt_odd)

   476 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   477 	if (cnt_punct)

   478 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   479 	if (cnt_dash)

   480 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   481 	if (cnt_html)

   482 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   483 	g_print("\n");

   484 	g_print("    TOTAL QUERIES		  %14ld\n",

   485 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+

   486 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);

   487     }

   488     g_free(running_from);

   489     if (usertypo)

   490 	g_tree_unref(usertypo);

   491     set_charset(NULL,NULL);

   492     return 0;

   493 }

   495 /*

   496  * first_pass:

   497  *

   498  * Run a first pass - verify that it's a valid PG

   499  * file, decide whether to report some things that

   500  * occur many times in the text like long or short

   501  * lines, non-standard dashes, etc.

   502  */

   503 struct first_pass_results *first_pass(const char *etext)

   504 {

   505     gunichar laststart=CHAR_SPACE;

   506     const char *s;

   507     gchar *lc_line;

   508     int i,j,lbytes,llen;

   509     gchar **lines;

   510     unsigned int lastlen=0,lastblen=0;

   511     long spline=0,nspline=0;

   512     static struct first_pass_results results={0};

   513     gchar *inword;

   514     lines=g_strsplit(etext,"\n",0);

   515     for (j=0;lines[j];j++)

   516     {

   517 	lbytes=strlen(lines[j]);

   518 	while (lbytes>0 && lines[j][lbytes-1]=='\r')

   519 	    lines[j][--lbytes]='\0';

   520 	llen=g_utf8_strlen(lines[j],lbytes);

   521 	linecnt++;

   522 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&

   523 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))

   524 	{

   525 	    if (spline)

   526 		g_print("   --> Duplicate header?\n");

   527 	    spline=linecnt+1;   /* first line of non-header text, that is */

   528 	}

   529 	if (!strncmp(lines[j],"*** START",9) &&

   530 	  strstr(lines[j],"PROJECT GUTENBERG"))

   531 	{

   532 	    if (nspline)

   533 		g_print("   --> Duplicate header?\n");

   534 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   535 	}

   536 	if (spline || nspline)

   537 	{

   538 	    lc_line=g_utf8_strdown(lines[j],lbytes);

   539 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))

   540 	    {

   541 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))

   542 		{

   543 		    if (results.footerline)

   544 		    {

   545 			/* it's an old-form header - we can detect duplicates */

   546 			if (!nspline)

   547 			    g_print("   --> Duplicate footer?\n");

   548 		    }

   549 		    else

   550 			results.footerline=linecnt;

   551 		}

   552 	    }

   553 	    g_free(lc_line);

   554 	}

   555 	if (spline)

   556 	    results.firstline=spline;

   557 	if (nspline)

   558 	    results.firstline=nspline;  /* override with new */

   559 	if (results.footerline)

   560 	    continue;    /* don't count the boilerplate in the footer */

   561 	results.totlen+=llen;

   562 	for (s=lines[j];*s;s=g_utf8_next_char(s))

   563 	{

   564 	    if (g_utf8_get_char(s)>127)

   565 		results.binlen++;

   566 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

   567 		results.alphalen++;

   568 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&

   569 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))

   570 		results.endquote_count++;

   571 	}

   572 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&

   573 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   574 	    results.shortline++;

   575 	if (lbytes>0 &&

   576 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)

   577 	    cnt_spacend++;

   578 	if (strstr(lines[j],".,"))

   579 	    results.dotcomma++;

   580 	/* only count ast lines for ignoring purposes where there is */

   581 	/* locase text on the line */

   582 	if (strchr(lines[j],'*'))

   583 	{

   584 	    for (s=lines[j];*s;s=g_utf8_next_char(s))

   585 		if (g_unichar_islower(g_utf8_get_char(s)))

   586 		    break;

   587 	    if (*s)

   588 		results.astline++;

   589 	}

   590 	if (strchr(lines[j],'/'))

   591 	    results.fslashline++;

   592 	if (lbytes>0)

   593 	{

   594 	    for (s=g_utf8_prev_char(lines[j]+lbytes);

   595 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;

   596 	      s=g_utf8_prev_char(s))

   597 		;

   598 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&

   599 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

   600 		results.hyphens++;

   601 	}

   602 	if (llen>LONGEST_PG_LINE)

   603 	    results.longline++;

   604 	if (llen>WAY_TOO_LONG)

   605 	    results.verylongline++;

   606 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))

   607 	{

   608 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);

   609 	    if (i>0)

   610 		results.htmcount++;

   611 	    if (strstr(lines[j],"<i>"))

   612 		results.htmcount+=4; /* bonus marks! */

   613 	}

   614 	/* Check for spaced em-dashes */

   615 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))

   616 	{

   617 	    results.emdash++;

   618 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)

   619 		results.space_emdash++;

   620 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)

   621 		/* count of em-dashes with spaces both sides */

   622 		results.non_PG_space_emdash++;

   623 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)

   624 		/* count of PG-type em-dashes with no spaces */

   625 		results.PG_space_emdash++;

   626 	}

   627 	for (s=lines[j];*s;)

   628 	{

   629 	    inword=getaword(&s);

   630 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   631 		results.Dutchcount++;

   632 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   633 		results.Frenchcount++;

   634 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   635 		results.standalone_digit++;

   636 	    g_free(inword);

   637 	}

   638 	/* Check for spaced dashes */

   639 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')

   640 	    results.spacedash++;

   641 	lastblen=lastlen;

   642 	lastlen=llen;

   643 	laststart=lines[j][0];

   644     }

   645     g_strfreev(lines);

   646     return &results;

   647 }

   649 /*

   650  * report_first_pass:

   651  *

   652  * Make some snap decisions based on the first pass results.

   653  */

   654 struct warnings *report_first_pass(struct first_pass_results *results)

   655 {

   656     static struct warnings warnings={0};

   657     if (cnt_spacend>0)

   658 	g_print("   --> %ld lines in this file have white space at end\n",

   659 	  cnt_spacend);

   660     warnings.dotcomma=1;

   661     if (results->dotcomma>5)

   662     {

   663 	warnings.dotcomma=0;

   664 	g_print("   --> %ld lines in this file contain '.,'. "

   665 	  "Not reporting them.\n",results->dotcomma);

   666     }

   667     /*

   668      * If more than 50 lines, or one-tenth, are short,

   669      * don't bother reporting them.

   670      */

   671     warnings.shortline=1;

   672     if (results->shortline>50 || results->shortline*10>linecnt)

   673     {

   674 	warnings.shortline=0;

   675 	g_print("   --> %ld lines in this file are short. "

   676 	  "Not reporting short lines.\n",results->shortline);

   677     }

   678     /*

   679      * If more than 50 lines, or one-tenth, are long,

   680      * don't bother reporting them.

   681      */

   682     warnings.longline=1;

   683     if (results->longline>50 || results->longline*10>linecnt)

   684     {

   685 	warnings.longline=0;

   686 	g_print("   --> %ld lines in this file are long. "

   687 	  "Not reporting long lines.\n",results->longline);

   688     }

   689     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   690     warnings.ast=1;

   691     if (results->astline>10)

   692     {

   693 	warnings.ast=0;

   694 	g_print("   --> %ld lines in this file contain asterisks. "

   695 	  "Not reporting them.\n",results->astline);

   696     }

   697     /*

   698      * If more than 10 lines contain forward slashes,

   699      * don't bother reporting them.

   700      */

   701     warnings.fslash=1;

   702     if (results->fslashline>10)

   703     {

   704 	warnings.fslash=0;

   705 	g_print("   --> %ld lines in this file contain forward slashes. "

   706 	  "Not reporting them.\n",results->fslashline);

   707     }

   708     /*

   709      * If more than 20 lines contain unpunctuated endquotes,

   710      * don't bother reporting them.

   711      */

   712     warnings.endquote=1;

   713     if (results->endquote_count>20)

   714     {

   715 	warnings.endquote=0;

   716 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

   717 	  "Not reporting them.\n",results->endquote_count);

   718     }

   719     /*

   720      * If more than 15 lines contain standalone digits,

   721      * don't bother reporting them.

   722      */

   723     warnings.digit=1;

   724     if (results->standalone_digit>10)

   725     {

   726 	warnings.digit=0;

   727 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

   728 	  "Not reporting them.\n",results->standalone_digit);

   729     }

   730     /*

   731      * If more than 20 lines contain hyphens at end,

   732      * don't bother reporting them.

   733      */

   734     warnings.hyphen=1;

   735     if (results->hyphens>20)

   736     {

   737 	warnings.hyphen=0;

   738 	g_print("   --> %ld lines in this file have hyphens at end. "

   739 	  "Not reporting them.\n",results->hyphens);

   740     }

   741     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   742     {

   743 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   744 	pswit[MARKUP_SWITCH]=1;

   745     }

   746     if (results->verylongline>0)

   747 	g_print("   --> %ld lines in this file are VERY long!\n",

   748 	  results->verylongline);

   749     /*

   750      * If there are more non-PG spaced dashes than PG em-dashes,

   751      * assume it's deliberate.

   752      * Current PG guidelines say don't use them, but older texts do,

   753      * and some people insist on them whatever the guidelines say.

   754      */

   755     warnings.dash=1;

   756     if (results->spacedash+results->non_PG_space_emdash>

   757       results->PG_space_emdash)

   758     {

   759 	warnings.dash=0;

   760 	g_print("   --> There are %ld spaced dashes and em-dashes. "

   761 	  "Not reporting them.\n",

   762 	  results->spacedash+results->non_PG_space_emdash);

   763     }

   764     if (charset)

   765 	warnings.bin=0;

   766     else

   767     {

   768 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */

   769 	warnings.bin=1;

   770 	/* If more than a quarter of characters are hi-bit, bug out. */

   771 	if (results->binlen*4>results->totlen)

   772 	{

   773 	    g_print("   --> This file does not appear to be ASCII. "

   774 	      "Terminating. Best of luck with it!\n");

   775 	    exit(1);

   776 	}

   777 	if (results->alphalen*4<results->totlen)

   778 	{

   779 	    g_print("   --> This file does not appear to be text. "

   780 	      "Terminating. Best of luck with it!\n");

   781 	    exit(1);

   782 	}

   783 	if (results->binlen*100>results->totlen || results->binlen>100)

   784 	{

   785 	    g_print("   --> There are a lot of foreign letters here. "

   786 	      "Not reporting them.\n");

   787 	    if (!pswit[VERBOSE_SWITCH])

   788 		warnings.bin=0;

   789 	}

   790     }

   791     warnings.isDutch=FALSE;

   792     if (results->Dutchcount>50)

   793     {

   794 	warnings.isDutch=TRUE;

   795 	g_print("   --> This looks like Dutch - "

   796 	  "switching off dashes and warnings for 's Middags case.\n");

   797     }

   798     warnings.isFrench=FALSE;

   799     if (results->Frenchcount>50)

   800     {

   801 	warnings.isFrench=TRUE;

   802 	g_print("   --> This looks like French - "

   803 	  "switching off some doublepunct.\n");

   804     }

   805     if (results->firstline && results->footerline)

   806 	g_print("    The PG header and footer appear to be already on.\n");

   807     else

   808     {

   809 	if (results->firstline)

   810 	    g_print("    The PG header is on - no footer.\n");

   811 	if (results->footerline)

   812 	    g_print("    The PG footer is on - no header.\n");

   813     }

   814     g_print("\n");

   815     if (pswit[VERBOSE_SWITCH])

   816     {

   817 	warnings.shortline=1;

   818 	warnings.dotcomma=1;

   819 	warnings.longline=1;

   820 	warnings.dash=1;

   821 	warnings.digit=1;

   822 	warnings.ast=1;

   823 	warnings.fslash=1;

   824 	warnings.hyphen=1;

   825 	warnings.endquote=1;

   826 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

   827     }

   828     if (warnings.isDutch)

   829 	warnings.dash=0;

   830     if (results->footerline>0 && results->firstline>0 &&

   831       results->footerline>results->firstline &&

   832       results->footerline-results->firstline<100)

   833     {

   834 	g_print("   --> I don't really know where this text starts. \n");

   835 	g_print("       There are no reference points.\n");

   836 	g_print("       I'm going to have to report the header and footer "

   837 	  "as well.\n");

   838 	results->firstline=0;

   839     }

   840     return &warnings;

   841 }

   843 /*

   844  * analyse_quotes:

   845  *

   846  * Look along the line, accumulate the count of quotes, and see

   847  * if this is an empty line - i.e. a line with nothing on it

   848  * but spaces.

   849  * If line has just spaces, period, * and/or - on it, don't

   850  * count it, since empty lines with asterisks or dashes to

   851  * separate sections are common.

   852  *

   853  * Returns: TRUE if the line is empty.

   854  */

   855 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)

   856 {

   857     int guessquote=0;

   858     /* assume the line is empty until proven otherwise */

   859     gboolean isemptyline=TRUE;

   860     const char *s=aline,*sprev,*snext;

   861     gunichar c;

   862     sprev=NULL;

   863     GError *tmp_err=NULL;

   864     while (*s)

   865     {

   866 	snext=g_utf8_next_char(s);

   867 	c=g_utf8_get_char(s);

   868 	if (CHAR_IS_DQUOTE(c))

   869 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);

   870 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])

   871 	{

   872 	    if (s==aline)

   873 	    {

   874 		/*

   875 		 * At start of line, it can only be a quotation mark.

   876 		 * Hardcode a very common exception!

   877 		 */

   878 		if (!g_str_has_prefix(snext,"tis") &&

   879 		  !g_str_has_prefix(snext,"Tis"))

   880 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

   881 	    }

   882 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

   883 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   884 		/* Do nothing! it's definitely an apostrophe, not a quote */

   885 		;

   886 	    /* it's outside a word - let's check it out */

   887 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||

   888 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   889 	    {

   890 		/* certainly looks like a quotation mark */

   891 		if (!g_str_has_prefix(snext,"tis") &&

   892 		  !g_str_has_prefix(snext,"Tis"))

   893 		    /* hardcode a very common exception! */

   894 		{

   895 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))

   896 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

   897 		    else

   898 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);

   899 		}

   900 	    }

   901 	    else

   902 	    {

   903 		/* now - is it a quotation mark? */

   904 		guessquote=0;   /* accumulate clues */

   905 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

   906 		{

   907 		    /* it follows a letter - could be either */

   908 		    guessquote++;

   909 		    if (g_utf8_get_char(sprev)=='s')

   910 		    {

   911 			/* looks like a plural apostrophe */

   912 			guessquote-=3;

   913 			if (g_utf8_get_char(snext)==CHAR_SPACE)

   914 			    /* bonus marks! */

   915 			    guessquote-=2;

   916 		    }

   917 		    if (innermost_quote_matches(counters,c))

   918 			/*

   919 			 * Give it the benefit of some doubt,

   920 			 * if a squote is already open.

   921 			 */

   922 			guessquote++;

   923 		    else

   924 			guessquote--;

   925 		    if (guessquote>=0)

   926 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);

   927 		}

   928 		else

   929 		    /* no adjacent letter - it must be a quote of some kind */

   930 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

   931 	    }

   932 	}

   933 	if (tmp_err)

   934 	{

   935 	    if (pswit[ECHO_SWITCH])

   936 		g_print("\n%s\n",aline);

   937 	    if (!pswit[OVERVIEW_SWITCH])

   938 		g_print("    Line %ld column %ld - %s\n",

   939 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);

   940 	    g_clear_error(&tmp_err);

   941 	}

   942 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

   943 	  c!='\r' && c!='\n')

   944 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

   945 	if (c==CHAR_UNDERSCORE)

   946 	    counters->c_unders++;

   947 	if (c==CHAR_OPEN_SBRACK)

   948 	{

   949 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&

   950 	      !matching_difference(counters,c) && s==aline &&

   951 	      g_str_has_prefix(s,"[Illustration:"))

   952 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);

   953 	    else

   954 		increment_matching(counters,c,TRUE);

   955 	}

   956 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)

   957 	    increment_matching(counters,c,TRUE);

   958 	if (c==CHAR_CLOSE_SBRACK)

   959 	{

   960 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&

   961 	      !matching_difference(counters,c) && !*snext)

   962 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);

   963 	    else

   964 		increment_matching(counters,c,FALSE);

   965 	}

   966 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)

   967 	    increment_matching(counters,c,FALSE);

   968 	sprev=s;

   969 	s=snext;

   970     }

   971     return isemptyline;

   972 }

   974 /*

   975  * check_for_control_characters:

   976  *

   977  * Check for invalid or questionable characters in the line

   978  * Anything above 127 is invalid for plain ASCII, and

   979  * non-printable control characters should also be flagged.

   980  * Tabs should generally not be there.

   981  */

   982 void check_for_control_characters(const char *aline)

   983 {

   984     gunichar c;

   985     const char *s;

   986     for (s=aline;*s;s=g_utf8_next_char(s))

   987     {

   988 	c=g_utf8_get_char(s);

   989 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

   990 	{

   991 	    if (pswit[ECHO_SWITCH])

   992 		g_print("\n%s\n",aline);

   993 	    if (!pswit[OVERVIEW_SWITCH])

   994 		g_print("    Line %ld column %ld - Control character %u\n",

   995 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

   996 	    else

   997 		cnt_bin++;

   998 	}

   999     }

  1000 }

  1002 /*

  1003  * check_for_odd_characters:

  1004  *

  1005  * Check for binary and other odd characters.

  1006  */

  1007 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1008   gboolean isemptyline)

  1009 {

  1010     /* Don't repeat multiple warnings on one line. */

  1011     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;

  1012     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

  1013     const char *s;

  1014     gunichar c;

  1015     gsize nb;

  1016     gchar *t;

  1017     for (s=aline;*s;s=g_utf8_next_char(s))

  1018     {

  1019 	c=g_utf8_get_char(s);

  1020 	if (warnings->bin && !eInvalidChar &&

  1021 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

  1022 	{

  1023 	    if (pswit[ECHO_SWITCH])

  1024 		g_print("\n%s\n",aline);

  1025 	    if (!pswit[OVERVIEW_SWITCH])

  1026 		if (c>127 && c<160 || c>255)

  1027 		    g_print("    Line %ld column %ld - "

  1028 		      "Non-ISO-8859 character %u\n",

  1029 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1030 		else

  1031 		    g_print("    Line %ld column %ld - "

  1032 		      "Non-ASCII character %u\n",

  1033 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1034 	    else

  1035 		cnt_bin++;

  1036 	    eInvalidChar=TRUE;

  1037 	}

  1038 	if (!eInvalidChar && charset)

  1039 	{

  1040 	    if (charset_validator==(GIConv)-1)

  1041 	    {

  1042 		if (!g_unichar_isdefined(c))

  1043 		{

  1044 		    if (pswit[ECHO_SWITCH])

  1045 			g_print("\n%s\n",aline);

  1046 		    if (!pswit[OVERVIEW_SWITCH])

  1047 			g_print("    Line %ld column %ld - Unassigned UNICODE "

  1048 			  "code point U+%04" G_GINT32_MODIFIER "X\n",

  1049 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1050 		    else

  1051 			cnt_bin++;

  1052 		    eInvalidChar=TRUE;

  1053 		}

  1054 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||

  1055 		  c>=100000 && c<=0x10FFFD)

  1056 		{

  1057 		    if (pswit[ECHO_SWITCH])

  1058 			g_print("\n%s\n",aline);

  1059 		    if (!pswit[OVERVIEW_SWITCH])

  1060 			g_print("    Line %ld column %ld - Private Use "

  1061 			  "character U+%04" G_GINT32_MODIFIER "X\n",

  1062 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1063 		    else

  1064 			cnt_bin++;

  1065 		    eInvalidChar=TRUE;

  1066 		}

  1067 	    }

  1068 	    else

  1069 	    {

  1070 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,

  1071 		  charset_validator,NULL,&nb,NULL);

  1072 		if (t)

  1073 		    g_free(t);

  1074 		else

  1075 		{

  1076 		    if (pswit[ECHO_SWITCH])

  1077 			g_print("\n%s\n",aline);

  1078 		    if (!pswit[OVERVIEW_SWITCH])

  1079 			g_print("    Line %ld column %ld - Non-%s "

  1080 			  "character %u\n",linecnt,

  1081 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);

  1082 		    else

  1083 			cnt_bin++;

  1084 		    eInvalidChar=TRUE;

  1085 		}

  1086 	    }

  1087 	}

  1088 	if (!eTab && c==CHAR_TAB)

  1089 	{

  1090 	    if (pswit[ECHO_SWITCH])

  1091 		g_print("\n%s\n",aline);

  1092 	    if (!pswit[OVERVIEW_SWITCH])

  1093 		g_print("    Line %ld column %ld - Tab character?\n",

  1094 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1095 	    else

  1096 		cnt_odd++;

  1097 	    eTab=TRUE;

  1098 	}

  1099 	if (!eTilde && c==CHAR_TILDE)

  1100 	{

  1101 	    /*

  1102 	     * Often used by OCR software to indicate an

  1103 	     * unrecognizable character.

  1104 	     */

  1105 	    if (pswit[ECHO_SWITCH])

  1106 		g_print("\n%s\n",aline);

  1107 	    if (!pswit[OVERVIEW_SWITCH])

  1108 		g_print("    Line %ld column %ld - Tilde character?\n",

  1109 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1110 	    else

  1111 		cnt_odd++;

  1112 	    eTilde=TRUE;

  1113 	}

  1114 	if (!eCarat && c==CHAR_CARAT)

  1115 	{

  1116 	    if (pswit[ECHO_SWITCH])

  1117 		g_print("\n%s\n",aline);

  1118 	    if (!pswit[OVERVIEW_SWITCH])

  1119 		g_print("    Line %ld column %ld - Carat character?\n",

  1120 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1121 	    else

  1122 		cnt_odd++;

  1123 	    eCarat=TRUE;

  1124 	}

  1125 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

  1126 	{

  1127 	    if (pswit[ECHO_SWITCH])

  1128 		g_print("\n%s\n",aline);

  1129 	    if (!pswit[OVERVIEW_SWITCH])

  1130 		g_print("    Line %ld column %ld - Forward slash?\n",

  1131 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1132 	    else

  1133 		cnt_odd++;

  1134 	    eFSlash=TRUE;

  1135 	}

  1136 	/*

  1137 	 * Report asterisks only in paranoid mode,

  1138 	 * since they're often deliberate.

  1139 	 */

  1140 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1141 	  c==CHAR_ASTERISK)

  1142 	{

  1143 	    if (pswit[ECHO_SWITCH])

  1144 		g_print("\n%s\n",aline);

  1145 	    if (!pswit[OVERVIEW_SWITCH])

  1146 		g_print("    Line %ld column %ld - Asterisk?\n",

  1147 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1148 	    else

  1149 		cnt_odd++;

  1150 	    eAst=TRUE;

  1151 	}

  1152     }

  1153 }

  1155 /*

  1156  * check_for_long_line:

  1157  *

  1158  * Check for line too long.

  1159  */

  1160 void check_for_long_line(const char *aline)

  1161 {

  1162     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1163     {

  1164 	if (pswit[ECHO_SWITCH])

  1165 	    g_print("\n%s\n",aline);

  1166 	if (!pswit[OVERVIEW_SWITCH])

  1167 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1168 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1169 	else

  1170 	    cnt_long++;

  1171     }

  1172 }

  1174 /*

  1175  * check_for_short_line:

  1176  *

  1177  * Check for line too short.

  1178  *

  1179  * This one is a bit trickier to implement: we don't want to

  1180  * flag the last line of a paragraph for being short, so we

  1181  * have to wait until we know that our current line is a

  1182  * "normal" line, then report the _previous_ line if it was too

  1183  * short. We also don't want to report indented lines like

  1184  * chapter heads or formatted quotations. We therefore keep

  1185  * last->len as the length of the last line examined, and

  1186  * last->blen as the length of the last but one, and try to

  1187  * suppress unnecessary warnings by checking that both were of

  1188  * "normal" length. We keep the first character of the last

  1189  * line in last->start, and if it was a space, we assume that

  1190  * the formatting is deliberate. I can't figure out a way to

  1191  * distinguish something like a quoted verse left-aligned or

  1192  * the header or footer of a letter from a paragraph of short

  1193  * lines - maybe if I examined the whole paragraph, and if the

  1194  * para has less than, say, 8 lines and if all lines are short,

  1195  * then just assume it's OK? Need to look at some texts to see

  1196  * how often a formula like this would get the right result.

  1197  */

  1198 void check_for_short_line(const char *aline,const struct line_properties *last)

  1199 {

  1200     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1201       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1202       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1203     {

  1204 	if (pswit[ECHO_SWITCH])

  1205 	    g_print("\n%s\n",prevline);

  1206 	if (!pswit[OVERVIEW_SWITCH])

  1207 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1208 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1209 	else

  1210 	    cnt_short++;

  1211     }

  1212 }

  1214 /*

  1215  * check_for_starting_punctuation:

  1216  *

  1217  * Look for punctuation other than full ellipses at start of line.

  1218  */

  1219 void check_for_starting_punctuation(const char *aline)

  1220 {

  1221     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1222       !g_str_has_prefix(aline,". . ."))

  1223     {

  1224 	if (pswit[ECHO_SWITCH])

  1225 	    g_print("\n%s\n",aline);

  1226 	if (!pswit[OVERVIEW_SWITCH])

  1227 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1228 	      linecnt);

  1229 	else

  1230 	    cnt_punct++;

  1231     }

  1232 }

  1234 /*

  1235  * check_for_spaced_emdash:

  1236  *

  1237  * Check for spaced em-dashes.

  1238  *

  1239  * We must check _all_ occurrences of "--" on the line

  1240  * hence the loop - even if the first double-dash is OK

  1241  * there may be another that's wrong later on.

  1242  */

  1243 void check_for_spaced_emdash(const char *aline)

  1244 {

  1245     const char *s,*t,*next;

  1246     for (s=aline;t=strstr(s,"--");s=next)

  1247     {

  1248 	next=g_utf8_next_char(g_utf8_next_char(t));

  1249 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1250 	  g_utf8_get_char(next)==CHAR_SPACE)

  1251 	{

  1252 	    if (pswit[ECHO_SWITCH])

  1253 		g_print("\n%s\n",aline);

  1254 	    if (!pswit[OVERVIEW_SWITCH])

  1255 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1256 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1257 	    else

  1258 		cnt_dash++;

  1259 	}

  1260     }

  1261 }

  1263 /*

  1264  * check_for_spaced_dash:

  1265  *

  1266  * Check for spaced dashes.

  1267  */

  1268 void check_for_spaced_dash(const char *aline)

  1269 {

  1270     const char *s;

  1271     if ((s=strstr(aline," -")))

  1272     {

  1273 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1274 	{

  1275 	    if (pswit[ECHO_SWITCH])

  1276 		g_print("\n%s\n",aline);

  1277 	    if (!pswit[OVERVIEW_SWITCH])

  1278 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1279 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1280 	    else

  1281 		cnt_dash++;

  1282 	}

  1283     }

  1284     else if ((s=strstr(aline,"- ")))

  1285     {

  1286 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1287 	{

  1288 	    if (pswit[ECHO_SWITCH])

  1289 		g_print("\n%s\n",aline);

  1290 	    if (!pswit[OVERVIEW_SWITCH])

  1291 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1292 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1293 	    else

  1294 		cnt_dash++;

  1295 	}

  1296     }

  1297 }

  1299 /*

  1300  * check_for_unmarked_paragraphs:

  1301  *

  1302  * Check for unmarked paragraphs indicated by separate speakers.

  1303  *

  1304  * May well be false positive:

  1305  * "Bravo!" "Wonderful!" called the crowd.

  1306  * but useful all the same.

  1307  */

  1308 void check_for_unmarked_paragraphs(const char *aline)

  1309 {

  1310     const char *s;

  1311     s=strstr(aline,"\"  \"");

  1312     if (!s)

  1313 	s=strstr(aline,"\" \"");

  1314     if (s)

  1315     {

  1316 	if (pswit[ECHO_SWITCH])

  1317 	    g_print("\n%s\n",aline);

  1318 	if (!pswit[OVERVIEW_SWITCH])

  1319 	    g_print("    Line %ld column %ld - "

  1320 	      "Query missing paragraph break?\n",

  1321 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1322 	else

  1323 	    cnt_punct++;

  1324     }

  1325 }

  1327 /*

  1328  * check_for_jeebies:

  1329  *

  1330  * Check for "to he" and other easy h/b errors.

  1331  *

  1332  * This is a very inadequate effort on the h/b problem,

  1333  * but the phrase "to he" is always an error, whereas "to

  1334  * be" is quite common.

  1335  * Similarly, '"Quiet!", be said.' is a non-be error

  1336  * "to he" is _not_ always an error!:

  1337  *       "Where they went to he couldn't say."

  1338  * Another false positive:

  1339  *       What would "Cinderella" be without the . . .

  1340  * and another: "If he wants to he can see for himself."

  1341  */

  1342 void check_for_jeebies(const char *aline)

  1343 {

  1344     const char *s;

  1345     s=strstr(aline," be could ");

  1346     if (!s)

  1347 	s=strstr(aline," be would ");

  1348     if (!s)

  1349 	s=strstr(aline," was be ");

  1350     if (!s)

  1351 	s=strstr(aline," be is ");

  1352     if (!s)

  1353 	s=strstr(aline," is be ");

  1354     if (!s)

  1355 	s=strstr(aline,"\", be ");

  1356     if (!s)

  1357 	s=strstr(aline,"\" be ");

  1358     if (!s)

  1359 	s=strstr(aline,"\" be ");

  1360     if (!s)

  1361 	s=strstr(aline," to he ");

  1362     if (s)

  1363     {

  1364 	if (pswit[ECHO_SWITCH])

  1365 	    g_print("\n%s\n",aline);

  1366 	if (!pswit[OVERVIEW_SWITCH])

  1367 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1368 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1369 	else

  1370 	    cnt_word++;

  1371     }

  1372     s=strstr(aline," the had ");

  1373     if (!s)

  1374 	s=strstr(aline," a had ");

  1375     if (!s)

  1376 	s=strstr(aline," they bad ");

  1377     if (!s)

  1378 	s=strstr(aline," she bad ");

  1379     if (!s)

  1380 	s=strstr(aline," he bad ");

  1381     if (!s)

  1382 	s=strstr(aline," you bad ");

  1383     if (!s)

  1384 	s=strstr(aline," i bad ");

  1385     if (s)

  1386     {

  1387 	if (pswit[ECHO_SWITCH])

  1388 	    g_print("\n%s\n",aline);

  1389 	if (!pswit[OVERVIEW_SWITCH])

  1390 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1391 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1392 	else

  1393 	    cnt_word++;

  1394     }

  1395     s=strstr(aline,"; hut ");

  1396     if (!s)

  1397 	s=strstr(aline,", hut ");

  1398     if (s)

  1399     {

  1400 	if (pswit[ECHO_SWITCH])

  1401 	    g_print("\n%s\n",aline);

  1402 	if (!pswit[OVERVIEW_SWITCH])

  1403 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1404 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1405 	else

  1406 	    cnt_word++;

  1407     }

  1408 }

  1410 /*

  1411  * check_for_mta_from:

  1412  *

  1413  * Special case - angled bracket in front of "From" placed there by an

  1414  * MTA when sending an e-mail.

  1415  */

  1416 void check_for_mta_from(const char *aline)

  1417 {

  1418     const char *s;

  1419     s=strstr(aline,">From");

  1420     if (s)

  1421     {

  1422 	if (pswit[ECHO_SWITCH])

  1423 	    g_print("\n%s\n",aline);

  1424 	if (!pswit[OVERVIEW_SWITCH])

  1425 	    g_print("    Line %ld column %ld - "

  1426 	      "Query angled bracket with From\n",

  1427 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1428 	else

  1429 	    cnt_punct++;

  1430     }

  1431 }

  1433 /*

  1434  * check_for_orphan_character:

  1435  *

  1436  * Check for a single character line -

  1437  * often an overflow from bad wrapping.

  1438  */

  1439 void check_for_orphan_character(const char *aline)

  1440 {

  1441     gunichar c;

  1442     c=g_utf8_get_char(aline);

  1443     if (c && !*g_utf8_next_char(aline))

  1444     {

  1445 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1446 	    ; /* Nothing - ignore numerals alone on a line. */

  1447 	else

  1448 	{

  1449 	    if (pswit[ECHO_SWITCH])

  1450 		g_print("\n%s\n",aline);

  1451 	    if (!pswit[OVERVIEW_SWITCH])

  1452 		g_print("    Line %ld column 1 - Query single character line\n",

  1453 		  linecnt);

  1454 	    else

  1455 		cnt_punct++;

  1456 	}

  1457     }

  1458 }

  1460 /*

  1461  * check_for_pling_scanno:

  1462  *

  1463  * Check for I" - often should be !

  1464  */

  1465 void check_for_pling_scanno(const char *aline)

  1466 {

  1467     const char *s;

  1468     s=strstr(aline," I\"");

  1469     if (s)

  1470     {

  1471 	if (pswit[ECHO_SWITCH])

  1472 	    g_print("\n%s\n",aline);

  1473 	if (!pswit[OVERVIEW_SWITCH])

  1474 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1475 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1476 	else

  1477 	    cnt_punct++;

  1478     }

  1479 }

  1481 /*

  1482  * check_for_extra_period:

  1483  *

  1484  * Check for period without a capital letter. Cut-down from gutspell.

  1485  * Only works when it happens on a single line.

  1486  */

  1487 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1488 {

  1489     const char *s,*t,*s1,*sprev;

  1490     int i;

  1491     gsize len;

  1492     gboolean istypo;

  1493     gchar *testword;

  1494     gunichar c,nc,pc,*decomposition;

  1495     if (pswit[PARANOID_SWITCH])

  1496     {

  1497 	for (t=aline;t=strstr(t,". ");)

  1498 	{

  1499 	    if (t==aline)

  1500 	    {

  1501 		t=g_utf8_next_char(t);

  1502 		/* start of line punctuation is handled elsewhere */

  1503 		continue;

  1504 	    }

  1505 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1506 	    {

  1507 		t=g_utf8_next_char(t);

  1508 		continue;

  1509 	    }

  1510 	    if (warnings->isDutch)

  1511 	    {

  1512 		/* For Frank & Jeroen -- 's Middags case */

  1513 		gunichar c2,c3,c4,c5;

  1514 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1515 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1516 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1517 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1518 		if (CHAR_IS_APOSTROPHE(c2) &&

  1519 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&

  1520 		  g_unichar_isupper(c5))

  1521 		{

  1522 		    t=g_utf8_next_char(t);

  1523 		    continue;

  1524 		}

  1525 	    }

  1526 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1527 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1528 	      !isdigit(g_utf8_get_char(s1)))

  1529 		s1=g_utf8_next_char(s1);

  1530 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1531 	    {

  1532 		/* we have something to investigate */

  1533 		istypo=TRUE;

  1534 		/* so let's go back and find out */

  1535 		nc=g_utf8_get_char(t);

  1536 		s1=g_utf8_prev_char(t);

  1537 		c=g_utf8_get_char(s1);

  1538 		sprev=g_utf8_prev_char(s1);

  1539 		pc=g_utf8_get_char(sprev);

  1540 		while (s1>=aline &&

  1541 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||

  1542 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&

  1543 		  g_unichar_isalpha(nc)))

  1544 		{

  1545 		    nc=c;

  1546 		    s1=sprev;

  1547 		    c=pc;

  1548 		    sprev=g_utf8_prev_char(s1);

  1549 		    pc=g_utf8_get_char(sprev);

  1550 		}

  1551 		s1=g_utf8_next_char(s1);

  1552 		s=strchr(s1,'.');

  1553 		if (s)

  1554 		    testword=g_strndup(s1,s-s1);

  1555 		else

  1556 		    testword=g_strdup(s1);

  1557 		for (i=0;*abbrev[i];i++)

  1558 		    if (!strcmp(testword,abbrev[i]))

  1559 			istypo=FALSE;

  1560 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1561 		    istypo=FALSE;

  1562 		if (!*g_utf8_next_char(testword))

  1563 		    istypo=FALSE;

  1564 		if (isroman(testword))

  1565 		    istypo=FALSE;

  1566 		if (istypo)

  1567 		{

  1568 		    istypo=FALSE;

  1569 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1570 		    {

  1571 			decomposition=g_unicode_canonical_decomposition(

  1572 			  g_utf8_get_char(s),&len);

  1573 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1574 			    istypo=TRUE;

  1575 			g_free(decomposition);

  1576 		    }

  1577 		}

  1578 		if (istypo &&

  1579 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1580 		{

  1581 		    g_tree_insert(qperiod,g_strdup(testword),

  1582 		      GINT_TO_POINTER(1));

  1583 		    if (pswit[ECHO_SWITCH])

  1584 			g_print("\n%s\n",aline);

  1585 		    if (!pswit[OVERVIEW_SWITCH])

  1586 			g_print("    Line %ld column %ld - Extra period?\n",

  1587 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1588 		    else

  1589 			cnt_punct++;

  1590 		}

  1591 		g_free(testword);

  1592 	    }

  1593 	    t=g_utf8_next_char(t);

  1594 	}

  1595     }

  1596 }

  1598 /*

  1599  * check_for_following_punctuation:

  1600  *

  1601  * Check for words usually not followed by punctuation.

  1602  */

  1603 void check_for_following_punctuation(const char *aline)

  1604 {

  1605     int i;

  1606     const char *s,*wordstart;

  1607     gunichar c;

  1608     gchar *inword,*t;

  1609     if (pswit[TYPO_SWITCH])

  1610     {

  1611 	for (s=aline;*s;)

  1612 	{

  1613 	    wordstart=s;

  1614 	    t=getaword(&s);

  1615 	    if (!*t)

  1616 	    {

  1617 		g_free(t);

  1618 		continue;

  1619 	    }

  1620 	    inword=g_utf8_strdown(t,-1);

  1621 	    g_free(t);

  1622 	    for (i=0;*nocomma[i];i++)

  1623 		if (!strcmp(inword,nocomma[i]))

  1624 		{

  1625 		    c=g_utf8_get_char(s);

  1626 		    if (c==',' || c==';' || c==':')

  1627 		    {

  1628 			if (pswit[ECHO_SWITCH])

  1629 			    g_print("\n%s\n",aline);

  1630 			if (!pswit[OVERVIEW_SWITCH])

  1631 			    g_print("    Line %ld column %ld - "

  1632 			      "Query punctuation after %s?\n",

  1633 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1634 			      inword);

  1635 			else

  1636 			    cnt_punct++;

  1637 		    }

  1638 		}

  1639 	    for (i=0;*noperiod[i];i++)

  1640 		if (!strcmp(inword,noperiod[i]))

  1641 		{

  1642 		    c=g_utf8_get_char(s);

  1643 		    if (c=='.' || c=='!')

  1644 		    {

  1645 			if (pswit[ECHO_SWITCH])

  1646 			    g_print("\n%s\n",aline);

  1647 			if (!pswit[OVERVIEW_SWITCH])

  1648 			    g_print("    Line %ld column %ld - "

  1649 			      "Query punctuation after %s?\n",

  1650 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1651 			      inword);

  1652 			else

  1653 			    cnt_punct++;

  1654 		    }

  1655 		}

  1656 	    g_free(inword);

  1657 	}

  1658     }

  1659 }

  1661 /*

  1662  * check_for_typos:

  1663  *

  1664  * Check for commonly mistyped words,

  1665  * and digits like 0 for O in a word.

  1666  */

  1667 void check_for_typos(const char *aline,struct warnings *warnings)

  1668 {

  1669     const char *s,*t,*nt,*wordstart;

  1670     gchar *inword;

  1671     gunichar *decomposition;

  1672     gchar *testword;

  1673     int i,vowel,consonant,*dupcnt;

  1674     gboolean isdup,istypo,alower;

  1675     gunichar c,pc;

  1676     long offset,len;

  1677     gsize decomposition_len;

  1678     for (s=aline;*s;)

  1679     {

  1680 	wordstart=s;

  1681 	inword=getaword(&s);

  1682 	if (!*inword)

  1683 	{

  1684 	    g_free(inword);

  1685 	    continue; /* don't bother with empty lines */

  1686 	}

  1687 	if (mixdigit(inword))

  1688 	{

  1689 	    if (pswit[ECHO_SWITCH])

  1690 		g_print("\n%s\n",aline);

  1691 	    if (!pswit[OVERVIEW_SWITCH])

  1692 		g_print("    Line %ld column %ld - Query digit in %s\n",

  1693 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  1694 	    else

  1695 		cnt_word++;

  1696 	}

  1697 	/*

  1698 	 * Put the word through a series of tests for likely typos and OCR

  1699 	 * errors.

  1700 	 */

  1701 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1702 	{

  1703 	    istypo=FALSE;

  1704 	    alower=FALSE;

  1705 	    for (t=inword;*t;t=g_utf8_next_char(t))

  1706 	    {

  1707 		c=g_utf8_get_char(t);

  1708 		nt=g_utf8_next_char(t);

  1709 		/* lowercase for testing */

  1710 		if (g_unichar_islower(c))

  1711 		    alower=TRUE;

  1712 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  1713 		{

  1714 		    /*

  1715 		     * We have an uppercase mid-word. However, there are

  1716 		     * common cases:

  1717 		     *   Mac and Mc like McGill

  1718 		     *   French contractions like l'Abbe

  1719 		     */

  1720 		    offset=g_utf8_pointer_to_offset(inword,t);

  1721 		    if (offset>0)

  1722 			pc=g_utf8_get_char(g_utf8_prev_char(t));

  1723 		    else

  1724 			pc='\0';

  1725 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  1726 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  1727 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  1728 		      CHAR_IS_APOSTROPHE(pc))

  1729 			; /* do nothing! */

  1730 		    else

  1731 			istypo=TRUE;

  1732 		}

  1733 	    }

  1734 	    testword=g_utf8_casefold(inword,-1);

  1735 	}

  1736 	if (pswit[TYPO_SWITCH])

  1737 	{

  1738 	    /*

  1739 	     * Check for certain unlikely two-letter combinations at word

  1740 	     * start and end.

  1741 	     */

  1742 	    len=g_utf8_strlen(testword,-1);

  1743 	    if (len>1)

  1744 	    {

  1745 		for (i=0;*nostart[i];i++)

  1746 		    if (g_str_has_prefix(testword,nostart[i]))

  1747 			istypo=TRUE;

  1748 		for (i=0;*noend[i];i++)

  1749 		    if (g_str_has_suffix(testword,noend[i]))

  1750 			istypo=TRUE;

  1751 	    }

  1752 	    /* ght is common, gbt never. Like that. */

  1753 	    if (strstr(testword,"cb"))

  1754 		istypo=TRUE;

  1755 	    if (strstr(testword,"gbt"))

  1756 		istypo=TRUE;

  1757 	    if (strstr(testword,"pbt"))

  1758 		istypo=TRUE;

  1759 	    if (strstr(testword,"tbs"))

  1760 		istypo=TRUE;

  1761 	    if (strstr(testword,"mrn"))

  1762 		istypo=TRUE;

  1763 	    if (strstr(testword,"ahle"))

  1764 		istypo=TRUE;

  1765 	    if (strstr(testword,"ihle"))

  1766 		istypo=TRUE;

  1767 	    /*

  1768 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1769 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1770 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1771 	     * numerals, but "ii" is a common scanno.

  1772 	     */

  1773 	    if (strstr(testword,"tbi"))

  1774 		istypo=TRUE;

  1775 	    if (strstr(testword,"tbe"))

  1776 		istypo=TRUE;

  1777 	    if (strstr(testword,"ii"))

  1778 		istypo=TRUE;

  1779 	    /*

  1780 	     * Check for no vowels or no consonants.

  1781 	     * If none, flag a typo.

  1782 	     */

  1783 	    if (!istypo && len>1)

  1784 	    {

  1785 		vowel=consonant=0;

  1786 		for (t=testword;*t;t=g_utf8_next_char(t))

  1787 		{

  1788 		    c=g_utf8_get_char(t);

  1789 		    decomposition=

  1790 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  1791 		    if (c=='y' || g_unichar_isdigit(c))

  1792 		    {

  1793 			/* Yah, this is loose. */

  1794 			vowel++;

  1795 			consonant++;

  1796 		    }

  1797 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1798 			vowel++;

  1799 		    else

  1800 			consonant++;

  1801 		    g_free(decomposition);

  1802 		}

  1803 		if (!vowel || !consonant)

  1804 		    istypo=TRUE;

  1805 	    }

  1806 	    /*

  1807 	     * Now exclude the word from being reported if it's in

  1808 	     * the okword list.

  1809 	     */

  1810 	    for (i=0;*okword[i];i++)

  1811 		if (!strcmp(testword,okword[i]))

  1812 		    istypo=FALSE;

  1813 	    /*

  1814 	     * What looks like a typo may be a Roman numeral.

  1815 	     * Exclude these.

  1816 	     */

  1817 	    if (istypo && isroman(testword))

  1818 		istypo=FALSE;

  1819 	    /* Check the manual list of typos. */

  1820 	    if (!istypo)

  1821 		for (i=0;*typo[i];i++)

  1822 		    if (!strcmp(testword,typo[i]))

  1823 			istypo=TRUE;

  1824 	    /*

  1825 	     * Check lowercase s, l, i and m - special cases.

  1826 	     *   "j" - often a semi-colon gone wrong.

  1827 	     *   "d" for a missing apostrophe - he d

  1828 	     *   "n" for "in"

  1829 	     */

  1830 	    if (!istypo && len==1 &&

  1831 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  1832 		istypo=TRUE;

  1833 	    if (istypo)

  1834 	    {

  1835 		dupcnt=g_tree_lookup(qword,testword);

  1836 		if (dupcnt)

  1837 		{

  1838 		    (*dupcnt)++;

  1839 		    isdup=!pswit[VERBOSE_SWITCH];

  1840 		}

  1841 		else

  1842 		{

  1843 		    dupcnt=g_new0(int,1);

  1844 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  1845 		    isdup=FALSE;

  1846 		}

  1847 		if (!isdup)

  1848 		{

  1849 		    if (pswit[ECHO_SWITCH])

  1850 			g_print("\n%s\n",aline);

  1851 		    if (!pswit[OVERVIEW_SWITCH])

  1852 		    {

  1853 			g_print("    Line %ld column %ld - Query word %s",

  1854 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  1855 			  inword);

  1856 			if (!pswit[VERBOSE_SWITCH])

  1857 			    g_print(" - not reporting duplicates");

  1858 			g_print("\n");

  1859 		    }

  1860 		    else

  1861 			cnt_word++;

  1862 		}

  1863 	    }

  1864 	}

  1865 	/* check the user's list of typos */

  1866 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  1867 	{

  1868 	    if (pswit[ECHO_SWITCH])

  1869 		g_print("\n%s\n",aline);

  1870 	    if (!pswit[OVERVIEW_SWITCH])

  1871 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  1872 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  1873 	}

  1874 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1875 	    g_free(testword);

  1876 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1877 	{

  1878 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1879 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1880 	    {

  1881 		if (pswit[ECHO_SWITCH])

  1882 		    g_print("\n%s\n",aline);

  1883 		if (!pswit[OVERVIEW_SWITCH])

  1884 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  1885 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  1886 		      inword);

  1887 		else

  1888 		    cnt_word++;

  1889 	    }

  1890 	}

  1891 	g_free(inword);

  1892     }

  1893 }

  1895 /*

  1896  * check_for_misspaced_punctuation:

  1897  *

  1898  * Look for added or missing spaces around punctuation and quotes.

  1899  * If there is a punctuation character like ! with no space on

  1900  * either side, suspect a missing!space. If there are spaces on

  1901  * both sides , assume a typo. If we see a double quote with no

  1902  * space or punctuation on either side of it, assume unspaced

  1903  * quotes "like"this.

  1904  */

  1905 void check_for_misspaced_punctuation(const char *aline,

  1906   struct parities *parities,gboolean isemptyline)

  1907 {

  1908     gboolean isacro,isellipsis;

  1909     const char *s;

  1910     gunichar c,nc,pc,n2c;

  1911     c=g_utf8_get_char(aline);

  1912     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1913     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1914     {

  1915 	pc=c;

  1916 	c=nc;

  1917 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1918 	/* For each character in the line after the first. */

  1919 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  1920 	{

  1921 	    /* we need to suppress warnings for acronyms like M.D. */

  1922 	    isacro=FALSE;

  1923 	    /* we need to suppress warnings for ellipsis . . . */

  1924 	    isellipsis=FALSE;

  1925 	    /*

  1926 	     * If there are letters on both sides of it or

  1927 	     * if it's strict punctuation followed by an alpha.

  1928 	     */

  1929 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  1930 	      g_utf8_strchr("?!,;:",-1,c)))

  1931 	    {

  1932 		if (c=='.')

  1933 		{

  1934 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1935 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1936 			isacro=TRUE;

  1937 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1938 		    if (nc && n2c=='.')

  1939 			isacro=TRUE;

  1940 		}

  1941 		if (!isacro)

  1942 		{

  1943 		    if (pswit[ECHO_SWITCH])

  1944 			g_print("\n%s\n",aline);

  1945 		    if (!pswit[OVERVIEW_SWITCH])

  1946 			g_print("    Line %ld column %ld - Missing space?\n",

  1947 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1948 		    else

  1949 			cnt_punct++;

  1950 		}

  1951 	    }

  1952 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  1953 	    {

  1954 		/*

  1955 		 * If there are spaces on both sides,

  1956 		 * or space before and end of line.

  1957 		 */

  1958 		if (c=='.')

  1959 		{

  1960 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1961 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1962 			isellipsis=TRUE;

  1963 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1964 		    if (nc && n2c=='.')

  1965 			isellipsis=TRUE;

  1966 		}

  1967 		if (!isemptyline && !isellipsis)

  1968 		{

  1969 		    if (pswit[ECHO_SWITCH])

  1970 			g_print("\n%s\n",aline);

  1971 		    if (!pswit[OVERVIEW_SWITCH])

  1972 			g_print("    Line %ld column %ld - "

  1973 			  "Spaced punctuation?\n",linecnt,

  1974 			  g_utf8_pointer_to_offset(aline,s)+1);

  1975 		    else

  1976 			cnt_punct++;

  1977 		}

  1978 	    }

  1979 	}

  1980     }

  1981     /* Split out the characters that CANNOT be preceded by space. */

  1982     c=g_utf8_get_char(aline);

  1983     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1984     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1985     {

  1986 	pc=c;

  1987 	c=nc;

  1988 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1989 	/* for each character in the line after the first */

  1990 	if (g_utf8_strchr("?!,;:",-1,c))

  1991 	{

  1992 	    /* if it's punctuation that _cannot_ have a space before it */

  1993 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  1994 	    {

  1995 		/*

  1996 		 * If nc DOES == space,

  1997 		 * it was already reported just above.

  1998 		 */

  1999 		if (pswit[ECHO_SWITCH])

  2000 		    g_print("\n%s\n",aline);

  2001 		if (!pswit[OVERVIEW_SWITCH])

  2002 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2003 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2004 		else

  2005 		    cnt_punct++;

  2006 	    }

  2007 	}

  2008     }

  2009     /*

  2010      * Special case " .X" where X is any alpha.

  2011      * This plugs a hole in the acronym code above.

  2012      * Inelegant, but maintainable.

  2013      */

  2014     c=g_utf8_get_char(aline);

  2015     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2016     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2017     {

  2018 	pc=c;

  2019 	c=nc;

  2020 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2021 	/* for each character in the line after the first */

  2022 	if (c=='.')

  2023 	{

  2024 	    /* if it's a period */

  2025 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  2026 	    {

  2027 		/*

  2028 		 * If the period follows a space and

  2029 		 * is followed by a letter.

  2030 		 */

  2031 		if (pswit[ECHO_SWITCH])

  2032 		    g_print("\n%s\n",aline);

  2033 		if (!pswit[OVERVIEW_SWITCH])

  2034 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2035 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2036 		else

  2037 		    cnt_punct++;

  2038 	    }

  2039 	}

  2040     }

  2041     c=g_utf8_get_char(aline);

  2042     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2043     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2044     {

  2045 	pc=c;

  2046 	c=nc;

  2047 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2048 	/* for each character in the line after the first */

  2049 	if (c==CHAR_DQUOTE)

  2050 	{

  2051 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  2052 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  2053 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  2054 	    {

  2055 		if (pswit[ECHO_SWITCH])

  2056 		    g_print("\n%s\n",aline);

  2057 		if (!pswit[OVERVIEW_SWITCH])

  2058 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  2059 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2060 		else

  2061 		    cnt_punct++;

  2062 	    }

  2063 	}

  2064     }

  2065     /* Check parity of quotes. */

  2066     nc=g_utf8_get_char(aline);

  2067     for (s=aline;*s;s=g_utf8_next_char(s))

  2068     {

  2069 	c=nc;

  2070 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2071 	if (c==CHAR_DQUOTE)

  2072 	{

  2073 	    parities->dquote=!parities->dquote;

  2074 	    if (!parities->dquote)

  2075 	    {

  2076 		/* parity even */

  2077 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))

  2078 		{

  2079 		    if (pswit[ECHO_SWITCH])

  2080 			g_print("\n%s\n",aline);

  2081 		    if (!pswit[OVERVIEW_SWITCH])

  2082 			g_print("    Line %ld column %ld - "

  2083 			  "Wrongspaced quotes?\n",

  2084 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2085 		    else

  2086 			cnt_punct++;

  2087 		}

  2088 	    }

  2089 	    else

  2090 	    {

  2091 		/* parity odd */

  2092 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2093 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)

  2094 		{

  2095 		    if (pswit[ECHO_SWITCH])

  2096 			g_print("\n%s\n",aline);

  2097 		    if (!pswit[OVERVIEW_SWITCH])

  2098 			g_print("    Line %ld column %ld - "

  2099 			  "Wrongspaced quotes?\n",

  2100 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2101 		    else

  2102 			cnt_punct++;

  2103 		}

  2104 	    }

  2105 	}

  2106     }

  2107     if (g_utf8_get_char(aline)==CHAR_DQUOTE)

  2108     {

  2109 	if (g_utf8_strchr(",;:!?)]} ",-1,

  2110 	  g_utf8_get_char(g_utf8_next_char(aline))))

  2111 	{

  2112 	    if (pswit[ECHO_SWITCH])

  2113 		g_print("\n%s\n",aline);

  2114 	    if (!pswit[OVERVIEW_SWITCH])

  2115 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  2116 		  linecnt);

  2117 	    else

  2118 		cnt_punct++;

  2119 	}

  2120     }

  2121     if (pswit[SQUOTE_SWITCH])

  2122     {

  2123 	nc=g_utf8_get_char(aline);

  2124 	for (s=aline;*s;s=g_utf8_next_char(s))

  2125 	{

  2126 	    c=nc;

  2127 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  2128 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&

  2129 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2130 	      !g_unichar_isalpha(nc)))

  2131 	    {

  2132 		parities->squote=!parities->squote;

  2133 		if (!parities->squote)

  2134 		{

  2135 		    /* parity even */

  2136 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2137 		    {

  2138 			if (pswit[ECHO_SWITCH])

  2139 			    g_print("\n%s\n",aline);

  2140 			if (!pswit[OVERVIEW_SWITCH])

  2141 			    g_print("    Line %ld column %ld - "

  2142 			      "Wrongspaced singlequotes?\n",

  2143 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2144 			else

  2145 			    cnt_punct++;

  2146 		    }

  2147 		}

  2148 		else

  2149 		{

  2150 		    /* parity odd */

  2151 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2152 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2153 		    {

  2154 			if (pswit[ECHO_SWITCH])

  2155 			    g_print("\n%s\n",aline);

  2156 			if (!pswit[OVERVIEW_SWITCH])

  2157 			    g_print("    Line %ld column %ld - "

  2158 			      "Wrongspaced singlequotes?\n",

  2159 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2160 			else

  2161 			    cnt_punct++;

  2162 		    }

  2163 		}

  2164 	    }

  2165 	}

  2166     }

  2167 }

  2169 /*

  2170  * check_for_double_punctuation:

  2171  *

  2172  * Look for double punctuation like ,. or ,,

  2173  * Thanks to DW for the suggestion!

  2174  * In books with references, ".," and ".;" are common

  2175  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2176  * OTOH, from my initial tests, there are also fairly

  2177  * common errors. What to do? Make these cases paranoid?

  2178  * ".," is the most common, so warnings->dotcomma is used

  2179  * to suppress detailed reporting if it occurs often.

  2180  */

  2181 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2182 {

  2183     const char *s;

  2184     gunichar c,nc;

  2185     nc=g_utf8_get_char(aline);

  2186     for (s=aline;*s;s=g_utf8_next_char(s))

  2187     {

  2188 	c=nc;

  2189 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2190 	/* for each punctuation character in the line */

  2191 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2192 	  g_utf8_strchr(".?!,;:",-1,nc))

  2193 	{

  2194 	    /* followed by punctuation, it's a query, unless . . . */

  2195 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||

  2196 	      !warnings->dotcomma && c=='.' && nc==',' ||

  2197 	      warnings->isFrench && g_str_has_prefix(s,",...") ||

  2198 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2199 	      warnings->isFrench && g_str_has_prefix(s,";...") ||

  2200 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2201 	      warnings->isFrench && g_str_has_prefix(s,":...") ||

  2202 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2203 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2204 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2205 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2206 	      warnings->isFrench && g_str_has_prefix(s,"...?"))

  2207 	    {

  2208 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||

  2209 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2210 		  warnings->isFrench && g_str_has_prefix(s,";...") ||

  2211 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2212 		  warnings->isFrench && g_str_has_prefix(s,":...") ||

  2213 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2214 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2215 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2216 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2217 		  warnings->isFrench && g_str_has_prefix(s,"...?"))

  2218 		{

  2219 		    s+=4;

  2220 		    nc=g_utf8_get_char(g_utf8_next_char(s));

  2221 		}

  2222 		; /* do nothing for .. !! and ?? which can be legit */

  2223 	    }

  2224 	    else

  2225 	    {

  2226 		if (pswit[ECHO_SWITCH])

  2227 		    g_print("\n%s\n",aline);

  2228 		if (!pswit[OVERVIEW_SWITCH])

  2229 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2230 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2231 		else

  2232 		    cnt_punct++;

  2233 	    }

  2234 	}

  2235     }

  2236 }

  2238 /*

  2239  * check_for_spaced_quotes:

  2240  */

  2241 void check_for_spaced_quotes(const char *aline)

  2242 {

  2243     int i;

  2244     const char *s,*t;

  2245     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,

  2246       CHAR_RS_QUOTE};

  2247     GString *pattern;

  2248     s=aline;

  2249     while ((t=strstr(s," \" ")))

  2250     {

  2251 	if (pswit[ECHO_SWITCH])

  2252 	    g_print("\n%s\n",aline);

  2253 	if (!pswit[OVERVIEW_SWITCH])

  2254 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2255 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2256 	else

  2257 	    cnt_punct++;

  2258 	s=g_utf8_next_char(g_utf8_next_char(t));

  2259     }

  2260     pattern=g_string_new(NULL);

  2261     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)

  2262     {

  2263 	g_string_assign(pattern," ");

  2264 	g_string_append_unichar(pattern,single_quotes[i]);

  2265 	g_string_append_c(pattern,' ');

  2266 	s=aline;

  2267 	while ((t=strstr(s,pattern->str)))

  2268 	{

  2269 	    if (pswit[ECHO_SWITCH])

  2270 		g_print("\n%s\n",aline);

  2271 	    if (!pswit[OVERVIEW_SWITCH])

  2272 		g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2273 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2274 	    else

  2275 		cnt_punct++;

  2276 	    s=g_utf8_next_char(g_utf8_next_char(t));

  2277 	}

  2278     }

  2279     g_string_free(pattern,TRUE);

  2280 }

  2282 /*

  2283  * check_for_miscased_genative:

  2284  *

  2285  * Check special case of 'S instead of 's at end of word.

  2286  */

  2287 void check_for_miscased_genative(const char *aline)

  2288 {

  2289     const char *s;

  2290     gunichar c,nc,pc;

  2291     if (!*aline)

  2292 	return;

  2293     c=g_utf8_get_char(aline);

  2294     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2295     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2296     {

  2297 	pc=c;

  2298 	c=nc;

  2299 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2300 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))

  2301 	{

  2302 	    if (pswit[ECHO_SWITCH])

  2303 		g_print("\n%s\n",aline);

  2304 	    if (!pswit[OVERVIEW_SWITCH])

  2305 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2306 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2307 	    else

  2308 		cnt_punct++;

  2309 	}

  2310     }

  2311 }

  2313 /*

  2314  * check_end_of_line:

  2315  *

  2316  * Now check special cases - start and end of line -

  2317  * for single and double quotes. Start is sometimes [sic]

  2318  * but better to query it anyway.

  2319  * While we're here, check for dash at end of line.

  2320  */

  2321 void check_end_of_line(const char *aline,struct warnings *warnings)

  2322 {

  2323     int lbytes;

  2324     const char *s;

  2325     gunichar c1,c2;

  2326     lbytes=strlen(aline);

  2327     if (g_utf8_strlen(aline,lbytes)>1)

  2328     {

  2329 	s=g_utf8_prev_char(aline+lbytes);

  2330 	c1=g_utf8_get_char(s);

  2331 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2332 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)

  2333 	{

  2334 	    if (pswit[ECHO_SWITCH])

  2335 		g_print("\n%s\n",aline);

  2336 	    if (!pswit[OVERVIEW_SWITCH])

  2337 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2338 		  g_utf8_strlen(aline,lbytes));

  2339 	    else

  2340 		cnt_punct++;

  2341 	}

  2342 	c1=g_utf8_get_char(aline);

  2343 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2344 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)

  2345 	{

  2346 	    if (pswit[ECHO_SWITCH])

  2347 		g_print("\n%s\n",aline);

  2348 	    if (!pswit[OVERVIEW_SWITCH])

  2349 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2350 	    else

  2351 		cnt_punct++;

  2352 	}

  2353 	/*

  2354 	 * Dash at end of line may well be legit - paranoid mode only

  2355 	 * and don't report em-dash at line-end.

  2356 	 */

  2357 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2358 	{

  2359 	    for (s=g_utf8_prev_char(aline+lbytes);

  2360 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2361 		;

  2362 	    if (g_utf8_get_char(s)=='-' &&

  2363 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2364 	    {

  2365 		if (pswit[ECHO_SWITCH])

  2366 		    g_print("\n%s\n",aline);

  2367 		if (!pswit[OVERVIEW_SWITCH])

  2368 		    g_print("    Line %ld column %ld - "

  2369 		      "Hyphen at end of line?\n",

  2370 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2371 	    }

  2372 	}

  2373     }

  2374 }

  2376 /*

  2377  * check_for_unspaced_bracket:

  2378  *

  2379  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2380  * If so, suspect a scanno like "a]most".

  2381  */

  2382 void check_for_unspaced_bracket(const char *aline)

  2383 {

  2384     const char *s;

  2385     gunichar c,nc,pc;

  2386     c=g_utf8_get_char(aline);

  2387     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2388     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2389     {

  2390 	pc=c;

  2391 	c=nc;

  2392 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2393 	if (!nc)

  2394 	    break;

  2395 	/* for each bracket character in the line except 1st & last */

  2396 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2397 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2398 	{

  2399 	    if (pswit[ECHO_SWITCH])

  2400 		g_print("\n%s\n",aline);

  2401 	    if (!pswit[OVERVIEW_SWITCH])

  2402 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2403 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2404 	    else

  2405 		cnt_punct++;

  2406 	}

  2407     }

  2408 }

  2410 /*

  2411  * check_for_unpunctuated_endquote:

  2412  */

  2413 void check_for_unpunctuated_endquote(const char *aline)

  2414 {

  2415     const char *s;

  2416     gunichar c,nc,pc;

  2417     c=g_utf8_get_char(aline);

  2418     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2419     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2420     {

  2421 	pc=c;

  2422 	c=nc;

  2423 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2424 	/* for each character in the line except 1st */

  2425 	if (c==CHAR_DQUOTE && isalpha(pc))

  2426 	{

  2427 	    if (pswit[ECHO_SWITCH])

  2428 		g_print("\n%s\n",aline);

  2429 	    if (!pswit[OVERVIEW_SWITCH])

  2430 		g_print("    Line %ld column %ld - "

  2431 		  "endquote missing punctuation?\n",

  2432 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2433 	    else

  2434 		cnt_punct++;

  2435 	}

  2436     }

  2437 }

  2439 /*

  2440  * check_for_html_tag:

  2441  *

  2442  * Check for <HTML TAG>.

  2443  *

  2444  * If there is a < in the line, followed at some point

  2445  * by a > then we suspect HTML.

  2446  */

  2447 void check_for_html_tag(const char *aline)

  2448 {

  2449     const char *open,*close;

  2450     gchar *tag;

  2451     open=strchr(aline,'<');

  2452     if (open)

  2453     {

  2454 	close=strchr(g_utf8_next_char(open),'>');

  2455 	if (close)

  2456 	{

  2457 	    if (pswit[ECHO_SWITCH])

  2458 		g_print("\n%s\n",aline);

  2459 	    if (!pswit[OVERVIEW_SWITCH])

  2460 	    {

  2461 		tag=g_strndup(open,close-open+1);

  2462 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2463 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2464 		g_free(tag);

  2465 	    }

  2466 	    else

  2467 		cnt_html++;

  2468 	}

  2469     }

  2470 }

  2472 /*

  2473  * check_for_html_entity:

  2474  *

  2475  * Check for &symbol; HTML.

  2476  *

  2477  * If there is a & in the line, followed at

  2478  * some point by a ; then we suspect HTML.

  2479  */

  2480 void check_for_html_entity(const char *aline)

  2481 {

  2482     const char *s,*amp,*scolon;

  2483     gchar *entity;

  2484     amp=strchr(aline,'&');

  2485     if (amp)

  2486     {

  2487 	scolon=strchr(amp,';');

  2488 	if (scolon)

  2489 	{

  2490 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2491 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2492 		    break;		/* Don't report "Jones & Son;" */

  2493 	    if (s>=scolon)

  2494 	    {

  2495 		if (pswit[ECHO_SWITCH])

  2496 		    g_print("\n%s\n",aline);

  2497 		if (!pswit[OVERVIEW_SWITCH])

  2498 		{

  2499 		    entity=g_strndup(amp,scolon-amp+1);

  2500 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2501 		      linecnt,(int)(amp-aline)+1,entity);

  2502 		    g_free(entity);

  2503 		}

  2504 		else

  2505 		    cnt_html++;

  2506 	    }

  2507 	}

  2508     }

  2509 }

  2511 /*

  2512  * check_for_omitted_punctuation:

  2513  *

  2514  * Check for omitted punctuation at end of paragraph by working back

  2515  * through prevline. DW.

  2516  * Need to check this only for "normal" paras.

  2517  * So what is a "normal" para?

  2518  *    Not normal if one-liner (chapter headings, etc.)

  2519  *    Not normal if doesn't contain at least one locase letter

  2520  *    Not normal if starts with space

  2521  */

  2522 void check_for_omitted_punctuation(const char *prevline,

  2523   struct line_properties *last,int start_para_line)

  2524 {

  2525     gboolean letter_on_line=FALSE;

  2526     const char *s;

  2527     gunichar c;

  2528     gboolean closing_quote;

  2529     for (s=prevline;*s;s=g_utf8_next_char(s))

  2530 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2531 	{

  2532 	    letter_on_line=TRUE;

  2533 	    break;

  2534 	}

  2535     /*

  2536      * This next "if" is a problem.

  2537      * If we say "start_para_line <= linecnt - 1", that includes

  2538      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2539      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2540      * misses genuine one-line paragraphs.

  2541      */

  2542     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2543       g_utf8_get_char(prevline)>CHAR_SPACE)

  2544     {

  2545 	s=prevline+strlen(prevline);

  2546 	do

  2547 	{

  2548 	    s=g_utf8_prev_char(s);

  2549 	    c=g_utf8_get_char(s);

  2550 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)

  2551 		closing_quote=TRUE;

  2552 	    else

  2553 		closing_quote=FALSE;

  2554 	} while (closing_quote && s>prevline);

  2555 	for (;s>prevline;s=g_utf8_prev_char(s))

  2556 	{

  2557 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2558 	    {

  2559 		if (pswit[ECHO_SWITCH])

  2560 		    g_print("\n%s\n",prevline);

  2561 		if (!pswit[OVERVIEW_SWITCH])

  2562 		    g_print("    Line %ld column %ld - "

  2563 		      "No punctuation at para end?\n",

  2564 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2565 		else

  2566 		    cnt_punct++;

  2567 		break;

  2568 	    }

  2569 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))

  2570 		break;

  2571 	}

  2572     }

  2573 }

  2575 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2576 {

  2577     const char *word=key;

  2578     int *dupcnt=value;

  2579     if (*dupcnt)

  2580 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  2581 	  word,*dupcnt);

  2582     return FALSE;

  2583 }

  2585 void print_as_windows_1252(const char *string)

  2586 {

  2587     gsize inbytes,outbytes;

  2588     gchar *buf,*bp;

  2589     static GIConv converter=(GIConv)-1;

  2590     if (!string)

  2591     {

  2592 	if (converter!=(GIConv)-1)

  2593 	    g_iconv_close(converter);

  2594 	converter=(GIConv)-1;

  2595 	return;

  2596     }

  2597     if (converter==(GIConv)-1)

  2598 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  2599     if (converter!=(GIConv)-1)

  2600     {

  2601 	inbytes=outbytes=strlen(string);

  2602 	bp=buf=g_malloc(outbytes+1);

  2603 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  2604 	*bp='\0';

  2605 	fputs(buf,stdout);

  2606 	g_free(buf);

  2607     }

  2608     else

  2609 	fputs(string,stdout);

  2610 }

  /*
   * print_as_utf_8:
   *
   * Write string to stdout exactly as given, with no conversion.
   */
  void print_as_utf_8(const char *string)
  {
      (void)fputs(string,stdout);
  }

  2617 /*

  2618  * procfile:

  2619  *

  2620  * Process one file.

  2621  */

  2622 void procfile(const char *filename)

  2623 {

  2624     const char *s;

  2625     gchar *parastart=NULL;	/* first line of current para */

  2626     gchar *etext,*aline;

  2627     gchar *etext_ptr;

  2628     GError *err=NULL;

  2629     struct first_pass_results *first_pass_results;

  2630     struct warnings *warnings;

  2631     struct counters counters={0};

  2632     struct line_properties last={0};

  2633     struct parities parities={0};

  2634     struct pending pending={0};

  2635     gboolean isemptyline;

  2636     long start_para_line=0;

  2637     gboolean isnewpara=FALSE,enddash=FALSE;

  2638     last.start=CHAR_SPACE;

  2639     linecnt=checked_linecnt=0;

  2640     etext=read_etext(filename,&err);

  2641     if (!etext)

  2642     {

  2643 	if (pswit[STDOUT_SWITCH])

  2644 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  2645 	else

  2646 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  2647 	exit(1);

  2648     }

  2649     g_print("\n\nFile: %s\n\n",filename);

  2650     first_pass_results=first_pass(etext);

  2651     warnings=report_first_pass(first_pass_results);

  2652     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  2653     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  2654     /*

  2655      * Here we go with the main pass. Hold onto yer hat!

  2656      */

  2657     linecnt=0;

  2658     etext_ptr=etext;

  2659     while ((aline=flgets(&etext_ptr,linecnt+1)))

  2660     {

  2661 	linecnt++;

  2662 	if (linecnt==1)

  2663 	    isnewpara=TRUE;

  2664 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  2665 	    continue;    // skip DP page separators completely

  2666 	if (linecnt<first_pass_results->firstline ||

  2667 	  (first_pass_results->footerline>0 &&

  2668 	  linecnt>first_pass_results->footerline))

  2669 	{

  2670 	    if (pswit[HEADER_SWITCH])

  2671 	    {

  2672 		if (g_str_has_prefix(aline,"Title:"))

  2673 		    g_print("    %s\n",aline);

  2674 		if (g_str_has_prefix(aline,"Author:"))

  2675 		    g_print("    %s\n",aline);

  2676 		if (g_str_has_prefix(aline,"Release Date:"))

  2677 		    g_print("    %s\n",aline);

  2678 		if (g_str_has_prefix(aline,"Edition:"))

  2679 		    g_print("    %s\n\n",aline);

  2680 	    }

  2681 	    continue;		/* skip through the header */

  2682 	}

  2683 	checked_linecnt++;

  2684 	print_pending(aline,parastart,&pending);

  2685 	isemptyline=analyse_quotes(aline,linecnt,&counters);

  2686 	if (isnewpara && !isemptyline)

  2687 	{

  2688 	    /* This line is the start of a new paragraph. */

  2689 	    start_para_line=linecnt;

  2690 	    /* Capture its first line in case we want to report it later. */

  2691 	    g_free(parastart);

  2692 	    parastart=g_strdup(aline);

  2693 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2694 	    s=aline;

  2695 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  2696 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  2697 		s=g_utf8_next_char(s);

  2698 	    if (g_unichar_islower(g_utf8_get_char(s)))

  2699 	    {

  2700 		/* and its first letter is lowercase */

  2701 		if (pswit[ECHO_SWITCH])

  2702 		    g_print("\n%s\n",aline);

  2703 		if (!pswit[OVERVIEW_SWITCH])

  2704 		    g_print("    Line %ld column %ld - "

  2705 		      "Paragraph starts with lower-case\n",

  2706 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2707 		else

  2708 		    cnt_punct++;

  2709 	    }

  2710 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  2711 	}

  2712 	/* Check for an em-dash broken at line end. */

  2713 	if (enddash && g_utf8_get_char(aline)=='-')

  2714 	{

  2715 	    if (pswit[ECHO_SWITCH])

  2716 		g_print("\n%s\n",aline);

  2717 	    if (!pswit[OVERVIEW_SWITCH])

  2718 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2719 	    else

  2720 		cnt_punct++;

  2721 	}

  2722 	enddash=FALSE;

  2723 	for (s=g_utf8_prev_char(aline+strlen(aline));

  2724 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  2725 	    ;

  2726 	if (s>=aline && g_utf8_get_char(s)=='-')

  2727 	    enddash=TRUE;

  2728 	check_for_control_characters(aline);

  2729 	check_for_odd_characters(aline,warnings,isemptyline);

  2730 	if (warnings->longline)

  2731 	    check_for_long_line(aline);

  2732 	if (warnings->shortline)

  2733 	    check_for_short_line(aline,&last);

  2734 	last.blen=last.len;

  2735 	last.len=g_utf8_strlen(aline,-1);

  2736 	last.start=g_utf8_get_char(aline);

  2737 	check_for_starting_punctuation(aline);

  2738 	if (warnings->dash)

  2739 	{

  2740 	    check_for_spaced_emdash(aline);

  2741 	    check_for_spaced_dash(aline);

  2742 	}

  2743 	check_for_unmarked_paragraphs(aline);

  2744 	check_for_jeebies(aline);

  2745 	check_for_mta_from(aline);

  2746 	check_for_orphan_character(aline);

  2747 	check_for_pling_scanno(aline);

  2748 	check_for_extra_period(aline,warnings);

  2749 	check_for_following_punctuation(aline);

  2750 	check_for_typos(aline,warnings);

  2751 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2752 	check_for_double_punctuation(aline,warnings);

  2753 	check_for_spaced_quotes(aline);

  2754 	check_for_miscased_genative(aline);

  2755 	check_end_of_line(aline,warnings);

  2756 	check_for_unspaced_bracket(aline);

  2757 	if (warnings->endquote)

  2758 	    check_for_unpunctuated_endquote(aline);

  2759 	check_for_html_tag(aline);

  2760 	check_for_html_entity(aline);

  2761 	if (isemptyline)

  2762 	{

  2763 	    check_for_mismatched_quotes(&counters,&pending);

  2764 	    counters_reset(&counters);

  2765 	    /* let the next iteration know that it's starting a new para */

  2766 	    isnewpara=TRUE;

  2767 	    if (prevline)

  2768 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  2769 	}

  2770 	g_free(prevline);

  2771 	prevline=g_strdup(aline);

  2772     }

  2773     linecnt++;

  2774     check_for_mismatched_quotes(&counters,&pending);

  2775     print_pending(NULL,parastart,&pending);

  2776     reset_pending(&pending);

  2777     if (prevline)

  2778     {

  2779 	g_free(prevline);

  2780 	prevline=NULL;

  2781     }

  2782     g_free(parastart);

  2783     g_free(prevline);

  2784     g_free(etext);

  2785     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  2786 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  2787     g_tree_unref(qword);

  2788     g_tree_unref(qperiod);

  2789     counters_destroy(&counters);

  2790     g_set_print_handler(NULL);

  2791     print_as_windows_1252(NULL);

  2792     if (pswit[MARKUP_SWITCH])

  2793 	loseentities(NULL);

  2794 }

  2796 /*

  2797  * flgets:

  2798  *

  2799  * Get one line from the input text, checking for

  2800  * the existence of exactly one CR/LF line-end per line.

  2801  *

  2802  * Returns: a pointer to the line.

  2803  */

  2804 char *flgets(char **etext,long lcnt)

  2805 {

  2806     gunichar c;

  2807     gboolean isCR=FALSE;

  2808     char *theline=*etext;

  2809     char *eos=theline;

  2810     gchar *s;

  2811     for (;;)

  2812     {

  2813 	c=g_utf8_get_char(*etext);

  2814 	*etext=g_utf8_next_char(*etext);

  2815 	if (!c)

  2816 	    return NULL;

  2817 	/* either way, it's end of line */

  2818 	if (c=='\n')

  2819 	{

  2820 	    if (isCR)

  2821 		break;

  2822 	    else

  2823 	    {

  2824 		/* Error - a LF without a preceding CR */

  2825 		if (pswit[LINE_END_SWITCH])

  2826 		{

  2827 		    if (pswit[ECHO_SWITCH])

  2828 		    {

  2829 			s=g_strndup(theline,eos-theline);

  2830 			g_print("\n%s\n",s);

  2831 			g_free(s);

  2832 		    }

  2833 		    if (!pswit[OVERVIEW_SWITCH])

  2834 			g_print("    Line %ld - No CR?\n",lcnt);

  2835 		    else

  2836 			cnt_lineend++;

  2837 		}

  2838 		break;

  2839 	    }

  2840 	}

  2841 	if (c=='\r')

  2842 	{

  2843 	    if (isCR)

  2844 	    {

  2845 		/* Error - two successive CRs */

  2846 		if (pswit[LINE_END_SWITCH])

  2847 		{

  2848 		    if (pswit[ECHO_SWITCH])

  2849 		    {

  2850 			s=g_strndup(theline,eos-theline);

  2851 			g_print("\n%s\n",s);

  2852 			g_free(s);

  2853 		    }

  2854 		    if (!pswit[OVERVIEW_SWITCH])

  2855 			g_print("    Line %ld - Two successive CRs?\n",lcnt);

  2856 		    else

  2857 			cnt_lineend++;

  2858 		}

  2859 	    }

  2860 	    isCR=TRUE;

  2861 	}

  2862 	else

  2863 	{

  2864 	    if (pswit[LINE_END_SWITCH] && isCR)

  2865 	    {

  2866 		if (pswit[ECHO_SWITCH])

  2867 		{

  2868 		    s=g_strndup(theline,eos-theline);

  2869 		    g_print("\n%s\n",s);

  2870 		    g_free(s);

  2871 		}

  2872 		if (!pswit[OVERVIEW_SWITCH])

  2873 		    g_print("    Line %ld column %ld - CR without LF?\n",

  2874 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  2875 		else

  2876 		    cnt_lineend++;

  2877 		*eos=' ';

  2878 	    }

  2879 	    isCR=FALSE;

  2880 	    eos=g_utf8_next_char(eos);

  2881 	}

  2882     }

  2883     *eos='\0';

  2884     if (pswit[MARKUP_SWITCH])

  2885 	postprocess_for_HTML(theline);

  2886     if (pswit[DP_SWITCH])

  2887 	postprocess_for_DP(theline);

  2888     return theline;

  2889 }

  2891 /*

  2892  * mixdigit:

  2893  *

  2894  * Takes a "word" as a parameter, and checks whether it

  2895  * contains a mixture of alpha and digits. Generally, this is an

  2896  * error, but may not be for cases like 4th or L5 12s. 3d.

  2897  *

  2898  * Returns: TRUE iff an is error found.

  2899  */

  2900 gboolean mixdigit(const char *checkword)

  2901 {

  2902     gboolean wehaveadigit,wehavealetter,query;

  2903     const char *s,*nondigit;

  2904     wehaveadigit=wehavealetter=query=FALSE;

  2905     for (s=checkword;*s;s=g_utf8_next_char(s))

  2906 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2907 	    wehavealetter=TRUE;

  2908 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  2909 	    wehaveadigit=TRUE;

  2910     if (wehaveadigit && wehavealetter)

  2911     {

  2912 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2913 	query=TRUE;

  2914 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  2915 	  nondigit=g_utf8_next_char(nondigit))

  2916 	    ;

  2917 	/* digits, ending in st, rd, nd, th of either case */

  2918 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  2919 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  2920 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  2921 	  !g_ascii_strcasecmp(nondigit,"th"))

  2922 	    query=FALSE;

  2923 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  2924 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  2925 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  2926 	  !g_ascii_strcasecmp(nondigit,"ths"))

  2927 	    query=FALSE;

  2928 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  2929 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  2930 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  2931 	  !g_ascii_strcasecmp(nondigit,"thly"))

  2932 	    query=FALSE;

  2933 	/* digits, ending in l, L, s or d */

  2934 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  2935 	  !strcmp(nondigit,"d"))

  2936 	    query=FALSE;

  2937 	/*

  2938 	 * L at the start of a number, representing Britsh pounds, like L500.

  2939 	 * This is cute. We know the current word is mixed digit. If the first

  2940 	 * letter is L, there must be at least one digit following. If both

  2941 	 * digits and letters follow, we have a genuine error, else we have a

  2942 	 * capital L followed by digits, and we accept that as a non-error.

  2943 	 */

  2944 	if (g_utf8_get_char(checkword)=='L' &&

  2945 	  !mixdigit(g_utf8_next_char(checkword)))

  2946 	    query=FALSE;

  2947     }

  2948     return query;

  2949 }

  2951 /*

  2952  * getaword:

  2953  *

  2954  * Extracts the first/next "word" from the line, and returns it.

  2955  * A word is defined as one English word unit--or at least that's the aim.

  2956  * "ptr" is advanced to the position in the line where we will start

  2957  * looking for the next word.

  2958  *

  2959  * Returns: A newly-allocated string.

  2960  */

  2961 gchar *getaword(const char **ptr)

  2962 {

  2963     const char *s,*t;

  2964     GString *word;

  2965     gunichar c,pc;

  2966     word=g_string_new(NULL);

  2967     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  2968       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  2969       **ptr;*ptr=g_utf8_next_char(*ptr))

  2970 	;

  2971     /*

  2972      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2973      * Especially yucky is the case of L1,000

  2974      * This section looks for a pattern of characters including a digit

  2975      * followed by a comma or period followed by one or more digits.

  2976      * If found, it returns this whole pattern as a word; otherwise we discard

  2977      * the results and resume our normal programming.

  2978      */

  2979     s=*ptr;

  2980     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  2981       g_unichar_isalpha(g_utf8_get_char(s)) ||

  2982       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  2983 	g_string_append_unichar(word,g_utf8_get_char(s));

  2984     if (word->len)

  2985     {

  2986 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))

  2987 	{

  2988 	    c=g_utf8_get_char(t);

  2989 	    pc=g_utf8_get_char(g_utf8_prev_char(t));

  2990 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  2991 	    {

  2992 		*ptr=s;

  2993 		return g_string_free(word,FALSE);

  2994 	    }

  2995 	}

  2996     }

  2997     /* we didn't find a punctuated number - do the regular getword thing */

  2998     g_string_truncate(word,0);

  2999     c=g_utf8_get_char(*ptr);

  3000     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);

  3001       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))

  3002 	g_string_append_unichar(word,c);

  3003     return g_string_free(word,FALSE);

  3004 }

  3006 /*

  3007  * isroman:

  3008  *

  3009  * Is this word a Roman Numeral?

  3010  *

  3011  * It doesn't actually validate that the number is a valid Roman Numeral--for

  3012  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  3013  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  3014  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  3015  * expressions thereof, except when it came to taxes. Allow any number of M,

  3016  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  3017  * XL or an optional XC, an optional IX or IV, an optional V and any number

  3018  * of optional Is.

  3019  */

  3020 gboolean isroman(const char *t)

  3021 {

  3022     const char *s;

  3023     if (!t || !*t)

  3024 	return FALSE;

  3025     s=t;

  3026     while (g_utf8_get_char(t)=='m' && *t)

  3027 	t++;

  3028     if (g_utf8_get_char(t)=='d')

  3029 	t++;

  3030     if (g_str_has_prefix(t,"cm"))

  3031 	t+=2;

  3032     if (g_str_has_prefix(t,"cd"))

  3033 	t+=2;

  3034     while (g_utf8_get_char(t)=='c' && *t)

  3035 	t++;

  3036     if (g_str_has_prefix(t,"xl"))

  3037 	t+=2;

  3038     if (g_str_has_prefix(t,"xc"))

  3039 	t+=2;

  3040     if (g_utf8_get_char(t)=='l')

  3041 	t++;

  3042     while (g_utf8_get_char(t)=='x' && *t)

  3043 	t++;

  3044     if (g_str_has_prefix(t,"ix"))

  3045 	t+=2;

  3046     if (g_str_has_prefix(t,"iv"))

  3047 	t+=2;

  3048     if (g_utf8_get_char(t)=='v')

  3049 	t++;

  3050     while (g_utf8_get_char(t)=='i' && *t)

  3051 	t++;

  3052     return !*t;

  3053 }

  3055 /*

  3056  * postprocess_for_DP:

  3057  *

  3058  * Invoked with the -d switch from flgets().

  3059  * It simply "removes" from the line a hard-coded set of common

  3060  * DP-specific tags, so that the line passed to the main routine has

  3061  * been pre-cleaned of DP markup.

  3062  */

  3063 void postprocess_for_DP(char *theline)

  3064 {

  3065     char *s,*t;

  3066     int i;

  3067     if (!*theline)

  3068 	return;

  3069     for (i=0;*DPmarkup[i];i++)

  3070 	while ((s=strstr(theline,DPmarkup[i])))

  3071 	{

  3072 	    t=s+strlen(DPmarkup[i]);

  3073 	    memmove(s,t,strlen(t)+1);

  3074 	}

  3075 }

  3077 /*

  3078  * postprocess_for_HTML:

  3079  *

  3080  * Invoked with the -m switch from flgets().

  3081  * It simply "removes" from the line a hard-coded set of common

  3082  * HTML tags and "replaces" a hard-coded set of common HTML

  3083  * entities, so that the line passed to the main routine has

  3084  * been pre-cleaned of HTML.

  3085  */

  3086 void postprocess_for_HTML(char *theline)

  3087 {

  3088     while (losemarkup(theline))

  3089 	;

  3090     loseentities(theline);

  3091 }

  3093 char *losemarkup(char *theline)

  3094 {

  3095     char *s,*t;

  3096     int i;

  3097     s=strchr(theline,'<');

  3098     t=s?strchr(s,'>'):NULL;

  3099     if (!s || !t)

  3100 	return NULL;

  3101     for (i=0;*markup[i];i++)

  3102 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  3103 	{

  3104 	    t=g_utf8_next_char(t);

  3105 	    memmove(s,t,strlen(t)+1);

  3106 	    return s;

  3107 	}

  3108     /* It's an unrecognized <xxx>. */

  3109     return NULL;

  3110 }

  3112 void loseentities(char *theline)

  3113 {

  3114     int i;

  3115     gsize nb;

  3116     char *amp,*scolon;

  3117     gchar *s,*t;

  3118     gunichar c;

  3119     GTree *entities=NULL;

  3120     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3121     if (!theline)

  3122     {

  3123 	if (entities)

  3124 	    g_tree_destroy(entities);

  3125 	entities=NULL;

  3126 	if (translit!=(GIConv)-1)

  3127 	    g_iconv_close(translit);

  3128 	translit=(GIConv)-1;

  3129 	if (to_utf8!=(GIConv)-1)

  3130 	    g_iconv_close(to_utf8);

  3131 	to_utf8=(GIConv)-1;

  3132 	return;

  3133     }

  3134     if (!*theline)

  3135 	return;

  3136     if (!entities)

  3137     {

  3138 	entities=g_tree_new((GCompareFunc)strcmp);

  3139 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3140 	    g_tree_insert(entities,HTMLentities[i].name,

  3141 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3142     }

  3143     if (translit==(GIConv)-1)

  3144 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3145     if (to_utf8==(GIConv)-1)

  3146 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3147     while((amp=strchr(theline,'&')))

  3148     {

  3149 	scolon=strchr(amp,';');

  3150 	if (scolon)

  3151 	{

  3152 	    if (amp[1]=='#')

  3153 	    {

  3154 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3155 		    c=strtol(amp+2,NULL,10);

  3156 		else if (amp[2]=='x' &&

  3157 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3158 		    c=strtol(amp+3,NULL,16);

  3159 	    }

  3160 	    else

  3161 	    {

  3162 		s=g_strndup(amp+1,scolon-(amp+1));

  3163 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3164 		g_free(s);

  3165 	    }

  3166 	}

  3167 	else

  3168 	    c=0;

  3169 	if (c)

  3170 	{

  3171 	    theline=amp;

  3172 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3173 		theline+=g_unichar_to_utf8(c,theline);

  3174 	    else

  3175 	    {

  3176 		s=g_malloc(6);

  3177 		nb=g_unichar_to_utf8(c,s);

  3178 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3179 		g_free(s);

  3180 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3181 		g_free(t);

  3182 		memcpy(theline,s,nb);

  3183 		g_free(s);

  3184 		theline+=nb;

  3185 	    }

  3186 	    memmove(theline,g_utf8_next_char(scolon),

  3187 	      strlen(g_utf8_next_char(scolon))+1);

  3188 	}

  3189 	else

  3190 	    theline=g_utf8_next_char(amp);

  3191     }

  3192 }

  3194 gboolean tagcomp(const char *strin,const char *basetag)

  3195 {

  3196     gboolean retval;

  3197     gchar *s,*t;

  3198     if (g_utf8_get_char(strin)=='/')

  3199 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3200     else

  3201 	t=g_utf8_casefold(strin,-1);

  3202     s=g_utf8_casefold(basetag,-1);

  3203     retval=g_str_has_prefix(t,s);

  3204     g_free(s);

  3205     g_free(t);

  3206     return retval;

  3207 }

  3209 void proghelp(GOptionContext *context)

  3210 {

  3211     gchar *help;

  3212     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3213     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3214     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3215     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3216       "For details, read the file COPYING.\n",stderr);

  3217     fputs("This is Free Software; "

  3218       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3219     fputs("read the file COPYING for details.\n\n",stderr);

  3220     help=g_option_context_get_help(context,TRUE,NULL);

  3221     fputs(help,stderr);

  3222     g_free(help);

  3223     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3224     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3225       "non-ASCII\n",stderr);

  3226     fputs("characters like accented letters, "

  3227       "lines longer than 75 or shorter than 55,\n",stderr);

  3228     fputs("unbalanced quotes or brackets, "

  3229       "a variety of badly formatted punctuation, \n",stderr);

  3230     fputs("HTML tags, some likely typos. "

  3231       "It is NOT a substitute for human judgement.\n",stderr);

  3232     fputs("\n",stderr);

  3233 }

author	ali <ali@juiblex.co.uk>
	Sat Sep 28 17:17:10 2013 +0100 (2013-09-28)
changeset 128	f3c293593d44
parent 123	ddb5ddba6ef3
parent 127	badcc3b340ab
permissions	-rw-r--r--