bookloupe-testing: bookloupe/bookloupe.c@9d3a8ee81151

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "bookloupe.h"

    31 #include "counters.h"

    32 #include "pending.h"

    33 #include "HTMLentities.h"

/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * holds the previously processed line of the etext -- confirm against users.
 */
gchar *prevline;

/*
 * Common typos.
 *
 * Every entry is lower case and the list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};

/*
 * User-defined typos, built by read_user_scannos(). Keys are the lines read
 * from the user typo file (owned by the tree and released with g_free());
 * every value is GINT_TO_POINTER(1). Remains NULL unless read_user_scannos()
 * has loaded a file, which is why main() tests it before unreferencing it.
 */
GTree *usertypo;

/*
 * Common abbreviations and other OK words not to query as typos.
 *
 * Every entry is lower case and the list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 *
 * Entries are lower case and given without the trailing period; the list
 * is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};

/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 *
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 *
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};

/*
 * HTML element names, lower case and without the surrounding angle
 * brackets. The list is terminated by an empty string.
 *
 * NOTE(review): presumably consulted when deciding whether text in < > is
 * real markup -- confirm against the code that reads this table.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * Markup strings written out literally, including their delimiters. The
 * list is terminated by an empty string.
 *
 * NOTE(review): presumably the "DP-specific markup" that the --dp switch
 * ignores -- confirm against postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};

/*
 * Lower-case words, terminated by an empty string.
 *
 * NOTE(review): going by the name, presumably words that are queried when
 * directly followed by a comma -- confirm against the code that reads it.
 * NOTE(review): "mrs" appears twice in this list.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};

/*
 * Lower-case words, terminated by an empty string.
 *
 * NOTE(review): going by the name, presumably words that are queried when
 * directly followed by a period -- confirm against the code that reads it.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};

   129 gboolean pswit[SWITNO];  /* program switches */

   131 static GOptionEntry options[]={

   132     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   133       "Ignore DP-specific markup", NULL },

   134     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   135       "Don't echo queried line", NULL },

   136     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   137       "Check single quotes", NULL },

   138     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   139       "Check common typos", NULL },

   140     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   141       "Require closure of quotes on every paragraph", NULL },

   142     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   143       "Disable paranoid querying of everything", NULL },

   144     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   145       "Disable line end checking", NULL },

   146     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   147       "Overview: just show counts", NULL },

   148     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   149       "Output errors to stdout instead of stderr", NULL },

   150     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   151       "Echo header fields", NULL },

   152     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   153       "Ignore markup in < >", NULL },

   154     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   155       "Use file of user-defined typos", NULL },

   156     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   157       "Defaults for use on www upload", NULL },

   158     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   159       "Verbose - list everything", NULL },

   160     { NULL }

   161 };

/*
 * Query counters. main() prints each non-zero cnt_* value, and their sum
 * as "TOTAL QUERIES", when the overview switch is set. cnt_spacend is
 * counted by first_pass() and reported by report_first_pass(); it is not
 * part of the total.
 */
long cnt_dquot;		/* for overview mode, count of doublequote queries */
long cnt_squot;		/* for overview mode, count of singlequote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */

/* Forward declarations. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/*
 * Directory part of argv[0], set in main(). read_user_scannos() looks
 * there for the user typo file when it is not in the current directory.
 */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
/* Print handlers installed by read_etext() to match the input encoding. */
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/*
 * NOTE(review): not used in the visible part of this file; presumably
 * lookup trees for already-queried words and periods -- confirm.
 */
GTree *qword,*qperiod;

#ifdef __WIN32__
/*
 * Console output code page saved by main() at start-up and restored by
 * cleanup_on_exit().
 */
UINT saved_cp;
#endif

   204 void parse_options(int *argc,char ***argv)

   205 {

   206     GError *err=NULL;

   207     GOptionContext *context;

   208     context=g_option_context_new(

   209       "file - looks for errors in Project Gutenberg(TM) etexts");

   210     g_option_context_add_main_entries(context,options,NULL);

   211     if (!g_option_context_parse(context,argc,argv,&err))

   212     {

   213 	g_printerr("Bookloupe: %s\n",err->message);

   214 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   215 	exit(1);

   216     }

   217     /* Paranoid checking is turned OFF, not on, by its switch */

   218     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   219     if (pswit[PARANOID_SWITCH])

   220 	/* if running in paranoid mode, typo checks default to enabled */

   221 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   222     /* Line-end checking is turned OFF, not on, by its switch */

   223     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];

   224     /* Echoing is turned OFF, not on, by its switch */

   225     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];

   226     if (pswit[OVERVIEW_SWITCH])

   227 	/* just print summary; don't echo */

   228 	pswit[ECHO_SWITCH]=FALSE;

   229     /*

   230      * Web uploads - for the moment, this is really just a placeholder

   231      * until we decide what processing we really want to do on web uploads

   232      */

   233     if (pswit[WEB_SWITCH])

   234     {

   235 	/* specific override for web uploads */

   236 	pswit[ECHO_SWITCH]=TRUE;

   237 	pswit[SQUOTE_SWITCH]=FALSE;

   238 	pswit[TYPO_SWITCH]=TRUE;

   239 	pswit[QPARA_SWITCH]=FALSE;

   240 	pswit[PARANOID_SWITCH]=TRUE;

   241 	pswit[LINE_END_SWITCH]=FALSE;

   242 	pswit[OVERVIEW_SWITCH]=FALSE;

   243 	pswit[STDOUT_SWITCH]=FALSE;

   244 	pswit[HEADER_SWITCH]=TRUE;

   245 	pswit[VERBOSE_SWITCH]=FALSE;

   246 	pswit[MARKUP_SWITCH]=FALSE;

   247 	pswit[USERTYPO_SWITCH]=FALSE;

   248 	pswit[DP_SWITCH]=FALSE;

   249     }

   250     if (*argc<2)

   251     {

   252 	proghelp(context);

   253 	exit(1);

   254     }

   255     g_option_context_free(context);

   256 }

   258 /*

   259  * read_user_scannos:

   260  *

   261  * Read in the user-defined stealth scanno list.

   262  */

   263 void read_user_scannos(void)

   264 {

   265     GError *err=NULL;

   266     gchar *usertypo_file;

   267     gboolean okay;

   268     int i;

   269     gsize len,nb;

   270     gchar *contents,*utf8,**lines;

   271     usertypo_file=g_strdup("bookloupe.typ");

   272     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   273     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   274     {

   275 	g_clear_error(&err);

   276 	g_free(usertypo_file);

   277 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   278 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   279     }

   280     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   281     {

   282 	g_clear_error(&err);

   283 	g_free(usertypo_file);

   284 	usertypo_file=g_strdup("gutcheck.typ");

   285 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   286     }

   287     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   288     {

   289 	g_clear_error(&err);

   290 	g_free(usertypo_file);

   291 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   292 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   293     }

   294     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   295     {

   296 	g_free(usertypo_file);

   297 	g_print("   --> I couldn't find bookloupe.typ "

   298 	  "-- proceeding without user typos.\n");

   299 	return;

   300     }

   301     else if (!okay)

   302     {

   303 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   304 	g_free(usertypo_file);

   305 	g_clear_error(&err);

   306 	exit(1);

   307     }

   308     if (g_utf8_validate(contents,len,NULL))

   309 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   310     else

   311 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   312     g_free(contents);

   313     lines=g_strsplit_set(utf8,"\r\n",0);

   314     g_free(utf8);

   315     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   316     for (i=0;lines[i];i++)

   317 	if (*(unsigned char *)lines[i]>'!')

   318 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   319 	else

   320 	    g_free(lines[i]);

   321     g_free(lines);

   322 }

   324 /*

   325  * read_etext:

   326  *

   327  * Read an etext returning a newly allocated string containing the file

   328  * contents or NULL on error.

   329  */

   330 gchar *read_etext(const char *filename,GError **err)

   331 {

   332     GError *tmp_err=NULL;

   333     gchar *contents,*utf8;

   334     gsize len,bytes_read,bytes_written;

   335     int i,line,col;

   336     if (!g_file_get_contents(filename,&contents,&len,err))

   337 	return NULL;

   338     if (g_utf8_validate(contents,len,NULL))

   339     {

   340 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   341 	g_set_print_handler(print_as_utf_8);

   342 #ifdef __WIN32__

   343 	SetConsoleOutputCP(CP_UTF8);

   344 #endif

   345     }

   346     else

   347     {

   348 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   349 	  &bytes_written,&tmp_err);

   350 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   351 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   352 	{

   353 	    line=col=1;

   354 	    for(i=0;i<bytes_read;i++)

   355 		if (contents[i]=='\n')

   356 		{

   357 		    line++;

   358 		    col=1;

   359 		}

   360 		else if (contents[i]!='\r')

   361 		    col++;

   362 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   363 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   364 	      "valid Windows-1252 character",

   365 	      ((unsigned char *)contents)[bytes_read],line,col);

   366 	}

   367 	else if (tmp_err)

   368 	    g_propagate_error(err,tmp_err);

   369 	g_set_print_handler(print_as_windows_1252);

   370 #ifdef __WIN32__

   371 	SetConsoleOutputCP(1252);

   372 #endif

   373     }

   374     g_free(contents);

   375     return utf8;

   376 }

/*
 * cleanup_on_exit:
 *
 * Exit handler registered by main() with atexit() on Windows. It puts back
 * the console output code page saved in saved_cp, which read_etext() may
 * have changed. On other platforms the body is empty.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}

   385 int main(int argc,char **argv)

   386 {

   387 #ifdef __WIN32__

   388     atexit(cleanup_on_exit);

   389     saved_cp=GetConsoleOutputCP();

   390 #endif

   391     running_from=g_path_get_dirname(argv[0]);

   392     parse_options(&argc,&argv);

   393     if (pswit[USERTYPO_SWITCH])

   394 	read_user_scannos();

   395     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   396     procfile(argv[1]);

   397     if (pswit[OVERVIEW_SWITCH])

   398     {

   399 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   400 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   401 	g_print("    --------------- Queries found --------------\n");

   402 	if (cnt_long)

   403 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   404 	if (cnt_short)

   405 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   406 	if (cnt_lineend)

   407 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   408 	if (cnt_word)

   409 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   410 	if (cnt_dquot)

   411 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);

   412 	if (cnt_squot)

   413 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);

   414 	if (cnt_brack)

   415 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   416 	if (cnt_bin)

   417 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   418 	if (cnt_odd)

   419 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   420 	if (cnt_punct)

   421 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   422 	if (cnt_dash)

   423 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   424 	if (cnt_html)

   425 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   426 	g_print("\n");

   427 	g_print("    TOTAL QUERIES		  %14ld\n",

   428 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   429 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   430     }

   431     g_free(running_from);

   432     if (usertypo)

   433 	g_tree_unref(usertypo);

   434     return 0;

   435 }

   437 /*

   438  * first_pass:

   439  *

   440  * Run a first pass - verify that it's a valid PG

   441  * file, decide whether to report some things that

   442  * occur many times in the text like long or short

   443  * lines, non-standard dashes, etc.

   444  */

   445 struct first_pass_results *first_pass(const char *etext)

   446 {

   447     gunichar laststart=CHAR_SPACE;

   448     const char *s;

   449     gchar *lc_line;

   450     int i,j,lbytes,llen;

   451     gchar **lines;

   452     unsigned int lastlen=0,lastblen=0;

   453     long spline=0,nspline=0;

   454     static struct first_pass_results results={0};

   455     gchar *inword;

   456     lines=g_strsplit(etext,"\n",0);

   457     for (j=0;lines[j];j++)

   458     {

   459 	lbytes=strlen(lines[j]);

   460 	while (lbytes>0 && lines[j][lbytes-1]=='\r')

   461 	    lines[j][--lbytes]='\0';

   462 	llen=g_utf8_strlen(lines[j],lbytes);

   463 	linecnt++;

   464 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&

   465 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))

   466 	{

   467 	    if (spline)

   468 		g_print("   --> Duplicate header?\n");

   469 	    spline=linecnt+1;   /* first line of non-header text, that is */

   470 	}

   471 	if (!strncmp(lines[j],"*** START",9) &&

   472 	  strstr(lines[j],"PROJECT GUTENBERG"))

   473 	{

   474 	    if (nspline)

   475 		g_print("   --> Duplicate header?\n");

   476 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   477 	}

   478 	if (spline || nspline)

   479 	{

   480 	    lc_line=g_utf8_strdown(lines[j],lbytes);

   481 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))

   482 	    {

   483 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))

   484 		{

   485 		    if (results.footerline)

   486 		    {

   487 			/* it's an old-form header - we can detect duplicates */

   488 			if (!nspline)

   489 			    g_print("   --> Duplicate footer?\n");

   490 		    }

   491 		    else

   492 			results.footerline=linecnt;

   493 		}

   494 	    }

   495 	    g_free(lc_line);

   496 	}

   497 	if (spline)

   498 	    results.firstline=spline;

   499 	if (nspline)

   500 	    results.firstline=nspline;  /* override with new */

   501 	if (results.footerline)

   502 	    continue;    /* don't count the boilerplate in the footer */

   503 	results.totlen+=llen;

   504 	for (s=lines[j];*s;s=g_utf8_next_char(s))

   505 	{

   506 	    if (g_utf8_get_char(s)>127)

   507 		results.binlen++;

   508 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

   509 		results.alphalen++;

   510 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&

   511 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))

   512 		results.endquote_count++;

   513 	}

   514 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&

   515 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   516 	    results.shortline++;

   517 	if (lbytes>0 &&

   518 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)

   519 	    cnt_spacend++;

   520 	if (strstr(lines[j],".,"))

   521 	    results.dotcomma++;

   522 	/* only count ast lines for ignoring purposes where there is */

   523 	/* locase text on the line */

   524 	if (strchr(lines[j],'*'))

   525 	{

   526 	    for (s=lines[j];*s;s=g_utf8_next_char(s))

   527 		if (g_unichar_islower(g_utf8_get_char(s)))

   528 		    break;

   529 	    if (*s)

   530 		results.astline++;

   531 	}

   532 	if (strchr(lines[j],'/'))

   533 	    results.fslashline++;

   534 	if (lbytes>0)

   535 	{

   536 	    for (s=g_utf8_prev_char(lines[j]+lbytes);

   537 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;

   538 	      s=g_utf8_prev_char(s))

   539 		;

   540 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&

   541 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

   542 		results.hyphens++;

   543 	}

   544 	if (llen>LONGEST_PG_LINE)

   545 	    results.longline++;

   546 	if (llen>WAY_TOO_LONG)

   547 	    results.verylongline++;

   548 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))

   549 	{

   550 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);

   551 	    if (i>0)

   552 		results.htmcount++;

   553 	    if (strstr(lines[j],"<i>"))

   554 		results.htmcount+=4; /* bonus marks! */

   555 	}

   556 	/* Check for spaced em-dashes */

   557 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))

   558 	{

   559 	    results.emdash++;

   560 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)

   561 		results.space_emdash++;

   562 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)

   563 		/* count of em-dashes with spaces both sides */

   564 		results.non_PG_space_emdash++;

   565 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)

   566 		/* count of PG-type em-dashes with no spaces */

   567 		results.PG_space_emdash++;

   568 	}

   569 	for (s=lines[j];*s;)

   570 	{

   571 	    inword=getaword(&s);

   572 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   573 		results.Dutchcount++;

   574 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   575 		results.Frenchcount++;

   576 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   577 		results.standalone_digit++;

   578 	    g_free(inword);

   579 	}

   580 	/* Check for spaced dashes */

   581 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')

   582 	    results.spacedash++;

   583 	lastblen=lastlen;

   584 	lastlen=llen;

   585 	laststart=lines[j][0];

   586     }

   587     g_strfreev(lines);

   588     return &results;

   589 }

   591 /*

   592  * report_first_pass:

   593  *

   594  * Make some snap decisions based on the first pass results.

   595  */

   596 struct warnings *report_first_pass(struct first_pass_results *results)

   597 {

   598     static struct warnings warnings={0};

   599     if (cnt_spacend>0)

   600 	g_print("   --> %ld lines in this file have white space at end\n",

   601 	  cnt_spacend);

   602     warnings.dotcomma=1;

   603     if (results->dotcomma>5)

   604     {

   605 	warnings.dotcomma=0;

   606 	g_print("   --> %ld lines in this file contain '.,'. "

   607 	  "Not reporting them.\n",results->dotcomma);

   608     }

   609     /*

   610      * If more than 50 lines, or one-tenth, are short,

   611      * don't bother reporting them.

   612      */

   613     warnings.shortline=1;

   614     if (results->shortline>50 || results->shortline*10>linecnt)

   615     {

   616 	warnings.shortline=0;

   617 	g_print("   --> %ld lines in this file are short. "

   618 	  "Not reporting short lines.\n",results->shortline);

   619     }

   620     /*

   621      * If more than 50 lines, or one-tenth, are long,

   622      * don't bother reporting them.

   623      */

   624     warnings.longline=1;

   625     if (results->longline>50 || results->longline*10>linecnt)

   626     {

   627 	warnings.longline=0;

   628 	g_print("   --> %ld lines in this file are long. "

   629 	  "Not reporting long lines.\n",results->longline);

   630     }

   631     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   632     warnings.ast=1;

   633     if (results->astline>10)

   634     {

   635 	warnings.ast=0;

   636 	g_print("   --> %ld lines in this file contain asterisks. "

   637 	  "Not reporting them.\n",results->astline);

   638     }

   639     /*

   640      * If more than 10 lines contain forward slashes,

   641      * don't bother reporting them.

   642      */

   643     warnings.fslash=1;

   644     if (results->fslashline>10)

   645     {

   646 	warnings.fslash=0;

   647 	g_print("   --> %ld lines in this file contain forward slashes. "

   648 	  "Not reporting them.\n",results->fslashline);

   649     }

   650     /*

   651      * If more than 20 lines contain unpunctuated endquotes,

   652      * don't bother reporting them.

   653      */

   654     warnings.endquote=1;

   655     if (results->endquote_count>20)

   656     {

   657 	warnings.endquote=0;

   658 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

   659 	  "Not reporting them.\n",results->endquote_count);

   660     }

   661     /*

   662      * If more than 15 lines contain standalone digits,

   663      * don't bother reporting them.

   664      */

   665     warnings.digit=1;

   666     if (results->standalone_digit>10)

   667     {

   668 	warnings.digit=0;

   669 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

   670 	  "Not reporting them.\n",results->standalone_digit);

   671     }

   672     /*

   673      * If more than 20 lines contain hyphens at end,

   674      * don't bother reporting them.

   675      */

   676     warnings.hyphen=1;

   677     if (results->hyphens>20)

   678     {

   679 	warnings.hyphen=0;

   680 	g_print("   --> %ld lines in this file have hyphens at end. "

   681 	  "Not reporting them.\n",results->hyphens);

   682     }

   683     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   684     {

   685 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   686 	pswit[MARKUP_SWITCH]=1;

   687     }

   688     if (results->verylongline>0)

   689 	g_print("   --> %ld lines in this file are VERY long!\n",

   690 	  results->verylongline);

   691     /*

   692      * If there are more non-PG spaced dashes than PG em-dashes,

   693      * assume it's deliberate.

   694      * Current PG guidelines say don't use them, but older texts do,

   695      * and some people insist on them whatever the guidelines say.

   696      */

   697     warnings.dash=1;

   698     if (results->spacedash+results->non_PG_space_emdash>

   699       results->PG_space_emdash)

   700     {

   701 	warnings.dash=0;

   702 	g_print("   --> There are %ld spaced dashes and em-dashes. "

   703 	  "Not reporting them.\n",

   704 	  results->spacedash+results->non_PG_space_emdash);

   705     }

   706     /* If more than a quarter of characters are hi-bit, bug out. */

   707     warnings.bin=1;

   708     if (results->binlen*4>results->totlen)

   709     {

   710 	g_print("   --> This file does not appear to be ASCII. "

   711 	  "Terminating. Best of luck with it!\n");

   712 	exit(1);

   713     }

   714     if (results->alphalen*4<results->totlen)

   715     {

   716 	g_print("   --> This file does not appear to be text. "

   717 	  "Terminating. Best of luck with it!\n");

   718 	exit(1);

   719     }

   720     if (results->binlen*100>results->totlen || results->binlen>100)

   721     {

   722 	g_print("   --> There are a lot of foreign letters here. "

   723 	  "Not reporting them.\n");

   724 	warnings.bin=0;

   725     }

   726     warnings.isDutch=FALSE;

   727     if (results->Dutchcount>50)

   728     {

   729 	warnings.isDutch=TRUE;

   730 	g_print("   --> This looks like Dutch - "

   731 	  "switching off dashes and warnings for 's Middags case.\n");

   732     }

   733     warnings.isFrench=FALSE;

   734     if (results->Frenchcount>50)

   735     {

   736 	warnings.isFrench=TRUE;

   737 	g_print("   --> This looks like French - "

   738 	  "switching off some doublepunct.\n");

   739     }

   740     if (results->firstline && results->footerline)

   741 	g_print("    The PG header and footer appear to be already on.\n");

   742     else

   743     {

   744 	if (results->firstline)

   745 	    g_print("    The PG header is on - no footer.\n");

   746 	if (results->footerline)

   747 	    g_print("    The PG footer is on - no header.\n");

   748     }

   749     g_print("\n");

   750     if (pswit[VERBOSE_SWITCH])

   751     {

   752 	warnings.bin=1;

   753 	warnings.shortline=1;

   754 	warnings.dotcomma=1;

   755 	warnings.longline=1;

   756 	warnings.dash=1;

   757 	warnings.digit=1;

   758 	warnings.ast=1;

   759 	warnings.fslash=1;

   760 	warnings.hyphen=1;

   761 	warnings.endquote=1;

   762 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

   763     }

   764     if (warnings.isDutch)

   765 	warnings.dash=0;

   766     if (results->footerline>0 && results->firstline>0 &&

   767       results->footerline>results->firstline &&

   768       results->footerline-results->firstline<100)

   769     {

   770 	g_print("   --> I don't really know where this text starts. \n");

   771 	g_print("       There are no reference points.\n");

   772 	g_print("       I'm going to have to report the header and footer "

   773 	  "as well.\n");

   774 	results->firstline=0;

   775     }

   776     return &warnings;

   777 }

   779 /*

   780  * analyse_quotes:

   781  *

   782  * Look along the line, accumulate the count of quotes, and see

   783  * if this is an empty line - i.e. a line with nothing on it

   784  * but spaces.

   785  * If line has just spaces, period, * and/or - on it, don't

   786  * count it, since empty lines with asterisks or dashes to

   787  * separate sections are common.

   788  *

   789  * Returns: TRUE if the line is empty.

   790  */

   791 gboolean analyse_quotes(const char *aline,struct counters *counters)

   792 {

   793     int guessquote=0;

   794     /* assume the line is empty until proven otherwise */

   795     gboolean isemptyline=TRUE;

   796     const char *s=aline,*sprev,*snext;

   797     gunichar c;

   798     sprev=NULL;

   799     while (*s)

   800     {

   801 	snext=g_utf8_next_char(s);

   802 	c=g_utf8_get_char(s);

   803 	if (c==CHAR_DQUOTE)

   804 	    counters->quot++;

   805 	if (CHAR_IS_SQUOTE(c))

   806 	{

   807 	    if (s==aline)

   808 	    {

   809 		/*

   810 		 * At start of line, it can only be an openquote.

   811 		 * Hardcode a very common exception!

   812 		 */

   813 		if (!g_str_has_prefix(snext,"tis") &&

   814 		  !g_str_has_prefix(snext,"Tis"))

   815 		    increment_matching(counters,c,TRUE);

   816 	    }

   817 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

   818 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   819 		/* Do nothing! it's definitely an apostrophe, not a quote */

   820 		;

   821 	    /* it's outside a word - let's check it out */

   822 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||

   823 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   824 	    {

   825 		/* it damwell better BE an openquote */

   826 		if (!g_str_has_prefix(snext,"tis") &&

   827 		  !g_str_has_prefix(snext,"Tis"))

   828 		    /* hardcode a very common exception! */

   829 		    increment_matching(counters,c,TRUE);

   830 	    }

   831 	    else

   832 	    {

   833 		/* now - is it a closequote? */

   834 		guessquote=0;   /* accumulate clues */

   835 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

   836 		{

   837 		    /* it follows a letter - could be either */

   838 		    guessquote++;

   839 		    if (g_utf8_get_char(sprev)=='s')

   840 		    {

   841 			/* looks like a plural apostrophe */

   842 			guessquote-=3;

   843 			if (g_utf8_get_char(snext)==CHAR_SPACE)

   844 			    /* bonus marks! */

   845 			    guessquote-=2;

   846 		    }

   847 		}

   848 		/* it doesn't have a letter either side */

   849 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&

   850 		  strchr(".?!,;: ",g_utf8_get_char(snext)))

   851 		    guessquote+=8; /* looks like a closequote */

   852 		else

   853 		    guessquote++;

   854 		if (matching_difference(counters,CHAR_SQUOTE)>0)

   855 		    /*

   856 		     * Give it the benefit of some doubt,

   857 		     * if a squote is already open.

   858 		     */

   859 		    guessquote++;

   860 		else

   861 		    guessquote--;

   862 		if (guessquote>=0)

   863 		    increment_matching(counters,c,FALSE);

   864 	    }

   865 	}

   866 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

   867 	  c!='\r' && c!='\n')

   868 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

   869 	if (c==CHAR_UNDERSCORE)

   870 	    counters->c_unders++;

   871 	if (c==CHAR_OPEN_SBRACK)

   872 	{

   873 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&

   874 	      !matching_difference(counters,c) && s==aline &&

   875 	      g_str_has_prefix(s,"[Illustration:"))

   876 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);

   877 	    else

   878 		increment_matching(counters,c,TRUE);

   879 	}

   880 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)

   881 	    increment_matching(counters,c,TRUE);

   882 	if (c==CHAR_CLOSE_SBRACK)

   883 	{

   884 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&

   885 	      !matching_difference(counters,c) && !*snext)

   886 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);

   887 	    else

   888 		increment_matching(counters,c,FALSE);

   889 	}

   890 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)

   891 	    increment_matching(counters,c,FALSE);

   892 	sprev=s;

   893 	s=snext;

   894     }

   895     return isemptyline;

   896 }

   898 /*

   899  * check_for_control_characters:

   900  *

   901  * Check for invalid or questionable characters in the line

   902  * Anything above 127 is invalid for plain ASCII, and

   903  * non-printable control characters should also be flagged.

   904  * Tabs should generally not be there.

   905  */

   906 void check_for_control_characters(const char *aline)

   907 {

   908     gunichar c;

   909     const char *s;

   910     for (s=aline;*s;s=g_utf8_next_char(s))

   911     {

   912 	c=g_utf8_get_char(s);

   913 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

   914 	{

   915 	    if (pswit[ECHO_SWITCH])

   916 		g_print("\n%s\n",aline);

   917 	    if (!pswit[OVERVIEW_SWITCH])

   918 		g_print("    Line %ld column %ld - Control character %u\n",

   919 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

   920 	    else

   921 		cnt_bin++;

   922 	}

   923     }

   924 }

   926 /*

   927  * check_for_odd_characters:

   928  *

   929  * Check for binary and other odd characters.

   930  */

   931 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

   932   gboolean isemptyline)

   933 {

   934     /* Don't repeat multiple warnings on one line. */

   935     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;

   936     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

   937     const char *s;

   938     gunichar c;

   939     for (s=aline;*s;s=g_utf8_next_char(s))

   940     {

   941 	c=g_utf8_get_char(s);

   942 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

   943 	{

   944 	    if (pswit[ECHO_SWITCH])

   945 		g_print("\n%s\n",aline);

   946 	    if (!pswit[OVERVIEW_SWITCH])

   947 		if (c>127 && c<160 || c>255)

   948 		    g_print("    Line %ld column %ld - "

   949 		      "Non-ISO-8859 character %u\n",

   950 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

   951 		else

   952 		    g_print("    Line %ld column %ld - "

   953 		      "Non-ASCII character %u\n",

   954 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

   955 	    else

   956 		cnt_bin++;

   957 	    eNon_A=TRUE;

   958 	}

   959 	if (!eTab && c==CHAR_TAB)

   960 	{

   961 	    if (pswit[ECHO_SWITCH])

   962 		g_print("\n%s\n",aline);

   963 	    if (!pswit[OVERVIEW_SWITCH])

   964 		g_print("    Line %ld column %ld - Tab character?\n",

   965 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

   966 	    else

   967 		cnt_odd++;

   968 	    eTab=TRUE;

   969 	}

   970 	if (!eTilde && c==CHAR_TILDE)

   971 	{

   972 	    /*

   973 	     * Often used by OCR software to indicate an

   974 	     * unrecognizable character.

   975 	     */

   976 	    if (pswit[ECHO_SWITCH])

   977 		g_print("\n%s\n",aline);

   978 	    if (!pswit[OVERVIEW_SWITCH])

   979 		g_print("    Line %ld column %ld - Tilde character?\n",

   980 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

   981 	    else

   982 		cnt_odd++;

   983 	    eTilde=TRUE;

   984 	}

   985 	if (!eCarat && c==CHAR_CARAT)

   986 	{

   987 	    if (pswit[ECHO_SWITCH])

   988 		g_print("\n%s\n",aline);

   989 	    if (!pswit[OVERVIEW_SWITCH])

   990 		g_print("    Line %ld column %ld - Carat character?\n",

   991 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

   992 	    else

   993 		cnt_odd++;

   994 	    eCarat=TRUE;

   995 	}

   996 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

   997 	{

   998 	    if (pswit[ECHO_SWITCH])

   999 		g_print("\n%s\n",aline);

  1000 	    if (!pswit[OVERVIEW_SWITCH])

  1001 		g_print("    Line %ld column %ld - Forward slash?\n",

  1002 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1003 	    else

  1004 		cnt_odd++;

  1005 	    eFSlash=TRUE;

  1006 	}

  1007 	/*

  1008 	 * Report asterisks only in paranoid mode,

  1009 	 * since they're often deliberate.

  1010 	 */

  1011 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1012 	  c==CHAR_ASTERISK)

  1013 	{

  1014 	    if (pswit[ECHO_SWITCH])

  1015 		g_print("\n%s\n",aline);

  1016 	    if (!pswit[OVERVIEW_SWITCH])

  1017 		g_print("    Line %ld column %ld - Asterisk?\n",

  1018 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1019 	    else

  1020 		cnt_odd++;

  1021 	    eAst=TRUE;

  1022 	}

  1023     }

  1024 }

  1026 /*

  1027  * check_for_long_line:

  1028  *

  1029  * Check for line too long.

  1030  */

  1031 void check_for_long_line(const char *aline)

  1032 {

  1033     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1034     {

  1035 	if (pswit[ECHO_SWITCH])

  1036 	    g_print("\n%s\n",aline);

  1037 	if (!pswit[OVERVIEW_SWITCH])

  1038 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1039 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1040 	else

  1041 	    cnt_long++;

  1042     }

  1043 }

  1045 /*

  1046  * check_for_short_line:

  1047  *

  1048  * Check for line too short.

  1049  *

  1050  * This one is a bit trickier to implement: we don't want to

  1051  * flag the last line of a paragraph for being short, so we

  1052  * have to wait until we know that our current line is a

  1053  * "normal" line, then report the _previous_ line if it was too

  1054  * short. We also don't want to report indented lines like

  1055  * chapter heads or formatted quotations. We therefore keep

  1056  * last->len as the length of the last line examined, and

  1057  * last->blen as the length of the last but one, and try to

  1058  * suppress unnecessary warnings by checking that both were of

  1059  * "normal" length. We keep the first character of the last

  1060  * line in last->start, and if it was a space, we assume that

  1061  * the formatting is deliberate. I can't figure out a way to

  1062  * distinguish something like a quoted verse left-aligned or

  1063  * the header or footer of a letter from a paragraph of short

  1064  * lines - maybe if I examined the whole paragraph, and if the

  1065  * para has less than, say, 8 lines and if all lines are short,

  1066  * then just assume it's OK? Need to look at some texts to see

  1067  * how often a formula like this would get the right result.

  1068  */

  1069 void check_for_short_line(const char *aline,const struct line_properties *last)

  1070 {

  1071     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1072       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1073       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1074     {

  1075 	if (pswit[ECHO_SWITCH])

  1076 	    g_print("\n%s\n",prevline);

  1077 	if (!pswit[OVERVIEW_SWITCH])

  1078 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1079 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1080 	else

  1081 	    cnt_short++;

  1082     }

  1083 }

  1085 /*

  1086  * check_for_starting_punctuation:

  1087  *

  1088  * Look for punctuation other than full ellipses at start of line.

  1089  */

  1090 void check_for_starting_punctuation(const char *aline)

  1091 {

  1092     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1093       !g_str_has_prefix(aline,". . ."))

  1094     {

  1095 	if (pswit[ECHO_SWITCH])

  1096 	    g_print("\n%s\n",aline);

  1097 	if (!pswit[OVERVIEW_SWITCH])

  1098 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1099 	      linecnt);

  1100 	else

  1101 	    cnt_punct++;

  1102     }

  1103 }

  1105 /*

  1106  * check_for_spaced_emdash:

  1107  *

  1108  * Check for spaced em-dashes.

  1109  *

  1110  * We must check _all_ occurrences of "--" on the line

  1111  * hence the loop - even if the first double-dash is OK

  1112  * there may be another that's wrong later on.

  1113  */

  1114 void check_for_spaced_emdash(const char *aline)

  1115 {

  1116     const char *s,*t,*next;

  1117     for (s=aline;t=strstr(s,"--");s=next)

  1118     {

  1119 	next=g_utf8_next_char(g_utf8_next_char(t));

  1120 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1121 	  g_utf8_get_char(next)==CHAR_SPACE)

  1122 	{

  1123 	    if (pswit[ECHO_SWITCH])

  1124 		g_print("\n%s\n",aline);

  1125 	    if (!pswit[OVERVIEW_SWITCH])

  1126 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1127 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1128 	    else

  1129 		cnt_dash++;

  1130 	}

  1131     }

  1132 }

  1134 /*

  1135  * check_for_spaced_dash:

  1136  *

  1137  * Check for spaced dashes.

  1138  */

  1139 void check_for_spaced_dash(const char *aline)

  1140 {

  1141     const char *s;

  1142     if ((s=strstr(aline," -")))

  1143     {

  1144 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1145 	{

  1146 	    if (pswit[ECHO_SWITCH])

  1147 		g_print("\n%s\n",aline);

  1148 	    if (!pswit[OVERVIEW_SWITCH])

  1149 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1150 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1151 	    else

  1152 		cnt_dash++;

  1153 	}

  1154     }

  1155     else if ((s=strstr(aline,"- ")))

  1156     {

  1157 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1158 	{

  1159 	    if (pswit[ECHO_SWITCH])

  1160 		g_print("\n%s\n",aline);

  1161 	    if (!pswit[OVERVIEW_SWITCH])

  1162 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1163 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1164 	    else

  1165 		cnt_dash++;

  1166 	}

  1167     }

  1168 }

  1170 /*

  1171  * check_for_unmarked_paragraphs:

  1172  *

  1173  * Check for unmarked paragraphs indicated by separate speakers.

  1174  *

  1175  * May well be false positive:

  1176  * "Bravo!" "Wonderful!" called the crowd.

  1177  * but useful all the same.

  1178  */

  1179 void check_for_unmarked_paragraphs(const char *aline)

  1180 {

  1181     const char *s;

  1182     s=strstr(aline,"\"  \"");

  1183     if (!s)

  1184 	s=strstr(aline,"\" \"");

  1185     if (s)

  1186     {

  1187 	if (pswit[ECHO_SWITCH])

  1188 	    g_print("\n%s\n",aline);

  1189 	if (!pswit[OVERVIEW_SWITCH])

  1190 	    g_print("    Line %ld column %ld - "

  1191 	      "Query missing paragraph break?\n",

  1192 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1193 	else

  1194 	    cnt_punct++;

  1195     }

  1196 }

  1198 /*

  1199  * check_for_jeebies:

  1200  *

  1201  * Check for "to he" and other easy h/b errors.

  1202  *

  1203  * This is a very inadequate effort on the h/b problem,

  1204  * but the phrase "to he" is always an error, whereas "to

  1205  * be" is quite common.

  1206  * Similarly, '"Quiet!", be said.' is a non-be error

  1207  * "to he" is _not_ always an error!:

  1208  *       "Where they went to he couldn't say."

  1209  * Another false positive:

  1210  *       What would "Cinderella" be without the . . .

  1211  * and another: "If he wants to he can see for himself."

  1212  */

  1213 void check_for_jeebies(const char *aline)

  1214 {

  1215     const char *s;

  1216     s=strstr(aline," be could ");

  1217     if (!s)

  1218 	s=strstr(aline," be would ");

  1219     if (!s)

  1220 	s=strstr(aline," was be ");

  1221     if (!s)

  1222 	s=strstr(aline," be is ");

  1223     if (!s)

  1224 	s=strstr(aline," is be ");

  1225     if (!s)

  1226 	s=strstr(aline,"\", be ");

  1227     if (!s)

  1228 	s=strstr(aline,"\" be ");

  1229     if (!s)

  1230 	s=strstr(aline,"\" be ");

  1231     if (!s)

  1232 	s=strstr(aline," to he ");

  1233     if (s)

  1234     {

  1235 	if (pswit[ECHO_SWITCH])

  1236 	    g_print("\n%s\n",aline);

  1237 	if (!pswit[OVERVIEW_SWITCH])

  1238 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1239 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1240 	else

  1241 	    cnt_word++;

  1242     }

  1243     s=strstr(aline," the had ");

  1244     if (!s)

  1245 	s=strstr(aline," a had ");

  1246     if (!s)

  1247 	s=strstr(aline," they bad ");

  1248     if (!s)

  1249 	s=strstr(aline," she bad ");

  1250     if (!s)

  1251 	s=strstr(aline," he bad ");

  1252     if (!s)

  1253 	s=strstr(aline," you bad ");

  1254     if (!s)

  1255 	s=strstr(aline," i bad ");

  1256     if (s)

  1257     {

  1258 	if (pswit[ECHO_SWITCH])

  1259 	    g_print("\n%s\n",aline);

  1260 	if (!pswit[OVERVIEW_SWITCH])

  1261 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1262 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1263 	else

  1264 	    cnt_word++;

  1265     }

  1266     s=strstr(aline,"; hut ");

  1267     if (!s)

  1268 	s=strstr(aline,", hut ");

  1269     if (s)

  1270     {

  1271 	if (pswit[ECHO_SWITCH])

  1272 	    g_print("\n%s\n",aline);

  1273 	if (!pswit[OVERVIEW_SWITCH])

  1274 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1275 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1276 	else

  1277 	    cnt_word++;

  1278     }

  1279 }

  1281 /*

  1282  * check_for_mta_from:

  1283  *

  1284  * Special case - angled bracket in front of "From" placed there by an

  1285  * MTA when sending an e-mail.

  1286  */

  1287 void check_for_mta_from(const char *aline)

  1288 {

  1289     const char *s;

  1290     s=strstr(aline,">From");

  1291     if (s)

  1292     {

  1293 	if (pswit[ECHO_SWITCH])

  1294 	    g_print("\n%s\n",aline);

  1295 	if (!pswit[OVERVIEW_SWITCH])

  1296 	    g_print("    Line %ld column %ld - "

  1297 	      "Query angled bracket with From\n",

  1298 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1299 	else

  1300 	    cnt_punct++;

  1301     }

  1302 }

  1304 /*

  1305  * check_for_orphan_character:

  1306  *

  1307  * Check for a single character line -

  1308  * often an overflow from bad wrapping.

  1309  */

  1310 void check_for_orphan_character(const char *aline)

  1311 {

  1312     gunichar c;

  1313     c=g_utf8_get_char(aline);

  1314     if (c && !*g_utf8_next_char(aline))

  1315     {

  1316 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1317 	    ; /* Nothing - ignore numerals alone on a line. */

  1318 	else

  1319 	{

  1320 	    if (pswit[ECHO_SWITCH])

  1321 		g_print("\n%s\n",aline);

  1322 	    if (!pswit[OVERVIEW_SWITCH])

  1323 		g_print("    Line %ld column 1 - Query single character line\n",

  1324 		  linecnt);

  1325 	    else

  1326 		cnt_punct++;

  1327 	}

  1328     }

  1329 }

  1331 /*

  1332  * check_for_pling_scanno:

  1333  *

  1334  * Check for I" - often should be !

  1335  */

  1336 void check_for_pling_scanno(const char *aline)

  1337 {

  1338     const char *s;

  1339     s=strstr(aline," I\"");

  1340     if (s)

  1341     {

  1342 	if (pswit[ECHO_SWITCH])

  1343 	    g_print("\n%s\n",aline);

  1344 	if (!pswit[OVERVIEW_SWITCH])

  1345 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1346 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1347 	else

  1348 	    cnt_punct++;

  1349     }

  1350 }

  1352 /*

  1353  * check_for_extra_period:

  1354  *

  1355  * Check for period without a capital letter. Cut-down from gutspell.

  1356  * Only works when it happens on a single line.

  1357  */

  1358 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1359 {

  1360     const char *s,*t,*s1,*sprev;

  1361     int i;

  1362     gsize len;

  1363     gboolean istypo;

  1364     gchar *testword;

  1365     gunichar c,nc,pc,*decomposition;

  1366     if (pswit[PARANOID_SWITCH])

  1367     {

  1368 	for (t=aline;t=strstr(t,". ");)

  1369 	{

  1370 	    if (t==aline)

  1371 	    {

  1372 		t=g_utf8_next_char(t);

  1373 		/* start of line punctuation is handled elsewhere */

  1374 		continue;

  1375 	    }

  1376 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1377 	    {

  1378 		t=g_utf8_next_char(t);

  1379 		continue;

  1380 	    }

  1381 	    if (warnings->isDutch)

  1382 	    {

  1383 		/* For Frank & Jeroen -- 's Middags case */

  1384 		gunichar c2,c3,c4,c5;

  1385 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1386 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1387 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1388 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1389 		if (CHAR_IS_APOSTROPHE(c2) &&

  1390 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&

  1391 		  g_unichar_isupper(c5))

  1392 		{

  1393 		    t=g_utf8_next_char(t);

  1394 		    continue;

  1395 		}

  1396 	    }

  1397 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1398 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1399 	      !isdigit(g_utf8_get_char(s1)))

  1400 		s1=g_utf8_next_char(s1);

  1401 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1402 	    {

  1403 		/* we have something to investigate */

  1404 		istypo=TRUE;

  1405 		/* so let's go back and find out */

  1406 		nc=g_utf8_get_char(t);

  1407 		s1=g_utf8_prev_char(t);

  1408 		c=g_utf8_get_char(s1);

  1409 		sprev=g_utf8_prev_char(s1);

  1410 		pc=g_utf8_get_char(sprev);

  1411 		while (s1>=aline &&

  1412 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||

  1413 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&

  1414 		  g_unichar_isalpha(nc)))

  1415 		{

  1416 		    nc=c;

  1417 		    s1=sprev;

  1418 		    c=pc;

  1419 		    sprev=g_utf8_prev_char(s1);

  1420 		    pc=g_utf8_get_char(sprev);

  1421 		}

  1422 		s1=g_utf8_next_char(s1);

  1423 		s=strchr(s1,'.');

  1424 		if (s)

  1425 		    testword=g_strndup(s1,s-s1);

  1426 		else

  1427 		    testword=g_strdup(s1);

  1428 		for (i=0;*abbrev[i];i++)

  1429 		    if (!strcmp(testword,abbrev[i]))

  1430 			istypo=FALSE;

  1431 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1432 		    istypo=FALSE;

  1433 		if (!*g_utf8_next_char(testword))

  1434 		    istypo=FALSE;

  1435 		if (isroman(testword))

  1436 		    istypo=FALSE;

  1437 		if (istypo)

  1438 		{

  1439 		    istypo=FALSE;

  1440 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1441 		    {

  1442 			decomposition=g_unicode_canonical_decomposition(

  1443 			  g_utf8_get_char(s),&len);

  1444 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1445 			    istypo=TRUE;

  1446 			g_free(decomposition);

  1447 		    }

  1448 		}

  1449 		if (istypo &&

  1450 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1451 		{

  1452 		    g_tree_insert(qperiod,g_strdup(testword),

  1453 		      GINT_TO_POINTER(1));

  1454 		    if (pswit[ECHO_SWITCH])

  1455 			g_print("\n%s\n",aline);

  1456 		    if (!pswit[OVERVIEW_SWITCH])

  1457 			g_print("    Line %ld column %ld - Extra period?\n",

  1458 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1459 		    else

  1460 			cnt_punct++;

  1461 		}

  1462 		g_free(testword);

  1463 	    }

  1464 	    t=g_utf8_next_char(t);

  1465 	}

  1466     }

  1467 }

  1469 /*

  1470  * check_for_following_punctuation:

  1471  *

  1472  * Check for words usually not followed by punctuation.

  1473  */

  1474 void check_for_following_punctuation(const char *aline)

  1475 {

  1476     int i;

  1477     const char *s,*wordstart;

  1478     gunichar c;

  1479     gchar *inword,*t;

  1480     if (pswit[TYPO_SWITCH])

  1481     {

  1482 	for (s=aline;*s;)

  1483 	{

  1484 	    wordstart=s;

  1485 	    t=getaword(&s);

  1486 	    if (!*t)

  1487 	    {

  1488 		g_free(t);

  1489 		continue;

  1490 	    }

  1491 	    inword=g_utf8_strdown(t,-1);

  1492 	    g_free(t);

  1493 	    for (i=0;*nocomma[i];i++)

  1494 		if (!strcmp(inword,nocomma[i]))

  1495 		{

  1496 		    c=g_utf8_get_char(s);

  1497 		    if (c==',' || c==';' || c==':')

  1498 		    {

  1499 			if (pswit[ECHO_SWITCH])

  1500 			    g_print("\n%s\n",aline);

  1501 			if (!pswit[OVERVIEW_SWITCH])

  1502 			    g_print("    Line %ld column %ld - "

  1503 			      "Query punctuation after %s?\n",

  1504 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1505 			      inword);

  1506 			else

  1507 			    cnt_punct++;

  1508 		    }

  1509 		}

  1510 	    for (i=0;*noperiod[i];i++)

  1511 		if (!strcmp(inword,noperiod[i]))

  1512 		{

  1513 		    c=g_utf8_get_char(s);

  1514 		    if (c=='.' || c=='!')

  1515 		    {

  1516 			if (pswit[ECHO_SWITCH])

  1517 			    g_print("\n%s\n",aline);

  1518 			if (!pswit[OVERVIEW_SWITCH])

  1519 			    g_print("    Line %ld column %ld - "

  1520 			      "Query punctuation after %s?\n",

  1521 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1522 			      inword);

  1523 			else

  1524 			    cnt_punct++;

  1525 		    }

  1526 		}

  1527 	    g_free(inword);

  1528 	}

  1529     }

  1530 }

  1532 /*

  1533  * check_for_typos:

  1534  *

  1535  * Check for commonly mistyped words,

  1536  * and digits like 0 for O in a word.

  1537  */

  1538 void check_for_typos(const char *aline,struct warnings *warnings)

  1539 {

  1540     const char *s,*t,*nt,*wordstart;

  1541     gchar *inword;

  1542     gunichar *decomposition;

  1543     gchar *testword;

  1544     int i,vowel,consonant,*dupcnt;

  1545     gboolean isdup,istypo,alower;

  1546     gunichar c,pc;

  1547     long offset,len;

  1548     gsize decomposition_len;

  1549     for (s=aline;*s;)

  1550     {

  1551 	wordstart=s;

  1552 	inword=getaword(&s);

  1553 	if (!*inword)

  1554 	{

  1555 	    g_free(inword);

  1556 	    continue; /* don't bother with empty lines */

  1557 	}

  1558 	if (mixdigit(inword))

  1559 	{

  1560 	    if (pswit[ECHO_SWITCH])

  1561 		g_print("\n%s\n",aline);

  1562 	    if (!pswit[OVERVIEW_SWITCH])

  1563 		g_print("    Line %ld column %ld - Query digit in %s\n",

  1564 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  1565 	    else

  1566 		cnt_word++;

  1567 	}

  1568 	/*

  1569 	 * Put the word through a series of tests for likely typos and OCR

  1570 	 * errors.

  1571 	 */

  1572 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1573 	{

  1574 	    istypo=FALSE;

  1575 	    alower=FALSE;

  1576 	    for (t=inword;*t;t=g_utf8_next_char(t))

  1577 	    {

  1578 		c=g_utf8_get_char(t);

  1579 		nt=g_utf8_next_char(t);

  1580 		/* lowercase for testing */

  1581 		if (g_unichar_islower(c))

  1582 		    alower=TRUE;

  1583 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  1584 		{

  1585 		    /*

  1586 		     * We have an uppercase mid-word. However, there are

  1587 		     * common cases:

  1588 		     *   Mac and Mc like McGill

  1589 		     *   French contractions like l'Abbe

  1590 		     */

  1591 		    offset=g_utf8_pointer_to_offset(inword,t);

  1592 		    if (offset>0)

  1593 			pc=g_utf8_get_char(g_utf8_prev_char(t));

  1594 		    else

  1595 			pc='\0';

  1596 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  1597 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  1598 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  1599 		      CHAR_IS_APOSTROPHE(pc))

  1600 			; /* do nothing! */

  1601 		    else

  1602 			istypo=TRUE;

  1603 		}

  1604 	    }

  1605 	    testword=g_utf8_casefold(inword,-1);

  1606 	}

  1607 	if (pswit[TYPO_SWITCH])

  1608 	{

  1609 	    /*

  1610 	     * Check for certain unlikely two-letter combinations at word

  1611 	     * start and end.

  1612 	     */

  1613 	    len=g_utf8_strlen(testword,-1);

  1614 	    if (len>1)

  1615 	    {

  1616 		for (i=0;*nostart[i];i++)

  1617 		    if (g_str_has_prefix(testword,nostart[i]))

  1618 			istypo=TRUE;

  1619 		for (i=0;*noend[i];i++)

  1620 		    if (g_str_has_suffix(testword,noend[i]))

  1621 			istypo=TRUE;

  1622 	    }

  1623 	    /* ght is common, gbt never. Like that. */

  1624 	    if (strstr(testword,"cb"))

  1625 		istypo=TRUE;

  1626 	    if (strstr(testword,"gbt"))

  1627 		istypo=TRUE;

  1628 	    if (strstr(testword,"pbt"))

  1629 		istypo=TRUE;

  1630 	    if (strstr(testword,"tbs"))

  1631 		istypo=TRUE;

  1632 	    if (strstr(testword,"mrn"))

  1633 		istypo=TRUE;

  1634 	    if (strstr(testword,"ahle"))

  1635 		istypo=TRUE;

  1636 	    if (strstr(testword,"ihle"))

  1637 		istypo=TRUE;

  1638 	    /*

  1639 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1640 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1641 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1642 	     * numerals, but "ii" is a common scanno.

  1643 	     */

  1644 	    if (strstr(testword,"tbi"))

  1645 		istypo=TRUE;

  1646 	    if (strstr(testword,"tbe"))

  1647 		istypo=TRUE;

  1648 	    if (strstr(testword,"ii"))

  1649 		istypo=TRUE;

  1650 	    /*

  1651 	     * Check for no vowels or no consonants.

  1652 	     * If none, flag a typo.

  1653 	     */

  1654 	    if (!istypo && len>1)

  1655 	    {

  1656 		vowel=consonant=0;

  1657 		for (t=testword;*t;t=g_utf8_next_char(t))

  1658 		{

  1659 		    c=g_utf8_get_char(t);

  1660 		    decomposition=

  1661 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  1662 		    if (c=='y' || g_unichar_isdigit(c))

  1663 		    {

  1664 			/* Yah, this is loose. */

  1665 			vowel++;

  1666 			consonant++;

  1667 		    }

  1668 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1669 			vowel++;

  1670 		    else

  1671 			consonant++;

  1672 		    g_free(decomposition);

  1673 		}

  1674 		if (!vowel || !consonant)

  1675 		    istypo=TRUE;

  1676 	    }

  1677 	    /*

  1678 	     * Now exclude the word from being reported if it's in

  1679 	     * the okword list.

  1680 	     */

  1681 	    for (i=0;*okword[i];i++)

  1682 		if (!strcmp(testword,okword[i]))

  1683 		    istypo=FALSE;

  1684 	    /*

  1685 	     * What looks like a typo may be a Roman numeral.

  1686 	     * Exclude these.

  1687 	     */

  1688 	    if (istypo && isroman(testword))

  1689 		istypo=FALSE;

  1690 	    /* Check the manual list of typos. */

  1691 	    if (!istypo)

  1692 		for (i=0;*typo[i];i++)

  1693 		    if (!strcmp(testword,typo[i]))

  1694 			istypo=TRUE;

  1695 	    /*

  1696 	     * Check lowercase s, l, i and m - special cases.

  1697 	     *   "j" - often a semi-colon gone wrong.

  1698 	     *   "d" for a missing apostrophe - he d

  1699 	     *   "n" for "in"

  1700 	     */

  1701 	    if (!istypo && len==1 &&

  1702 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  1703 		istypo=TRUE;

  1704 	    if (istypo)

  1705 	    {

  1706 		dupcnt=g_tree_lookup(qword,testword);

  1707 		if (dupcnt)

  1708 		{

  1709 		    (*dupcnt)++;

  1710 		    isdup=!pswit[VERBOSE_SWITCH];

  1711 		}

  1712 		else

  1713 		{

  1714 		    dupcnt=g_new0(int,1);

  1715 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  1716 		    isdup=FALSE;

  1717 		}

  1718 		if (!isdup)

  1719 		{

  1720 		    if (pswit[ECHO_SWITCH])

  1721 			g_print("\n%s\n",aline);

  1722 		    if (!pswit[OVERVIEW_SWITCH])

  1723 		    {

  1724 			g_print("    Line %ld column %ld - Query word %s",

  1725 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  1726 			  inword);

  1727 			if (!pswit[VERBOSE_SWITCH])

  1728 			    g_print(" - not reporting duplicates");

  1729 			g_print("\n");

  1730 		    }

  1731 		    else

  1732 			cnt_word++;

  1733 		}

  1734 	    }

  1735 	}

  1736 	/* check the user's list of typos */

  1737 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  1738 	{

  1739 	    if (pswit[ECHO_SWITCH])

  1740 		g_print("\n%s\n",aline);

  1741 	    if (!pswit[OVERVIEW_SWITCH])

  1742 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  1743 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  1744 	}

  1745 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1746 	    g_free(testword);

  1747 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1748 	{

  1749 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1750 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1751 	    {

  1752 		if (pswit[ECHO_SWITCH])

  1753 		    g_print("\n%s\n",aline);

  1754 		if (!pswit[OVERVIEW_SWITCH])

  1755 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  1756 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  1757 		      inword);

  1758 		else

  1759 		    cnt_word++;

  1760 	    }

  1761 	}

  1762 	g_free(inword);

  1763     }

  1764 }

  1766 /*

  1767  * check_for_misspaced_punctuation:

  1768  *

  1769  * Look for added or missing spaces around punctuation and quotes.

  1770  * If there is a punctuation character like ! with no space on

  1771  * either side, suspect a missing!space. If there are spaces on

  1772  * both sides , assume a typo. If we see a double quote with no

  1773  * space or punctuation on either side of it, assume unspaced

  1774  * quotes "like"this.

  1775  */

  1776 void check_for_misspaced_punctuation(const char *aline,

  1777   struct parities *parities,gboolean isemptyline)

  1778 {

  1779     gboolean isacro,isellipsis;

  1780     const char *s;

  1781     gunichar c,nc,pc,n2c;

  1782     c=g_utf8_get_char(aline);

  1783     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1784     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1785     {

  1786 	pc=c;

  1787 	c=nc;

  1788 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1789 	/* For each character in the line after the first. */

  1790 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  1791 	{

  1792 	    /* we need to suppress warnings for acronyms like M.D. */

  1793 	    isacro=FALSE;

  1794 	    /* we need to suppress warnings for ellipsis . . . */

  1795 	    isellipsis=FALSE;

  1796 	    /*

  1797 	     * If there are letters on both sides of it or

  1798 	     * if it's strict punctuation followed by an alpha.

  1799 	     */

  1800 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  1801 	      g_utf8_strchr("?!,;:",-1,c)))

  1802 	    {

  1803 		if (c=='.')

  1804 		{

  1805 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1806 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1807 			isacro=TRUE;

  1808 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1809 		    if (nc && n2c=='.')

  1810 			isacro=TRUE;

  1811 		}

  1812 		if (!isacro)

  1813 		{

  1814 		    if (pswit[ECHO_SWITCH])

  1815 			g_print("\n%s\n",aline);

  1816 		    if (!pswit[OVERVIEW_SWITCH])

  1817 			g_print("    Line %ld column %ld - Missing space?\n",

  1818 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1819 		    else

  1820 			cnt_punct++;

  1821 		}

  1822 	    }

  1823 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  1824 	    {

  1825 		/*

  1826 		 * If there are spaces on both sides,

  1827 		 * or space before and end of line.

  1828 		 */

  1829 		if (c=='.')

  1830 		{

  1831 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1832 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1833 			isellipsis=TRUE;

  1834 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1835 		    if (nc && n2c=='.')

  1836 			isellipsis=TRUE;

  1837 		}

  1838 		if (!isemptyline && !isellipsis)

  1839 		{

  1840 		    if (pswit[ECHO_SWITCH])

  1841 			g_print("\n%s\n",aline);

  1842 		    if (!pswit[OVERVIEW_SWITCH])

  1843 			g_print("    Line %ld column %ld - "

  1844 			  "Spaced punctuation?\n",linecnt,

  1845 			  g_utf8_pointer_to_offset(aline,s)+1);

  1846 		    else

  1847 			cnt_punct++;

  1848 		}

  1849 	    }

  1850 	}

  1851     }

  1852     /* Split out the characters that CANNOT be preceded by space. */

  1853     c=g_utf8_get_char(aline);

  1854     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1855     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1856     {

  1857 	pc=c;

  1858 	c=nc;

  1859 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1860 	/* for each character in the line after the first */

  1861 	if (g_utf8_strchr("?!,;:",-1,c))

  1862 	{

  1863 	    /* if it's punctuation that _cannot_ have a space before it */

  1864 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  1865 	    {

  1866 		/*

  1867 		 * If nc DOES == space,

  1868 		 * it was already reported just above.

  1869 		 */

  1870 		if (pswit[ECHO_SWITCH])

  1871 		    g_print("\n%s\n",aline);

  1872 		if (!pswit[OVERVIEW_SWITCH])

  1873 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1874 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1875 		else

  1876 		    cnt_punct++;

  1877 	    }

  1878 	}

  1879     }

  1880     /*

  1881      * Special case " .X" where X is any alpha.

  1882      * This plugs a hole in the acronym code above.

  1883      * Inelegant, but maintainable.

  1884      */

  1885     c=g_utf8_get_char(aline);

  1886     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1887     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1888     {

  1889 	pc=c;

  1890 	c=nc;

  1891 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1892 	/* for each character in the line after the first */

  1893 	if (c=='.')

  1894 	{

  1895 	    /* if it's a period */

  1896 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  1897 	    {

  1898 		/*

  1899 		 * If the period follows a space and

  1900 		 * is followed by a letter.

  1901 		 */

  1902 		if (pswit[ECHO_SWITCH])

  1903 		    g_print("\n%s\n",aline);

  1904 		if (!pswit[OVERVIEW_SWITCH])

  1905 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1906 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1907 		else

  1908 		    cnt_punct++;

  1909 	    }

  1910 	}

  1911     }

  1912     c=g_utf8_get_char(aline);

  1913     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1914     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1915     {

  1916 	pc=c;

  1917 	c=nc;

  1918 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1919 	/* for each character in the line after the first */

  1920 	if (c==CHAR_DQUOTE)

  1921 	{

  1922 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  1923 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  1924 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  1925 	    {

  1926 		if (pswit[ECHO_SWITCH])

  1927 		    g_print("\n%s\n",aline);

  1928 		if (!pswit[OVERVIEW_SWITCH])

  1929 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  1930 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1931 		else

  1932 		    cnt_punct++;

  1933 	    }

  1934 	}

  1935     }

  1936     /* Check parity of quotes. */

  1937     nc=g_utf8_get_char(aline);

  1938     for (s=aline;*s;s=g_utf8_next_char(s))

  1939     {

  1940 	c=nc;

  1941 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1942 	if (c==CHAR_DQUOTE)

  1943 	{

  1944 	    parities->dquote=!parities->dquote;

  1945 	    if (!parities->dquote)

  1946 	    {

  1947 		/* parity even */

  1948 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))

  1949 		{

  1950 		    if (pswit[ECHO_SWITCH])

  1951 			g_print("\n%s\n",aline);

  1952 		    if (!pswit[OVERVIEW_SWITCH])

  1953 			g_print("    Line %ld column %ld - "

  1954 			  "Wrongspaced quotes?\n",

  1955 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1956 		    else

  1957 			cnt_punct++;

  1958 		}

  1959 	    }

  1960 	    else

  1961 	    {

  1962 		/* parity odd */

  1963 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  1964 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)

  1965 		{

  1966 		    if (pswit[ECHO_SWITCH])

  1967 			g_print("\n%s\n",aline);

  1968 		    if (!pswit[OVERVIEW_SWITCH])

  1969 			g_print("    Line %ld column %ld - "

  1970 			  "Wrongspaced quotes?\n",

  1971 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1972 		    else

  1973 			cnt_punct++;

  1974 		}

  1975 	    }

  1976 	}

  1977     }

  1978     if (g_utf8_get_char(aline)==CHAR_DQUOTE)

  1979     {

  1980 	if (g_utf8_strchr(",;:!?)]} ",-1,

  1981 	  g_utf8_get_char(g_utf8_next_char(aline))))

  1982 	{

  1983 	    if (pswit[ECHO_SWITCH])

  1984 		g_print("\n%s\n",aline);

  1985 	    if (!pswit[OVERVIEW_SWITCH])

  1986 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  1987 		  linecnt);

  1988 	    else

  1989 		cnt_punct++;

  1990 	}

  1991     }

  1992     if (pswit[SQUOTE_SWITCH])

  1993     {

  1994 	nc=g_utf8_get_char(aline);

  1995 	for (s=aline;*s;s=g_utf8_next_char(s))

  1996 	{

  1997 	    c=nc;

  1998 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  1999 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&

  2000 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2001 	      !g_unichar_isalpha(nc)))

  2002 	    {

  2003 		parities->squote=!parities->squote;

  2004 		if (!parities->squote)

  2005 		{

  2006 		    /* parity even */

  2007 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2008 		    {

  2009 			if (pswit[ECHO_SWITCH])

  2010 			    g_print("\n%s\n",aline);

  2011 			if (!pswit[OVERVIEW_SWITCH])

  2012 			    g_print("    Line %ld column %ld - "

  2013 			      "Wrongspaced singlequotes?\n",

  2014 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2015 			else

  2016 			    cnt_punct++;

  2017 		    }

  2018 		}

  2019 		else

  2020 		{

  2021 		    /* parity odd */

  2022 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2023 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2024 		    {

  2025 			if (pswit[ECHO_SWITCH])

  2026 			    g_print("\n%s\n",aline);

  2027 			if (!pswit[OVERVIEW_SWITCH])

  2028 			    g_print("    Line %ld column %ld - "

  2029 			      "Wrongspaced singlequotes?\n",

  2030 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2031 			else

  2032 			    cnt_punct++;

  2033 		    }

  2034 		}

  2035 	    }

  2036 	}

  2037     }

  2038 }

  2040 /*

  2041  * check_for_double_punctuation:

  2042  *

  2043  * Look for double punctuation like ,. or ,,

  2044  * Thanks to DW for the suggestion!

  2045  * In books with references, ".," and ".;" are common

  2046  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2047  * OTOH, from my initial tests, there are also fairly

  2048  * common errors. What to do? Make these cases paranoid?

  2049  * ".," is the most common, so warnings->dotcomma is used

  2050  * to suppress detailed reporting if it occurs often.

  2051  */

  2052 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2053 {

  2054     const char *s;

  2055     gunichar c,nc;

  2056     nc=g_utf8_get_char(aline);

  2057     for (s=aline;*s;s=g_utf8_next_char(s))

  2058     {

  2059 	c=nc;

  2060 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2061 	/* for each punctuation character in the line */

  2062 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2063 	  g_utf8_strchr(".?!,;:",-1,nc))

  2064 	{

  2065 	    /* followed by punctuation, it's a query, unless . . . */

  2066 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||

  2067 	      !warnings->dotcomma && c=='.' && nc==',' ||

  2068 	      warnings->isFrench && g_str_has_prefix(s,",...") ||

  2069 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2070 	      warnings->isFrench && g_str_has_prefix(s,";...") ||

  2071 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2072 	      warnings->isFrench && g_str_has_prefix(s,":...") ||

  2073 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2074 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2075 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2076 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2077 	      warnings->isFrench && g_str_has_prefix(s,"...?"))

  2078 	    {

  2079 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||

  2080 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2081 		  warnings->isFrench && g_str_has_prefix(s,";...") ||

  2082 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2083 		  warnings->isFrench && g_str_has_prefix(s,":...") ||

  2084 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2085 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2086 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2087 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2088 		  warnings->isFrench && g_str_has_prefix(s,"...?"))

  2089 		{

  2090 		    s+=4;

  2091 		    nc=g_utf8_get_char(g_utf8_next_char(s));

  2092 		}

  2093 		; /* do nothing for .. !! and ?? which can be legit */

  2094 	    }

  2095 	    else

  2096 	    {

  2097 		if (pswit[ECHO_SWITCH])

  2098 		    g_print("\n%s\n",aline);

  2099 		if (!pswit[OVERVIEW_SWITCH])

  2100 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2101 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2102 		else

  2103 		    cnt_punct++;

  2104 	    }

  2105 	}

  2106     }

  2107 }

  2109 /*

  2110  * check_for_spaced_quotes:

  2111  */

  2112 void check_for_spaced_quotes(const char *aline)

  2113 {

  2114     int i;

  2115     const char *s,*t;

  2116     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,

  2117       CHAR_RS_QUOTE};

  2118     GString *pattern;

  2119     s=aline;

  2120     while ((t=strstr(s," \" ")))

  2121     {

  2122 	if (pswit[ECHO_SWITCH])

  2123 	    g_print("\n%s\n",aline);

  2124 	if (!pswit[OVERVIEW_SWITCH])

  2125 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2126 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2127 	else

  2128 	    cnt_punct++;

  2129 	s=g_utf8_next_char(g_utf8_next_char(t));

  2130     }

  2131     pattern=g_string_new(NULL);

  2132     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)

  2133     {

  2134 	g_string_assign(pattern," ");

  2135 	g_string_append_unichar(pattern,single_quotes[i]);

  2136 	g_string_append_c(pattern,' ');

  2137 	s=aline;

  2138 	while ((t=strstr(s,pattern->str)))

  2139 	{

  2140 	    if (pswit[ECHO_SWITCH])

  2141 		g_print("\n%s\n",aline);

  2142 	    if (!pswit[OVERVIEW_SWITCH])

  2143 		g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2144 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2145 	    else

  2146 		cnt_punct++;

  2147 	    s=g_utf8_next_char(g_utf8_next_char(t));

  2148 	}

  2149     }

  2150     g_string_free(pattern,TRUE);

  2151 }

  2153 /*

  2154  * check_for_miscased_genative:

  2155  *

  2156  * Check special case of 'S instead of 's at end of word.

  2157  */

  2158 void check_for_miscased_genative(const char *aline)

  2159 {

  2160     const char *s;

  2161     gunichar c,nc,pc;

  2162     if (!*aline)

  2163 	return;

  2164     c=g_utf8_get_char(aline);

  2165     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2166     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2167     {

  2168 	pc=c;

  2169 	c=nc;

  2170 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2171 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))

  2172 	{

  2173 	    if (pswit[ECHO_SWITCH])

  2174 		g_print("\n%s\n",aline);

  2175 	    if (!pswit[OVERVIEW_SWITCH])

  2176 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2177 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2178 	    else

  2179 		cnt_punct++;

  2180 	}

  2181     }

  2182 }

  2184 /*

  2185  * check_end_of_line:

  2186  *

  2187  * Now check special cases - start and end of line -

  2188  * for single and double quotes. Start is sometimes [sic]

  2189  * but better to query it anyway.

  2190  * While we're here, check for dash at end of line.

  2191  */

  2192 void check_end_of_line(const char *aline,struct warnings *warnings)

  2193 {

  2194     int lbytes;

  2195     const char *s;

  2196     gunichar c1,c2;

  2197     lbytes=strlen(aline);

  2198     if (g_utf8_strlen(aline,lbytes)>1)

  2199     {

  2200 	s=g_utf8_prev_char(aline+lbytes);

  2201 	c1=g_utf8_get_char(s);

  2202 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2203 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)

  2204 	{

  2205 	    if (pswit[ECHO_SWITCH])

  2206 		g_print("\n%s\n",aline);

  2207 	    if (!pswit[OVERVIEW_SWITCH])

  2208 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2209 		  g_utf8_strlen(aline,lbytes));

  2210 	    else

  2211 		cnt_punct++;

  2212 	}

  2213 	c1=g_utf8_get_char(aline);

  2214 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2215 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)

  2216 	{

  2217 	    if (pswit[ECHO_SWITCH])

  2218 		g_print("\n%s\n",aline);

  2219 	    if (!pswit[OVERVIEW_SWITCH])

  2220 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2221 	    else

  2222 		cnt_punct++;

  2223 	}

  2224 	/*

  2225 	 * Dash at end of line may well be legit - paranoid mode only

  2226 	 * and don't report em-dash at line-end.

  2227 	 */

  2228 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2229 	{

  2230 	    for (s=g_utf8_prev_char(aline+lbytes);

  2231 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2232 		;

  2233 	    if (g_utf8_get_char(s)=='-' &&

  2234 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2235 	    {

  2236 		if (pswit[ECHO_SWITCH])

  2237 		    g_print("\n%s\n",aline);

  2238 		if (!pswit[OVERVIEW_SWITCH])

  2239 		    g_print("    Line %ld column %ld - "

  2240 		      "Hyphen at end of line?\n",

  2241 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2242 	    }

  2243 	}

  2244     }

  2245 }

  2247 /*

  2248  * check_for_unspaced_bracket:

  2249  *

  2250  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2251  * If so, suspect a scanno like "a]most".

  2252  */

  2253 void check_for_unspaced_bracket(const char *aline)

  2254 {

  2255     const char *s;

  2256     gunichar c,nc,pc;

  2257     c=g_utf8_get_char(aline);

  2258     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2259     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2260     {

  2261 	pc=c;

  2262 	c=nc;

  2263 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2264 	if (!nc)

  2265 	    break;

  2266 	/* for each bracket character in the line except 1st & last */

  2267 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2268 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2269 	{

  2270 	    if (pswit[ECHO_SWITCH])

  2271 		g_print("\n%s\n",aline);

  2272 	    if (!pswit[OVERVIEW_SWITCH])

  2273 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2274 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2275 	    else

  2276 		cnt_punct++;

  2277 	}

  2278     }

  2279 }

  2281 /*

  2282  * check_for_unpunctuated_endquote:

  2283  */

  2284 void check_for_unpunctuated_endquote(const char *aline)

  2285 {

  2286     const char *s;

  2287     gunichar c,nc,pc;

  2288     c=g_utf8_get_char(aline);

  2289     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2290     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2291     {

  2292 	pc=c;

  2293 	c=nc;

  2294 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2295 	/* for each character in the line except 1st */

  2296 	if (c==CHAR_DQUOTE && isalpha(pc))

  2297 	{

  2298 	    if (pswit[ECHO_SWITCH])

  2299 		g_print("\n%s\n",aline);

  2300 	    if (!pswit[OVERVIEW_SWITCH])

  2301 		g_print("    Line %ld column %ld - "

  2302 		  "endquote missing punctuation?\n",

  2303 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2304 	    else

  2305 		cnt_punct++;

  2306 	}

  2307     }

  2308 }

  2310 /*

  2311  * check_for_html_tag:

  2312  *

  2313  * Check for <HTML TAG>.

  2314  *

  2315  * If there is a < in the line, followed at some point

  2316  * by a > then we suspect HTML.

  2317  */

  2318 void check_for_html_tag(const char *aline)

  2319 {

  2320     const char *open,*close;

  2321     gchar *tag;

  2322     open=strchr(aline,'<');

  2323     if (open)

  2324     {

  2325 	close=strchr(g_utf8_next_char(open),'>');

  2326 	if (close)

  2327 	{

  2328 	    if (pswit[ECHO_SWITCH])

  2329 		g_print("\n%s\n",aline);

  2330 	    if (!pswit[OVERVIEW_SWITCH])

  2331 	    {

  2332 		tag=g_strndup(open,close-open+1);

  2333 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2334 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2335 		g_free(tag);

  2336 	    }

  2337 	    else

  2338 		cnt_html++;

  2339 	}

  2340     }

  2341 }

  2343 /*

  2344  * check_for_html_entity:

  2345  *

  2346  * Check for &symbol; HTML.

  2347  *

  2348  * If there is a & in the line, followed at

  2349  * some point by a ; then we suspect HTML.

  2350  */

  2351 void check_for_html_entity(const char *aline)

  2352 {

  2353     const char *s,*amp,*scolon;

  2354     gchar *entity;

  2355     amp=strchr(aline,'&');

  2356     if (amp)

  2357     {

  2358 	scolon=strchr(amp,';');

  2359 	if (scolon)

  2360 	{

  2361 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2362 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2363 		    break;		/* Don't report "Jones & Son;" */

  2364 	    if (s>=scolon)

  2365 	    {

  2366 		if (pswit[ECHO_SWITCH])

  2367 		    g_print("\n%s\n",aline);

  2368 		if (!pswit[OVERVIEW_SWITCH])

  2369 		{

  2370 		    entity=g_strndup(amp,scolon-amp+1);

  2371 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2372 		      linecnt,(int)(amp-aline)+1,entity);

  2373 		    g_free(entity);

  2374 		}

  2375 		else

  2376 		    cnt_html++;

  2377 	    }

  2378 	}

  2379     }

  2380 }

  2382 /*

  2383  * check_for_omitted_punctuation:

  2384  *

  2385  * Check for omitted punctuation at end of paragraph by working back

  2386  * through prevline. DW.

  2387  * Need to check this only for "normal" paras.

  2388  * So what is a "normal" para?

  2389  *    Not normal if one-liner (chapter headings, etc.)

  2390  *    Not normal if doesn't contain at least one locase letter

  2391  *    Not normal if starts with space

  2392  */

  2393 void check_for_omitted_punctuation(const char *prevline,

  2394   struct line_properties *last,int start_para_line)

  2395 {

  2396     gboolean letter_on_line=FALSE;

  2397     const char *s;

  2398     gunichar c;

  2399     for (s=prevline;*s;s=g_utf8_next_char(s))

  2400 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2401 	{

  2402 	    letter_on_line=TRUE;

  2403 	    break;

  2404 	}

  2405     /*

  2406      * This next "if" is a problem.

  2407      * If we say "start_para_line <= linecnt - 1", that includes

  2408      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2409      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2410      * misses genuine one-line paragraphs.

  2411      */

  2412     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2413       g_utf8_get_char(prevline)>CHAR_SPACE)

  2414     {

  2415 	s=prevline+strlen(prevline);

  2416 	do

  2417 	{

  2418 	    s=g_utf8_prev_char(s);

  2419 	    c=g_utf8_get_char(s);

  2420 	} while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);

  2421 	for (;s>prevline;s=g_utf8_prev_char(s))

  2422 	{

  2423 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2424 	    {

  2425 		if (pswit[ECHO_SWITCH])

  2426 		    g_print("\n%s\n",prevline);

  2427 		if (!pswit[OVERVIEW_SWITCH])

  2428 		    g_print("    Line %ld column %ld - "

  2429 		      "No punctuation at para end?\n",

  2430 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2431 		else

  2432 		    cnt_punct++;

  2433 		break;

  2434 	    }

  2435 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))

  2436 		break;

  2437 	}

  2438     }

  2439 }

  2441 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2442 {

  2443     const char *word=key;

  2444     int *dupcnt=value;

  2445     if (*dupcnt)

  2446 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  2447 	  word,*dupcnt);

  2448     return FALSE;

  2449 }

  2451 void print_as_windows_1252(const char *string)

  2452 {

  2453     gsize inbytes,outbytes;

  2454     gchar *buf,*bp;

  2455     static GIConv converter=(GIConv)-1;

  2456     if (!string)

  2457     {

  2458 	if (converter!=(GIConv)-1)

  2459 	    g_iconv_close(converter);

  2460 	converter=(GIConv)-1;

  2461 	return;

  2462     }

  2463     if (converter==(GIConv)-1)

  2464 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  2465     if (converter!=(GIConv)-1)

  2466     {

  2467 	inbytes=outbytes=strlen(string);

  2468 	bp=buf=g_malloc(outbytes+1);

  2469 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  2470 	*bp='\0';

  2471 	fputs(buf,stdout);

  2472 	g_free(buf);

  2473     }

  2474     else

  2475 	fputs(string,stdout);

  2476 }

  2478 void print_as_utf_8(const char *string)

  2479 {

  2480     fputs(string,stdout);

  2481 }

  2483 /*

  2484  * procfile:

  2485  *

  2486  * Process one file.

  2487  */

  2488 void procfile(const char *filename)

  2489 {

  2490     const char *s;

  2491     gchar *parastart=NULL;	/* first line of current para */

  2492     gchar *etext,*aline;

  2493     gchar *etext_ptr;

  2494     GError *err=NULL;

  2495     struct first_pass_results *first_pass_results;

  2496     struct warnings *warnings;

  2497     struct counters counters={0};

  2498     struct line_properties last={0};

  2499     struct parities parities={0};

  2500     struct pending pending={0};

  2501     gboolean isemptyline;

  2502     long start_para_line=0;

  2503     gboolean isnewpara=FALSE,enddash=FALSE;

  2504     last.start=CHAR_SPACE;

  2505     linecnt=checked_linecnt=0;

  2506     etext=read_etext(filename,&err);

  2507     if (!etext)

  2508     {

  2509 	if (pswit[STDOUT_SWITCH])

  2510 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  2511 	else

  2512 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  2513 	exit(1);

  2514     }

  2515     g_print("\n\nFile: %s\n\n",filename);

  2516     first_pass_results=first_pass(etext);

  2517     warnings=report_first_pass(first_pass_results);

  2518     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  2519     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  2520     /*

  2521      * Here we go with the main pass. Hold onto yer hat!

  2522      */

  2523     linecnt=0;

  2524     etext_ptr=etext;

  2525     if (g_path_is_absolute(filename))

  2526 	g_print("\nPath: %s\n",filename);

  2527     else

  2528     {

  2529 	gchar *cwd,*path;

  2530 	cwd=g_get_current_dir();

  2531 	path=g_build_filename(cwd,filename,NULL);

  2532 	g_free(cwd);

  2533 	g_print("\nPath: %s\n",path);

  2534 	g_free(path);

  2535     }

  2536     g_print("    Line 1 - Path to ebook printed\n");

  2537     while ((aline=flgets(&etext_ptr,linecnt+1)))

  2538     {

  2539 	linecnt++;

  2540 	if (linecnt==1)

  2541 	    isnewpara=TRUE;

  2542 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  2543 	    continue;    // skip DP page separators completely

  2544 	if (linecnt<first_pass_results->firstline ||

  2545 	  (first_pass_results->footerline>0 &&

  2546 	  linecnt>first_pass_results->footerline))

  2547 	{

  2548 	    if (pswit[HEADER_SWITCH])

  2549 	    {

  2550 		if (g_str_has_prefix(aline,"Title:"))

  2551 		    g_print("    %s\n",aline);

  2552 		if (g_str_has_prefix(aline,"Author:"))

  2553 		    g_print("    %s\n",aline);

  2554 		if (g_str_has_prefix(aline,"Release Date:"))

  2555 		    g_print("    %s\n",aline);

  2556 		if (g_str_has_prefix(aline,"Edition:"))

  2557 		    g_print("    %s\n\n",aline);

  2558 	    }

  2559 	    continue;		/* skip through the header */

  2560 	}

  2561 	checked_linecnt++;

  2562 	print_pending(aline,parastart,&pending);

  2563 	isemptyline=analyse_quotes(aline,&counters);

  2564 	if (isnewpara && !isemptyline)

  2565 	{

  2566 	    /* This line is the start of a new paragraph. */

  2567 	    start_para_line=linecnt;

  2568 	    /* Capture its first line in case we want to report it later. */

  2569 	    g_free(parastart);

  2570 	    parastart=g_strdup(aline);

  2571 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2572 	    s=aline;

  2573 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  2574 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  2575 		s=g_utf8_next_char(s);

  2576 	    if (g_unichar_islower(g_utf8_get_char(s)))

  2577 	    {

  2578 		/* and its first letter is lowercase */

  2579 		if (pswit[ECHO_SWITCH])

  2580 		    g_print("\n%s\n",aline);

  2581 		if (!pswit[OVERVIEW_SWITCH])

  2582 		    g_print("    Line %ld column %ld - "

  2583 		      "Paragraph starts with lower-case\n",

  2584 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2585 		else

  2586 		    cnt_punct++;

  2587 	    }

  2588 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  2589 	}

  2590 	/* Check for an em-dash broken at line end. */

  2591 	if (enddash && g_utf8_get_char(aline)=='-')

  2592 	{

  2593 	    if (pswit[ECHO_SWITCH])

  2594 		g_print("\n%s\n",aline);

  2595 	    if (!pswit[OVERVIEW_SWITCH])

  2596 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2597 	    else

  2598 		cnt_punct++;

  2599 	}

  2600 	enddash=FALSE;

  2601 	for (s=g_utf8_prev_char(aline+strlen(aline));

  2602 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  2603 	    ;

  2604 	if (s>=aline && g_utf8_get_char(s)=='-')

  2605 	    enddash=TRUE;

  2606 	check_for_control_characters(aline);

  2607 	if (warnings->bin)

  2608 	    check_for_odd_characters(aline,warnings,isemptyline);

  2609 	if (warnings->longline)

  2610 	    check_for_long_line(aline);

  2611 	if (warnings->shortline)

  2612 	    check_for_short_line(aline,&last);

  2613 	last.blen=last.len;

  2614 	last.len=g_utf8_strlen(aline,-1);

  2615 	last.start=g_utf8_get_char(aline);

  2616 	check_for_starting_punctuation(aline);

  2617 	if (warnings->dash)

  2618 	{

  2619 	    check_for_spaced_emdash(aline);

  2620 	    check_for_spaced_dash(aline);

  2621 	}

  2622 	check_for_unmarked_paragraphs(aline);

  2623 	check_for_jeebies(aline);

  2624 	check_for_mta_from(aline);

  2625 	check_for_orphan_character(aline);

  2626 	check_for_pling_scanno(aline);

  2627 	check_for_extra_period(aline,warnings);

  2628 	check_for_following_punctuation(aline);

  2629 	check_for_typos(aline,warnings);

  2630 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2631 	check_for_double_punctuation(aline,warnings);

  2632 	check_for_spaced_quotes(aline);

  2633 	check_for_miscased_genative(aline);

  2634 	check_end_of_line(aline,warnings);

  2635 	check_for_unspaced_bracket(aline);

  2636 	if (warnings->endquote)

  2637 	    check_for_unpunctuated_endquote(aline);

  2638 	check_for_html_tag(aline);

  2639 	check_for_html_entity(aline);

  2640 	if (isemptyline)

  2641 	{

  2642 	    check_for_mismatched_quotes(&counters,&pending);

  2643 	    counters_reset(&counters);

  2644 	    /* let the next iteration know that it's starting a new para */

  2645 	    isnewpara=TRUE;

  2646 	    if (prevline)

  2647 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  2648 	}

  2649 	g_free(prevline);

  2650 	prevline=g_strdup(aline);

  2651     }

  2652     linecnt++;

  2653     check_for_mismatched_quotes(&counters,&pending);

  2654     print_pending(NULL,parastart,&pending);

  2655     reset_pending(&pending);

  2656     if (prevline)

  2657     {

  2658 	g_free(prevline);

  2659 	prevline=NULL;

  2660     }

  2661     g_free(parastart);

  2662     g_free(prevline);

  2663     g_free(etext);

  2664     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  2665 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  2666     g_tree_unref(qword);

  2667     g_tree_unref(qperiod);

  2668     counters_destroy(&counters);

  2669     g_set_print_handler(NULL);

  2670     print_as_windows_1252(NULL);

  2671     if (pswit[MARKUP_SWITCH])

  2672 	loseentities(NULL);

  2673 }

  2675 /*

  2676  * flgets:

  2677  *

  2678  * Get one line from the input text, checking for

  2679  * the existence of exactly one CR/LF line-end per line.

  2680  *

  2681  * Returns: a pointer to the line.

  2682  */

  2683 char *flgets(char **etext,long lcnt)

  2684 {

  2685     gunichar c;

  2686     gboolean isCR=FALSE;

  2687     char *theline=*etext;

  2688     char *eos=theline;

  2689     gchar *s;

  2690     for (;;)

  2691     {

  2692 	c=g_utf8_get_char(*etext);

  2693 	*etext=g_utf8_next_char(*etext);

  2694 	if (!c)

  2695 	    return NULL;

  2696 	/* either way, it's end of line */

  2697 	if (c=='\n')

  2698 	{

  2699 	    if (isCR)

  2700 		break;

  2701 	    else

  2702 	    {

  2703 		/* Error - a LF without a preceding CR */

  2704 		if (pswit[LINE_END_SWITCH])

  2705 		{

  2706 		    if (pswit[ECHO_SWITCH])

  2707 		    {

  2708 			s=g_strndup(theline,eos-theline);

  2709 			g_print("\n%s\n",s);

  2710 			g_free(s);

  2711 		    }

  2712 		    if (!pswit[OVERVIEW_SWITCH])

  2713 			g_print("    Line %ld - No CR?\n",lcnt);

  2714 		    else

  2715 			cnt_lineend++;

  2716 		}

  2717 		break;

  2718 	    }

  2719 	}

  2720 	if (c=='\r')

  2721 	{

  2722 	    if (isCR)

  2723 	    {

  2724 		/* Error - two successive CRs */

  2725 		if (pswit[LINE_END_SWITCH])

  2726 		{

  2727 		    if (pswit[ECHO_SWITCH])

  2728 		    {

  2729 			s=g_strndup(theline,eos-theline);

  2730 			g_print("\n%s\n",s);

  2731 			g_free(s);

  2732 		    }

  2733 		    if (!pswit[OVERVIEW_SWITCH])

  2734 			g_print("    Line %ld - Two successive CRs?\n",lcnt);

  2735 		    else

  2736 			cnt_lineend++;

  2737 		}

  2738 	    }

  2739 	    isCR=TRUE;

  2740 	}

  2741 	else

  2742 	{

  2743 	    if (pswit[LINE_END_SWITCH] && isCR)

  2744 	    {

  2745 		if (pswit[ECHO_SWITCH])

  2746 		{

  2747 		    s=g_strndup(theline,eos-theline);

  2748 		    g_print("\n%s\n",s);

  2749 		    g_free(s);

  2750 		}

  2751 		if (!pswit[OVERVIEW_SWITCH])

  2752 		    g_print("    Line %ld column %ld - CR without LF?\n",

  2753 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  2754 		else

  2755 		    cnt_lineend++;

  2756 		*eos=' ';

  2757 	    }

  2758 	    isCR=FALSE;

  2759 	    eos=g_utf8_next_char(eos);

  2760 	}

  2761     }

  2762     *eos='\0';

  2763     if (pswit[MARKUP_SWITCH])

  2764 	postprocess_for_HTML(theline);

  2765     if (pswit[DP_SWITCH])

  2766 	postprocess_for_DP(theline);

  2767     return theline;

  2768 }

  2770 /*

  2771  * mixdigit:

  2772  *

  2773  * Takes a "word" as a parameter, and checks whether it

  2774  * contains a mixture of alpha and digits. Generally, this is an

  2775  * error, but may not be for cases like 4th or L5 12s. 3d.

  2776  *

  2777  * Returns: TRUE iff an is error found.

  2778  */

  2779 gboolean mixdigit(const char *checkword)

  2780 {

  2781     gboolean wehaveadigit,wehavealetter,query;

  2782     const char *s,*nondigit;

  2783     wehaveadigit=wehavealetter=query=FALSE;

  2784     for (s=checkword;*s;s=g_utf8_next_char(s))

  2785 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2786 	    wehavealetter=TRUE;

  2787 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  2788 	    wehaveadigit=TRUE;

  2789     if (wehaveadigit && wehavealetter)

  2790     {

  2791 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2792 	query=TRUE;

  2793 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  2794 	  nondigit=g_utf8_next_char(nondigit))

  2795 	    ;

  2796 	/* digits, ending in st, rd, nd, th of either case */

  2797 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  2798 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  2799 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  2800 	  !g_ascii_strcasecmp(nondigit,"th"))

  2801 	    query=FALSE;

  2802 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  2803 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  2804 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  2805 	  !g_ascii_strcasecmp(nondigit,"ths"))

  2806 	    query=FALSE;

  2807 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  2808 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  2809 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  2810 	  !g_ascii_strcasecmp(nondigit,"thly"))

  2811 	    query=FALSE;

  2812 	/* digits, ending in l, L, s or d */

  2813 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  2814 	  !strcmp(nondigit,"d"))

  2815 	    query=FALSE;

  2816 	/*

  2817 	 * L at the start of a number, representing Britsh pounds, like L500.

  2818 	 * This is cute. We know the current word is mixed digit. If the first

  2819 	 * letter is L, there must be at least one digit following. If both

  2820 	 * digits and letters follow, we have a genuine error, else we have a

  2821 	 * capital L followed by digits, and we accept that as a non-error.

  2822 	 */

  2823 	if (g_utf8_get_char(checkword)=='L' &&

  2824 	  !mixdigit(g_utf8_next_char(checkword)))

  2825 	    query=FALSE;

  2826     }

  2827     return query;

  2828 }

  2830 /*

  2831  * getaword:

  2832  *

  2833  * Extracts the first/next "word" from the line, and returns it.

  2834  * A word is defined as one English word unit--or at least that's the aim.

  2835  * "ptr" is advanced to the position in the line where we will start

  2836  * looking for the next word.

  2837  *

  2838  * Returns: A newly-allocated string.

  2839  */

  2840 gchar *getaword(const char **ptr)

  2841 {

  2842     const char *s,*t;

  2843     GString *word;

  2844     gunichar c,pc;

  2845     word=g_string_new(NULL);

  2846     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  2847       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  2848       **ptr;*ptr=g_utf8_next_char(*ptr))

  2849 	;

  2850     /*

  2851      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2852      * Especially yucky is the case of L1,000

  2853      * This section looks for a pattern of characters including a digit

  2854      * followed by a comma or period followed by one or more digits.

  2855      * If found, it returns this whole pattern as a word; otherwise we discard

  2856      * the results and resume our normal programming.

  2857      */

  2858     s=*ptr;

  2859     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  2860       g_unichar_isalpha(g_utf8_get_char(s)) ||

  2861       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  2862 	g_string_append_unichar(word,g_utf8_get_char(s));

  2863     if (word->len)

  2864     {

  2865 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))

  2866 	{

  2867 	    c=g_utf8_get_char(t);

  2868 	    pc=g_utf8_get_char(g_utf8_prev_char(t));

  2869 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  2870 	    {

  2871 		*ptr=s;

  2872 		return g_string_free(word,FALSE);

  2873 	    }

  2874 	}

  2875     }

  2876     /* we didn't find a punctuated number - do the regular getword thing */

  2877     g_string_truncate(word,0);

  2878     c=g_utf8_get_char(*ptr);

  2879     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);

  2880       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))

  2881 	g_string_append_unichar(word,c);

  2882     return g_string_free(word,FALSE);

  2883 }

  2885 /*

  2886  * isroman:

  2887  *

  2888  * Is this word a Roman Numeral?

  2889  *

  2890  * It doesn't actually validate that the number is a valid Roman Numeral--for

  2891  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  2892  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  2893  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  2894  * expressions thereof, except when it came to taxes. Allow any number of M,

  2895  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  2896  * XL or an optional XC, an optional IX or IV, an optional V and any number

  2897  * of optional Is.

  2898  */

  2899 gboolean isroman(const char *t)

  2900 {

  2901     const char *s;

  2902     if (!t || !*t)

  2903 	return FALSE;

  2904     s=t;

  2905     while (g_utf8_get_char(t)=='m' && *t)

  2906 	t++;

  2907     if (g_utf8_get_char(t)=='d')

  2908 	t++;

  2909     if (g_str_has_prefix(t,"cm"))

  2910 	t+=2;

  2911     if (g_str_has_prefix(t,"cd"))

  2912 	t+=2;

  2913     while (g_utf8_get_char(t)=='c' && *t)

  2914 	t++;

  2915     if (g_str_has_prefix(t,"xl"))

  2916 	t+=2;

  2917     if (g_str_has_prefix(t,"xc"))

  2918 	t+=2;

  2919     if (g_utf8_get_char(t)=='l')

  2920 	t++;

  2921     while (g_utf8_get_char(t)=='x' && *t)

  2922 	t++;

  2923     if (g_str_has_prefix(t,"ix"))

  2924 	t+=2;

  2925     if (g_str_has_prefix(t,"iv"))

  2926 	t+=2;

  2927     if (g_utf8_get_char(t)=='v')

  2928 	t++;

  2929     while (g_utf8_get_char(t)=='i' && *t)

  2930 	t++;

  2931     return !*t;

  2932 }

  2934 /*

  2935  * postprocess_for_DP:

  2936  *

  2937  * Invoked with the -d switch from flgets().

  2938  * It simply "removes" from the line a hard-coded set of common

  2939  * DP-specific tags, so that the line passed to the main routine has

  2940  * been pre-cleaned of DP markup.

  2941  */

  2942 void postprocess_for_DP(char *theline)

  2943 {

  2944     char *s,*t;

  2945     int i;

  2946     if (!*theline)

  2947 	return;

  2948     for (i=0;*DPmarkup[i];i++)

  2949 	while ((s=strstr(theline,DPmarkup[i])))

  2950 	{

  2951 	    t=s+strlen(DPmarkup[i]);

  2952 	    memmove(s,t,strlen(t)+1);

  2953 	}

  2954 }

  2956 /*

  2957  * postprocess_for_HTML:

  2958  *

  2959  * Invoked with the -m switch from flgets().

  2960  * It simply "removes" from the line a hard-coded set of common

  2961  * HTML tags and "replaces" a hard-coded set of common HTML

  2962  * entities, so that the line passed to the main routine has

  2963  * been pre-cleaned of HTML.

  2964  */

  2965 void postprocess_for_HTML(char *theline)

  2966 {

  2967     while (losemarkup(theline))

  2968 	;

  2969     loseentities(theline);

  2970 }

  2972 char *losemarkup(char *theline)

  2973 {

  2974     char *s,*t;

  2975     int i;

  2976     s=strchr(theline,'<');

  2977     t=s?strchr(s,'>'):NULL;

  2978     if (!s || !t)

  2979 	return NULL;

  2980     for (i=0;*markup[i];i++)

  2981 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  2982 	{

  2983 	    t=g_utf8_next_char(t);

  2984 	    memmove(s,t,strlen(t)+1);

  2985 	    return s;

  2986 	}

  2987     /* It's an unrecognized <xxx>. */

  2988     return NULL;

  2989 }

  2991 void loseentities(char *theline)

  2992 {

  2993     int i;

  2994     gsize nb;

  2995     char *amp,*scolon;

  2996     gchar *s,*t;

  2997     gunichar c;

  2998     GTree *entities=NULL;

  2999     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3000     if (!theline)

  3001     {

  3002 	if (entities)

  3003 	    g_tree_destroy(entities);

  3004 	entities=NULL;

  3005 	if (translit!=(GIConv)-1)

  3006 	    g_iconv_close(translit);

  3007 	translit=(GIConv)-1;

  3008 	if (to_utf8!=(GIConv)-1)

  3009 	    g_iconv_close(to_utf8);

  3010 	to_utf8=(GIConv)-1;

  3011 	return;

  3012     }

  3013     if (!*theline)

  3014 	return;

  3015     if (!entities)

  3016     {

  3017 	entities=g_tree_new((GCompareFunc)strcmp);

  3018 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3019 	    g_tree_insert(entities,HTMLentities[i].name,

  3020 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3021     }

  3022     if (translit==(GIConv)-1)

  3023 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3024     if (to_utf8==(GIConv)-1)

  3025 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3026     while((amp=strchr(theline,'&')))

  3027     {

  3028 	scolon=strchr(amp,';');

  3029 	if (scolon)

  3030 	{

  3031 	    if (amp[1]=='#')

  3032 	    {

  3033 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3034 		    c=strtol(amp+2,NULL,10);

  3035 		else if (amp[2]=='x' &&

  3036 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3037 		    c=strtol(amp+3,NULL,16);

  3038 	    }

  3039 	    else

  3040 	    {

  3041 		s=g_strndup(amp+1,scolon-(amp+1));

  3042 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3043 		g_free(s);

  3044 	    }

  3045 	}

  3046 	else

  3047 	    c=0;

  3048 	if (c)

  3049 	{

  3050 	    theline=amp;

  3051 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3052 		theline+=g_unichar_to_utf8(c,theline);

  3053 	    else

  3054 	    {

  3055 		s=g_malloc(6);

  3056 		nb=g_unichar_to_utf8(c,s);

  3057 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3058 		g_free(s);

  3059 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3060 		g_free(t);

  3061 		memcpy(theline,s,nb);

  3062 		g_free(s);

  3063 		theline+=nb;

  3064 	    }

  3065 	    memmove(theline,g_utf8_next_char(scolon),

  3066 	      strlen(g_utf8_next_char(scolon))+1);

  3067 	}

  3068 	else

  3069 	    theline=g_utf8_next_char(amp);

  3070     }

  3071 }

  3073 gboolean tagcomp(const char *strin,const char *basetag)

  3074 {

  3075     gboolean retval;

  3076     gchar *s,*t;

  3077     if (g_utf8_get_char(strin)=='/')

  3078 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3079     else

  3080 	t=g_utf8_casefold(strin,-1);

  3081     s=g_utf8_casefold(basetag,-1);

  3082     retval=g_str_has_prefix(t,s);

  3083     g_free(s);

  3084     g_free(t);

  3085     return retval;

  3086 }

  3088 void proghelp(GOptionContext *context)

  3089 {

  3090     gchar *help;

  3091     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3092     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3093     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3094     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3095       "For details, read the file COPYING.\n",stderr);

  3096     fputs("This is Free Software; "

  3097       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3098     fputs("read the file COPYING for details.\n\n",stderr);

  3099     help=g_option_context_get_help(context,TRUE,NULL);

  3100     fputs(help,stderr);

  3101     g_free(help);

  3102     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3103     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3104       "non-ASCII\n",stderr);

  3105     fputs("characters like accented letters, "

  3106       "lines longer than 75 or shorter than 55,\n",stderr);

  3107     fputs("unbalanced quotes or brackets, "

  3108       "a variety of badly formatted punctuation, \n",stderr);

  3109     fputs("HTML tags, some likely typos. "

  3110       "It is NOT a substitute for human judgement.\n",stderr);

  3111     fputs("\n",stderr);

  3112 }

author	ali <ali@juiblex.co.uk>
	Thu Sep 26 08:35:03 2013 +0100 (2013-09-26)
changeset 119	9d3a8ee81151
parent 115	df21841a2b64
child 120	797e80d13543
permissions	-rw-r--r--