bookloupe-testing: bookloupe/bookloupe.c@24b0e5ecffe5

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "bookloupe.h"

    31 #include "counters.h"

    32 #include "pending.h"

    33 #include "HTMLentities.h"

    35 gchar *prevline;

    37 /* Common typos. */

    38 char *typo[] = {

    39     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    40     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    41     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    42     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    43     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    44     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    45     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    46     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    47     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    48     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    49     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    50     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    51     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    52     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    53     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    54     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    55     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    56     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    57     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    58     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    59     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    60     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    61     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    62     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    63     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    64     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    65     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    66     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    67     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    68     "se", ""

    69 };

    71 GTree *usertypo;

    73 /* Common abbreviations and other OK words not to query as typos. */

    74 char *okword[] = {

    75     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    76     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    77     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    78     "outbid", "outbids", "frostbite", "frostbitten", ""

    79 };

    81 /* Common abbreviations that cause otherwise unexplained periods. */

    82 char *abbrev[] = {

    83     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    84     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    85 };

    87 /*

    88  * Two-Letter combinations that rarely if ever start words,

    89  * but are common scannos or otherwise common letter combinations.

    90  */

    91 char *nostart[] = {

    92     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    93 };

    95 /*

    96  * Two-Letter combinations that rarely if ever end words,

    97  * but are common scannos or otherwise common letter combinations.

    98  */

    99 char *noend[] = {

   100     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   101     "sw", "gr", "sl", "cl", "iy", ""

   102 };

   104 char *markup[] = {

   105     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   106     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   107     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   108     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   109 };

   111 char *DPmarkup[] = {

   112     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   113 };

   115 char *nocomma[] = {

   116     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   117     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   118     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   119     "during", "let", "toward", "among", ""

   120 };

   122 char *noperiod[] = {

   123     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   124     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   125     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   126     "among", "those", "into", "whom", "having", "thence", ""

   127 };

   129 gboolean pswit[SWITNO];  /* program switches */

   131 static GOptionEntry options[]={

   132     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   133       "Ignore DP-specific markup", NULL },

   134     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   135       "Don't echo queried line", NULL },

   136     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   137       "Check single quotes", NULL },

   138     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   139       "Check common typos", NULL },

   140     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   141       "Require closure of quotes on every paragraph", NULL },

   142     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   143       "Disable paranoid querying of everything", NULL },

   144     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   145       "Disable line end checking", NULL },

   146     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   147       "Overview: just show counts", NULL },

   148     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   149       "Output errors to stdout instead of stderr", NULL },

   150     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   151       "Echo header fields", NULL },

   152     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   153       "Ignore markup in < >", NULL },

   154     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   155       "Use file of user-defined typos", NULL },

   156     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   157       "Defaults for use on www upload", NULL },

   158     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   159       "Verbose - list everything", NULL },

   160     { NULL }

   161 };

   163 long cnt_dquot;		/* for overview mode, count of doublequote queries */

   164 long cnt_squot;		/* for overview mode, count of singlequote queries */

   165 long cnt_brack;		/* for overview mode, count of brackets queries */

   166 long cnt_bin;		/* for overview mode, count of non-ASCII queries */

   167 long cnt_odd;		/* for overview mode, count of odd character queries */

   168 long cnt_long;		/* for overview mode, count of long line errors */

   169 long cnt_short;		/* for overview mode, count of short line queries */

   170 long cnt_punct;		/* for overview mode,

   171 			   count of punctuation and spacing queries */

   172 long cnt_dash;		/* for overview mode, count of dash-related queries */

   173 long cnt_word;		/* for overview mode, count of word queries */

   174 long cnt_html;		/* for overview mode, count of html queries */

   175 long cnt_lineend;	/* for overview mode, count of line-end queries */

   176 long cnt_spacend;	/* count of lines with space at end */

   177 long linecnt;		/* count of total lines in the file */

   178 long checked_linecnt;	/* count of lines actually checked */

   180 void proghelp(GOptionContext *context);

   181 void procfile(const char *);

   183 gchar *running_from;

   185 gboolean mixdigit(const char *);

   186 gchar *getaword(const char **);

   187 char *flgets(char **,long);

   188 void postprocess_for_HTML(char *);

   189 char *linehasmarkup(char *);

   190 char *losemarkup(char *);

   191 gboolean tagcomp(const char *,const char *);

   192 void loseentities(char *);

   193 gboolean isroman(const char *);

   194 void postprocess_for_DP(char *);

   195 void print_as_windows_1252(const char *string);

   196 void print_as_utf_8(const char *string);

   198 GTree *qword,*qperiod;

   200 #ifdef __WIN32__

   201 UINT saved_cp;

   202 #endif

   204 void parse_options(int *argc,char ***argv)

   205 {

   206     GError *err=NULL;

   207     GOptionContext *context;

   208     context=g_option_context_new(

   209       "file - looks for errors in Project Gutenberg(TM) etexts");

   210     g_option_context_add_main_entries(context,options,NULL);

   211     if (!g_option_context_parse(context,argc,argv,&err))

   212     {

   213 	g_printerr("Bookloupe: %s\n",err->message);

   214 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   215 	exit(1);

   216     }

   217     /* Paranoid checking is turned OFF, not on, by its switch */

   218     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   219     if (pswit[PARANOID_SWITCH])

   220 	/* if running in paranoid mode, typo checks default to enabled */

   221 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   222     /* Line-end checking is turned OFF, not on, by its switch */

   223     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];

   224     /* Echoing is turned OFF, not on, by its switch */

   225     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];

   226     if (pswit[OVERVIEW_SWITCH])

   227 	/* just print summary; don't echo */

   228 	pswit[ECHO_SWITCH]=FALSE;

   229     /*

   230      * Web uploads - for the moment, this is really just a placeholder

   231      * until we decide what processing we really want to do on web uploads

   232      */

   233     if (pswit[WEB_SWITCH])

   234     {

   235 	/* specific override for web uploads */

   236 	pswit[ECHO_SWITCH]=TRUE;

   237 	pswit[SQUOTE_SWITCH]=FALSE;

   238 	pswit[TYPO_SWITCH]=TRUE;

   239 	pswit[QPARA_SWITCH]=FALSE;

   240 	pswit[PARANOID_SWITCH]=TRUE;

   241 	pswit[LINE_END_SWITCH]=FALSE;

   242 	pswit[OVERVIEW_SWITCH]=FALSE;

   243 	pswit[STDOUT_SWITCH]=FALSE;

   244 	pswit[HEADER_SWITCH]=TRUE;

   245 	pswit[VERBOSE_SWITCH]=FALSE;

   246 	pswit[MARKUP_SWITCH]=FALSE;

   247 	pswit[USERTYPO_SWITCH]=FALSE;

   248 	pswit[DP_SWITCH]=FALSE;

   249     }

   250     if (*argc<2)

   251     {

   252 	proghelp(context);

   253 	exit(1);

   254     }

   255     g_option_context_free(context);

   256 }

   258 /*

   259  * read_user_scannos:

   260  *

   261  * Read in the user-defined stealth scanno list.

   262  */

   263 void read_user_scannos(void)

   264 {

   265     GError *err=NULL;

   266     gchar *usertypo_file;

   267     gboolean okay;

   268     int i;

   269     gsize len,nb;

   270     gchar *contents,*utf8,**lines;

   271     usertypo_file=g_strdup("bookloupe.typ");

   272     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   273     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   274     {

   275 	g_clear_error(&err);

   276 	g_free(usertypo_file);

   277 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   278 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   279     }

   280     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   281     {

   282 	g_clear_error(&err);

   283 	g_free(usertypo_file);

   284 	usertypo_file=g_strdup("gutcheck.typ");

   285 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   286     }

   287     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   288     {

   289 	g_clear_error(&err);

   290 	g_free(usertypo_file);

   291 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   292 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   293     }

   294     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   295     {

   296 	g_free(usertypo_file);

   297 	g_print("   --> I couldn't find bookloupe.typ "

   298 	  "-- proceeding without user typos.\n");

   299 	return;

   300     }

   301     else if (!okay)

   302     {

   303 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   304 	g_free(usertypo_file);

   305 	g_clear_error(&err);

   306 	exit(1);

   307     }

   308     if (g_utf8_validate(contents,len,NULL))

   309 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   310     else

   311 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   312     g_free(contents);

   313     lines=g_strsplit_set(utf8,"\r\n",0);

   314     g_free(utf8);

   315     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   316     for (i=0;lines[i];i++)

   317 	if (*(unsigned char *)lines[i]>'!')

   318 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   319 	else

   320 	    g_free(lines[i]);

   321     g_free(lines);

   322 }

   324 /*

   325  * read_etext:

   326  *

   327  * Read an etext returning a newly allocated string containing the file

   328  * contents or NULL on error.

   329  */

   330 gchar *read_etext(const char *filename,GError **err)

   331 {

   332     GError *tmp_err=NULL;

   333     gchar *contents,*utf8;

   334     gsize len,bytes_read,bytes_written;

   335     int i,line,col;

   336     if (!g_file_get_contents(filename,&contents,&len,err))

   337 	return NULL;

   338     if (g_utf8_validate(contents,len,NULL))

   339     {

   340 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   341 	g_set_print_handler(print_as_utf_8);

   342 #ifdef __WIN32__

   343 	SetConsoleOutputCP(CP_UTF8);

   344 #endif

   345     }

   346     else

   347     {

   348 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   349 	  &bytes_written,&tmp_err);

   350 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   351 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   352 	{

   353 	    line=col=1;

   354 	    for(i=0;i<bytes_read;i++)

   355 		if (contents[i]=='\n')

   356 		{

   357 		    line++;

   358 		    col=1;

   359 		}

   360 		else if (contents[i]!='\r')

   361 		    col++;

   362 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   363 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   364 	      "valid Windows-1252 character",

   365 	      ((unsigned char *)contents)[bytes_read],line,col);

   366 	}

   367 	else if (tmp_err)

   368 	    g_propagate_error(err,tmp_err);

   369 	g_set_print_handler(print_as_windows_1252);

   370 #ifdef __WIN32__

   371 	SetConsoleOutputCP(1252);

   372 #endif

   373     }

   374     g_free(contents);

   375     return utf8;

   376 }

   378 void cleanup_on_exit(void)

   379 {

   380 #ifdef __WIN32__

   381     SetConsoleOutputCP(saved_cp);

   382 #endif

   383 }

   385 int main(int argc,char **argv)

   386 {

   387 #ifdef __WIN32__

   388     atexit(cleanup_on_exit);

   389     saved_cp=GetConsoleOutputCP();

   390 #endif

   391     running_from=g_path_get_dirname(argv[0]);

   392     parse_options(&argc,&argv);

   393     if (pswit[USERTYPO_SWITCH])

   394 	read_user_scannos();

   395     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   396     procfile(argv[1]);

   397     if (pswit[OVERVIEW_SWITCH])

   398     {

   399 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   400 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   401 	g_print("    --------------- Queries found --------------\n");

   402 	if (cnt_long)

   403 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   404 	if (cnt_short)

   405 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   406 	if (cnt_lineend)

   407 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   408 	if (cnt_word)

   409 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   410 	if (cnt_dquot)

   411 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);

   412 	if (cnt_squot)

   413 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);

   414 	if (cnt_brack)

   415 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   416 	if (cnt_bin)

   417 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   418 	if (cnt_odd)

   419 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   420 	if (cnt_punct)

   421 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   422 	if (cnt_dash)

   423 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   424 	if (cnt_html)

   425 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   426 	g_print("\n");

   427 	g_print("    TOTAL QUERIES		  %14ld\n",

   428 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   429 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   430     }

   431     g_free(running_from);

   432     if (usertypo)

   433 	g_tree_unref(usertypo);

   434     return 0;

   435 }

   437 /*

   438  * first_pass:

   439  *

   440  * Run a first pass - verify that it's a valid PG

   441  * file, decide whether to report some things that

   442  * occur many times in the text like long or short

   443  * lines, non-standard dashes, etc.

   444  */

   445 struct first_pass_results *first_pass(const char *etext)

   446 {

   447     gunichar laststart=CHAR_SPACE;

   448     const char *s;

   449     gchar *lc_line;

   450     int i,j,lbytes,llen;

   451     gchar **lines;

   452     unsigned int lastlen=0,lastblen=0;

   453     long spline=0,nspline=0;

   454     static struct first_pass_results results={0};

   455     gchar *inword;

   456     lines=g_strsplit(etext,"\n",0);

   457     for (j=0;lines[j];j++)

   458     {

   459 	lbytes=strlen(lines[j]);

   460 	while (lbytes>0 && lines[j][lbytes-1]=='\r')

   461 	    lines[j][--lbytes]='\0';

   462 	llen=g_utf8_strlen(lines[j],lbytes);

   463 	linecnt++;

   464 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&

   465 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))

   466 	{

   467 	    if (spline)

   468 		g_print("   --> Duplicate header?\n");

   469 	    spline=linecnt+1;   /* first line of non-header text, that is */

   470 	}

   471 	if (!strncmp(lines[j],"*** START",9) &&

   472 	  strstr(lines[j],"PROJECT GUTENBERG"))

   473 	{

   474 	    if (nspline)

   475 		g_print("   --> Duplicate header?\n");

   476 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   477 	}

   478 	if (spline || nspline)

   479 	{

   480 	    lc_line=g_utf8_strdown(lines[j],lbytes);

   481 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))

   482 	    {

   483 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))

   484 		{

   485 		    if (results.footerline)

   486 		    {

   487 			/* it's an old-form header - we can detect duplicates */

   488 			if (!nspline)

   489 			    g_print("   --> Duplicate footer?\n");

   490 		    }

   491 		    else

   492 			results.footerline=linecnt;

   493 		}

   494 	    }

   495 	    g_free(lc_line);

   496 	}

   497 	if (spline)

   498 	    results.firstline=spline;

   499 	if (nspline)

   500 	    results.firstline=nspline;  /* override with new */

   501 	if (results.footerline)

   502 	    continue;    /* don't count the boilerplate in the footer */

   503 	results.totlen+=llen;

   504 	for (s=lines[j];*s;s=g_utf8_next_char(s))

   505 	{

   506 	    if (g_utf8_get_char(s)>127)

   507 		results.binlen++;

   508 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

   509 		results.alphalen++;

   510 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&

   511 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))

   512 		results.endquote_count++;

   513 	}

   514 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&

   515 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   516 	    results.shortline++;

   517 	if (lbytes>0 &&

   518 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)

   519 	    cnt_spacend++;

   520 	if (strstr(lines[j],".,"))

   521 	    results.dotcomma++;

   522 	/* only count ast lines for ignoring purposes where there is */

   523 	/* locase text on the line */

   524 	if (strchr(lines[j],'*'))

   525 	{

   526 	    for (s=lines[j];*s;s=g_utf8_next_char(s))

   527 		if (g_unichar_islower(g_utf8_get_char(s)))

   528 		    break;

   529 	    if (*s)

   530 		results.astline++;

   531 	}

   532 	if (strchr(lines[j],'/'))

   533 	    results.fslashline++;

   534 	if (lbytes>0)

   535 	{

   536 	    for (s=g_utf8_prev_char(lines[j]+lbytes);

   537 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;

   538 	      s=g_utf8_prev_char(s))

   539 		;

   540 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&

   541 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

   542 		results.hyphens++;

   543 	}

   544 	if (llen>LONGEST_PG_LINE)

   545 	    results.longline++;

   546 	if (llen>WAY_TOO_LONG)

   547 	    results.verylongline++;

   548 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))

   549 	{

   550 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);

   551 	    if (i>0)

   552 		results.htmcount++;

   553 	    if (strstr(lines[j],"<i>"))

   554 		results.htmcount+=4; /* bonus marks! */

   555 	}

   556 	/* Check for spaced em-dashes */

   557 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))

   558 	{

   559 	    results.emdash++;

   560 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)

   561 		results.space_emdash++;

   562 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)

   563 		/* count of em-dashes with spaces both sides */

   564 		results.non_PG_space_emdash++;

   565 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)

   566 		/* count of PG-type em-dashes with no spaces */

   567 		results.PG_space_emdash++;

   568 	}

   569 	for (s=lines[j];*s;)

   570 	{

   571 	    inword=getaword(&s);

   572 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   573 		results.Dutchcount++;

   574 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   575 		results.Frenchcount++;

   576 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   577 		results.standalone_digit++;

   578 	    g_free(inword);

   579 	}

   580 	/* Check for spaced dashes */

   581 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')

   582 	    results.spacedash++;

   583 	lastblen=lastlen;

   584 	lastlen=llen;

   585 	laststart=lines[j][0];

   586     }

   587     g_strfreev(lines);

   588     return &results;

   589 }

   591 /*

   592  * report_first_pass:

   593  *

   594  * Make some snap decisions based on the first pass results.

   595  */

   596 struct warnings *report_first_pass(struct first_pass_results *results)

   597 {

   598     static struct warnings warnings={0};

   599     if (cnt_spacend>0)

   600 	g_print("   --> %ld lines in this file have white space at end\n",

   601 	  cnt_spacend);

   602     warnings.dotcomma=1;

   603     if (results->dotcomma>5)

   604     {

   605 	warnings.dotcomma=0;

   606 	g_print("   --> %ld lines in this file contain '.,'. "

   607 	  "Not reporting them.\n",results->dotcomma);

   608     }

   609     /*

   610      * If more than 50 lines, or one-tenth, are short,

   611      * don't bother reporting them.

   612      */

   613     warnings.shortline=1;

   614     if (results->shortline>50 || results->shortline*10>linecnt)

   615     {

   616 	warnings.shortline=0;

   617 	g_print("   --> %ld lines in this file are short. "

   618 	  "Not reporting short lines.\n",results->shortline);

   619     }

   620     /*

   621      * If more than 50 lines, or one-tenth, are long,

   622      * don't bother reporting them.

   623      */

   624     warnings.longline=1;

   625     if (results->longline>50 || results->longline*10>linecnt)

   626     {

   627 	warnings.longline=0;

   628 	g_print("   --> %ld lines in this file are long. "

   629 	  "Not reporting long lines.\n",results->longline);

   630     }

   631     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   632     warnings.ast=1;

   633     if (results->astline>10)

   634     {

   635 	warnings.ast=0;

   636 	g_print("   --> %ld lines in this file contain asterisks. "

   637 	  "Not reporting them.\n",results->astline);

   638     }

   639     /*

   640      * If more than 10 lines contain forward slashes,

   641      * don't bother reporting them.

   642      */

   643     warnings.fslash=1;

   644     if (results->fslashline>10)

   645     {

   646 	warnings.fslash=0;

   647 	g_print("   --> %ld lines in this file contain forward slashes. "

   648 	  "Not reporting them.\n",results->fslashline);

   649     }

   650     /*

   651      * If more than 20 lines contain unpunctuated endquotes,

   652      * don't bother reporting them.

   653      */

   654     warnings.endquote=1;

   655     if (results->endquote_count>20)

   656     {

   657 	warnings.endquote=0;

   658 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

   659 	  "Not reporting them.\n",results->endquote_count);

   660     }

   661     /*

   662      * If more than 15 lines contain standalone digits,

   663      * don't bother reporting them.

   664      */

   665     warnings.digit=1;

   666     if (results->standalone_digit>10)

   667     {

   668 	warnings.digit=0;

   669 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

   670 	  "Not reporting them.\n",results->standalone_digit);

   671     }

   672     /*

   673      * If more than 20 lines contain hyphens at end,

   674      * don't bother reporting them.

   675      */

   676     warnings.hyphen=1;

   677     if (results->hyphens>20)

   678     {

   679 	warnings.hyphen=0;

   680 	g_print("   --> %ld lines in this file have hyphens at end. "

   681 	  "Not reporting them.\n",results->hyphens);

   682     }

   683     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   684     {

   685 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   686 	pswit[MARKUP_SWITCH]=1;

   687     }

   688     if (results->verylongline>0)

   689 	g_print("   --> %ld lines in this file are VERY long!\n",

   690 	  results->verylongline);

   691     /*

   692      * If there are more non-PG spaced dashes than PG em-dashes,

   693      * assume it's deliberate.

   694      * Current PG guidelines say don't use them, but older texts do,

   695      * and some people insist on them whatever the guidelines say.

   696      */

   697     warnings.dash=1;

   698     if (results->spacedash+results->non_PG_space_emdash>

   699       results->PG_space_emdash)

   700     {

   701 	warnings.dash=0;

   702 	g_print("   --> There are %ld spaced dashes and em-dashes. "

   703 	  "Not reporting them.\n",

   704 	  results->spacedash+results->non_PG_space_emdash);

   705     }

   706     /* If more than a quarter of characters are hi-bit, bug out. */

   707     warnings.bin=1;

   708     if (results->binlen*4>results->totlen)

   709     {

   710 	g_print("   --> This file does not appear to be ASCII. "

   711 	  "Terminating. Best of luck with it!\n");

   712 	exit(1);

   713     }

   714     if (results->alphalen*4<results->totlen)

   715     {

   716 	g_print("   --> This file does not appear to be text. "

   717 	  "Terminating. Best of luck with it!\n");

   718 	exit(1);

   719     }

   720     if (results->binlen*100>results->totlen || results->binlen>100)

   721     {

   722 	g_print("   --> There are a lot of foreign letters here. "

   723 	  "Not reporting them.\n");

   724 	warnings.bin=0;

   725     }

   726     warnings.isDutch=FALSE;

   727     if (results->Dutchcount>50)

   728     {

   729 	warnings.isDutch=TRUE;

   730 	g_print("   --> This looks like Dutch - "

   731 	  "switching off dashes and warnings for 's Middags case.\n");

   732     }

   733     warnings.isFrench=FALSE;

   734     if (results->Frenchcount>50)

   735     {

   736 	warnings.isFrench=TRUE;

   737 	g_print("   --> This looks like French - "

   738 	  "switching off some doublepunct.\n");

   739     }

   740     if (results->firstline && results->footerline)

   741 	g_print("    The PG header and footer appear to be already on.\n");

   742     else

   743     {

   744 	if (results->firstline)

   745 	    g_print("    The PG header is on - no footer.\n");

   746 	if (results->footerline)

   747 	    g_print("    The PG footer is on - no header.\n");

   748     }

   749     g_print("\n");

   750     if (pswit[VERBOSE_SWITCH])

   751     {

   752 	warnings.bin=1;

   753 	warnings.shortline=1;

   754 	warnings.dotcomma=1;

   755 	warnings.longline=1;

   756 	warnings.dash=1;

   757 	warnings.digit=1;

   758 	warnings.ast=1;

   759 	warnings.fslash=1;

   760 	warnings.hyphen=1;

   761 	warnings.endquote=1;

   762 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

   763     }

   764     if (warnings.isDutch)

   765 	warnings.dash=0;

   766     if (results->footerline>0 && results->firstline>0 &&

   767       results->footerline>results->firstline &&

   768       results->footerline-results->firstline<100)

   769     {

   770 	g_print("   --> I don't really know where this text starts. \n");

   771 	g_print("       There are no reference points.\n");

   772 	g_print("       I'm going to have to report the header and footer "

   773 	  "as well.\n");

   774 	results->firstline=0;

   775     }

   776     return &warnings;

   777 }

   779 /*

   780  * analyse_quotes:

   781  *

   782  * Look along the line, accumulate the count of quotes, and see

   783  * if this is an empty line - i.e. a line with nothing on it

   784  * but spaces.

   785  * If line has just spaces, period, * and/or - on it, don't

   786  * count it, since empty lines with asterisks or dashes to

   787  * separate sections are common.

   788  *

   789  * Returns: TRUE if the line is empty.

   790  */

   791 gboolean analyse_quotes(const char *aline,struct counters *counters)

   792 {

   793     int guessquote=0;

   794     /* assume the line is empty until proven otherwise */

   795     gboolean isemptyline=TRUE;

   796     const char *s=aline,*sprev,*snext;

   797     gunichar c;

   798     sprev=NULL;

   799     while (*s)

   800     {

   801 	snext=g_utf8_next_char(s);

   802 	c=g_utf8_get_char(s);

   803 	if (c==CHAR_DQUOTE)

   804 	    increment_matching(counters,c,!matching_difference(counters,c));

   805 	else if (CHAR_IS_DQUOTE(c))

   806 	    increment_matching(counters,c,!CHAR_IS_CLOSING_QUOTE(c));

   807 	else if (CHAR_IS_SQUOTE(c))

   808 	{

   809 	    if (s==aline)

   810 	    {

   811 		/*

   812 		 * At start of line, it can only be an openquote.

   813 		 * Hardcode a very common exception!

   814 		 */

   815 		if (!g_str_has_prefix(snext,"tis") &&

   816 		  !g_str_has_prefix(snext,"Tis"))

   817 		    increment_matching(counters,c,TRUE);

   818 	    }

   819 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

   820 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   821 		/* Do nothing! it's definitely an apostrophe, not a quote */

   822 		;

   823 	    /* it's outside a word - let's check it out */

   824 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||

   825 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   826 	    {

   827 		/* it damwell better BE an openquote */

   828 		if (!g_str_has_prefix(snext,"tis") &&

   829 		  !g_str_has_prefix(snext,"Tis"))

   830 		    /* hardcode a very common exception! */

   831 		    increment_matching(counters,c,TRUE);

   832 	    }

   833 	    else

   834 	    {

   835 		/* now - is it a closequote? */

   836 		guessquote=0;   /* accumulate clues */

   837 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

   838 		{

   839 		    /* it follows a letter - could be either */

   840 		    guessquote++;

   841 		    if (g_utf8_get_char(sprev)=='s')

   842 		    {

   843 			/* looks like a plural apostrophe */

   844 			guessquote-=3;

   845 			if (g_utf8_get_char(snext)==CHAR_SPACE)

   846 			    /* bonus marks! */

   847 			    guessquote-=2;

   848 		    }

   849 		}

   850 		/* it doesn't have a letter either side */

   851 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&

   852 		  strchr(".?!,;: ",g_utf8_get_char(snext)))

   853 		    guessquote+=8; /* looks like a closequote */

   854 		else

   855 		    guessquote++;

   856 		if (matching_difference(counters,CHAR_SQUOTE)>0)

   857 		    /*

   858 		     * Give it the benefit of some doubt,

   859 		     * if a squote is already open.

   860 		     */

   861 		    guessquote++;

   862 		else

   863 		    guessquote--;

   864 		if (guessquote>=0)

   865 		    increment_matching(counters,c,FALSE);

   866 	    }

   867 	}

   868 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

   869 	  c!='\r' && c!='\n')

   870 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

   871 	if (c==CHAR_UNDERSCORE)

   872 	    counters->c_unders++;

   873 	if (c==CHAR_OPEN_SBRACK)

   874 	{

   875 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&

   876 	      !matching_difference(counters,c) && s==aline &&

   877 	      g_str_has_prefix(s,"[Illustration:"))

   878 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);

   879 	    else

   880 		increment_matching(counters,c,TRUE);

   881 	}

   882 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)

   883 	    increment_matching(counters,c,TRUE);

   884 	if (c==CHAR_CLOSE_SBRACK)

   885 	{

   886 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&

   887 	      !matching_difference(counters,c) && !*snext)

   888 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);

   889 	    else

   890 		increment_matching(counters,c,FALSE);

   891 	}

   892 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)

   893 	    increment_matching(counters,c,FALSE);

   894 	sprev=s;

   895 	s=snext;

   896     }

   897     return isemptyline;

   898 }

   900 /*

   901  * check_for_control_characters:

   902  *

   903  * Check for invalid or questionable characters in the line

   904  * Anything above 127 is invalid for plain ASCII, and

   905  * non-printable control characters should also be flagged.

   906  * Tabs should generally not be there.

   907  */

   908 void check_for_control_characters(const char *aline)

   909 {

   910     gunichar c;

   911     const char *s;

   912     for (s=aline;*s;s=g_utf8_next_char(s))

   913     {

   914 	c=g_utf8_get_char(s);

   915 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

   916 	{

   917 	    if (pswit[ECHO_SWITCH])

   918 		g_print("\n%s\n",aline);

   919 	    if (!pswit[OVERVIEW_SWITCH])

   920 		g_print("    Line %ld column %ld - Control character %u\n",

   921 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

   922 	    else

   923 		cnt_bin++;

   924 	}

   925     }

   926 }

   928 /*

   929  * check_for_odd_characters:

   930  *

   931  * Check for binary and other odd characters.

   932  */

   933 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

   934   gboolean isemptyline)

   935 {

   936     /* Don't repeat multiple warnings on one line. */

   937     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;

   938     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

   939     const char *s;

   940     gunichar c;

   941     for (s=aline;*s;s=g_utf8_next_char(s))

   942     {

   943 	c=g_utf8_get_char(s);

   944 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

   945 	{

   946 	    if (pswit[ECHO_SWITCH])

   947 		g_print("\n%s\n",aline);

   948 	    if (!pswit[OVERVIEW_SWITCH])

   949 		if (c>127 && c<160 || c>255)

   950 		    g_print("    Line %ld column %ld - "

   951 		      "Non-ISO-8859 character %u\n",

   952 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

   953 		else

   954 		    g_print("    Line %ld column %ld - "

   955 		      "Non-ASCII character %u\n",

   956 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

   957 	    else

   958 		cnt_bin++;

   959 	    eNon_A=TRUE;

   960 	}

   961 	if (!eTab && c==CHAR_TAB)

   962 	{

   963 	    if (pswit[ECHO_SWITCH])

   964 		g_print("\n%s\n",aline);

   965 	    if (!pswit[OVERVIEW_SWITCH])

   966 		g_print("    Line %ld column %ld - Tab character?\n",

   967 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

   968 	    else

   969 		cnt_odd++;

   970 	    eTab=TRUE;

   971 	}

   972 	if (!eTilde && c==CHAR_TILDE)

   973 	{

   974 	    /*

   975 	     * Often used by OCR software to indicate an

   976 	     * unrecognizable character.

   977 	     */

   978 	    if (pswit[ECHO_SWITCH])

   979 		g_print("\n%s\n",aline);

   980 	    if (!pswit[OVERVIEW_SWITCH])

   981 		g_print("    Line %ld column %ld - Tilde character?\n",

   982 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

   983 	    else

   984 		cnt_odd++;

   985 	    eTilde=TRUE;

   986 	}

   987 	if (!eCarat && c==CHAR_CARAT)

   988 	{

   989 	    if (pswit[ECHO_SWITCH])

   990 		g_print("\n%s\n",aline);

   991 	    if (!pswit[OVERVIEW_SWITCH])

   992 		g_print("    Line %ld column %ld - Carat character?\n",

   993 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

   994 	    else

   995 		cnt_odd++;

   996 	    eCarat=TRUE;

   997 	}

   998 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

   999 	{

  1000 	    if (pswit[ECHO_SWITCH])

  1001 		g_print("\n%s\n",aline);

  1002 	    if (!pswit[OVERVIEW_SWITCH])

  1003 		g_print("    Line %ld column %ld - Forward slash?\n",

  1004 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1005 	    else

  1006 		cnt_odd++;

  1007 	    eFSlash=TRUE;

  1008 	}

  1009 	/*

  1010 	 * Report asterisks only in paranoid mode,

  1011 	 * since they're often deliberate.

  1012 	 */

  1013 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1014 	  c==CHAR_ASTERISK)

  1015 	{

  1016 	    if (pswit[ECHO_SWITCH])

  1017 		g_print("\n%s\n",aline);

  1018 	    if (!pswit[OVERVIEW_SWITCH])

  1019 		g_print("    Line %ld column %ld - Asterisk?\n",

  1020 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1021 	    else

  1022 		cnt_odd++;

  1023 	    eAst=TRUE;

  1024 	}

  1025     }

  1026 }

  1028 /*

  1029  * check_for_long_line:

  1030  *

  1031  * Check for line too long.

  1032  */

  1033 void check_for_long_line(const char *aline)

  1034 {

  1035     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1036     {

  1037 	if (pswit[ECHO_SWITCH])

  1038 	    g_print("\n%s\n",aline);

  1039 	if (!pswit[OVERVIEW_SWITCH])

  1040 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1041 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1042 	else

  1043 	    cnt_long++;

  1044     }

  1045 }

  1047 /*

  1048  * check_for_short_line:

  1049  *

  1050  * Check for line too short.

  1051  *

  1052  * This one is a bit trickier to implement: we don't want to

  1053  * flag the last line of a paragraph for being short, so we

  1054  * have to wait until we know that our current line is a

  1055  * "normal" line, then report the _previous_ line if it was too

  1056  * short. We also don't want to report indented lines like

  1057  * chapter heads or formatted quotations. We therefore keep

  1058  * last->len as the length of the last line examined, and

  1059  * last->blen as the length of the last but one, and try to

  1060  * suppress unnecessary warnings by checking that both were of

  1061  * "normal" length. We keep the first character of the last

  1062  * line in last->start, and if it was a space, we assume that

  1063  * the formatting is deliberate. I can't figure out a way to

  1064  * distinguish something like a quoted verse left-aligned or

  1065  * the header or footer of a letter from a paragraph of short

  1066  * lines - maybe if I examined the whole paragraph, and if the

  1067  * para has less than, say, 8 lines and if all lines are short,

  1068  * then just assume it's OK? Need to look at some texts to see

  1069  * how often a formula like this would get the right result.

  1070  */

  1071 void check_for_short_line(const char *aline,const struct line_properties *last)

  1072 {

  1073     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1074       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1075       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1076     {

  1077 	if (pswit[ECHO_SWITCH])

  1078 	    g_print("\n%s\n",prevline);

  1079 	if (!pswit[OVERVIEW_SWITCH])

  1080 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1081 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1082 	else

  1083 	    cnt_short++;

  1084     }

  1085 }

  1087 /*

  1088  * check_for_starting_punctuation:

  1089  *

  1090  * Look for punctuation other than full ellipses at start of line.

  1091  */

  1092 void check_for_starting_punctuation(const char *aline)

  1093 {

  1094     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1095       !g_str_has_prefix(aline,". . ."))

  1096     {

  1097 	if (pswit[ECHO_SWITCH])

  1098 	    g_print("\n%s\n",aline);

  1099 	if (!pswit[OVERVIEW_SWITCH])

  1100 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1101 	      linecnt);

  1102 	else

  1103 	    cnt_punct++;

  1104     }

  1105 }

  1107 /*

  1108  * check_for_spaced_emdash:

  1109  *

  1110  * Check for spaced em-dashes.

  1111  *

  1112  * We must check _all_ occurrences of "--" on the line

  1113  * hence the loop - even if the first double-dash is OK

  1114  * there may be another that's wrong later on.

  1115  */

  1116 void check_for_spaced_emdash(const char *aline)

  1117 {

  1118     const char *s,*t,*next;

  1119     for (s=aline;t=strstr(s,"--");s=next)

  1120     {

  1121 	next=g_utf8_next_char(g_utf8_next_char(t));

  1122 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1123 	  g_utf8_get_char(next)==CHAR_SPACE)

  1124 	{

  1125 	    if (pswit[ECHO_SWITCH])

  1126 		g_print("\n%s\n",aline);

  1127 	    if (!pswit[OVERVIEW_SWITCH])

  1128 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1129 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1130 	    else

  1131 		cnt_dash++;

  1132 	}

  1133     }

  1134 }

  1136 /*

  1137  * check_for_spaced_dash:

  1138  *

  1139  * Check for spaced dashes.

  1140  */

  1141 void check_for_spaced_dash(const char *aline)

  1142 {

  1143     const char *s;

  1144     if ((s=strstr(aline," -")))

  1145     {

  1146 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1147 	{

  1148 	    if (pswit[ECHO_SWITCH])

  1149 		g_print("\n%s\n",aline);

  1150 	    if (!pswit[OVERVIEW_SWITCH])

  1151 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1152 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1153 	    else

  1154 		cnt_dash++;

  1155 	}

  1156     }

  1157     else if ((s=strstr(aline,"- ")))

  1158     {

  1159 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1160 	{

  1161 	    if (pswit[ECHO_SWITCH])

  1162 		g_print("\n%s\n",aline);

  1163 	    if (!pswit[OVERVIEW_SWITCH])

  1164 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1165 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1166 	    else

  1167 		cnt_dash++;

  1168 	}

  1169     }

  1170 }

  1172 /*

  1173  * check_for_unmarked_paragraphs:

  1174  *

  1175  * Check for unmarked paragraphs indicated by separate speakers.

  1176  *

  1177  * May well be false positive:

  1178  * "Bravo!" "Wonderful!" called the crowd.

  1179  * but useful all the same.

  1180  */

  1181 void check_for_unmarked_paragraphs(const char *aline)

  1182 {

  1183     const char *s;

  1184     s=strstr(aline,"\"  \"");

  1185     if (!s)

  1186 	s=strstr(aline,"\" \"");

  1187     if (s)

  1188     {

  1189 	if (pswit[ECHO_SWITCH])

  1190 	    g_print("\n%s\n",aline);

  1191 	if (!pswit[OVERVIEW_SWITCH])

  1192 	    g_print("    Line %ld column %ld - "

  1193 	      "Query missing paragraph break?\n",

  1194 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1195 	else

  1196 	    cnt_punct++;

  1197     }

  1198 }

  1200 /*

  1201  * check_for_jeebies:

  1202  *

  1203  * Check for "to he" and other easy h/b errors.

  1204  *

  1205  * This is a very inadequate effort on the h/b problem,

  1206  * but the phrase "to he" is always an error, whereas "to

  1207  * be" is quite common.

  1208  * Similarly, '"Quiet!", be said.' is a non-be error

  1209  * "to he" is _not_ always an error!:

  1210  *       "Where they went to he couldn't say."

  1211  * Another false positive:

  1212  *       What would "Cinderella" be without the . . .

  1213  * and another: "If he wants to he can see for himself."

  1214  */

  1215 void check_for_jeebies(const char *aline)

  1216 {

  1217     const char *s;

  1218     s=strstr(aline," be could ");

  1219     if (!s)

  1220 	s=strstr(aline," be would ");

  1221     if (!s)

  1222 	s=strstr(aline," was be ");

  1223     if (!s)

  1224 	s=strstr(aline," be is ");

  1225     if (!s)

  1226 	s=strstr(aline," is be ");

  1227     if (!s)

  1228 	s=strstr(aline,"\", be ");

  1229     if (!s)

  1230 	s=strstr(aline,"\" be ");

  1231     if (!s)

  1232 	s=strstr(aline,"\" be ");

  1233     if (!s)

  1234 	s=strstr(aline," to he ");

  1235     if (s)

  1236     {

  1237 	if (pswit[ECHO_SWITCH])

  1238 	    g_print("\n%s\n",aline);

  1239 	if (!pswit[OVERVIEW_SWITCH])

  1240 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1241 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1242 	else

  1243 	    cnt_word++;

  1244     }

  1245     s=strstr(aline," the had ");

  1246     if (!s)

  1247 	s=strstr(aline," a had ");

  1248     if (!s)

  1249 	s=strstr(aline," they bad ");

  1250     if (!s)

  1251 	s=strstr(aline," she bad ");

  1252     if (!s)

  1253 	s=strstr(aline," he bad ");

  1254     if (!s)

  1255 	s=strstr(aline," you bad ");

  1256     if (!s)

  1257 	s=strstr(aline," i bad ");

  1258     if (s)

  1259     {

  1260 	if (pswit[ECHO_SWITCH])

  1261 	    g_print("\n%s\n",aline);

  1262 	if (!pswit[OVERVIEW_SWITCH])

  1263 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1264 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1265 	else

  1266 	    cnt_word++;

  1267     }

  1268     s=strstr(aline,"; hut ");

  1269     if (!s)

  1270 	s=strstr(aline,", hut ");

  1271     if (s)

  1272     {

  1273 	if (pswit[ECHO_SWITCH])

  1274 	    g_print("\n%s\n",aline);

  1275 	if (!pswit[OVERVIEW_SWITCH])

  1276 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1277 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1278 	else

  1279 	    cnt_word++;

  1280     }

  1281 }

  1283 /*

  1284  * check_for_mta_from:

  1285  *

  1286  * Special case - angled bracket in front of "From" placed there by an

  1287  * MTA when sending an e-mail.

  1288  */

  1289 void check_for_mta_from(const char *aline)

  1290 {

  1291     const char *s;

  1292     s=strstr(aline,">From");

  1293     if (s)

  1294     {

  1295 	if (pswit[ECHO_SWITCH])

  1296 	    g_print("\n%s\n",aline);

  1297 	if (!pswit[OVERVIEW_SWITCH])

  1298 	    g_print("    Line %ld column %ld - "

  1299 	      "Query angled bracket with From\n",

  1300 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1301 	else

  1302 	    cnt_punct++;

  1303     }

  1304 }

  1306 /*

  1307  * check_for_orphan_character:

  1308  *

  1309  * Check for a single character line -

  1310  * often an overflow from bad wrapping.

  1311  */

  1312 void check_for_orphan_character(const char *aline)

  1313 {

  1314     gunichar c;

  1315     c=g_utf8_get_char(aline);

  1316     if (c && !*g_utf8_next_char(aline))

  1317     {

  1318 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1319 	    ; /* Nothing - ignore numerals alone on a line. */

  1320 	else

  1321 	{

  1322 	    if (pswit[ECHO_SWITCH])

  1323 		g_print("\n%s\n",aline);

  1324 	    if (!pswit[OVERVIEW_SWITCH])

  1325 		g_print("    Line %ld column 1 - Query single character line\n",

  1326 		  linecnt);

  1327 	    else

  1328 		cnt_punct++;

  1329 	}

  1330     }

  1331 }

  1333 /*

  1334  * check_for_pling_scanno:

  1335  *

  1336  * Check for I" - often should be !

  1337  */

  1338 void check_for_pling_scanno(const char *aline)

  1339 {

  1340     const char *s;

  1341     s=strstr(aline," I\"");

  1342     if (s)

  1343     {

  1344 	if (pswit[ECHO_SWITCH])

  1345 	    g_print("\n%s\n",aline);

  1346 	if (!pswit[OVERVIEW_SWITCH])

  1347 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1348 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1349 	else

  1350 	    cnt_punct++;

  1351     }

  1352 }

  1354 /*

  1355  * check_for_extra_period:

  1356  *

  1357  * Check for period without a capital letter. Cut-down from gutspell.

  1358  * Only works when it happens on a single line.

  1359  */

  1360 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1361 {

  1362     const char *s,*t,*s1,*sprev;

  1363     int i;

  1364     gsize len;

  1365     gboolean istypo;

  1366     gchar *testword;

  1367     gunichar c,nc,pc,*decomposition;

  1368     if (pswit[PARANOID_SWITCH])

  1369     {

  1370 	for (t=aline;t=strstr(t,". ");)

  1371 	{

  1372 	    if (t==aline)

  1373 	    {

  1374 		t=g_utf8_next_char(t);

  1375 		/* start of line punctuation is handled elsewhere */

  1376 		continue;

  1377 	    }

  1378 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1379 	    {

  1380 		t=g_utf8_next_char(t);

  1381 		continue;

  1382 	    }

  1383 	    if (warnings->isDutch)

  1384 	    {

  1385 		/* For Frank & Jeroen -- 's Middags case */

  1386 		gunichar c2,c3,c4,c5;

  1387 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1388 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1389 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1390 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1391 		if (CHAR_IS_APOSTROPHE(c2) &&

  1392 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&

  1393 		  g_unichar_isupper(c5))

  1394 		{

  1395 		    t=g_utf8_next_char(t);

  1396 		    continue;

  1397 		}

  1398 	    }

  1399 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1400 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1401 	      !isdigit(g_utf8_get_char(s1)))

  1402 		s1=g_utf8_next_char(s1);

  1403 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1404 	    {

  1405 		/* we have something to investigate */

  1406 		istypo=TRUE;

  1407 		/* so let's go back and find out */

  1408 		nc=g_utf8_get_char(t);

  1409 		s1=g_utf8_prev_char(t);

  1410 		c=g_utf8_get_char(s1);

  1411 		sprev=g_utf8_prev_char(s1);

  1412 		pc=g_utf8_get_char(sprev);

  1413 		while (s1>=aline &&

  1414 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||

  1415 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&

  1416 		  g_unichar_isalpha(nc)))

  1417 		{

  1418 		    nc=c;

  1419 		    s1=sprev;

  1420 		    c=pc;

  1421 		    sprev=g_utf8_prev_char(s1);

  1422 		    pc=g_utf8_get_char(sprev);

  1423 		}

  1424 		s1=g_utf8_next_char(s1);

  1425 		s=strchr(s1,'.');

  1426 		if (s)

  1427 		    testword=g_strndup(s1,s-s1);

  1428 		else

  1429 		    testword=g_strdup(s1);

  1430 		for (i=0;*abbrev[i];i++)

  1431 		    if (!strcmp(testword,abbrev[i]))

  1432 			istypo=FALSE;

  1433 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1434 		    istypo=FALSE;

  1435 		if (!*g_utf8_next_char(testword))

  1436 		    istypo=FALSE;

  1437 		if (isroman(testword))

  1438 		    istypo=FALSE;

  1439 		if (istypo)

  1440 		{

  1441 		    istypo=FALSE;

  1442 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1443 		    {

  1444 			decomposition=g_unicode_canonical_decomposition(

  1445 			  g_utf8_get_char(s),&len);

  1446 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1447 			    istypo=TRUE;

  1448 			g_free(decomposition);

  1449 		    }

  1450 		}

  1451 		if (istypo &&

  1452 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1453 		{

  1454 		    g_tree_insert(qperiod,g_strdup(testword),

  1455 		      GINT_TO_POINTER(1));

  1456 		    if (pswit[ECHO_SWITCH])

  1457 			g_print("\n%s\n",aline);

  1458 		    if (!pswit[OVERVIEW_SWITCH])

  1459 			g_print("    Line %ld column %ld - Extra period?\n",

  1460 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1461 		    else

  1462 			cnt_punct++;

  1463 		}

  1464 		g_free(testword);

  1465 	    }

  1466 	    t=g_utf8_next_char(t);

  1467 	}

  1468     }

  1469 }

  1471 /*

  1472  * check_for_following_punctuation:

  1473  *

  1474  * Check for words usually not followed by punctuation.

  1475  */

  1476 void check_for_following_punctuation(const char *aline)

  1477 {

  1478     int i;

  1479     const char *s,*wordstart;

  1480     gunichar c;

  1481     gchar *inword,*t;

  1482     if (pswit[TYPO_SWITCH])

  1483     {

  1484 	for (s=aline;*s;)

  1485 	{

  1486 	    wordstart=s;

  1487 	    t=getaword(&s);

  1488 	    if (!*t)

  1489 	    {

  1490 		g_free(t);

  1491 		continue;

  1492 	    }

  1493 	    inword=g_utf8_strdown(t,-1);

  1494 	    g_free(t);

  1495 	    for (i=0;*nocomma[i];i++)

  1496 		if (!strcmp(inword,nocomma[i]))

  1497 		{

  1498 		    c=g_utf8_get_char(s);

  1499 		    if (c==',' || c==';' || c==':')

  1500 		    {

  1501 			if (pswit[ECHO_SWITCH])

  1502 			    g_print("\n%s\n",aline);

  1503 			if (!pswit[OVERVIEW_SWITCH])

  1504 			    g_print("    Line %ld column %ld - "

  1505 			      "Query punctuation after %s?\n",

  1506 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1507 			      inword);

  1508 			else

  1509 			    cnt_punct++;

  1510 		    }

  1511 		}

  1512 	    for (i=0;*noperiod[i];i++)

  1513 		if (!strcmp(inword,noperiod[i]))

  1514 		{

  1515 		    c=g_utf8_get_char(s);

  1516 		    if (c=='.' || c=='!')

  1517 		    {

  1518 			if (pswit[ECHO_SWITCH])

  1519 			    g_print("\n%s\n",aline);

  1520 			if (!pswit[OVERVIEW_SWITCH])

  1521 			    g_print("    Line %ld column %ld - "

  1522 			      "Query punctuation after %s?\n",

  1523 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1524 			      inword);

  1525 			else

  1526 			    cnt_punct++;

  1527 		    }

  1528 		}

  1529 	    g_free(inword);

  1530 	}

  1531     }

  1532 }

  1534 /*

  1535  * check_for_typos:

  1536  *

  1537  * Check for commonly mistyped words,

  1538  * and digits like 0 for O in a word.

  1539  */

  1540 void check_for_typos(const char *aline,struct warnings *warnings)

  1541 {

  1542     const char *s,*t,*nt,*wordstart;

  1543     gchar *inword;

  1544     gunichar *decomposition;

  1545     gchar *testword;

  1546     int i,vowel,consonant,*dupcnt;

  1547     gboolean isdup,istypo,alower;

  1548     gunichar c,pc;

  1549     long offset,len;

  1550     gsize decomposition_len;

  1551     for (s=aline;*s;)

  1552     {

  1553 	wordstart=s;

  1554 	inword=getaword(&s);

  1555 	if (!*inword)

  1556 	{

  1557 	    g_free(inword);

  1558 	    continue; /* don't bother with empty lines */

  1559 	}

  1560 	if (mixdigit(inword))

  1561 	{

  1562 	    if (pswit[ECHO_SWITCH])

  1563 		g_print("\n%s\n",aline);

  1564 	    if (!pswit[OVERVIEW_SWITCH])

  1565 		g_print("    Line %ld column %ld - Query digit in %s\n",

  1566 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  1567 	    else

  1568 		cnt_word++;

  1569 	}

  1570 	/*

  1571 	 * Put the word through a series of tests for likely typos and OCR

  1572 	 * errors.

  1573 	 */

  1574 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1575 	{

  1576 	    istypo=FALSE;

  1577 	    alower=FALSE;

  1578 	    for (t=inword;*t;t=g_utf8_next_char(t))

  1579 	    {

  1580 		c=g_utf8_get_char(t);

  1581 		nt=g_utf8_next_char(t);

  1582 		/* lowercase for testing */

  1583 		if (g_unichar_islower(c))

  1584 		    alower=TRUE;

  1585 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  1586 		{

  1587 		    /*

  1588 		     * We have an uppercase mid-word. However, there are

  1589 		     * common cases:

  1590 		     *   Mac and Mc like McGill

  1591 		     *   French contractions like l'Abbe

  1592 		     */

  1593 		    offset=g_utf8_pointer_to_offset(inword,t);

  1594 		    if (offset>0)

  1595 			pc=g_utf8_get_char(g_utf8_prev_char(t));

  1596 		    else

  1597 			pc='\0';

  1598 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  1599 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  1600 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  1601 		      CHAR_IS_APOSTROPHE(pc))

  1602 			; /* do nothing! */

  1603 		    else

  1604 			istypo=TRUE;

  1605 		}

  1606 	    }

  1607 	    testword=g_utf8_casefold(inword,-1);

  1608 	}

  1609 	if (pswit[TYPO_SWITCH])

  1610 	{

  1611 	    /*

  1612 	     * Check for certain unlikely two-letter combinations at word

  1613 	     * start and end.

  1614 	     */

  1615 	    len=g_utf8_strlen(testword,-1);

  1616 	    if (len>1)

  1617 	    {

  1618 		for (i=0;*nostart[i];i++)

  1619 		    if (g_str_has_prefix(testword,nostart[i]))

  1620 			istypo=TRUE;

  1621 		for (i=0;*noend[i];i++)

  1622 		    if (g_str_has_suffix(testword,noend[i]))

  1623 			istypo=TRUE;

  1624 	    }

  1625 	    /* ght is common, gbt never. Like that. */

  1626 	    if (strstr(testword,"cb"))

  1627 		istypo=TRUE;

  1628 	    if (strstr(testword,"gbt"))

  1629 		istypo=TRUE;

  1630 	    if (strstr(testword,"pbt"))

  1631 		istypo=TRUE;

  1632 	    if (strstr(testword,"tbs"))

  1633 		istypo=TRUE;

  1634 	    if (strstr(testword,"mrn"))

  1635 		istypo=TRUE;

  1636 	    if (strstr(testword,"ahle"))

  1637 		istypo=TRUE;

  1638 	    if (strstr(testword,"ihle"))

  1639 		istypo=TRUE;

  1640 	    /*

  1641 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1642 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1643 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1644 	     * numerals, but "ii" is a common scanno.

  1645 	     */

  1646 	    if (strstr(testword,"tbi"))

  1647 		istypo=TRUE;

  1648 	    if (strstr(testword,"tbe"))

  1649 		istypo=TRUE;

  1650 	    if (strstr(testword,"ii"))

  1651 		istypo=TRUE;

  1652 	    /*

  1653 	     * Check for no vowels or no consonants.

  1654 	     * If none, flag a typo.

  1655 	     */

  1656 	    if (!istypo && len>1)

  1657 	    {

  1658 		vowel=consonant=0;

  1659 		for (t=testword;*t;t=g_utf8_next_char(t))

  1660 		{

  1661 		    c=g_utf8_get_char(t);

  1662 		    decomposition=

  1663 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  1664 		    if (c=='y' || g_unichar_isdigit(c))

  1665 		    {

  1666 			/* Yah, this is loose. */

  1667 			vowel++;

  1668 			consonant++;

  1669 		    }

  1670 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1671 			vowel++;

  1672 		    else

  1673 			consonant++;

  1674 		    g_free(decomposition);

  1675 		}

  1676 		if (!vowel || !consonant)

  1677 		    istypo=TRUE;

  1678 	    }

  1679 	    /*

  1680 	     * Now exclude the word from being reported if it's in

  1681 	     * the okword list.

  1682 	     */

  1683 	    for (i=0;*okword[i];i++)

  1684 		if (!strcmp(testword,okword[i]))

  1685 		    istypo=FALSE;

  1686 	    /*

  1687 	     * What looks like a typo may be a Roman numeral.

  1688 	     * Exclude these.

  1689 	     */

  1690 	    if (istypo && isroman(testword))

  1691 		istypo=FALSE;

  1692 	    /* Check the manual list of typos. */

  1693 	    if (!istypo)

  1694 		for (i=0;*typo[i];i++)

  1695 		    if (!strcmp(testword,typo[i]))

  1696 			istypo=TRUE;

  1697 	    /*

  1698 	     * Check lowercase s, l, i and m - special cases.

  1699 	     *   "j" - often a semi-colon gone wrong.

  1700 	     *   "d" for a missing apostrophe - he d

  1701 	     *   "n" for "in"

  1702 	     */

  1703 	    if (!istypo && len==1 &&

  1704 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  1705 		istypo=TRUE;

  1706 	    if (istypo)

  1707 	    {

  1708 		dupcnt=g_tree_lookup(qword,testword);

  1709 		if (dupcnt)

  1710 		{

  1711 		    (*dupcnt)++;

  1712 		    isdup=!pswit[VERBOSE_SWITCH];

  1713 		}

  1714 		else

  1715 		{

  1716 		    dupcnt=g_new0(int,1);

  1717 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  1718 		    isdup=FALSE;

  1719 		}

  1720 		if (!isdup)

  1721 		{

  1722 		    if (pswit[ECHO_SWITCH])

  1723 			g_print("\n%s\n",aline);

  1724 		    if (!pswit[OVERVIEW_SWITCH])

  1725 		    {

  1726 			g_print("    Line %ld column %ld - Query word %s",

  1727 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  1728 			  inword);

  1729 			if (!pswit[VERBOSE_SWITCH])

  1730 			    g_print(" - not reporting duplicates");

  1731 			g_print("\n");

  1732 		    }

  1733 		    else

  1734 			cnt_word++;

  1735 		}

  1736 	    }

  1737 	}

  1738 	/* check the user's list of typos */

  1739 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  1740 	{

  1741 	    if (pswit[ECHO_SWITCH])

  1742 		g_print("\n%s\n",aline);

  1743 	    if (!pswit[OVERVIEW_SWITCH])

  1744 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  1745 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  1746 	}

  1747 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1748 	    g_free(testword);

  1749 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1750 	{

  1751 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1752 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1753 	    {

  1754 		if (pswit[ECHO_SWITCH])

  1755 		    g_print("\n%s\n",aline);

  1756 		if (!pswit[OVERVIEW_SWITCH])

  1757 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  1758 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  1759 		      inword);

  1760 		else

  1761 		    cnt_word++;

  1762 	    }

  1763 	}

  1764 	g_free(inword);

  1765     }

  1766 }

  1768 /*

  1769  * check_for_misspaced_punctuation:

  1770  *

  1771  * Look for added or missing spaces around punctuation and quotes.

  1772  * If there is a punctuation character like ! with no space on

  1773  * either side, suspect a missing!space. If there are spaces on

  1774  * both sides , assume a typo. If we see a double quote with no

  1775  * space or punctuation on either side of it, assume unspaced

  1776  * quotes "like"this.

  1777  */

  1778 void check_for_misspaced_punctuation(const char *aline,

  1779   struct parities *parities,gboolean isemptyline)

  1780 {

  1781     gboolean isacro,isellipsis;

  1782     const char *s;

  1783     gunichar c,nc,pc,n2c;

  1784     c=g_utf8_get_char(aline);

  1785     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1786     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1787     {

  1788 	pc=c;

  1789 	c=nc;

  1790 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1791 	/* For each character in the line after the first. */

  1792 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  1793 	{

  1794 	    /* we need to suppress warnings for acronyms like M.D. */

  1795 	    isacro=FALSE;

  1796 	    /* we need to suppress warnings for ellipsis . . . */

  1797 	    isellipsis=FALSE;

  1798 	    /*

  1799 	     * If there are letters on both sides of it or

  1800 	     * if it's strict punctuation followed by an alpha.

  1801 	     */

  1802 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  1803 	      g_utf8_strchr("?!,;:",-1,c)))

  1804 	    {

  1805 		if (c=='.')

  1806 		{

  1807 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1808 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1809 			isacro=TRUE;

  1810 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1811 		    if (nc && n2c=='.')

  1812 			isacro=TRUE;

  1813 		}

  1814 		if (!isacro)

  1815 		{

  1816 		    if (pswit[ECHO_SWITCH])

  1817 			g_print("\n%s\n",aline);

  1818 		    if (!pswit[OVERVIEW_SWITCH])

  1819 			g_print("    Line %ld column %ld - Missing space?\n",

  1820 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1821 		    else

  1822 			cnt_punct++;

  1823 		}

  1824 	    }

  1825 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  1826 	    {

  1827 		/*

  1828 		 * If there are spaces on both sides,

  1829 		 * or space before and end of line.

  1830 		 */

  1831 		if (c=='.')

  1832 		{

  1833 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1834 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1835 			isellipsis=TRUE;

  1836 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1837 		    if (nc && n2c=='.')

  1838 			isellipsis=TRUE;

  1839 		}

  1840 		if (!isemptyline && !isellipsis)

  1841 		{

  1842 		    if (pswit[ECHO_SWITCH])

  1843 			g_print("\n%s\n",aline);

  1844 		    if (!pswit[OVERVIEW_SWITCH])

  1845 			g_print("    Line %ld column %ld - "

  1846 			  "Spaced punctuation?\n",linecnt,

  1847 			  g_utf8_pointer_to_offset(aline,s)+1);

  1848 		    else

  1849 			cnt_punct++;

  1850 		}

  1851 	    }

  1852 	}

  1853     }

  1854     /* Split out the characters that CANNOT be preceded by space. */

  1855     c=g_utf8_get_char(aline);

  1856     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1857     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1858     {

  1859 	pc=c;

  1860 	c=nc;

  1861 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1862 	/* for each character in the line after the first */

  1863 	if (g_utf8_strchr("?!,;:",-1,c))

  1864 	{

  1865 	    /* if it's punctuation that _cannot_ have a space before it */

  1866 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  1867 	    {

  1868 		/*

  1869 		 * If nc DOES == space,

  1870 		 * it was already reported just above.

  1871 		 */

  1872 		if (pswit[ECHO_SWITCH])

  1873 		    g_print("\n%s\n",aline);

  1874 		if (!pswit[OVERVIEW_SWITCH])

  1875 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1876 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1877 		else

  1878 		    cnt_punct++;

  1879 	    }

  1880 	}

  1881     }

  1882     /*

  1883      * Special case " .X" where X is any alpha.

  1884      * This plugs a hole in the acronym code above.

  1885      * Inelegant, but maintainable.

  1886      */

  1887     c=g_utf8_get_char(aline);

  1888     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1889     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1890     {

  1891 	pc=c;

  1892 	c=nc;

  1893 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1894 	/* for each character in the line after the first */

  1895 	if (c=='.')

  1896 	{

  1897 	    /* if it's a period */

  1898 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  1899 	    {

  1900 		/*

  1901 		 * If the period follows a space and

  1902 		 * is followed by a letter.

  1903 		 */

  1904 		if (pswit[ECHO_SWITCH])

  1905 		    g_print("\n%s\n",aline);

  1906 		if (!pswit[OVERVIEW_SWITCH])

  1907 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1908 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1909 		else

  1910 		    cnt_punct++;

  1911 	    }

  1912 	}

  1913     }

  1914     c=g_utf8_get_char(aline);

  1915     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1916     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1917     {

  1918 	pc=c;

  1919 	c=nc;

  1920 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1921 	/* for each character in the line after the first */

  1922 	if (c==CHAR_DQUOTE)

  1923 	{

  1924 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  1925 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  1926 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  1927 	    {

  1928 		if (pswit[ECHO_SWITCH])

  1929 		    g_print("\n%s\n",aline);

  1930 		if (!pswit[OVERVIEW_SWITCH])

  1931 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  1932 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1933 		else

  1934 		    cnt_punct++;

  1935 	    }

  1936 	}

  1937     }

  1938     /* Check parity of quotes. */

  1939     nc=g_utf8_get_char(aline);

  1940     for (s=aline;*s;s=g_utf8_next_char(s))

  1941     {

  1942 	c=nc;

  1943 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1944 	if (c==CHAR_DQUOTE)

  1945 	{

  1946 	    parities->dquote=!parities->dquote;

  1947 	    if (!parities->dquote)

  1948 	    {

  1949 		/* parity even */

  1950 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))

  1951 		{

  1952 		    if (pswit[ECHO_SWITCH])

  1953 			g_print("\n%s\n",aline);

  1954 		    if (!pswit[OVERVIEW_SWITCH])

  1955 			g_print("    Line %ld column %ld - "

  1956 			  "Wrongspaced quotes?\n",

  1957 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1958 		    else

  1959 			cnt_punct++;

  1960 		}

  1961 	    }

  1962 	    else

  1963 	    {

  1964 		/* parity odd */

  1965 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  1966 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)

  1967 		{

  1968 		    if (pswit[ECHO_SWITCH])

  1969 			g_print("\n%s\n",aline);

  1970 		    if (!pswit[OVERVIEW_SWITCH])

  1971 			g_print("    Line %ld column %ld - "

  1972 			  "Wrongspaced quotes?\n",

  1973 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1974 		    else

  1975 			cnt_punct++;

  1976 		}

  1977 	    }

  1978 	}

  1979     }

  1980     if (g_utf8_get_char(aline)==CHAR_DQUOTE)

  1981     {

  1982 	if (g_utf8_strchr(",;:!?)]} ",-1,

  1983 	  g_utf8_get_char(g_utf8_next_char(aline))))

  1984 	{

  1985 	    if (pswit[ECHO_SWITCH])

  1986 		g_print("\n%s\n",aline);

  1987 	    if (!pswit[OVERVIEW_SWITCH])

  1988 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  1989 		  linecnt);

  1990 	    else

  1991 		cnt_punct++;

  1992 	}

  1993     }

  1994     if (pswit[SQUOTE_SWITCH])

  1995     {

  1996 	nc=g_utf8_get_char(aline);

  1997 	for (s=aline;*s;s=g_utf8_next_char(s))

  1998 	{

  1999 	    c=nc;

  2000 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  2001 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&

  2002 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2003 	      !g_unichar_isalpha(nc)))

  2004 	    {

  2005 		parities->squote=!parities->squote;

  2006 		if (!parities->squote)

  2007 		{

  2008 		    /* parity even */

  2009 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2010 		    {

  2011 			if (pswit[ECHO_SWITCH])

  2012 			    g_print("\n%s\n",aline);

  2013 			if (!pswit[OVERVIEW_SWITCH])

  2014 			    g_print("    Line %ld column %ld - "

  2015 			      "Wrongspaced singlequotes?\n",

  2016 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2017 			else

  2018 			    cnt_punct++;

  2019 		    }

  2020 		}

  2021 		else

  2022 		{

  2023 		    /* parity odd */

  2024 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2025 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2026 		    {

  2027 			if (pswit[ECHO_SWITCH])

  2028 			    g_print("\n%s\n",aline);

  2029 			if (!pswit[OVERVIEW_SWITCH])

  2030 			    g_print("    Line %ld column %ld - "

  2031 			      "Wrongspaced singlequotes?\n",

  2032 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2033 			else

  2034 			    cnt_punct++;

  2035 		    }

  2036 		}

  2037 	    }

  2038 	}

  2039     }

  2040 }

  2042 /*

  2043  * check_for_double_punctuation:

  2044  *

  2045  * Look for double punctuation like ,. or ,,

  2046  * Thanks to DW for the suggestion!

  2047  * In books with references, ".," and ".;" are common

  2048  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2049  * OTOH, from my initial tests, there are also fairly

  2050  * common errors. What to do? Make these cases paranoid?

  2051  * ".," is the most common, so warnings->dotcomma is used

  2052  * to suppress detailed reporting if it occurs often.

  2053  */

  2054 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2055 {

  2056     const char *s;

  2057     gunichar c,nc;

  2058     nc=g_utf8_get_char(aline);

  2059     for (s=aline;*s;s=g_utf8_next_char(s))

  2060     {

  2061 	c=nc;

  2062 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2063 	/* for each punctuation character in the line */

  2064 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2065 	  g_utf8_strchr(".?!,;:",-1,nc))

  2066 	{

  2067 	    /* followed by punctuation, it's a query, unless . . . */

  2068 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||

  2069 	      !warnings->dotcomma && c=='.' && nc==',' ||

  2070 	      warnings->isFrench && g_str_has_prefix(s,",...") ||

  2071 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2072 	      warnings->isFrench && g_str_has_prefix(s,";...") ||

  2073 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2074 	      warnings->isFrench && g_str_has_prefix(s,":...") ||

  2075 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2076 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2077 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2078 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2079 	      warnings->isFrench && g_str_has_prefix(s,"...?"))

  2080 	    {

  2081 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||

  2082 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2083 		  warnings->isFrench && g_str_has_prefix(s,";...") ||

  2084 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2085 		  warnings->isFrench && g_str_has_prefix(s,":...") ||

  2086 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2087 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2088 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2089 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2090 		  warnings->isFrench && g_str_has_prefix(s,"...?"))

  2091 		{

  2092 		    s+=4;

  2093 		    nc=g_utf8_get_char(g_utf8_next_char(s));

  2094 		}

  2095 		; /* do nothing for .. !! and ?? which can be legit */

  2096 	    }

  2097 	    else

  2098 	    {

  2099 		if (pswit[ECHO_SWITCH])

  2100 		    g_print("\n%s\n",aline);

  2101 		if (!pswit[OVERVIEW_SWITCH])

  2102 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2103 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2104 		else

  2105 		    cnt_punct++;

  2106 	    }

  2107 	}

  2108     }

  2109 }

  2111 /*

  2112  * check_for_spaced_quotes:

  2113  */

  2114 void check_for_spaced_quotes(const char *aline)

  2115 {

  2116     int i;

  2117     const char *s,*t;

  2118     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,

  2119       CHAR_RS_QUOTE};

  2120     GString *pattern;

  2121     s=aline;

  2122     while ((t=strstr(s," \" ")))

  2123     {

  2124 	if (pswit[ECHO_SWITCH])

  2125 	    g_print("\n%s\n",aline);

  2126 	if (!pswit[OVERVIEW_SWITCH])

  2127 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2128 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2129 	else

  2130 	    cnt_punct++;

  2131 	s=g_utf8_next_char(g_utf8_next_char(t));

  2132     }

  2133     pattern=g_string_new(NULL);

  2134     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)

  2135     {

  2136 	g_string_assign(pattern," ");

  2137 	g_string_append_unichar(pattern,single_quotes[i]);

  2138 	g_string_append_c(pattern,' ');

  2139 	s=aline;

  2140 	while ((t=strstr(s,pattern->str)))

  2141 	{

  2142 	    if (pswit[ECHO_SWITCH])

  2143 		g_print("\n%s\n",aline);

  2144 	    if (!pswit[OVERVIEW_SWITCH])

  2145 		g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2146 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2147 	    else

  2148 		cnt_punct++;

  2149 	    s=g_utf8_next_char(g_utf8_next_char(t));

  2150 	}

  2151     }

  2152     g_string_free(pattern,TRUE);

  2153 }

  2155 /*

  2156  * check_for_miscased_genative:

  2157  *

  2158  * Check special case of 'S instead of 's at end of word.

  2159  */

  2160 void check_for_miscased_genative(const char *aline)

  2161 {

  2162     const char *s;

  2163     gunichar c,nc,pc;

  2164     if (!*aline)

  2165 	return;

  2166     c=g_utf8_get_char(aline);

  2167     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2168     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2169     {

  2170 	pc=c;

  2171 	c=nc;

  2172 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2173 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))

  2174 	{

  2175 	    if (pswit[ECHO_SWITCH])

  2176 		g_print("\n%s\n",aline);

  2177 	    if (!pswit[OVERVIEW_SWITCH])

  2178 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2179 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2180 	    else

  2181 		cnt_punct++;

  2182 	}

  2183     }

  2184 }

  2186 /*

  2187  * check_end_of_line:

  2188  *

  2189  * Now check special cases - start and end of line -

  2190  * for single and double quotes. Start is sometimes [sic]

  2191  * but better to query it anyway.

  2192  * While we're here, check for dash at end of line.

  2193  */

  2194 void check_end_of_line(const char *aline,struct warnings *warnings)

  2195 {

  2196     int lbytes;

  2197     const char *s;

  2198     gunichar c1,c2;

  2199     lbytes=strlen(aline);

  2200     if (g_utf8_strlen(aline,lbytes)>1)

  2201     {

  2202 	s=g_utf8_prev_char(aline+lbytes);

  2203 	c1=g_utf8_get_char(s);

  2204 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2205 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)

  2206 	{

  2207 	    if (pswit[ECHO_SWITCH])

  2208 		g_print("\n%s\n",aline);

  2209 	    if (!pswit[OVERVIEW_SWITCH])

  2210 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2211 		  g_utf8_strlen(aline,lbytes));

  2212 	    else

  2213 		cnt_punct++;

  2214 	}

  2215 	c1=g_utf8_get_char(aline);

  2216 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2217 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)

  2218 	{

  2219 	    if (pswit[ECHO_SWITCH])

  2220 		g_print("\n%s\n",aline);

  2221 	    if (!pswit[OVERVIEW_SWITCH])

  2222 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2223 	    else

  2224 		cnt_punct++;

  2225 	}

  2226 	/*

  2227 	 * Dash at end of line may well be legit - paranoid mode only

  2228 	 * and don't report em-dash at line-end.

  2229 	 */

  2230 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2231 	{

  2232 	    for (s=g_utf8_prev_char(aline+lbytes);

  2233 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2234 		;

  2235 	    if (g_utf8_get_char(s)=='-' &&

  2236 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2237 	    {

  2238 		if (pswit[ECHO_SWITCH])

  2239 		    g_print("\n%s\n",aline);

  2240 		if (!pswit[OVERVIEW_SWITCH])

  2241 		    g_print("    Line %ld column %ld - "

  2242 		      "Hyphen at end of line?\n",

  2243 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2244 	    }

  2245 	}

  2246     }

  2247 }

  2249 /*

  2250  * check_for_unspaced_bracket:

  2251  *

  2252  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2253  * If so, suspect a scanno like "a]most".

  2254  */

  2255 void check_for_unspaced_bracket(const char *aline)

  2256 {

  2257     const char *s;

  2258     gunichar c,nc,pc;

  2259     c=g_utf8_get_char(aline);

  2260     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2261     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2262     {

  2263 	pc=c;

  2264 	c=nc;

  2265 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2266 	if (!nc)

  2267 	    break;

  2268 	/* for each bracket character in the line except 1st & last */

  2269 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2270 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2271 	{

  2272 	    if (pswit[ECHO_SWITCH])

  2273 		g_print("\n%s\n",aline);

  2274 	    if (!pswit[OVERVIEW_SWITCH])

  2275 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2276 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2277 	    else

  2278 		cnt_punct++;

  2279 	}

  2280     }

  2281 }

  2283 /*

  2284  * check_for_unpunctuated_endquote:

  2285  */

  2286 void check_for_unpunctuated_endquote(const char *aline)

  2287 {

  2288     const char *s;

  2289     gunichar c,nc,pc;

  2290     c=g_utf8_get_char(aline);

  2291     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2292     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2293     {

  2294 	pc=c;

  2295 	c=nc;

  2296 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2297 	/* for each character in the line except 1st */

  2298 	if (c==CHAR_DQUOTE && isalpha(pc))

  2299 	{

  2300 	    if (pswit[ECHO_SWITCH])

  2301 		g_print("\n%s\n",aline);

  2302 	    if (!pswit[OVERVIEW_SWITCH])

  2303 		g_print("    Line %ld column %ld - "

  2304 		  "endquote missing punctuation?\n",

  2305 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2306 	    else

  2307 		cnt_punct++;

  2308 	}

  2309     }

  2310 }

  2312 /*

  2313  * check_for_html_tag:

  2314  *

  2315  * Check for <HTML TAG>.

  2316  *

  2317  * If there is a < in the line, followed at some point

  2318  * by a > then we suspect HTML.

  2319  */

  2320 void check_for_html_tag(const char *aline)

  2321 {

  2322     const char *open,*close;

  2323     gchar *tag;

  2324     open=strchr(aline,'<');

  2325     if (open)

  2326     {

  2327 	close=strchr(g_utf8_next_char(open),'>');

  2328 	if (close)

  2329 	{

  2330 	    if (pswit[ECHO_SWITCH])

  2331 		g_print("\n%s\n",aline);

  2332 	    if (!pswit[OVERVIEW_SWITCH])

  2333 	    {

  2334 		tag=g_strndup(open,close-open+1);

  2335 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2336 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2337 		g_free(tag);

  2338 	    }

  2339 	    else

  2340 		cnt_html++;

  2341 	}

  2342     }

  2343 }

  2345 /*

  2346  * check_for_html_entity:

  2347  *

  2348  * Check for &symbol; HTML.

  2349  *

  2350  * If there is a & in the line, followed at

  2351  * some point by a ; then we suspect HTML.

  2352  */

  2353 void check_for_html_entity(const char *aline)

  2354 {

  2355     const char *s,*amp,*scolon;

  2356     gchar *entity;

  2357     amp=strchr(aline,'&');

  2358     if (amp)

  2359     {

  2360 	scolon=strchr(amp,';');

  2361 	if (scolon)

  2362 	{

  2363 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2364 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2365 		    break;		/* Don't report "Jones & Son;" */

  2366 	    if (s>=scolon)

  2367 	    {

  2368 		if (pswit[ECHO_SWITCH])

  2369 		    g_print("\n%s\n",aline);

  2370 		if (!pswit[OVERVIEW_SWITCH])

  2371 		{

  2372 		    entity=g_strndup(amp,scolon-amp+1);

  2373 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2374 		      linecnt,(int)(amp-aline)+1,entity);

  2375 		    g_free(entity);

  2376 		}

  2377 		else

  2378 		    cnt_html++;

  2379 	    }

  2380 	}

  2381     }

  2382 }

  2384 /*

  2385  * check_for_omitted_punctuation:

  2386  *

  2387  * Check for omitted punctuation at end of paragraph by working back

  2388  * through prevline. DW.

  2389  * Need to check this only for "normal" paras.

  2390  * So what is a "normal" para?

  2391  *    Not normal if one-liner (chapter headings, etc.)

  2392  *    Not normal if doesn't contain at least one locase letter

  2393  *    Not normal if starts with space

  2394  */

  2395 void check_for_omitted_punctuation(const char *prevline,

  2396   struct line_properties *last,int start_para_line)

  2397 {

  2398     gboolean letter_on_line=FALSE;

  2399     const char *s;

  2400     gunichar c;

  2401     for (s=prevline;*s;s=g_utf8_next_char(s))

  2402 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2403 	{

  2404 	    letter_on_line=TRUE;

  2405 	    break;

  2406 	}

  2407     /*

  2408      * This next "if" is a problem.

  2409      * If we say "start_para_line <= linecnt - 1", that includes

  2410      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2411      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2412      * misses genuine one-line paragraphs.

  2413      */

  2414     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2415       g_utf8_get_char(prevline)>CHAR_SPACE)

  2416     {

  2417 	s=prevline+strlen(prevline);

  2418 	do

  2419 	{

  2420 	    s=g_utf8_prev_char(s);

  2421 	    c=g_utf8_get_char(s);

  2422 	} while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);

  2423 	for (;s>prevline;s=g_utf8_prev_char(s))

  2424 	{

  2425 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2426 	    {

  2427 		if (pswit[ECHO_SWITCH])

  2428 		    g_print("\n%s\n",prevline);

  2429 		if (!pswit[OVERVIEW_SWITCH])

  2430 		    g_print("    Line %ld column %ld - "

  2431 		      "No punctuation at para end?\n",

  2432 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2433 		else

  2434 		    cnt_punct++;

  2435 		break;

  2436 	    }

  2437 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))

  2438 		break;

  2439 	}

  2440     }

  2441 }

  2443 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2444 {

  2445     const char *word=key;

  2446     int *dupcnt=value;

  2447     if (*dupcnt)

  2448 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  2449 	  word,*dupcnt);

  2450     return FALSE;

  2451 }

  2453 void print_as_windows_1252(const char *string)

  2454 {

  2455     gsize inbytes,outbytes;

  2456     gchar *buf,*bp;

  2457     static GIConv converter=(GIConv)-1;

  2458     if (!string)

  2459     {

  2460 	if (converter!=(GIConv)-1)

  2461 	    g_iconv_close(converter);

  2462 	converter=(GIConv)-1;

  2463 	return;

  2464     }

  2465     if (converter==(GIConv)-1)

  2466 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  2467     if (converter!=(GIConv)-1)

  2468     {

  2469 	inbytes=outbytes=strlen(string);

  2470 	bp=buf=g_malloc(outbytes+1);

  2471 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  2472 	*bp='\0';

  2473 	fputs(buf,stdout);

  2474 	g_free(buf);

  2475     }

  2476     else

  2477 	fputs(string,stdout);

  2478 }

  2480 void print_as_utf_8(const char *string)

  2481 {

  2482     fputs(string,stdout);

  2483 }

  2485 /*

  2486  * procfile:

  2487  *

  2488  * Process one file.

  2489  */

  2490 void procfile(const char *filename)

  2491 {

  2492     const char *s;

  2493     gchar *parastart=NULL;	/* first line of current para */

  2494     gchar *etext,*aline;

  2495     gchar *etext_ptr;

  2496     GError *err=NULL;

  2497     struct first_pass_results *first_pass_results;

  2498     struct warnings *warnings;

  2499     struct counters counters={0};

  2500     struct line_properties last={0};

  2501     struct parities parities={0};

  2502     struct pending pending={0};

  2503     gboolean isemptyline;

  2504     long start_para_line=0;

  2505     gboolean isnewpara=FALSE,enddash=FALSE;

  2506     last.start=CHAR_SPACE;

  2507     linecnt=checked_linecnt=0;

  2508     etext=read_etext(filename,&err);

  2509     if (!etext)

  2510     {

  2511 	if (pswit[STDOUT_SWITCH])

  2512 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  2513 	else

  2514 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  2515 	exit(1);

  2516     }

  2517     if (g_path_is_absolute(filename))

  2518 	g_print("\n\nFile: %s\n\n",filename);

  2519     else

  2520     {

  2521 	gchar *cwd,*path;

  2522 	cwd=g_get_current_dir();

  2523 	path=g_build_filename(cwd,filename,NULL);

  2524 	g_free(cwd);

  2525 	g_print("\n\nFile: %s\n\n",path);

  2526 	g_free(path);

  2527     }

  2528     first_pass_results=first_pass(etext);

  2529     warnings=report_first_pass(first_pass_results);

  2530     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  2531     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  2532     /*

  2533      * Here we go with the main pass. Hold onto yer hat!

  2534      */

  2535     linecnt=0;

  2536     etext_ptr=etext;

  2537     while ((aline=flgets(&etext_ptr,linecnt+1)))

  2538     {

  2539 	linecnt++;

  2540 	if (linecnt==1)

  2541 	    isnewpara=TRUE;

  2542 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  2543 	    continue;    // skip DP page separators completely

  2544 	if (linecnt<first_pass_results->firstline ||

  2545 	  (first_pass_results->footerline>0 &&

  2546 	  linecnt>first_pass_results->footerline))

  2547 	{

  2548 	    if (pswit[HEADER_SWITCH])

  2549 	    {

  2550 		if (g_str_has_prefix(aline,"Title:"))

  2551 		    g_print("    %s\n",aline);

  2552 		if (g_str_has_prefix(aline,"Author:"))

  2553 		    g_print("    %s\n",aline);

  2554 		if (g_str_has_prefix(aline,"Release Date:"))

  2555 		    g_print("    %s\n",aline);

  2556 		if (g_str_has_prefix(aline,"Edition:"))

  2557 		    g_print("    %s\n\n",aline);

  2558 	    }

  2559 	    continue;		/* skip through the header */

  2560 	}

  2561 	checked_linecnt++;

  2562 	print_pending(aline,parastart,&pending);

  2563 	isemptyline=analyse_quotes(aline,&counters);

  2564 	if (isnewpara && !isemptyline)

  2565 	{

  2566 	    /* This line is the start of a new paragraph. */

  2567 	    start_para_line=linecnt;

  2568 	    /* Capture its first line in case we want to report it later. */

  2569 	    g_free(parastart);

  2570 	    parastart=g_strdup(aline);

  2571 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2572 	    s=aline;

  2573 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  2574 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  2575 		s=g_utf8_next_char(s);

  2576 	    if (g_unichar_islower(g_utf8_get_char(s)))

  2577 	    {

  2578 		/* and its first letter is lowercase */

  2579 		if (pswit[ECHO_SWITCH])

  2580 		    g_print("\n%s\n",aline);

  2581 		if (!pswit[OVERVIEW_SWITCH])

  2582 		    g_print("    Line %ld column %ld - "

  2583 		      "Paragraph starts with lower-case\n",

  2584 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2585 		else

  2586 		    cnt_punct++;

  2587 	    }

  2588 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  2589 	}

  2590 	/* Check for an em-dash broken at line end. */

  2591 	if (enddash && g_utf8_get_char(aline)=='-')

  2592 	{

  2593 	    if (pswit[ECHO_SWITCH])

  2594 		g_print("\n%s\n",aline);

  2595 	    if (!pswit[OVERVIEW_SWITCH])

  2596 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2597 	    else

  2598 		cnt_punct++;

  2599 	}

  2600 	enddash=FALSE;

  2601 	for (s=g_utf8_prev_char(aline+strlen(aline));

  2602 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  2603 	    ;

  2604 	if (s>=aline && g_utf8_get_char(s)=='-')

  2605 	    enddash=TRUE;

  2606 	check_for_control_characters(aline);

  2607 	if (warnings->bin)

  2608 	    check_for_odd_characters(aline,warnings,isemptyline);

  2609 	if (warnings->longline)

  2610 	    check_for_long_line(aline);

  2611 	if (warnings->shortline)

  2612 	    check_for_short_line(aline,&last);

  2613 	last.blen=last.len;

  2614 	last.len=g_utf8_strlen(aline,-1);

  2615 	last.start=g_utf8_get_char(aline);

  2616 	check_for_starting_punctuation(aline);

  2617 	if (warnings->dash)

  2618 	{

  2619 	    check_for_spaced_emdash(aline);

  2620 	    check_for_spaced_dash(aline);

  2621 	}

  2622 	check_for_unmarked_paragraphs(aline);

  2623 	check_for_jeebies(aline);

  2624 	check_for_mta_from(aline);

  2625 	check_for_orphan_character(aline);

  2626 	check_for_pling_scanno(aline);

  2627 	check_for_extra_period(aline,warnings);

  2628 	check_for_following_punctuation(aline);

  2629 	check_for_typos(aline,warnings);

  2630 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2631 	check_for_double_punctuation(aline,warnings);

  2632 	check_for_spaced_quotes(aline);

  2633 	check_for_miscased_genative(aline);

  2634 	check_end_of_line(aline,warnings);

  2635 	check_for_unspaced_bracket(aline);

  2636 	if (warnings->endquote)

  2637 	    check_for_unpunctuated_endquote(aline);

  2638 	check_for_html_tag(aline);

  2639 	check_for_html_entity(aline);

  2640 	if (isemptyline)

  2641 	{

  2642 	    check_for_mismatched_quotes(&counters,&pending);

  2643 	    counters_reset(&counters);

  2644 	    /* let the next iteration know that it's starting a new para */

  2645 	    isnewpara=TRUE;

  2646 	    if (prevline)

  2647 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  2648 	}

  2649 	g_free(prevline);

  2650 	prevline=g_strdup(aline);

  2651     }

  2652     linecnt++;

  2653     check_for_mismatched_quotes(&counters,&pending);

  2654     print_pending(NULL,parastart,&pending);

  2655     reset_pending(&pending);

  2656     if (prevline)

  2657     {

  2658 	g_free(prevline);

  2659 	prevline=NULL;

  2660     }

  2661     g_free(parastart);

  2662     g_free(prevline);

  2663     g_free(etext);

  2664     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  2665 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  2666     g_tree_unref(qword);

  2667     g_tree_unref(qperiod);

  2668     counters_destroy(&counters);

  2669     g_set_print_handler(NULL);

  2670     print_as_windows_1252(NULL);

  2671     if (pswit[MARKUP_SWITCH])

  2672 	loseentities(NULL);

  2673 }

  2675 /*

  2676  * flgets:

  2677  *

  2678  * Get one line from the input text, checking for

  2679  * the existence of exactly one CR/LF line-end per line.

  2680  *

  2681  * Returns: a pointer to the line.

  2682  */

  2683 char *flgets(char **etext,long lcnt)

  2684 {

  2685     gunichar c;

  2686     gboolean isCR=FALSE;

  2687     char *theline=*etext;

  2688     char *eos=theline;

  2689     gchar *s;

  2690     for (;;)

  2691     {

  2692 	c=g_utf8_get_char(*etext);

  2693 	*etext=g_utf8_next_char(*etext);

  2694 	if (!c)

  2695 	    return NULL;

  2696 	/* either way, it's end of line */

  2697 	if (c=='\n')

  2698 	{

  2699 	    if (isCR)

  2700 		break;

  2701 	    else

  2702 	    {

  2703 		/* Error - a LF without a preceding CR */

  2704 		if (pswit[LINE_END_SWITCH])

  2705 		{

  2706 		    if (pswit[ECHO_SWITCH])

  2707 		    {

  2708 			s=g_strndup(theline,eos-theline);

  2709 			g_print("\n%s\n",s);

  2710 			g_free(s);

  2711 		    }

  2712 		    if (!pswit[OVERVIEW_SWITCH])

  2713 			g_print("    Line %ld - No CR?\n",lcnt);

  2714 		    else

  2715 			cnt_lineend++;

  2716 		}

  2717 		break;

  2718 	    }

  2719 	}

  2720 	if (c=='\r')

  2721 	{

  2722 	    if (isCR)

  2723 	    {

  2724 		/* Error - two successive CRs */

  2725 		if (pswit[LINE_END_SWITCH])

  2726 		{

  2727 		    if (pswit[ECHO_SWITCH])

  2728 		    {

  2729 			s=g_strndup(theline,eos-theline);

  2730 			g_print("\n%s\n",s);

  2731 			g_free(s);

  2732 		    }

  2733 		    if (!pswit[OVERVIEW_SWITCH])

  2734 			g_print("    Line %ld - Two successive CRs?\n",lcnt);

  2735 		    else

  2736 			cnt_lineend++;

  2737 		}

  2738 	    }

  2739 	    isCR=TRUE;

  2740 	}

  2741 	else

  2742 	{

  2743 	    if (pswit[LINE_END_SWITCH] && isCR)

  2744 	    {

  2745 		if (pswit[ECHO_SWITCH])

  2746 		{

  2747 		    s=g_strndup(theline,eos-theline);

  2748 		    g_print("\n%s\n",s);

  2749 		    g_free(s);

  2750 		}

  2751 		if (!pswit[OVERVIEW_SWITCH])

  2752 		    g_print("    Line %ld column %ld - CR without LF?\n",

  2753 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  2754 		else

  2755 		    cnt_lineend++;

  2756 		*eos=' ';

  2757 	    }

  2758 	    isCR=FALSE;

  2759 	    eos=g_utf8_next_char(eos);

  2760 	}

  2761     }

  2762     *eos='\0';

  2763     if (pswit[MARKUP_SWITCH])

  2764 	postprocess_for_HTML(theline);

  2765     if (pswit[DP_SWITCH])

  2766 	postprocess_for_DP(theline);

  2767     return theline;

  2768 }

  2770 /*

  2771  * mixdigit:

  2772  *

  2773  * Takes a "word" as a parameter, and checks whether it

  2774  * contains a mixture of alpha and digits. Generally, this is an

  2775  * error, but may not be for cases like 4th or L5 12s. 3d.

  2776  *

  2777  * Returns: TRUE iff an is error found.

  2778  */

  2779 gboolean mixdigit(const char *checkword)

  2780 {

  2781     gboolean wehaveadigit,wehavealetter,query;

  2782     const char *s,*nondigit;

  2783     wehaveadigit=wehavealetter=query=FALSE;

  2784     for (s=checkword;*s;s=g_utf8_next_char(s))

  2785 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2786 	    wehavealetter=TRUE;

  2787 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  2788 	    wehaveadigit=TRUE;

  2789     if (wehaveadigit && wehavealetter)

  2790     {

  2791 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2792 	query=TRUE;

  2793 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  2794 	  nondigit=g_utf8_next_char(nondigit))

  2795 	    ;

  2796 	/* digits, ending in st, rd, nd, th of either case */

  2797 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  2798 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  2799 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  2800 	  !g_ascii_strcasecmp(nondigit,"th"))

  2801 	    query=FALSE;

  2802 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  2803 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  2804 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  2805 	  !g_ascii_strcasecmp(nondigit,"ths"))

  2806 	    query=FALSE;

  2807 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  2808 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  2809 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  2810 	  !g_ascii_strcasecmp(nondigit,"thly"))

  2811 	    query=FALSE;

  2812 	/* digits, ending in l, L, s or d */

  2813 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  2814 	  !strcmp(nondigit,"d"))

  2815 	    query=FALSE;

  2816 	/*

  2817 	 * L at the start of a number, representing Britsh pounds, like L500.

  2818 	 * This is cute. We know the current word is mixed digit. If the first

  2819 	 * letter is L, there must be at least one digit following. If both

  2820 	 * digits and letters follow, we have a genuine error, else we have a

  2821 	 * capital L followed by digits, and we accept that as a non-error.

  2822 	 */

  2823 	if (g_utf8_get_char(checkword)=='L' &&

  2824 	  !mixdigit(g_utf8_next_char(checkword)))

  2825 	    query=FALSE;

  2826     }

  2827     return query;

  2828 }

  2830 /*

  2831  * getaword:

  2832  *

  2833  * Extracts the first/next "word" from the line, and returns it.

  2834  * A word is defined as one English word unit--or at least that's the aim.

  2835  * "ptr" is advanced to the position in the line where we will start

  2836  * looking for the next word.

  2837  *

  2838  * Returns: A newly-allocated string.

  2839  */

  2840 gchar *getaword(const char **ptr)

  2841 {

  2842     const char *s,*t;

  2843     GString *word;

  2844     gunichar c,pc;

  2845     word=g_string_new(NULL);

  2846     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  2847       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  2848       **ptr;*ptr=g_utf8_next_char(*ptr))

  2849 	;

  2850     /*

  2851      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2852      * Especially yucky is the case of L1,000

  2853      * This section looks for a pattern of characters including a digit

  2854      * followed by a comma or period followed by one or more digits.

  2855      * If found, it returns this whole pattern as a word; otherwise we discard

  2856      * the results and resume our normal programming.

  2857      */

  2858     s=*ptr;

  2859     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  2860       g_unichar_isalpha(g_utf8_get_char(s)) ||

  2861       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  2862 	g_string_append_unichar(word,g_utf8_get_char(s));

  2863     if (word->len)

  2864     {

  2865 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))

  2866 	{

  2867 	    c=g_utf8_get_char(t);

  2868 	    pc=g_utf8_get_char(g_utf8_prev_char(t));

  2869 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  2870 	    {

  2871 		*ptr=s;

  2872 		return g_string_free(word,FALSE);

  2873 	    }

  2874 	}

  2875     }

  2876     /* we didn't find a punctuated number - do the regular getword thing */

  2877     g_string_truncate(word,0);

  2878     c=g_utf8_get_char(*ptr);

  2879     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);

  2880       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))

  2881 	g_string_append_unichar(word,c);

  2882     return g_string_free(word,FALSE);

  2883 }

  2885 /*

  2886  * isroman:

  2887  *

  2888  * Is this word a Roman Numeral?

  2889  *

  2890  * It doesn't actually validate that the number is a valid Roman Numeral--for

  2891  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  2892  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  2893  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  2894  * expressions thereof, except when it came to taxes. Allow any number of M,

  2895  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  2896  * XL or an optional XC, an optional IX or IV, an optional V and any number

  2897  * of optional Is.

  2898  */

  2899 gboolean isroman(const char *t)

  2900 {

  2901     const char *s;

  2902     if (!t || !*t)

  2903 	return FALSE;

  2904     s=t;

  2905     while (g_utf8_get_char(t)=='m' && *t)

  2906 	t++;

  2907     if (g_utf8_get_char(t)=='d')

  2908 	t++;

  2909     if (g_str_has_prefix(t,"cm"))

  2910 	t+=2;

  2911     if (g_str_has_prefix(t,"cd"))

  2912 	t+=2;

  2913     while (g_utf8_get_char(t)=='c' && *t)

  2914 	t++;

  2915     if (g_str_has_prefix(t,"xl"))

  2916 	t+=2;

  2917     if (g_str_has_prefix(t,"xc"))

  2918 	t+=2;

  2919     if (g_utf8_get_char(t)=='l')

  2920 	t++;

  2921     while (g_utf8_get_char(t)=='x' && *t)

  2922 	t++;

  2923     if (g_str_has_prefix(t,"ix"))

  2924 	t+=2;

  2925     if (g_str_has_prefix(t,"iv"))

  2926 	t+=2;

  2927     if (g_utf8_get_char(t)=='v')

  2928 	t++;

  2929     while (g_utf8_get_char(t)=='i' && *t)

  2930 	t++;

  2931     return !*t;

  2932 }

  2934 /*

  2935  * postprocess_for_DP:

  2936  *

  2937  * Invoked with the -d switch from flgets().

  2938  * It simply "removes" from the line a hard-coded set of common

  2939  * DP-specific tags, so that the line passed to the main routine has

  2940  * been pre-cleaned of DP markup.

  2941  */

  2942 void postprocess_for_DP(char *theline)

  2943 {

  2944     char *s,*t;

  2945     int i;

  2946     if (!*theline)

  2947 	return;

  2948     for (i=0;*DPmarkup[i];i++)

  2949 	while ((s=strstr(theline,DPmarkup[i])))

  2950 	{

  2951 	    t=s+strlen(DPmarkup[i]);

  2952 	    memmove(s,t,strlen(t)+1);

  2953 	}

  2954 }

  2956 /*

  2957  * postprocess_for_HTML:

  2958  *

  2959  * Invoked with the -m switch from flgets().

  2960  * It simply "removes" from the line a hard-coded set of common

  2961  * HTML tags and "replaces" a hard-coded set of common HTML

  2962  * entities, so that the line passed to the main routine has

  2963  * been pre-cleaned of HTML.

  2964  */

  2965 void postprocess_for_HTML(char *theline)

  2966 {

  2967     while (losemarkup(theline))

  2968 	;

  2969     loseentities(theline);

  2970 }

  2972 char *losemarkup(char *theline)

  2973 {

  2974     char *s,*t;

  2975     int i;

  2976     s=strchr(theline,'<');

  2977     t=s?strchr(s,'>'):NULL;

  2978     if (!s || !t)

  2979 	return NULL;

  2980     for (i=0;*markup[i];i++)

  2981 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  2982 	{

  2983 	    t=g_utf8_next_char(t);

  2984 	    memmove(s,t,strlen(t)+1);

  2985 	    return s;

  2986 	}

  2987     /* It's an unrecognized <xxx>. */

  2988     return NULL;

  2989 }

  2991 void loseentities(char *theline)

  2992 {

  2993     int i;

  2994     gsize nb;

  2995     char *amp,*scolon;

  2996     gchar *s,*t;

  2997     gunichar c;

  2998     GTree *entities=NULL;

  2999     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3000     if (!theline)

  3001     {

  3002 	if (entities)

  3003 	    g_tree_destroy(entities);

  3004 	entities=NULL;

  3005 	if (translit!=(GIConv)-1)

  3006 	    g_iconv_close(translit);

  3007 	translit=(GIConv)-1;

  3008 	if (to_utf8!=(GIConv)-1)

  3009 	    g_iconv_close(to_utf8);

  3010 	to_utf8=(GIConv)-1;

  3011 	return;

  3012     }

  3013     if (!*theline)

  3014 	return;

  3015     if (!entities)

  3016     {

  3017 	entities=g_tree_new((GCompareFunc)strcmp);

  3018 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3019 	    g_tree_insert(entities,HTMLentities[i].name,

  3020 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3021     }

  3022     if (translit==(GIConv)-1)

  3023 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3024     if (to_utf8==(GIConv)-1)

  3025 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3026     while((amp=strchr(theline,'&')))

  3027     {

  3028 	scolon=strchr(amp,';');

  3029 	if (scolon)

  3030 	{

  3031 	    if (amp[1]=='#')

  3032 	    {

  3033 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3034 		    c=strtol(amp+2,NULL,10);

  3035 		else if (amp[2]=='x' &&

  3036 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3037 		    c=strtol(amp+3,NULL,16);

  3038 	    }

  3039 	    else

  3040 	    {

  3041 		s=g_strndup(amp+1,scolon-(amp+1));

  3042 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3043 		g_free(s);

  3044 	    }

  3045 	}

  3046 	else

  3047 	    c=0;

  3048 	if (c)

  3049 	{

  3050 	    theline=amp;

  3051 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3052 		theline+=g_unichar_to_utf8(c,theline);

  3053 	    else

  3054 	    {

  3055 		s=g_malloc(6);

  3056 		nb=g_unichar_to_utf8(c,s);

  3057 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3058 		g_free(s);

  3059 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3060 		g_free(t);

  3061 		memcpy(theline,s,nb);

  3062 		g_free(s);

  3063 		theline+=nb;

  3064 	    }

  3065 	    memmove(theline,g_utf8_next_char(scolon),

  3066 	      strlen(g_utf8_next_char(scolon))+1);

  3067 	}

  3068 	else

  3069 	    theline=g_utf8_next_char(amp);

  3070     }

  3071 }

  3073 gboolean tagcomp(const char *strin,const char *basetag)

  3074 {

  3075     gboolean retval;

  3076     gchar *s,*t;

  3077     if (g_utf8_get_char(strin)=='/')

  3078 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3079     else

  3080 	t=g_utf8_casefold(strin,-1);

  3081     s=g_utf8_casefold(basetag,-1);

  3082     retval=g_str_has_prefix(t,s);

  3083     g_free(s);

  3084     g_free(t);

  3085     return retval;

  3086 }

  3088 void proghelp(GOptionContext *context)

  3089 {

  3090     gchar *help;

  3091     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3092     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3093     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3094     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3095       "For details, read the file COPYING.\n",stderr);

  3096     fputs("This is Free Software; "

  3097       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3098     fputs("read the file COPYING for details.\n\n",stderr);

  3099     help=g_option_context_get_help(context,TRUE,NULL);

  3100     fputs(help,stderr);

  3101     g_free(help);

  3102     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3103     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3104       "non-ASCII\n",stderr);

  3105     fputs("characters like accented letters, "

  3106       "lines longer than 75 or shorter than 55,\n",stderr);

  3107     fputs("unbalanced quotes or brackets, "

  3108       "a variety of badly formatted punctuation, \n",stderr);

  3109     fputs("HTML tags, some likely typos. "

  3110       "It is NOT a substitute for human judgement.\n",stderr);

  3111     fputs("\n",stderr);

  3112 }

author	ali <ali@juiblex.co.uk>
	Thu Sep 26 07:05:47 2013 +0100 (2013-09-26)
changeset 116	24b0e5ecffe5
parent 111	f805130deb6f
parent 115	df21841a2b64
child 120	797e80d13543
permissions	-rw-r--r--