bookloupe-testing: bookloupe/bookloupe.c@1016349e619f

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #include <glib.h>

    26 #include <bl/bl.h>

/*
 * NOTE(review): presumably the text of the previously processed line; it is
 * not referenced in this part of the file -- confirm against its users.
 */
gchar *prevline;

    30 /* Common typos. */

    31 char *typo[] = {

    32     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    33     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    34     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    35     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    36     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    37     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    38     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    39     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    40     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    41     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    42     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    43     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    44     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    45     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    46     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    47     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    48     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    49     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    50     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    51     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    52     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    53     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    54     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    55     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    56     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    57     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    58     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    59     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    60     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    61     "se", ""

    62 };

/*
 * User-defined typos: created by read_user_scannos() with one key per
 * accepted line of the user typo file, and released in main(). Stays NULL
 * when no user typo file was loaded.
 */
GTree *usertypo;

    66 /* Common abbreviations and other OK words not to query as typos. */

    67 char *okword[] = {

    68     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    69     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    70     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    71     "outbid", "outbids", "frostbite", "frostbitten", ""

    72 };

    74 /* Common abbreviations that cause otherwise unexplained periods. */

    75 char *abbrev[] = {

    76     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    77     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    78 };

    80 /*

    81  * Two-Letter combinations that rarely if ever start words,

    82  * but are common scannos or otherwise common letter combinations.

    83  */

    84 char *nostart[] = {

    85     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    86 };

    88 /*

    89  * Two-Letter combinations that rarely if ever end words,

    90  * but are common scannos or otherwise common letter combinations.

    91  */

    92 char *noend[] = {

    93     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

    94     "sw", "gr", "sl", "cl", "iy", ""

    95 };

    97 char *markup[] = {

    98     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

    99     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   100     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   101     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   102 };

   104 char *DPmarkup[] = {

   105     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   106 };

   108 char *nocomma[] = {

   109     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   110     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   111     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   112     "during", "let", "toward", "among", ""

   113 };

   115 char *noperiod[] = {

   116     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   117     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   118     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   119     "among", "those", "into", "whom", "having", "thence", ""

   120 };

/*
 * Lower-case vowels, plain and accented.
 * NOTE(review): the rest of this file inspects text one byte at a time
 * (e.g. the unsigned char tests in first_pass), so the accented letters
 * presumably only match as intended when this source is compiled in a
 * single-byte encoding such as ISO-8859-1 -- confirm.
 */
char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";

   124 struct {

   125     char *htmlent;

   126     char *htmlnum;

   127     char *textent;

   128 } entities[] = {

   129     "&amp;",	"&#38;",     "&",

   130     "&lt;",	"&#60;",     "<",

   131     "&gt;",	"&#62;",     ">",

   132     "&deg;",	"&#176;",    " degrees",

   133     "&pound;",	"&#163;",    "L",

   134     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */

   135     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */

   136     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */

   137     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */

   138     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */

   139     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */

   140     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */

   141     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */

   142     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */

   143     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */

   144     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */

   145     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */

   146     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */

   147     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */

   148     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */

   149     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */

   150     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */

   151     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */

   152     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */

   153     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */

   154     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */

   155     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */

   156     "&cent;",	"&#162;",    "c", /* cent sign */

   157     "&pound;",	"&#163;",    "L", /* pound sign */

   158     "&curren;",	"&#164;",    "$", /* currency sign */

   159     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */

   160     "&sect;",	"&#167;",    "--", /* section sign */

   161     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */

   162     "&copy;",	"&#169;",    "(C) ", /* copyright sign */

   163     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */

   164     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */

   165     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */

   166     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */

   167     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */

   168     "&deg;",	"&#176;",    " degrees", /* degree sign */

   169     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */

   170     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */

   171     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */

   172     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */

   173     "&micro;",	"&#181;",    "m", /* micro sign */

   174     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */

   175     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */

   176     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */

   177     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */

   178     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */

   179     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */

   180     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */

   181     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */

   182     "&iquest;",	"&#191;",    "?", /* inverted question mark */

   183     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */

   184     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */

   185     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */

   186     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */

   187     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */

   188     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */

   189     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */

   190     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */

   191     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */

   192     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */

   193     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */

   194     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */

   195     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */

   196     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */

   197     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */

   198     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */

   199     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */

   200     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */

   201     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */

   202     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */

   203     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */

   204     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */

   205     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */

   206     "&times;",	"&#215;",    "*", /* multiplication sign */

   207     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */

   208     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */

   209     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */

   210     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */

   211     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */

   212     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */

   213     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */

   214     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */

   215     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */

   216     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */

   217     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */

   218     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */

   219     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */

   220     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */

   221     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */

   222     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */

   223     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */

   224     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */

   225     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */

   226     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */

   227     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */

   228     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */

   229     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */

   230     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */

   231     "&eth;",	"&#240;",    "eth", /* latin small letter eth */

   232     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */

   233     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */

   234     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */

   235     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */

   236     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */

   237     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */

   238     "&divide;",	"&#247;",    "/", /* division sign */

   239     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */

   240     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */

   241     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */

   242     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */

   243     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */

   244     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */

   245     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */

   246     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */

   247     "", ""

   248 };

   250 /* special characters */

   251 #define CHAR_SPACE	  32

   252 #define CHAR_TAB	   9

   253 #define CHAR_LF		  10

   254 #define CHAR_CR		  13

   255 #define CHAR_DQUOTE	  34

   256 #define CHAR_SQUOTE	  39

   257 #define CHAR_OPEN_SQUOTE  96

   258 #define CHAR_TILDE	 126

   259 #define CHAR_ASTERISK	  42

   260 #define CHAR_FORESLASH	  47

   261 #define CHAR_CARAT	  94

   263 #define CHAR_UNDERSCORE    '_'

   264 #define CHAR_OPEN_CBRACK   '{'

   265 #define CHAR_CLOSE_CBRACK  '}'

   266 #define CHAR_OPEN_RBRACK   '('

   267 #define CHAR_CLOSE_RBRACK  ')'

   268 #define CHAR_OPEN_SBRACK   '['

   269 #define CHAR_CLOSE_SBRACK  ']'

   271 /* longest and shortest normal PG line lengths */

   272 #define LONGEST_PG_LINE   75

   273 #define WAY_TOO_LONG      80

   274 #define SHORTEST_PG_LINE  55

   276 enum {

   277     ECHO_SWITCH,

   278     SQUOTE_SWITCH,

   279     TYPO_SWITCH,

   280     QPARA_SWITCH,

   281     PARANOID_SWITCH,

   282     LINE_END_SWITCH,

   283     OVERVIEW_SWITCH,

   284     STDOUT_SWITCH,

   285     HEADER_SWITCH,

   286     WEB_SWITCH,

   287     VERBOSE_SWITCH,

   288     MARKUP_SWITCH,

   289     USERTYPO_SWITCH,

   290     DP_SWITCH,

   291     SWITNO

   292 };

   294 gboolean pswit[SWITNO];  /* program switches */

   296 static GOptionEntry options[]={

   297     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   298       "Ignore DP-specific markup", NULL },

   299     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   300       "Don't echo queried line", NULL },

   301     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   302       "Check single quotes", NULL },

   303     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   304       "Check common typos", NULL },

   305     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   306       "Require closure of quotes on every paragraph", NULL },

   307     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   308       "Disable paranoid querying of everything", NULL },

   309     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   310       "Disable line end checking", NULL },

   311     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   312       "Overview: just show counts", NULL },

   313     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   314       "Output errors to stdout instead of stderr", NULL },

   315     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   316       "Echo header fields", NULL },

   317     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   318       "Ignore markup in < >", NULL },

   319     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   320       "Use file of user-defined typos", NULL },

   321     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   322       "Defaults for use on www upload", NULL },

   323     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   324       "Verbose - list everything", NULL },

   325     { NULL }

   326 };

   328 long cnt_dquot;		/* for overview mode, count of doublequote queries */

   329 long cnt_squot;		/* for overview mode, count of singlequote queries */

   330 long cnt_brack;		/* for overview mode, count of brackets queries */

   331 long cnt_bin;		/* for overview mode, count of non-ASCII queries */

   332 long cnt_odd;		/* for overview mode, count of odd character queries */

   333 long cnt_long;		/* for overview mode, count of long line errors */

   334 long cnt_short;		/* for overview mode, count of short line queries */

   335 long cnt_punct;		/* for overview mode,

   336 			   count of punctuation and spacing queries */

   337 long cnt_dash;		/* for overview mode, count of dash-related queries */

   338 long cnt_word;		/* for overview mode, count of word queries */

   339 long cnt_html;		/* for overview mode, count of html queries */

   340 long cnt_lineend;	/* for overview mode, count of line-end queries */

   341 long cnt_spacend;	/* count of lines with space at end */

   342 long linecnt;		/* count of total lines in the file */

   343 long checked_linecnt;	/* count of lines actually checked */

   345 void proghelp(GOptionContext *context);

   346 void procfile(const char *);

   348 gchar *running_from;

   350 int mixdigit(const char *);

   351 gchar *getaword(const char **);

   352 char *flgets(char **,long);

   353 gboolean gcisalpha(unsigned char);

   354 gboolean gcisdigit(unsigned char);

   355 gboolean gcisletter(unsigned char);

   356 void postprocess_for_HTML(char *);

   357 char *linehasmarkup(char *);

   358 char *losemarkup(char *);

   359 int tagcomp(const char *,const char *);

   360 char *loseentities(char *);

   361 gboolean isroman(const char *);

   362 void postprocess_for_DP(char *);

   364 GTree *qword,*qperiod;

   366 struct first_pass_results {

   367     long firstline,astline;

   368     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;

   369     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;

   370     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;

   371     int Dutchcount,Frenchcount;

   372 };

   374 struct warnings {

   375     int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;

   376     int endquote;

   377     gboolean isDutch,isFrench;

   378 };

   380 struct counters {

   381     long quot;

   382     int c_unders,c_brack,s_brack,r_brack;

   383     int open_single_quote,close_single_quote;

   384 };

   386 struct line_properties {

   387     unsigned int len,blen;

   388     char start;

   389 };

   391 struct parities {

   392     int dquote,squote;

   393 };

   395 struct pending {

   396     char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;

   397     long squot;

   398 };

   400 void parse_options(int *argc,char ***argv)

   401 {

   402     GError *err=NULL;

   403     GOptionContext *context;

   404     context=g_option_context_new(

   405       "file - looks for errors in Project Gutenberg(TM) etexts");

   406     g_option_context_add_main_entries(context,options,NULL);

   407     if (!g_option_context_parse(context,argc,argv,&err))

   408     {

   409 	g_printerr("Bookloupe: %s\n",err->message);

   410 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   411 	exit(1);

   412     }

   413     /* Paranoid checking is turned OFF, not on, by its switch */

   414     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   415     if (pswit[PARANOID_SWITCH])

   416 	/* if running in paranoid mode, typo checks default to enabled */

   417 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   418     /* Line-end checking is turned OFF, not on, by its switch */

   419     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];

   420     /* Echoing is turned OFF, not on, by its switch */

   421     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];

   422     if (pswit[OVERVIEW_SWITCH])

   423 	/* just print summary; don't echo */

   424 	pswit[ECHO_SWITCH]=FALSE;

   425     /*

   426      * Web uploads - for the moment, this is really just a placeholder

   427      * until we decide what processing we really want to do on web uploads

   428      */

   429     if (pswit[WEB_SWITCH])

   430     {

   431 	/* specific override for web uploads */

   432 	pswit[ECHO_SWITCH]=TRUE;

   433 	pswit[SQUOTE_SWITCH]=FALSE;

   434 	pswit[TYPO_SWITCH]=TRUE;

   435 	pswit[QPARA_SWITCH]=FALSE;

   436 	pswit[PARANOID_SWITCH]=TRUE;

   437 	pswit[LINE_END_SWITCH]=FALSE;

   438 	pswit[OVERVIEW_SWITCH]=FALSE;

   439 	pswit[STDOUT_SWITCH]=FALSE;

   440 	pswit[HEADER_SWITCH]=TRUE;

   441 	pswit[VERBOSE_SWITCH]=FALSE;

   442 	pswit[MARKUP_SWITCH]=FALSE;

   443 	pswit[USERTYPO_SWITCH]=FALSE;

   444 	pswit[DP_SWITCH]=FALSE;

   445     }

   446     if (*argc<2)

   447     {

   448 	proghelp(context);

   449 	exit(1);

   450     }

   451     g_option_context_free(context);

   452 }

   454 /*

   455  * read_user_scannos:

   456  *

   457  * Read in the user-defined stealth scanno list.

   458  */

   459 void read_user_scannos(void)

   460 {

   461     GError *err=NULL;

   462     gchar *usertypo_file;

   463     gboolean okay;

   464     int i;

   465     gsize len;

   466     gchar *contents,**lines;

   467     usertypo_file=g_strdup("bookloupe.typ");

   468     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   469     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   470     {

   471 	g_clear_error(&err);

   472 	g_free(usertypo_file);

   473 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   474 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   475     }

   476     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   477     {

   478 	g_clear_error(&err);

   479 	g_free(usertypo_file);

   480 	usertypo_file=g_strdup("gutcheck.typ");

   481 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   482     }

   483     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   484     {

   485 	g_clear_error(&err);

   486 	g_free(usertypo_file);

   487 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   488 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   489     }

   490     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   491     {

   492 	g_free(usertypo_file);

   493 	printf("   --> I couldn't find bookloupe.typ "

   494 	  "-- proceeding without user typos.\n");

   495 	return;

   496     }

   497     else if (!okay)

   498     {

   499 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   500 	g_free(usertypo_file);

   501 	g_clear_error(&err);

   502 	exit(1);

   503     }

   504     lines=g_strsplit(contents,"\n",0);

   505     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   506     for (i=0;lines[i];i++)

   507 	if (*(unsigned char *)lines[i]>'!')

   508 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   509 	else

   510 	    g_free(lines[i]);

   511     g_free(lines);

   512 }

   514 #if 0

   515 /*

   516  * read_etext:

   517  *

   518  * Read an etext returning an array of lines. Lines are normally expected

   519  * to be terminated by CR LF. Solitary LFs delimit lines but are left

   520  * embedded at the end of the line for further processing. Solitary CRs

   521  * do not delimit lines.

   522  */

   523 gchar **read_etext(const char *filename,GError **err)

   524 {

   525     int i;

   526     const char *s,*t;

   527     gchar *contents;

   528     gchar **raw_lines;

   529     GPtrArray *lines;

   530     gsize len;

   531     if (!g_file_get_contents(filename,&contents,&len,err))

   532 	return NULL;

   533     raw_lines=g_strsplit(contents,"\r\n",0);

   534     lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);

   535     for (i=0;raw_lines[i];i++)

   536     {

   537 	t=strchr(raw_lines[i],'\n');

   538 	if (t)

   539 	{

   540 	    s=raw_lines[i];

   541 	    while ((t=strchr(s,'\n')))

   542 	    {

   543 		g_ptr_array_add(lines,g_strndup(s,t-s+1));

   544 		s=t+1;

   545 	    }

   546 	    g_ptr_array_add(lines,g_strdup(s));

   547 	    g_free(raw_lines[i]);

   548 	}

   549 	else

   550 	    g_ptr_array_add(lines,raw_lines[i]);

   551     }

   552     g_free(raw_lines);

   553     g_ptr_array_add(lines,NULL);

   554     return (gchar **)g_ptr_array_free(lines,FALSE);

   555 }

   556 #else

   557 /*

   558  * read_etext:

   559  *

   560  * Read an etext returning a newly allocated string containing the file

   561  * contents or NULL on error.

   562  */

   563 gchar *read_etext(const char *filename,GError **err)

   564 {

   565     gchar *contents;

   566     gsize len;

   567     if (!g_file_get_contents(filename,&contents,&len,err))

   568 	return NULL;

   569     return contents;

   570 }

   571 #endif

   573 int main(int argc,char **argv)

   574 {

   575     running_from=g_path_get_dirname(argv[0]);

   576     parse_options(&argc,&argv);

   577     if (pswit[USERTYPO_SWITCH])

   578 	read_user_scannos();

   579     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   580     procfile(argv[1]);

   581     if (pswit[OVERVIEW_SWITCH])

   582     {

   583 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   584 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   585 	printf("    --------------- Queries found --------------\n");

   586 	if (cnt_long)

   587 	    printf("    Long lines:		    %14ld\n",cnt_long);

   588 	if (cnt_short)

   589 	    printf("    Short lines:		   %14ld\n",cnt_short);

   590 	if (cnt_lineend)

   591 	    printf("    Line-end problems:	     %14ld\n",cnt_lineend);

   592 	if (cnt_word)

   593 	    printf("    Common typos:		  %14ld\n",cnt_word);

   594 	if (cnt_dquot)

   595 	    printf("    Unmatched quotes:	      %14ld\n",cnt_dquot);

   596 	if (cnt_squot)

   597 	    printf("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);

   598 	if (cnt_brack)

   599 	    printf("    Unmatched brackets:	    %14ld\n",cnt_brack);

   600 	if (cnt_bin)

   601 	    printf("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   602 	if (cnt_odd)

   603 	    printf("    Proofing characters:	   %14ld\n",cnt_odd);

   604 	if (cnt_punct)

   605 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   606 	if (cnt_dash)

   607 	    printf("    Non-standard dashes:	   %14ld\n",cnt_dash);

   608 	if (cnt_html)

   609 	    printf("    Possible HTML tags:	    %14ld\n",cnt_html);

   610 	printf("\n");

   611 	printf("    TOTAL QUERIES		  %14ld\n",

   612 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   613 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   614     }

   615     g_free(running_from);

   616     if (usertypo)

   617 	g_tree_unref(usertypo);

   618     return 0;

   619 }

   621 /*

   622  * first_pass:

   623  *

   624  * Run a first pass - verify that it's a valid PG

   625  * file, decide whether to report some things that

   626  * occur many times in the text like long or short

   627  * lines, non-standard dashes, etc.

   628  */

   629 struct first_pass_results *first_pass(const char *etext)

   630 {

   631     char laststart=CHAR_SPACE;

   632     const char *s;

   633     gchar *lc_line;

   634     int i,j,llen;

   635     gchar **lines;

   636     unsigned int lastlen=0,lastblen=0;

   637     long spline=0,nspline=0;

   638     static struct first_pass_results results={0};

   639     gchar *inword;

   640     lines=g_strsplit(etext,"\n",0);

   641     for (j=0;lines[j];j++)

   642     {

   643 	llen=strlen(lines[j]);

   644 	while(lines[j][llen-1]=='\r')

   645 	    lines[j][llen--]='\0';

   646 	linecnt++;

   647 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&

   648 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))

   649 	{

   650 	    if (spline)

   651 		printf("   --> Duplicate header?\n");

   652 	    spline=linecnt+1;   /* first line of non-header text, that is */

   653 	}

   654 	if (!strncmp(lines[j],"*** START",9) &&

   655 	  strstr(lines[j],"PROJECT GUTENBERG"))

   656 	{

   657 	    if (nspline)

   658 		printf("   --> Duplicate header?\n");

   659 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   660 	}

   661 	if (spline || nspline)

   662 	{

   663 	    lc_line=g_ascii_strdown(lines[j],llen);

   664 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))

   665 	    {

   666 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))

   667 		{

   668 		    if (results.footerline)

   669 		    {

   670 			/* it's an old-form header - we can detect duplicates */

   671 			if (!nspline)

   672 			    printf("   --> Duplicate footer?\n");

   673 		    }

   674 		    else

   675 			results.footerline=linecnt;

   676 		}

   677 	    }

   678 	    g_free(lc_line);

   679 	}

   680 	if (spline)

   681 	    results.firstline=spline;

   682 	if (nspline)

   683 	    results.firstline=nspline;  /* override with new */

   684 	if (results.footerline)

   685 	    continue;    /* don't count the boilerplate in the footer */

   686 	results.totlen+=llen;

   687 	for (i=0;i<llen;i++)

   688 	{

   689 	    if ((unsigned char)lines[j][i]>127)

   690 		results.binlen++;

   691 	    if (gcisalpha(lines[j][i]))

   692 		results.alphalen++;

   693 	    if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))

   694 		results.endquote_count++;

   695 	}

   696 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&

   697 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   698 	    results.shortline++;

   699 	if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)

   700 	    cnt_spacend++;

   701 	if (strstr(lines[j],".,"))

   702 	    results.dotcomma++;

   703 	/* only count ast lines for ignoring purposes where there is */

   704 	/* locase text on the line */

   705 	if (strchr(lines[j],'*'))

   706 	{

   707 	    for (s=lines[j];*s;s++)

   708 		if (*s>='a' && *s<='z')

   709 		    break;

   710 	     if (*s)

   711 		results.astline++;

   712 	}

   713 	if (strchr(lines[j],'/'))

   714 	    results.fslashline++;

   715 	for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)

   716 	    ;

   717 	if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')

   718 	    results.hyphens++;

   719 	if (llen>LONGEST_PG_LINE)

   720 	    results.longline++;

   721 	if (llen>WAY_TOO_LONG)

   722 	    results.verylongline++;

   723 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))

   724 	{

   725 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);

   726 	    if (i>0)

   727 		results.htmcount++;

   728 	    if (strstr(lines[j],"<i>"))

   729 		results.htmcount+=4; /* bonus marks! */

   730 	}

   731 	/* Check for spaced em-dashes */

   732 	if (lines[j][0] && (s=strstr(lines[j]+1,"--")))

   733 	{

   734 	    results.emdash++;

   735 	    if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))

   736 		results.space_emdash++;

   737 	    if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))

   738 		/* count of em-dashes with spaces both sides */

   739 		results.non_PG_space_emdash++;

   740 	    if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))

   741 		/* count of PG-type em-dashes with no spaces */

   742 		results.PG_space_emdash++;

   743 	}

   744 	for (s=lines[j];*s;)

   745 	{

   746 	    inword=getaword(&s);

   747 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   748 		results.Dutchcount++;

   749 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   750 		results.Frenchcount++;

   751 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   752 		results.standalone_digit++;

   753 	    g_free(inword);

   754 	}

   755 	/* Check for spaced dashes */

   756 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')

   757 	    results.spacedash++;

   758 	lastblen=lastlen;

   759 	lastlen=llen;

   760 	laststart=lines[j][0];

   761     }

   762     g_strfreev(lines);

   763     return &results;

   764 }

   766 /*

   767  * report_first_pass:

   768  *

   769  * Make some snap decisions based on the first pass results.

   770  */

   771 struct warnings *report_first_pass(struct first_pass_results *results)

   772 {

   773     static struct warnings warnings={0};

   774     if (cnt_spacend>0)

   775 	printf("   --> %ld lines in this file have white space at end\n",

   776 	  cnt_spacend);

   777     warnings.dotcomma=1;

   778     if (results->dotcomma>5)

   779     {

   780 	warnings.dotcomma=0;

   781 	printf("   --> %ld lines in this file contain '.,'. "

   782 	  "Not reporting them.\n",results->dotcomma);

   783     }

   784     /*

   785      * If more than 50 lines, or one-tenth, are short,

   786      * don't bother reporting them.

   787      */

   788     warnings.shortline=1;

   789     if (results->shortline>50 || results->shortline*10>linecnt)

   790     {

   791 	warnings.shortline=0;

   792 	printf("   --> %ld lines in this file are short. "

   793 	  "Not reporting short lines.\n",results->shortline);

   794     }

   795     /*

   796      * If more than 50 lines, or one-tenth, are long,

   797      * don't bother reporting them.

   798      */

   799     warnings.longline=1;

   800     if (results->longline>50 || results->longline*10>linecnt)

   801     {

   802 	warnings.longline=0;

   803 	printf("   --> %ld lines in this file are long. "

   804 	  "Not reporting long lines.\n",results->longline);

   805     }

   806     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   807     warnings.ast=1;

   808     if (results->astline>10)

   809     {

   810 	warnings.ast=0;

   811 	printf("   --> %ld lines in this file contain asterisks. "

   812 	  "Not reporting them.\n",results->astline);

   813     }

   814     /*

   815      * If more than 10 lines contain forward slashes,

   816      * don't bother reporting them.

   817      */

   818     warnings.fslash=1;

   819     if (results->fslashline>10)

   820     {

   821 	warnings.fslash=0;

   822 	printf("   --> %ld lines in this file contain forward slashes. "

   823 	  "Not reporting them.\n",results->fslashline);

   824     }

   825     /*

   826      * If more than 20 lines contain unpunctuated endquotes,

   827      * don't bother reporting them.

   828      */

   829     warnings.endquote=1;

   830     if (results->endquote_count>20)

   831     {

   832 	warnings.endquote=0;

   833 	printf("   --> %ld lines in this file contain unpunctuated endquotes. "

   834 	  "Not reporting them.\n",results->endquote_count);

   835     }

   836     /*

   837      * If more than 15 lines contain standalone digits,

   838      * don't bother reporting them.

   839      */

   840     warnings.digit=1;

   841     if (results->standalone_digit>10)

   842     {

   843 	warnings.digit=0;

   844 	printf("   --> %ld lines in this file contain standalone 0s and 1s. "

   845 	  "Not reporting them.\n",results->standalone_digit);

   846     }

   847     /*

   848      * If more than 20 lines contain hyphens at end,

   849      * don't bother reporting them.

   850      */

   851     warnings.hyphen=1;

   852     if (results->hyphens>20)

   853     {

   854 	warnings.hyphen=0;

   855 	printf("   --> %ld lines in this file have hyphens at end. "

   856 	  "Not reporting them.\n",results->hyphens);

   857     }

   858     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   859     {

   860 	printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   861 	pswit[MARKUP_SWITCH]=1;

   862     }

   863     if (results->verylongline>0)

   864 	printf("   --> %ld lines in this file are VERY long!\n",

   865 	  results->verylongline);

   866     /*

   867      * If there are more non-PG spaced dashes than PG em-dashes,

   868      * assume it's deliberate.

   869      * Current PG guidelines say don't use them, but older texts do,

   870      * and some people insist on them whatever the guidelines say.

   871      */

   872     warnings.dash=1;

   873     if (results->spacedash+results->non_PG_space_emdash>

   874       results->PG_space_emdash)

   875     {

   876 	warnings.dash=0;

   877 	printf("   --> There are %ld spaced dashes and em-dashes. "

   878 	  "Not reporting them.\n",

   879 	  results->spacedash+results->non_PG_space_emdash);

   880     }

   881     /* If more than a quarter of characters are hi-bit, bug out. */

   882     warnings.bin=1;

   883     if (results->binlen*4>results->totlen)

   884     {

   885 	printf("   --> This file does not appear to be ASCII. "

   886 	  "Terminating. Best of luck with it!\n");

   887 	exit(1);

   888     }

   889     if (results->alphalen*4<results->totlen)

   890     {

   891 	printf("   --> This file does not appear to be text. "

   892 	  "Terminating. Best of luck with it!\n");

   893 	exit(1);

   894     }

   895     if (results->binlen*100>results->totlen || results->binlen>100)

   896     {

   897 	printf("   --> There are a lot of foreign letters here. "

   898 	  "Not reporting them.\n");

   899 	warnings.bin=0;

   900     }

   901     warnings.isDutch=FALSE;

   902     if (results->Dutchcount>50)

   903     {

   904 	warnings.isDutch=TRUE;

   905 	printf("   --> This looks like Dutch - "

   906 	  "switching off dashes and warnings for 's Middags case.\n");

   907     }

   908     warnings.isFrench=FALSE;

   909     if (results->Frenchcount>50)

   910     {

   911 	warnings.isFrench=TRUE;

   912 	printf("   --> This looks like French - "

   913 	  "switching off some doublepunct.\n");

   914     }

   915     if (results->firstline && results->footerline)

   916 	printf("    The PG header and footer appear to be already on.\n");

   917     else

   918     {

   919 	if (results->firstline)

   920 	    printf("    The PG header is on - no footer.\n");

   921 	if (results->footerline)

   922 	    printf("    The PG footer is on - no header.\n");

   923     }

   924     printf("\n");

   925     if (pswit[VERBOSE_SWITCH])

   926     {

   927 	warnings.bin=1;

   928 	warnings.shortline=1;

   929 	warnings.dotcomma=1;

   930 	warnings.longline=1;

   931 	warnings.dash=1;

   932 	warnings.digit=1;

   933 	warnings.ast=1;

   934 	warnings.fslash=1;

   935 	warnings.hyphen=1;

   936 	warnings.endquote=1;

   937 	printf("   *** Verbose output is ON -- you asked for it! ***\n");

   938     }

   939     if (warnings.isDutch)

   940 	warnings.dash=0;

   941     if (results->footerline>0 && results->firstline>0 &&

   942       results->footerline>results->firstline &&

   943       results->footerline-results->firstline<100)

   944     {

   945 	printf("   --> I don't really know where this text starts. \n");

   946 	printf("       There are no reference points.\n");

   947 	printf("       I'm going to have to report the header and footer "

   948 	  "as well.\n");

   949 	results->firstline=0;

   950     }

   951     return &warnings;

   952 }

   954 /*

   955  * analyse_quotes:

   956  *

   957  * Look along the line, accumulate the count of quotes, and see

   958  * if this is an empty line - i.e. a line with nothing on it

   959  * but spaces.

   960  * If line has just spaces, period, * and/or - on it, don't

   961  * count it, since empty lines with asterisks or dashes to

   962  * separate sections are common.

   963  *

   964  * Returns: TRUE if the line is empty.

   965  */

   966 gboolean analyse_quotes(const char *aline,struct counters *counters)

   967 {

   968     int guessquote=0;

   969     /* assume the line is empty until proven otherwise */

   970     gboolean isemptyline=TRUE;

   971     const char *s=aline;

   972     while (*s)

   973     {

   974 	if (*s==CHAR_DQUOTE)

   975 	    counters->quot++;

   976 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)

   977 	{

   978 	    if (s==aline)

   979 	    {

   980 		/*

   981 		 * At start of line, it can only be an openquote.

   982 		 * Hardcode a very common exception!

   983 		 */

   984 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))

   985 		    counters->open_single_quote++;

   986 	    }

   987 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))

   988 		/* Do nothing! it's definitely an apostrophe, not a quote */

   989 		;

   990 	    /* it's outside a word - let's check it out */

   991 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))

   992 	    {

   993 		/* it damwell better BE an openquote */

   994 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))

   995 		    /* hardcode a very common exception! */

   996 		    counters->open_single_quote++;

   997 	    }

   998 	    else

   999 	    {

  1000 		/* now - is it a closequote? */

  1001 		guessquote=0;   /* accumulate clues */

  1002 		if (gcisalpha(s[-1]))

  1003 		{

  1004 		    /* it follows a letter - could be either */

  1005 		    guessquote++;

  1006 		    if (s[-1]=='s')

  1007 		    {

  1008 			/* looks like a plural apostrophe */

  1009 			guessquote-=3;

  1010 			if (s[1]==CHAR_SPACE)  /* bonus marks! */

  1011 			    guessquote-=2;

  1012 		    }

  1013 		}

  1014 		/* it doesn't have a letter either side */

  1015 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))

  1016 		    guessquote+=8; /* looks like a closequote */

  1017 		else

  1018 		    guessquote++;

  1019 		if (counters->open_single_quote>counters->close_single_quote)

  1020 		    /*

  1021 		     * Give it the benefit of some doubt,

  1022 		     * if a squote is already open.

  1023 		     */

  1024 		    guessquote++;

  1025 		else

  1026 		    guessquote--;

  1027 		if (guessquote>=0)

  1028 		    counters->close_single_quote++;

  1029 	    }

  1030 	}

  1031 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&

  1032 	  *s!=13 && *s!=10)

  1033 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

  1034 	if (*s==CHAR_UNDERSCORE)

  1035 	    counters->c_unders++;

  1036 	if (*s==CHAR_OPEN_CBRACK)

  1037 	    counters->c_brack++;

  1038 	if (*s==CHAR_CLOSE_CBRACK)

  1039 	    counters->c_brack--;

  1040 	if (*s==CHAR_OPEN_RBRACK)

  1041 	    counters->r_brack++;

  1042 	if (*s==CHAR_CLOSE_RBRACK)

  1043 	    counters->r_brack--;

  1044 	if (*s==CHAR_OPEN_SBRACK)

  1045 	    counters->s_brack++;

  1046 	if (*s==CHAR_CLOSE_SBRACK)

  1047 	    counters->s_brack--;

  1048 	s++;

  1049     }

  1050     return isemptyline;

  1051 }

  1053 /*

  1054  * check_for_control_characters:

  1055  *

  1056  * Check for invalid or questionable characters in the line

  1057  * Anything above 127 is invalid for plain ASCII, and

  1058  * non-printable control characters should also be flagged.

  1059  * Tabs should generally not be there.

  1060  */

  1061 void check_for_control_characters(const char *aline)

  1062 {

  1063     unsigned char c;

  1064     const char *s;

  1065     for (s=aline;*s;s++)

  1066     {

  1067 	c=*(unsigned char *)s;

  1068 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

  1069 	{

  1070 	    if (pswit[ECHO_SWITCH])

  1071 		printf("\n%s\n",aline);

  1072 	    if (!pswit[OVERVIEW_SWITCH])

  1073 		printf("    Line %ld column %d - Control character %d\n",

  1074 		  linecnt,(int)(s-aline)+1,c);

  1075 	    else

  1076 		cnt_bin++;

  1077 	}

  1078     }

  1079 }

  1081 /*

  1082  * check_for_odd_characters:

  1083  *

  1084  * Check for binary and other odd characters.

  1085  */

  1086 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1087   gboolean isemptyline)

  1088 {

  1089     /* Don't repeat multiple warnings on one line. */

  1090     int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;

  1091     const char *s;

  1092     unsigned char c;

  1093     for (s=aline;*s;s++)

  1094     {

  1095 	c=*(unsigned char *)s;

  1096 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))

  1097 	{

  1098 	    if (pswit[ECHO_SWITCH])

  1099 		printf("\n%s\n",aline);

  1100 	    if (!pswit[OVERVIEW_SWITCH])

  1101 		if (c>127 && c<160)

  1102 		    printf("    Line %ld column %d - "

  1103 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);

  1104 		else

  1105 		    printf("    Line %ld column %d - Non-ASCII character %d\n",

  1106 		      linecnt,(int)(s-aline)+1,c);

  1107 	    else

  1108 		cnt_bin++;

  1109 	    eNon_A=1;

  1110 	}

  1111 	if (!eTab && *s==CHAR_TAB)

  1112 	{

  1113 	    if (pswit[ECHO_SWITCH])

  1114 		printf("\n%s\n",aline);

  1115 	    if (!pswit[OVERVIEW_SWITCH])

  1116 		printf("    Line %ld column %d - Tab character?\n",

  1117 		  linecnt,(int)(s-aline)+1);

  1118 	    else

  1119 		cnt_odd++;

  1120 	    eTab=1;

  1121 	}

  1122 	if (!eTilde && *s==CHAR_TILDE)

  1123 	{

  1124 	    /*

  1125 	     * Often used by OCR software to indicate an

  1126 	     * unrecognizable character.

  1127 	     */

  1128 	    if (pswit[ECHO_SWITCH])

  1129 		printf("\n%s\n",aline);

  1130 	    if (!pswit[OVERVIEW_SWITCH])

  1131 		printf("    Line %ld column %d - Tilde character?\n",

  1132 		  linecnt,(int)(s-aline)+1);

  1133 	    else

  1134 		cnt_odd++;

  1135 	    eTilde=1;

  1136 	}

  1137 	if (!eCarat && *s==CHAR_CARAT)

  1138 	{

  1139 	    if (pswit[ECHO_SWITCH])

  1140 		printf("\n%s\n",aline);

  1141 	    if (!pswit[OVERVIEW_SWITCH])

  1142 		printf("    Line %ld column %d - Carat character?\n",

  1143 		  linecnt,(int)(s-aline)+1);

  1144 	    else

  1145 		cnt_odd++;

  1146 	    eCarat=1;

  1147 	}

  1148 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)

  1149 	{

  1150 	    if (pswit[ECHO_SWITCH])

  1151 		printf("\n%s\n",aline);

  1152 	    if (!pswit[OVERVIEW_SWITCH])

  1153 		printf("    Line %ld column %d - Forward slash?\n",

  1154 		  linecnt,(int)(s-aline)+1);

  1155 	    else

  1156 		cnt_odd++;

  1157 	    eFSlash=1;

  1158 	}

  1159 	/*

  1160 	 * Report asterisks only in paranoid mode,

  1161 	 * since they're often deliberate.

  1162 	 */

  1163 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1164 	  *s==CHAR_ASTERISK)

  1165 	{

  1166 	    if (pswit[ECHO_SWITCH])

  1167 		printf("\n%s\n",aline);

  1168 	    if (!pswit[OVERVIEW_SWITCH])

  1169 		printf("    Line %ld column %d - Asterisk?\n",

  1170 		  linecnt,(int)(s-aline)+1);

  1171 	    else

  1172 		cnt_odd++;

  1173 	    eAst=1;

  1174 	}

  1175     }

  1176 }

  1178 /*

  1179  * check_for_long_line:

  1180  *

  1181  * Check for line too long.

  1182  */

  1183 void check_for_long_line(const char *aline)

  1184 {

  1185     if (strlen(aline)>LONGEST_PG_LINE)

  1186     {

  1187 	if (pswit[ECHO_SWITCH])

  1188 	    printf("\n%s\n",aline);

  1189 	if (!pswit[OVERVIEW_SWITCH])

  1190 	    printf("    Line %ld column %d - Long line %d\n",

  1191 	      linecnt,(int)strlen(aline),(int)strlen(aline));

  1192 	else

  1193 	    cnt_long++;

  1194     }

  1195 }

  1197 /*

  1198  * check_for_short_line:

  1199  *

  1200  * Check for line too short.

  1201  *

  1202  * This one is a bit trickier to implement: we don't want to

  1203  * flag the last line of a paragraph for being short, so we

  1204  * have to wait until we know that our current line is a

  1205  * "normal" line, then report the _previous_ line if it was too

  1206  * short. We also don't want to report indented lines like

  1207  * chapter heads or formatted quotations. We therefore keep

  1208  * last->len as the length of the last line examined, and

  1209  * last->blen as the length of the last but one, and try to

  1210  * suppress unnecessary warnings by checking that both were of

  1211  * "normal" length. We keep the first character of the last

  1212  * line in last->start, and if it was a space, we assume that

  1213  * the formatting is deliberate. I can't figure out a way to

  1214  * distinguish something like a quoted verse left-aligned or

  1215  * the header or footer of a letter from a paragraph of short

  1216  * lines - maybe if I examined the whole paragraph, and if the

  1217  * para has less than, say, 8 lines and if all lines are short,

  1218  * then just assume it's OK? Need to look at some texts to see

  1219  * how often a formula like this would get the right result.

  1220  */

  1221 void check_for_short_line(const char *aline,const struct line_properties *last)

  1222 {

  1223     if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&

  1224       last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1225     {

  1226 	if (pswit[ECHO_SWITCH])

  1227 	    printf("\n%s\n",prevline);

  1228 	if (!pswit[OVERVIEW_SWITCH])

  1229 	    printf("    Line %ld column %d - Short line %d?\n",

  1230 	      linecnt-1,(int)strlen(prevline),(int)strlen(prevline));

  1231 	else

  1232 	    cnt_short++;

  1233     }

  1234 }

  1236 /*

  1237  * check_for_starting_punctuation:

  1238  *

  1239  * Look for punctuation other than full ellipses at start of line.

  1240  */

  1241 void check_for_starting_punctuation(const char *aline)

  1242 {

  1243     if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))

  1244     {

  1245 	if (pswit[ECHO_SWITCH])

  1246 	    printf("\n%s\n",aline);

  1247 	if (!pswit[OVERVIEW_SWITCH])

  1248 	    printf("    Line %ld column 1 - Begins with punctuation?\n",

  1249 	      linecnt);

  1250 	else

  1251 	    cnt_punct++;

  1252     }

  1253 }

  1255 /*

  1256  * check_for_spaced_emdash:

  1257  *

  1258  * Check for spaced em-dashes.

  1259  *

  1260  * We must check _all_ occurrences of "--" on the line

  1261  * hence the loop - even if the first double-dash is OK

  1262  * there may be another that's wrong later on.

  1263  */

  1264 void check_for_spaced_emdash(const char *aline)

  1265 {

  1266     const char *s,*t;

  1267     s=aline;

  1268     while ((t=strstr(s,"--")))

  1269     {

  1270 	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)

  1271 	{

  1272 	    if (pswit[ECHO_SWITCH])

  1273 		printf("\n%s\n",aline);

  1274 	    if (!pswit[OVERVIEW_SWITCH])

  1275 		printf("    Line %ld column %d - Spaced em-dash?\n",

  1276 		  linecnt,(int)(t-aline)+1);

  1277 	    else

  1278 		cnt_dash++;

  1279 	}

  1280 	s=t+2;

  1281     }

  1282 }

  1284 /*

  1285  * check_for_spaced_dash:

  1286  *

  1287  * Check for spaced dashes.

  1288  */

  1289 void check_for_spaced_dash(const char *aline)

  1290 {

  1291     const char *s;

  1292     if ((s=strstr(aline," -")))

  1293     {

  1294 	if (s[2]!='-')

  1295 	{

  1296 	    if (pswit[ECHO_SWITCH])

  1297 		printf("\n%s\n",aline);

  1298 	    if (!pswit[OVERVIEW_SWITCH])

  1299 		printf("    Line %ld column %d - Spaced dash?\n",

  1300 		  linecnt,(int)(s-aline)+1);

  1301 	    else

  1302 		cnt_dash++;

  1303 	}

  1304     }

  1305     else if ((s=strstr(aline,"- ")))

  1306     {

  1307 	if (s==aline || s[-1]!='-')

  1308 	{

  1309 	    if (pswit[ECHO_SWITCH])

  1310 		printf("\n%s\n",aline);

  1311 	    if (!pswit[OVERVIEW_SWITCH])

  1312 		printf("    Line %ld column %d - Spaced dash?\n",

  1313 		  linecnt,(int)(s-aline)+1);

  1314 	    else

  1315 		cnt_dash++;

  1316 	}

  1317     }

  1318 }

  1320 /*

  1321  * check_for_unmarked_paragraphs:

  1322  *

  1323  * Check for unmarked paragraphs indicated by separate speakers.

  1324  *

  1325  * May well be false positive:

  1326  * "Bravo!" "Wonderful!" called the crowd.

  1327  * but useful all the same.

  1328  */

  1329 void check_for_unmarked_paragraphs(const char *aline)

  1330 {

  1331     const char *s;

  1332     s=strstr(aline,"\"  \"");

  1333     if (!s)

  1334 	s=strstr(aline,"\" \"");

  1335     if (s)

  1336     {

  1337 	if (pswit[ECHO_SWITCH])

  1338 	    printf("\n%s\n",aline);

  1339 	if (!pswit[OVERVIEW_SWITCH])

  1340 	    printf("    Line %ld column %d - Query missing paragraph break?\n",

  1341 	      linecnt,(int)(s-aline)+1);

  1342 	else

  1343 	    cnt_punct++;

  1344     }

  1345 }

  1347 /*

  1348  * check_for_jeebies:

  1349  *

  1350  * Check for "to he" and other easy h/b errors.

  1351  *

  1352  * This is a very inadequate effort on the h/b problem,

  1353  * but the phrase "to he" is always an error, whereas "to

  1354  * be" is quite common.

  1355  * Similarly, '"Quiet!", be said.' is a non-be error

  1356  * "to he" is _not_ always an error!:

  1357  *       "Where they went to he couldn't say."

  1358  * Another false positive:

  1359  *       What would "Cinderella" be without the . . .

  1360  * and another: "If he wants to he can see for himself."

  1361  */

  1362 void check_for_jeebies(const char *aline)

  1363 {

  1364     const char *s;

  1365     s=strstr(aline," be could ");

  1366     if (!s)

  1367 	s=strstr(aline," be would ");

  1368     if (!s)

  1369 	s=strstr(aline," was be ");

  1370     if (!s)

  1371 	s=strstr(aline," be is ");

  1372     if (!s)

  1373 	s=strstr(aline," is be ");

  1374     if (!s)

  1375 	s=strstr(aline,"\", be ");

  1376     if (!s)

  1377 	s=strstr(aline,"\" be ");

  1378     if (!s)

  1379 	s=strstr(aline,"\" be ");

  1380     if (!s)

  1381 	s=strstr(aline," to he ");

  1382     if (s)

  1383     {

  1384 	if (pswit[ECHO_SWITCH])

  1385 	    printf("\n%s\n",aline);

  1386 	if (!pswit[OVERVIEW_SWITCH])

  1387 	    printf("    Line %ld column %d - Query he/be error?\n",

  1388 	      linecnt,(int)(s-aline)+1);

  1389 	else

  1390 	    cnt_word++;

  1391     }

  1392     s=strstr(aline," the had ");

  1393     if (!s)

  1394 	s=strstr(aline," a had ");

  1395     if (!s)

  1396 	s=strstr(aline," they bad ");

  1397     if (!s)

  1398 	s=strstr(aline," she bad ");

  1399     if (!s)

  1400 	s=strstr(aline," he bad ");

  1401     if (!s)

  1402 	s=strstr(aline," you bad ");

  1403     if (!s)

  1404 	s=strstr(aline," i bad ");

  1405     if (s)

  1406     {

  1407 	if (pswit[ECHO_SWITCH])

  1408 	    printf("\n%s\n",aline);

  1409 	if (!pswit[OVERVIEW_SWITCH])

  1410 	    printf("    Line %ld column %d - Query had/bad error?\n",

  1411 	      linecnt,(int)(s-aline)+1);

  1412 	else

  1413 	    cnt_word++;

  1414     }

  1415     s=strstr(aline,"; hut ");

  1416     if (!s)

  1417 	s=strstr(aline,", hut ");

  1418     if (s)

  1419     {

  1420 	if (pswit[ECHO_SWITCH])

  1421 	    printf("\n%s\n",aline);

  1422 	if (!pswit[OVERVIEW_SWITCH])

  1423 	    printf("    Line %ld column %d - Query hut/but error?\n",

  1424 	      linecnt,(int)(s-aline)+1);

  1425 	else

  1426 	    cnt_word++;

  1427     }

  1428 }

  1430 /*

  1431  * check_for_mta_from:

  1432  *

  1433  * Special case - angled bracket in front of "From" placed there by an

  1434  * MTA when sending an e-mail.

  1435  */

  1436 void check_for_mta_from(const char *aline)

  1437 {

  1438     const char *s;

  1439     s=strstr(aline,">From");

  1440     if (s)

  1441     {

  1442 	if (pswit[ECHO_SWITCH])

  1443 	    printf("\n%s\n",aline);

  1444 	if (!pswit[OVERVIEW_SWITCH])

  1445 	    printf("    Line %ld column %d - Query angled bracket with From\n",

  1446 	      linecnt,(int)(s-aline)+1);

  1447 	else

  1448 	    cnt_punct++;

  1449     }

  1450 }

  1452 /*

  1453  * check_for_orphan_character:

  1454  *

  1455  * Check for a single character line -

  1456  * often an overflow from bad wrapping.

  1457  */

  1458 void check_for_orphan_character(const char *aline)

  1459 {

  1460     if (*aline && !aline[1])

  1461     {

  1462 	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||

  1463 	  gcisdigit(*aline))

  1464 	    ; /* Nothing - ignore numerals alone on a line. */

  1465 	else

  1466 	{

  1467 	    if (pswit[ECHO_SWITCH])

  1468 		printf("\n%s\n",aline);

  1469 	    if (!pswit[OVERVIEW_SWITCH])

  1470 		printf("    Line %ld column 1 - Query single character line\n",

  1471 		  linecnt);

  1472 	    else

  1473 		cnt_punct++;

  1474 	}

  1475     }

  1476 }

  1478 /*

  1479  * check_for_pling_scanno:

  1480  *

  1481  * Check for I" - often should be !

  1482  */

  1483 void check_for_pling_scanno(const char *aline)

  1484 {

  1485     const char *s;

  1486     s=strstr(aline," I\"");

  1487     if (s)

  1488     {

  1489 	if (pswit[ECHO_SWITCH])

  1490 	    printf("\n%s\n",aline);

  1491 	if (!pswit[OVERVIEW_SWITCH])

  1492 	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",

  1493 	      linecnt,s-aline);

  1494 	else

  1495 	    cnt_punct++;

  1496     }

  1497 }

  1499 /*

  1500  * check_for_extra_period:

  1501  *

  1502  * Check for period without a capital letter. Cut-down from gutspell.

  1503  * Only works when it happens on a single line.

  1504  */

  1505 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1506 {

  1507     const char *s,*t,*s1;

  1508     int i;

  1509     gboolean istypo;

  1510     gchar *testword;

  1511     if (pswit[PARANOID_SWITCH])

  1512     {

  1513 	for (t=aline;strstr(t,". ");)

  1514 	{

  1515 	    t=strstr(t,". ");

  1516 	    if (t==aline)

  1517 	    {

  1518 		t++;

  1519 		/* start of line punctuation is handled elsewhere */

  1520 		continue;

  1521 	    }

  1522 	    if (!gcisalpha(t[-1]))

  1523 	    {

  1524 		t++;

  1525 		continue;

  1526 	    }

  1527 	    if (warnings->isDutch)

  1528 	    {

  1529 		/* For Frank & Jeroen -- 's Middags case */

  1530 		if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&

  1531 		  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')

  1532 		{

  1533 		    t++;

  1534 		    continue;

  1535 		}

  1536 	    }

  1537 	    s1=t+2;

  1538 	    while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))

  1539 		s1++;

  1540 	    if (*s1>='a' && *s1<='z')

  1541 	    {

  1542 		/* we have something to investigate */

  1543 		istypo=TRUE;

  1544 		/* so let's go back and find out */

  1545 		for (s1=t-1;s1>=aline &&

  1546 		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&

  1547 		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)

  1548 		    ;

  1549 		s1++;

  1550 		s=strchr(s1,'.');

  1551 		if (s)

  1552 		    testword=g_strndup(s1,s-s1);

  1553 		else

  1554 		    testword=g_strdup(s1);

  1555 		for (i=0;*abbrev[i];i++)

  1556 		    if (!strcmp(testword,abbrev[i]))

  1557 			istypo=FALSE;

  1558 		if (gcisdigit(*testword))

  1559 		    istypo=FALSE;

  1560 		if (!testword[1])

  1561 		    istypo=FALSE;

  1562 		if (isroman(testword))

  1563 		    istypo=FALSE;

  1564 		if (istypo)

  1565 		{

  1566 		    istypo=FALSE;

  1567 		    for (i=0;testword[i];i++)

  1568 			if (strchr(vowels,testword[i]))

  1569 			    istypo=TRUE;

  1570 		}

  1571 		if (istypo &&

  1572 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1573 		{

  1574 		    g_tree_insert(qperiod,g_strdup(testword),

  1575 		      GINT_TO_POINTER(1));

  1576 		    if (pswit[ECHO_SWITCH])

  1577 			printf("\n%s\n",aline);

  1578 		    if (!pswit[OVERVIEW_SWITCH])

  1579 			printf("    Line %ld column %d - Extra period?\n",

  1580 			  linecnt,(int)(t-aline)+1);

  1581 		    else

  1582 			cnt_punct++;

  1583 		}

  1584 		g_free(testword);

  1585 	    }

  1586 	    t++;

  1587 	}

  1588     }

  1589 }

  1591 /*

  1592  * check_for_following_punctuation:

  1593  *

  1594  * Check for words usually not followed by punctuation.

  1595  */

  1596 void check_for_following_punctuation(const char *aline)

  1597 {

  1598     int i;

  1599     const char *s,*wordstart;

  1600     gchar *inword,*t;

  1601     if (pswit[TYPO_SWITCH])

  1602     {

  1603 	for (s=aline;*s;)

  1604 	{

  1605 	    wordstart=s;

  1606 	    t=getaword(&s);

  1607 	    if (!*t)

  1608 	    {

  1609 		g_free(t);

  1610 		continue;

  1611 	    }

  1612 	    inword=g_ascii_strdown(t,-1);

  1613 	    g_free(t);

  1614 	    for (i=0;*nocomma[i];i++)

  1615 		if (!strcmp(inword,nocomma[i]))

  1616 		{

  1617 		    if (*s==',' || *s==';' || *s==':')

  1618 		    {

  1619 			if (pswit[ECHO_SWITCH])

  1620 			    printf("\n%s\n",aline);

  1621 			if (!pswit[OVERVIEW_SWITCH])

  1622 			    printf("    Line %ld column %d - "

  1623 			      "Query punctuation after %s?\n",

  1624 			      linecnt,(int)(s-aline)+1,inword);

  1625 			else

  1626 			    cnt_punct++;

  1627 		    }

  1628 		}

  1629 	    for (i=0;*noperiod[i];i++)

  1630 		if (!strcmp(inword,noperiod[i]))

  1631 		{

  1632 		    if (*s=='.' || *s=='!')

  1633 		    {

  1634 			if (pswit[ECHO_SWITCH])

  1635 			    printf("\n%s\n",aline);

  1636 			if (!pswit[OVERVIEW_SWITCH])

  1637 			    printf("    Line %ld column %d - "

  1638 			      "Query punctuation after %s?\n",

  1639 			      linecnt,(int)(s-aline)+1,inword);

  1640 			else

  1641 			    cnt_punct++;

  1642 		    }

  1643 		}

  1644 	    g_free(inword);

  1645 	}

  1646     }

  1647 }

  1649 /*

  1650  * check_for_typos:

  1651  *

  1652  * Check for commonly mistyped words,

  1653  * and digits like 0 for O in a word.

  1654  */

  1655 void check_for_typos(const char *aline,struct warnings *warnings)

  1656 {

  1657     const char *s,*wordstart;

  1658     gchar *inword,*testword;

  1659     int i,alower,vowel,consonant,*dupcnt;

  1660     gboolean isdup,istypo;

  1661     for (s=aline;*s;)

  1662     {

  1663 	wordstart=s;

  1664 	inword=getaword(&s);

  1665 	if (!*inword)

  1666 	{

  1667 	    g_free(inword);

  1668 	    continue; /* don't bother with empty lines */

  1669 	}

  1670 	if (mixdigit(inword))

  1671 	{

  1672 	    if (pswit[ECHO_SWITCH])

  1673 		printf("\n%s\n",aline);

  1674 	    if (!pswit[OVERVIEW_SWITCH])

  1675 		printf("    Line %ld column %d - Query digit in %s\n",

  1676 		  linecnt,(int)(wordstart-aline)+1,inword);

  1677 	    else

  1678 		cnt_word++;

  1679 	}

  1680 	/*

  1681 	 * Put the word through a series of tests for likely typos and OCR

  1682 	 * errors.

  1683 	 */

  1684 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1685 	{

  1686 	    istypo=FALSE;

  1687 	    testword=g_strdup(inword);

  1688 	    alower=0;

  1689 	    for (i=0;i<(int)strlen(testword);i++)

  1690 	    {

  1691 		/* lowercase for testing */

  1692 		if (testword[i]>='a' && testword[i]<='z')

  1693 		    alower=1;

  1694 		if (alower && testword[i]>='A' && testword[i]<='Z')

  1695 		{

  1696 		    /*

  1697 		     * We have an uppercase mid-word. However, there are

  1698 		     * common cases:

  1699 		     *   Mac and Mc like McGill

  1700 		     *   French contractions like l'Abbe

  1701 		     */

  1702 		    if (i==2 && testword[0]=='m' && testword[1]=='c' ||

  1703 		      i==3 && testword[0]=='m' && testword[1]=='a' &&

  1704 		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)

  1705 			; /* do nothing! */

  1706 		    else

  1707 			istypo=TRUE;

  1708 		}

  1709 		testword[i]=(char)tolower(testword[i]);

  1710 	    }

  1711 	}

  1712 	if (pswit[TYPO_SWITCH])

  1713 	{

  1714 	    /*

  1715 	     * Check for certain unlikely two-letter combinations at word

  1716 	     * start and end.

  1717 	     */

  1718 	    if (strlen(testword)>1)

  1719 	    {

  1720 		for (i=0;*nostart[i];i++)

  1721 		    if (!strncmp(testword,nostart[i],2))

  1722 			istypo=TRUE;

  1723 		for (i=0;*noend[i];i++)

  1724 		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))

  1725 			istypo=TRUE;

  1726 	    }

  1727 	    /* ght is common, gbt never. Like that. */

  1728 	    if (strstr(testword,"cb"))

  1729 		istypo=TRUE;

  1730 	    if (strstr(testword,"gbt"))

  1731 		istypo=TRUE;

  1732 	    if (strstr(testword,"pbt"))

  1733 		istypo=TRUE;

  1734 	    if (strstr(testword,"tbs"))

  1735 		istypo=TRUE;

  1736 	    if (strstr(testword,"mrn"))

  1737 		istypo=TRUE;

  1738 	    if (strstr(testword,"ahle"))

  1739 		istypo=TRUE;

  1740 	    if (strstr(testword,"ihle"))

  1741 		istypo=TRUE;

  1742 	    /*

  1743 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1744 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1745 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1746 	     * numerals, but "ii" is a common scanno.

  1747 	     */

  1748 	    if (strstr(testword,"tbi"))

  1749 		istypo=TRUE;

  1750 	    if (strstr(testword,"tbe"))

  1751 		istypo=TRUE;

  1752 	    if (strstr(testword,"ii"))

  1753 		istypo=TRUE;

  1754 	    /*

  1755 	     * Check for no vowels or no consonants.

  1756 	     * If none, flag a typo.

  1757 	     */

  1758 	    if (!istypo && strlen(testword)>1)

  1759 	    {

  1760 		vowel=consonant=0;

  1761 		for (i=0;testword[i];i++)

  1762 		{

  1763 		    if (testword[i]=='y' || gcisdigit(testword[i]))

  1764 		    {

  1765 			/* Yah, this is loose. */

  1766 			vowel++;

  1767 			consonant++;

  1768 		    }

  1769 		    else if (strchr(vowels,testword[i]))

  1770 			vowel++;

  1771 		    else

  1772 			consonant++;

  1773 		}

  1774 		if (!vowel || !consonant)

  1775 		    istypo=TRUE;

  1776 	    }

  1777 	    /*

  1778 	     * Now exclude the word from being reported if it's in

  1779 	     * the okword list.

  1780 	     */

  1781 	    for (i=0;*okword[i];i++)

  1782 		if (!strcmp(testword,okword[i]))

  1783 		    istypo=FALSE;

  1784 	    /*

  1785 	     * What looks like a typo may be a Roman numeral.

  1786 	     * Exclude these.

  1787 	     */

  1788 	    if (istypo && isroman(testword))

  1789 		istypo=FALSE;

  1790 	    /* Check the manual list of typos. */

  1791 	    if (!istypo)

  1792 		for (i=0;*typo[i];i++)

  1793 		    if (!strcmp(testword,typo[i]))

  1794 			istypo=TRUE;

  1795 	    /*

  1796 	     * Check lowercase s, l, i and m - special cases.

  1797 	     *   "j" - often a semi-colon gone wrong.

  1798 	     *   "d" for a missing apostrophe - he d

  1799 	     *   "n" for "in"

  1800 	     */

  1801 	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))

  1802 		istypo=TRUE;

  1803 	    if (istypo)

  1804 	    {

  1805 		dupcnt=g_tree_lookup(qword,testword);

  1806 		if (dupcnt)

  1807 		{

  1808 		    (*dupcnt)++;

  1809 		    isdup=!pswit[VERBOSE_SWITCH];

  1810 		}

  1811 		else

  1812 		{

  1813 		    dupcnt=g_new0(int,1);

  1814 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  1815 		    isdup=FALSE;

  1816 		}

  1817 		if (!isdup)

  1818 		{

  1819 		    if (pswit[ECHO_SWITCH])

  1820 			printf("\n%s\n",aline);

  1821 		    if (!pswit[OVERVIEW_SWITCH])

  1822 		    {

  1823 			printf("    Line %ld column %d - Query word %s",

  1824 			  linecnt,(int)(wordstart-aline)+1,inword);

  1825 			if (!pswit[VERBOSE_SWITCH])

  1826 			    printf(" - not reporting duplicates");

  1827 			printf("\n");

  1828 		    }

  1829 		    else

  1830 			cnt_word++;

  1831 		}

  1832 	    }

  1833 	}

  1834 	/* check the user's list of typos */

  1835 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  1836 	{

  1837 	    if (pswit[ECHO_SWITCH])

  1838 		printf("\n%s\n",aline);

  1839 	    if (!pswit[OVERVIEW_SWITCH])

  1840 		printf("    Line %ld column %d - Query possible scanno %s\n",

  1841 		  linecnt,(int)(wordstart-aline)+2,inword);

  1842 	}

  1843 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1844 	    g_free(testword);

  1845 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1846 	{

  1847 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1848 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1849 	    {

  1850 		if (pswit[ECHO_SWITCH])

  1851 		    printf("\n%s\n",aline);

  1852 		if (!pswit[OVERVIEW_SWITCH])

  1853 		    printf("    Line %ld column %d - Query standalone %s\n",

  1854 		      linecnt,(int)(wordstart-aline)+2,inword);

  1855 		else

  1856 		    cnt_word++;

  1857 	    }

  1858 	}

  1859 	g_free(inword);

  1860     }

  1861 }

  1863 /*

  1864  * check_for_misspaced_punctuation:

  1865  *

  1866  * Look for added or missing spaces around punctuation and quotes.

  1867  * If there is a punctuation character like ! with no space on

  1868  * either side, suspect a missing!space. If there are spaces on

  1869  * both sides , assume a typo. If we see a double quote with no

  1870  * space or punctuation on either side of it, assume unspaced

  1871  * quotes "like"this.

  1872  */

  1873 void check_for_misspaced_punctuation(const char *aline,

  1874   struct parities *parities,gboolean isemptyline)

  1875 {

  1876     int i,llen;

  1877     gboolean isacro,isellipsis;

  1878     const char *s;

  1879     llen=strlen(aline);

  1880     for (i=1;i<llen;i++)

  1881     {

  1882 	/* For each character in the line after the first. */

  1883 	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */

  1884 	{

  1885 	    /* we need to suppress warnings for acronyms like M.D. */

  1886 	    isacro=FALSE;

  1887 	    /* we need to suppress warnings for ellipsis . . . */

  1888 	    isellipsis=FALSE;

  1889 	    /* if there are letters on both sides of it or ... */

  1890 	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||

  1891 	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))

  1892 	    {

  1893 		/* ...if it's strict punctuation followed by an alpha */

  1894 		if (aline[i]=='.')

  1895 		{

  1896 		    if (i>2 && aline[i-2]=='.')

  1897 			isacro=TRUE;

  1898 		    if (i+2<llen && aline[i+2]=='.')

  1899 			isacro=TRUE;

  1900 		}

  1901 		if (!isacro)

  1902 		{

  1903 		    if (pswit[ECHO_SWITCH])

  1904 			printf("\n%s\n",aline);

  1905 		    if (!pswit[OVERVIEW_SWITCH])

  1906 			printf("    Line %ld column %d - Missing space?\n",

  1907 			  linecnt,i+1);

  1908 		    else

  1909 			cnt_punct++;

  1910 		}

  1911 	    }

  1912 	    if (aline[i-1]==CHAR_SPACE &&

  1913 	      (aline[i+1]==CHAR_SPACE || aline[i+1]==0))

  1914 	    {

  1915 		/*

  1916 		 * If there are spaces on both sides,

  1917 		 * or space before and end of line.

  1918 		 */

  1919 		if (aline[i]=='.')

  1920 		{

  1921 		    if (i>2 && aline[i-2]=='.')

  1922 			isellipsis=TRUE;

  1923 		    if (i+2<llen && aline[i+2]=='.')

  1924 			isellipsis=TRUE;

  1925 		}

  1926 		if (!isemptyline && !isellipsis)

  1927 		{

  1928 		    if (pswit[ECHO_SWITCH])

  1929 			printf("\n%s\n",aline);

  1930 		    if (!pswit[OVERVIEW_SWITCH])

  1931 			printf("    Line %ld column %d - "

  1932 			  "Spaced punctuation?\n",linecnt,i+1);

  1933 		    else

  1934 			cnt_punct++;

  1935 		}

  1936 	    }

  1937 	}

  1938     }

  1939     /* Split out the characters that CANNOT be preceded by space. */

  1940     llen=strlen(aline);

  1941     for (i=1;i<llen;i++)

  1942     {

  1943 	/* for each character in the line after the first */

  1944 	if (strchr("?!,;:",aline[i]))

  1945 	{

  1946 	    /* if it's punctuation that _cannot_ have a space before it */

  1947 	    if (aline[i-1]==CHAR_SPACE && !isemptyline &&

  1948 	      aline[i+1]!=CHAR_SPACE)

  1949 	    {

  1950 		/*

  1951 		 * If aline[i+1) DOES == space,

  1952 		 * it was already reported just above.

  1953 		 */

  1954 		if (pswit[ECHO_SWITCH])

  1955 		    printf("\n%s\n",aline);

  1956 		if (!pswit[OVERVIEW_SWITCH])

  1957 		    printf("    Line %ld column %d - Spaced punctuation?\n",

  1958 		      linecnt,i+1);

  1959 		else

  1960 		    cnt_punct++;

  1961 	    }

  1962 	}

  1963     }

  1964     /*

  1965      * Special case " .X" where X is any alpha.

  1966      * This plugs a hole in the acronym code above.

  1967      * Inelegant, but maintainable.

  1968      */

  1969     llen=strlen(aline);

  1970     for (i=1;i<llen;i++)

  1971     {

  1972 	/* for each character in the line after the first */

  1973 	if (aline[i]=='.')

  1974 	{

  1975 	    /* if it's a period */

  1976 	    if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))

  1977 	    {

  1978 		/*

  1979 		 * If the period follows a space and

  1980 		 * is followed by a letter.

  1981 		 */

  1982 		if (pswit[ECHO_SWITCH])

  1983 		    printf("\n%s\n",aline);

  1984 		if (!pswit[OVERVIEW_SWITCH])

  1985 		    printf("    Line %ld column %d - Spaced punctuation?\n",

  1986 		      linecnt,i+1);

  1987 		else

  1988 		    cnt_punct++;

  1989 	    }

  1990 	}

  1991     }

  1992     for (i=1;i<llen;i++)

  1993     {

  1994 	/* for each character in the line after the first */

  1995 	if (aline[i]==CHAR_DQUOTE)

  1996 	{

  1997 	    if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&

  1998 	      !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||

  1999 	      !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))

  2000 	    {

  2001 		if (pswit[ECHO_SWITCH])

  2002 		    printf("\n%s\n",aline);

  2003 		if (!pswit[OVERVIEW_SWITCH])

  2004 		    printf("    Line %ld column %d - Unspaced quotes?\n",

  2005 		      linecnt,i+1);

  2006 		else

  2007 		    cnt_punct++;

  2008 	    }

  2009 	}

  2010     }

  2011     /* Check parity of quotes. */

  2012     for (s=aline;*s;s++)

  2013     {

  2014 	if (*s==CHAR_DQUOTE)

  2015 	{

  2016 	    parities->dquote=!parities->dquote;

  2017 	    if (!parities->dquote)

  2018 	    {

  2019 		/* parity even */

  2020 		if (!strchr("_-.'`/,;:!?)]} ",s[1]))

  2021 		{

  2022 		    if (pswit[ECHO_SWITCH])

  2023 			printf("\n%s\n",aline);

  2024 		    if (!pswit[OVERVIEW_SWITCH])

  2025 			printf("    Line %ld column %d - "

  2026 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  2027 		    else

  2028 			cnt_punct++;

  2029 		}

  2030 	    }

  2031 	    else

  2032 	    {

  2033 		/* parity odd */

  2034 		if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2035 		  !strchr("_-/.'`([{$",s[1]) || !s[1])

  2036 		{

  2037 		    if (pswit[ECHO_SWITCH])

  2038 			printf("\n%s\n",aline);

  2039 		    if (!pswit[OVERVIEW_SWITCH])

  2040 			printf("    Line %ld column %d - "

  2041 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  2042 		    else

  2043 			cnt_punct++;

  2044 		}

  2045 	    }

  2046 	}

  2047     }

  2048     if (*aline==CHAR_DQUOTE)

  2049     {

  2050 	if (strchr(",;:!?)]} ",aline[1]))

  2051 	{

  2052 	    if (pswit[ECHO_SWITCH])

  2053 		printf("\n%s\n",aline);

  2054 	    if (!pswit[OVERVIEW_SWITCH])

  2055 		printf("    Line %ld column 1 - Wrongspaced quotes?\n",

  2056 		  linecnt);

  2057 	    else

  2058 		cnt_punct++;

  2059 	}

  2060     }

  2061     if (pswit[SQUOTE_SWITCH])

  2062     {

  2063 	for (s=aline;*s;s++)

  2064 	{

  2065 	    if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&

  2066 	      (s==aline || s>aline && !gcisalpha(s[-1]) ||

  2067 	      !gcisalpha(s[1])))

  2068 	    {

  2069 		parities->squote=!parities->squote;

  2070 		if (!parities->squote)

  2071 		{

  2072 		    /* parity even */

  2073 		    if (!strchr("_-.'`/\",;:!?)]} ",s[1]))

  2074 		    {

  2075 			if (pswit[ECHO_SWITCH])

  2076 			    printf("\n%s\n",aline);

  2077 			if (!pswit[OVERVIEW_SWITCH])

  2078 			    printf("    Line %ld column %d - "

  2079 			      "Wrongspaced singlequotes?\n",

  2080 			      linecnt,(int)(s-aline)+1);

  2081 			else

  2082 			    cnt_punct++;

  2083 		    }

  2084 		}

  2085 		else

  2086 		{

  2087 		    /* parity odd */

  2088 		    if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2089 		      !strchr("_-/\".'`",s[1]) || !s[1])

  2090 		    {

  2091 			if (pswit[ECHO_SWITCH])

  2092 			    printf("\n%s\n",aline);

  2093 			if (!pswit[OVERVIEW_SWITCH])

  2094 			    printf("    Line %ld column %d - "

  2095 			      "Wrongspaced singlequotes?\n",

  2096 			      linecnt,(int)(s-aline)+1);

  2097 			else

  2098 			    cnt_punct++;

  2099 		    }

  2100 		}

  2101 	    }

  2102 	}

  2103     }

  2104 }

  2106 /*

  2107  * check_for_double_punctuation:

  2108  *

  2109  * Look for double punctuation like ,. or ,,

  2110  * Thanks to DW for the suggestion!

  2111  * In books with references, ".," and ".;" are common

  2112  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2113  * OTOH, from my initial tests, there are also fairly

  2114  * common errors. What to do? Make these cases paranoid?

  2115  * ".," is the most common, so warnings->dotcomma is used

  2116  * to suppress detailed reporting if it occurs often.

  2117  */

  2118 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2119 {

  2120     int i,llen;

  2121     llen=strlen(aline);

  2122     for (i=0;i<llen;i++)

  2123     {

  2124 	/* for each punctuation character in the line */

  2125 	if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&

  2126 	  aline[i] && aline[i+1])

  2127 	{

  2128 	    /* followed by punctuation, it's a query, unless . . . */

  2129 	    if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||

  2130 	      aline[i]=='!') ||

  2131 	      !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||

  2132 	      warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2133 	      warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2134 	      warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2135 	      warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2136 	      warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2137 	      warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2138 	      warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2139 	      warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2140 	      warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2141 	      warnings->isFrench && !strncmp(aline+i,"...?",4))

  2142 	    {

  2143 		if (warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2144 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2145 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2146 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2147 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2148 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2149 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2150 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2151 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2152 		  warnings->isFrench && !strncmp(aline+i,"...?",4))

  2153 		    i+=4;

  2154 		; /* do nothing for .. !! and ?? which can be legit */

  2155 	    }

  2156 	    else

  2157 	    {

  2158 		if (pswit[ECHO_SWITCH])

  2159 		    printf("\n%s\n",aline);

  2160 		if (!pswit[OVERVIEW_SWITCH])

  2161 		    printf("    Line %ld column %d - Double punctuation?\n",

  2162 		      linecnt,i+1);

  2163 		else

  2164 		    cnt_punct++;

  2165 	    }

  2166 	}

  2167     }

  2168 }

  2170 /*

  2171  * check_for_spaced_quotes:

  2172  */

  2173 void check_for_spaced_quotes(const char *aline)

  2174 {

  2175     const char *s,*t;

  2176     s=aline;

  2177     while ((t=strstr(s," \" ")))

  2178     {

  2179 	if (pswit[ECHO_SWITCH])

  2180 	    printf("\n%s\n",aline);

  2181 	if (!pswit[OVERVIEW_SWITCH])

  2182 	    printf("    Line %ld column %d - Spaced doublequote?\n",

  2183 	      linecnt,(int)(t-aline+1));

  2184 	else

  2185 	    cnt_punct++;

  2186 	s=t+2;

  2187     }

  2188     s=aline;

  2189     while ((t=strstr(s," ' ")))

  2190     {

  2191 	if (pswit[ECHO_SWITCH])

  2192 	    printf("\n%s\n",aline);

  2193 	if (!pswit[OVERVIEW_SWITCH])

  2194 	    printf("    Line %ld column %d - Spaced singlequote?\n",

  2195 	      linecnt,(int)(t-aline+1));

  2196 	else

  2197 	    cnt_punct++;

  2198 	s=t+2;

  2199     }

  2200     s=aline;

  2201     while ((t=strstr(s," ` ")))

  2202     {

  2203 	if (pswit[ECHO_SWITCH])

  2204 	    printf("\n%s\n",aline);

  2205 	if (!pswit[OVERVIEW_SWITCH])

  2206 	    printf("    Line %ld column %d - Spaced singlequote?\n",

  2207 	      linecnt,(int)(t-aline+1));

  2208 	else

  2209 	    cnt_punct++;

  2210 	s=t+2;

  2211     }

  2212 }

  2214 /*

  2215  * check_for_miscased_genative:

  2216  *

  2217  * Check special case of 'S instead of 's at end of word.

  2218  */

  2219 void check_for_miscased_genative(const char *aline)

  2220 {

  2221     const char *s;

  2222     if (!*aline)

  2223 	return;

  2224     s=aline+1;

  2225     while (*s)

  2226     {

  2227 	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')

  2228 	{

  2229 	    if (pswit[ECHO_SWITCH])

  2230 		printf("\n%s\n",aline);

  2231 	    if (!pswit[OVERVIEW_SWITCH])

  2232 		printf("    Line %ld column %d - Capital \"S\"?\n",

  2233 		  linecnt,(int)(s-aline+2));

  2234 	    else

  2235 		cnt_punct++;

  2236 	}

  2237 	s++;

  2238     }

  2239 }

  2241 /*

  2242  * check_end_of_line:

  2243  *

  2244  * Now check special cases - start and end of line -

  2245  * for single and double quotes. Start is sometimes [sic]

  2246  * but better to query it anyway.

  2247  * While we're here, check for dash at end of line.

  2248  */

  2249 void check_end_of_line(const char *aline,struct warnings *warnings)

  2250 {

  2251     int i,llen;

  2252     llen=strlen(aline);

  2253     if (llen>1)

  2254     {

  2255 	if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||

  2256 	  aline[llen-1]==CHAR_OPEN_SQUOTE)

  2257 	    if (aline[llen-2]==CHAR_SPACE)

  2258 	    {

  2259 		if (pswit[ECHO_SWITCH])

  2260 		    printf("\n%s\n",aline);

  2261 		if (!pswit[OVERVIEW_SWITCH])

  2262 		    printf("    Line %ld column %d - Spaced quote?\n",

  2263 		      linecnt,llen);

  2264 		else

  2265 		    cnt_punct++;

  2266 	    }

  2267 	if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&

  2268 	  aline[1]==CHAR_SPACE)

  2269 	{

  2270 	    if (pswit[ECHO_SWITCH])

  2271 		printf("\n%s\n",aline);

  2272 	    if (!pswit[OVERVIEW_SWITCH])

  2273 		printf("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2274 	    else

  2275 		cnt_punct++;

  2276 	}

  2277 	/*

  2278 	 * Dash at end of line may well be legit - paranoid mode only

  2279 	 * and don't report em-dash at line-end.

  2280 	 */

  2281 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2282 	{

  2283 	    for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

  2284 		;

  2285 	    if (aline[i]=='-' && aline[i-1]!='-')

  2286 	    {

  2287 		if (pswit[ECHO_SWITCH])

  2288 		    printf("\n%s\n",aline);

  2289 		if (!pswit[OVERVIEW_SWITCH])

  2290 		    printf("    Line %ld column %d - Hyphen at end of line?\n",

  2291 		      linecnt,i);

  2292 	    }

  2293 	}

  2294     }

  2295 }

  2297 /*

  2298  * check_for_unspaced_bracket:

  2299  *

  2300  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2301  * If so, suspect a scanno like "a]most".

  2302  */

  2303 void check_for_unspaced_bracket(const char *aline)

  2304 {

  2305     int i,llen;

  2306     llen=strlen(aline);

  2307     for (i=1;i<llen-1;i++)

  2308     {

  2309 	/* for each bracket character in the line except 1st & last */

  2310 	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&

  2311 	  gcisalpha(aline[i+1]))

  2312 	{

  2313 	    if (pswit[ECHO_SWITCH])

  2314 		printf("\n%s\n",aline);

  2315 	    if (!pswit[OVERVIEW_SWITCH])

  2316 		printf("    Line %ld column %d - Unspaced bracket?\n",

  2317 		  linecnt,i);

  2318 	    else

  2319 		cnt_punct++;

  2320 	}

  2321     }

  2322 }

  2324 /*

  2325  * check_for_unpunctuated_endquote:

  2326  */

  2327 void check_for_unpunctuated_endquote(const char *aline)

  2328 {

  2329     int i,llen;

  2330     llen=strlen(aline);

  2331     for (i=1;i<llen;i++)

  2332     {

  2333 	/* for each character in the line except 1st */

  2334 	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

  2335 	{

  2336 	    if (pswit[ECHO_SWITCH])

  2337 		printf("\n%s\n",aline);

  2338 	    if (!pswit[OVERVIEW_SWITCH])

  2339 		printf("    Line %ld column %d - "

  2340 		  "endquote missing punctuation?\n",linecnt,i);

  2341 	    else

  2342 		cnt_punct++;

  2343 	}

  2344     }

  2345 }

  2347 /*

  2348  * check_for_html_tag:

  2349  *

  2350  * Check for <HTML TAG>.

  2351  *

  2352  * If there is a < in the line, followed at some point

  2353  * by a > then we suspect HTML.

  2354  */

  2355 void check_for_html_tag(const char *aline)

  2356 {

  2357     int i;

  2358     const char *open,*close;

  2359     open=strstr(aline,"<");

  2360     if (open)

  2361     {

  2362 	close=strstr(aline,">");

  2363 	if (close)

  2364 	{

  2365 	    i=(int)(close-open+1);

  2366 	    if (i>0)

  2367 	    {

  2368 		if (pswit[ECHO_SWITCH])

  2369 		    printf("\n%s\n",aline);

  2370 		if (!pswit[OVERVIEW_SWITCH])

  2371 		    printf("    Line %ld column %d - HTML Tag? %*.*s \n",

  2372 		      linecnt,(int)(open-aline)+1,i,i,open);

  2373 		else

  2374 		    cnt_html++;

  2375 	    }

  2376 	}

  2377     }

  2378 }

  2380 /*

  2381  * check_for_html_entity:

  2382  *

  2383  * Check for &symbol; HTML.

  2384  *

  2385  * If there is a & in the line, followed at

  2386  * some point by a ; then we suspect HTML.

  2387  */

  2388 void check_for_html_entity(const char *aline)

  2389 {

  2390     int i;

  2391     const char *s,*amp,*scolon;

  2392     amp=strstr(aline,"&");

  2393     if (amp)

  2394     {

  2395 	scolon=strstr(aline,";");

  2396 	if (scolon)

  2397 	{

  2398 	    i=(int)(scolon-amp+1);

  2399 	    for (s=amp;s<scolon;s++)

  2400 		if (*s==CHAR_SPACE)

  2401 		    i=0;		/* Don't report "Jones & Son;" */

  2402 	    if (i>0)

  2403 	    {

  2404 		if (pswit[ECHO_SWITCH])

  2405 		    printf("\n%s\n",aline);

  2406 		if (!pswit[OVERVIEW_SWITCH])

  2407 		    printf("    Line %ld column %d - HTML symbol? %*.*s \n",

  2408 		      linecnt,(int)(amp-aline)+1,i,i,amp);

  2409 		else

  2410 		    cnt_html++;

  2411 	    }

  2412 	}

  2413     }

  2414 }

  2416 /*

  2417  * print_pending:

  2418  *

  2419  * If we are in a state of unbalanced quotes, and this line

  2420  * doesn't begin with a quote, output the stored error message.

  2421  * If the -P switch was used, print the warning even if the

  2422  * new para starts with quotes.

  2423  */

  2424 void print_pending(const char *aline,const char *parastart,

  2425   struct pending *pending)

  2426 {

  2427     const char *s;

  2428     s=aline;

  2429     while (*s==' ')

  2430 	s++;

  2431     if (pending->dquote)

  2432     {

  2433 	if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])

  2434 	{

  2435 	    if (!pswit[OVERVIEW_SWITCH])

  2436 	    {

  2437 		if (pswit[ECHO_SWITCH])

  2438 		    printf("\n%s\n",parastart);

  2439 		puts(pending->dquote);

  2440 	    }

  2441 	    else

  2442 		cnt_dquot++;

  2443 	}

  2444 	g_free(pending->dquote);

  2445 	pending->dquote=NULL;

  2446     }

  2447     if (pending->squote)

  2448     {

  2449 	if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||

  2450 	  pending->squot)

  2451 	{

  2452 	    if (!pswit[OVERVIEW_SWITCH])

  2453 	    {

  2454 		if (pswit[ECHO_SWITCH])

  2455 		    printf("\n%s\n",parastart);

  2456 		puts(pending->squote);

  2457 	    }

  2458 	    else

  2459 		cnt_squot++;

  2460 	}

  2461 	g_free(pending->squote);

  2462 	pending->squote=NULL;

  2463     }

  2464     if (pending->rbrack)

  2465     {

  2466 	if (!pswit[OVERVIEW_SWITCH])

  2467 	{

  2468 	    if (pswit[ECHO_SWITCH])

  2469 		printf("\n%s\n",parastart);

  2470 	    puts(pending->rbrack);

  2471 	}

  2472 	else

  2473 	    cnt_brack++;

  2474 	g_free(pending->rbrack);

  2475 	pending->rbrack=NULL;

  2476     }

  2477     if (pending->sbrack)

  2478     {

  2479 	if (!pswit[OVERVIEW_SWITCH])

  2480 	{

  2481 	    if (pswit[ECHO_SWITCH])

  2482 		printf("\n%s\n",parastart);

  2483 	    puts(pending->sbrack);

  2484 	}

  2485 	else

  2486 	    cnt_brack++;

  2487 	g_free(pending->sbrack);

  2488 	pending->sbrack=NULL;

  2489     }

  2490     if (pending->cbrack)

  2491     {

  2492 	if (!pswit[OVERVIEW_SWITCH])

  2493 	{

  2494 	    if (pswit[ECHO_SWITCH])

  2495 		printf("\n%s\n",parastart);

  2496 	    puts(pending->cbrack);

  2497 	}

  2498 	else

  2499 	    cnt_brack++;

  2500 	g_free(pending->cbrack);

  2501 	pending->cbrack=NULL;

  2502     }

  2503     if (pending->unders)

  2504     {

  2505 	if (!pswit[OVERVIEW_SWITCH])

  2506 	{

  2507 	    if (pswit[ECHO_SWITCH])

  2508 		printf("\n%s\n",parastart);

  2509 	    puts(pending->unders);

  2510 	}

  2511 	else

  2512 	    cnt_brack++;

  2513 	g_free(pending->unders);

  2514 	pending->unders=NULL;

  2515     }

  2516 }

  2518 /*

  2519  * check_for_mismatched_quotes:

  2520  *

  2521  * At end of paragraph, check for mismatched quotes.

  2522  *

  2523  * We don't want to report an error immediately, since it is a

  2524  * common convention to omit the quotes at end of paragraph if

  2525  * the next paragraph is a continuation of the same speaker.

  2526  * Where this is the case, the next para should begin with a

  2527  * quote, so we store the warning message and only display it

  2528  * at the top of the next iteration if the new para doesn't

  2529  * start with a quote.

  2530  * The -p switch overrides this default, and warns of unclosed

  2531  * quotes on _every_ paragraph, whether the next begins with a

  2532  * quote or not.

  2533  */

  2534 void check_for_mismatched_quotes(const struct counters *counters,

  2535   struct pending *pending)

  2536 {

  2537     if (counters->quot%2)

  2538 	pending->dquote=

  2539 	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);

  2540     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2541       counters->open_single_quote!=counters->close_single_quote)

  2542 	pending->squote=

  2543 	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);

  2544     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2545       counters->open_single_quote!=counters->close_single_quote &&

  2546       counters->open_single_quote!=counters->close_single_quote+1)

  2547 	/*

  2548 	 * Flag it to be noted regardless of the

  2549 	 * first char of the next para.

  2550 	 */

  2551 	pending->squot=1;

  2552     if (counters->r_brack)

  2553 	pending->rbrack=

  2554 	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);

  2555     if (counters->s_brack)

  2556 	pending->sbrack=

  2557 	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);

  2558     if (counters->c_brack)

  2559 	pending->cbrack=

  2560 	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);

  2561     if (counters->c_unders%2)

  2562 	pending->unders=

  2563 	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);

  2564 }

  2566 /*

  2567  * check_for_omitted_punctuation:

  2568  *

  2569  * Check for omitted punctuation at end of paragraph by working back

  2570  * through prevline. DW.

  2571  * Need to check this only for "normal" paras.

  2572  * So what is a "normal" para?

  2573  *    Not normal if one-liner (chapter headings, etc.)

  2574  *    Not normal if doesn't contain at least one locase letter

  2575  *    Not normal if starts with space

  2576  */

  2577 void check_for_omitted_punctuation(const char *prevline,

  2578   struct line_properties *last,int start_para_line)

  2579 {

  2580     int i;

  2581     const char *s;

  2582     for (s=prevline,i=0;*s && !i;s++)

  2583 	if (gcisletter(*s))

  2584 	    /* use i to indicate the presence of a letter on the line */

  2585 	    i=1;

  2586     /*

  2587      * This next "if" is a problem.

  2588      * If we say "start_para_line <= linecnt - 1", that includes

  2589      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2590      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2591      * misses genuine one-line paragraphs.

  2592      */

  2593     if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)

  2594     {

  2595 	for (i=strlen(prevline)-1;

  2596 	  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&

  2597 	  prevline[i]>CHAR_SPACE && i>0;

  2598 	  i--)

  2599 	    ;

  2600 	for (;i>0;i--)

  2601 	{

  2602 	    if (gcisalpha(prevline[i]))

  2603 	    {

  2604 		if (pswit[ECHO_SWITCH])

  2605 		    printf("\n%s\n",prevline);

  2606 		if (!pswit[OVERVIEW_SWITCH])

  2607 		    printf("    Line %ld column %d - "

  2608 		      "No punctuation at para end?\n",

  2609 		      linecnt-1,(int)strlen(prevline));

  2610 		else

  2611 		    cnt_punct++;

  2612 		break;

  2613 	    }

  2614 	    if (strchr("-.:!([{?}])",prevline[i]))

  2615 		break;

  2616 	}

  2617     }

  2618 }

  2620 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2621 {

  2622     const char *word=key;

  2623     int *dupcnt=value;

  2624     if (*dupcnt)

  2625 	printf("\nNote: Queried word %s was duplicated %d times\n",

  2626 	  word,*dupcnt);

  2627     return FALSE;

  2628 }

  2630 /*

  2631  * procfile:

  2632  *

  2633  * Process one file.

  2634  */

  2635 void procfile(const char *filename)

  2636 {

  2637     const char *s;

  2638     gchar *parastart=NULL;	/* first line of current para */

  2639     gchar *etext,*aline;

  2640     gchar *etext_ptr;

  2641     GError *err=NULL;

  2642     struct first_pass_results *first_pass_results;

  2643     struct warnings *warnings;

  2644     struct counters counters={0};

  2645     struct line_properties last={0};

  2646     struct parities parities={0};

  2647     struct pending pending={0};

  2648     gboolean isemptyline;

  2649     long start_para_line=0;

  2650     gboolean isnewpara=FALSE,enddash=FALSE;

  2651     last.start=CHAR_SPACE;

  2652     linecnt=checked_linecnt=0;

  2653     etext=read_etext(filename,&err);

  2654     if (!etext)

  2655     {

  2656 	if (pswit[STDOUT_SWITCH])

  2657 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  2658 	else

  2659 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  2660 	exit(1);

  2661     }

  2662     fprintf(stdout,"\n\nFile: %s\n\n",filename);

  2663     first_pass_results=first_pass(etext);

  2664     warnings=report_first_pass(first_pass_results);

  2665     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  2666     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  2667     /*

  2668      * Here we go with the main pass. Hold onto yer hat!

  2669      */

  2670     linecnt=0;

  2671     etext_ptr=etext;

  2672     while ((aline=flgets(&etext_ptr,linecnt+1)))

  2673     {

  2674 	linecnt++;

  2675 	if (linecnt==1)

  2676 	    isnewpara=TRUE;

  2677 	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))

  2678 	    continue;    // skip DP page separators completely

  2679 	if (linecnt<first_pass_results->firstline ||

  2680 	  (first_pass_results->footerline>0 &&

  2681 	  linecnt>first_pass_results->footerline))

  2682 	{

  2683 	    if (pswit[HEADER_SWITCH])

  2684 	    {

  2685 		if (!strncmp(aline,"Title:",6))

  2686 		    printf("    %s\n",aline);

  2687 		if (!strncmp(aline,"Author:",7))

  2688 		    printf("    %s\n",aline);

  2689 		if (!strncmp(aline,"Release Date:",13))

  2690 		    printf("    %s\n",aline);

  2691 		if (!strncmp(aline,"Edition:",8))

  2692 		    printf("    %s\n\n",aline);

  2693 	    }

  2694 	    continue;		/* skip through the header */

  2695 	}

  2696 	checked_linecnt++;

  2697 	print_pending(aline,parastart,&pending);

  2698 	memset(&pending,0,sizeof(pending));

  2699 	isemptyline=analyse_quotes(aline,&counters);

  2700 	if (isnewpara && !isemptyline)

  2701 	{

  2702 	    /* This line is the start of a new paragraph. */

  2703 	    start_para_line=linecnt;

  2704 	    /* Capture its first line in case we want to report it later. */

  2705 	    g_free(parastart);

  2706 	    parastart=g_strdup(aline);

  2707 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2708 	    s=aline;

  2709 	    while (!gcisalpha(*s) && !gcisdigit(*s) && *s)

  2710 		s++;

  2711 	    if (*s>='a' && *s<='z')

  2712 	    {

  2713 		/* and its first letter is lowercase */

  2714 		if (pswit[ECHO_SWITCH])

  2715 		    printf("\n%s\n",aline);

  2716 		if (!pswit[OVERVIEW_SWITCH])

  2717 		    printf("    Line %ld column %d - "

  2718 		      "Paragraph starts with lower-case\n",

  2719 		      linecnt,(int)(s-aline)+1);

  2720 		else

  2721 		    cnt_punct++;

  2722 	    }

  2723 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  2724 	}

  2725 	/* Check for an em-dash broken at line end. */

  2726 	if (enddash && *aline=='-')

  2727 	{

  2728 	    if (pswit[ECHO_SWITCH])

  2729 		printf("\n%s\n",aline);

  2730 	    if (!pswit[OVERVIEW_SWITCH])

  2731 		printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2732 	    else

  2733 		cnt_punct++;

  2734 	}

  2735 	enddash=FALSE;

  2736 	for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)

  2737 	    ;

  2738 	if (s>=aline && *s=='-')

  2739 	    enddash=TRUE;

  2740 	check_for_control_characters(aline);

  2741 	if (warnings->bin)

  2742 	    check_for_odd_characters(aline,warnings,isemptyline);

  2743 	if (warnings->longline)

  2744 	    check_for_long_line(aline);

  2745 	if (warnings->shortline)

  2746 	    check_for_short_line(aline,&last);

  2747 	last.blen=last.len;

  2748 	last.len=strlen(aline);

  2749 	last.start=aline[0];

  2750 	check_for_starting_punctuation(aline);

  2751 	if (warnings->dash)

  2752 	{

  2753 	    check_for_spaced_emdash(aline);

  2754 	    check_for_spaced_dash(aline);

  2755 	}

  2756 	check_for_unmarked_paragraphs(aline);

  2757 	check_for_jeebies(aline);

  2758 	check_for_mta_from(aline);

  2759 	check_for_orphan_character(aline);

  2760 	check_for_pling_scanno(aline);

  2761 	check_for_extra_period(aline,warnings);

  2762 	check_for_following_punctuation(aline);

  2763 	check_for_typos(aline,warnings);

  2764 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2765 	check_for_double_punctuation(aline,warnings);

  2766 	check_for_spaced_quotes(aline);

  2767 	check_for_miscased_genative(aline);

  2768 	check_end_of_line(aline,warnings);

  2769 	check_for_unspaced_bracket(aline);

  2770 	if (warnings->endquote)

  2771 	    check_for_unpunctuated_endquote(aline);

  2772 	check_for_html_tag(aline);

  2773 	check_for_html_entity(aline);

  2774 	if (isemptyline)

  2775 	{

  2776 	    check_for_mismatched_quotes(&counters,&pending);

  2777 	    memset(&counters,0,sizeof(counters));

  2778 	    /* let the next iteration know that it's starting a new para */

  2779 	    isnewpara=TRUE;

  2780 	    if (prevline)

  2781 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  2782 	}

  2783 	g_free(prevline);

  2784 	prevline=g_strdup(aline);

  2785     }

  2786     if (prevline)

  2787     {

  2788 	g_free(prevline);

  2789 	prevline=NULL;

  2790     }

  2791     g_free(parastart);

  2792     g_free(prevline);

  2793     g_free(etext);

  2794     if (!pswit[OVERVIEW_SWITCH])

  2795 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  2796     g_tree_unref(qword);

  2797     g_tree_unref(qperiod);

  2798 }

  2800 /*

  2801  * flgets:

  2802  *

  2803  * Get one line from the input text, checking for

  2804  * the existence of exactly one CR/LF line-end per line.

  2805  *

  2806  * Returns: a pointer to the line.

  2807  */

  2808 char *flgets(char **etext,long lcnt)

  2809 {

  2810     char c;

  2811     int len;

  2812     gboolean isCR=FALSE;

  2813     char *theline=*etext;

  2814     len=0;

  2815     for(;;)

  2816     {

  2817 	c=*(*etext)++;

  2818 	if (!c)

  2819 	    return NULL;

  2820 	/* either way, it's end of line */

  2821 	if (c=='\n')

  2822 	{

  2823 	    if (isCR)

  2824 		break;

  2825 	    else

  2826 	    {

  2827 		/* Error - a LF without a preceding CR */

  2828 		if (pswit[LINE_END_SWITCH])

  2829 		{

  2830 		    if (pswit[ECHO_SWITCH])

  2831 			printf("\n%*.*s\n",len,len,theline);

  2832 		    if (!pswit[OVERVIEW_SWITCH])

  2833 			printf("    Line %ld - No CR?\n",lcnt);

  2834 		    else

  2835 			cnt_lineend++;

  2836 		}

  2837 		break;

  2838 	    }

  2839 	}

  2840 	if (c=='\r')

  2841 	{

  2842 	    if (isCR)

  2843 	    {

  2844 		/* Error - two successive CRs */

  2845 		if (pswit[LINE_END_SWITCH])

  2846 		{

  2847 		    if (pswit[ECHO_SWITCH])

  2848 			printf("\n%*.*s\n",len,len,theline);

  2849 		    if (!pswit[OVERVIEW_SWITCH])

  2850 			printf("    Line %ld - Two successive CRs?\n",lcnt);

  2851 		    else

  2852 			cnt_lineend++;

  2853 		}

  2854 	    }

  2855 	    isCR=TRUE;

  2856 	}

  2857 	else

  2858 	{

  2859 	    if (pswit[LINE_END_SWITCH] && isCR)

  2860 	    {

  2861 		if (pswit[ECHO_SWITCH])

  2862 		    printf("\n%*.*s\n",len,len,theline);

  2863 		if (!pswit[OVERVIEW_SWITCH])

  2864 		    printf("    Line %ld column %d - CR without LF?\n",

  2865 		      lcnt,len+1);

  2866 		else

  2867 		    cnt_lineend++;

  2868 		theline[len]=' ';

  2869 	    }

  2870 	    isCR=FALSE;

  2871 	    len++;

  2872 	}

  2873     }

  2874     theline[len]='\0';

  2875     if (pswit[MARKUP_SWITCH])

  2876 	postprocess_for_HTML(theline);

  2877     if (pswit[DP_SWITCH])

  2878 	postprocess_for_DP(theline);

  2879     return theline;

  2880 }

  /*
   * mixdigit:
   *
   * Takes a "word" as a parameter, and checks whether it
   * contains a mixture of alpha and digits. Generally, this is an
   * error, but may not be for cases like 4th or L5 12s. 3d.
   *
   * Accepted mixtures are: leading digits followed by st/rd/nd/th, by
   * sts/rds/nds/ths or by stly/rdly/ndly/thly (any case); leading digits
   * followed by a single l, L, s or d; and a capital L followed by a
   * word that is itself acceptable.
   *
   * Returns: 0 if no error found, 1 if error.
   */
  int mixdigit(const char *checkword)
  {
      int wehaveadigit,wehavealetter,firstdigits,query,wl;
      const char *s;
      wehaveadigit=wehavealetter=query=0;
      for (s=checkword;*s;s++)
      {
          if (gcisalpha(*s))
              wehavealetter=1;
          else if (gcisdigit(*s))
              wehaveadigit=1;
      }
      if (wehaveadigit && wehavealetter)
      {
          /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
          query=1;
          wl=strlen(checkword);
          for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
              ;
          /* digits, ending in st, rd, nd, th of either case */
          if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") ||
            !g_ascii_strcasecmp(checkword+wl-2,"rd") ||
            !g_ascii_strcasecmp(checkword+wl-2,"nd") ||
            !g_ascii_strcasecmp(checkword+wl-2,"th")))
              query=0;
          /* the same, in the plural */
          if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") ||
            !g_ascii_strcasecmp(checkword+wl-3,"rds") ||
            !g_ascii_strcasecmp(checkword+wl-3,"nds") ||
            !g_ascii_strcasecmp(checkword+wl-3,"ths")))
              query=0;
          /*
           * The adverb forms have a four-letter suffix, so exactly four
           * characters must follow the digits. (Testing for three could
           * never match, and looked before the start of a short word.)
           */
          if (firstdigits+4==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") ||
            !g_ascii_strcasecmp(checkword+wl-4,"rdly") ||
            !g_ascii_strcasecmp(checkword+wl-4,"ndly") ||
            !g_ascii_strcasecmp(checkword+wl-4,"thly")))
              query=0;
          /* digits, ending in l, L, s or d */
          if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
            checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
              query=0;
          /*
           * L at the start of a number, representing British pounds, like
           * L500. This is cute. We know the current word is mixeddigit. If
           * the first letter is L, there must be at least one digit
           * following. If both digits and letters follow, we have a genuine
           * error, else we have a capital L followed by digits, and we
           * accept that as a non-error.
           */
          if (checkword[0]=='L' && !mixdigit(checkword+1))
              query=0;
      }
      return query;
  }

  2942 /*

  2943  * getaword:

  2944  *

  2945  * Extracts the first/next "word" from the line, and returns it.

  2946  * A word is defined as one English word unit--or at least that's the aim.

  2947  * "ptr" is advanced to the position in the line where we will start

  2948  * looking for the next word.

  2949  *

  2950  * Returns: A newly-allocated string.

  2951  */

  2952 gchar *getaword(const char **ptr)

  2953 {

  2954     int i;

  2955     const char *s;

  2956     GString *word;

  2957     word=g_string_new(NULL);

  2958     for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)

  2959 	;

  2960     /*

  2961      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2962      * Especially yucky is the case of L1,000

  2963      * This section looks for a pattern of characters including a digit

  2964      * followed by a comma or period followed by one or more digits.

  2965      * If found, it returns this whole pattern as a word; otherwise we discard

  2966      * the results and resume our normal programming.

  2967      */

  2968     s=*ptr;

  2969     for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)

  2970 	g_string_append_c(word,*s);

  2971     for (i=1;i+1<word->len;i++)

  2972     {

  2973 	if (word->str[i]=='.' || word->str[i]==',')

  2974 	{

  2975 	    if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1]))

  2976 	    {

  2977 		*ptr=s;

  2978 		return g_string_free(word,FALSE);

  2979 	    }

  2980 	}

  2981     }

  2982     /* we didn't find a punctuated number - do the regular getword thing */

  2983     g_string_truncate(word,0);

  2984     for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)

  2985 	g_string_append_c(word,**ptr);

  2986     return g_string_free(word,FALSE);

  2987 }

  2989 /*

  2990  * isroman:

  2991  *

  2992  * Is this word a Roman Numeral?

  2993  *

  2994  * It doesn't actually validate that the number is a valid Roman Numeral--for

  2995  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  2996  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  2997  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  2998  * expressions thereof, except when it came to taxes. Allow any number of M,

  2999  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  3000  * XL or an optional XC, an optional IX or IV, an optional V and any number

  3001  * of optional Is.

  3002  */

  3003 gboolean isroman(const char *t)

  3004 {

  3005     const char *s;

  3006     if (!t || !*t)

  3007 	return FALSE;

  3008     s=t;

  3009     while (*t=='m' && *t)

  3010 	t++;

  3011     if (*t=='d')

  3012 	t++;

  3013     if (*t=='c' && t[1]=='m')

  3014 	t+=2;

  3015     if (*t=='c' && t[1]=='d')

  3016 	t+=2;

  3017     while (*t=='c' && *t)

  3018 	t++;

  3019     if (*t=='x' && t[1]=='l')

  3020 	t+=2;

  3021     if (*t=='x' && t[1]=='c')

  3022 	t+=2;

  3023     if (*t=='l')

  3024 	t++;

  3025     while (*t=='x' && *t)

  3026 	t++;

  3027     if (*t=='i' && t[1]=='x')

  3028 	t+=2;

  3029     if (*t=='i' && t[1]=='v')

  3030 	t+=2;

  3031     if (*t=='v')

  3032 	t++;

  3033     while (*t=='i' && *t)

  3034 	t++;

  3035     return !*t;

  3036 }

  3038 /*

  3039  * gcisalpha:

  3040  *

  3041  * A version of isalpha() that is somewhat lenient on 8-bit texts.

  3042  * If we use the standard function, 8-bit accented characters break

  3043  * words, so that tete with accented characters appears to be two words, "t"

  3044  * and "t", with 8-bit characters between them. This causes over-reporting of

  3045  * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)

  3046  * and ISO-8859-1 character sets, which are the most common PG 8-bit types.

  3047  */

  3048 gboolean gcisalpha(unsigned char c)

  3049 {

  3050     if (c>='a' && c<='z')

  3051 	return TRUE;

  3052     if (c>='A' && c<='Z')

  3053 	return TRUE;

  3054     if (c<140)

  3055 	return FALSE;

  3056     if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)

  3057 	return TRUE;

  3058     if (c==140 || c==142 || c==156 || c==158 || c==159)

  3059 	return TRUE;

  3060     return FALSE;

  3061 }

  3063 /*

  3064  * gcisdigit:

  3065  *

  3066  * A version of isdigit() that doesn't get confused in 8-bit texts.

  3067  */

  3068 gboolean gcisdigit(unsigned char c)

  3069 {

  3070     return c>='0' && c<='9';

  3071 }

  3073 /*

  3074  * gcisletter:

  3075  *

  3076  * A version of isletter() that doesn't get confused in 8-bit texts.

  3077  * NB: this is ISO-8891-1-specific.

  3078  */

  3079 gboolean gcisletter(unsigned char c)

  3080 {

  3081     return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;

  3082 }

  3084 /*

  3085  * postprocess_for_DP:

  3086  *

  3087  * Invoked with the -d switch from flgets().

  3088  * It simply "removes" from the line a hard-coded set of common

  3089  * DP-specific tags, so that the line passed to the main routine has

  3090  * been pre-cleaned of DP markup.

  3091  */

  3092 void postprocess_for_DP(char *theline)

  3093 {

  3094     char *s,*t;

  3095     int i;

  3096     if (!*theline)

  3097 	return;

  3098     for (i=0;*DPmarkup[i];i++)

  3099     {

  3100 	s=strstr(theline,DPmarkup[i]);

  3101 	while (s)

  3102 	{

  3103 	    t=s+strlen(DPmarkup[i]);

  3104 	    while (*t)

  3105 	    {

  3106 		*s=*t;

  3107 		t++;

  3108 		s++;

  3109 	    }

  3110 	    *s=0;

  3111 	    s=strstr(theline,DPmarkup[i]);

  3112 	}

  3113     }

  3114 }

  /*
   * postprocess_for_HTML:
   *
   * Invoked with the -m switch from flgets().
   * Calls losemarkup() until it has nothing more to remove (only when
   * the line holds both a '<' and a '>'), then does the same with
   * loseentities(), so that the line passed to the main routine has
   * been pre-cleaned of HTML.
   */
  void postprocess_for_HTML(char *theline)
  {
      if (strchr(theline,'<')!=NULL)
      {
          if (strchr(theline,'>')!=NULL)
          {
              while (losemarkup(theline)!=NULL)
                  continue;
          }
      }
      while (loseentities(theline)!=NULL)
          continue;
  }

  3134 char *losemarkup(char *theline)

  3135 {

  3136     char *s,*t;

  3137     int i;

  3138     if (!*theline)

  3139 	return NULL;

  3140     s=strstr(theline,"<");

  3141     t=strstr(theline,">");

  3142     if (!s || !t)

  3143 	return NULL;

  3144     for (i=0;*markup[i];i++)

  3145 	if (!tagcomp(s+1,markup[i]))

  3146 	{

  3147 	    if (!t[1])

  3148 	    {

  3149 		*s=0;

  3150 		return s;

  3151 	    }

  3152 	    else if (t>s)

  3153 	    {

  3154 		strcpy(s,t+1);

  3155 		return s;

  3156 	    }

  3157 	}

  3158     /* It's an unrecognized <xxx>. */

  3159     return NULL;

  3160 }

  3162 char *loseentities(char *theline)

  3163 {

  3164     int i;

  3165     char *s,*t;

  3166     if (!*theline)

  3167 	return NULL;

  3168     for (i=0;*entities[i].htmlent;i++)

  3169     {

  3170 	s=strstr(theline,entities[i].htmlent);

  3171 	if (s)

  3172 	{

  3173 	    t=malloc((size_t)strlen(s));

  3174 	    if (!t)

  3175 		return NULL;

  3176 	    strcpy(t,s+strlen(entities[i].htmlent));

  3177 	    strcpy(s,entities[i].textent);

  3178 	    strcat(s,t);

  3179 	    free(t);

  3180 	    return theline;

  3181 	}

  3182     }

  3183     for (i=0;*entities[i].htmlnum;i++)

  3184     {

  3185 	s=strstr(theline,entities[i].htmlnum);

  3186 	if (s)

  3187 	{

  3188 	    t=malloc((size_t)strlen(s));

  3189 	    if (!t)

  3190 		return NULL;

  3191 	    strcpy(t,s+strlen(entities[i].htmlnum));

  3192 	    strcpy(s,entities[i].textent);

  3193 	    strcat(s,t);

  3194 	    free(t);

  3195 	    return theline;

  3196 	}

  3197     }

  3198     return NULL;

  3199 }

  /*
   * tagcomp:
   *
   * Case-insensitive comparison of the start of strin (the text just
   * after a '<') with the tag name basetag. One leading slash in strin
   * is ignored, so closing tags match too. The comparison stops at the
   * end of whichever string is shorter.
   *
   * Returns: 0 if no difference was found, 1 otherwise.
   */
  int tagcomp(const char *strin,const char *basetag)
  {
      const char *s,*t;
      s=basetag;
      t=strin;
      if (*t=='/')
          t++; /* ignore a slash */
      while (*s && *t)
      {
          /*
           * tolower() is only defined for unsigned char values (and EOF),
           * so plain chars, which may be negative in 8-bit text, are
           * converted first.
           */
          if (tolower((unsigned char)*s)!=tolower((unsigned char)*t))
              return 1;
          s++;
          t++;
      }
      return 0;
  }

  3218 void proghelp(GOptionContext *context)

  3219 {

  3220     gchar *help;

  3221     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3222     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3223     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3224     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3225       "For details, read the file COPYING.\n",stderr);

  3226     fputs("This is Free Software; "

  3227       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3228     fputs("read the file COPYING for details.\n\n",stderr);

  3229     help=g_option_context_get_help(context,TRUE,NULL);

  3230     fputs(help,stderr);

  3231     g_free(help);

  3232     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3233     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3234       "non-ASCII\n",stderr);

  3235     fputs("characters like accented letters, "

  3236       "lines longer than 75 or shorter than 55,\n",stderr);

  3237     fputs("unbalanced quotes or brackets, "

  3238       "a variety of badly formatted punctuation, \n",stderr);

  3239     fputs("HTML tags, some likely typos. "

  3240       "It is NOT a substitute for human judgement.\n",stderr);

  3241     fputs("\n",stderr);

  3242 }

author	ali <ali@juiblex.co.uk>
	Tue May 28 15:17:19 2013 +0100 (2013-05-28)
changeset 69	1016349e619f
parent 68	adb087007d08
child 70	aa916da2e452
permissions	-rw-r--r--