/*************************************************************************/
/* bookloupe--check for assorted weirdnesses in a PG candidate text file */
/*                                                                       */
/* Copyright 2000-2005 Jim Tinsley                                       */
/* Copyright 2012- J. Ali Harlow                                         */
/*                                                                       */
/* This program is free software; you can redistribute it and/or modify  */
/* it under the terms of the GNU General Public License as published by  */
/* the Free Software Foundation; either version 2 of the License, or     */
/* (at your option) any later version.                                   */
/*                                                                       */
/* This program is distributed in the hope that it will be useful,       */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
/* GNU General Public License for more details.                          */
/*                                                                       */
/* You should have received a copy of the GNU General Public License     */
/* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
/*************************************************************************/

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXWORDLEN 80 /* max length of one word */
#define LINEBUFSIZE 2048 /* buffer size for an input line */

#define MAX_USER_TYPOS 1000
#define USERTYPO_FILE "gutcheck.typ"

#ifndef MAX_PATH
#define MAX_PATH 16384
#endif

char aline[LINEBUFSIZE];
char prevline[LINEBUFSIZE];

/* Common typos.
 */
/* The table is terminated by an empty-string sentinel entry. */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};

/*
 * User-defined typos. main() fills this from USERTYPO_FILE when the
 * user-typo switch is set, storing one malloc'd copy per accepted line
 * and stopping at MAX_USER_TYPOS entries; the number of entries in use
 * is kept in usertypo_count (there is no sentinel entry).
 */
char *usertypo[MAX_USER_TYPOS];

/*
 * Common abbreviations and other OK words not to query as typos.
 * Terminated by an empty-string sentinel entry.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 * Terminated by an empty-string sentinel entry.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};

/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * Terminated by an empty-string sentinel entry.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
*/ char *noend[] = { "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl", "sw", "gr", "sl", "cl", "iy", "" }; char *markup[] = { "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em", "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub", "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", "" }; char *DPmarkup[] = { "", "", "/*", "*/", "/#", "#/", "/$", "$/", "", "" }; char *nocomma[] = { "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose", "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm", "during", "let", "toward", "among", "" }; char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or", "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether", "i'll", "whose", "who", "because", "when", "let", "till", "very", "an", "among", "those", "into", "whom", "having", "thence", "" }; char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; struct { char *htmlent; char *htmlnum; char *textent; } entities[] = { "&", "&", "&", "<", "<", "<", ">", ">", ">", "°", "°", " degrees", "£", "£", "L", """, """, "\"", /* quotation mark = APL quote */ "Œ", "Œ", "OE", /* latin capital ligature OE */ "œ", "œ", "oe", /* latin small ligature oe */ "Š", "Š", "S", /* latin capital letter S with caron */ "š", "š", "s", /* latin small letter s with caron */ "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */ "ˆ", "ˆ", "", /* modifier letter circumflex accent */ "˜", "˜", "~", /* small tilde, U+02DC ISOdia */ " ", " ", " ", /* en space, U+2002 ISOpub */ " ", " ", " ", /* em space, U+2003 ISOpub */ " ", " ", " ", /* thin space, U+2009 ISOpub */ "–", "–", "-", /* en dash, U+2013 ISOpub */ "—", "—", "--", /* em dash, U+2014 ISOpub */ "’", "’", "'", /* right single quotation mark */ "‚", 
"‚", "'", /* single low-9 quotation mark */ "“", "“", "\"", /* left double quotation mark */ "”", "”", "\"", /* right double quotation mark */ "„", "„", "\"", /* double low-9 quotation mark */ "‹", "‹", "\"", /* single left-pointing angle quotation mark */ "›", "›", "\"", /* single right-pointing angle quotation mark */ " ", " ", " ", /* no-break space = non-breaking space, */ "¡", "¡", "!", /* inverted exclamation mark */ "¢", "¢", "c", /* cent sign */ "£", "£", "L", /* pound sign */ "¤", "¤", "$", /* currency sign */ "¥", "¥", "Y", /* yen sign = yuan sign */ "§", "§", "--", /* section sign */ "¨", "¨", " ", /* diaeresis = spacing diaeresis */ "©", "©", "(C) ", /* copyright sign */ "ª", "ª", " ", /* feminine ordinal indicator */ "«", "«", "\"", /* left-pointing double angle quotation mark */ "­", "­", "-", /* soft hyphen = discretionary hyphen */ "®", "®", "(R) ", /* registered sign = registered trade mark sign */ "¯", "¯", " ", /* macron = spacing macron = overline */ "°", "°", " degrees", /* degree sign */ "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */ "²", "²", "2", /* superscript two = superscript digit two */ "³", "³", "3", /* superscript three = superscript digit three */ "´", "´", " ", /* acute accent = spacing acute */ "µ", "µ", "m", /* micro sign */ "¶", "¶", "--", /* pilcrow sign = paragraph sign */ "¸", "¸", " ", /* cedilla = spacing cedilla */ "¹", "¹", "1", /* superscript one = superscript digit one */ "º", "º", " ", /* masculine ordinal indicator */ "»", "»", "\"", /* right-pointing double angle quotation mark */ "¼", "¼", "1/4", /* vulgar fraction one quarter */ "½", "½", "1/2", /* vulgar fraction one half */ "¾", "¾", "3/4", /* vulgar fraction three quarters */ "¿", "¿", "?", /* inverted question mark */ "À", "À", "A", /* latin capital letter A with grave */ "Á", "Á", "A", /* latin capital letter A with acute */ "Â", "Â", "A", /* latin capital letter A with circumflex */ "Ã", "Ã", "A", /* latin capital letter A with tilde */ "Ä", "Ä", 
"A", /* latin capital letter A with diaeresis */ "Å", "Å", "A", /* latin capital letter A with ring above */ "Æ", "Æ", "AE", /* latin capital letter AE */ "Ç", "Ç", "C", /* latin capital letter C with cedilla */ "È", "È", "E", /* latin capital letter E with grave */ "É", "É", "E", /* latin capital letter E with acute */ "Ê", "Ê", "E", /* latin capital letter E with circumflex */ "Ë", "Ë", "E", /* latin capital letter E with diaeresis */ "Ì", "Ì", "I", /* latin capital letter I with grave */ "Í", "Í", "I", /* latin capital letter I with acute */ "Î", "Î", "I", /* latin capital letter I with circumflex */ "Ï", "Ï", "I", /* latin capital letter I with diaeresis */ "Ð", "Ð", "E", /* latin capital letter ETH */ "Ñ", "Ñ", "N", /* latin capital letter N with tilde */ "Ò", "Ò", "O", /* latin capital letter O with grave */ "Ó", "Ó", "O", /* latin capital letter O with acute */ "Ô", "Ô", "O", /* latin capital letter O with circumflex */ "Õ", "Õ", "O", /* latin capital letter O with tilde */ "Ö", "Ö", "O", /* latin capital letter O with diaeresis */ "×", "×", "*", /* multiplication sign */ "Ø", "Ø", "O", /* latin capital letter O with stroke */ "Ù", "Ù", "U", /* latin capital letter U with grave */ "Ú", "Ú", "U", /* latin capital letter U with acute */ "Û", "Û", "U", /* latin capital letter U with circumflex */ "Ü", "Ü", "U", /* latin capital letter U with diaeresis */ "Ý", "Ý", "Y", /* latin capital letter Y with acute */ "Þ", "Þ", "TH", /* latin capital letter THORN */ "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */ "à", "à", "a", /* latin small letter a with grave */ "á", "á", "a", /* latin small letter a with acute */ "â", "â", "a", /* latin small letter a with circumflex */ "ã", "ã", "a", /* latin small letter a with tilde */ "ä", "ä", "a", /* latin small letter a with diaeresis */ "å", "å", "a", /* latin small letter a with ring above */ "æ", "æ", "ae", /* latin small letter ae */ "ç", "ç", "c", /* latin small letter c with cedilla */ "è", "è", "e", /* latin 
small letter e with grave */ "é", "é", "e", /* latin small letter e with acute */ "ê", "ê", "e", /* latin small letter e with circumflex */ "ë", "ë", "e", /* latin small letter e with diaeresis */ "ì", "ì", "i", /* latin small letter i with grave */ "í", "í", "i", /* latin small letter i with acute */ "î", "î", "i", /* latin small letter i with circumflex */ "ï", "ï", "i", /* latin small letter i with diaeresis */ "ð", "ð", "eth", /* latin small letter eth */ "ñ", "ñ", "n", /* latin small letter n with tilde */ "ò", "ò", "o", /* latin small letter o with grave */ "ó", "ó", "o", /* latin small letter o with acute */ "ô", "ô", "o", /* latin small letter o with circumflex */ "õ", "õ", "o", /* latin small letter o with tilde */ "ö", "ö", "o", /* latin small letter o with diaeresis */ "÷", "÷", "/", /* division sign */ "ø", "ø", "o", /* latin small letter o with stroke */ "ù", "ù", "u", /* latin small letter u with grave */ "ú", "ú", "u", /* latin small letter u with acute */ "û", "û", "u", /* latin small letter u with circumflex */ "ü", "ü", "u", /* latin small letter u with diaeresis */ "ý", "ý", "y", /* latin small letter y with acute */ "þ", "þ", "th", /* latin small letter thorn */ "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */ "", "" }; /* special characters */ #define CHAR_SPACE 32 #define CHAR_TAB 9 #define CHAR_LF 10 #define CHAR_CR 13 #define CHAR_DQUOTE 34 #define CHAR_SQUOTE 39 #define CHAR_OPEN_SQUOTE 96 #define CHAR_TILDE 126 #define CHAR_ASTERISK 42 #define CHAR_FORESLASH 47 #define CHAR_CARAT 94 #define CHAR_UNDERSCORE '_' #define CHAR_OPEN_CBRACK '{' #define CHAR_CLOSE_CBRACK '}' #define CHAR_OPEN_RBRACK '(' #define CHAR_CLOSE_RBRACK ')' #define CHAR_OPEN_SBRACK '[' #define CHAR_CLOSE_SBRACK ']' /* longest and shortest normal PG line lengths */ #define LONGEST_PG_LINE 75 #define WAY_TOO_LONG 80 #define SHORTEST_PG_LINE 55 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */ /* D - ignore DP-specific markup */ /* E - echo queried line */ /* S - 
check single quotes */ /* T - check common typos */ /* P - require closure of quotes on */ /* every paragraph */ /* X - "Trust no one" :-) Paranoid! */ /* Queries everything */ /* L - line end checking defaults on */ /* -L turns it off */ /* O - overview. Just shows counts. */ /* Y - puts errors to stdout */ /* instead of stderr */ /* H - Echoes header fields */ /* M - Ignore markup in < > */ /* U - Use file of User-defined Typos*/ /* W - Defaults for use on Web upload*/ /* V - Verbose - list EVERYTHING! */ #define SWITNO 14 /* max number of switch parms */ /* - used for defining array-size */ #define MINARGS 1 /* minimum no of args excl switches */ #define MAXARGS 1 /* maximum no of args excl switches */ int pswit[SWITNO]; /* program switches set by SWITCHES */ #define ECHO_SWITCH 0 #define SQUOTE_SWITCH 1 #define TYPO_SWITCH 2 #define QPARA_SWITCH 3 #define PARANOID_SWITCH 4 #define LINE_END_SWITCH 5 #define OVERVIEW_SWITCH 6 #define STDOUT_SWITCH 7 #define HEADER_SWITCH 8 #define WEB_SWITCH 9 #define VERBOSE_SWITCH 10 #define MARKUP_SWITCH 11 #define USERTYPO_SWITCH 12 #define DP_SWITCH 13 long cnt_dquot; /* for overview mode, count of doublequote queries */ long cnt_squot; /* for overview mode, count of singlequote queries */ long cnt_brack; /* for overview mode, count of brackets queries */ long cnt_bin; /* for overview mode, count of non-ASCII queries */ long cnt_odd; /* for overview mode, count of odd character queries */ long cnt_long; /* for overview mode, count of long line errors */ long cnt_short; /* for overview mode, count of short line queries */ long cnt_punct; /* for overview mode, count of punctuation and spacing queries */ long cnt_dash; /* for overview mode, count of dash-related queries */ long cnt_word; /* for overview mode, count of word queries */ long cnt_html; /* for overview mode, count of html queries */ long cnt_lineend; /* for overview mode, count of line-end queries */ long cnt_spacend; /* count of lines with space at end */ long 
linecnt; /* count of total lines in the file */ long checked_linecnt; /* count of lines actually checked */ void proghelp(void); void procfile(char *); #define LOW_THRESHOLD 0 #define HIGH_THRESHOLD 1 #define START 0 #define END 1 #define PREV 0 #define NEXT 1 #define FIRST_OF_PAIR 0 #define SECOND_OF_PAIR 1 #define MAX_WORDPAIR 1000 char running_from[MAX_PATH]; int mixdigit(char *); char *getaword(char *,char *); int matchword(char *,char *); char *flgets(char *,int,FILE *,long); void lowerit(char *); int gcisalpha(unsigned char); int gcisdigit(unsigned char); int gcisletter(unsigned char); char *gcstrchr(char *s,char c); void postprocess_for_HTML(char *); char *linehasmarkup(char *); char *losemarkup(char *); int tagcomp(char *,char *); char *loseentities(char *); int isroman(char *); int usertypo_count; void postprocess_for_DP(char *); char wrk[LINEBUFSIZE]; #define MAX_QWORD 50 #define MAX_QWORD_LENGTH 40 char qword[MAX_QWORD][MAX_QWORD_LENGTH]; char qperiod[MAX_QWORD][MAX_QWORD_LENGTH]; signed int dupcnt[MAX_QWORD]; int main(int argc,char **argv) { char *argsw,*s; int i,switno,invarg; char usertypo_file[MAX_PATH]; FILE *usertypofile; if (strlen(argv[0])=running_from;s--) *s=0; switno=strlen(SWITCHES); for (i=switno;--i>0;) pswit[i]=0; /* initialise switches */ /* * Standard loop to extract switches. 
* When we come out of this loop, the arguments will be * in argv[0] upwards and the switches used will be * represented by their equivalent elements in pswit[] */ while (--argc>0 && **++argv=='-') for (argsw=argv[0]+1;*argsw!='\0';argsw++) for (i=switno,invarg=1;(--i>=0) && invarg==1;) if ((toupper(*argsw))==SWITCHES[i]) { invarg=0; pswit[i]=1; } /* Paranoid checking is turned OFF, not on, by its switch */ pswit[PARANOID_SWITCH]^=1; if (pswit[PARANOID_SWITCH]) /* if running in paranoid mode force typo checks as well */ pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1; /* Line-end checking is turned OFF, not on, by its switch */ pswit[LINE_END_SWITCH]^=1; /* Echoing is turned OFF, not on, by its switch */ pswit[ECHO_SWITCH]^=1; if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */ pswit[ECHO_SWITCH]=0; /* * Web uploads - for the moment, this is really just a placeholder * until we decide what processing we really want to do on web uploads */ if (pswit[WEB_SWITCH]) { /* specific override for web uploads */ pswit[ECHO_SWITCH]=1; pswit[SQUOTE_SWITCH]=0; pswit[TYPO_SWITCH]=1; pswit[QPARA_SWITCH]=0; pswit[PARANOID_SWITCH]=1; pswit[LINE_END_SWITCH]=0; pswit[OVERVIEW_SWITCH]=0; pswit[STDOUT_SWITCH]=0; pswit[HEADER_SWITCH]=1; pswit[VERBOSE_SWITCH]=0; pswit[MARKUP_SWITCH]=0; pswit[USERTYPO_SWITCH]=0; pswit[DP_SWITCH]=0; } if (argcMAXARGS) { /* check number of args */ proghelp(); return 1; } /* read in the user-defined stealth scanno list */ if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */ usertypofile=fopen(USERTYPO_FILE,"rb"); if (!usertypofile) { /* not in cwd. try excuteable directory. */ strcpy(usertypo_file,running_from); strcat(usertypo_file,USERTYPO_FILE); usertypofile=fopen(usertypo_file,"rb"); if (!usertypofile) { /* we ain't got no user typo file! */ printf(" --> I couldn't find gutcheck.typ " "-- proceeding without user typos.\n"); } } usertypo_count=0; if (usertypofile) { /* we managed to open a User Typo File! 
*/ if (pswit[USERTYPO_SWITCH]) { while (flgets(aline,LINEBUFSIZE-1,usertypofile, (long)usertypo_count)) { if (strlen(aline)>1) { if ((int)*aline>33) { s=malloc(strlen(aline)+1); if (!s) { fprintf(stderr,"bookloupe: cannot get enough " "memory for user typo file!\n"); exit(1); } strcpy(s,aline); usertypo[usertypo_count]=s; usertypo_count++; if (usertypo_count>=MAX_USER_TYPOS) { printf(" --> Only %d user-defined typos " "allowed: ignoring the rest\n", MAX_USER_TYPOS); break; } } } } } fclose(usertypofile); } } fprintf(stderr,"bookloupe: Check and report on an e-text\n"); cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long= cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend= cnt_spacend=0; procfile(argv[0]); if (pswit[OVERVIEW_SWITCH]) { printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n", checked_linecnt,linecnt,linecnt-checked_linecnt); printf(" --------------- Queries found --------------\n"); if (cnt_long) printf(" Long lines: %14ld\n",cnt_long); if (cnt_short) printf(" Short lines: %14ld\n",cnt_short); if (cnt_lineend) printf(" Line-end problems: %14ld\n",cnt_lineend); if (cnt_word) printf(" Common typos: %14ld\n",cnt_word); if (cnt_dquot) printf(" Unmatched quotes: %14ld\n",cnt_dquot); if (cnt_squot) printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot); if (cnt_brack) printf(" Unmatched brackets: %14ld\n",cnt_brack); if (cnt_bin) printf(" Non-ASCII characters: %14ld\n",cnt_bin); if (cnt_odd) printf(" Proofing characters: %14ld\n",cnt_odd); if (cnt_punct) printf(" Punctuation & spacing queries: %14ld\n",cnt_punct); if (cnt_dash) printf(" Non-standard dashes: %14ld\n",cnt_dash); if (cnt_html) printf(" Possible HTML tags: %14ld\n",cnt_html); printf("\n"); printf(" TOTAL QUERIES %14ld\n", cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+ cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend); } return 0; } struct first_pass_results { long firstline,astline; long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma; long 
fslashline,hyphens,longline,verylongline,htmcount,standalone_digit; long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash; signed int Dutchcount,Frenchcount; }; /* * first_pass: * * Run a first pass - verify that it's a valid PG * file, decide whether to report some things that * occur many times in the text like long or short * lines, non-standard dashes, etc. */ struct first_pass_results *first_pass(FILE *infile) { char laststart=CHAR_SPACE,*s; signed int i,llen; unsigned int lastlen=0,lastblen=0; long spline=0,nspline=0; static struct first_pass_results results={0}; char inword[MAXWORDLEN]=""; while (fgets(aline,LINEBUFSIZE-1,infile)) { while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13) aline[strlen(aline)-1]=0; linecnt++; if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") && (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT"))) { if (spline) printf(" --> Duplicate header?\n"); spline=linecnt+1; /* first line of non-header text, that is */ } if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG")) { if (nspline) printf(" --> Duplicate header?\n"); nspline=linecnt+1; /* first line of non-header text, that is */ } if (spline || nspline) { lowerit(aline); if (strstr(aline,"end") && strstr(aline,"project gutenberg")) { if (strstr(aline,"end") Duplicate footer?\n"); } else results.footerline=linecnt; } } } if (spline) results.firstline=spline; if (nspline) results.firstline=nspline; /* override with new */ if (results.footerline) continue; /* don't count the boilerplate in the footer */ llen=strlen(aline); results.totlen+=llen; for (i=0;i127) results.binlen++; if (gcisalpha(aline[i])) results.alphalen++; if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1])) results.endquote_count++; } if (strlen(aline)>2 && lastlen>2 && lastlen2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE) results.shortline++; if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE) cnt_spacend++; if (strstr(aline,".,")) 
results.dotcomma++; /* only count ast lines for ignoring purposes where there is */ /* locase text on the line */ if (strstr(aline,"*")) { for (s=aline;*s;s++) if (*s>='a' && *s<='z') break; if (*s) results.astline++; } if (strstr(aline,"/")) results.fslashline++; for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--) ; if (aline[i]=='-' && aline[i-1]!='-') results.hyphens++; if (llen>LONGEST_PG_LINE) results.longline++; if (llen>WAY_TOO_LONG) results.verylongline++; if (strstr(aline,"<") && strstr(aline,">")) { i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1); if (i>0) results.htmcount++; if (strstr(aline,"")) results.htmcount+=4; /* bonus marks! */ } /* Check for spaced em-dashes */ if (strstr(aline,"--")) { results.emdash++; if (*(strstr(aline,"--")-1)==CHAR_SPACE || (*(strstr(aline,"--")+2)==CHAR_SPACE)) results.space_emdash++; if (*(strstr(aline,"--")-1)==CHAR_SPACE && (*(strstr(aline,"--")+2)==CHAR_SPACE)) /* count of em-dashes with spaces both sides */ results.non_PG_space_emdash++; if (*(strstr(aline,"--")-1)!=CHAR_SPACE && (*(strstr(aline,"--")+2)!=CHAR_SPACE)) /* count of PG-type em-dashes with no spaces */ results.PG_space_emdash++; } for (s=aline;*s;) { s=getaword(s,inword); if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) results.Dutchcount++; if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) results.Frenchcount++; if (!strcmp(inword,"0") || !strcmp(inword,"1")) results.standalone_digit++; } /* Check for spaced dashes */ if (strstr(aline," -") && *(strstr(aline," -")+2)!='-') results.spacedash++; lastblen=lastlen; lastlen=strlen(aline); laststart=aline[0]; } return &results; } struct warnings { signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen; signed int endquote,isDutch,isFrench; }; /* * report_first_pass: * * Make some snap decisions based on the first pass results. 
*/ struct warnings *report_first_pass(struct first_pass_results *results) { static struct warnings warnings={0}; if (cnt_spacend>0) printf(" --> %ld lines in this file have white space at end\n", cnt_spacend); warnings.dotcomma=1; if (results->dotcomma>5) { warnings.dotcomma=0; printf(" --> %ld lines in this file contain '.,'. " "Not reporting them.\n",results->dotcomma); } /* * If more than 50 lines, or one-tenth, are short, * don't bother reporting them. */ warnings.shortline=1; if (results->shortline>50 || results->shortline*10>linecnt) { warnings.shortline=0; printf(" --> %ld lines in this file are short. " "Not reporting short lines.\n",results->shortline); } /* * If more than 50 lines, or one-tenth, are long, * don't bother reporting them. */ warnings.longline=1; if (results->longline>50 || results->longline*10>linecnt) { warnings.longline=0; printf(" --> %ld lines in this file are long. " "Not reporting long lines.\n",results->longline); } /* If more than 10 lines contain asterisks, don't bother reporting them. */ warnings.ast=1; if (results->astline>10) { warnings.ast=0; printf(" --> %ld lines in this file contain asterisks. " "Not reporting them.\n",results->astline); } /* * If more than 10 lines contain forward slashes, * don't bother reporting them. */ warnings.fslash=1; if (results->fslashline>10) { warnings.fslash=0; printf(" --> %ld lines in this file contain forward slashes. " "Not reporting them.\n",results->fslashline); } /* * If more than 20 lines contain unpunctuated endquotes, * don't bother reporting them. */ warnings.endquote=1; if (results->endquote_count>20) { warnings.endquote=0; printf(" --> %ld lines in this file contain unpunctuated endquotes. " "Not reporting them.\n",results->endquote_count); } /* * If more than 15 lines contain standalone digits, * don't bother reporting them. */ warnings.digit=1; if (results->standalone_digit>10) { warnings.digit=0; printf(" --> %ld lines in this file contain standalone 0s and 1s. 
" "Not reporting them.\n",results->standalone_digit); } /* * If more than 20 lines contain hyphens at end, * don't bother reporting them. */ warnings.hyphen=1; if (results->hyphens>20) { warnings.hyphen=0; printf(" --> %ld lines in this file have hyphens at end. " "Not reporting them.\n",results->hyphens); } if (results->htmcount>20 && !pswit[MARKUP_SWITCH]) { printf(" --> Looks like this is HTML. Switching HTML mode ON.\n"); pswit[MARKUP_SWITCH]=1; } if (results->verylongline>0) printf(" --> %ld lines in this file are VERY long!\n", results->verylongline); /* * If there are more non-PG spaced dashes than PG em-dashes, * assume it's deliberate. * Current PG guidelines say don't use them, but older texts do, * and some people insist on them whatever the guidelines say. */ warnings.dash=1; if (results->spacedash+results->non_PG_space_emdash> results->PG_space_emdash) { warnings.dash=0; printf(" --> There are %ld spaced dashes and em-dashes. " "Not reporting them.\n", results->spacedash+results->non_PG_space_emdash); } /* If more than a quarter of characters are hi-bit, bug out. */ warnings.bin=1; if (results->binlen*4>results->totlen) { printf(" --> This file does not appear to be ASCII. " "Terminating. Best of luck with it!\n"); exit(1); } if (results->alphalen*4totlen) { printf(" --> This file does not appear to be text. " "Terminating. Best of luck with it!\n"); exit(1); } if (results->binlen*100>results->totlen || results->binlen>100) { printf(" --> There are a lot of foreign letters here. 
" "Not reporting them.\n"); warnings.bin=0; } warnings.isDutch=0; if (results->Dutchcount>50) { warnings.isDutch=1; printf(" --> This looks like Dutch - " "switching off dashes and warnings for 's Middags case.\n"); } warnings.isFrench=0; if (results->Frenchcount>50) { warnings.isFrench=1; printf(" --> This looks like French - " "switching off some doublepunct.\n"); } if (results->firstline && results->footerline) printf(" The PG header and footer appear to be already on.\n"); else { if (results->firstline) printf(" The PG header is on - no footer.\n"); if (results->footerline) printf(" The PG footer is on - no header.\n"); } printf("\n"); if (pswit[VERBOSE_SWITCH]) { warnings.bin=1; warnings.shortline=1; warnings.dotcomma=1; warnings.longline=1; warnings.dash=1; warnings.digit=1; warnings.ast=1; warnings.fslash=1; warnings.hyphen=1; warnings.endquote=1; printf(" *** Verbose output is ON -- you asked for it! ***\n"); } if (warnings.isDutch) warnings.dash=0; if (results->footerline>0 && results->firstline>0 && results->footerline>results->firstline && results->footerline-results->firstline<100) { printf(" --> I don't really know where this text starts. \n"); printf(" There are no reference points.\n"); printf(" I'm going to have to report the header and footer " "as well.\n"); results->firstline=0; } return &warnings; } struct counters { long quot; signed int c_unders,c_brack,s_brack,r_brack; signed int open_single_quote,close_single_quote; }; /* * analyse_quotes: * * Look along the line, accumulate the count of quotes, and see * if this is an empty line - i.e. a line with nothing on it * but spaces. * If line has just spaces, period, * and/or - on it, don't * count it, since empty lines with asterisks or dashes to * separate sections are common. * * Returns: Non-zero if the line is empty. 
*/ int analyse_quotes(const char *s,struct counters *counters) { signed int guessquote=0; int isemptyline=1; /* assume the line is empty until proven otherwise */ while (*s) { if (*s==CHAR_DQUOTE) counters->quot++; if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) { if (s==aline) { /* * At start of line, it can only be an openquote. * Hardcode a very common exception! */ if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3)) counters->open_single_quote++; } else if (gcisalpha(s[-1]) && gcisalpha(s[1])) /* Do nothing! it's definitely an apostrophe, not a quote */ ; /* it's outside a word - let's check it out */ else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1])) { /* it damwell better BE an openquote */ if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3)) /* hardcode a very common exception! */ counters->open_single_quote++; } else { /* now - is it a closequote? */ guessquote=0; /* accumulate clues */ if (gcisalpha(s[-1])) { /* it follows a letter - could be either */ guessquote++; if (s[-1]=='s') { /* looks like a plural apostrophe */ guessquote-=3; if (s[1]==CHAR_SPACE) /* bonus marks! */ guessquote-=2; } } /* it doesn't have a letter either side */ else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1])) guessquote+=8; /* looks like a closequote */ else guessquote++; if (counters->open_single_quote>counters->close_single_quote) /* * Give it the benefit of some doubt, * if a squote is already open. */ guessquote++; else guessquote--; if (guessquote>=0) counters->close_single_quote++; } } if (*s!=CHAR_SPACE && *s!='-' && *s!='.' 
&& *s!=CHAR_ASTERISK && *s!=13 && *s!=10) isemptyline=0; /* ignore lines like * * * as spacers */ if (*s==CHAR_UNDERSCORE) counters->c_unders++; if (*s==CHAR_OPEN_CBRACK) counters->c_brack++; if (*s==CHAR_CLOSE_CBRACK) counters->c_brack--; if (*s==CHAR_OPEN_RBRACK) counters->r_brack++; if (*s==CHAR_CLOSE_RBRACK) counters->r_brack--; if (*s==CHAR_OPEN_SBRACK) counters->s_brack++; if (*s==CHAR_CLOSE_SBRACK) counters->s_brack--; s++; } return isemptyline; } /* * check_for_odd_characters: * * Check for binary and other odd characters. */ void check_for_odd_characters(const char *aline,const struct warnings *warnings, int isemptyline) { /* Don't repeat multiple warnings on one line. */ signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0; const char *s; unsigned char c; for (s=aline;*s;s++) { c=*(unsigned char *)s; if (!eNon_A && (*s127)) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) if (c>127 && c<160) printf(" Line %ld column %d - " "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c); else printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt,(int)(s-aline)+1,c); else cnt_bin++; eNon_A=1; } if (!eTab && *s==CHAR_TAB) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Tab character?\n", linecnt,(int)(s-aline)+1); else cnt_odd++; eTab=1; } if (!eTilde && *s==CHAR_TILDE) { /* * Often used by OCR software to indicate an * unrecognizable character. 
*/ if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Tilde character?\n", linecnt,(int)(s-aline)+1); else cnt_odd++; eTilde=1; } if (!eCarat && *s==CHAR_CARAT) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Carat character?\n", linecnt,(int)(s-aline)+1); else cnt_odd++; eCarat=1; } if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Forward slash?\n", linecnt,(int)(s-aline)+1); else cnt_odd++; eFSlash=1; } /* * Report asterisks only in paranoid mode, * since they're often deliberate. */ if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline && *s==CHAR_ASTERISK) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Asterisk?\n", linecnt,(int)(s-aline)+1); else cnt_odd++; eAst=1; } } } /* * check_for_long_line: * * Check for line too long. */ void check_for_long_line(const char *aline) { if (strlen(aline)>LONGEST_PG_LINE) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Long line %d\n", linecnt,strlen(aline),strlen(aline)); else cnt_long++; } } struct line_properties { unsigned int len,blen; char start; }; /* * check_for_short_line: * * Check for line too short. * * This one is a bit trickier to implement: we don't want to * flag the last line of a paragraph for being short, so we * have to wait until we know that our current line is a * "normal" line, then report the _previous_ line if it was too * short. We also don't want to report indented lines like * chapter heads or formatted quotations. We therefore keep * last->len as the length of the last line examined, and * last->blen as the length of the last but one, and try to * suppress unnecessary warnings by checking that both were of * "normal" length. 
We keep the first character of the last * line in last->start, and if it was a space, we assume that * the formatting is deliberate. I can't figure out a way to * distinguish something like a quoted verse left-aligned or * the header or footer of a letter from a paragraph of short * lines - maybe if I examined the whole paragraph, and if the * para has less than, say, 8 lines and if all lines are short, * then just assume it's OK? Need to look at some texts to see * how often a formula like this would get the right result. */ void check_for_short_line(const char *aline,const struct line_properties *last) { if (strlen(aline)>1 && last->len>1 && last->lenblen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",prevline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Short line %d?\n", linecnt-1,strlen(prevline),strlen(prevline)); else cnt_short++; } } /* * procfile: * * Process one file. */ void procfile(char *filename) { char *s,*t,*s1,*wordstart; char inword[MAXWORDLEN],testword[MAXWORDLEN]; char parastart[81]; /* first line of current para */ FILE *infile; struct first_pass_results *first_pass_results; struct warnings *warnings; struct counters counters={0}; struct line_properties last={0}; int isemptyline; long squot,start_para_line; signed int i,j,llen,isacro,isellipsis,istypo,alower; signed int dquotepar,squotepar; signed int isnewpara,vowel,consonant; char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80], cbrack_err[80],unders_err[80]; signed int qword_index,qperiod_index,isdup; signed int enddash; last.start=CHAR_SPACE; *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err= *unders_err=*prevline=0; linecnt=checked_linecnt=start_para_line=0; squot=0; i=llen=isacro=isellipsis=istypo=0; isnewpara=vowel=consonant=enddash=0; qword_index=qperiod_index=isdup=0; *inword=*testword=0; dquotepar=squotepar=0; for (j=0;jfirstline || (first_pass_results->footerline>0 && 
linecnt>first_pass_results->footerline)) { if (pswit[HEADER_SWITCH]) { if (!strncmp(aline,"Title:",6)) printf(" %s\n",aline); if (!strncmp(aline,"Author:",7)) printf(" %s\n",aline); if (!strncmp(aline,"Release Date:",13)) printf(" %s\n",aline); if (!strncmp(aline,"Edition:",8)) printf(" %s\n\n",aline); } continue; /* skip through the header */ } checked_linecnt++; s=aline; /* * If we are in a state of unbalanced quotes, and this line * doesn't begin with a quote, output the stored error message. * If the -P switch was used, print the warning even if the * new para starts with quotes. */ t=s; while (*t==' ') t++; if (*dquote_err) if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH]) { if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",parastart); printf(dquote_err); } else cnt_dquot++; } if (*squote_err) { if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) { if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",parastart); printf(squote_err); } else cnt_squot++; } squot=0; } if (*rbrack_err) { if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",parastart); printf(rbrack_err); } else cnt_brack++; } if (*sbrack_err) { if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",parastart); printf(sbrack_err); } else cnt_brack++; } if (*cbrack_err) { if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",parastart); printf(cbrack_err); } else cnt_brack++; } if (*unders_err) { if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",parastart); printf(unders_err); } else cnt_brack++; } *dquote_err=*squote_err=*rbrack_err=*cbrack_err= *sbrack_err=*unders_err=0; isemptyline=analyse_quotes(aline,&counters); if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph. */ start_para_line=linecnt; /* Capture its first line in case we want to report it later. 
*/ strncpy(parastart,aline,80); parastart[79]=0; dquotepar=squotepar=0; /* restart the quote count */ s=aline; while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++; if (*s>='a' && *s<='z') { /* and its first letter is lowercase */ if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - " "Paragraph starts with lower-case\n", linecnt,(int)(s-aline)+1); else cnt_punct++; } isnewpara=0; /* Signal the end of new para processing. */ } /* Check for an em-dash broken at line end. */ if (enddash && *aline=='-') { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column 1 - Broken em-dash?\n",linecnt); else cnt_punct++; } enddash=0; for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--) ; if (s>=aline && *s=='-') enddash=1; /* * Check for invalid or questionable characters in the line * Anything above 127 is invalid for plain ASCII, and * non-printable control characters should also be flagged. * Tabs should generally not be there. */ for (s=aline;*s;s++) { i=(unsigned char)*s; if (ibin) check_for_odd_characters(aline,warnings,isemptyline); if (warnings->longline) check_for_long_line(aline); if (warnings->shortline) check_for_short_line(aline,&last); last.blen=last.len; last.len=strlen(aline); last.start=aline[0]; /* Look for punctuation other than full ellipses at start of line. */ if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5)) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt); else cnt_punct++; } /* * Check for spaced em-dashes. * We must check _all_ occurrences of "--" on the line * hence the loop - even if the first double-dash is OK * there may be another that's wrong later on. 
*/ if (warnings->dash) { s=aline; while (strstr(s,"--")) { if (*(strstr(s,"--")-1)==CHAR_SPACE || (*(strstr(s,"--")+2)==CHAR_SPACE)) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Spaced em-dash?\n", linecnt,(int)(strstr(s,"--")-aline)+1); else cnt_dash++; } s=strstr(s,"--")+2; } } /* Check for spaced dashes. */ if (warnings->dash) { if (strstr(aline," -")) { if (*(strstr(aline," -")+2)!='-') { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Spaced dash?\n", linecnt,(int)(strstr(aline," -")-aline)+1); else cnt_dash++; } } else if (strstr(aline,"- ")) { if (*(strstr(aline,"- ")-1)!='-') { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Spaced dash?\n", linecnt,(int)(strstr(aline,"- ")-aline)+1); else cnt_dash++; } } } /* * Check for unmarked paragraphs indicated by separate speakers. * May well be false positive: * "Bravo!" "Wonderful!" called the crowd. * but useful all the same. */ s=wrk; *s=0; if (strstr(aline,"\" \"")) s=strstr(aline,"\" \""); if (strstr(aline,"\" \"")) s=strstr(aline,"\" \""); if (*s) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - " "Query missing paragraph break?\n", linecnt,(int)(s-aline)+1); else cnt_punct++; } /* * Check for "to he" and other easy he/be errors. * This is a very inadequate effort on the he/be problem, * but the phrase "to he" is always an error, whereas "to * be" is quite common. * Similarly, '"Quiet!", be said.' is a non-be error * "to he" is _not_ always an error!: * "Where they went to he couldn't say." * Another false positive: * What would "Cinderella" be without the . . . * and another: "If he wants to he can see for himself." 
*/ s=wrk; *s=0; if (strstr(aline," to he ")) s=strstr(aline," to he "); if (strstr(aline,"\" be ")) s=strstr(aline,"\" be "); if (strstr(aline,"\", be ")) s=strstr(aline,"\", be "); if (strstr(aline," is be ")) s=strstr(aline," is be "); if (strstr(aline," be is ")) s=strstr(aline," be is "); if (strstr(aline," was be ")) s=strstr(aline," was be "); if (strstr(aline," be would ")) s=strstr(aline," be would "); if (strstr(aline," be could ")) s=strstr(aline," be could "); if (*s) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Query he/be error?\n", linecnt,(int)(s-aline)+1); else cnt_word++; } s=wrk; *s=0; if (strstr(aline," i bad ")) s=strstr(aline," i bad "); if (strstr(aline," you bad ")) s=strstr(aline," you bad "); if (strstr(aline," he bad ")) s=strstr(aline," he bad "); if (strstr(aline," she bad ")) s=strstr(aline," she bad "); if (strstr(aline," they bad ")) s=strstr(aline," they bad "); if (strstr(aline," a had ")) s=strstr(aline," a had "); if (strstr(aline," the had ")) s=strstr(aline," the had "); if (*s) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Query had/bad error?\n", linecnt,(int)(s-aline)+1); else cnt_word++; } s=wrk; *s=0; if (strstr(aline,", hut ")) s=strstr(aline,", hut "); if (strstr(aline,"; hut ")) s=strstr(aline,"; hut "); if (*s) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Query hut/but error?\n", linecnt,(int)(s-aline)+1); else cnt_word++; } /* * Special case - angled bracket in front of "From" placed there by an * MTA when sending an e-mail. 
*/ if (strstr(aline,">From")) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - " "Query angled bracket with From\n", linecnt,(int)(strstr(aline,">From")-aline)+1); else cnt_punct++; } /* * Check for a single character line - * often an overflow from bad wrapping. */ if (*aline && !aline[1]) { if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' || gcisdigit(*aline)) ; /* Nothing - ignore numerals alone on a line. */ else { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column 1 - " "Query single character line\n",linecnt); else cnt_punct++; } } /* Check for I" - often should be ! */ if (strstr(aline," I\"")) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt,strstr(aline," I\"")-aline); else cnt_punct++; } /* * Check for period without a capital letter. Cut-down from gutspell. * Only works when it happens on a single line. */ if (pswit[PARANOID_SWITCH]) { for (t=s=aline;strstr(t,". ");) { t=strstr(t,". 
"); if (t==s) { t++; /* start of line punctuation is handled elsewhere */ continue; } if (!gcisalpha(t[-1])) { t++; continue; } if (warnings->isDutch) { /* For Frank & Jeroen -- 's Middags case */ if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' && t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z') { t++; continue; } } s1=t+2; while (*s1 && !gcisalpha(*s1) && !isdigit(*s1)) s1++; if (*s1>='a' && *s1<='z') { /* we have something to investigate */ istypo=1; /* so let's go back and find out */ for (s1=t-1;s1>=s && (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE && gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--) ; s1++; for (i=0;*s1 && *s1!='.';s1++,i++) testword[i]=*s1; testword[i]=0; for (i=0;*abbrev[i];i++) if (!strcmp(testword,abbrev[i])) istypo=0; if (gcisdigit(*testword)) istypo=0; if (!testword[1]) istypo=0; if (isroman(testword)) istypo=0; if (istypo) { istypo=0; for (i=0;testword[i];i++) if (strchr(vowels,testword[i])) istypo=1; } if (istypo) { isdup=0; if (strlen(testword)='a' && testword[i]<='z') alower=1; if (alower && testword[i]>='A' && testword[i]<='Z') { /* * We have an uppercase mid-word. However, there are * common cases: * Mac and Mc like McGill * French contractions like l'Abbe */ if (i==2 && testword[0]=='m' && testword[1]=='c' || i==3 && testword[0]=='m' && testword[1]=='a' && testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE) ; /* do nothing! */ else istypo=1; } testword[i]=(char)tolower(testword[i]); } /* * Check for certain unlikely two-letter combinations at word * start and end. */ if (strlen(testword)>1) { for (i=0;*nostart[i];i++) if (!strncmp(testword,nostart[i],2)) istypo=1; for (i=0;*noend[i];i++) if (!strncmp(testword+strlen(testword)-2,noend[i],2)) istypo=1; } /* ght is common, gbt never. Like that. 
*/ if (strstr(testword,"cb")) istypo=1; if (strstr(testword,"gbt")) istypo=1; if (strstr(testword,"pbt")) istypo=1; if (strstr(testword,"tbs")) istypo=1; if (strstr(testword,"mrn")) istypo=1; if (strstr(testword,"ahle")) istypo=1; if (strstr(testword,"ihle")) istypo=1; /* * "TBE" does happen - like HEARTBEAT - but uncommon. * Also "TBI" - frostbite, outbid - but uncommon. * Similarly "ii" like Hawaii, or Pompeii, and in Roman * numerals, but "ii" is a common scanno. */ if (strstr(testword,"tbi")) istypo=1; if (strstr(testword,"tbe")) istypo=1; if (strstr(testword,"ii")) istypo=1; /* * Check for no vowels or no consonants. * If none, flag a typo. */ if (!istypo && strlen(testword)>1) { vowel=consonant=0; for (i=0;testword[i];i++) { if (testword[i]=='y' || gcisdigit(testword[i])) { /* Yah, this is loose. */ vowel++; consonant++; } else if (strchr(vowels,testword[i])) vowel++; else consonant++; } if (!vowel || !consonant) istypo=1; } /* * Now exclude the word from being reported if it's in * the okword list. */ for (i=0;*okword[i];i++) if (!strcmp(testword,okword[i])) istypo=0; /* * What looks like a typo may be a Roman numeral. * Exclude these. */ if (istypo && isroman(testword)) istypo=0; /* Check the manual list of typos. */ if (!istypo) for (i=0;*typo[i];i++) if (!strcmp(testword,typo[i])) istypo=1; /* * Check lowercase s, l, i and m - special cases. * "j" - often a semi-colon gone wrong. * "d" for a missing apostrophe - he d * "n" for "in" */ if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword)) istypo=1; if (istypo) { isdup=0; if (strlen(testword)digit) { /* In paranoid mode, query all 0 and 1 standing alone. */ if (!strcmp(inword,"0") || !strcmp(inword,"1")) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Query standalone %s\n", linecnt,(int)(wordstart-aline)+2,inword); else cnt_word++; } } } /* * Look for added or missing spaces around punctuation and quotes. 
* If there is a punctuation character like ! with no space on * either side, suspect a missing!space. If there are spaces on * both sides , assume a typo. If we see a double quote with no * space or punctuation on either side of it, assume unspaced * quotes "like"this. */ llen=strlen(aline); for (i=1;i2 && aline[i-2]=='.') isacro=1; if (i+22 && aline[i-2]=='.') isellipsis=1; if (i+2aline && !gcisalpha(s[-1]) || !gcisalpha(s[1]))) { if (!(squotepar=!squotepar)) { /* parity even */ if (!strchr("_-.'`/\",;:!?)]} ",s[1])) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - " "Wrongspaced singlequotes?\n", linecnt,(int)(s-aline)+1); else cnt_punct++; } } else { /* parity odd */ if (!gcisalpha(s[1]) && !isdigit(s[1]) && !strchr("_-/\".'`",s[1]) || !s[1]) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - " "Wrongspaced singlequotes?\n", linecnt,(int)(s-aline)+1); else cnt_punct++; } } } } } /* * Look for double punctuation like ,. or ,, * Thanks to DW for the suggestion! * In books with references, ".," and ".;" are common * e.g. "etc., etc.," and vol. 1.; vol 3.; * OTOH, from my initial tests, there are also fairly * common errors. What to do? Make these cases paranoid? * ".," is the most common, so warnings->dotcomma is used * to suppress detailed reporting if it occurs often. */ llen=strlen(aline); for (i=0;idotcomma && aline[i]=='.' 
&& aline[i+1]==',' || warnings->isFrench && !strncmp(aline+i,",...",4) || warnings->isFrench && !strncmp(aline+i,"...,",4) || warnings->isFrench && !strncmp(aline+i,";...",4) || warnings->isFrench && !strncmp(aline+i,"...;",4) || warnings->isFrench && !strncmp(aline+i,":...",4) || warnings->isFrench && !strncmp(aline+i,"...:",4) || warnings->isFrench && !strncmp(aline+i,"!...",4) || warnings->isFrench && !strncmp(aline+i,"...!",4) || warnings->isFrench && !strncmp(aline+i,"?...",4) || warnings->isFrench && !strncmp(aline+i,"...?",4)) { if (warnings->isFrench && !strncmp(aline+i,",...",4) || warnings->isFrench && !strncmp(aline+i,"...,",4) || warnings->isFrench && !strncmp(aline+i,";...",4) || warnings->isFrench && !strncmp(aline+i,"...;",4) || warnings->isFrench && !strncmp(aline+i,":...",4) || warnings->isFrench && !strncmp(aline+i,"...:",4) || warnings->isFrench && !strncmp(aline+i,"!...",4) || warnings->isFrench && !strncmp(aline+i,"...!",4) || warnings->isFrench && !strncmp(aline+i,"?...",4) || warnings->isFrench && !strncmp(aline+i,"...?",4)) i+=4; ; /* do nothing for .. !! and ?? 
which can be legit */ } else { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Double punctuation?\n", linecnt,i+1); else cnt_punct++; } } } s=aline; while (strstr(s," \" ")) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Spaced doublequote?\n", linecnt,(int)(strstr(s," \" ")-aline+1)); else cnt_punct++; s=strstr(s," \" ")+2; } s=aline; while (strstr(s," ' ")) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Spaced singlequote?\n", linecnt,(int)(strstr(s," ' ")-aline+1)); else cnt_punct++; s=strstr(s," ' ")+2; } s=aline; while (strstr(s," ` ")) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Spaced singlequote?\n", linecnt,(int)(strstr(s," ` ")-aline+1)); else cnt_punct++; s=strstr(s," ` ")+2; } /* check special case of 'S instead of 's at end of word */ s=aline+1; while (*s) { if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z') { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Capital \"S\"?\n", linecnt,(int)(s-aline+2)); else cnt_punct++; } s++; } /* * Now check special cases - start and end of line - * for single and double quotes. Start is sometimes [sic] * but better to query it anyway. * While we're here, check for dash at end of line. 
*/ llen=strlen(aline); if (llen>1) { if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE || aline[llen-1]==CHAR_OPEN_SQUOTE) if (aline[llen-2]==CHAR_SPACE) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - Spaced quote?\n", linecnt,llen); else cnt_punct++; } if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) && aline[1]==CHAR_SPACE) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column 1 - Spaced quote?\n",linecnt); else cnt_punct++; } /* * Dash at end of line may well be legit - paranoid mode only * and don't report em-dash at line-end. */ if (pswit[PARANOID_SWITCH] && warnings->hyphen) { for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--) ; if (aline[i]=='-' && aline[i-1]!='-') { if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - " "Hyphen at end of line?\n",linecnt,i); } } } /* * Brackets are often unspaced, but shouldn't be surrounded by alpha. * If so, suspect a scanno like "a]most". */ llen=strlen(aline); for (i=1;iendquote) { for (i=1;i. * If there is a < in the line, followed at some point * by a > then we suspect HTML. */ if (strstr(aline,"<") && strstr(aline,">")) { i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1); if (i>0) { strncpy(wrk,strstr(aline,"<"),i); wrk[i]=0; if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - HTML Tag? %s \n", linecnt,(int)(strstr(aline,"<")-aline)+1,wrk); else cnt_html++; } } /* * Check for &symbol; HTML. * If there is a & in the line, followed at * some point by a ; then we suspect HTML. */ if (strstr(aline,"&") && strstr(aline,";")) { i=(int)(strstr(aline,";")-strstr(aline,"&")+1); for (s=strstr(aline,"&");s0) { strncpy(wrk,strstr(aline,"&"),i); wrk[i]=0; if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - HTML symbol? 
%s \n", linecnt,(int)(strstr(aline,"&")-aline)+1,wrk); else cnt_html++; } } /* * At end of paragraph, check for mismatched quotes. * We don't want to report an error immediately, since it is a * common convention to omit the quotes at end of paragraph if * the next paragraph is a continuation of the same speaker. * Where this is the case, the next para should begin with a * quote, so we store the warning message and only display it * at the top of the next iteration if the new para doesn't * start with a quote. * The -p switch overrides this default, and warns of unclosed * quotes on _every_ paragraph, whether the next begins with a * quote or not. */ if (isemptyline) { /* end of para - add up the totals */ if (counters.quot%2) sprintf(dquote_err," Line %ld - Mismatched quotes\n", linecnt); if (pswit[SQUOTE_SWITCH] && counters.open_single_quote && counters.open_single_quote!=counters.close_single_quote) sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt); if (pswit[SQUOTE_SWITCH] && counters.open_single_quote && counters.open_single_quote!=counters.close_single_quote && counters.open_single_quote!=counters.close_single_quote+1) /* * Flag it to be noted regardless of the * first char of the next para. */ squot=1; if (counters.r_brack) sprintf(rbrack_err," Line %ld - " "Mismatched round brackets?\n",linecnt); if (counters.s_brack) sprintf(sbrack_err," Line %ld - " "Mismatched square brackets?\n",linecnt); if (counters.c_brack) sprintf(cbrack_err," Line %ld - " "Mismatched curly brackets?\n",linecnt); if (counters.c_unders%2) sprintf(unders_err," Line %ld - Mismatched underscores?\n", linecnt); memset(&counters,0,sizeof(counters)); /* let the next iteration know that it's starting a new para */ isnewpara=1; } /* * Check for omitted punctuation at end of paragraph by working back * through prevline. DW. * Need to check this only for "normal" paras. * So what is a "normal" para? * Not normal if one-liner (chapter headings, etc.) 
* Not normal if doesn't contain at least one locase letter * Not normal if starts with space */ if (isemptyline) { /* end of para */ for (s=prevline,i=0;*s && !i;s++) if (gcisletter(*s)) /* use i to indicate the presence of a letter on the line */ i=1; /* * This next "if" is a problem. * If we say "start_para_line <= linecnt - 1", that includes * one-line "paragraphs" like chapter heads. Lotsa false positives. * If we say "start_para_line < linecnt - 1" it doesn't, but then it * misses genuine one-line paragraphs. */ if (i && last.blen>2 && start_para_lineCHAR_SPACE) { for (i=strlen(prevline)-1; (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) && prevline[i]>CHAR_SPACE && i>0; i--) ; for (;i>0;i--) { if (gcisalpha(prevline[i])) { if (pswit[ECHO_SWITCH]) printf("\n%s\n",prevline); if (!pswit[OVERVIEW_SWITCH]) printf(" Line %ld column %d - " "No punctuation at para end?\n", linecnt-1,strlen(prevline)); else cnt_punct++; break; } if (strchr("-.:!([{?}])",prevline[i])) break; } } } strcpy(prevline,aline); } fclose(infile); if (!pswit[OVERVIEW_SWITCH]) for (i=0;i='A' && *theline<='Z') *theline+=32; } /* * isroman: * * Is this word a Roman Numeral? * * It doesn't actually validate that the number is a valid Roman Numeral--for * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not * what we're here to do. If it passes this, it LOOKS like a Roman numeral. * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or * expressions thereof, except when it came to taxes. Allow any number of M, * an optional D, an optional CM or CD, any number of optional Cs, an optional * XL or an optional XC, an optional IX or IV, an optional V and any number * of optional Is. 
*/
int isroman(char *t)
{
    /*
     * The numeral is consumed left to right as a fixed sequence of
     * optional pieces; pieces marked "repeat" may occur any number of
     * times.  Whatever is left over decides the answer.
     */
    static const struct
    {
        const char *piece;
        int repeat;
    } grammar[]=
    {
        {"m",1},{"d",0},{"cm",0},{"cd",0},{"c",1},
        {"xl",0},{"xc",0},{"l",0},{"x",1},
        {"ix",0},{"iv",0},{"v",0},{"i",1}
    };
    const char *p=t;
    size_t k,n;
    if (p==NULL || *p=='\0')
        return 0;
    for (k=0;k<sizeof grammar/sizeof grammar[0];k++)
    {
        n=strlen(grammar[k].piece);
        do
        {
            if (strncmp(p,grammar[k].piece,n)!=0)
                break;
            p+=n;
        } while (grammar[k].repeat);
    }
    /* Only a fully consumed word looks like a Roman numeral. */
    return *p=='\0';
}

/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * If we use the standard function, 8-bit accented characters break
 * words, so that tete with accented characters appears to be two words, "t"
 * and "t", with 8-bit characters between them. This causes over-reporting of
 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
 *
 * Returns 1 for a letter, 0 otherwise.
 */
int gcisalpha(unsigned char c)
{
    switch (c)
    {
        /* The only codes below 192 (other than A-Z, a-z) taken as letters. */
        case 140: case 142: case 156: case 158: case 159:
            return 1;
        /* Codes from 192 upwards that are not taken as letters. */
        case 208: case 215: case 222: case 240: case 247: case 254:
            return 0;
        default:
            break;
    }
    if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
        return 1;
    return c>=192;
}

/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts.
 * Returns 1 for '0'..'9', 0 otherwise.
 */
int gcisdigit(unsigned char c)
{
    /* One unsigned comparison covers both ends of the range. */
    return (unsigned int)(c-'0')<=9u;
}

/*
 * gcisletter:
 *
 * A letter test that doesn't get confused in 8-bit texts: A-Z, a-z and
 * every code from 192 upwards count as letters.
 * NB: this is ISO-8859-1-specific.
 */
int gcisletter(unsigned char c)
{
    if (c>=192)
        return 1;
    if (c>='a' && c<='z')
        return 1;
    return c>='A' && c<='Z';
}

/*
 * gcstrchr:
 *
 * Wraps strchr to return NULL if the character being searched for is zero.
 */
char *gcstrchr(char *s,char c)
{
    return c?strchr(s,c):NULL;
}

/*
 * postprocess_for_DP:
 *
 * Invoked with the -d switch from flgets().
 * It simply "removes" from the line a hard-coded set of common
 * DP-specific tags, so that the line passed to the main routine has
 * been pre-cleaned of DP markup.
*/ void postprocess_for_DP(char *theline) { char *s,*t; int i; if (!*theline) return; for (i=0;*DPmarkup[i];i++) { s=strstr(theline,DPmarkup[i]); while (s) { t=s+strlen(DPmarkup[i]); while (*t) { *s=*t; t++; s++; } *s=0; s=strstr(theline,DPmarkup[i]); } } } /* * postprocess_for_HTML: * * Invoked with the -m switch from flgets(). * It simply "removes" from the line a hard-coded set of common * HTML tags and "replaces" a hard-coded set of common HTML * entities, so that the line passed to the main routine has * been pre-cleaned of HTML. */ void postprocess_for_HTML(char *theline) { if (strstr(theline,"<") && strstr(theline,">")) while (losemarkup(theline)) ; while (loseentities(theline)) ; } char *losemarkup(char *theline) { char *s,*t; int i; if (!*theline) return NULL; s=strstr(theline,"<"); t=strstr(theline,">"); if (!s || !t) return NULL; for (i=0;*markup[i];i++) if (!tagcomp(s+1,markup[i])) { if (!t[1]) { *s=0; return s; } else if (t>s) { strcpy(s,t+1); return s; } } /* It's an unrecognized . */ return NULL; } char *loseentities(char *theline) { int i; char *s,*t; if (!*theline) return NULL; for (i=0;*entities[i].htmlent;i++) { s=strstr(theline,entities[i].htmlent); if (s) { t=malloc((size_t)strlen(s)); if (!t) return NULL; strcpy(t,s+strlen(entities[i].htmlent)); strcpy(s,entities[i].textent); strcat(s,t); free(t); return theline; } } for (i=0;*entities[i].htmlnum;i++) { s=strstr(theline,entities[i].htmlnum); if (s) { t=malloc((size_t)strlen(s)); if (!t) return NULL; strcpy(t,s+strlen(entities[i].htmlnum)); strcpy(s,entities[i].textent); strcat(s,t); free(t); return theline; } } return NULL; } int tagcomp(char *strin,char *basetag) { char *s,*t; s=basetag; t=strin; if (*t=='/') t++; /* ignore a slash */ while (*s && *t) { if (tolower(*s)!=tolower(*t)) return 1; s++; t++; } return 0; } void proghelp() { fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr); fputs("Copyright 2000-2005 Jim Tinsley .\n",stderr); fputs("Copyright 2012- J. 
Ali Harlow .\n",stderr); fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. " "For details, read the file COPYING.\n",stderr); fputs("This is Free Software; " "you may redistribute it under certain conditions (GPL);\n",stderr); fputs("read the file COPYING for details.\n\n",stderr); fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr); fputs(" where -s checks single quotes, -e suppresses echoing lines, " "-t checks typos\n",stderr); fputs(" -x (paranoid) switches OFF -t and extra checks, " "-l turns OFF line-end checks\n",stderr); fputs(" -o just displays overview without detail, " "-h echoes header fields\n",stderr); fputs(" -v (verbose) unsuppresses duplicate reporting, " "-m suppresses markup\n",stderr); fputs(" -d ignores DP-specific markup,\n",stderr); fputs(" -u uses a file gutcheck.typ to query user-defined " "possible typos\n",stderr); fputs("Sample usage: bookloupe warpeace.txt \n",stderr); fputs("\n",stderr); fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n", stderr); fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; " "non-ASCII\n",stderr); fputs("characters like accented letters, " "lines longer than 75 or shorter than 55,\n",stderr); fputs("unbalanced quotes or brackets, " "a variety of badly formatted punctuation, \n",stderr); fputs("HTML tags, some likely typos. " "It is NOT a substitute for human judgement.\n",stderr); fputs("\n",stderr); }