# HG changeset patch # User ali # Date 1382809653 -3600 # Node ID f44c530f80da57947086d347e820a285e62bce92 # Parent ad92d11d59b812c1599d942452371f734742993f Fix bug #24: Accept alternate form of newline diff -r ad92d11d59b8 -r f44c530f80da bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Tue Oct 15 09:16:04 2013 +0100 +++ b/bookloupe/bookloupe.c Sat Oct 26 18:47:33 2013 +0100 @@ -183,7 +183,7 @@ gboolean mixdigit(const char *); gchar *getaword(const char **); -char *flgets(char **,long); +char *flgets(char **,long,int); void postprocess_for_HTML(char *); char *linehasmarkup(char *); char *losemarkup(char *); @@ -487,11 +487,40 @@ gchar *inword; QuoteClass qc; lines=g_strsplit(etext,"\n",0); + if (!lines[0]) + { + /* An empty etext has no terminators */ + results.newlines=DOS_NEWLINES; + } + else if (!lines[1]) + { + /* + * If there are no LFs, we don't have UNIX-style + * terminators, but we might have OS9-style ones. + */ + results.newlines=OS9_NEWLINES; + g_strfreev(lines); + lines=g_strsplit(etext,"\r",0); + if (!lines[0] || !lines[1]) + /* Looks like we don't have any terminators at all */ + results.newlines=DOS_NEWLINES; + } + else + { + /* We might have UNIX-style terminators */ + results.newlines=UNIX_NEWLINES; + } for (j=0;lines[j];j++) { lbytes=strlen(lines[j]); - while (lbytes>0 && lines[j][lbytes-1]=='\r') - lines[j][--lbytes]='\0'; + if (lbytes>0 && lines[j][lbytes-1]=='\r') + { + results.newlines=DOS_NEWLINES; + do + { + lines[j][--lbytes]='\0'; + } while (lbytes>0 && lines[j][lbytes-1]=='\r'); + } llen=g_utf8_strlen(lines[j],lbytes); linecnt++; if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") && @@ -633,6 +662,13 @@ struct warnings *report_first_pass(struct first_pass_results *results) { static struct warnings warnings={0}; + warnings.newlines=results->newlines; + if (warnings.newlines==UNIX_NEWLINES) + g_print(" --> No lines in this file have a CR. Not reporting them. 
" + "Project Gutenberg requires that all lineends be CR-LF.\n"); + else if (warnings.newlines==OS9_NEWLINES) + g_print(" --> No lines in this file have a LF. Not reporting them. " + "Project Gutenberg requires that all lineends be CR-LF.\n"); if (cnt_spacend>0) g_print(" --> %ld lines in this file have white space at end\n", cnt_spacend); @@ -2621,7 +2657,7 @@ */ linecnt=0; etext_ptr=etext; - while ((aline=flgets(&etext_ptr,linecnt+1))) + while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines))) { linecnt++; if (linecnt==1) @@ -2762,12 +2798,21 @@ /* * flgets: * - * Get one line from the input text, checking for - * the existence of exactly one CR/LF line-end per line. + * Get one line from the input text. The setting of newlines has the following + * effect: + * + * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line. + * + * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as + * the newline character. + * + * UNIX_NEWLINES: Check for the presence of CRs. + * + * In all cases, check that the last line is correctly terminated. * * Returns: a pointer to the line. 
*/ -char *flgets(char **etext,long lcnt) +char *flgets(char **etext,long lcnt,int newlines) { gunichar c; gboolean isCR=FALSE; @@ -2790,8 +2835,15 @@ g_free(s); } if (!pswit[OVERVIEW_SWITCH]) - /* There may, or may not, have been a CR */ - g_print(" Line %ld - No LF?\n",lcnt); + { + if (newlines==OS9_NEWLINES) + g_print(" Line %ld - No CR?\n",lcnt); + else + { + /* There may, or may not, have been a CR */ + g_print(" Line %ld - No LF?\n",lcnt); + } + } else cnt_lineend++; } @@ -2801,9 +2853,7 @@ /* either way, it's end of line */ if (c=='\n') { - if (isCR) - break; - else + if (newlines==DOS_NEWLINES && !isCR) { /* Error - a LF without a preceding CR */ if (pswit[LINE_END_SWITCH]) @@ -2819,14 +2869,15 @@ else cnt_lineend++; } - break; } + break; } if (c=='\r') { - if (isCR) + if (newlines==OS9_NEWLINES) + break; + if (isCR || newlines==UNIX_NEWLINES) { - /* Error - two successive CRs */ if (pswit[LINE_END_SWITCH]) { if (pswit[ECHO_SWITCH]) @@ -2836,12 +2887,22 @@ g_free(s); } if (!pswit[OVERVIEW_SWITCH]) - g_print(" Line %ld - Two successive CRs?\n",lcnt); + { + if (newlines==UNIX_NEWLINES) + g_print(" Line %ld column %ld - Embedded CR?\n", + lcnt,g_utf8_pointer_to_offset(theline,eos)+1); + else + g_print(" Line %ld - Two successive CRs?\n", + lcnt); + } else cnt_lineend++; } + if (newlines==UNIX_NEWLINES) + *eos=' '; } - isCR=TRUE; + if (newlines==DOS_NEWLINES) + isCR=TRUE; } else { diff -r ad92d11d59b8 -r f44c530f80da bookloupe/bookloupe.h --- a/bookloupe/bookloupe.h Tue Oct 15 09:16:04 2013 +0100 +++ b/bookloupe/bookloupe.h Sat Oct 26 18:47:33 2013 +0100 @@ -58,6 +58,12 @@ SWITNO }; +enum { + DOS_NEWLINES, + UNIX_NEWLINES, + OS9_NEWLINES, +}; + struct dash_results { long base,space,non_PG_space,PG_space; }; @@ -68,12 +74,13 @@ long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit; long spacedash; struct dash_results emdash; + int newlines; int Dutchcount,Frenchcount; }; struct warnings { int 
shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen; - int endquote; + int endquote,newlines; gboolean isDutch,isFrench; }; diff -r ad92d11d59b8 -r f44c530f80da doc/loupe-test.txt --- a/doc/loupe-test.txt Tue Oct 15 09:16:04 2013 +0100 +++ b/doc/loupe-test.txt Sat Oct 26 18:47:33 2013 +0100 @@ -91,14 +91,35 @@ ------------------ One of the tests that bookloupe/gutcheck need to do is check that all -lines are ended with CR NL (as required by PG) rather than the UNIX -standard NL. loupe-test deliberately ignores the line endings in testcase -definition files and always uses CR NL. Thus there is needed a means +lines are ended with CR LF (as required by PG) rather than the UNIX +standard LF. loupe-test deliberately ignores the line endings in testcase +definition files and uses the expected CR LF. Thus there is needed a means to embed a linefeed (aka newline) character into the input to be sent to bookloupe/gutcheck to test that it correctly identified the problem. loupe-test recognises the unicode symbol for linefeed (U+240A): ␊ which can be used for this purpose instead of a normal newline. +UNIX-style newlines +------------------- + +To make life easier for users on UNIX and similar platforms, bookloupe +recognises the case of all lines terminated with UNIX-style newlines. +It notes this in the summary but does not issue any warnings. We thus +need some way to test this case which we do by the NEWLINES tag: + + ┌──────────────────────────────────────────────────────────────────────────┐ + │**************** NEWLINES **************** │ + │LF │ + │**************** INPUT **************** │ + │Katherine was assailed by a sudden doubt. Had she mailed that letter? Yes,│ + │she was certain of that. She had run out to the mail box at ten o'clock │ + │at night especially to mail it. What had gone wrong? Why wasn't there │ + │someone to meet her? 
│ + └──────────────────────────────────────────────────────────────────────────┘ + +The possible options are CRLF for DOS-style newlines (the default), +LF for UNIX-style newlines and CR for OS9-style newlines. + Passing command line options ---------------------------- @@ -203,3 +224,16 @@ this, eg.: sample: PASS (with 1 of 1 false positives and 1 of 1 false negatives) + +The summary +----------- + +As part of the header (the first section of output), bookloupe may display +a number of summary lines. These are characterized by a leading ASCII +long arrow (-->) and generally say something about the ebook as a whole +rather than individual lines. Where it is desired to test for the presence +of a summary line, a "summary" node can be included within the "expected" +node of a testcase using structured warnings. The "summary" node can contain +one or more "text" nodes which indicate the text of lines that must be +present in the summary section in order for the test to pass. No account is +taken of the order of such lines and other summary lines may also be present. diff -r ad92d11d59b8 -r f44c530f80da test/bookloupe/Makefile.am --- a/test/bookloupe/Makefile.am Tue Oct 15 09:16:04 2013 +0100 +++ b/test/bookloupe/Makefile.am Sat Oct 26 18:47:33 2013 +0100 @@ -1,6 +1,6 @@ TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \ runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \ - emdash.tst footnote-marker.tst + emdash.tst footnote-marker.tst unix-lineends.tst os9-lineends.tst dist_pkgdata_DATA=$(TESTS) diff -r ad92d11d59b8 -r f44c530f80da test/bookloupe/os9-lineends.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/os9-lineends.tst Sat Oct 26 18:47:33 2013 +0100 @@ -0,0 +1,13 @@ +**************** NEWLINES **************** +CR +**************** INPUT **************** +Katherine was assailed by a sudden doubt. Had she mailed that letter? 
Yes, +she was certain of that. She had run out to the mail box at ten o'clock +at night especially to mail it. What had gone wrong? Why wasn't there +someone to meet her? +**************** WARNINGS **************** + + + No lines in this file have a LF. Not reporting them. Project Gutenberg requires that all lineends be CR-LF. + + diff -r ad92d11d59b8 -r f44c530f80da test/bookloupe/unix-lineends.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/unix-lineends.tst Sat Oct 26 18:47:33 2013 +0100 @@ -0,0 +1,17 @@ +**************** NEWLINES **************** +LF +**************** INPUT **************** +Katherine was assailed by a sudden doubt. Had she mailed that letter? Yes, +she was certain of that. She had run out to the mail box at ten o'clock +at night especially to mail it. What had gone wrong?␍Why wasn't there +someone to meet her? +**************** WARNINGS **************** + + + No lines in this file have a CR. Not reporting them. Project Gutenberg requires that all lineends be CR-LF. + + + + Embedded CR? + + diff -r ad92d11d59b8 -r f44c530f80da test/harness/testcase.c --- a/test/harness/testcase.c Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/testcase.c Sat Oct 26 18:47:33 2013 +0100 @@ -326,6 +326,42 @@ } /* + * Check the summary produced by bookloupe against testcase->summary. 
+ */ +static gboolean testcase_check_summary(Testcase *testcase,const char *summary) +{ + int i; + gboolean r; + gchar **lines; + GSList *texts,*lnk; + if (!testcase->summary.texts) + return TRUE; + texts=g_slist_copy(testcase->summary.texts); + lines=g_strsplit(summary,"\n",0); + for(i=0;lines[i];i++) + { + if (!g_str_has_prefix(lines[i]," --> ")) + continue; + for(lnk=texts;lnk;lnk=lnk->next) + if (!strcmp(lines[i]+7,lnk->data)) + { + texts=g_slist_delete_link(texts,lnk); + break; + } + } + g_strfreev(lines); + r=!texts; + if (texts) + { + g_print("%s: FAIL\n",testcase->basename); + g_print("Missing summary text from bookloupe:\n"); + g_print(" --> %s\n",texts->data); + } + g_slist_free(texts); + return r; +} + +/* * Check the warnings produced by bookloupe against either the * unstructured testcase->expected or the structured testcase->warnings * as appropriate. @@ -460,7 +496,7 @@ gboolean r; size_t pos,offset; GString *header; - char *output,*filename,*s,*xfail=NULL; + char *output,*filename,*s,*summary,*xfail=NULL; GError *error=NULL; if (!testcase_create_input_files(testcase,&error)) { @@ -500,10 +536,15 @@ pos=header->len; if (r) { - /* Skip the summary */ + /* Find the end of the summary */ s=strstr(output+pos,"\n\n"); if (s) + { + summary=g_strndup(output+pos,s-(output+pos)); + r=testcase_check_summary(testcase,summary); + g_free(summary); pos=s-output+2; + } else { g_print("%s: FAIL\n",testcase->basename); @@ -512,7 +553,8 @@ } } g_string_free(header,TRUE); - r=testcase_check_warnings(testcase,output+pos,&xfail); + if (r) + r=testcase_check_warnings(testcase,output+pos,&xfail); g_free(filename); g_free(output); if (r) diff -r ad92d11d59b8 -r f44c530f80da test/harness/testcase.h --- a/test/harness/testcase.h Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/testcase.h Sat Oct 26 18:47:33 2013 +0100 @@ -15,6 +15,10 @@ } TestcaseLocation; typedef struct { + GSList *texts; +} TestcaseSummary; + +typedef struct { /* * Does this warning relate to a real 
problem in the etext * (eg., error and false-negative). @@ -38,12 +42,15 @@ char *tmpdir; GSList *inputs; char *expected; + TestcaseSummary summary; GSList *warnings; char *encoding; /* The character encoding to talk to BOOKLOUPE in */ char **options; enum { TESTCASE_XFAIL=1<<0, TESTCASE_TMP_DIR=1<<1, + TESTCASE_UNIX_NEWLINES=1<<2, + TESTCASE_OS9_NEWLINES=1<<3, } flags; } Testcase; diff -r ad92d11d59b8 -r f44c530f80da test/harness/testcaseinput.c --- a/test/harness/testcaseinput.c Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/testcaseinput.c Sat Oct 26 18:47:33 2013 +0100 @@ -32,10 +32,10 @@ } /* - * Replace \n with \r\n, U+240A (visible symbol for LF) with \n - * and U+240D (visible symbol for CR) with \r. + * Replace \n with requested newline, U+240A (visible symbol for LF) + * with \n and U+240D (visible symbol for CR) with \r. */ -static char *unix2dos(const char *text) +static char *convert_newlines(const char *text,int flags) { gunichar c; const gunichar visible_lf=0x240A; @@ -46,8 +46,13 @@ { c=g_utf8_get_char(text); text=g_utf8_next_char(text); - if (c=='\n') - g_string_append(string,"\r\n"); + if (c=='\n' && !(flags&TESTCASE_UNIX_NEWLINES)) + { + if (flags&TESTCASE_OS9_NEWLINES) + g_string_append_c(string,'\r'); + else + g_string_append(string,"\r\n"); + } else if (c==visible_lf) g_string_append_c(string,'\n'); else if (c==visible_cr) @@ -76,7 +81,7 @@ { if (testcase->encoding) { - t=unix2dos(input->contents); + t=convert_newlines(input->contents,testcase->flags); s=g_convert(t,-1,testcase->encoding,"UTF-8",NULL,&n,&tmp_err); g_free(t); if (!s) @@ -88,7 +93,7 @@ } else { - s=unix2dos(input->contents); + s=convert_newlines(input->contents,testcase->flags); n=strlen(s); } } diff -r ad92d11d59b8 -r f44c530f80da test/harness/testcaseio.c --- a/test/harness/testcaseio.c Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/testcaseio.c Sat Oct 26 18:47:33 2013 +0100 @@ -22,7 +22,7 @@ GError *err=NULL; char *s,*arg; const char *tag,*text; - gboolean 
found_tag=FALSE; + gboolean found_tag=FALSE,newlines_set=FALSE; parser=testcase_parser_new_from_file(filename); if (!parser) return NULL; @@ -88,6 +88,26 @@ } else if (!testcase->encoding && !strcmp(tag,"ENCODING")) testcase->encoding=g_strchomp(g_strdup(text)); + else if (!newlines_set && !strcmp(tag,"NEWLINES")) + { + newlines_set=TRUE; + s=g_strdup(text); + g_strchomp(s); + if (!strcmp(s,"LF")) + testcase->flags|=TESTCASE_UNIX_NEWLINES; + else if (!strcmp(s,"CR")) + testcase->flags|=TESTCASE_OS9_NEWLINES; + else if (strcmp(s,"CRLF")) + { + g_printerr( + "%s: Unrecognised style for newlines. Try CR or LF.\n",s); + g_free(s); + testcase_free(testcase); + testcase_parser_free(parser); + return NULL; + } + g_free(s); + } else if (!testcase->encoding && !strcmp(tag,"OPTIONS")) { testcase->options=g_strsplit(text,"\n",0); diff -r ad92d11d59b8 -r f44c530f80da test/harness/warningsparser.c --- a/test/harness/warningsparser.c Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/warningsparser.c Sat Oct 26 18:47:33 2013 +0100 @@ -15,11 +15,12 @@ enum { WARNINGS_INIT, WARNINGS_IN_EXPECTED, + WARNINGS_IN_SUMMARY, WARNINGS_IN_WARNING, WARNINGS_IN_AT, WARNINGS_IN_TEXT, WARNINGS_DONE, - } state; + } state,parent_state; } WarningsBaton; static void warnings_parser_start_element(GMarkupParseContext *context, @@ -30,6 +31,7 @@ guint64 tmp; char *endp; WarningsBaton *baton=user_data; + baton->parent_state=baton->state; switch(baton->state) { case WARNINGS_INIT: @@ -45,20 +47,36 @@ baton->state=WARNINGS_IN_EXPECTED; break; case WARNINGS_IN_EXPECTED: - baton->warning=g_new0(TestcaseWarning,1); - if (!strcmp(element_name,"error")) - baton->warning->is_real=TRUE; - else if (!strcmp(element_name,"false-positive")) - baton->warning->xfail=TRUE; - else if (!strcmp(element_name,"false-negative")) - baton->warning->is_real=baton->warning->xfail=TRUE; + if (!strcmp(element_name,"summary")) + { + if (baton->testcase->summary.texts) + { + g_set_error(error,G_MARKUP_ERROR, + 
G_MARKUP_ERROR_INVALID_CONTENT,"Multiple summary " + "elements are not valid"); + } + else + baton->state=WARNINGS_IN_SUMMARY; + } else { - g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_UNKNOWN_ELEMENT, - "Unknown element in 'expected': '%s'",element_name); - g_free(baton->warning); - baton->warning=NULL; - return; + baton->warning=g_new0(TestcaseWarning,1); + if (!strcmp(element_name,"error")) + baton->warning->is_real=TRUE; + else if (!strcmp(element_name,"false-positive")) + baton->warning->xfail=TRUE; + else if (!strcmp(element_name,"false-negative")) + baton->warning->is_real=baton->warning->xfail=TRUE; + else + { + g_set_error(error,G_MARKUP_ERROR, + G_MARKUP_ERROR_UNKNOWN_ELEMENT, + "Unknown element in 'expected': '%s'",element_name); + g_free(baton->warning); + baton->warning=NULL; + return; + } + baton->state=WARNINGS_IN_WARNING; } if (attribute_names[0]) { @@ -66,12 +84,28 @@ G_MARKUP_ERROR_UNKNOWN_ATTRIBUTE, "Unknown attribute on element '%s': '%s'",element_name, attribute_names[0]); - g_free(baton->warning); - baton->warning=NULL; + if (baton->state==WARNINGS_IN_WARNING) + { + g_free(baton->warning); + baton->warning=NULL; + } + baton->state=WARNINGS_IN_EXPECTED; return; } - else - baton->state=WARNINGS_IN_WARNING; + break; + case WARNINGS_IN_SUMMARY: + if (!strcmp(element_name,"text")) + { + if (attribute_names[0]) + { + g_set_error(error,G_MARKUP_ERROR, + G_MARKUP_ERROR_UNKNOWN_ATTRIBUTE, + "Unknown attribute on element 'text': '%s'", + attribute_names[0]); + return; + } + baton->state=WARNINGS_IN_TEXT; + } break; case WARNINGS_IN_WARNING: if (!strcmp(element_name,"at")) @@ -162,6 +196,15 @@ g_slist_reverse(baton->testcase->warnings); baton->state=WARNINGS_DONE; break; + case WARNINGS_IN_SUMMARY: + if (!baton->testcase->summary.texts) + g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT, + "Summary element must contain at least one text element"); + else + baton->testcase->summary.texts= + 
g_slist_reverse(baton->testcase->summary.texts); + baton->state=WARNINGS_IN_EXPECTED; + break; case WARNINGS_IN_WARNING: baton->warning->locations= g_slist_reverse(baton->warning->locations); @@ -177,7 +220,7 @@ baton->state=WARNINGS_IN_WARNING; break; case WARNINGS_IN_TEXT: - baton->state=WARNINGS_IN_WARNING; + baton->state=baton->parent_state; break; default: g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_UNKNOWN_ELEMENT, @@ -198,6 +241,11 @@ g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT, "The 'expected' tag does not take any content"); break; + case WARNINGS_IN_SUMMARY: + if (strspn(text," \t\n")!=text_len) + g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT, + "The summary tags do not take any content"); + break; case WARNINGS_IN_WARNING: if (strspn(text," \t\n")!=text_len) g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT, @@ -211,7 +259,10 @@ case WARNINGS_IN_TEXT: s=g_strdup(text+strspn(text," \t\n")); g_strchomp(s); - if (baton->warning->text) + if (baton->parent_state==WARNINGS_IN_SUMMARY) + baton->testcase->summary.texts= + g_slist_prepend(baton->testcase->summary.texts,s); + else if (baton->warning->text) { t=g_strconcat(baton->warning->text,s,NULL); g_free(baton->warning->text); @@ -237,6 +288,7 @@ parser.text=warnings_parser_text; baton=g_new0(WarningsBaton,1); baton->testcase=testcase; + baton->parent_state=WARNINGS_INIT; baton->state=WARNINGS_INIT; return g_markup_parse_context_new(&parser, G_MARKUP_TREAT_CDATA_AS_TEXT|G_MARKUP_PREFIX_ERROR_POSITION,