# HG changeset patch # User ali # Date 1382299585 -3600 # Node ID cd3068704d3ac2753cd9e7a3703689acb4fe06a2 # Parent ad92d11d59b812c1599d942452371f734742993f Fix bug #24: Accept alternate form of newline diff -r ad92d11d59b8 -r cd3068704d3a bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Tue Oct 15 09:16:04 2013 +0100 +++ b/bookloupe/bookloupe.c Sun Oct 20 21:06:25 2013 +0100 @@ -183,7 +183,7 @@ gboolean mixdigit(const char *); gchar *getaword(const char **); -char *flgets(char **,long); +char *flgets(char **,long,gboolean); void postprocess_for_HTML(char *); char *linehasmarkup(char *); char *losemarkup(char *); @@ -487,11 +487,20 @@ gchar *inword; QuoteClass qc; lines=g_strsplit(etext,"\n",0); + if (lines[0]) + /* If there's at least one line, we might have UNIX-style terminators */ + results.unix_lineends=TRUE; for (j=0;lines[j];j++) { lbytes=strlen(lines[j]); - while (lbytes>0 && lines[j][lbytes-1]=='\r') - lines[j][--lbytes]='\0'; + if (lbytes>0 && lines[j][lbytes-1]=='\r') + { + results.unix_lineends=FALSE; + do + { + lines[j][--lbytes]='\0'; + } while (lbytes>0 && lines[j][lbytes-1]=='\r'); + } llen=g_utf8_strlen(lines[j],lbytes); linecnt++; if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") && @@ -633,6 +642,13 @@ struct warnings *report_first_pass(struct first_pass_results *results) { static struct warnings warnings={0}; + warnings.nocr=1; + if (results->unix_lineends) + { + warnings.nocr=0; + g_print(" --> No lines in this file have a CR. Not reporting them. " + "Project Gutenberg requires that all lineends be CR-LF.\n"); + } if (cnt_spacend>0) g_print(" --> %ld lines in this file have white space at end\n", cnt_spacend); @@ -2621,7 +2637,7 @@ */ linecnt=0; etext_ptr=etext; - while ((aline=flgets(&etext_ptr,linecnt+1))) + while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr))) { linecnt++; if (linecnt==1) @@ -2767,7 +2783,7 @@ * * Returns: a pointer to the line. 
*/ -char *flgets(char **etext,long lcnt) +char *flgets(char **etext,long lcnt,gboolean warn_nocr) { gunichar c; gboolean isCR=FALSE; @@ -2806,7 +2822,7 @@ else { /* Error - a LF without a preceding CR */ - if (pswit[LINE_END_SWITCH]) + if (pswit[LINE_END_SWITCH] && warn_nocr) { if (pswit[ECHO_SWITCH]) { diff -r ad92d11d59b8 -r cd3068704d3a bookloupe/bookloupe.h --- a/bookloupe/bookloupe.h Tue Oct 15 09:16:04 2013 +0100 +++ b/bookloupe/bookloupe.h Sun Oct 20 21:06:25 2013 +0100 @@ -69,12 +69,13 @@ long spacedash; struct dash_results emdash; int Dutchcount,Frenchcount; + gboolean unix_lineends; }; struct warnings { int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen; int endquote; - gboolean isDutch,isFrench; + gboolean isDutch,isFrench,nocr; }; struct line_properties { diff -r ad92d11d59b8 -r cd3068704d3a doc/loupe-test.txt --- a/doc/loupe-test.txt Tue Oct 15 09:16:04 2013 +0100 +++ b/doc/loupe-test.txt Sun Oct 20 21:06:25 2013 +0100 @@ -91,14 +91,35 @@ ------------------ One of the tests that bookloupe/gutcheck need to do is check that all -lines are ended with CR NL (as required by PG) rather than the UNIX -standard NL. loupe-test deliberately ignores the line endings in testcase -definition files and always uses CR NL. Thus there is needed a means +lines are ended with CR LF (as required by PG) rather than the UNIX +standard LF. loupe-test deliberately ignores the line endings in testcase +definition files and uses the expected CR LF. Thus there is needed a means to embed a linefeed (aka newline) character into the input to be sent to bookloupe/gutcheck to test that it correctly identified the problem. loupe-test recognises the unicode symbol for linefeed (U+240A): ␊ which can be used for this purpose instead of a normal newline. +UNIX-style newlines +------------------- + +To make life easier for users on UNIX and similar platforms, bookloupe +recognises the case of all lines terminated with UNIX-style newlines. 
+It notes this in the summary but does not issue any warnings. We thus +need some way to test this case which we do by the NEWLINES tag: + + ┌──────────────────────────────────────────────────────────────────────────┐ + │**************** NEWLINES **************** │ + │LF │ + │**************** INPUT **************** │ + │Katherine was assailed by a sudden doubt. Had she mailed that letter? Yes,│ + │she was certain of that. She had run out to the mail box at ten o'clock │ + │at night especially to mail it. What had gone wrong? Why wasn't there │ + │someone to meet her? │ + └──────────────────────────────────────────────────────────────────────────┘ + +The possible options are CRLF for DOS-style newlines (the default) and +LF for UNIX-style newlines. + Passing command line options ---------------------------- @@ -203,3 +224,16 @@ this, eg.: sample: PASS (with 1 of 1 false positives and 1 of 1 false negatives) + +The summary +----------- + +As part of the header (the first section of output), bookloupe may display +a number of summary lines. These are characterized by a leading ASCII +long arrow (-->) and generally say something about the ebook as a whole +rather than individual lines. Where it is desired to test for the presence +of a summary line, a "summary" node can be included within the "expected" +node of a testcase using structured warnings. The "summary" node can contain +one or more "text" nodes which indicate the text of lines that must be +present in the summary section in order for the test to pass. No account is +taken of the order of such lines and other summary lines may also be present. 
diff -r ad92d11d59b8 -r cd3068704d3a test/bookloupe/Makefile.am --- a/test/bookloupe/Makefile.am Tue Oct 15 09:16:04 2013 +0100 +++ b/test/bookloupe/Makefile.am Sun Oct 20 21:06:25 2013 +0100 @@ -1,6 +1,6 @@ TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \ runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \ - emdash.tst footnote-marker.tst + emdash.tst footnote-marker.tst unix-lineends.tst dist_pkgdata_DATA=$(TESTS) diff -r ad92d11d59b8 -r cd3068704d3a test/bookloupe/unix-lineends.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/unix-lineends.tst Sun Oct 20 21:06:25 2013 +0100 @@ -0,0 +1,13 @@ +**************** NEWLINES **************** +LF +**************** INPUT **************** +Katherine was assailed by a sudden doubt. Had she mailed that letter? Yes, +she was certain of that. She had run out to the mail box at ten o'clock +at night especially to mail it. What had gone wrong? Why wasn't there +someone to meet her? +**************** WARNINGS **************** +<expected> + <summary> + <text>No lines in this file have a CR. Not reporting them. Project Gutenberg requires that all lineends be CR-LF.</text> + </summary> +</expected> diff -r ad92d11d59b8 -r cd3068704d3a test/harness/testcase.c --- a/test/harness/testcase.c Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/testcase.c Sun Oct 20 21:06:25 2013 +0100 @@ -326,6 +326,42 @@ } /* + * Check the summary produced by bookloupe against testcase->summary.
+ */ +static gboolean testcase_check_summary(Testcase *testcase,const char *summary) +{ + int i; + gboolean r; + gchar **lines; + GSList *texts,*lnk; + if (!testcase->summary.texts) + return TRUE; + texts=g_slist_copy(testcase->summary.texts); + lines=g_strsplit(summary,"\n",0); + for(i=0;lines[i];i++) + { + if (!g_str_has_prefix(lines[i]," --> ")) + continue; + for(lnk=texts;lnk;lnk=lnk->next) + if (!strcmp(lines[i]+7,lnk->data)) + { + texts=g_slist_delete_link(texts,lnk); + break; + } + } + g_strfreev(lines); + r=!texts; + if (texts) + { + g_print("%s: FAIL\n",testcase->basename); + g_print("Missing summary text from bookloupe:\n"); + g_print(" --> %s\n",texts->data); + } + g_slist_free(texts); + return r; +} + +/* * Check the warnings produced by bookloupe against either the * unstructured testcase->expected or the structured testcase->warnings * as appropriate. @@ -460,7 +496,7 @@ gboolean r; size_t pos,offset; GString *header; - char *output,*filename,*s,*xfail=NULL; + char *output,*filename,*s,*summary,*xfail=NULL; GError *error=NULL; if (!testcase_create_input_files(testcase,&error)) { @@ -500,10 +536,15 @@ pos=header->len; if (r) { - /* Skip the summary */ + /* Find the end of the summary */ s=strstr(output+pos,"\n\n"); if (s) + { + summary=g_strndup(output+pos,s-(output+pos)); + r=testcase_check_summary(testcase,summary); + g_free(summary); pos=s-output+2; + } else { g_print("%s: FAIL\n",testcase->basename); @@ -512,7 +553,8 @@ } } g_string_free(header,TRUE); - r=testcase_check_warnings(testcase,output+pos,&xfail); + if (r) + r=testcase_check_warnings(testcase,output+pos,&xfail); g_free(filename); g_free(output); if (r) diff -r ad92d11d59b8 -r cd3068704d3a test/harness/testcase.h --- a/test/harness/testcase.h Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/testcase.h Sun Oct 20 21:06:25 2013 +0100 @@ -15,6 +15,10 @@ } TestcaseLocation; typedef struct { + GSList *texts; +} TestcaseSummary; + +typedef struct { /* * Does this warning relate to a real 
problem in the etext * (eg., error and false-negative). @@ -38,12 +42,14 @@ char *tmpdir; GSList *inputs; char *expected; + TestcaseSummary summary; GSList *warnings; char *encoding; /* The character encoding to talk to BOOKLOUPE in */ char **options; enum { TESTCASE_XFAIL=1<<0, TESTCASE_TMP_DIR=1<<1, + TESTCASE_UNIX_NEWLINES=1<<2, } flags; } Testcase; diff -r ad92d11d59b8 -r cd3068704d3a test/harness/testcaseinput.c --- a/test/harness/testcaseinput.c Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/testcaseinput.c Sun Oct 20 21:06:25 2013 +0100 @@ -76,9 +76,15 @@ { if (testcase->encoding) { - t=unix2dos(input->contents); - s=g_convert(t,-1,testcase->encoding,"UTF-8",NULL,&n,&tmp_err); - g_free(t); + if (testcase->flags&TESTCASE_UNIX_NEWLINES) + s=g_convert(input->contents,-1,testcase->encoding,"UTF-8",NULL, + &n,&tmp_err); + else + { + t=unix2dos(input->contents); + s=g_convert(t,-1,testcase->encoding,"UTF-8",NULL,&n,&tmp_err); + g_free(t); + } if (!s) { g_propagate_prefixed_error(error,tmp_err, @@ -86,6 +92,11 @@ return FALSE; } } + else if (testcase->flags&TESTCASE_UNIX_NEWLINES) + { + s=g_strdup(input->contents); + n=strlen(s); + } else { s=unix2dos(input->contents); diff -r ad92d11d59b8 -r cd3068704d3a test/harness/testcaseio.c --- a/test/harness/testcaseio.c Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/testcaseio.c Sun Oct 20 21:06:25 2013 +0100 @@ -22,7 +22,7 @@ GError *err=NULL; char *s,*arg; const char *tag,*text; - gboolean found_tag=FALSE; + gboolean found_tag=FALSE,newlines_set=FALSE; parser=testcase_parser_new_from_file(filename); if (!parser) return NULL; @@ -88,6 +88,24 @@ } else if (!testcase->encoding && !strcmp(tag,"ENCODING")) testcase->encoding=g_strchomp(g_strdup(text)); + else if (!newlines_set && !strcmp(tag,"NEWLINES")) + { + newlines_set=TRUE; + s=g_strdup(text); + g_strchomp(s); + if (!strcmp(s,"LF")) + testcase->flags|=TESTCASE_UNIX_NEWLINES; + else if (strcmp(s,"CRLF")) + { + g_printerr( + "%s: Unrecognised style for newlines. 
Try LF or CRLF.\n",s); + g_free(s); + testcase_free(testcase); + testcase_parser_free(parser); + return NULL; + } + g_free(s); + } else if (!testcase->encoding && !strcmp(tag,"OPTIONS")) { testcase->options=g_strsplit(text,"\n",0); diff -r ad92d11d59b8 -r cd3068704d3a test/harness/warningsparser.c --- a/test/harness/warningsparser.c Tue Oct 15 09:16:04 2013 +0100 +++ b/test/harness/warningsparser.c Sun Oct 20 21:06:25 2013 +0100 @@ -15,11 +15,12 @@ enum { WARNINGS_INIT, WARNINGS_IN_EXPECTED, + WARNINGS_IN_SUMMARY, WARNINGS_IN_WARNING, WARNINGS_IN_AT, WARNINGS_IN_TEXT, WARNINGS_DONE, - } state; + } state,parent_state; } WarningsBaton; static void warnings_parser_start_element(GMarkupParseContext *context, @@ -30,6 +31,7 @@ guint64 tmp; char *endp; WarningsBaton *baton=user_data; + baton->parent_state=baton->state; switch(baton->state) { case WARNINGS_INIT: @@ -45,20 +47,36 @@ baton->state=WARNINGS_IN_EXPECTED; break; case WARNINGS_IN_EXPECTED: - baton->warning=g_new0(TestcaseWarning,1); - if (!strcmp(element_name,"error")) - baton->warning->is_real=TRUE; - else if (!strcmp(element_name,"false-positive")) - baton->warning->xfail=TRUE; - else if (!strcmp(element_name,"false-negative")) - baton->warning->is_real=baton->warning->xfail=TRUE; + if (!strcmp(element_name,"summary")) + { + if (baton->testcase->summary.texts) + { + g_set_error(error,G_MARKUP_ERROR, + G_MARKUP_ERROR_INVALID_CONTENT,"Multiple summary " + "elements are not valid"); + } + else + baton->state=WARNINGS_IN_SUMMARY; + } else { - g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_UNKNOWN_ELEMENT, - "Unknown element in 'expected': '%s'",element_name); - g_free(baton->warning); - baton->warning=NULL; - return; + baton->warning=g_new0(TestcaseWarning,1); + if (!strcmp(element_name,"error")) + baton->warning->is_real=TRUE; + else if (!strcmp(element_name,"false-positive")) + baton->warning->xfail=TRUE; + else if (!strcmp(element_name,"false-negative")) + baton->warning->is_real=baton->warning->xfail=TRUE; + 
else + { + g_set_error(error,G_MARKUP_ERROR, + G_MARKUP_ERROR_UNKNOWN_ELEMENT, + "Unknown element in 'expected': '%s'",element_name); + g_free(baton->warning); + baton->warning=NULL; + return; + } + baton->state=WARNINGS_IN_WARNING; } if (attribute_names[0]) { @@ -66,12 +84,28 @@ G_MARKUP_ERROR_UNKNOWN_ATTRIBUTE, "Unknown attribute on element '%s': '%s'",element_name, attribute_names[0]); - g_free(baton->warning); - baton->warning=NULL; + if (baton->state==WARNINGS_IN_WARNING) + { + g_free(baton->warning); + baton->warning=NULL; + } + baton->state=WARNINGS_IN_EXPECTED; return; } - else - baton->state=WARNINGS_IN_WARNING; + break; + case WARNINGS_IN_SUMMARY: + if (!strcmp(element_name,"text")) + { + if (attribute_names[0]) + { + g_set_error(error,G_MARKUP_ERROR, + G_MARKUP_ERROR_UNKNOWN_ATTRIBUTE, + "Unknown attribute on element 'text': '%s'", + attribute_names[0]); + return; + } + baton->state=WARNINGS_IN_TEXT; + } break; case WARNINGS_IN_WARNING: if (!strcmp(element_name,"at")) @@ -162,6 +196,15 @@ g_slist_reverse(baton->testcase->warnings); baton->state=WARNINGS_DONE; break; + case WARNINGS_IN_SUMMARY: + if (!baton->testcase->summary.texts) + g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT, + "Summary element must contain at least one text element"); + else + baton->testcase->summary.texts= + g_slist_reverse(baton->testcase->summary.texts); + baton->state=WARNINGS_IN_EXPECTED; + break; case WARNINGS_IN_WARNING: baton->warning->locations= g_slist_reverse(baton->warning->locations); @@ -177,7 +220,7 @@ baton->state=WARNINGS_IN_WARNING; break; case WARNINGS_IN_TEXT: - baton->state=WARNINGS_IN_WARNING; + baton->state=baton->parent_state; break; default: g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_UNKNOWN_ELEMENT, @@ -198,6 +241,11 @@ g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT, "The 'expected' tag does not take any content"); break; + case WARNINGS_IN_SUMMARY: + if (strspn(text," \t\n")!=text_len) + 
g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT, + "The summary tags do not take any content"); + break; case WARNINGS_IN_WARNING: if (strspn(text," \t\n")!=text_len) g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT, @@ -211,7 +259,10 @@ case WARNINGS_IN_TEXT: s=g_strdup(text+strspn(text," \t\n")); g_strchomp(s); - if (baton->warning->text) + if (baton->parent_state==WARNINGS_IN_SUMMARY) + baton->testcase->summary.texts= + g_slist_prepend(baton->testcase->summary.texts,s); + else if (baton->warning->text) { t=g_strconcat(baton->warning->text,s,NULL); g_free(baton->warning->text); @@ -237,6 +288,7 @@ parser.text=warnings_parser_text; baton=g_new0(WarningsBaton,1); baton->testcase=testcase; + baton->parent_state=WARNINGS_INIT; baton->state=WARNINGS_INIT; return g_markup_parse_context_new(&parser, G_MARKUP_TREAT_CDATA_AS_TEXT|G_MARKUP_PREFIX_ERROR_POSITION,