# HG changeset patch
# User ali <ali@juiblex.co.uk>
# Date 1382299585 -3600
# Node ID cd3068704d3ac2753cd9e7a3703689acb4fe06a2
# Parent  ad92d11d59b812c1599d942452371f734742993f
Fix bug #24: Accept alternate form of newline

diff -r ad92d11d59b8 -r cd3068704d3a bookloupe/bookloupe.c
--- a/bookloupe/bookloupe.c	Tue Oct 15 09:16:04 2013 +0100
+++ b/bookloupe/bookloupe.c	Sun Oct 20 21:06:25 2013 +0100
@@ -183,7 +183,7 @@
 
 gboolean mixdigit(const char *);
 gchar *getaword(const char **);
-char *flgets(char **,long);
+char *flgets(char **,long,gboolean);
 void postprocess_for_HTML(char *);
 char *linehasmarkup(char *);
 char *losemarkup(char *);
@@ -487,11 +487,20 @@
     gchar *inword;
     QuoteClass qc;
     lines=g_strsplit(etext,"\n",0);
+    if (lines[0])
+	/* If there's at least one line, we might have UNIX-style terminators */
+	results.unix_lineends=TRUE;
     for (j=0;lines[j];j++)
     {
 	lbytes=strlen(lines[j]);
-	while (lbytes>0 && lines[j][lbytes-1]=='\r')
-	    lines[j][--lbytes]='\0';
+	if (lbytes>0 && lines[j][lbytes-1]=='\r')
+	{
+	    results.unix_lineends=FALSE;
+	    do
+	    {
+		lines[j][--lbytes]='\0';
+	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
+	}
 	llen=g_utf8_strlen(lines[j],lbytes);
 	linecnt++;
 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
@@ -633,6 +642,13 @@
 struct warnings *report_first_pass(struct first_pass_results *results)
 {
     static struct warnings warnings={0};
+    warnings.nocr=1;
+    if (results->unix_lineends)
+    {
+	warnings.nocr=0;
+	g_print("   --> No lines in this file have a CR. Not reporting them. "
+	  "Project Gutenberg requires that all lineends be CR-LF.\n");
+    }
     if (cnt_spacend>0)
 	g_print("   --> %ld lines in this file have white space at end\n",
 	  cnt_spacend);
@@ -2621,7 +2637,7 @@
      */
     linecnt=0;
     etext_ptr=etext;
-    while ((aline=flgets(&etext_ptr,linecnt+1)))
+    while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr)))
     {
 	linecnt++;
 	if (linecnt==1)
@@ -2767,7 +2783,7 @@
  *
  * Returns: a pointer to the line.
  */
-char *flgets(char **etext,long lcnt)
+char *flgets(char **etext,long lcnt,gboolean warn_nocr)
 {
     gunichar c;
     gboolean isCR=FALSE;
@@ -2806,7 +2822,7 @@
 	    else
 	    {
 		/* Error - a LF without a preceding CR */
-		if (pswit[LINE_END_SWITCH])
+		if (pswit[LINE_END_SWITCH] && warn_nocr)
 		{
 		    if (pswit[ECHO_SWITCH])
 		    {
diff -r ad92d11d59b8 -r cd3068704d3a bookloupe/bookloupe.h
--- a/bookloupe/bookloupe.h	Tue Oct 15 09:16:04 2013 +0100
+++ b/bookloupe/bookloupe.h	Sun Oct 20 21:06:25 2013 +0100
@@ -69,12 +69,13 @@
     long spacedash;
     struct dash_results emdash;
     int Dutchcount,Frenchcount;
+    gboolean unix_lineends;
 };
 
 struct warnings {
     int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
     int endquote;
-    gboolean isDutch,isFrench;
+    gboolean isDutch,isFrench,nocr;
 };
 
 struct line_properties {
diff -r ad92d11d59b8 -r cd3068704d3a doc/loupe-test.txt
--- a/doc/loupe-test.txt	Tue Oct 15 09:16:04 2013 +0100
+++ b/doc/loupe-test.txt	Sun Oct 20 21:06:25 2013 +0100
@@ -91,14 +91,35 @@
 ------------------
 
 One of the tests that bookloupe/gutcheck need to do is check that all
-lines are ended with CR NL (as required by PG) rather than the UNIX
-standard NL. loupe-test deliberately ignores the line endings in testcase
-definition files and always uses CR NL. Thus there is needed a means
+lines are ended with CR LF (as required by PG) rather than the UNIX
+standard LF. loupe-test deliberately ignores the line endings in testcase
+definition files and uses the expected CR LF. Thus there is needed a means
 to embed a linefeed (aka newline) character into the input to be sent
 to bookloupe/gutcheck to test that it correctly identified the problem.
 loupe-test recognises the unicode symbol for linefeed (U+240A): ␊ which
 can be used for this purpose instead of a normal newline.
 
+UNIX-style newlines
+-------------------
+
+To make life easier for users on UNIX and similar platforms, bookloupe
+recognises the case where no line in the file ends with a CR. It notes
+this in the summary but does not warn about each missing CR. We thus need
+some way to test this case, which we do with the NEWLINES tag:
+
+  ┌──────────────────────────────────────────────────────────────────────────┐
+  │**************** NEWLINES ****************                                │
+  │LF                                                                        │
+  │**************** INPUT ****************                                   │
+  │Katherine was assailed by a sudden doubt. Had she mailed that letter? Yes,│
+  │she was certain of that. She had run out to the mail box at ten o'clock   │
+  │at night especially to mail it. What had gone wrong? Why wasn't there     │
+  │someone to meet her?                                                      │
+  └──────────────────────────────────────────────────────────────────────────┘
+
+The possible options are CRLF for DOS-style newlines (the default) and
+LF for UNIX-style newlines.
+
 Passing command line options
 ----------------------------
 
@@ -203,3 +224,16 @@
 this, eg.:
 
 sample: PASS (with 1 of 1 false positives and 1 of 1 false negatives)
+
+The summary
+-----------
+
+As part of the header (the first section of output), bookloupe may display
+a number of summary lines. These are characterized by a leading ASCII
+long arrow (-->) and generally say something about the ebook as a whole
+rather than individual lines. Where it is desired to test for the presence
+of a summary line, a "summary" node can be included within the "expected"
+node of a testcase using structured warnings. The "summary" node can contain
+one or more "text" nodes which indicate the text of lines that must be
+present in the summary section in order for the test to pass. No account is
+taken of the order of such lines and other summary lines may also be present.
diff -r ad92d11d59b8 -r cd3068704d3a test/bookloupe/Makefile.am
--- a/test/bookloupe/Makefile.am	Tue Oct 15 09:16:04 2013 +0100
+++ b/test/bookloupe/Makefile.am	Sun Oct 20 21:06:25 2013 +0100
@@ -1,6 +1,6 @@
 TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test
 TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \
 	runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \
-	emdash.tst footnote-marker.tst
+	emdash.tst footnote-marker.tst unix-lineends.tst
 
 dist_pkgdata_DATA=$(TESTS)
diff -r ad92d11d59b8 -r cd3068704d3a test/bookloupe/unix-lineends.tst
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/bookloupe/unix-lineends.tst	Sun Oct 20 21:06:25 2013 +0100
@@ -0,0 +1,13 @@
+**************** NEWLINES ****************
+LF
+**************** INPUT ****************
+Katherine was assailed by a sudden doubt. Had she mailed that letter? Yes,
+she was certain of that. She had run out to the mail box at ten o'clock
+at night especially to mail it. What had gone wrong? Why wasn't there
+someone to meet her?
+**************** WARNINGS ****************
+<expected>
+  <summary>
+    <text>No lines in this file have a CR. Not reporting them. Project Gutenberg requires that all lineends be CR-LF.</text>
+  </summary>
+</expected>
diff -r ad92d11d59b8 -r cd3068704d3a test/harness/testcase.c
--- a/test/harness/testcase.c	Tue Oct 15 09:16:04 2013 +0100
+++ b/test/harness/testcase.c	Sun Oct 20 21:06:25 2013 +0100
@@ -326,6 +326,42 @@
 }
 
 /*
+ * Check the summary produced by bookloupe against testcase->summary.
+ * Each expected text must exactly match the rest of some "   --> " line;
+ * order and additional summary lines do not matter. Every missing text is
+ * reported. Returns TRUE if all expected texts were found (or none given).
+ */
+static gboolean testcase_check_summary(Testcase *testcase,const char *summary)
+{
+    int i;
+    gboolean r;
+    gchar **lines;
+    GSList *texts,*lnk;
+    if (!testcase->summary.texts)
+	return TRUE;
+    /* Shallow copy: found texts are unlinked, the strings stay shared */
+    texts=g_slist_copy(testcase->summary.texts);
+    lines=g_strsplit(summary,"\n",0);
+    for(i=0;lines[i] && texts;i++)
+	if (g_str_has_prefix(lines[i],"   --> "))
+	    for(lnk=texts;lnk;lnk=lnk->next)
+		if (!strcmp(lines[i]+7,lnk->data))
+		{
+		    texts=g_slist_delete_link(texts,lnk);
+		    break;
+		}
+    g_strfreev(lines);
+    r=!texts;
+    if (texts)
+	g_print("%s: FAIL\nMissing summary text from bookloupe:\n",
+	  testcase->basename);
+    for(lnk=texts;lnk;lnk=lnk->next)
+	g_print("   --> %s\n",(const char *)lnk->data);
+    g_slist_free(texts);
+    return r;
+}
+
+/*
  * Check the warnings produced by bookloupe against either the
  * unstructured testcase->expected or the structured testcase->warnings
  * as appropriate.
@@ -460,7 +496,7 @@
     gboolean r;
     size_t pos,offset;
     GString *header;
-    char *output,*filename,*s,*xfail=NULL;
+    char *output,*filename,*s,*summary,*xfail=NULL;
     GError *error=NULL;
     if (!testcase_create_input_files(testcase,&error))
     {
@@ -500,10 +536,15 @@
     pos=header->len;
     if (r)
     {
-	/* Skip the summary */
+	/* Find the end of the summary */
 	s=strstr(output+pos,"\n\n");
 	if (s)
+	{
+	    summary=g_strndup(output+pos,s-(output+pos));
+	    r=testcase_check_summary(testcase,summary);
+	    g_free(summary);
 	    pos=s-output+2;
+	}
 	else
 	{
 	    g_print("%s: FAIL\n",testcase->basename);
@@ -512,7 +553,8 @@
 	}
     }
     g_string_free(header,TRUE);
-    r=testcase_check_warnings(testcase,output+pos,&xfail);
+    if (r)
+	r=testcase_check_warnings(testcase,output+pos,&xfail);
     g_free(filename);
     g_free(output);
     if (r)
diff -r ad92d11d59b8 -r cd3068704d3a test/harness/testcase.h
--- a/test/harness/testcase.h	Tue Oct 15 09:16:04 2013 +0100
+++ b/test/harness/testcase.h	Sun Oct 20 21:06:25 2013 +0100
@@ -15,6 +15,10 @@
 } TestcaseLocation;
 
 typedef struct {
+    GSList *texts;	/* Expected summary texts (list of char *) */
+} TestcaseSummary;
+
+typedef struct {
     /*
      * Does this warning relate to a real problem in the etext
      * (eg., error and false-negative).
@@ -38,12 +42,14 @@
     char *tmpdir;
     GSList *inputs;
     char *expected;
+    TestcaseSummary summary;
     GSList *warnings;
     char *encoding;	/* The character encoding to talk to BOOKLOUPE in */
     char **options;
     enum {
 	TESTCASE_XFAIL=1<<0,
 	TESTCASE_TMP_DIR=1<<1,
+	TESTCASE_UNIX_NEWLINES=1<<2,
     } flags;
 } Testcase;
 
diff -r ad92d11d59b8 -r cd3068704d3a test/harness/testcaseinput.c
--- a/test/harness/testcaseinput.c	Tue Oct 15 09:16:04 2013 +0100
+++ b/test/harness/testcaseinput.c	Sun Oct 20 21:06:25 2013 +0100
@@ -76,9 +76,15 @@
     {
 	if (testcase->encoding)
 	{
-	    t=unix2dos(input->contents);
-	    s=g_convert(t,-1,testcase->encoding,"UTF-8",NULL,&n,&tmp_err);
-	    g_free(t);
+	    if (testcase->flags&TESTCASE_UNIX_NEWLINES)
+		s=g_convert(input->contents,-1,testcase->encoding,"UTF-8",NULL,
+		  &n,&tmp_err);
+	    else
+	    {
+		t=unix2dos(input->contents);
+		s=g_convert(t,-1,testcase->encoding,"UTF-8",NULL,&n,&tmp_err);
+		g_free(t);
+	    }
 	    if (!s)
 	    {
 		g_propagate_prefixed_error(error,tmp_err,
@@ -86,6 +92,11 @@
 		return FALSE;
 	    }
 	}
+	else if (testcase->flags&TESTCASE_UNIX_NEWLINES)
+	{
+	    s=g_strdup(input->contents);
+	    n=strlen(s);
+	}
 	else
 	{
 	    s=unix2dos(input->contents);
diff -r ad92d11d59b8 -r cd3068704d3a test/harness/testcaseio.c
--- a/test/harness/testcaseio.c	Tue Oct 15 09:16:04 2013 +0100
+++ b/test/harness/testcaseio.c	Sun Oct 20 21:06:25 2013 +0100
@@ -22,7 +22,7 @@
     GError *err=NULL;
     char *s,*arg;
     const char *tag,*text;
-    gboolean found_tag=FALSE;
+    gboolean found_tag=FALSE,newlines_set=FALSE;
     parser=testcase_parser_new_from_file(filename);
     if (!parser)
 	return NULL;
@@ -88,6 +88,24 @@
 	}
 	else if (!testcase->encoding && !strcmp(tag,"ENCODING"))
 	    testcase->encoding=g_strchomp(g_strdup(text));
+	else if (!newlines_set && !strcmp(tag,"NEWLINES"))
+	{
+	    newlines_set=TRUE;
+	    s=g_strdup(text);
+	    g_strchomp(s);
+	    if (!strcmp(s,"LF"))
+		testcase->flags|=TESTCASE_UNIX_NEWLINES;
+	    else if (strcmp(s,"CRLF"))
+	    {
+		g_printerr(
+		  "%s: Unrecognised style for newlines. Try LF or CRLF.\n",s);
+		g_free(s);
+		testcase_free(testcase);
+		testcase_parser_free(parser);
+		return NULL;
+	    }
+	    g_free(s);
+	}
 	else if (!testcase->encoding && !strcmp(tag,"OPTIONS"))
 	{
 	    testcase->options=g_strsplit(text,"\n",0);
diff -r ad92d11d59b8 -r cd3068704d3a test/harness/warningsparser.c
--- a/test/harness/warningsparser.c	Tue Oct 15 09:16:04 2013 +0100
+++ b/test/harness/warningsparser.c	Sun Oct 20 21:06:25 2013 +0100
@@ -15,11 +15,12 @@
     enum {
 	WARNINGS_INIT,
 	WARNINGS_IN_EXPECTED,
+	WARNINGS_IN_SUMMARY,
 	WARNINGS_IN_WARNING,
 	WARNINGS_IN_AT,
 	WARNINGS_IN_TEXT,
 	WARNINGS_DONE,
-    } state;
+    } state,parent_state;
 } WarningsBaton;
 
 static void warnings_parser_start_element(GMarkupParseContext *context,
@@ -30,6 +31,7 @@
     guint64 tmp;
     char *endp;
     WarningsBaton *baton=user_data;
+    baton->parent_state=baton->state;
     switch(baton->state)
     {
 	case WARNINGS_INIT:
@@ -45,20 +47,36 @@
 		baton->state=WARNINGS_IN_EXPECTED;
 	    break;
 	case WARNINGS_IN_EXPECTED:
-	    baton->warning=g_new0(TestcaseWarning,1);
-	    if (!strcmp(element_name,"error"))
-		baton->warning->is_real=TRUE;
-	    else if (!strcmp(element_name,"false-positive"))
-		baton->warning->xfail=TRUE;
-	    else if (!strcmp(element_name,"false-negative"))
-		baton->warning->is_real=baton->warning->xfail=TRUE;
+	    if (!strcmp(element_name,"summary"))
+	    {
+		if (baton->testcase->summary.texts)
+		{
+		    g_set_error(error,G_MARKUP_ERROR,
+		      G_MARKUP_ERROR_INVALID_CONTENT,
+		      "Multiple summary elements are not valid");
+		    return; /* error is set; skip the attribute check below */
+		}
+		baton->state=WARNINGS_IN_SUMMARY;
+	    }
 	    else
 	    {
-		g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_UNKNOWN_ELEMENT,
-		  "Unknown element in 'expected': '%s'",element_name);
-		g_free(baton->warning);
-		baton->warning=NULL;
-		return;
+		baton->warning=g_new0(TestcaseWarning,1);
+		if (!strcmp(element_name,"error"))
+		    baton->warning->is_real=TRUE;
+		else if (!strcmp(element_name,"false-positive"))
+		    baton->warning->xfail=TRUE;
+		else if (!strcmp(element_name,"false-negative"))
+		    baton->warning->is_real=baton->warning->xfail=TRUE;
+		else
+		{
+		    g_set_error(error,G_MARKUP_ERROR,
+		      G_MARKUP_ERROR_UNKNOWN_ELEMENT,
+		      "Unknown element in 'expected': '%s'",element_name);
+		    g_free(baton->warning);
+		    baton->warning=NULL;
+		    return;
+		}
+		baton->state=WARNINGS_IN_WARNING;
 	    }
 	    if (attribute_names[0])
 	    {
@@ -66,12 +84,28 @@
 		  G_MARKUP_ERROR_UNKNOWN_ATTRIBUTE,
 		  "Unknown attribute on element '%s': '%s'",element_name,
 		  attribute_names[0]);
-		g_free(baton->warning);
-		baton->warning=NULL;
+		if (baton->state==WARNINGS_IN_WARNING)
+		{
+		    g_free(baton->warning);
+		    baton->warning=NULL;
+		}
+		baton->state=WARNINGS_IN_EXPECTED;
 		return;
 	    }
-	    else
-		baton->state=WARNINGS_IN_WARNING;
+	    break;
+	case WARNINGS_IN_SUMMARY:
+	    if (strcmp(element_name,"text"))
+		g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_UNKNOWN_ELEMENT,
+		  "Unknown element in 'summary': '%s'",element_name);
+	    else if (attribute_names[0])
+		g_set_error(error,G_MARKUP_ERROR,
+		  G_MARKUP_ERROR_UNKNOWN_ATTRIBUTE,
+		  "Unknown attribute on element 'text': '%s'",
+		  attribute_names[0]);
+	    else
+		baton->state=WARNINGS_IN_TEXT;
+	    if (baton->state!=WARNINGS_IN_TEXT) /* an error was set above */
+		return;
 	    break;
 	case WARNINGS_IN_WARNING:
 	    if (!strcmp(element_name,"at"))
@@ -162,6 +196,15 @@
 	      g_slist_reverse(baton->testcase->warnings);
 	    baton->state=WARNINGS_DONE;
 	    break;
+	case WARNINGS_IN_SUMMARY:
+	    if (!baton->testcase->summary.texts)
+		g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT,
+		  "Summary element must contain at least one text element");
+	    else
+		baton->testcase->summary.texts=
+		  g_slist_reverse(baton->testcase->summary.texts);
+	    baton->state=WARNINGS_IN_EXPECTED;
+	    break;
 	case WARNINGS_IN_WARNING:
 	    baton->warning->locations=
 	      g_slist_reverse(baton->warning->locations);
@@ -177,7 +220,7 @@
 	    baton->state=WARNINGS_IN_WARNING;
 	    break;
 	case WARNINGS_IN_TEXT:
-	    baton->state=WARNINGS_IN_WARNING;
+	    baton->state=baton->parent_state;
 	    break;
 	default:
 	    g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_UNKNOWN_ELEMENT,
@@ -198,6 +241,11 @@
 		g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT,
 		  "The 'expected' tag does not take any content");
 	    break;
+	case WARNINGS_IN_SUMMARY:
+	    if (strspn(text," \t\n")!=text_len)
+		g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT,
+		  "The summary tags do not take any content");
+	    break;
 	case WARNINGS_IN_WARNING:
 	    if (strspn(text," \t\n")!=text_len)
 		g_set_error(error,G_MARKUP_ERROR,G_MARKUP_ERROR_INVALID_CONTENT,
@@ -211,7 +259,10 @@
 	case WARNINGS_IN_TEXT:
 	    s=g_strdup(text+strspn(text," \t\n"));
 	    g_strchomp(s);
-	    if (baton->warning->text)
+	    if (baton->parent_state==WARNINGS_IN_SUMMARY)
+		baton->testcase->summary.texts=
+		  g_slist_prepend(baton->testcase->summary.texts,s);
+	    else if (baton->warning->text)
 	    {
 		t=g_strconcat(baton->warning->text,s,NULL);
 		g_free(baton->warning->text);
@@ -237,6 +288,7 @@
     parser.text=warnings_parser_text;
     baton=g_new0(WarningsBaton,1);
     baton->testcase=testcase;
+    baton->parent_state=WARNINGS_INIT;
     baton->state=WARNINGS_INIT;
     return g_markup_parse_context_new(&parser,
       G_MARKUP_TREAT_CDATA_AS_TEXT|G_MARKUP_PREFIX_ERROR_POSITION,