bookloupe/bookloupe.c
changeset 101 f44c530f80da
parent 100 ad92d11d59b8
child 102 ff0aa9b1397a
     1.1 --- a/bookloupe/bookloupe.c	Tue Oct 15 09:16:04 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Sat Oct 26 18:47:33 2013 +0100
     1.3 @@ -183,7 +183,7 @@
     1.4  
     1.5  gboolean mixdigit(const char *);
     1.6  gchar *getaword(const char **);
     1.7 -char *flgets(char **,long);
     1.8 +char *flgets(char **,long,int);
     1.9  void postprocess_for_HTML(char *);
    1.10  char *linehasmarkup(char *);
    1.11  char *losemarkup(char *);
    1.12 @@ -487,11 +487,40 @@
    1.13      gchar *inword;
    1.14      QuoteClass qc;
    1.15      lines=g_strsplit(etext,"\n",0);
    1.16 +    if (!lines[0])
    1.17 +    {
    1.18 +	/* An empty etext has no terminators */
    1.19 +	results.newlines=DOS_NEWLINES;
    1.20 +    }
    1.21 +    else if (!lines[1])
    1.22 +    {
    1.23 +	/*
    1.24 +	 * If there are no LFs, we don't have UNIX-style
    1.25 +	 * terminators, but we might have OS9-style ones.
    1.26 +	 */
    1.27 +	results.newlines=OS9_NEWLINES;
    1.28 +	g_strfreev(lines);
    1.29 +	lines=g_strsplit(etext,"\r",0);
    1.30 +	if (!lines[0] || !lines[1])
    1.31 +	    /* Looks like we don't have any terminators at all */
    1.32 +	    results.newlines=DOS_NEWLINES;
    1.33 +    }
    1.34 +    else
    1.35 +    {
    1.36 +	/* We might have UNIX-style terminators */
    1.37 +	results.newlines=UNIX_NEWLINES;
    1.38 +    }
    1.39      for (j=0;lines[j];j++)
    1.40      {
    1.41  	lbytes=strlen(lines[j]);
    1.42 -	while (lbytes>0 && lines[j][lbytes-1]=='\r')
    1.43 -	    lines[j][--lbytes]='\0';
    1.44 +	if (lbytes>0 && lines[j][lbytes-1]=='\r')
    1.45 +	{
    1.46 +	    results.newlines=DOS_NEWLINES;
    1.47 +	    do
    1.48 +	    {
    1.49 +		lines[j][--lbytes]='\0';
    1.50 +	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
    1.51 +	}
    1.52  	llen=g_utf8_strlen(lines[j],lbytes);
    1.53  	linecnt++;
    1.54  	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
    1.55 @@ -633,6 +662,13 @@
    1.56  struct warnings *report_first_pass(struct first_pass_results *results)
    1.57  {
    1.58      static struct warnings warnings={0};
    1.59 +    warnings.newlines=results->newlines;
    1.60 +    if (warnings.newlines==UNIX_NEWLINES)
    1.61 +	g_print("   --> No lines in this file have a CR. Not reporting them. "
    1.62 +	  "Project Gutenberg requires that all lineends be CR-LF.\n");
    1.63 +    else if (warnings.newlines==OS9_NEWLINES)
    1.64 +	g_print("   --> No lines in this file have a LF. Not reporting them. "
    1.65 +	  "Project Gutenberg requires that all lineends be CR-LF.\n");
    1.66      if (cnt_spacend>0)
    1.67  	g_print("   --> %ld lines in this file have white space at end\n",
    1.68  	  cnt_spacend);
    1.69 @@ -2621,7 +2657,7 @@
    1.70       */
    1.71      linecnt=0;
    1.72      etext_ptr=etext;
    1.73 -    while ((aline=flgets(&etext_ptr,linecnt+1)))
    1.74 +    while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
    1.75      {
    1.76  	linecnt++;
    1.77  	if (linecnt==1)
    1.78 @@ -2762,12 +2798,21 @@
    1.79  /*
    1.80   * flgets:
    1.81   *
    1.82 - * Get one line from the input text, checking for
    1.83 - * the existence of exactly one CR/LF line-end per line.
    1.84 + * Get one line from the input text. The setting of newlines has the following
    1.85 + * effect:
    1.86 + *
    1.87 + * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
    1.88 + *
    1.89 + * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
    1.90 + *		 the newline character.
    1.91 + *
    1.92 + * UNIX_NEWLINES: Check for the presence of CRs.
    1.93 + *
    1.94 + * In all cases, check that the last line is correctly terminated.
    1.95   *
    1.96   * Returns: a pointer to the line.
    1.97   */
    1.98 -char *flgets(char **etext,long lcnt)
    1.99 +char *flgets(char **etext,long lcnt,int newlines)
   1.100  {
   1.101      gunichar c;
   1.102      gboolean isCR=FALSE;
   1.103 @@ -2790,8 +2835,15 @@
   1.104  		    g_free(s);
   1.105  		}
   1.106  		if (!pswit[OVERVIEW_SWITCH])
   1.107 -		    /* There may, or may not, have been a CR */
   1.108 -		    g_print("    Line %ld - No LF?\n",lcnt);
   1.109 +		{
   1.110 +		    if (newlines==OS9_NEWLINES)
   1.111 +			g_print("    Line %ld - No CR?\n",lcnt);
   1.112 +		    else
   1.113 +		    {
   1.114 +			/* There may, or may not, have been a CR */
   1.115 +			g_print("    Line %ld - No LF?\n",lcnt);
   1.116 +		    }
   1.117 +		}
   1.118  		else
   1.119  		    cnt_lineend++;
   1.120  	    }
   1.121 @@ -2801,9 +2853,7 @@
   1.122  	/* either way, it's end of line */
   1.123  	if (c=='\n')
   1.124  	{
   1.125 -	    if (isCR)
   1.126 -		break;
   1.127 -	    else
   1.128 +	    if (newlines==DOS_NEWLINES && !isCR)
   1.129  	    {
   1.130  		/* Error - a LF without a preceding CR */
   1.131  		if (pswit[LINE_END_SWITCH])
   1.132 @@ -2819,14 +2869,15 @@
   1.133  		    else
   1.134  			cnt_lineend++;
   1.135  		}
   1.136 -		break;
   1.137  	    }
   1.138 +	    break;
   1.139  	}
   1.140  	if (c=='\r')
   1.141  	{
   1.142 -	    if (isCR)
   1.143 +	    if (newlines==OS9_NEWLINES)
   1.144 +		break;
   1.145 +	    if (isCR || newlines==UNIX_NEWLINES)
   1.146  	    {
   1.147 -		/* Error - two successive CRs */
   1.148  		if (pswit[LINE_END_SWITCH])
   1.149  		{
   1.150  		    if (pswit[ECHO_SWITCH])
   1.151 @@ -2836,12 +2887,22 @@
   1.152  			g_free(s);
   1.153  		    }
   1.154  		    if (!pswit[OVERVIEW_SWITCH])
   1.155 -			g_print("    Line %ld - Two successive CRs?\n",lcnt);
   1.156 +		    {
   1.157 +			if (newlines==UNIX_NEWLINES)
   1.158 +			    g_print("    Line %ld column %ld - Embedded CR?\n",
   1.159 +			      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
   1.160 +			else
   1.161 +			    g_print("    Line %ld - Two successive CRs?\n",
   1.162 +			      lcnt);
   1.163 +		    }
   1.164  		    else
   1.165  			cnt_lineend++;
   1.166  		}
   1.167 +		if (newlines==UNIX_NEWLINES)
   1.168 +		    *eos=' ';
   1.169  	    }
   1.170 -	    isCR=TRUE;
   1.171 +	    if (newlines==DOS_NEWLINES)
   1.172 +		isCR=TRUE;
   1.173  	}
   1.174  	else
   1.175  	{