diff -r 000000000000 -r 9d0a04089d22 librazor/uri.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/librazor/uri.c Mon Jul 11 16:49:53 2016 +0100 @@ -0,0 +1,957 @@ +/* + * Copyright (C) 2016 J. Ali Harlow + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "config.h" + +#undef DEBUG + +#include +#include +#include "razor.h" +#include "types/types.h" +#include "razor-internal.h" +#include "uri.h" + +/* + * Following RFC 3986 § 3. + * Note that we don't validate queries or fragments. + */ + +#define strdup0(s) ((s) ? strdup(s) : NULL) + +#define string_str(str) ((char *)(str)->data) + +#define string_init(str) do { \ + char *_p; \ + array_init(str); \ + _p = array_add(str, 1); \ + *_p = '\0'; \ + } while(0) + +#define string_append_len(str, s, len) do { \ + char *_p; \ + _p = array_add(str, len); \ + _p--; \ + strncpy(_p, s, len); \ + _p[(len)] = '\0'; \ + } while(0) + +#define string_append(str, s) string_append_len(str, s, strlen(s)) + +#define string_truncate_at(str, s) do { \ + int _len; \ + _len = (s) - \ + (char *)(str)->data; \ + *(s) = '\0'; \ + (str)->size = _len + 1; \ + } while(0) + + +static const char *skip_uri_scheme(const char *uri) +{ + /* + * RFC 3986 defines scheme as: + * scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + */ + if (*uri >= 'a' && *uri <= 'z' || *uri >= 'A' && *uri <= 'Z') { + do { + uri++; + } while (is_alnum(*uri) || *uri == '+' || *uri == '-' || + *uri == '.'); + if (*uri == ':') + return uri; + } + return NULL; +} + +static char *razor_strndup(const char *s, size_t n) +{ + char *result; + + if (memchr(s, '\0', n)) + result = strdup(s); + else { + result = malloc(n + 1); + memcpy(result, s, n); + result[n] = '\0'; + } + + return result; +} + +#if 0 +/* + * Return the (possibly decoded) pchar or 0 on end-of-string or -1 on error + */ +static int pchar_get_char_validated(const char *p) +{ + int c; + + if (p[0]=='\0') + c = 0; + else if (p[0]=='%') { + if (xdigit_value(p[1]) < 0) + return -1; + c = xdigit_value(p[1]) * 16; + if (xdigit_value(p[2]) < 0) + return -1; + c += xdigit_value(p[2]); + } else if (p[0] >= 'a' && p[0] <= 'z' || p[0] >= 'A' && p[0] <= 'Z' || + p[0] >= '0' && p[0] <= '9' || + strchr("-._~!$&'()*+,;=:@", p[0])) + c = p[0]; + else + c = -1; + + return c; +} +#endif + +/* + * Verify the percent encoding. All '%' characters must be followed by + * exactly two hexadecimal digits. + */ +static int pct_encoding_validate(const char *s) +{ + while (*s) { + if (*s == '%') { + if (xdigit_value(s[1]) < 0 || xdigit_value(s[2]) < 0) + return -1; + s += 2; + } + + s++; + } + + return 0; +} + +static char *pct_encoding_normalize(char *s) +{ + char *retval, *p; + int c; + + if (!s) + return NULL; + + p = retval = malloc(strlen(s) + 1); + + while (*s) { + if (*s == '%') { + c = pchar_get_char(s); + if (is_unreserved(c)) + *p++ = c; + else { + *p++ = '%'; + *p++ = "0123456789ABCDEF"[c/16]; + *p++ = "0123456789ABCDEF"[c%16]; + } + pchar_next_char(s); + } else + *p++ = *s++; + } + + *p++ = '\0'; + + return realloc(retval, p - retval); +} + +static int validate_userinfo(const char *userinfo, struct razor_error **error) +{ + const char *s; + + for (s = userinfo; *s; s++) { + if (!is_unreserved(*s) && *s != '%' && !is_sub_delim(*s) + && *s != ':') { + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, userinfo, + "Invalid URI userinfo"); + return -1; + } + } + + return 0; +} + +static int validate_reg_name(const char *reg_name) +{ + const char *s; + + for (s = reg_name; *s; s++) { + if (!is_unreserved(*s) && *s != '%' && !is_sub_delim(*s)) + return -1; + } + + return 0; +} + +static int validate_ipv4address(const char *s, int length) +{ + int count = 0, digits, octet; + + for (;;) { + if (!length) + return -1; + + if (*s == '0') { + digits = 1; + octet = 0; + } else { + if (*s < '1' || *s > '9') + return -1; + + octet = *s - '0'; + + for (digits = 1; digits < length; digits++) { + if (s[digits] >= '0' && s[digits] <= '9') { + octet *= 10; + octet += s[digits] - '0'; + if (octet > 255) + return -1; + } else + break; + } + } + + s += digits; + length -= digits; + + if (++count == 4) + break; + + if (length < 1 || *s != '.') + return -1; + + s++; + length--; + } + + return length ? -1 : 0; +} + +static int count_ipv6_pieces(const char **s, int *length) +{ + int count, digits; + + for (digits = 0; digits < 4 && digits < *length; digits++) { + if (!is_xdigit((*s)[digits])) + break; + } + + if (!digits) + return 0; + + (*s) += digits; + (*length) -= digits; + count = 1; + + if (*length && **s == ':') { + (*s)++; + (*length)--; + count += count_ipv6_pieces(s, length); + if (count == 1) { + (*s)--; + (*length)++; + } + } + + return count; +} + +static int validate_ip_literal(const char *ip_literal, int length) +{ + const char *s, *dot; + int len, no_pieces, elide; + + if (length >= 4 && ip_literal[0] == 'v') { + /* IPvFuture */ + dot = strchr(ip_literal + 2, '.'); + if (!dot || dot >= ip_literal + length) + return -1; + for (s = ip_literal + 1; s < dot; s++) { + if (!is_xdigit(*s)) + return -1; + } + for (s = dot + 1; s < ip_literal + length; s++) { + if (!is_unreserved(*s) && !is_sub_delim(*s) && *s != ':') + return -1; + } + } else { + /* IPv6address */ + s = ip_literal; + len = length; + no_pieces = count_ipv6_pieces(&s, &len); + + if (len > 1 && s[0] == ':' && s[1] == ':') { + s += 2; + len -= 2; + elide = 1; + no_pieces += count_ipv6_pieces(&s, &len); + } else + elide = 0; + + if (!validate_ipv4address(s, len)) + no_pieces += 2; + else if (len) + return -1; + + if (no_pieces > 8 || no_pieces == 8 && elide || no_pieces < 1) + return -1; + } + + return 0; +} + +static int validate_host(const char *host, struct razor_error **error) +{ + int retval; + + if (host[0] == '[' && host[strlen(host) - 1] == ']') + retval = validate_ip_literal(host + 1, strlen(host) - 2); + else { + retval = validate_ipv4address(host, strlen(host)); + if (retval < 0) + retval = validate_reg_name(host); + } + + if (retval) + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, host, + "Invalid URI host"); + + return retval; +} + +static char *strdown(char *s) +{ + while (*s) { + if (*s >= 'A' && *s <= 'Z') { + *s -= 'A'; + *s += 'a'; + } + s++; + } + + return s; +} + +static int razor_uri_parse_authority(struct razor_uri *ru, + const char *authority, int length, + struct razor_error **error) +{ + const char *s, *auth = authority; + char *userinfo, *port, *host; + + s = strchr(auth, '@'); + if (s && s < auth + length) { + userinfo = razor_strndup(auth, s - auth); + s++; + length -= s - auth; + auth = s; + + if (validate_userinfo(userinfo, error)) { + free(userinfo); + return -1; + } + } else + userinfo = NULL; + + s = strchr(auth, ':'); + if (s && s < auth + length) { + s++; + port = razor_strndup(s, length - (s - auth)); + s--; + length = s - auth; + + if (strspn(port, "0123456789") != strlen(port)) { + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, port, + "Invalid URI port"); + free(userinfo); + free(port); + return -1; + } + } else + port = NULL; + + host = razor_strndup(auth, length); + + if (validate_host(host, error)) { + free(userinfo); + free(port); + free(host); + return -1; + } + + ru->userinfo = userinfo; + ru->port = port; + ru->host = host; + + return 0; +} + +/* + * Parse either a hier-part or a relative-part + */ +static int razor_uri_parse_part(struct razor_uri *ru, const char *part, + int relative_part, struct razor_error **error) +{ + const char *s, *hp = part; + char *path, *p; + int noscheme = 0; + + if (hp[0] == '/' && hp[1] == '/') { + hp += 2; + s = strpbrk(hp, "/?#"); + if (!s) + s = hp + strlen(hp); + if (razor_uri_parse_authority(ru, hp, s - hp, error) < 0) + return -1; + hp = s; + } else { + ru->userinfo = NULL; + ru->host = NULL; + ru->port = NULL; + } + + if (!*hp) { + /* path-empty */ + ru->path = strdup(""); + return 0; + } else if (*hp == '/') { + /* path-absolute */ + p = path = malloc(strlen(hp) + 1); + *p++ = '/'; + hp++; + if (!*hp) { + *p++ = '\0'; + ru->path = realloc(path, p - path); + return 0; + } + } else if (!ru->host) { + /* path-rootless or path-noscheme */ + noscheme = relative_part; + p = path = malloc(strlen(hp) + 1); + } else { + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, part, + relative_part ? "Invalid URI relative part" : + "Invalid URI hierarchical part"); + return -1; + } + + if (!is_pchar(*hp) || noscheme && *hp == ':') { + free(path); + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, part, + "Invalid character in URI path"); + return -1; + } + *p++ = *hp++; + + while (*hp) { + if (*hp == '/') + noscheme = 0; + else if (!is_pchar(*hp) || noscheme && *hp == ':') { + free(path); + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, part, + "Invalid character in URI path"); + return -1; + } + *p++ = *hp++; + } + + *p++ = '\0'; + + ru->path = realloc(path, p - path); + + return 0; +} + +void razor_uri_destroy(struct razor_uri *ru) +{ + free(ru->scheme); + free(ru->userinfo); + free(ru->host); + free(ru->port); + free(ru->path); + free(ru->query); + free(ru->fragment); +} + +int razor_uri_parse_uri(struct razor_uri *ru, const char *uri, int absolute, + struct razor_error **error) +{ + int r; + const char *s; + char *hier_part; + + if (pct_encoding_validate(uri) < 0) { + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, uri, + "Invalid percent encoding"); + return -1; + } + + memset(ru, 0, sizeof(*ru)); + + s = skip_uri_scheme(uri); + if (!s) { + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, uri, + "Invalid URI scheme"); + return -1; + } + ru->scheme = razor_strndup(uri, s - uri); + uri = s + 1; + + s = strchr(uri, '?'); + if (!s) + s = strchr(uri, '#'); + if (!s) + s = uri + strlen(uri); + hier_part = razor_strndup(uri, s - uri); + uri = s; + + r = razor_uri_parse_part(ru, hier_part, 0, error); + free(hier_part); + if (r) { + razor_uri_destroy(ru); + return -1; + } + + if (*uri != '?') + ru->query = NULL; + else { + uri++; + s = strchr(uri, '#'); + if (!s) + s = uri + strlen(uri); + ru->query = razor_strndup(uri, s - uri); + uri = s; + } + + if (*uri != '#') + ru->fragment = NULL; + else if (absolute) { + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, uri, + "Fragments are not allowed in absolute URIs"); + razor_uri_destroy(ru); + return -1; + } else { + uri++; + ru->fragment = strdup(uri); + } + + return 0; +} + +int razor_uri_parse_relative_ref(struct razor_uri *ru, const char *uri, + struct razor_error **error) +{ + int r; + const char *s; + char *relative_part; + + if (pct_encoding_validate(uri) < 0) { + razor_set_error(error, RAZOR_GENERAL_ERROR, + RAZOR_GENERAL_ERROR_BAD_URI, uri, + "Invalid percent encoding"); + return -1; + } + + memset(ru, 0, sizeof(*ru)); + + s = strchr(uri, '?'); + if (!s) + s = strchr(uri, '#'); + if (!s) + s = uri + strlen(uri); + relative_part = razor_strndup(uri, s - uri); + uri = s; + + r = razor_uri_parse_part(ru, relative_part, 1, error); + free(relative_part); + if (r) + return -1; + + if (*uri == '?') { + uri++; + s = strchr(uri, '#'); + if (!s) + s = uri + strlen(uri); + ru->query = razor_strndup(uri, s - uri); + uri = s; + } else + ru->query = NULL; + + if (*uri == '#') { + uri++; + ru->fragment = strdup(uri); + } else + ru->fragment = NULL; + + return 0; +} + +int razor_uri_parse(struct razor_uri *ru, const char *uri, + struct razor_error **error) +{ + struct razor_error *tmp_error = NULL; + int r; + + r = razor_uri_parse_uri(ru, uri, 0, &tmp_error); + if (r < 0) { + r = razor_uri_parse_relative_ref(ru, uri, NULL); + if (r < 0) + razor_propagate_error(error, tmp_error, NULL); + else + razor_error_free(tmp_error); + } + + return r; +} + +/* + * Following RFC 3986 § 5.2.4 + */ +static char *remove_dot_segments(const char *path) +{ + struct array output; + char *input, *in, *s, *t; + const char *step; + +#ifdef DEBUG + fprintf(stderr, "STEP OUTPUT BUFFER INPUT BUFFER\n"); +#endif + + input = strdup(path); + in = input; + string_init(&output); + +#ifdef DEBUG + fprintf(stderr, " 1 : %-21s %s\n", string_str(&output), in); +#endif + + while (*in) { + if (str_has_prefix(in, "../")) { + step = "2A"; + in += 3; + } else if (str_has_prefix(in, "./")) { + step = "2A"; + in += 2; + } else if (str_has_prefix(in, "/./")) { + step = "2B"; + in += 2; + } else if (!strcmp(in, "/.")) { + step = "2B"; + in++; + *in = '/'; + } else if (str_has_prefix(in, "/../")) { + step = "2C"; + in += 3; + s = strrchr(string_str(&output), '/'); + if (!s) + s = string_str(&output); + string_truncate_at(&output, s); + } else if (!strcmp(in, "/..")) { + step = "2C"; + in += 2; + *in = '/'; + s = strrchr(string_str(&output), '/'); + if (!s) + s = string_str(&output); + string_truncate_at(&output, s); + } else if (!strcmp(in, ".") || !strcmp(in, "..")) { + step = "2D"; + in += strlen(in); + } else { + step = "2E"; + t = strchr(in + 1, '/'); + if (!t) + t = in + strlen(in); + string_append_len(&output, in, t - in); + in = t; + } +#ifdef DEBUG + fprintf(stderr, " %s: %-21s %s\n", step, string_str(&output), + in); +#endif + } + + free(input); + return string_str(&output); +} + + +/* + * Following RFC 3986 § 6.2.2 + */ +void razor_uri_normalize(struct razor_uri *ru) +{ + char *s; + + strdown(ru->scheme); + if (ru->host) + strdown(ru->host); + + s = pct_encoding_normalize(ru->userinfo); + free(ru->userinfo); + ru->userinfo = s; + + s = pct_encoding_normalize(ru->host); + free(ru->host); + ru->host = s; + + s = pct_encoding_normalize(ru->path); + free(ru->path); + ru->path = s; + + s = pct_encoding_normalize(ru->query); + free(ru->query); + ru->query = s; + + s = pct_encoding_normalize(ru->fragment); + free(ru->fragment); + ru->fragment = s; + + s = remove_dot_segments(ru->path); + free(ru->path); + ru->path = s; +} + +char *razor_uri_get_authority(const struct razor_uri *ru) +{ + char *result, *r; + int len = 1; + + if (ru->host) { + if (ru->userinfo) + len += strlen(ru->userinfo) + 1; + len += strlen(ru->host); + if (ru->port) + len += strlen(ru->port) + 1; + } else + return NULL; + + r = result = malloc(len); + + if (ru->userinfo) { + strcpy(r, ru->userinfo); + r += strlen(r); + *r++ = '@'; + } + + strcpy(r, ru->host); + r += strlen(r); + + if (ru->port) { + *r++ = ':'; + strcpy(r, ru->port); + } + + return result; +} + +/* + * Following RFC 3986 § 5.3 + */ +char *razor_uri_recompose(const struct razor_uri *ru) +{ + char *authority, *result, *r; + int len = 1; + + authority = razor_uri_get_authority(ru); + + if (ru->scheme) + len += strlen(ru->scheme) + 1; + if (authority) + len += strlen(authority) + 2; + len += strlen(ru->path); + if (ru->query) + len += strlen(ru->query) + 1; + if (ru->fragment) + len += strlen(ru->fragment) + 1; + + r = result = malloc(len); + + if (ru->scheme) { + strcpy(r, ru->scheme); + r += strlen(r); + *r++ = ':'; + } + + if (authority) { + *r++ = '/'; + *r++ = '/'; + strcpy(r, authority); + free(authority); + r += strlen(r); + } + + strcpy(r, ru->path); + r += strlen(r); + + if (ru->query) { + *r++ = '?'; + strcpy(r, ru->query); + r += strlen(r); + } + + if (ru->fragment) { + *r++ = '#'; + strcpy(r, ru->fragment); + } + + return result; +} + +/* + * Following RFC 3986 § 5.2.3 + */ +static char *merge_paths(const struct razor_uri *base,const struct razor_uri *R) +{ + char *s, *t, *path; + + if (base->host && !*base->path) + path = razor_concat("/", R->path, NULL); + else { + s = strrchr(base->path, '/'); + if (s) { + t = razor_strndup(base->path, s + 1 - base->path); + path = razor_concat(t, R->path, NULL); + free(t); + } else + path = strdup(R->path); + } + + return path; +} + +/* + * Following RFC 3986 § 5.2 + */ +void razor_uri_resolve(struct razor_uri *T, const struct razor_uri *base, + const struct razor_uri *R) +{ + char *s; + + if (R->scheme) { + T->scheme = strdup(R->scheme); + T->userinfo = strdup0(R->userinfo); + T->host = strdup0(R->host); + T->port = strdup0(R->port); + T->path = remove_dot_segments(R->path); + T->query = strdup0(R->query); + } else { + if (R->host) { + T->userinfo = strdup0(R->userinfo); + T->host = strdup0(R->host); + T->port = strdup0(R->port); + T->path = remove_dot_segments(R->path); + T->query = strdup0(R->query); + } else { + if (!*R->path) { + T->path = strdup(base->path); + if (R->query) + T->query = strdup(R->query); + else + T->query = strdup0(base->query); + } else { + if (*R->path == '/') + T->path = remove_dot_segments(R->path); + else { + s = merge_paths(base, R); + T->path = remove_dot_segments(s); + free(s); + } + T->query = strdup0(R->query); + } + T->userinfo = strdup0(base->userinfo); + T->host = strdup0(base->host); + T->port = strdup0(base->port); + } + T->scheme = strdup(base->scheme); + } + T->fragment = strdup0(R->fragment); +} + +/* + * This differs from razor_uri_resolve() both in the types of its arguments + * and in the fact that it takes a root URI rather than a base URI. The base + * URI is determined by appending a slash to the root URI (if it doesn't + * already end in a slash). Finally, uri can be explicitly marked as either + * relative (ie., a relative-ref) or not (ie., a URI). This is important as + * otherwise "c:/xxx" could be interpreted as a URI in the "c" scheme. + */ +char *razor_resolve_uri_root(const char *root_uri, const char *uri, + int is_relative, struct razor_error **error) +{ + int r; + char *base_uri, *s, *result; + struct razor_uri ru, base, file; + + if (!root_uri || !*root_uri) + root_uri = "file:/"; + + if (root_uri[strlen(root_uri) - 1] == '/') + base_uri = strdup(root_uri); + else + base_uri = razor_concat(root_uri, "/", NULL); + + r = razor_uri_parse_uri(&base, base_uri, 1, error); + free(base_uri); + if (r) + return NULL; + + if (is_relative > 0) { + /* + * We can't use razor_uri_parse_relative_ref() to parse + * uri in case it starts with a segment that includes a + * colon. Thus we use this kludge. + */ + s = razor_concat("scheme:", uri, NULL); + r = razor_uri_parse_uri(&file, s, 0, error); + free(s); + if (!r) { + free(file.scheme); + file.scheme = NULL; + } + } + else if (!is_relative) + r = razor_uri_parse_uri(&file, uri, 0, error); + else + r = razor_uri_parse(&file, uri, error); + if (r) { + razor_uri_destroy(&base); + return NULL; + } + + razor_uri_resolve(&ru, &base, &file); + + razor_uri_destroy(&base); + razor_uri_destroy(&file); + + result = razor_uri_recompose(&ru); + + razor_uri_destroy(&ru); + + return result; +}