💾 Archived View for gemini.thededem.de › lc19 › src › src › url.c captured on 2024-07-08 at 23:38:31.

View Raw

More Information

⬅️ Previous capture (2021-12-03)

-=-=-=-=-=-=-

/* Copyright 2020, 2021 Lukas Wedeking
 *
 * This file is part of LC19.
 *
 * LC19 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * LC19 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with LC19.  If not, see <https://www.gnu.org/licenses/>.
 */

#define _XOPEN_SOURCE 500

#include "../include/url.h"

#include<assert.h>
#include<ctype.h>
#include<string.h>
#include<stdlib.h>
#include<stdio.h>

#include "../include/util.h"


/*
 * Tell whether url[offset] is a general delimiter, i.e. one of the
 * characters ':', '/', '?', '#', '[', ']' or '@'.
 *
 * max_length is only used to assert that offset lies inside the buffer.
 *
 * Returns 1 (the number of bytes matched) on a hit and 0 otherwise.
 */
size_t
url_char_is_gen_delim(const char *url, size_t offset, size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	switch (url[offset]) {
	case ':':
	case '/':
	case '?':
	case '#':
	case '[':
	case ']':
	case '@':
		return 1;
	default:
		return 0;
	}
}

/*
 * Tell whether url[offset] is a sub-delimiter, i.e. one of the characters
 * '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';' or '='.
 *
 * max_length is only used to assert that offset lies inside the buffer.
 *
 * Returns 1 (the number of bytes matched) on a hit and 0 otherwise.
 */
size_t
url_char_is_sub_delim(const char *url, size_t offset, size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	switch (url[offset]) {
	case '!':
	case '$':
	case '&':
	case '\'':
	case '(':
	case ')':
	case '*':
	case '+':
	case ',':
	case ';':
	case '=':
		return 1;
	default:
		return 0;
	}
}

/*
 * Tell whether url[offset] is a reserved character, meaning that either
 * url_char_is_gen_delim() or url_char_is_sub_delim() accepts it.
 *
 * Returns the (non-zero) result of whichever check matched first, or 0 if
 * neither did.
 */
size_t
url_char_is_reserved(const char *url, size_t offset, size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	const size_t gen = url_char_is_gen_delim(url, offset, max_length);
	if (gen != 0) {
		return gen;
	}
	/* Not a general delimiter: the sub-delimiter check decides. */
	return url_char_is_sub_delim(url, offset, max_length);
}

/*
 * Tell whether the character starting at url[offset] lies inside one of the
 * "ucschar" code point ranges listed in the table below.
 *
 * The bytes are handed to util_utf8_get_codepoint() together with the
 * remaining length (max_length - offset) and a three byte output buffer. The
 * buffer is then interpreted as a big-endian number, cp[0] being the most
 * significant byte.
 * NOTE(review): presumably the helper returns the number of bytes the
 * decoded character occupies; confirm against util.h.
 *
 * Returns the helper's return value if the code point is inside one of the
 * ranges, and 0 otherwise (including when the helper itself returns 0).
 */
size_t
url_char_is_ucschar(const char *url, size_t offset, size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	/* Inclusive code point ranges: { first, last }. */
	static const unsigned long ranges[][2] = {
		{ 0x000A0UL, 0x0D7FFUL },
		{ 0x0F900UL, 0x0FDCFUL },
		{ 0x0FDF0UL, 0x0FFEFUL },
		{ 0x10000UL, 0x1FFFDUL },
		{ 0x20000UL, 0x2FFFDUL },
		{ 0x30000UL, 0x3FFFDUL },
		{ 0x40000UL, 0x4FFFDUL },
		{ 0x50000UL, 0x5FFFDUL },
		{ 0x60000UL, 0x6FFFDUL },
		{ 0x70000UL, 0x7FFFDUL },
		{ 0x80000UL, 0x8FFFDUL },
		{ 0x90000UL, 0x9FFFDUL },
		{ 0xA0000UL, 0xAFFFDUL },
		{ 0xB0000UL, 0xBFFFDUL },
		{ 0xC0000UL, 0xCFFFDUL },
		{ 0xD0000UL, 0xDFFFDUL },
		{ 0xE1000UL, 0xEFFFDUL }
	};
	const size_t range_count = sizeof(ranges) / sizeof(ranges[0]);

	unsigned char cp[3] = { 0x0 };
	const size_t step = util_utf8_get_codepoint(url + offset,
			max_length - offset, cp);
	if (step == 0) {
		return 0;
	}

	/* Fold the three bytes into one number so ranges compare directly. */
	const unsigned long code_point = ((unsigned long) cp[0] << 16)
		| ((unsigned long) cp[1] << 8)
		| (unsigned long) cp[2];

	for (size_t k = 0; k < range_count; k++) {
		if (code_point >= ranges[k][0] && code_point <= ranges[k][1]) {
			return step;
		}
	}

	return 0;
}

/*
 * Tell whether the character starting at url[offset] is unreserved: an
 * alphanumeric character, one of '-', '.', '_', '~', or anything accepted by
 * url_char_is_ucschar().
 *
 * Returns 1 for the single-byte cases, otherwise whatever
 * url_char_is_ucschar() returns (0 if the character is not unreserved).
 */
size_t
url_char_is_unreserved(const char *url, size_t offset, size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	/*
	 * The <ctype.h> functions require a value representable as unsigned
	 * char; a plain char holding a byte >= 0x80 may be negative.
	 */
	const unsigned char c = (unsigned char) url[offset];

	if (isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~') {
		return 1;
	}
	return url_char_is_ucschar(url, offset, max_length);
}

/*
 * Tell whether the character starting at url[offset] lies inside one of the
 * "iprivate" (private use) code point ranges listed in the table below.
 *
 * The bytes are handed to util_utf8_get_codepoint() together with the
 * remaining length (max_length - offset) and a three byte output buffer. The
 * buffer is then interpreted as a big-endian number, cp[0] being the most
 * significant byte.
 * NOTE(review): presumably the helper returns the number of bytes the
 * decoded character occupies; confirm against util.h.
 *
 * Returns the helper's return value if the code point is inside one of the
 * ranges, and 0 otherwise (including when the helper itself returns 0).
 */
size_t
url_char_is_iprivate(const char *url, size_t offset, size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	/* Inclusive code point ranges: { first, last }. */
	static const unsigned long ranges[][2] = {
		{ 0x00E000UL, 0x00F8FFUL },
		{ 0x0F0000UL, 0x0FFFFDUL },
		{ 0x100000UL, 0x10FFFDUL }
	};
	const size_t range_count = sizeof(ranges) / sizeof(ranges[0]);

	unsigned char cp[3] = { 0x0 };
	const size_t step = util_utf8_get_codepoint(url + offset,
			max_length - offset, cp);
	if (step == 0) {
		return 0;
	}

	/* Fold the three bytes into one number so ranges compare directly. */
	const unsigned long code_point = ((unsigned long) cp[0] << 16)
		| ((unsigned long) cp[1] << 8)
		| (unsigned long) cp[2];

	for (size_t k = 0; k < range_count; k++) {
		if (code_point >= ranges[k][0] && code_point <= ranges[k][1]) {
			return step;
		}
	}

	return 0;
}

/*
 * Tell whether c is a hexadecimal digit (0-9, a-f or A-F).
 *
 * Returns 1 if it is and 0 otherwise.
 */
size_t
url_char_is_hex(char c)
{
	if (c >= '0' && c <= '9') {
		return 1;
	}
	if (c >= 'A' && c <= 'F') {
		return 1;
	}
	if (c >= 'a' && c <= 'f') {
		return 1;
	}
	return 0;
}

/*
 * Tell whether a percent-encoded triplet ('%' followed by two hexadecimal
 * digits) starts at url[offset].
 *
 * Returns 3 (the length of the triplet) if so and 0 otherwise. A triplet
 * whose last digit would sit at or beyond max_length is never accepted.
 */
size_t
url_char_is_pct_enc(const char *url, size_t offset, size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	/* All of offset, offset + 1 and offset + 2 must be below max_length. */
	if (offset + 2 >= max_length) {
		return 0;
	}
	if (url[offset] != '%') {
		return 0;
	}
	if (!url_char_is_hex(url[offset + 1])) {
		return 0;
	}
	if (!url_char_is_hex(url[offset + 2])) {
		return 0;
	}
	return 3;
}

/*
 * Locate the end of the scheme at the beginning of url.
 *
 * A scheme starts with a letter and continues with letters, digits, '+',
 * '-' or '.' until the ':' separator. At most max_length bytes are looked
 * at, and scanning also stops at a NUL byte.
 *
 * Returns the index of the ':' that terminates the scheme, or 0 if url does
 * not start with a valid scheme followed by ':' within max_length bytes.
 */
size_t
url_find_scheme_end(const char *url, const size_t max_length)
{
	assert(url != NULL);

	/* Do not touch url[0] if the caller gave us no bytes to look at. */
	if (max_length == 0 || !isalpha((unsigned char) url[0])) {
		return 0;
	}
	/* Check the bound first so url[i] is never read at i == max_length. */
	for (size_t i = 0; i < max_length && url[i] != 0; i++) {
		/*
		 * <ctype.h> functions need a value representable as unsigned
		 * char; a plain char may be negative for bytes >= 0x80.
		 */
		const unsigned char c = (unsigned char) url[i];

		/* Return, if the scheme separator is encountered. */
		if (c == ':') {
			return i;
		}
		/* Fail, if the scheme does contain invalid characters. */
		if (!(isalnum(c) || c == '+' || c == '-' || c == '.')) {
			return 0;
		}
	}
	return 0;
}

/*
 * Locate the end of the userinfo part that starts at url[offset].
 *
 * Userinfo may consist of unreserved characters, percent-encoded triplets,
 * sub-delimiters and ':'. It is only recognized if it is terminated by '@'.
 *
 * Returns the index of the terminating '@'. Returns offset if no '@' is
 * found before the end of the string / max_length, or if a character that
 * is not allowed in userinfo is encountered first.
 */
size_t
url_find_userinfo_end(const char *url, const size_t offset,
		const size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	size_t step = 0;
	/* Check the bound first so url[i] is never read at i == max_length. */
	for (size_t i = offset; i < max_length && url[i] != 0; i += step) {
		if (url[i] == '@') {
			return i;
		}

		/* The helpers report how many bytes the character occupies. */
		step = url_char_is_unreserved(url, i, max_length);
		if (step == 0) {
			step = url_char_is_pct_enc(url, i, max_length);
		}
		if (step == 0) {
			step = url_char_is_sub_delim(url, i, max_length);
		}
		if (step == 0) {
			if (url[i] != ':') {
				return offset;
			}
			step = 1;
		}
	}
	return offset;
}

/*
 * Locate the end of the host part that starts at url[offset].
 *
 * The host may consist of unreserved characters, percent-encoded triplets
 * and sub-delimiters. It ends at ':', '/', '?', '#', a carriage return, a
 * NUL byte or max_length, whichever comes first.
 *
 * Returns the index one past the last host character, or offset if a
 * character that is not allowed in a host is encountered.
 */
size_t
url_find_host_end(const char *url, const size_t offset,
		const size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	/* TODO: Support IPv6 hosts. */
	size_t step = 0;
	size_t i = offset;
	/* Check the bound first so url[i] is never read at i == max_length. */
	for (; i < max_length && url[i] != 0; i += step) {
		if (url[i] == ':' || url[i] == '/' || url[i] == '?'
				|| url[i] == '#' || url[i] == '\r') {
			return i;
		}

		/* The helpers report how many bytes the character occupies. */
		step = url_char_is_unreserved(url, i, max_length);
		if (step == 0) {
			step = url_char_is_pct_enc(url, i, max_length);
		}
		if (step == 0) {
			step = url_char_is_sub_delim(url, i, max_length);
		}
		if (step == 0) {
			return offset;
		}
	}
	return i;
}

/*
 * Locate the end of the port number that starts at url[offset].
 *
 * The port consists of decimal digits only. It ends at '/', '?', '#', a
 * carriage return, a NUL byte or max_length, whichever comes first.
 *
 * Returns the index one past the last digit, or offset if a character that
 * is neither a digit nor one of the separators is encountered.
 */
size_t
url_find_port_end(const char *url, const size_t offset,
		const size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	size_t i = offset;
	/* Check the bound first so url[i] is never read at i == max_length. */
	for (; i < max_length && url[i] != 0; i++) {
		/* Check if we hit a separator. */
		if (url[i] == '/' || url[i] == '?' || url[i] == '#'
				|| url[i] == '\r') {
			return i;
		}
		/*
		 * isdigit() needs a value representable as unsigned char; a
		 * plain char may be negative for bytes >= 0x80.
		 */
		if (!isdigit((unsigned char) url[i])) {
			return offset;
		}
	}
	return i;
}

/*
 * Locate the end of the path that starts at url[offset].
 *
 * The path may consist of unreserved characters, percent-encoded triplets,
 * sub-delimiters, '/', ':' and '@'. It ends at '?', '#', a carriage return,
 * a NUL byte or max_length, whichever comes first.
 *
 * If absolute is non-zero the path must start with '/'. If schemeless is
 * non-zero a ':' is rejected until the first '/' has been seen, i.e. the
 * first path segment must not contain a colon.
 *
 * Returns the index one past the last path character, or offset if the
 * path is invalid according to the rules above.
 */
size_t
url_find_path_end(const char *url, const size_t offset,
		const size_t max_length, int schemeless, int absolute)
{
	assert(url != NULL);
	assert(offset < max_length);

	if (absolute && url[offset] != '/') {
		return offset;
	}
	size_t step = 0;
	size_t i = offset;
	/* Check the bound first so url[i] is never read at i == max_length. */
	for (; i < max_length && url[i] != 0; i += step) {
		if (url[i] == '?' || url[i] == '#' || url[i] == '\r') {
			return i;
		}
		if (schemeless) {
			if (url[i] == ':') {
				return offset;
			} else if (url[i] == '/') {
				/* First segment is over; colons are fine now. */
				schemeless = 0;
			}
		}

		/* The helpers report how many bytes the character occupies. */
		step = url_char_is_unreserved(url, i, max_length);
		if (step == 0) {
			step = url_char_is_pct_enc(url, i, max_length);
		}
		if (step == 0) {
			step = url_char_is_sub_delim(url, i, max_length);
		}
		if (step == 0) {
			if (url[i] != '/' && url[i] != ':' && url[i] != '@') {
				return offset;
			}
			step = 1;
		}
	}
	return i;
}

/*
 * Locate the end of the query that starts at url[offset].
 *
 * The query may consist of unreserved characters, percent-encoded triplets,
 * sub-delimiters, '/', '?', ':', '@' and characters accepted by
 * url_char_is_iprivate(). It ends at '#', a carriage return, a NUL byte or
 * max_length, whichever comes first.
 *
 * Returns the index one past the last query character, or offset if a
 * character that is not allowed in a query is encountered.
 */
size_t
url_find_query_end(const char *url, const size_t offset,
		const size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	size_t step = 0;
	size_t i = offset;
	/* Check the bound first so url[i] is never read at i == max_length. */
	for (; i < max_length && url[i] != 0; i += step) {
		if (url[i] == '#' || url[i] == '\r') {
			return i;
		}

		/* The helpers report how many bytes the character occupies. */
		step = url_char_is_unreserved(url, i, max_length);
		if (step == 0) {
			step = url_char_is_pct_enc(url, i, max_length);
		}
		if (step == 0) {
			step = url_char_is_sub_delim(url, i, max_length);
		}
		if (step == 0) {
			if (url[i] == '/' || url[i] == '?' || url[i] == ':'
					|| url[i] == '@') {
				step = 1;
			} else {
				step = url_char_is_iprivate(url, i, max_length);
				if (step == 0) {
					return offset;
				}
			}
		}
	}
	return i;
}

/*
 * Locate the end of the fragment that starts at url[offset].
 *
 * The fragment may consist of unreserved characters, percent-encoded
 * triplets, sub-delimiters, '/', '?', ':' and '@'. It ends at a carriage
 * return, a NUL byte or max_length, whichever comes first.
 *
 * Returns the index one past the last fragment character, or offset if a
 * character that is not allowed in a fragment is encountered.
 */
size_t
url_find_fragment_end(const char *url, const size_t offset,
		const size_t max_length)
{
	assert(url != NULL);
	assert(offset < max_length);

	size_t step = 0;
	size_t i = offset;
	/* Check the bound first so url[i] is never read at i == max_length. */
	for (; i < max_length && url[i] != 0; i += step) {
		if (url[i] == '\r') {
			return i;
		}

		/* The helpers report how many bytes the character occupies. */
		step = url_char_is_unreserved(url, i, max_length);
		if (step == 0) {
			step = url_char_is_pct_enc(url, i, max_length);
		}
		if (step == 0) {
			step = url_char_is_sub_delim(url, i, max_length);
		}
		if (step == 0) {
			if (url[i] != '/' && url[i] != '?' && url[i] != ':'
					&& url[i] != '@') {
				return offset;
			}
			step = 1;
		}
	}
	return i;
}

/*
 * Copy the bytes url[start] .. url[end - 1] into a freshly allocated,
 * NUL-terminated string.
 *
 * Returns NULL if the range is empty (start == end) or if the allocation
 * fails. Otherwise the caller owns the returned string and must free() it.
 */
char*
url_part_alloc(const char *url, size_t start, size_t end)
{
	assert(url != NULL);
	assert(start <= end);

	const size_t length = end - start;
	if (length == 0) {
		return NULL;
	}
	/* calloc() zero-fills, so the copy below is always terminated. */
	char *part = calloc(length + 1, sizeof(char));
	if (part == NULL) {
		return NULL;
	}
	strncpy(part, url + start, length);
	return part;
}

/*
 * Return the numeric value (0-15) of the hexadecimal digit c, or -1 if c is
 * not a hexadecimal digit.
 */
static int
url_hex_digit_value(char c)
{
	if (c >= '0' && c <= '9') {
		return c - '0';
	}
	if (c >= 'a' && c <= 'f') {
		return c - 'a' + 10;
	}
	if (c >= 'A' && c <= 'F') {
		return c - 'A' + 10;
	}
	return -1;
}

/*
 * Decode all percent-encoded triplets ("%XX") of the string s in place.
 *
 * A '%' that is not followed by two hexadecimal digits is copied verbatim.
 * Because the decoded string is never longer than the input, the result is
 * written over the input; every byte between the new end of the string and
 * the old terminator is set to NUL.
 *
 * NOTE: "%00" decodes to a NUL byte, which makes the result appear to end at
 * that position when it is used as a C string.
 */
void
url_decode_pct(char *s)
{
	assert(s != NULL);

	size_t r = 0; /* Read position. */
	size_t w = 0; /* Write position, never ahead of r. */
	while (s[r] != 0x0) {
		int hi = -1;
		int lo = -1;
		if (s[r] == '%') {
			/*
			 * Only look at s[r + 2] if s[r + 1] is a hex digit,
			 * so we never read past the terminating NUL.
			 */
			hi = url_hex_digit_value(s[r + 1]);
			if (hi >= 0) {
				lo = url_hex_digit_value(s[r + 2]);
			}
		}
		if (lo >= 0) {
			s[w] = (char) (hi * 16 + lo);
			r += 3;
		} else {
			s[w] = s[r];
			r++;
		}
		w++;
	}
	/*
	 * Terminate the result explicitly and wipe the leftover tail so no
	 * stale input bytes (e.g. a '%') remain behind the decoded string.
	 */
	memset(s + w, 0x0, r - w);
}

/*
 * Remove "." and ".." segments from path in place and collapse runs of
 * consecutive slashes into a single slash.
 *
 * The result is never longer than the input and is NUL-terminated.
 */
void
url_normalize_path(char *path)
{
	assert(path != NULL);

	/*
	 * The for-loop implements the algorithm at 5.2.4 of RFC 3986. Input and
	 * output buffer of the algorithm are realized through two different
	 * indices of the path string: r(read) and w(write). The annotated cases
	 * below are those listed in the RFC.
	 *
	 * Cases A and D talk about the input *beginning* with "./", "../",
	 * "." or "..". Since this loop advances byte by byte, those cases must
	 * only be tried while nothing but leading "./" and "../" prefixes has
	 * been consumed; otherwise names that merely end in a dot (e.g.
	 * "/a./b" or "/file.") would be mangled.
	 */
	size_t w = 0;
	int leading = 1;
	for (size_t r = 0; path[r] != 0; r++) {
		const int at_start = leading;
		leading = 0;

		if (at_start && path[r] == '.' && path[r + 1] == '/') {
			/* Case A. */
			r++;
			leading = 1;
			continue;
		} else if (at_start && path[r] == '.' && path[r + 1] == '.'
				&& path[r + 2] == '/') {
			/* Case A. */
			r += 2;
			leading = 1;
			continue;
		} else if (path[r] == '/' && path[r + 1] == '.'
				&& path[r + 2] == '/') {
			/* Case B. */
			r++;
			continue;
		} else if (path[r] == '/' && path[r + 1] == '.'
				&& path[r + 2] == 0x0) {
			/* Case B. */
			path[w] = path[r];
			r++;
			w++;
			continue;
		} else if (path[r] == '/' && path[r + 1] == '.'
				&& path[r + 2] == '.' && path[r + 3] == '/') {
			/* Case C: drop the last segment written so far. */
			r += 2;
			while (w > 0) {
				w--;
				if (path[w] == '/') {
					break;
				}
			}
			continue;
		} else if (path[r] == '/' && path[r + 1] == '.'
				&& path[r + 2] == '.' && path[r + 3] == 0x0) {
			/* Case C: drop the last segment, keep a trailing '/'. */
			r += 2;
			while (w > 0) {
				w--;
				if (path[w] == '/') {
					break;
				}
			}
			path[w] = '/';
			w++;
			continue;
		} else if (at_start && path[r] == '.' && path[r + 1] == '.'
				&& path[r + 2] == 0x0) {
			/* Case D. */
			break;
		} else if (at_start && path[r] == '.' && path[r + 1] == 0x0) {
			/* Case D. */
			break;
		} else if (path[r] == '/' && path[r + 1] == '/') {
			/*
			 * This last case is not part of the algorithm from the
			 * RFC. If the path contains multiple consecutive
			 * slashes, we replace them with a single slash.
			 */
			continue;
		}

		/* Case E. */
		if (w != r) {
			path[w] = path[r];
		}
		w++;
	}
	path[w] = 0x0;
}

/*
 * Release a Url structure together with every string it owns.
 *
 * Passing NULL is allowed and does nothing, mirroring free().
 */
void
url_destroy(struct Url *url)
{
	if (url == NULL) {
		return;
	}
	free(url->scheme);
	free(url->userinfo);
	free(url->host);
	free(url->port);
	free(url->path);
	free(url->path_orig);
	free(url->query);
	free(url->fragment);
	free(url->junk);
	free(url);
}

/*
 * Parse the request string url (looking at no more than max_length bytes)
 * into a newly allocated Url structure.
 *
 * Parts that are absent from the input are left NULL, with two exceptions:
 * the port defaults to "1965" and the path defaults to "/". Scheme and host
 * are lowercased; host, path, query and fragment are percent-decoded. The
 * path is additionally normalized, while path_orig holds the decoded path
 * re-encoded by url_pct_encode_path() without normalization.
 *
 * Whatever follows the last parsed part is expected to be "\r\n". If it is
 * something else it is stored in junk; if nothing follows at all, junk is
 * set to "too long".
 *
 * Returns the new structure, which the caller releases with url_destroy(),
 * or NULL if a required allocation fails.
 */
struct Url*
url_create(const char *url, size_t max_length)
{
	assert(url != NULL);

	struct Url *result = calloc(1, sizeof(struct Url));
	if (result == NULL) {
		return NULL;
	}
	result->scheme = NULL;
	result->userinfo = NULL;
	result->host = NULL;
	result->port = NULL;
	result->path = NULL;
	result->path_orig = NULL;
	result->query = NULL;
	result->fragment = NULL;
	result->junk = NULL;

	size_t part_start = 0;
	size_t part_end = url_find_scheme_end(url, max_length);
	result->scheme = url_part_alloc(url, part_start, part_end);
	if (result->scheme != NULL) {
		for (size_t i = 0; result->scheme[i] != 0x0; i++) {
			/* tolower() needs an unsigned char value. */
			result->scheme[i] = (char) tolower(
					(unsigned char) result->scheme[i]);
		}
	}
	if (part_start != part_end) {
		part_start = part_end + 1; /* Skip the colon. */
	}

	if (url[part_start] == '/' && url[part_start + 1] == '/') {
		part_start += 2; /* Skip the slashes. */

		part_end = url_find_userinfo_end(url, part_start, max_length);
		result->userinfo = url_part_alloc(url, part_start, part_end);
		if (part_start != part_end) {
			part_start = part_end + 1; /* Skip the @ sign. */
		}

		part_end = url_find_host_end(url, part_start, max_length);
		result->host = url_part_alloc(url, part_start, part_end);
		if (result->host != NULL) {
			url_decode_pct(result->host);
			for (size_t i = 0; result->host[i] != 0x0; i++) {
				/*
				 * The decoded host may contain bytes >= 0x80,
				 * which are negative as plain char; tolower()
				 * needs an unsigned char value.
				 */
				result->host[i] = (char) tolower(
						(unsigned char) result->host[i]);
			}
		}

		if (url[part_end] == ':') {
			part_start = part_end + 1; /* Omit the colon. */
			part_end = url_find_port_end(url, part_start, max_length);
			result->port = url_part_alloc(url, part_start, part_end);
		}
	}
	if (result->port == NULL) {
		/* No (valid) port given: fall back to the default port. */
		result->port = calloc(5, sizeof(char));
		if (result->port == NULL) {
			goto fail;
		}
		strcpy(result->port, "1965");
	}

	part_start = part_end;
	part_end = url_find_path_end(url, part_start, max_length,
			result->scheme == NULL, result->host != NULL);
	result->path = url_part_alloc(url, part_start, part_end);
	if (result->path != NULL) {
		url_decode_pct(result->path);
		url_normalize_path(result->path);
		char *orig_path_tmp = url_part_alloc(url, part_start, part_end);
		if (orig_path_tmp == NULL) {
			goto fail;
		}
		url_decode_pct(orig_path_tmp);
		result->path_orig = url_pct_encode_path(orig_path_tmp, 0);
		free(orig_path_tmp);
	} else {
		/* No path given: use the root path. */
		result->path = calloc(2, sizeof(char));
		if (result->path == NULL) {
			goto fail;
		}
		result->path[0] = '/';
		result->path_orig = calloc(2, sizeof(char));
		if (result->path_orig == NULL) {
			goto fail;
		}
		result->path_orig[0] = '/';
	}


	if (url[part_end] == '?') {
		part_start = part_end + 1; /* Omit the question mark. */
		part_end = url_find_query_end(url, part_start, max_length);
		result->query = url_part_alloc(url, part_start, part_end);
		if (result->query != NULL) {
			url_decode_pct(result->query);
		}
	}

	if (url[part_end] == '#') {
		part_start = part_end + 1; /* Omit the hash sign. */
		part_end = url_find_fragment_end(url, part_start, max_length);
		result->fragment = url_part_alloc(url, part_start, part_end);
		if (result->fragment != NULL) {
			url_decode_pct(result->fragment);
		}
	}

	/* Advance part_end to the end of the input. */
	part_start = part_end;
	for (; part_end < max_length && url[part_end] != 0; part_end++) {
	}
	/*
	 * If start and end match, we reached the end of the request string
	 * without any characters left to consume. This means that there is no
	 * CRLF line termination, so we cannot be sure that we actually read
	 * the whole request; it may exceed the maximum length that we read.
	 *
	 * To mark that there is something wrong, we add junk to the parsed URL
	 * struct.
	 */
	if (part_start == part_end) {
		result->junk = url_part_alloc("too long", 0, 8);
	} else if (strncmp(url + part_start, "\r\n", 2) != 0) {
		result->junk = url_part_alloc(url, part_start, part_end);
	}

	return result;

fail:
	/* Out of memory: release everything allocated so far. */
	free(result->scheme);
	free(result->userinfo);
	free(result->host);
	free(result->port);
	free(result->path);
	free(result->path_orig);
	free(result->query);
	free(result->fragment);
	free(result->junk);
	free(result);
	return NULL;
}

/*
 * Return a newly allocated copy of the path s in which every byte that may
 * not appear literally in a path is replaced by a "%XX" triplet (two
 * uppercase hexadecimal digits).
 *
 * Copied literally are: unreserved characters that are not accepted by
 * url_char_is_ucschar(), sub-delimiters, '@', '/' and ':'. If schemeless is
 * non-zero and the path is relative, a ':' inside the first path segment is
 * encoded as well.
 *
 * Returns NULL if s is NULL or if the allocation fails. Otherwise the caller
 * owns the returned string and must free() it.
 */
char*
url_pct_encode_path(const char *s, int schemeless)
{
	static const char hex_digits[] = "0123456789ABCDEF";

	if (s == NULL) {
		return NULL;
	}

	size_t s_len = strlen(s);
	size_t enc_len = s_len * 3 + 1; /* At most we need to encode every
					   character + NUL. */
	char *enc = calloc(enc_len, sizeof(char));
	if (enc == NULL) {
		return NULL;
	}

	int absolute = s_len > 0 && s[0] == '/';
	/*
	 * An absolute path starts with an empty first segment, so there is no
	 * first segment in which a colon would have to be encoded. Only a
	 * relative, schemeless path needs colons encoded up to its first
	 * slash.
	 */
	if (absolute) {
		schemeless = 0;
	}

	size_t w = 0;
	for (size_t r = 0; s[r] != 0; r++) {
		if (schemeless && s[r] == '/') {
			schemeless = 0;
		}

		const int literal = (url_char_is_unreserved(s, r, s_len)
					&& !url_char_is_ucschar(s, r, s_len))
				|| url_char_is_sub_delim(s, r, s_len)
				|| (s[r] == ':' && !schemeless)
				|| s[r] == '@'
				|| s[r] == '/';

		if (literal) {
			enc[w] = s[r];
			w++;
		} else {
			/*
			 * Always emit exactly two hex digits. A single digit
			 * for bytes below 0x10 would leave a NUL in the output
			 * and cut the encoded string short.
			 */
			const unsigned char byte = (unsigned char) s[r];
			enc[w] = '%';
			w++;
			enc[w] = hex_digits[byte >> 4];
			w++;
			enc[w] = hex_digits[byte & 0x0f];
			w++;
		}
	}

	return enc;
}

/*
 * Convert the two hexadecimal digits of every percent-encoded triplet in s
 * to upper case, in place. Everything else is left untouched.
 */
void
url_normalize_pct_encoding(char *s)
{
	const size_t length = strlen(s);
	size_t pos = 0;

	while (pos < length) {
		if (url_char_is_pct_enc(s, pos, length) != 0) {
			/* digits points at the two characters after '%'. */
			char *digits = s + pos + 1;
			digits[0] = toupper(digits[0]);
			digits[1] = toupper(digits[1]);
		}
		pos++;
	}
}