fix: Escape characters that could break URL

2024-04-27 13:24:23 +01:00 · 2024-04-27 13:24:23 +01:00 · a2e13b6dc3
parent d6d4af0c8c
commit a2e13b6dc3
1 changed files with 91 additions and 1 deletions
--- a/src/link.c
+++ b/src/link.c
@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <regex.h>
 #define STATUS_LEN 64
@ -28,6 +29,7 @@ int ROOT_LINK_OFFSET = 0;
 */
 static pthread_mutex_t link_lock;
 static void make_link_relative(const char *page_url, char *link_url);
 static char *escape_full_url(const char *f_url);
 /**
 * \brief create a new Link
@ -273,11 +275,12 @@ static LinkTable *single_LinkTable_new(const char *url)
    return linktbl;
 }
-LinkTable *LinkSystem_init(const char *url)
+LinkTable *LinkSystem_init(const char *f_url)
 {
    if (pthread_mutex_init(&link_lock, NULL)) {
        lprintf(error, "link_lock initialisation failed!\n");
    }
    char *url = escape_full_url(f_url);
    int url_len = strnlen(url, MAX_PATH_LEN) - 1;
    /*
     * --------- Set the length of the root link -----------
@ -317,6 +320,7 @@ LinkTable *LinkSystem_init(const char *url)
    } else {
        lprintf(fatal, "Invalid CONFIG.mode\n");
    }
    FREE(url);
    return ROOT_LINK_TBL;
 }
@ -1177,3 +1181,89 @@ static void make_link_relative(const char *page_url, char *link_url)
       before it. */
    memmove(link_url, link_url + skip_len, strlen(link_url) - skip_len + 1);
 }
 /**
 * \brief Pattern matching strings for URLs entered by user.
 * \details The order is important as we match from most specific to least.
 * For example, 192.168.1.1:80 is more specific than 192.168.1.1, even though
 * visiting the latter refers to the former by default.
 *
 * The table is NULL-terminated. escape_full_url() compiles each entry as a
 * POSIX extended regular expression, tries them in array order and stops at
 * the first one that matches.
 *
 * The patterns are not anchored, and each octet is simply one to three
 * digits (no 0-255 range check). The port part is ":[0-9]*", so a colon
 * followed by no digits is accepted as well.
 */
 const char *const ip_patterns[] = {
    "([0-9]{1,3}\\.){3}[0-9]{1,3}:[0-9]*",  // IPv4 with port
    "([0-9]{1,3}\\.){3}[0-9]{1,3}",         // IPv4 without port
    NULL,
 };
 /**
 * \brief Percent-escape the part of a user-supplied URL that follows the
 * scheme and, if present, a leading IPv4 host.
 * \details Everything up to and including "://" is copied verbatim, as is an
 * IPv4 address (optionally with a port) that immediately follows it, together
 * with the slash after that address. The remainder is first unescaped and
 * then escaped again with libcurl, so input that is already percent-encoded
 * is not encoded twice. Escaped slashes ("%2F") are turned back into '/',
 * because decisions are made elsewhere based on the path separators.
 * \param[in] f_url the full URL as entered by the user
 * \return a buffer of MAX_PATH_LEN bytes obtained from CALLOC that holds the
 * escaped URL; the caller releases it. If an error is reported and
 * lprintf(fatal, ...) returns, the buffer holds an empty string.
 * \note NOTE(review): when the host is not an IPv4 literal, the host name is
 * escaped together with the path, so characters such as ':' (host:port) or
 * '?' are percent-encoded too - confirm this is intended.
 */
 static char *escape_full_url(const char *f_url)
 {
    /* Skip the scheme. Without "://" the whole string is examined, rather
     * than doing pointer arithmetic on a NULL strstr() result.
     */
    const char *const proto = strstr(f_url, "://");
    const char *url = proto ? proto + 3 : f_url;
    regmatch_t pmatch[1];
    for (int i = 0; ip_patterns[i]; i++) {
        regex_t regex;
        if (regcomp(&regex, ip_patterns[i], REG_EXTENDED | REG_ICASE)) {
            lprintf(fatal, "Could not compile regex\n");
            continue;
        }
        const int ret = regexec(&regex, url, 1, pmatch, 0);
        /* Release every compiled pattern, not only the last one. */
        regfree(&regex);
        /* The patterns are unanchored: only a match that starts right
         * after the scheme is the host, an address further down the path
         * is ordinary path content.
         */
        if (!ret && pmatch[0].rm_so == 0) {
            url += pmatch[0].rm_eo;
            if (*url == '/') {
                url++;
            }
            break;
        }
    }
    /* url now points just after the IP address (and its slash) or just
     * after the scheme; everything before it is kept as is.
     */
    const size_t prefix_len = (size_t) (url - f_url);
    char *const base_url = CALLOC(MAX_PATH_LEN, sizeof(char));
    CURL *c = curl_easy_init();
    char *unescaped_path = c ? curl_easy_unescape(c, url, 0, NULL) : NULL;
    char *escaped_path =
        unescaped_path ? curl_easy_escape(c, unescaped_path, 0) : NULL;
    curl_free(unescaped_path);
    if (!escaped_path) {
        lprintf(fatal, "Could not escape URL\n");
    } else if (prefix_len + strlen(escaped_path) >=
               (size_t) MAX_PATH_LEN - 1) {
        /* Prefix and escaped part together must fit; checked before
         * anything is written into base_url.
         */
        lprintf(fatal, "URL too long\n");
    } else {
        memcpy(base_url, f_url, prefix_len);
        /* curl_easy_escape does the correct thing and escapes whatever
         * may break the URL, but the slashes must be preserved. A literal
         * '%' is itself escaped as "%25", so every "%2F" seen here stands
         * for a slash of the original path.
         */
        char *out = base_url + prefix_len;
        const char *in = escaped_path;
        while (*in) {
            if (strncmp(in, "%2F", 3) == 0) {
                *out++ = '/';
                in += 3;
            } else {
                *out++ = *in++;
            }
        }
        *out = '\0';
    }
    curl_free(escaped_path);
    if (c) {
        curl_easy_cleanup(c);
    }
    return base_url;
 }