From 7363adaf124e3900604095fe9edd48edcb32c8ae Mon Sep 17 00:00:00 2001
From: Jonathan Kamens
Date: Sun, 3 Sep 2023 16:03:00 -0400
Subject: [PATCH] Handle sites that put unencoded characters in URLs that curl dislikes

Some sites put unencoded characters in their href attributes that
really should be encoded, most notably spaces. Curl won't accept a URL
with a space in it, and perhaps other such characters as well. Address
this by properly encoding characters in URLs before feeding them to
Curl.
---
 src/link.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/link.c b/src/link.c
index 2122570..d82fa87 100644
--- a/src/link.c
+++ b/src/link.c
@@ -465,23 +465,40 @@ void Link_set_file_stat(Link *this_link, CURL *curl)
 
 static void LinkTable_fill(LinkTable *linktbl)
 {
+    CURL *c = curl_easy_init();
     Link *head_link = linktbl->links[0];
     lprintf(debug, "Filling %s\n", head_link->f_url);
     for (int i = 1; i < linktbl->num; i++) {
         Link *this_link = linktbl->links[i];
-        char *url;
-        url = path_append(head_link->f_url, this_link->linkpath);
+        /* Some web sites use characters in their href attributes that really
+           shouldn't be in their href attributes, most commonly spaces. And
+           some web sites _do_ properly encode their href attributes. So we
+           first unescape the link path, and then we escape it, so that curl
+           will definitely be happy with it (e.g., curl won't accept URLs with
+           spaces in them!). If we only escaped it, and there were already
+           encoded characters in it, then that would break the link. */
+        char *unescaped_path = curl_easy_unescape(c, this_link->linkpath, 0,
+                                                  NULL);
+        char *escaped_path = curl_easy_escape(c, unescaped_path, 0);
+        curl_free(unescaped_path);
+        /* Our code does the wrong thing if there's a trailing slash that's been
+           replaced with %2F, which curl_easy_escape does, God bless it, so if
+           it did that then let's put it back. */
+        int escaped_len = strlen(escaped_path);
+        if (escaped_len >= 3 && !strcmp(escaped_path + escaped_len - 3, "%2F"))
+            strcpy(escaped_path + escaped_len - 3, "/");
+        char *url = path_append(head_link->f_url, escaped_path);
+        curl_free(escaped_path);
         strncpy(this_link->f_url, url, MAX_PATH_LEN);
         FREE(url);
         char *unescaped_linkname;
-        CURL *c = curl_easy_init();
         unescaped_linkname = curl_easy_unescape(c, this_link->linkname,
                                                 0, NULL);
         strncpy(this_link->linkname, unescaped_linkname, MAX_FILENAME_LEN);
         curl_free(unescaped_linkname);
-        curl_easy_cleanup(c);
     }
     LinkTable_uninitialised_fill(linktbl);
+    curl_easy_cleanup(c);
 }
 
 /**