From 7363adaf124e3900604095fe9edd48edcb32c8ae Mon Sep 17 00:00:00 2001
From: Jonathan Kamens <jik@kamens.us>
Date: Sun, 3 Sep 2023 16:03:00 -0400
Subject: [PATCH] Handle sites that put unencoded characters in URLs that curl
 dislikes

Some sites put characters in their href attributes that really should
be percent-encoded, most notably spaces. curl won't accept a URL with
a space in it, and may reject other such characters as well. Address
this by decoding each link path and then re-encoding it before feeding
it to curl, so that links which are already encoded are not encoded
twice.
---
 src/link.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/link.c b/src/link.c
index 2122570..d82fa87 100644
--- a/src/link.c
+++ b/src/link.c
@@ -465,23 +465,40 @@ void Link_set_file_stat(Link *this_link, CURL *curl)
 
 static void LinkTable_fill(LinkTable *linktbl)
 {
+    CURL *c = curl_easy_init();
     Link *head_link = linktbl->links[0];
     lprintf(debug, "Filling %s\n", head_link->f_url);
     for (int i = 1; i < linktbl->num; i++) {
         Link *this_link = linktbl->links[i];
-        char *url;
-        url = path_append(head_link->f_url, this_link->linkpath);
+        /* Hrefs may or may not already be percent-encoded, and curl rejects
+           raw characters such as spaces. Decode first, then re-encode, so
+           links that were already encoded are not encoded twice. */
+        char *unescaped_path = curl_easy_unescape(c, this_link->linkpath, 0,
+                                                  NULL);
+        /* Either curl call can return NULL on failure; never pass NULL on. */
+        char *escaped_path = unescaped_path ?
+            curl_easy_escape(c, unescaped_path, 0) : NULL;
+        curl_free(unescaped_path);
+        /* curl_easy_escape() also encodes a trailing '/' as "%2F", which the
+           rest of this code mishandles, so put the slash back. */
+        size_t escaped_len = escaped_path ? strlen(escaped_path) : 0;
+        if (escaped_len >= 3 && !strcmp(escaped_path + escaped_len - 3, "%2F"))
+            strcpy(escaped_path + escaped_len - 3, "/");
+        /* If re-encoding failed, fall back to the link path as given. */
+        char *url = path_append(head_link->f_url,
+                                escaped_path ? escaped_path
+                                             : this_link->linkpath);
+        curl_free(escaped_path);
         strncpy(this_link->f_url, url, MAX_PATH_LEN);
         FREE(url);
         char *unescaped_linkname;
-        CURL *c = curl_easy_init();
         unescaped_linkname = curl_easy_unescape(c, this_link->linkname,
                                                 0, NULL);
         strncpy(this_link->linkname, unescaped_linkname, MAX_FILENAME_LEN);
         curl_free(unescaped_linkname);
-        curl_easy_cleanup(c);
     }
     LinkTable_uninitialised_fill(linktbl);
+    curl_easy_cleanup(c);
 }
 
 /**