Do the right thing with sites that use absolute links

On some sites, the link to each subfolder is an absolute link rather
than a relative one. To accommodate this, convert the links from
absolute to relative before storing them in the link table.
This commit is contained in:
Jonathan Kamens 2023-09-03 14:53:12 -04:00 committed by Fufu Fang
parent 41cb4b80bc
commit 4d323b846f
1 changed files with 59 additions and 7 deletions

View File

@ -27,6 +27,7 @@ int ROOT_LINK_OFFSET = 0;
* effectively gives LinkTable generation priority over file transfer.
*/
static pthread_mutex_t link_lock;
static void make_link_relative(const char *page_url, char *link_url);
/**
* \brief create a new Link
@ -382,7 +383,8 @@ static int linknames_equal(char *linkname, const char *linkname_new)
* Shamelessly copied and pasted from:
* https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
*/
static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
static void HTML_to_LinkTable(const char *url, GumboNode *node,
LinkTable *linktbl)
{
if (node->type != GUMBO_NODE_ELEMENT) {
return;
@ -391,23 +393,25 @@ static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
if (node->v.element.tag == GUMBO_TAG_A &&
(href =
gumbo_get_attribute(&node->v.element.attributes, "href"))) {
char *link_url = href->value;
make_link_relative(url, link_url);
/*
* if it is valid, copy the link onto the heap
*/
LinkType type = linkname_to_LinkType(href->value);
LinkType type = linkname_to_LinkType(link_url);
/*
* We also check if the link being added is the same as the last link.
* This is to prevent duplicated link, if an Apache server has the
* IconsAreLinks option.
*/
size_t comp_len = strnlen(href->value, MAX_FILENAME_LEN);
size_t comp_len = strnlen(link_url, MAX_FILENAME_LEN);
if (type == LINK_DIR) {
comp_len--;
}
if (((type == LINK_DIR) || (type == LINK_UNINITIALISED_FILE)) &&
!linknames_equal(linktbl->links[linktbl->num - 1]->linkname,
href->value)) {
LinkTable_add(linktbl, Link_new(href->value, type));
link_url)) {
LinkTable_add(linktbl, Link_new(link_url, type));
}
}
/*
@ -415,7 +419,7 @@ static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
*/
GumboVector *children = &node->v.element.children;
for (size_t i = 0; i < children->length; ++i) {
HTML_to_LinkTable((GumboNode *) children->data[i], linktbl);
HTML_to_LinkTable(url, (GumboNode *) children->data[i], linktbl);
}
return;
}
@ -568,7 +572,7 @@ LinkTable *LinkTable_new(const char *url)
* Otherwise parsed the received data
*/
GumboOutput *output = gumbo_parse(ts.data);
HTML_to_LinkTable(output->root, linktbl);
HTML_to_LinkTable(url, output->root, linktbl);
gumbo_destroy_output(&kGumboDefaultOptions, output);
FREE(ts.data);
@ -1058,3 +1062,51 @@ long path_download(const char *path, char *output_buf, size_t req_size,
return Link_download(link, output_buf, req_size, offset);
}
static void make_link_relative(const char *page_url, char *link_url)
{
/*
Some servers make the links to subdirectories absolute, but our code
expects them to be relative, so change the contents of link_url as
needed to accommodate that.
*/
if (link_url[0] != '/') {
/* Already relative, nothing to do here! */
return;
}
/* Find the slash after the host name. */
int slashes_left_to_find = 3;
while (*page_url) {
if (*page_url == '/' && ! --slashes_left_to_find)
break;
/* N.B. This is here, rather than doing `while (*page_url++)`, because
when we're done we want the pointer to point at the final slash. */
page_url++;
}
if (slashes_left_to_find)
if (! *page_url)
/* We're at the top level of the web site and the user entered the URL
without a trailing slash. */
page_url = "/";
else
/* Well, that's odd. Let's return rather than trying to dig ourselves
deeper into whatever hole we're in. */
return;
/* The page URL is no longer the full page_url, it's just the part after
the host name.
/* The link URL should start with the page URL. */
if (strstr(link_url, page_url) != link_url)
return;
int skip_len = strlen(page_url);
if (page_url[skip_len-1] != '/') {
if (page_url[skip_len] != '/')
/* Um, I'm not sure what to do here, so give up. */
return;
skip_len++;
}
/* Move the part of the link URL after the parent page's pat to
the beginning of the link URL string, discarding what came
before it. */
memmove(link_url, link_url + skip_len, strlen(link_url) - skip_len + 1);
}