Do the right thing with sites that use absolute links
On some sites, the link to each subfolder is an absolute link rather than a relative one. To accommodate this, convert the links from absolute to relative before storing them in the link table.
This commit is contained in:
parent
41cb4b80bc
commit
4d323b846f
66
src/link.c
66
src/link.c
|
@ -27,6 +27,7 @@ int ROOT_LINK_OFFSET = 0;
|
||||||
* effectively gives LinkTable generation priority over file transfer.
|
* effectively gives LinkTable generation priority over file transfer.
|
||||||
*/
|
*/
|
||||||
static pthread_mutex_t link_lock;
|
static pthread_mutex_t link_lock;
|
||||||
|
static void make_link_relative(const char *page_url, char *link_url);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief create a new Link
|
* \brief create a new Link
|
||||||
|
@ -382,7 +383,8 @@ static int linknames_equal(char *linkname, const char *linkname_new)
|
||||||
* Shamelessly copied and pasted from:
|
* Shamelessly copied and pasted from:
|
||||||
* https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
|
* https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
|
||||||
*/
|
*/
|
||||||
static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
|
static void HTML_to_LinkTable(const char *url, GumboNode *node,
|
||||||
|
LinkTable *linktbl)
|
||||||
{
|
{
|
||||||
if (node->type != GUMBO_NODE_ELEMENT) {
|
if (node->type != GUMBO_NODE_ELEMENT) {
|
||||||
return;
|
return;
|
||||||
|
@ -391,23 +393,25 @@ static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
|
||||||
if (node->v.element.tag == GUMBO_TAG_A &&
|
if (node->v.element.tag == GUMBO_TAG_A &&
|
||||||
(href =
|
(href =
|
||||||
gumbo_get_attribute(&node->v.element.attributes, "href"))) {
|
gumbo_get_attribute(&node->v.element.attributes, "href"))) {
|
||||||
|
char *link_url = href->value;
|
||||||
|
make_link_relative(url, link_url);
|
||||||
/*
|
/*
|
||||||
* if it is valid, copy the link onto the heap
|
* if it is valid, copy the link onto the heap
|
||||||
*/
|
*/
|
||||||
LinkType type = linkname_to_LinkType(href->value);
|
LinkType type = linkname_to_LinkType(link_url);
|
||||||
/*
|
/*
|
||||||
* We also check if the link being added is the same as the last link.
|
* We also check if the link being added is the same as the last link.
|
||||||
* This is to prevent duplicated link, if an Apache server has the
|
* This is to prevent duplicated link, if an Apache server has the
|
||||||
* IconsAreLinks option.
|
* IconsAreLinks option.
|
||||||
*/
|
*/
|
||||||
size_t comp_len = strnlen(href->value, MAX_FILENAME_LEN);
|
size_t comp_len = strnlen(link_url, MAX_FILENAME_LEN);
|
||||||
if (type == LINK_DIR) {
|
if (type == LINK_DIR) {
|
||||||
comp_len--;
|
comp_len--;
|
||||||
}
|
}
|
||||||
if (((type == LINK_DIR) || (type == LINK_UNINITIALISED_FILE)) &&
|
if (((type == LINK_DIR) || (type == LINK_UNINITIALISED_FILE)) &&
|
||||||
!linknames_equal(linktbl->links[linktbl->num - 1]->linkname,
|
!linknames_equal(linktbl->links[linktbl->num - 1]->linkname,
|
||||||
href->value)) {
|
link_url)) {
|
||||||
LinkTable_add(linktbl, Link_new(href->value, type));
|
LinkTable_add(linktbl, Link_new(link_url, type));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
|
@ -415,7 +419,7 @@ static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
|
||||||
*/
|
*/
|
||||||
GumboVector *children = &node->v.element.children;
|
GumboVector *children = &node->v.element.children;
|
||||||
for (size_t i = 0; i < children->length; ++i) {
|
for (size_t i = 0; i < children->length; ++i) {
|
||||||
HTML_to_LinkTable((GumboNode *) children->data[i], linktbl);
|
HTML_to_LinkTable(url, (GumboNode *) children->data[i], linktbl);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -568,7 +572,7 @@ LinkTable *LinkTable_new(const char *url)
|
||||||
* Otherwise parsed the received data
|
* Otherwise parsed the received data
|
||||||
*/
|
*/
|
||||||
GumboOutput *output = gumbo_parse(ts.data);
|
GumboOutput *output = gumbo_parse(ts.data);
|
||||||
HTML_to_LinkTable(output->root, linktbl);
|
HTML_to_LinkTable(url, output->root, linktbl);
|
||||||
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
||||||
FREE(ts.data);
|
FREE(ts.data);
|
||||||
|
|
||||||
|
@ -1058,3 +1062,51 @@ long path_download(const char *path, char *output_buf, size_t req_size,
|
||||||
|
|
||||||
return Link_download(link, output_buf, req_size, offset);
|
return Link_download(link, output_buf, req_size, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void make_link_relative(const char *page_url, char *link_url)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
Some servers make the links to subdirectories absolute, but our code
|
||||||
|
expects them to be relative, so change the contents of link_url as
|
||||||
|
needed to accommodate that.
|
||||||
|
*/
|
||||||
|
if (link_url[0] != '/') {
|
||||||
|
/* Already relative, nothing to do here! */
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Find the slash after the host name. */
|
||||||
|
int slashes_left_to_find = 3;
|
||||||
|
while (*page_url) {
|
||||||
|
if (*page_url == '/' && ! --slashes_left_to_find)
|
||||||
|
break;
|
||||||
|
/* N.B. This is here, rather than doing `while (*page_url++)`, because
|
||||||
|
when we're done we want the pointer to point at the final slash. */
|
||||||
|
page_url++;
|
||||||
|
}
|
||||||
|
if (slashes_left_to_find)
|
||||||
|
if (! *page_url)
|
||||||
|
/* We're at the top level of the web site and the user entered the URL
|
||||||
|
without a trailing slash. */
|
||||||
|
page_url = "/";
|
||||||
|
else
|
||||||
|
/* Well, that's odd. Let's return rather than trying to dig ourselves
|
||||||
|
deeper into whatever hole we're in. */
|
||||||
|
return;
|
||||||
|
/* The page URL is no longer the full page_url, it's just the part after
|
||||||
|
the host name.
|
||||||
|
/* The link URL should start with the page URL. */
|
||||||
|
if (strstr(link_url, page_url) != link_url)
|
||||||
|
return;
|
||||||
|
int skip_len = strlen(page_url);
|
||||||
|
if (page_url[skip_len-1] != '/') {
|
||||||
|
if (page_url[skip_len] != '/')
|
||||||
|
/* Um, I'm not sure what to do here, so give up. */
|
||||||
|
return;
|
||||||
|
skip_len++;
|
||||||
|
}
|
||||||
|
/* Move the part of the link URL after the parent page's pat to
|
||||||
|
the beginning of the link URL string, discarding what came
|
||||||
|
before it. */
|
||||||
|
memmove(link_url, link_url + skip_len, strlen(link_url) - skip_len + 1);
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue