2018-07-26 11:29:44 +02:00
|
|
|
#include "link.h"
|
|
|
|
|
2021-08-22 03:26:09 +02:00
|
|
|
#include "log.h"
|
2021-09-04 13:40:37 +02:00
|
|
|
#include "memcache.h"
|
2021-09-03 22:39:31 +02:00
|
|
|
#include "util.h"
|
2018-07-26 11:29:44 +02:00
|
|
|
|
|
|
|
#include <gumbo.h>
|
|
|
|
|
2019-10-27 22:21:30 +01:00
|
|
|
#include <assert.h>
|
2018-07-26 11:29:44 +02:00
|
|
|
#include <ctype.h>
|
|
|
|
#include <errno.h>
|
2019-04-24 00:48:08 +02:00
|
|
|
#include <stdlib.h>
|
2018-07-26 11:29:44 +02:00
|
|
|
#include <string.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
|
2019-09-03 15:47:12 +02:00
|
|
|
/** \brief size of the progress status buffer in LinkTable_uninitialised_fill() */
#define STATUS_LEN 64
|
|
|
|
|
2021-08-30 12:24:32 +02:00
|
|
|
/*
|
|
|
|
* ---------------- External variables -----------------------
|
|
|
|
*/
|
2018-07-30 15:20:04 +02:00
|
|
|
/** \brief the root LinkTable; assigned by LinkSystem_init() */
LinkTable *ROOT_LINK_TBL = NULL;
|
2019-04-22 16:26:25 +02:00
|
|
|
/**
 * \brief length of the root URL, not counting one trailing '/'
 * \details Set by LinkSystem_init(). LinkTable_new() adds it to a URL pointer
 * to obtain the part of the URL that lies below the root.
 */
int ROOT_LINK_OFFSET = 0;
|
2018-07-26 11:29:44 +02:00
|
|
|
|
2019-08-31 22:21:28 +02:00
|
|
|
/**
|
|
|
|
* \brief LinkTable generation priority lock
|
|
|
|
* \details This allows LinkTable generation to be run exclusively. This
|
|
|
|
* effectively gives LinkTable generation priority over file transfer.
|
|
|
|
*/
|
|
|
|
/* initialised by LinkSystem_init() through pthread_mutex_init() */
static pthread_mutex_t link_lock;
|
2023-09-03 20:53:12 +02:00
|
|
|
/*
 * Forward declaration: HTML_to_LinkTable() calls this before the definition
 * appears in the file.
 * NOTE(review): link_url is taken as a non-const pointer, so it is presumably
 * rewritten in place relative to page_url -- confirm against the definition.
 */
static void make_link_relative(const char *page_url, char *link_url);
|
2019-08-31 22:21:28 +02:00
|
|
|
|
2021-08-31 14:52:25 +02:00
|
|
|
/**
|
|
|
|
* \brief create a new Link
|
|
|
|
*/
|
|
|
|
static Link *Link_new(const char *linkname, LinkType type)
|
|
|
|
{
|
|
|
|
Link *link = CALLOC(1, sizeof(Link));
|
|
|
|
|
|
|
|
strncpy(link->linkname, linkname, MAX_FILENAME_LEN);
|
2023-09-03 19:56:02 +02:00
|
|
|
strncpy(link->linkpath, linkname, MAX_FILENAME_LEN);
|
2021-08-31 14:52:25 +02:00
|
|
|
link->type = type;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* remove the '/' from linkname if it exists
|
|
|
|
*/
|
2021-09-04 04:00:25 +02:00
|
|
|
char *c = &(link->linkname[strnlen(link->linkname, MAX_FILENAME_LEN) - 1]);
|
2021-08-31 14:52:25 +02:00
|
|
|
if (*c == '/') {
|
|
|
|
*c = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
return link;
|
|
|
|
}
|
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief create a curl easy handle for a Link
 * \details The handle is pointed at link->f_url and configured from CONFIG
 * (user agent, TLS options, HTTP credentials, proxy settings). A failing
 * curl_easy_setopt() call is logged and the set-up carries on.
 *
 * libcurl reads numeric options from the variadic argument list as long, so
 * every numeric value below is passed with an explicit L suffix.
 * \param[in] link the Link whose f_url is to be requested
 * \return the configured curl easy handle
 */
static CURL *Link_to_curl(Link *link)
{
    lprintf(debug, "%s\n", link->f_url);
    CURL *curl = curl_easy_init();
    if (!curl) {
        lprintf(fatal, "curl_easy_init() failed!\n");
    }
    /*
     * set up some basic curl stuff
     */
    CURLcode ret =
        curl_easy_setopt(curl, CURLOPT_USERAGENT, CONFIG.user_agent);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    ret = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    /*
     * for following directories without the '/'
     */
    ret = curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 2L);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    ret = curl_easy_setopt(curl, CURLOPT_URL, link->f_url);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    ret = curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    ret = curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15L);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    ret = curl_easy_setopt(curl, CURLOPT_SHARE, CURL_SHARE);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    ret =
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_memory_callback);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }

    if (CONFIG.cafile) {
        /*
         * Having been given a certificate file, disable any search directory
         * built into libcurl, so that we exclusively use the explicitly given
         * certificate(s).
         *
         * If we ever add a CAPATH option, we should do the mirror for CAINFO,
         * too: disable both and then enable whichever one(s) were given.
         */
        ret = curl_easy_setopt(curl, CURLOPT_CAPATH, NULL);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }

        ret = curl_easy_setopt(curl, CURLOPT_CAINFO, CONFIG.cafile);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
    }

    if (CONFIG.insecure_tls) {
        ret = curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
    }

    if (CONFIG.log_type & libcurl_debug) {
        ret = curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
    }

    if (CONFIG.http_username) {
        ret = curl_easy_setopt(curl, CURLOPT_USERNAME, CONFIG.http_username);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
    }

    if (CONFIG.http_password) {
        ret = curl_easy_setopt(curl, CURLOPT_PASSWORD, CONFIG.http_password);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
    }

    if (CONFIG.proxy) {
        ret = curl_easy_setopt(curl, CURLOPT_PROXY, CONFIG.proxy);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
    }

    if (CONFIG.proxy_username) {
        ret = curl_easy_setopt(curl, CURLOPT_PROXYUSERNAME,
                               CONFIG.proxy_username);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
    }

    if (CONFIG.proxy_password) {
        ret = curl_easy_setopt(curl, CURLOPT_PROXYPASSWORD,
                               CONFIG.proxy_password);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
    }

    if (CONFIG.proxy_cafile) {
        /* See CONFIG.cafile above */
        ret = curl_easy_setopt(curl, CURLOPT_PROXY_CAPATH, NULL);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }

        ret = curl_easy_setopt(curl, CURLOPT_PROXY_CAINFO,
                               CONFIG.proxy_cafile);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
    }

    return curl;
}
|
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief queue a body-less request that fetches the stats of a Link
 * \details A curl handle is built with Link_to_curl(), switched to a
 * no-body request with file time reporting enabled, tagged with a
 * heap-allocated TransferStruct of type FILESTAT, and handed to
 * transfer_nonblocking().
 * \param[in] this_link the Link to request the stats for
 */
static void Link_req_file_stat(Link *this_link)
{
    lprintf(debug, "%s\n", this_link->f_url);
    CURL *curl = Link_to_curl(this_link);
    /* numeric curl options are read as long, hence the L suffix */
    CURLcode ret = curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    ret = curl_easy_setopt(curl, CURLOPT_FILETIME, 1L);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }

    /*
     * We need to put the variable on the heap, because otherwise the
     * variable gets popped from the stack as the function returns.
     *
     * It gets freed in curl_process_msgs();
     */
    TransferStruct *transfer = CALLOC(1, sizeof(TransferStruct));

    transfer->link = this_link;
    transfer->type = FILESTAT;
    ret = curl_easy_setopt(curl, CURLOPT_PRIVATE, transfer);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }

    transfer_nonblocking(curl);
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* \brief Fill in the uninitialised entries in a link table
|
|
|
|
* \details Try and get the stats for each link in the link table. This will get
|
|
|
|
* repeated until the uninitialised entry count drop to zero.
|
|
|
|
*/
|
2021-09-03 15:56:11 +02:00
|
|
|
static void LinkTable_uninitialised_fill(LinkTable *linktbl)
|
2021-08-31 14:52:25 +02:00
|
|
|
{
|
|
|
|
int u;
|
|
|
|
char s[STATUS_LEN];
|
2021-09-01 12:03:27 +02:00
|
|
|
lprintf(debug, " ... ");
|
2021-08-31 14:52:25 +02:00
|
|
|
do {
|
|
|
|
u = 0;
|
|
|
|
for (int i = 0; i < linktbl->num; i++) {
|
|
|
|
Link *this_link = linktbl->links[i];
|
|
|
|
if (this_link->type == LINK_UNINITIALISED_FILE) {
|
|
|
|
Link_req_file_stat(linktbl->links[i]);
|
|
|
|
u++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Block until the gaps are filled
|
|
|
|
*/
|
|
|
|
int n = curl_multi_perform_once();
|
|
|
|
int i = 0;
|
|
|
|
int j = 0;
|
|
|
|
while ((i = curl_multi_perform_once())) {
|
|
|
|
if (CONFIG.log_type & debug) {
|
|
|
|
if (j) {
|
|
|
|
erase_string(stderr, STATUS_LEN, s);
|
|
|
|
}
|
|
|
|
snprintf(s, STATUS_LEN, "%d / %d", n - i, n);
|
|
|
|
fprintf(stderr, "%s", s);
|
|
|
|
j++;
|
|
|
|
}
|
|
|
|
}
|
2021-09-03 15:56:11 +02:00
|
|
|
} while (u);
|
2021-08-31 14:52:25 +02:00
|
|
|
if (CONFIG.log_type & debug) {
|
|
|
|
erase_string(stderr, STATUS_LEN, s);
|
|
|
|
fprintf(stderr, "... Done!\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-31 11:37:56 +02:00
|
|
|
/**
|
|
|
|
* \brief Create the root linktable for single file mode
|
|
|
|
*/
|
2021-08-31 12:59:28 +02:00
|
|
|
static LinkTable *single_LinkTable_new(const char *url)
|
2021-08-31 11:37:56 +02:00
|
|
|
{
|
2021-08-31 14:52:25 +02:00
|
|
|
char *ptr = strrchr(url, '/') + 1;
|
|
|
|
LinkTable *linktbl = LinkTable_alloc(url);
|
|
|
|
Link *link = Link_new(ptr, LINK_UNINITIALISED_FILE);
|
|
|
|
strncpy(link->f_url, url, MAX_FILENAME_LEN);
|
|
|
|
LinkTable_add(linktbl, link);
|
|
|
|
LinkTable_uninitialised_fill(linktbl);
|
|
|
|
LinkTable_print(linktbl);
|
|
|
|
return linktbl;
|
2021-08-31 11:37:56 +02:00
|
|
|
}
|
|
|
|
|
2023-09-03 19:56:02 +02:00
|
|
|
LinkTable *LinkSystem_init(const char *url)
|
2019-08-31 22:21:28 +02:00
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
if (pthread_mutex_init(&link_lock, NULL)) {
|
|
|
|
lprintf(error, "link_lock initialisation failed!\n");
|
|
|
|
}
|
|
|
|
int url_len = strnlen(url, MAX_PATH_LEN) - 1;
|
|
|
|
/*
|
|
|
|
* --------- Set the length of the root link -----------
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* This is where the '/' should be
|
|
|
|
*/
|
2023-09-03 19:56:02 +02:00
|
|
|
ROOT_LINK_OFFSET = strnlen(url, MAX_PATH_LEN) -
|
|
|
|
((url[url_len] == '/') ? 1 : 0);
|
2021-08-31 12:18:39 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* --------------------- Enable cache system --------------------
|
|
|
|
*/
|
|
|
|
if (CONFIG.cache_enabled) {
|
|
|
|
if (CONFIG.cache_dir) {
|
|
|
|
CacheSystem_init(CONFIG.cache_dir, 0);
|
|
|
|
} else {
|
|
|
|
CacheSystem_init(url, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ----------- Create the root link table --------------
|
|
|
|
*/
|
|
|
|
if (CONFIG.mode == NORMAL) {
|
|
|
|
ROOT_LINK_TBL = LinkTable_new(url);
|
2021-08-31 14:52:25 +02:00
|
|
|
} else if (CONFIG.mode == SINGLE) {
|
|
|
|
ROOT_LINK_TBL = single_LinkTable_new(url);
|
2021-08-31 12:18:39 +02:00
|
|
|
} else if (CONFIG.mode == SONIC) {
|
|
|
|
sonic_config_init(url, CONFIG.sonic_username,
|
|
|
|
CONFIG.sonic_password);
|
|
|
|
if (!CONFIG.sonic_id3) {
|
|
|
|
ROOT_LINK_TBL = sonic_LinkTable_new_index("0");
|
2021-08-31 12:15:00 +02:00
|
|
|
} else {
|
2021-08-31 12:18:39 +02:00
|
|
|
ROOT_LINK_TBL = sonic_LinkTable_new_id3(0, "0");
|
2021-08-31 12:15:00 +02:00
|
|
|
}
|
2021-08-31 12:18:39 +02:00
|
|
|
} else {
|
|
|
|
lprintf(fatal, "Invalid CONFIG.mode\n");
|
|
|
|
}
|
|
|
|
return ROOT_LINK_TBL;
|
2019-08-31 22:21:28 +02:00
|
|
|
}
|
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief append a Link to a LinkTable
 * \details The pointer array is grown by one slot. The table is only updated
 * once the reallocation has succeeded, so a failed realloc() leaves both the
 * old array and the link count untouched.
 * \param[in,out] linktbl the table to extend
 * \param[in] link the Link to store in the new last slot
 */
void LinkTable_add(LinkTable *linktbl, Link *link)
{
    size_t new_num = (size_t) linktbl->num + 1;
    Link **grown = realloc(linktbl->links, new_num * sizeof(Link *));
    if (!grown) {
        lprintf(fatal, "realloc() failure!\n");
        /* only reached if the fatal log call returns */
        return;
    }
    grown[new_num - 1] = link;
    linktbl->links = grown;
    linktbl->num++;
}
|
|
|
|
|
2019-09-03 23:11:23 +02:00
|
|
|
static LinkType linkname_to_LinkType(const char *linkname)
|
2019-04-22 14:32:15 +02:00
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
/*
|
|
|
|
* The link name has to start with alphanumerical character
|
|
|
|
*/
|
|
|
|
if (!isalnum(linkname[0]) && (linkname[0] != '%')) {
|
|
|
|
return LINK_INVALID;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check for stray '/' - Linkname should not have '/'
|
|
|
|
*/
|
|
|
|
char *slash = strchr(linkname, '/');
|
|
|
|
if (slash) {
|
|
|
|
int linkname_len = strnlen(linkname, MAX_FILENAME_LEN) - 1;
|
|
|
|
if (slash - linkname != linkname_len) {
|
|
|
|
return LINK_INVALID;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (linkname[strnlen(linkname, MAX_FILENAME_LEN) - 1] == '/') {
|
|
|
|
return LINK_DIR;
|
|
|
|
}
|
|
|
|
|
|
|
|
return LINK_UNINITIALISED_FILE;
|
2019-04-22 14:32:15 +02:00
|
|
|
}
|
2018-07-26 11:29:44 +02:00
|
|
|
|
2021-08-08 14:59:30 +02:00
|
|
|
/**
|
|
|
|
* \brief check if two link names are equal, after taking the '/' into account.
|
|
|
|
*/
|
|
|
|
static int linknames_equal(char *linkname, const char *linkname_new)
|
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
if (!strncmp(linkname, linkname_new, MAX_FILENAME_LEN)) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* check if the link names differ by a single '/'
|
|
|
|
*/
|
|
|
|
if (!strncmp
|
2021-09-03 15:56:11 +02:00
|
|
|
(linkname, linkname_new, strnlen(linkname, MAX_FILENAME_LEN))) {
|
2021-08-31 12:18:39 +02:00
|
|
|
size_t linkname_new_len = strnlen(linkname_new, MAX_FILENAME_LEN);
|
|
|
|
if ((linkname_new_len - strnlen(linkname, MAX_FILENAME_LEN) == 1)
|
2021-09-03 15:56:11 +02:00
|
|
|
&& (linkname_new[linkname_new_len - 1] == '/')) {
|
2021-08-31 12:18:39 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
2021-08-08 14:59:30 +02:00
|
|
|
}
|
|
|
|
|
2018-07-26 11:29:44 +02:00
|
|
|
/**
|
|
|
|
* Shamelessly copied and pasted from:
|
|
|
|
* https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
|
|
|
|
*/
|
2023-09-03 20:53:12 +02:00
|
|
|
static void HTML_to_LinkTable(const char *url, GumboNode *node,
|
|
|
|
LinkTable *linktbl)
|
2018-07-26 11:29:44 +02:00
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
if (node->type != GUMBO_NODE_ELEMENT) {
|
2021-08-31 12:15:00 +02:00
|
|
|
return;
|
2021-08-31 12:18:39 +02:00
|
|
|
}
|
|
|
|
GumboAttribute *href;
|
|
|
|
if (node->v.element.tag == GUMBO_TAG_A &&
|
2021-09-03 15:56:11 +02:00
|
|
|
(href =
|
|
|
|
gumbo_get_attribute(&node->v.element.attributes, "href"))) {
|
2023-09-29 20:23:44 +02:00
|
|
|
char *link_url = (char *) href->value;
|
2023-09-03 20:53:12 +02:00
|
|
|
make_link_relative(url, link_url);
|
2021-08-31 12:15:00 +02:00
|
|
|
/*
|
2021-08-31 12:18:39 +02:00
|
|
|
* if it is valid, copy the link onto the heap
|
2021-08-31 12:15:00 +02:00
|
|
|
*/
|
2023-09-03 20:53:12 +02:00
|
|
|
LinkType type = linkname_to_LinkType(link_url);
|
2021-08-31 12:15:00 +02:00
|
|
|
/*
|
2021-08-31 12:18:39 +02:00
|
|
|
* We also check if the link being added is the same as the last link.
|
|
|
|
* This is to prevent duplicated link, if an Apache server has the
|
|
|
|
* IconsAreLinks option.
|
2021-08-31 12:15:00 +02:00
|
|
|
*/
|
2023-09-29 20:24:04 +02:00
|
|
|
/* The following four lines of code have no effect so I've commented
|
|
|
|
them out. I'm not removing them entirely because it's possible the
|
|
|
|
original intent was to do a check of some sort here and it's an
|
|
|
|
error that this check wasn't fully implemented, in which case this
|
|
|
|
commented out code and the comment above it should serve as a
|
|
|
|
reminder to whoever originally wrote it that there's something
|
|
|
|
unfinished here that needs to be finished.
|
|
|
|
*/
|
|
|
|
/*
|
2023-09-03 20:53:12 +02:00
|
|
|
size_t comp_len = strnlen(link_url, MAX_FILENAME_LEN);
|
2021-08-31 12:18:39 +02:00
|
|
|
if (type == LINK_DIR) {
|
|
|
|
comp_len--;
|
|
|
|
}
|
2023-09-29 20:24:04 +02:00
|
|
|
*/
|
2021-08-31 12:18:39 +02:00
|
|
|
if (((type == LINK_DIR) || (type == LINK_UNINITIALISED_FILE)) &&
|
2021-09-03 15:56:11 +02:00
|
|
|
!linknames_equal(linktbl->links[linktbl->num - 1]->linkname,
|
2023-09-03 20:53:12 +02:00
|
|
|
link_url)) {
|
|
|
|
LinkTable_add(linktbl, Link_new(link_url, type));
|
2021-08-31 12:18:39 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Note the recursive call, lol.
|
|
|
|
*/
|
|
|
|
GumboVector *children = &node->v.element.children;
|
|
|
|
for (size_t i = 0; i < children->length; ++i) {
|
2023-09-03 20:53:12 +02:00
|
|
|
HTML_to_LinkTable(url, (GumboNode *) children->data[i], linktbl);
|
2021-08-31 12:18:39 +02:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief update a Link from the result of a finished stat transfer
 * \details On HTTP_OK the content length and file time are read from the curl
 * handle; the link becomes LINK_FILE when the length is positive and
 * LINK_INVALID otherwise. On any other response the link becomes LINK_INVALID
 * unless HTTP_temp_failure() reports the response as temporary, in which case
 * the type is left alone.
 * \param[in,out] this_link the Link to update
 * \param[in] curl the curl handle of the finished transfer
 */
void Link_set_file_stat(Link *this_link, CURL *curl)
{
    /*
     * Initialised so that a failing curl_easy_getinfo() leaves a defined
     * value (which is not HTTP_OK) rather than an indeterminate one.
     */
    long http_resp = 0;
    CURLcode ret =
        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    if (http_resp == HTTP_OK) {
        curl_off_t cl = 0;
        ret =
            curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
        ret =
            curl_easy_getinfo(curl, CURLINFO_FILETIME, &(this_link->time));
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
        if (cl <= 0) {
            this_link->type = LINK_INVALID;
        } else {
            this_link->type = LINK_FILE;
            this_link->content_length = cl;
        }
    } else {
        lprintf(warning, "HTTP %ld", http_resp);
        if (HTTP_temp_failure(http_resp)) {
            lprintf(warning, ", retrying later.\n");
        } else {
            this_link->type = LINK_INVALID;
            lprintf(warning, ".\n");
        }
    }
}
|
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief fill in the URL and display name of every link, then fetch the stats
 * \details For each link after the head link, f_url is built from the head
 * link's f_url plus the re-encoded linkpath, and linkname is replaced by its
 * URL-decoded form. LinkTable_uninitialised_fill() is called afterwards.
 *
 * A link whose path cannot be re-encoded (libcurl returned NULL) is logged
 * and marked LINK_INVALID instead of being passed on as a NULL string.
 * \param[in,out] linktbl the link table to fill
 */
static void LinkTable_fill(LinkTable *linktbl)
{
    CURL *c = curl_easy_init();
    if (!c) {
        lprintf(fatal, "curl_easy_init() failed!\n");
    }
    Link *head_link = linktbl->links[0];
    lprintf(debug, "Filling %s\n", head_link->f_url);
    for (int i = 1; i < linktbl->num; i++) {
        Link *this_link = linktbl->links[i];
        /* Some web sites use characters in their href attributes that really
           shouldn't be in their href attributes, most commonly spaces. And
           some web sites _do_ properly encode their href attributes. So we
           first unescape the link path, and then we escape it, so that curl
           will definitely be happy with it (e.g., curl won't accept URLs with
           spaces in them!). If we only escaped it, and there were already
           encoded characters in it, then that would break the link. */
        char *unescaped_path = curl_easy_unescape(c, this_link->linkpath, 0,
                                                  NULL);
        char *escaped_path = NULL;
        if (unescaped_path) {
            escaped_path = curl_easy_escape(c, unescaped_path, 0);
        }
        curl_free(unescaped_path);
        if (!escaped_path) {
            lprintf(error, "cannot encode the link path: %s\n",
                    this_link->linkpath);
            this_link->type = LINK_INVALID;
            continue;
        }
        /* Our code does the wrong thing if there's a trailing slash that's been
           replaced with %2F, which curl_easy_escape does, so if it did that
           then let's put it back. */
        size_t escaped_len = strlen(escaped_path);
        if (escaped_len >= 3
            && !strcmp(escaped_path + escaped_len - 3, "%2F")) {
            strcpy(escaped_path + escaped_len - 3, "/");
        }
        char *url = path_append(head_link->f_url, escaped_path);
        curl_free(escaped_path);
        strncpy(this_link->f_url, url, MAX_PATH_LEN);
        FREE(url);

        char *unescaped_linkname =
            curl_easy_unescape(c, this_link->linkname, 0, NULL);
        if (unescaped_linkname) {
            strncpy(this_link->linkname, unescaped_linkname,
                    MAX_FILENAME_LEN);
            curl_free(unescaped_linkname);
        } else {
            /* keep the still-encoded name rather than copying from NULL */
            lprintf(error, "cannot decode the link name: %s\n",
                    this_link->linkname);
        }
    }
    LinkTable_uninitialised_fill(linktbl);
    curl_easy_cleanup(c);
}
|
|
|
|
|
2019-04-26 16:35:48 +02:00
|
|
|
/**
|
2019-09-04 20:53:11 +02:00
|
|
|
* \brief Reset invalid links in the link table
|
2019-04-26 16:35:48 +02:00
|
|
|
*/
|
2021-09-03 15:56:11 +02:00
|
|
|
static void LinkTable_invalid_reset(LinkTable *linktbl)
|
2019-04-26 16:35:48 +02:00
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
int j = 0;
|
|
|
|
for (int i = 0; i < linktbl->num; i++) {
|
|
|
|
Link *this_link = linktbl->links[i];
|
|
|
|
if (this_link->type == LINK_INVALID) {
|
|
|
|
this_link->type = LINK_UNINITIALISED_FILE;
|
|
|
|
j++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
lprintf(debug, "%d invalid links\n", j);
|
2019-04-26 16:35:48 +02:00
|
|
|
}
|
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief free a LinkTable together with everything that hangs off it
 * \details For each link, the table in its next_table field is freed
 * recursively before the link itself. A NULL table is ignored.
 * \param[in] linktbl the link table to free, may be NULL
 */
void LinkTable_free(LinkTable *linktbl)
{
    if (!linktbl) {
        return;
    }

    int idx = 0;
    while (idx < linktbl->num) {
        LinkTable *child = linktbl->links[idx]->next_table;
        LinkTable_free(child);
        FREE(linktbl->links[idx]);
        idx++;
    }
    FREE(linktbl->links);
    FREE(linktbl);
}
|
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief write a LinkTable to the log at the info level
 * \details Nothing is printed unless the info bit is set in CONFIG.log_type.
 * Each entry is listed with its index, type, content length, name and URL,
 * followed by the number of entries that are not a file, a directory or the
 * head link.
 * \param[in] linktbl the link table to print
 */
void LinkTable_print(LinkTable *linktbl)
{
    if (CONFIG.log_type & info) {
        int j = 0;
        lprintf(info, "--------------------------------------------\n");
        /* %p requires a void pointer */
        lprintf(info, " LinkTable %p for %s\n", (void *) linktbl,
                linktbl->links[0]->f_url);
        lprintf(info, "--------------------------------------------\n");
        for (int i = 0; i < linktbl->num; i++) {
            Link *this_link = linktbl->links[i];
            /* content_length is stored on disk as a size_t, hence %zu */
            lprintf(info, "%d %c %zu %s %s\n",
                    i,
                    this_link->type,
                    this_link->content_length,
                    this_link->linkname, this_link->f_url);
            if ((this_link->type != LINK_FILE)
                && (this_link->type != LINK_DIR)
                && (this_link->type != LINK_HEAD)) {
                j++;
            }
        }
        lprintf(info, "--------------------------------------------\n");
        lprintf(info, " Invalid link count: %d\n", j);
        lprintf(info, "--------------------------------------------\n");
    }
}
|
|
|
|
|
2019-10-22 01:42:46 +02:00
|
|
|
LinkTable *LinkTable_alloc(const char *url)
|
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
LinkTable *linktbl = CALLOC(1, sizeof(LinkTable));
|
2023-03-31 14:26:15 +02:00
|
|
|
linktbl->num = 0;
|
|
|
|
linktbl->index_time = 0;
|
|
|
|
linktbl->links = NULL;
|
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* populate the base URL
|
|
|
|
*/
|
|
|
|
Link *head_link = Link_new("/", LINK_HEAD);
|
|
|
|
LinkTable_add(linktbl, head_link);
|
|
|
|
strncpy(head_link->f_url, url, MAX_PATH_LEN);
|
|
|
|
assert(linktbl->num == 1);
|
|
|
|
return linktbl;
|
2019-10-22 01:42:46 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
LinkTable *LinkTable_new(const char *url)
|
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
LinkTable *linktbl = LinkTable_alloc(url);
|
2023-03-31 14:26:15 +02:00
|
|
|
linktbl->index_time = time(NULL);
|
2021-08-31 12:18:39 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* start downloading the base URL
|
|
|
|
*/
|
2021-09-03 13:40:35 +02:00
|
|
|
TransferStruct ts = Link_download_full(linktbl->links[0]);
|
2021-09-03 17:58:08 +02:00
|
|
|
if (ts.curr_size == 0) {
|
2021-08-31 12:18:39 +02:00
|
|
|
LinkTable_free(linktbl);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise parsed the received data
|
|
|
|
*/
|
2021-09-03 13:40:35 +02:00
|
|
|
GumboOutput *output = gumbo_parse(ts.data);
|
2023-09-03 20:53:12 +02:00
|
|
|
HTML_to_LinkTable(url, output->root, linktbl);
|
2021-08-31 12:18:39 +02:00
|
|
|
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
2021-09-03 13:40:35 +02:00
|
|
|
FREE(ts.data);
|
2021-08-31 12:18:39 +02:00
|
|
|
|
|
|
|
int skip_fill = 0;
|
|
|
|
char *unescaped_path;
|
|
|
|
CURL *c = curl_easy_init();
|
|
|
|
unescaped_path =
|
|
|
|
curl_easy_unescape(c, url + ROOT_LINK_OFFSET, 0, NULL);
|
|
|
|
if (CACHE_SYSTEM_INIT) {
|
|
|
|
CacheDir_create(unescaped_path);
|
|
|
|
LinkTable *disk_linktbl;
|
|
|
|
disk_linktbl = LinkTable_disk_open(unescaped_path);
|
|
|
|
if (disk_linktbl) {
|
|
|
|
/*
|
|
|
|
* Check if we need to update the link table
|
|
|
|
*/
|
|
|
|
lprintf(debug,
|
|
|
|
"disk_linktbl->num: %d, linktbl->num: %d\n",
|
|
|
|
disk_linktbl->num, linktbl->num);
|
|
|
|
if (disk_linktbl->num == linktbl->num) {
|
2021-08-31 12:15:00 +02:00
|
|
|
LinkTable_free(linktbl);
|
2021-08-31 12:18:39 +02:00
|
|
|
linktbl = disk_linktbl;
|
|
|
|
skip_fill = 1;
|
|
|
|
} else {
|
|
|
|
LinkTable_free(disk_linktbl);
|
|
|
|
}
|
2021-08-31 12:15:00 +02:00
|
|
|
}
|
2021-08-31 12:18:39 +02:00
|
|
|
}
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
if (!skip_fill) {
|
2021-08-31 12:15:00 +02:00
|
|
|
/*
|
2021-08-31 12:18:39 +02:00
|
|
|
* Fill in the link table
|
2021-08-31 12:15:00 +02:00
|
|
|
*/
|
2021-08-31 12:18:39 +02:00
|
|
|
LinkTable_fill(linktbl);
|
|
|
|
} else {
|
2021-08-31 12:15:00 +02:00
|
|
|
/*
|
2021-08-31 12:18:39 +02:00
|
|
|
* Fill in the holes in the link table
|
2021-08-31 12:15:00 +02:00
|
|
|
*/
|
2021-08-31 12:18:39 +02:00
|
|
|
LinkTable_invalid_reset(linktbl);
|
|
|
|
LinkTable_uninitialised_fill(linktbl);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Save the link table
|
|
|
|
*/
|
|
|
|
if (CACHE_SYSTEM_INIT) {
|
|
|
|
if (LinkTable_disk_save(linktbl, unescaped_path)) {
|
2021-08-31 12:59:28 +02:00
|
|
|
lprintf(error, "Failed to save the LinkTable!\n");
|
2021-08-31 12:15:00 +02:00
|
|
|
}
|
2021-08-31 12:18:39 +02:00
|
|
|
}
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
curl_free(unescaped_path);
|
|
|
|
curl_easy_cleanup(c);
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
LinkTable_print(linktbl);
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
return linktbl;
|
2018-07-26 11:29:44 +02:00
|
|
|
}
|
|
|
|
|
2019-04-26 08:39:45 +02:00
|
|
|
static void LinkTable_disk_delete(const char *dirn)
|
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
char *metadirn = path_append(META_DIR, dirn);
|
|
|
|
char *path;
|
|
|
|
path = path_append(metadirn, "/.LinkTable");
|
|
|
|
if (unlink(path)) {
|
|
|
|
lprintf(error, "unlink(%s): %s\n", path, strerror(errno));
|
|
|
|
}
|
|
|
|
FREE(path);
|
|
|
|
FREE(metadirn);
|
2019-04-26 08:39:45 +02:00
|
|
|
}
|
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief save a LinkTable to the ".LinkTable" file of a directory
 * \details The file starts with the number of links, followed for each link
 * by its linkname, f_url, type, content_length and time fields.
 * \param[in] linktbl the link table to save
 * \param[in] dirn the directory name, relative to META_DIR
 * \return 0 on success, -1 if the file cannot be opened, written or closed
 */
int LinkTable_disk_save(LinkTable *linktbl, const char *dirn)
{
    char *metadirn = path_append(META_DIR, dirn);
    char *path;
    path = path_append(metadirn, "/.LinkTable");
    FILE *fp = fopen(path, "w");
    /* capture errno before any other call has a chance to change it */
    int fopen_errno = errno;
    FREE(metadirn);

    if (!fp) {
        lprintf(error, "fopen(%s): %s\n", path, strerror(fopen_errno));
        FREE(path);
        return -1;
    }
    FREE(path);

    fwrite(&linktbl->num, sizeof(int), 1, fp);
    for (int i = 0; i < linktbl->num; i++) {
        fwrite(linktbl->links[i]->linkname, sizeof(char),
               MAX_FILENAME_LEN, fp);
        fwrite(linktbl->links[i]->f_url, sizeof(char), MAX_PATH_LEN, fp);
        fwrite(&linktbl->links[i]->type, sizeof(LinkType), 1, fp);
        fwrite(&linktbl->links[i]->content_length, sizeof(size_t), 1, fp);
        fwrite(&linktbl->links[i]->time, sizeof(long), 1, fp);
    }

    int res = 0;

    /* a failed fwrite() sets the stream's error indicator, tested here */
    if (ferror(fp)) {
        lprintf(error, "encountered ferror!\n");
        res = -1;
    }

    if (fclose(fp)) {
        lprintf(error,
                "cannot close the file pointer, %s\n", strerror(errno));
        res = -1;
    }

    return res;
}
|
|
|
|
|
2023-09-29 20:23:44 +02:00
|
|
|
/* This is necessary to get the compiler on some platforms to stop
|
|
|
|
complaining about the fact that we're not using the return value of
|
|
|
|
fread, when we know we aren't and that's fine. */
|
|
|
|
/** \brief accept a value and deliberately do nothing with it */
static inline void ignore_value(int i)
{
    (void) i;
}
|
|
|
|
|
2019-04-26 08:39:45 +02:00
|
|
|
LinkTable *LinkTable_disk_open(const char *dirn)
|
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
char *metadirn = path_append(META_DIR, dirn);
|
|
|
|
char *path;
|
|
|
|
if (metadirn[strnlen(metadirn, MAX_PATH_LEN)] == '/') {
|
|
|
|
path = path_append(metadirn, ".LinkTable");
|
|
|
|
} else {
|
|
|
|
path = path_append(metadirn, "/.LinkTable");
|
|
|
|
}
|
|
|
|
FILE *fp = fopen(path, "r");
|
|
|
|
FREE(metadirn);
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
if (!fp) {
|
|
|
|
FREE(path);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
LinkTable *linktbl = CALLOC(1, sizeof(LinkTable));
|
|
|
|
|
|
|
|
fread(&linktbl->num, sizeof(int), 1, fp);
|
|
|
|
linktbl->links = CALLOC(linktbl->num, sizeof(Link *));
|
|
|
|
for (int i = 0; i < linktbl->num; i++) {
|
|
|
|
linktbl->links[i] = CALLOC(1, sizeof(Link));
|
2023-09-29 20:23:44 +02:00
|
|
|
/* The return values are safe to ignore here since we check them
|
|
|
|
immediately afterwards with feof() and ferror(). */
|
|
|
|
ignore_value(fread(linktbl->links[i]->linkname, sizeof(char),
|
|
|
|
MAX_FILENAME_LEN, fp));
|
|
|
|
ignore_value(fread(linktbl->links[i]->f_url, sizeof(char),
|
|
|
|
MAX_PATH_LEN, fp));
|
|
|
|
ignore_value(fread(&linktbl->links[i]->type, sizeof(LinkType), 1, fp));
|
|
|
|
ignore_value(fread(&linktbl->links[i]->content_length,
|
|
|
|
sizeof(size_t), 1, fp));
|
|
|
|
ignore_value(fread(&linktbl->links[i]->time, sizeof(long), 1, fp));
|
2021-08-31 12:18:39 +02:00
|
|
|
if (feof(fp)) {
|
|
|
|
/*
|
|
|
|
* reached EOF
|
|
|
|
*/
|
|
|
|
lprintf(error, "reached EOF!\n");
|
|
|
|
LinkTable_free(linktbl);
|
|
|
|
LinkTable_disk_delete(dirn);
|
|
|
|
return NULL;
|
2021-08-31 12:15:00 +02:00
|
|
|
}
|
2021-08-31 12:18:39 +02:00
|
|
|
if (ferror(fp)) {
|
|
|
|
lprintf(error, "encountered ferror!\n");
|
|
|
|
LinkTable_free(linktbl);
|
|
|
|
LinkTable_disk_delete(dirn);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (fclose(fp)) {
|
|
|
|
lprintf(error,
|
|
|
|
"cannot close the file pointer, %s\n", strerror(errno));
|
|
|
|
}
|
|
|
|
return linktbl;
|
2019-04-26 08:39:45 +02:00
|
|
|
}
|
|
|
|
|
2019-04-22 13:06:34 +02:00
|
|
|
LinkTable *path_to_Link_LinkTable_new(const char *path)
|
|
|
|
{
|
2023-07-26 01:45:41 +02:00
|
|
|
Link *link = NULL;
|
|
|
|
Link *tmp_link = NULL;
|
|
|
|
Link link_cpy = { 0 };
|
2023-03-31 14:26:15 +02:00
|
|
|
LinkTable *next_table = NULL;
|
|
|
|
if (!strcmp(path, "/")) {
|
|
|
|
next_table = ROOT_LINK_TBL;
|
2023-07-26 01:45:41 +02:00
|
|
|
link_cpy = *next_table->links[0];
|
|
|
|
tmp_link = &link_cpy;
|
2023-03-31 14:26:15 +02:00
|
|
|
} else {
|
|
|
|
link = path_to_Link(path);
|
|
|
|
tmp_link = link;
|
|
|
|
}
|
|
|
|
|
2023-07-26 01:46:24 +02:00
|
|
|
if (next_table) {
|
2023-03-31 14:26:15 +02:00
|
|
|
time_t time_now = time(NULL);
|
2023-07-26 01:46:24 +02:00
|
|
|
if (time_now - next_table->index_time > CONFIG.refresh_timeout) {
|
2023-07-26 01:45:41 +02:00
|
|
|
/* refresh directory contents */
|
2023-03-31 14:26:15 +02:00
|
|
|
LinkTable_free(next_table);
|
|
|
|
next_table = NULL;
|
2023-07-26 01:45:41 +02:00
|
|
|
if (link) {
|
2023-03-31 14:26:15 +02:00
|
|
|
link->next_table = NULL;
|
2023-07-26 01:45:41 +02:00
|
|
|
}
|
2023-03-31 14:26:15 +02:00
|
|
|
}
|
|
|
|
}
|
2021-08-31 12:18:39 +02:00
|
|
|
if (!next_table) {
|
|
|
|
if (CONFIG.mode == NORMAL) {
|
2023-03-31 14:26:15 +02:00
|
|
|
next_table = LinkTable_new(tmp_link->f_url);
|
2021-08-31 12:18:39 +02:00
|
|
|
} else if (CONFIG.mode == SONIC) {
|
|
|
|
if (!CONFIG.sonic_id3) {
|
2023-03-31 14:26:15 +02:00
|
|
|
next_table = sonic_LinkTable_new_index(tmp_link->sonic.id);
|
2021-08-31 12:18:39 +02:00
|
|
|
} else {
|
|
|
|
next_table =
|
2023-03-31 14:26:15 +02:00
|
|
|
sonic_LinkTable_new_id3(tmp_link->sonic.depth,
|
|
|
|
tmp_link->sonic.id);
|
2021-08-31 12:18:39 +02:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
lprintf(fatal, "Invalid CONFIG.mode\n");
|
2021-08-31 12:15:00 +02:00
|
|
|
}
|
2021-08-31 12:18:39 +02:00
|
|
|
}
|
2023-07-26 01:45:41 +02:00
|
|
|
if (link) {
|
2023-03-31 14:26:15 +02:00
|
|
|
link->next_table = next_table;
|
2023-07-26 01:45:41 +02:00
|
|
|
} else {
|
2023-03-31 14:26:15 +02:00
|
|
|
ROOT_LINK_TBL = next_table;
|
2023-07-26 01:45:41 +02:00
|
|
|
}
|
2021-08-31 12:18:39 +02:00
|
|
|
return next_table;
|
2019-04-22 13:06:34 +02:00
|
|
|
}
|
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief return the LinkTable a Link leads to, generating it if necessary
 * \details If the Link has no next_table yet, one is generated according to
 * CONFIG.mode (and CONFIG.sonic_id3 in SONIC mode). The result, which may be
 * NULL if no table was produced, is stored back into dir->next_table.
 * \param[in,out] dir the Link whose next table is wanted
 * \return the Link's next table, or NULL
 */
static LinkTable *path_to_Link_child_table(Link *dir)
{
    LinkTable *table = dir->next_table;

    if (!table) {
        if (CONFIG.mode == NORMAL) {
            table = LinkTable_new(dir->f_url);
        } else if (CONFIG.mode == SONIC) {
            if (!CONFIG.sonic_id3) {
                table = sonic_LinkTable_new_index(dir->sonic.id);
            } else {
                table = sonic_LinkTable_new_id3(dir->sonic.depth,
                                                dir->sonic.id);
            }
        } else {
            lprintf(fatal, "Invalid CONFIG.mode\n");
        }
    }

    dir->next_table = table;
    return table;
}

/**
 * \brief walk a path through the LinkTable tree, one component at a time
 * \details The path is consumed destructively: a trailing '/' and the '/'
 * after the first component are overwritten with '\0'. Callers must
 * therefore pass a writable, private copy of the path.
 * \param[in,out] path the (remaining) path to resolve
 * \param[in] linktbl the table in which to look up the first component
 * \return the Link the path refers to, or NULL if it cannot be resolved
 */
static Link *path_to_Link_recursive(char *path, LinkTable *linktbl)
{
    /*
     * There is nothing to search if the table could not be generated
     */
    if (!linktbl) {
        return NULL;
    }

    /*
     * skip the leading '/' if it exists
     */
    if (*path == '/') {
        path++;
    }

    /*
     * An empty component cannot name a link. Bail out here rather than
     * indexing path[-1] below.
     */
    size_t len = strnlen(path, MAX_PATH_LEN);
    if (len == 0) {
        return NULL;
    }

    /*
     * remove the last '/' if it exists
     */
    if (path[len - 1] == '/') {
        path[len - 1] = '\0';
    }

    /*
     * If there is another '/', terminate the current component there and
     * remember where the rest of the path starts. Otherwise we have
     * reached the last level.
     */
    char *next_path = NULL;
    char *slash = strchr(path, '/');
    if (slash) {
        *slash = '\0';
        next_path = slash + 1;
    }

    /*
     * Entry 0 is skipped, the search starts at entry 1.
     */
    for (int i = 1; i < linktbl->num; i++) {
        Link *candidate = linktbl->links[i];
        if (strncmp(path, candidate->linkname, MAX_FILENAME_LEN)) {
            continue;
        }
        if (!next_path) {
            /*
             * We found our link
             */
            return candidate;
        }
        /*
         * The next sub-directory exists, descend into it
         */
        return path_to_Link_recursive(next_path,
                                      path_to_Link_child_table(candidate));
    }

    return NULL;
}
|
|
|
|
|
2019-04-22 14:32:15 +02:00
|
|
|
Link *path_to_Link(const char *path)
|
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
lprintf(link_lock_debug,
|
|
|
|
"thread %x: locking link_lock;\n", pthread_self());
|
|
|
|
|
|
|
|
PTHREAD_MUTEX_LOCK(&link_lock);
|
|
|
|
char *new_path = strndup(path, MAX_PATH_LEN);
|
|
|
|
if (!new_path) {
|
|
|
|
lprintf(fatal, "cannot allocate memory\n");
|
|
|
|
}
|
|
|
|
Link *link = path_to_Link_recursive(new_path, ROOT_LINK_TBL);
|
|
|
|
FREE(new_path);
|
|
|
|
|
|
|
|
lprintf(link_lock_debug,
|
|
|
|
"thread %x: unlocking link_lock;\n", pthread_self());
|
|
|
|
PTHREAD_MUTEX_UNLOCK(&link_lock);
|
|
|
|
return link;
|
2019-04-22 14:32:15 +02:00
|
|
|
}
|
|
|
|
|
2021-09-03 15:56:11 +02:00
|
|
|
/**
 * \brief download the whole content a Link points to
 * \details The transfer is repeated for as long as the server answers with
 * a temporary HTTP failure, sleeping CONFIG.http_wait_sec seconds between
 * attempts. On success the Link's time field is updated from the transfer.
 * \param[in,out] link the Link to download
 * \return the TransferStruct used for the transfer. On a non-temporary
 * HTTP error, curr_size is 0 and data is NULL. Otherwise the caller owns
 * the data buffer.
 */
TransferStruct Link_download_full(Link *link)
{
    char *url = link->f_url;
    CURL *curl = Link_to_curl(link);

    TransferStruct ts;
    ts.curr_size = 0;
    ts.data = NULL;
    ts.type = DATA;
    ts.transferring = 1;

    CURLcode ret = curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *) &ts);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    ret = curl_easy_setopt(curl, CURLOPT_PRIVATE, (void *) &ts);
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }

    /*
     * If we get temporary HTTP failure, wait for CONFIG.http_wait_sec
     * seconds before retry
     */
    long http_resp = 0;
    do {
        transfer_blocking(curl);
        ret = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
        if (ret) {
            lprintf(error, "%s", curl_easy_strerror(ret));
        }
        if (HTTP_temp_failure(http_resp)) {
            lprintf(warning,
                    "URL: %s, HTTP %ld, retrying later.\n",
                    url, http_resp);
            /*
             * Drop whatever the failed attempt delivered, so that the
             * retry starts from an empty buffer. (Presumably the write
             * callback appends to ts.data; resetting is harmless either
             * way.) free() rather than FREE(), as data can be NULL here.
             */
            free(ts.data);
            ts.data = NULL;
            ts.curr_size = 0;
            sleep(CONFIG.http_wait_sec);
        } else if (http_resp != HTTP_OK) {
            lprintf(warning,
                    "cannot retrieve URL: %s, HTTP %ld\n", url, http_resp);
            ts.curr_size = 0;
            free(ts.data);  /* not FREE(); can be NULL on error path! */
            /*
             * ts is returned by value; do not hand a dangling pointer
             * to the caller
             */
            ts.data = NULL;
            curl_easy_cleanup(curl);
            return ts;
        }
    } while (HTTP_temp_failure(http_resp));

    ret = curl_easy_getinfo(curl, CURLINFO_FILETIME, &(link->time));
    if (ret) {
        lprintf(error, "%s", curl_easy_strerror(ret));
    }
    curl_easy_cleanup(curl);
    return ts;
}
|
|
|
|
|
2021-09-03 16:41:22 +02:00
|
|
|
static CURL *Link_download_curl_setup(Link *link, size_t req_size, off_t offset,
|
2021-09-03 15:56:11 +02:00
|
|
|
TransferStruct *header,
|
|
|
|
TransferStruct *ts)
|
2018-07-26 11:29:44 +02:00
|
|
|
{
|
2021-09-02 17:52:39 +02:00
|
|
|
if (!link) {
|
|
|
|
lprintf(fatal, "Invalid supplied\n");
|
|
|
|
}
|
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
size_t start = offset;
|
2021-09-01 04:53:19 +02:00
|
|
|
size_t end = start + req_size;
|
2021-09-03 15:56:11 +02:00
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
char range_str[64];
|
|
|
|
snprintf(range_str, sizeof(range_str), "%lu-%lu", start, end);
|
2021-09-02 16:36:53 +02:00
|
|
|
lprintf(debug, "%s: %s\n", link->linkname, range_str);
|
2021-08-31 12:18:39 +02:00
|
|
|
|
2021-09-03 13:40:35 +02:00
|
|
|
CURL *curl = Link_to_curl(link);
|
2021-09-03 13:57:52 +02:00
|
|
|
CURLcode ret =
|
2021-09-03 15:56:11 +02:00
|
|
|
curl_easy_setopt(curl, CURLOPT_HEADERDATA, (void *) header);
|
2021-09-03 13:57:52 +02:00
|
|
|
if (ret) {
|
|
|
|
lprintf(error, "%s", curl_easy_strerror(ret));
|
|
|
|
}
|
2021-09-03 15:56:11 +02:00
|
|
|
ret = curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *) ts);
|
2021-09-03 13:57:52 +02:00
|
|
|
if (ret) {
|
|
|
|
lprintf(error, "%s", curl_easy_strerror(ret));
|
|
|
|
}
|
2021-09-03 15:56:11 +02:00
|
|
|
ret = curl_easy_setopt(curl, CURLOPT_PRIVATE, (void *) ts);
|
2021-09-03 13:57:52 +02:00
|
|
|
if (ret) {
|
|
|
|
lprintf(error, "%s", curl_easy_strerror(ret));
|
|
|
|
}
|
|
|
|
ret = curl_easy_setopt(curl, CURLOPT_RANGE, range_str);
|
|
|
|
if (ret) {
|
|
|
|
lprintf(error, "%s", curl_easy_strerror(ret));
|
|
|
|
}
|
2021-09-03 15:56:11 +02:00
|
|
|
|
|
|
|
return curl;
|
|
|
|
}
|
|
|
|
|
2021-09-03 17:29:00 +02:00
|
|
|
static curl_off_t Link_download_cleanup(CURL *curl, TransferStruct *header)
|
2021-09-03 15:56:11 +02:00
|
|
|
{
|
2021-08-31 12:18:39 +02:00
|
|
|
/*
|
|
|
|
* Check for range seek support
|
|
|
|
*/
|
|
|
|
if (!CONFIG.no_range_check) {
|
2021-09-03 17:29:00 +02:00
|
|
|
if (!strcasestr((header->data), "Accept-Ranges: bytes")) {
|
2021-08-31 12:18:39 +02:00
|
|
|
fprintf(stderr, "This web server does not support HTTP \
|
2019-10-24 01:44:18 +02:00
|
|
|
range requests\n");
|
2021-08-31 12:18:39 +02:00
|
|
|
exit(EXIT_FAILURE);
|
2021-08-31 12:15:00 +02:00
|
|
|
}
|
2021-08-31 12:18:39 +02:00
|
|
|
}
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-09-03 17:29:00 +02:00
|
|
|
FREE(header->data);
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
long http_resp;
|
2021-09-03 15:56:11 +02:00
|
|
|
CURLcode ret = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
|
2021-09-03 13:47:48 +02:00
|
|
|
if (ret) {
|
|
|
|
lprintf(error, "%s", curl_easy_strerror(ret));
|
|
|
|
}
|
2021-08-31 12:18:39 +02:00
|
|
|
if (!((http_resp != HTTP_OK) ||
|
2021-09-03 15:56:11 +02:00
|
|
|
(http_resp != HTTP_PARTIAL_CONTENT) ||
|
|
|
|
(http_resp != HTTP_RANGE_NOT_SATISFIABLE))) {
|
2021-09-03 17:29:00 +02:00
|
|
|
char *url;
|
|
|
|
curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &url);
|
|
|
|
lprintf(warning, "Could not download %s, HTTP %ld\n", url, http_resp);
|
2021-08-31 12:18:39 +02:00
|
|
|
return -ENOENT;
|
|
|
|
}
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-09-01 04:53:19 +02:00
|
|
|
curl_off_t recv;
|
2021-09-03 13:47:48 +02:00
|
|
|
ret = curl_easy_getinfo(curl, CURLINFO_SIZE_DOWNLOAD_T, &recv);
|
|
|
|
if (ret) {
|
|
|
|
lprintf(error, "%s", curl_easy_strerror(ret));
|
|
|
|
}
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-09-03 17:29:00 +02:00
|
|
|
curl_easy_cleanup(curl);
|
|
|
|
|
|
|
|
return recv;
|
|
|
|
}
|
|
|
|
|
2021-09-03 17:58:08 +02:00
|
|
|
long Link_download(Link *link, char *output_buf, size_t req_size, off_t offset)
|
2021-09-03 17:29:00 +02:00
|
|
|
{
|
|
|
|
TransferStruct ts;
|
2021-09-03 17:58:08 +02:00
|
|
|
ts.curr_size = 0;
|
2021-09-03 17:29:00 +02:00
|
|
|
ts.data = NULL;
|
|
|
|
ts.type = DATA;
|
|
|
|
ts.transferring = 1;
|
|
|
|
|
|
|
|
TransferStruct header;
|
2021-09-03 17:58:08 +02:00
|
|
|
header.curr_size = 0;
|
2021-09-03 17:29:00 +02:00
|
|
|
header.data = NULL;
|
|
|
|
|
|
|
|
CURL *curl = Link_download_curl_setup(link, req_size, offset, &header, &ts);
|
|
|
|
|
|
|
|
transfer_blocking(curl);
|
|
|
|
|
|
|
|
curl_off_t recv = Link_download_cleanup(curl, &header);
|
|
|
|
|
2021-09-01 04:53:19 +02:00
|
|
|
/* The extra 1 byte is probably for '\0' */
|
|
|
|
if (recv - 1 == (long int) req_size) {
|
|
|
|
recv--;
|
2021-09-01 12:03:27 +02:00
|
|
|
} else if (offset + req_size < link->content_length) {
|
2021-09-01 04:53:19 +02:00
|
|
|
lprintf(error, "req_size: %lu, recv: %ld\n", req_size, recv);
|
2021-08-31 12:18:39 +02:00
|
|
|
}
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-09-03 13:40:35 +02:00
|
|
|
memmove(output_buf, ts.data, recv);
|
|
|
|
FREE(ts.data);
|
2021-08-31 12:15:00 +02:00
|
|
|
|
2021-08-31 12:18:39 +02:00
|
|
|
return recv;
|
2018-07-26 11:29:44 +02:00
|
|
|
}
|
2021-09-02 16:36:53 +02:00
|
|
|
|
2021-09-03 17:58:08 +02:00
|
|
|
long path_download(const char *path, char *output_buf, size_t req_size,
|
|
|
|
off_t offset)
|
2021-09-02 16:36:53 +02:00
|
|
|
{
|
|
|
|
if (!path) {
|
|
|
|
lprintf(fatal, "NULL path supplied\n");
|
|
|
|
}
|
2021-09-02 17:52:39 +02:00
|
|
|
|
2021-09-02 16:36:53 +02:00
|
|
|
Link *link;
|
|
|
|
link = path_to_Link(path);
|
|
|
|
if (!link) {
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Link_download(link, output_buf, req_size, offset);
|
|
|
|
}
|
2023-09-03 20:53:12 +02:00
|
|
|
|
|
|
|
/**
 * \brief turn a host-absolute link into one relative to the page it is on
 * \details Some servers make the links to subdirectories absolute, but our
 * code expects them to be relative, so change the contents of link_url as
 * needed to accommodate that. link_url is left untouched if it is already
 * relative, or if it does not lie below the path of page_url.
 * \param[in] page_url the full URL of the page the link was found on
 * \param[in,out] link_url the link, rewritten in place
 */
static void make_link_relative(const char *page_url, char *link_url)
{
    if (link_url[0] != '/') {
        /* Already relative, nothing to do here! */
        return;
    }

    /* Find the slash after the host name. */
    int slashes_left_to_find = 3;
    while (*page_url) {
        if (*page_url == '/' && !--slashes_left_to_find) {
            break;
        }
        /* N.B. This is here, rather than doing `while (*page_url++)`,
           because when we're done we want the pointer to point at the
           final slash. */
        page_url++;
    }
    if (slashes_left_to_find) {
        if (*page_url) {
            /* Well, that's odd. Let's return rather than trying to dig
               ourselves deeper into whatever hole we're in. */
            return;
        }
        /* We're at the top level of the web site and the user entered the
           URL without a trailing slash. */
        page_url = "/";
    }

    /* The page URL is no longer the full page_url, it's just the part
       after the host name. */

    /* The link URL should start with the page URL. */
    if (strstr(link_url, page_url) != link_url) {
        return;
    }

    size_t skip_len = strlen(page_url);
    if (page_url[skip_len - 1] != '/') {
        /* The page path has no trailing slash, so the link must supply
           the separator itself ("/dir" + "/sub"). Anything else, such as
           "/dirty" below "/dir", is not a child of this page. */
        if (link_url[skip_len] != '/') {
            return;
        }
        skip_len++;
    }

    /* Move the part of the link URL after the parent page's path to
       the beginning of the link URL string, discarding what came
       before it. */
    memmove(link_url, link_url + skip_len, strlen(link_url) - skip_len + 1);
}
|