httpdirfs/link.c

241 lines
6.4 KiB
C
Raw Normal View History

2018-07-18 17:26:26 +02:00
#include <ctype.h>
#include "string.h"
2018-07-20 14:59:25 +02:00
#include "link.h"
2018-07-18 17:26:26 +02:00
/*
 * libcurl write callback: append the received chunk to the Link's body buffer.
 *
 * contents - pointer to the received bytes
 * size     - size of one element
 * nmemb    - number of elements
 * userp    - the Link registered through CURLOPT_WRITEDATA
 *
 * Returns the number of bytes consumed, or 0 if the buffer could not be
 * grown. The buffer is kept NUL-terminated after every chunk so it can be
 * handed to string-based parsers.
 */
static size_t
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
    size_t realsize = size * nmemb;
    Link *mem = (Link *)userp;

    /*
     * Grow through a temporary: assigning realloc()'s result straight to
     * mem->body would leak the old buffer and leave body_sz describing
     * memory that is no longer reachable when the allocation fails.
     */
    void *grown = realloc(mem->body, mem->body_sz + realsize + 1);
    if (grown == NULL) {
        /* out of memory! */
        fprintf(stderr, "not enough memory (realloc returned NULL)\n");
        return 0;
    }
    mem->body = grown;

    memcpy(&(mem->body[mem->body_sz]), contents, realsize);
    mem->body_sz += realsize;
    mem->body[mem->body_sz] = 0;
    return realsize;
}
2018-07-20 03:09:51 +02:00
Link *Link_new(const char *p_url)
2018-07-18 17:26:26 +02:00
{
2018-07-20 03:09:51 +02:00
Link *link = calloc(1, sizeof(Link));
2018-07-20 14:59:25 +02:00
strncpy(link->p_url, p_url, LINK_LEN_MAX);
2018-07-20 03:09:51 +02:00
link->type = LINK_UNKNOWN;
2018-07-20 14:59:25 +02:00
link->curl = curl_easy_init();
2018-07-20 03:09:51 +02:00
link->res = -1;
/* set up some basic curl stuff */
2018-07-20 14:59:25 +02:00
curl_easy_setopt(link->curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
curl_easy_setopt(link->curl, CURLOPT_WRITEDATA, (void *)link);
curl_easy_setopt(link->curl, CURLOPT_USERAGENT, "mount-http-dir/libcurl");
2018-07-20 16:38:44 +02:00
curl_easy_setopt(link->curl, CURLOPT_VERBOSE, 1);
2018-07-20 03:09:51 +02:00
return link;
}
/*
 * Release a Link created by Link_new(): its curl handle, its downloaded
 * body and the structure itself. Passing NULL is a no-op.
 *
 * The caller's pointer is not modified (it is passed by value), so the
 * caller must not use it after this call.
 */
void Link_free(Link *link)
{
    if (link == NULL) {
        return;
    }
    if (link->curl != NULL) {
        curl_easy_cleanup(link->curl);
    }
    free(link->body);
    free(link);
}
2018-07-20 16:38:44 +02:00
/*
 * Download the byte range [start, end] of a link using its curl handle.
 * Received data is delivered through the write callback configured on the
 * handle.
 *
 * link  - the link to download from
 * start - first byte offset of the requested range
 * end   - last byte offset of the requested range
 *
 * Returns the HTTP response code reported by libcurl (0 if none was
 * obtained). A failed transfer is reported on stderr.
 */
int Link_download(Link *link, size_t start, size_t end)
{
    CURL *curl = link->curl;
    char range_str[64];

    /* %zu is the matching conversion for size_t */
    snprintf(range_str, sizeof(range_str), "%zu-%zu", start, end);

    /* numeric options are read as long by libcurl, hence the L suffix */
    curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
    curl_easy_setopt(curl, CURLOPT_RANGE, range_str);

    CURLcode res = curl_easy_perform(curl);
    if (res != CURLE_OK) {
        fprintf(stderr, "link.c: Link_download() transfer failed: %s\n",
                curl_easy_strerror(res));
    }

    /* initialised so a failing getinfo call cannot leak stack garbage */
    long http_resp = 0;
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
    return (int)http_resp;
}
LinkTable *LinkTable_new(const char *url)
2018-07-20 03:09:51 +02:00
{
LinkTable *linktbl = calloc(1, sizeof(LinkTable));
2018-07-20 14:59:25 +02:00
/* populate the base URL */
2018-07-20 14:59:25 +02:00
LinkTable_add(linktbl, Link_new(url));
Link *head_link = linktbl->links[0];
2018-07-20 16:38:44 +02:00
head_link->type = LINK_HEAD;
2018-07-20 14:59:25 +02:00
curl_easy_setopt(head_link->curl, CURLOPT_URL, url);
/* start downloading the base URL */
head_link->res = curl_easy_perform(head_link->curl);
/* if downloading base URL failed */
if (head_link->res != CURLE_OK) {
fprintf(stderr, "link.c: LinkTable_new() cannot retrive the base URL");
LinkTable_free(linktbl);
linktbl = NULL;
2018-07-20 14:59:25 +02:00
return linktbl;
};
2018-07-20 14:59:25 +02:00
/* Otherwise parsed the received data */
GumboOutput* output = gumbo_parse(head_link->body);
HTML_to_LinkTable(output->root, linktbl);
gumbo_destroy_output(&kGumboDefaultOptions, output);
/* Fill in the link table */
LinkTable_fill(linktbl);
2018-07-20 03:09:51 +02:00
return linktbl;
2018-07-18 17:26:26 +02:00
}
2018-07-20 03:09:51 +02:00
/*
 * Release a LinkTable: every Link it holds, the pointer array and the table
 * itself. Passing NULL is a no-op.
 *
 * The caller's pointer is not modified (it is passed by value), so the
 * caller must not use it after this call.
 */
void LinkTable_free(LinkTable *linktbl)
{
    if (linktbl == NULL) {
        return;
    }
    for (int i = 0; i < linktbl->num; i++) {
        if (linktbl->links[i] != NULL) {
            Link_free(linktbl->links[i]);
        }
    }
    free(linktbl->links);
    free(linktbl);
}
2018-07-20 03:09:51 +02:00
/*
 * Append a Link to the table, growing the pointer array by one slot.
 *
 * The table takes ownership of link. A NULL link is ignored. If the array
 * cannot be grown the table is left unchanged and, since the caller has no
 * way to learn about the failure, the link is released here.
 */
void LinkTable_add(LinkTable *linktbl, Link *link)
{
    if (link == NULL) {
        return;
    }

    /*
     * Grow through a temporary and only bump num once the slot exists, so a
     * failed realloc() neither loses the existing array nor leaves num
     * counting an entry that was never stored.
     */
    void *grown = realloc(linktbl->links,
                          ((size_t)linktbl->num + 1) * sizeof(Link *));
    if (grown == NULL) {
        fprintf(stderr, "link.c: LinkTable_add() out of memory\n");
        Link_free(link);
        return;
    }
    linktbl->links = grown;
    linktbl->links[linktbl->num] = link;
    linktbl->num++;
}
2018-07-20 14:59:25 +02:00
/*
 * Classify every LINK_UNKNOWN entry of the table.
 *
 * For each such entry the full URL is built from the base URL (entry 0) and
 * the entry's own p_url, and a body-less request is issued. The entry then
 * becomes:
 *   LINK_FILE    - HTTP_OK with a known content length (stored in
 *                  content_length)
 *   LINK_DIR     - HTTP_OK without a content length
 *   LINK_INVALID - anything else, including a failed transfer
 */
void LinkTable_fill(LinkTable *linktbl)
{
    for (int i = 0; i < linktbl->num; i++) {
        Link *this_link = linktbl->links[i];
        if (this_link == NULL || this_link->type != LINK_UNKNOWN) {
            continue;
        }

        CURL *curl = this_link->curl;
        char *url = url_append(linktbl->links[0]->p_url, this_link->p_url);
        if (url == NULL) {
            this_link->type = LINK_INVALID;
            continue;
        }
        curl_easy_setopt(curl, CURLOPT_URL, url);
        /* the URL string is not needed after it has been handed to curl */
        free(url);

        /* numeric options are read as long by libcurl, hence the L suffix */
        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
        CURLcode res = curl_easy_perform(curl);

        /* initialised so a failing getinfo call cannot leak stack garbage */
        long http_resp = 0;
        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
        if (res != CURLE_OK || http_resp != HTTP_OK) {
            this_link->type = LINK_INVALID;
            continue;
        }

        /* libcurl reports -1 when the content length is unknown */
        double cl = -1;
        curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &cl);
        if (cl < 0) {
            this_link->content_length = 0;
            this_link->type = LINK_DIR;
        } else {
            this_link->content_length = cl;
            this_link->type = LINK_FILE;
        }
    }
}
2018-07-20 03:09:51 +02:00
/*
 * Dump the table to stdout, one line per entry:
 * index, type (printed as a character), content length and p_url.
 */
void LinkTable_print(LinkTable *linktbl)
{
    int idx = 0;

    while (idx < linktbl->num) {
        Link *entry = linktbl->links[idx];

        printf("%d %c %lu %s\n",
               idx, entry->type, entry->content_length, entry->p_url);
        idx++;
    }
}
/*
 * Decide whether an href value should be added to the link table.
 *
 * Returns 1 when the name starts with an alphanumeric character and is not
 * an absolute http:// or https:// URL; returns 0 otherwise (including for
 * NULL and the empty string).
 */
static int is_valid_link(const char *n)
{
    if (n == NULL) {
        return 0;
    }

    /*
     * The link name has to start with alphanumerical character.
     * <ctype.h> functions are only defined for values representable as
     * unsigned char (or EOF), so a plain char that may be negative must be
     * converted first.
     */
    if (!isalnum((unsigned char)n[0])) {
        return 0;
    }

    /* check for http:// and https:// */
    if (strncmp(n, "http://", sizeof("http://") - 1) == 0 ||
        strncmp(n, "https://", sizeof("https://") - 1) == 0) {
        return 0;
    }
    return 1;
}
/*
* Shamelessly copied and pasted from:
* https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
*/
2018-07-20 03:09:51 +02:00
void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
2018-07-18 17:26:26 +02:00
{
if (node->type != GUMBO_NODE_ELEMENT) {
return;
}
GumboAttribute* href;
if (node->v.element.tag == GUMBO_TAG_A &&
(href = gumbo_get_attribute(&node->v.element.attributes, "href"))) {
/* if it is valid, copy the link onto the heap */
if (is_valid_link(href->value)) {
2018-07-20 03:09:51 +02:00
LinkTable_add(linktbl, Link_new(href->value));
2018-07-18 17:26:26 +02:00
}
}
/* Note the recursive call, lol. */
GumboVector *children = &node->v.element.children;
for (size_t i = 0; i < children->length; ++i) {
2018-07-20 03:09:51 +02:00
HTML_to_LinkTable((GumboNode*)children->data[i], linktbl);
2018-07-18 17:26:26 +02:00
}
return;
}
/*
 * the upper level
 *
 * Return a newly allocated copy of url truncated just after its last '/'.
 * If url contains no '/', an empty string is returned.
 *
 * The caller owns the result and must free() it. Returns NULL if url is
 * NULL or memory cannot be allocated.
 */
char *url_upper(const char *url)
{
    if (url == NULL) {
        return NULL;
    }

    const char *pt = strrchr(url, '/');
    /* +1 for the '/'; without any '/' there is no upper level to keep */
    size_t len = (pt != NULL) ? (size_t)(pt - url) + 1 : 0;

    char *str = malloc(len + 1);
    if (str == NULL) {
        return NULL;
    }
    memcpy(str, url, len);
    str[len] = '\0';
    return str;
}
/*
 * append url
 *
 * Return a newly allocated string made of url followed by sublink, with a
 * single '/' inserted between them when url is non-empty and does not
 * already end in one.
 *
 * The caller owns the result and must free() it. Returns NULL if either
 * argument is NULL or memory cannot be allocated.
 */
char *url_append(const char *url, const char *sublink)
{
    if (url == NULL || sublink == NULL) {
        return NULL;
    }

    size_t ul = strlen(url);
    size_t sl = strlen(sublink);
    /* check the length first so an empty url never indexes url[-1] */
    size_t needs_separator = (ul > 0 && url[ul - 1] != '/') ? 1 : 0;

    /* +1 for the terminating NUL */
    char *str = malloc(ul + needs_separator + sl + 1);
    if (str == NULL) {
        return NULL;
    }

    memcpy(str, url, ul);
    if (needs_separator) {
        str[ul] = '/';
    }
    /* sl + 1 copies sublink's terminator as well */
    memcpy(str + ul + needs_separator, sublink, sl + 1);
    return str;
}