Split link-related functions into link.c and link.h
This commit is contained in:
parent
46c98be116
commit
684831a961
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
|||
CC=gcc
|
||||
CFLAGS= -g -O2 -Wall -Wextra -lgumbo -lcurl -lfuse -lcrypto \
|
||||
-D_FILE_OFFSET_BITS=64
|
||||
OBJ = main.o network.o fuse_local.o
|
||||
OBJ = main.o network.o fuse_local.o link.o
|
||||
|
||||
%.o: %.c
|
||||
$(CC) -c -o $@ $< $(CFLAGS)
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#include "fuse_local.h"
|
||||
|
||||
#include "link.h"
|
||||
#include "network.h"
|
||||
|
||||
#include <errno.h>
|
||||
|
|
|
@ -0,0 +1,434 @@
|
|||
#include "link.h"
|
||||
|
||||
#include "network.h"
|
||||
|
||||
#include <gumbo.h>
|
||||
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
|
||||
#define HTTP_OK 200
|
||||
#define HTTP_PARTIAL_CONTENT 206
|
||||
#define HTTP_RANGE_NOT_SATISFIABLE 416
|
||||
|
||||
/* ---------------- External variables -----------------------*/
|
||||
LinkTable *ROOT_LINK_TBL;
|
||||
|
||||
static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl);
|
||||
static Link *Link_new(const char *p_url, LinkType type);
|
||||
static CURL *Link_to_curl(Link *link);
|
||||
static void Link_get_stat(Link *this_link);
|
||||
static void LinkTable_add(LinkTable *linktbl, Link *link);
|
||||
void LinkTable_fill(LinkTable *linktbl);
|
||||
static void LinkTable_free(LinkTable *linktbl);
|
||||
static void LinkTable_print(LinkTable *linktbl);
|
||||
static Link *path_to_Link_recursive(char *path, LinkTable *linktbl);
|
||||
static LinkType p_url_type(const char *p_url);
|
||||
static char *url_append(const char *url, const char *sublink);
|
||||
|
||||
/**
|
||||
* Shamelessly copied and pasted from:
|
||||
* https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
|
||||
*/
|
||||
static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
|
||||
{
|
||||
if (node->type != GUMBO_NODE_ELEMENT) {
|
||||
return;
|
||||
}
|
||||
GumboAttribute* href;
|
||||
|
||||
if (node->v.element.tag == GUMBO_TAG_A &&
|
||||
(href = gumbo_get_attribute(&node->v.element.attributes, "href"))) {
|
||||
/* if it is valid, copy the link onto the heap */
|
||||
LinkType type = p_url_type(href->value);
|
||||
if (type) {
|
||||
LinkTable_add(linktbl, Link_new(href->value, type));
|
||||
}
|
||||
}
|
||||
/* Note the recursive call, lol. */
|
||||
GumboVector *children = &node->v.element.children;
|
||||
for (size_t i = 0; i < children->length; ++i) {
|
||||
HTML_to_LinkTable((GumboNode*)children->data[i], linktbl);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/**
 * Allocate a zero-initialised Link for one href.
 *
 * The name is copied into link->p_url (truncated if it does not fit) and a
 * single trailing '/' is removed. The process exits if allocation fails.
 *
 * \param p_url link name as found in the page
 * \param type  classification to store in the new Link
 * \return a heap-allocated Link owned by the caller
 */
static Link *Link_new(const char *p_url, LinkType type)
{
    Link *link = calloc(1, sizeof *link);
    if (!link) {
        fprintf(stderr, "Link_new(): calloc failure!\n");
        exit(EXIT_FAILURE);
    }

    /*
     * Copy one byte less than the buffer holds: the buffer comes from
     * calloc(), so the final byte is already NUL and the string is always
     * terminated even when the source is too long.
     */
    strncpy(link->p_url, p_url, sizeof link->p_url - 1);
    link->type = type;

    /* remove the trailing '/' if there is one; an empty name has none */
    size_t len = strlen(link->p_url);
    if (len > 0 && link->p_url[len - 1] == '/') {
        link->p_url[len - 1] = '\0';
    }

    return link;
}
|
||||
|
||||
/**
 * Create a curl easy handle preconfigured for fetching link->f_url.
 *
 * The process exits if libcurl cannot create a handle, matching how the
 * other allocation failures in this file are handled; previously the NULL
 * handle was configured and returned to callers that never check it.
 *
 * \param link link whose f_url becomes the request URL
 * \return a new easy handle; the caller is responsible for its cleanup
 */
static CURL *Link_to_curl(Link *link)
{
    CURL *curl = curl_easy_init();
    if (!curl) {
        fprintf(stderr, "Link_to_curl(): curl_easy_init() failed!\n");
        exit(EXIT_FAILURE);
    }

    /*
     * Numeric options are passed as long, as curl_easy_setopt() requires.
     * The user agent is built from adjacent literals so that source
     * indentation can never end up inside the string.
     */
    curl_easy_setopt(curl, CURLOPT_USERAGENT,
                     "httpdirfs - "
                     "https://github.com/fangfufu/httpdirfs");
    curl_easy_setopt(curl, CURLOPT_VERBOSE, 0L);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    /* for following directories without the '/' */
    curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 2L);
    curl_easy_setopt(curl, CURLOPT_URL, link->f_url);
    curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L);
    curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15L);
    curl_easy_setopt(curl, CURLOPT_SHARE, curl_share);
    /*
     * The write back function pointer has to be set at curl handle creation,
     * for thread safety
     */
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);

    return curl;
}
|
||||
|
||||
/**
 * Queue a non-blocking, body-less request that fetches the size and
 * modification time of a file link.
 *
 * Links whose type is not LINK_FILE are ignored. The process exits if the
 * transfer descriptor cannot be allocated (previously a NULL pointer was
 * dereferenced).
 *
 * \param this_link link to query; its f_url must already be filled in
 */
void Link_get_stat(Link *this_link)
{
    fprintf(stderr, "Link_get_stat(%s);\n", this_link->f_url);

    if (this_link->type != LINK_FILE) {
        return;
    }

    CURL *curl = Link_to_curl(this_link);
    curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
    curl_easy_setopt(curl, CURLOPT_FILETIME, 1L);

    /*
     * The descriptor must live on the heap because the transfer outlives
     * this function. Per the original note it gets freed in
     * curl_multi_perform_once(). calloc() is used so that no field is left
     * uninitialised.
     */
    TransferStruct *transfer = calloc(1, sizeof *transfer);
    if (!transfer) {
        fprintf(stderr, "Link_get_stat(): calloc failed!\n");
        exit(EXIT_FAILURE);
    }
    transfer->link = this_link;
    transfer->type = FILESTAT;
    curl_easy_setopt(curl, CURLOPT_PRIVATE, transfer);

    transfer_nonblocking(curl);
}
|
||||
|
||||
/**
 * Update a link from the result of a finished stat request.
 *
 * A response other than HTTP 200 marks the link LINK_INVALID. Otherwise
 * the modification time is stored, and the link becomes LINK_FILE with the
 * reported content length, or LINK_DIR with length 0 when libcurl reports
 * the length as -1.
 *
 * \param this_link link to update
 * \param curl      easy handle of the completed transfer
 */
void Link_set_stat(Link* this_link, CURL *curl)
{
    long status;
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);

    if (status != HTTP_OK) {
        this_link->type = LINK_INVALID;
        return;
    }

    double length = 0;
    curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &length);
    curl_easy_getinfo(curl, CURLINFO_FILETIME, &(this_link->time));

    if (length != -1) {
        this_link->content_length = length;
        this_link->type = LINK_FILE;
    } else {
        /* Turns out not to be a file after all */
        this_link->content_length = 0;
        this_link->type = LINK_DIR;
    }
}
|
||||
|
||||
/**
 * Append a link to the end of a table, growing the pointer array by one.
 * The process exits if the array cannot be grown.
 *
 * \param linktbl table to extend
 * \param link    link stored in the new last slot
 */
static void LinkTable_add(LinkTable *linktbl, Link *link)
{
    int count = linktbl->num + 1;
    Link **grown = realloc(linktbl->links, count * sizeof(Link *));

    if (grown == NULL) {
        fprintf(stderr, "LinkTable_add(): realloc failure!\n");
        exit(EXIT_FAILURE);
    }

    grown[count - 1] = link;
    linktbl->links = grown;
    linktbl->num = count;
}
|
||||
|
||||
/**
 * Complete every valid link of a table.
 *
 * For each link with a non-zero type this
 *  - builds f_url from the head link's f_url and the link's p_url,
 *  - replaces p_url by its URL-unescaped form,
 *  - queues a stat request for files whose length is still 0, and
 *  - copies the head link's time to directories.
 * It then blocks until curl_multi_perform_once() returns 0.
 *
 * \param linktbl table to fill; index 0 must hold the head link
 */
void LinkTable_fill(LinkTable *linktbl)
{
    Link *head_link = linktbl->links[0];
    for (int i = 0; i < linktbl->num; i++) {
        Link *this_link = linktbl->links[i];
        if (!this_link->type) {
            continue;
        }

        /* snprintf() always terminates, unlike the strncpy() it replaces */
        char *url = url_append(head_link->f_url, this_link->p_url);
        snprintf(this_link->f_url, sizeof this_link->f_url, "%s", url);
        free(url);

        /* curl_easy_unescape() returns NULL on failure; keep p_url then */
        char *unescaped_p_url = curl_easy_unescape(NULL, this_link->p_url, 0,
                                                   NULL);
        if (unescaped_p_url) {
            snprintf(this_link->p_url, sizeof this_link->p_url, "%s",
                     unescaped_p_url);
            curl_free(unescaped_p_url);
        } else {
            fprintf(stderr, "LinkTable_fill(): cannot unescape %s\n",
                    this_link->p_url);
        }

        if (this_link->type == LINK_FILE && !(this_link->content_length)) {
            Link_get_stat(this_link);
        } else if (this_link->type == LINK_DIR) {
            this_link->time = head_link->time;
        }
    }

    /* Block until the LinkTable is filled up */
    while (curl_multi_perform_once()) {
        usleep(1000);
    }
}
|
||||
|
||||
/**
 * Release a link table, all of its links, and any sub-tables hanging off
 * those links through next_table.
 *
 * \param linktbl table to free; NULL is accepted and ignored
 */
static void LinkTable_free(LinkTable *linktbl)
{
    if (!linktbl) {
        return;
    }
    for (int i = 0; i < linktbl->num; i++) {
        Link *link = linktbl->links[i];
        if (link) {
            /* sub-tables are owned by the link that created them */
            LinkTable_free(link->next_table);
            free(link);
        }
    }
    free(linktbl->links);
    free(linktbl);
}
|
||||
|
||||
LinkTable *LinkTable_new(const char *url)
|
||||
{
|
||||
fprintf(stderr, "LinkTable_new(%s);\n", url);
|
||||
|
||||
LinkTable *linktbl = calloc(1, sizeof(LinkTable));
|
||||
if (!linktbl) {
|
||||
fprintf(stderr, "LinkTable_new(): calloc failure!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/* populate the base URL */
|
||||
LinkTable_add(linktbl, Link_new("/", LINK_HEAD));
|
||||
Link *head_link = linktbl->links[0];
|
||||
head_link->type = LINK_HEAD;
|
||||
strncpy(head_link->f_url, url, URL_LEN_MAX);
|
||||
|
||||
/* start downloading the base URL */
|
||||
CURL *curl = Link_to_curl(head_link);
|
||||
MemoryStruct buf;
|
||||
buf.size = 0;
|
||||
buf.memory = NULL;
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&buf);
|
||||
|
||||
transfer_blocking(curl);
|
||||
|
||||
/* if downloading base URL failed */
|
||||
long http_resp;
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
|
||||
if (http_resp != HTTP_OK) {
|
||||
fprintf(stderr, "link.c: LinkTable_new() cannot retrive the base URL, \
|
||||
URL: %s, HTTP %ld\n", url, http_resp);
|
||||
|
||||
LinkTable_free(linktbl);
|
||||
linktbl = NULL;
|
||||
return linktbl;
|
||||
};
|
||||
curl_easy_getinfo(curl, CURLINFO_FILETIME, &(head_link->time));
|
||||
curl_easy_cleanup(curl);
|
||||
|
||||
/* Otherwise parsed the received data */
|
||||
GumboOutput* output = gumbo_parse(buf.memory);
|
||||
HTML_to_LinkTable(output->root, linktbl);
|
||||
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
||||
free(buf.memory);
|
||||
|
||||
/* Fill in the link table */
|
||||
LinkTable_fill(linktbl);
|
||||
return linktbl;
|
||||
}
|
||||
|
||||
/** \brief print a LinkTable */
|
||||
static void LinkTable_print(LinkTable *linktbl)
|
||||
{
|
||||
fprintf(stderr, "--------------------------------------------\n");
|
||||
fprintf(stderr, " LinkTable %p for %s\n", linktbl,
|
||||
linktbl->links[0]->f_url);
|
||||
fprintf(stderr, "--------------------------------------------\n");
|
||||
for (int i = 0; i < linktbl->num; i++) {
|
||||
Link *this_link = linktbl->links[i];
|
||||
fprintf(stderr, "%d %c %lu %s %s\n",
|
||||
i,
|
||||
this_link->type,
|
||||
this_link->content_length,
|
||||
this_link->p_url,
|
||||
this_link->f_url
|
||||
);
|
||||
|
||||
}
|
||||
fprintf(stderr, "--------------------------------------------\n");
|
||||
}
|
||||
|
||||
Link *path_to_Link(const char *path)
|
||||
{
|
||||
char *new_path = strndup(path, URL_LEN_MAX);
|
||||
if (!new_path) {
|
||||
fprintf(stderr, "path_to_Link(): cannot allocate memory\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
Link *link = path_to_Link_recursive(new_path, ROOT_LINK_TBL);
|
||||
free(new_path);
|
||||
return link;
|
||||
}
|
||||
|
||||
/**
 * Resolve a path against a link table, descending one component at a time.
 *
 * A leading and a trailing '/' are ignored. The first component is matched
 * against the p_url of every entry except the head link; when further
 * components remain, the matching entry's sub-table is created on demand
 * with LinkTable_new() and searched recursively.
 *
 * \param path    writable path; it is cut up in place during the lookup
 * \param linktbl table to search; NULL yields NULL
 * \return the matching Link, or NULL if no entry matches or a sub-table
 *         could not be downloaded
 */
static Link *path_to_Link_recursive(char *path, LinkTable *linktbl)
{
    /* LinkTable_new() returns NULL on failure, so the table may be missing */
    if (!linktbl) {
        return NULL;
    }

    /* skip the leading '/' if it exists */
    if (*path == '/') {
        path++;
    }

    /* remove the last '/' if it exists; an empty path has none */
    size_t len = strnlen(path, URL_LEN_MAX);
    if (len > 0 && path[len - 1] == '/') {
        path[len - 1] = '\0';
    }

    /*
     * If another '/' remains, terminate the string there, effectively
     * creating two substrings: the current component and the rest.
     */
    char *next_path = NULL;
    char *slash = strchr(path, '/');
    if (slash) {
        *slash = '\0';
        next_path = slash + 1;
    }

    /* index 0 is the head link, which never matches a path component */
    for (int i = 1; i < linktbl->num; i++) {
        Link *link = linktbl->links[i];
        if (strncmp(path, link->p_url, LINK_LEN_MAX)) {
            continue;
        }

        /* no further components: we found our link */
        if (!next_path) {
            return link;
        }

        /* the next sub-directory exists; load its table on first use */
        if (!(link->next_table)) {
            link->next_table = LinkTable_new(link->f_url);
            if (!(link->next_table)) {
                return NULL;
            }
            fprintf(stderr, "Created new link table for %s\n", link->f_url);
            LinkTable_print(link->next_table);
        }

        return path_to_Link_recursive(next_path, link->next_table);
    }
    return NULL;
}
|
||||
|
||||
long path_download(const char *path, char *output_buf, size_t size,
|
||||
off_t offset)
|
||||
{
|
||||
Link *link;
|
||||
link = path_to_Link(path);
|
||||
if (!link) {
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
size_t start = offset;
|
||||
size_t end = start + size;
|
||||
char range_str[64];
|
||||
snprintf(range_str, sizeof(range_str), "%lu-%lu", start, end);
|
||||
|
||||
MemoryStruct buf;
|
||||
buf.size = 0;
|
||||
buf.memory = NULL;
|
||||
|
||||
fprintf(stderr, "path_download(%s, %s);\n",
|
||||
path, range_str);
|
||||
|
||||
CURL *curl = Link_to_curl(link);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&buf);
|
||||
curl_easy_setopt(curl, CURLOPT_RANGE, range_str);
|
||||
|
||||
transfer_blocking(curl);
|
||||
|
||||
long http_resp;
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
|
||||
if ( !(
|
||||
(http_resp != HTTP_OK) ||
|
||||
(http_resp != HTTP_PARTIAL_CONTENT) ||
|
||||
(http_resp != HTTP_RANGE_NOT_SATISFIABLE)
|
||||
)) {
|
||||
fprintf(stderr, "path_download(): Could not download %s, HTTP %ld\n",
|
||||
link->f_url, http_resp);
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
double dl;
|
||||
curl_easy_getinfo(curl, CURLINFO_SIZE_DOWNLOAD, &dl);
|
||||
|
||||
size_t recv = dl;
|
||||
if (recv > size) {
|
||||
recv = size;
|
||||
}
|
||||
|
||||
memmove(output_buf, buf.memory, recv);
|
||||
curl_easy_cleanup(curl);
|
||||
free(buf.memory);
|
||||
return recv;
|
||||
}
|
||||
|
||||
static LinkType p_url_type(const char *p_url)
|
||||
{
|
||||
/* The link name has to start with alphanumerical character */
|
||||
if (!isalnum(p_url[0])) {
|
||||
return LINK_INVALID;
|
||||
}
|
||||
|
||||
/* check for http:// and https:// */
|
||||
if ( !strncmp(p_url, "http://", 7) || !strncmp(p_url, "https://", 8) ) {
|
||||
return LINK_INVALID;
|
||||
}
|
||||
|
||||
if ( p_url[strlen(p_url) - 1] == '/' ) {
|
||||
return LINK_DIR;
|
||||
}
|
||||
|
||||
return LINK_FILE;
|
||||
}
|
||||
|
||||
/**
 * Join a base URL and a sub-link into a newly allocated string.
 *
 * Exactly one '/' separates the two parts: a separator is inserted unless
 * the base already ends in '/'. An empty base yields a copy of the
 * sub-link. The process exits if allocation fails.
 *
 * \param url     base URL
 * \param sublink name to append
 * \return heap-allocated result; the caller must free() it
 */
static char *url_append(const char *url, const char *sublink)
{
    size_t ul = strlen(url);
    size_t sl = strlen(sublink);

    /* checking ul first avoids reading url[-1] for an empty base */
    int needs_separator = (ul > 0 && url[ul - 1] != '/');

    /* calloc() zero-fills, which provides the terminating NUL */
    char *str = calloc(ul + sl + needs_separator + 1, sizeof(char));
    if (!str) {
        fprintf(stderr, "url_append(): calloc failure!\n");
        exit(EXIT_FAILURE);
    }

    memcpy(str, url, ul);
    if (needs_separator) {
        str[ul] = '/';
    }
    memcpy(str + ul + needs_separator, sublink, sl);
    return str;
}
|
|
@ -0,0 +1,62 @@
|
|||
#ifndef LINK_H
|
||||
#define LINK_H
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#define URL_LEN_MAX 2048
|
||||
#define LINK_LEN_MAX 255
|
||||
|
||||
/** \brief the link type */
|
||||
typedef enum {
|
||||
LINK_HEAD = 'H', /* entry at index 0 of a LinkTable, holding its base URL */
|
||||
LINK_DIR = 'D', /* link treated as a directory */
|
||||
LINK_FILE = 'F', /* link treated as a regular file */
|
||||
LINK_INVALID = '\0' /* rejected link; zero, so the type tests false */
|
||||
} LinkType;
|
||||
|
||||
/**
|
||||
* \brief link table type
|
||||
* \details index 0 contains the Link for the base URL
|
||||
*/
|
||||
typedef struct LinkTable LinkTable;
|
||||
|
||||
/** \brief link data type */
|
||||
typedef struct Link Link;
|
||||
|
||||
|
||||
struct Link {
|
||||
char p_url[LINK_LEN_MAX]; /* link name within its table, trailing '/' removed */
|
||||
char f_url[URL_LEN_MAX]; /* full URL used for HTTP requests */
|
||||
LinkType type; /* classification of this link */
|
||||
size_t content_length; /* length reported by the server; 0 until known */
|
||||
LinkTable *next_table; /* table of this directory; NULL until first traversed */
|
||||
long time; /* value of CURLINFO_FILETIME for this link */
|
||||
};
|
||||
|
||||
struct LinkTable {
|
||||
int num; /* number of entries in links, head link included */
|
||||
Link **links; /* array of num Link pointers; index 0 is the head link */
|
||||
};
|
||||
|
||||
/** \brief root link table */
|
||||
extern LinkTable *ROOT_LINK_TBL;
|
||||
|
||||
void Link_set_stat(Link* this_link, CURL *curl);
|
||||
|
||||
/** \brief create a new LinkTable */
|
||||
LinkTable *LinkTable_new(const char *url);
|
||||
|
||||
/**
 * \brief download a link
 * \return the number of bytes downloaded, or -ENOENT if the path cannot be
 * resolved
 */
|
||||
long path_download(const char *path, char *output_buf, size_t size,
|
||||
off_t offset);
|
||||
|
||||
/** \brief find the link associated with a path */
|
||||
Link *path_to_Link(const char *path);
|
||||
|
||||
|
||||
#endif
|
3
main.c
3
main.c
|
@ -1,7 +1,8 @@
|
|||
#include "network.h"
|
||||
#include "fuse_local.h"
|
||||
|
||||
#include <stdio.h>
|
||||
// #include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static void help();
|
||||
|
||||
|
|
461
network.c
461
network.c
|
@ -1,43 +1,19 @@
|
|||
#include "network.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
#include <gumbo.h>
|
||||
#include "link.h"
|
||||
|
||||
#include <openssl/crypto.h>
|
||||
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#define HTTP_OK 200
|
||||
#define HTTP_PARTIAL_CONTENT 206
|
||||
#define HTTP_RANGE_NOT_SATISFIABLE 416
|
||||
|
||||
/* ---------------- External variables -----------------------*/
|
||||
LinkTable *ROOT_LINK_TBL;
|
||||
|
||||
/* ----------------- Local structs ---------------------------*/
|
||||
typedef struct {
|
||||
char *memory;
|
||||
size_t size;
|
||||
} MemoryStruct;
|
||||
|
||||
typedef enum {
|
||||
FILESTAT = 's',
|
||||
DATA = 'd'
|
||||
} TransferType;
|
||||
|
||||
typedef struct {
|
||||
TransferType type;
|
||||
int transferring;
|
||||
Link *link;
|
||||
} TransferStruct;
|
||||
/* ----------------- External variables ----------------------*/
|
||||
CURLSH *curl_share;
|
||||
|
||||
/* ----------------- Static variable ----------------------- */
|
||||
/** \brief curl shared interface */
|
||||
static CURLSH *curl_share;
|
||||
/** \brief curl multi interface handle */
|
||||
static CURLM *curl_multi;
|
||||
/** \brief mutex for transfer functions */
|
||||
|
@ -54,25 +30,8 @@ static void curl_callback_lock(CURL *handle, curl_lock_data data,
|
|||
curl_lock_access access, void *userptr);
|
||||
static void curl_callback_unlock(CURL *handle, curl_lock_data data,
|
||||
void *userptr);
|
||||
static int curl_multi_perform_once();
|
||||
void curl_process_msgs(CURLMsg *curl_msg, int n_running_curl, int n_mesgs);
|
||||
static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl);
|
||||
static Link *Link_new(const char *p_url, LinkType type);
|
||||
static CURL *Link_to_curl(Link *link);
|
||||
void Link_get_stat(Link *this_link);
|
||||
static void Link_set_stat(Link* this_link, CURL *curl);
|
||||
static void LinkTable_add(LinkTable *linktbl, Link *link);
|
||||
void LinkTable_fill(LinkTable *linktbl);
|
||||
static void LinkTable_free(LinkTable *linktbl);
|
||||
static void LinkTable_print(LinkTable *linktbl);
|
||||
static void transfer_blocking(CURL *curl);
|
||||
static void transfer_nonblocking(CURL *curl);
|
||||
static Link *path_to_Link_recursive(char *path, LinkTable *linktbl);
|
||||
static LinkType p_url_type(const char *p_url);
|
||||
static unsigned long thread_id(void);
|
||||
static char *url_append(const char *url, const char *sublink);
|
||||
static size_t
|
||||
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp);
|
||||
|
||||
/* -------------------- Functions -------------------------- */
|
||||
static void crypto_lock_callback(int mode, int type, char *file, int line)
|
||||
|
@ -123,7 +82,7 @@ static void curl_callback_unlock(CURL *handle, curl_lock_data data,
|
|||
pthread_mutex_unlock(&curl_lock);
|
||||
}
|
||||
|
||||
static int curl_multi_perform_once()
|
||||
int curl_multi_perform_once()
|
||||
{
|
||||
pthread_mutex_lock(&transfer_lock);
|
||||
/* Get curl multi interface to perform pending tasks */
|
||||
|
@ -231,249 +190,6 @@ void curl_process_msgs(CURLMsg *curl_msg, int n_running_curl, int n_mesgs)
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Shamelessly copied and pasted from:
|
||||
* https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
|
||||
*/
|
||||
static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
|
||||
{
|
||||
if (node->type != GUMBO_NODE_ELEMENT) {
|
||||
return;
|
||||
}
|
||||
GumboAttribute* href;
|
||||
|
||||
if (node->v.element.tag == GUMBO_TAG_A &&
|
||||
(href = gumbo_get_attribute(&node->v.element.attributes, "href"))) {
|
||||
/* if it is valid, copy the link onto the heap */
|
||||
LinkType type = p_url_type(href->value);
|
||||
if (type) {
|
||||
LinkTable_add(linktbl, Link_new(href->value, type));
|
||||
}
|
||||
}
|
||||
/* Note the recursive call, lol. */
|
||||
GumboVector *children = &node->v.element.children;
|
||||
for (size_t i = 0; i < children->length; ++i) {
|
||||
HTML_to_LinkTable((GumboNode*)children->data[i], linktbl);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
static Link *Link_new(const char *p_url, LinkType type)
|
||||
{
|
||||
Link *link = calloc(1, sizeof(Link));
|
||||
if (!link) {
|
||||
fprintf(stderr, "Link_new(): calloc failure!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
strncpy(link->p_url, p_url, LINK_LEN_MAX);
|
||||
link->type = type;
|
||||
|
||||
/* remove the '/' from p_url if it exists */
|
||||
char *c = &(link->p_url[strnlen(link->p_url, LINK_LEN_MAX) - 1]);
|
||||
if ( *c == '/') {
|
||||
*c = '\0';
|
||||
}
|
||||
|
||||
return link;
|
||||
}
|
||||
|
||||
static CURL *Link_to_curl(Link *link)
|
||||
{
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
fprintf(stderr, "Link_to_curl(): curl_easy_init() failed!\n");
|
||||
}
|
||||
|
||||
/* set up some basic curl stuff */
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "httpdirfs - \
|
||||
https://github.com/fangfufu/httpdirfs");
|
||||
curl_easy_setopt(curl, CURLOPT_VERBOSE, 0);
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
|
||||
/* for following directories without the '/' */
|
||||
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 2);
|
||||
curl_easy_setopt(curl, CURLOPT_URL, link->f_url);
|
||||
curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1);
|
||||
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15);
|
||||
curl_easy_setopt(curl, CURLOPT_SHARE, curl_share);
|
||||
/*
|
||||
* The write back function pointer has to be set at curl handle creation,
|
||||
* for thread safety
|
||||
*/
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
|
||||
|
||||
return curl;
|
||||
}
|
||||
|
||||
void Link_get_stat(Link *this_link)
|
||||
{
|
||||
fprintf(stderr, "Link_get_size(%s);\n", this_link->f_url);
|
||||
|
||||
if (this_link->type == LINK_FILE) {
|
||||
CURL *curl = Link_to_curl(this_link);
|
||||
curl_easy_setopt(curl, CURLOPT_NOBODY, 1);
|
||||
curl_easy_setopt(curl, CURLOPT_FILETIME, 1L);
|
||||
|
||||
/*
|
||||
* We need to put the variable on the heap, because otherwise the
|
||||
* variable gets popped from the stack as the function returns.
|
||||
*
|
||||
* It gets freed in curl_multi_perform_once();
|
||||
*/
|
||||
TransferStruct *transfer = malloc(sizeof(TransferStruct));
|
||||
if (!transfer) {
|
||||
fprintf(stderr, "Link_get_size(): malloc failed!\n");
|
||||
}
|
||||
transfer->link = this_link;
|
||||
transfer->type = FILESTAT;
|
||||
curl_easy_setopt(curl, CURLOPT_PRIVATE, transfer);
|
||||
|
||||
transfer_nonblocking(curl);
|
||||
}
|
||||
}
|
||||
|
||||
static void Link_set_stat(Link* this_link, CURL *curl)
|
||||
{
|
||||
long http_resp;
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
|
||||
if (http_resp == HTTP_OK) {
|
||||
double cl = 0;
|
||||
curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &cl);
|
||||
curl_easy_getinfo(curl, CURLINFO_FILETIME, &(this_link->time));
|
||||
|
||||
if (cl == -1) {
|
||||
/* Turns out not to be a file after all */
|
||||
this_link->content_length = 0;
|
||||
this_link->type = LINK_DIR;
|
||||
} else {
|
||||
this_link->content_length = cl;
|
||||
this_link->type = LINK_FILE;
|
||||
}
|
||||
} else {
|
||||
this_link->type = LINK_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
static void LinkTable_add(LinkTable *linktbl, Link *link)
|
||||
{
|
||||
linktbl->num++;
|
||||
linktbl->links = realloc(linktbl->links, linktbl->num * sizeof(Link *));
|
||||
if (!linktbl->links) {
|
||||
fprintf(stderr, "LinkTable_add(): realloc failure!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
linktbl->links[linktbl->num - 1] = link;
|
||||
}
|
||||
|
||||
void LinkTable_fill(LinkTable *linktbl)
|
||||
{
|
||||
Link *head_link = linktbl->links[0];
|
||||
for (int i = 0; i < linktbl->num; i++) {
|
||||
Link *this_link = linktbl->links[i];
|
||||
if (this_link->type) {
|
||||
char *url;
|
||||
url = url_append(head_link->f_url, this_link->p_url);
|
||||
strncpy(this_link->f_url, url, URL_LEN_MAX);
|
||||
free(url);
|
||||
|
||||
char *unescaped_p_url;
|
||||
unescaped_p_url = curl_easy_unescape(NULL, this_link->p_url, 0,
|
||||
NULL);
|
||||
strncpy(this_link->p_url, unescaped_p_url, LINK_LEN_MAX);
|
||||
curl_free(unescaped_p_url);
|
||||
|
||||
if (this_link->type == LINK_FILE && !(this_link->content_length)) {
|
||||
Link_get_stat(this_link);
|
||||
} else if (this_link->type == LINK_DIR) {
|
||||
this_link->time = head_link->time;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Block until the LinkTable is filled up */
|
||||
while (curl_multi_perform_once()) {
|
||||
usleep(1000);
|
||||
}
|
||||
}
|
||||
|
||||
static void LinkTable_free(LinkTable *linktbl)
|
||||
{
|
||||
for (int i = 0; i < linktbl->num; i++) {
|
||||
free(linktbl->links[i]);
|
||||
}
|
||||
free(linktbl->links);
|
||||
free(linktbl);
|
||||
}
|
||||
|
||||
LinkTable *LinkTable_new(const char *url)
|
||||
{
|
||||
fprintf(stderr, "LinkTable_new(%s);\n", url);
|
||||
|
||||
LinkTable *linktbl = calloc(1, sizeof(LinkTable));
|
||||
if (!linktbl) {
|
||||
fprintf(stderr, "LinkTable_new(): calloc failure!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/* populate the base URL */
|
||||
LinkTable_add(linktbl, Link_new("/", LINK_HEAD));
|
||||
Link *head_link = linktbl->links[0];
|
||||
head_link->type = LINK_HEAD;
|
||||
strncpy(head_link->f_url, url, URL_LEN_MAX);
|
||||
|
||||
/* start downloading the base URL */
|
||||
CURL *curl = Link_to_curl(head_link);
|
||||
MemoryStruct buf;
|
||||
buf.size = 0;
|
||||
buf.memory = NULL;
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&buf);
|
||||
|
||||
transfer_blocking(curl);
|
||||
|
||||
/* if downloading base URL failed */
|
||||
long http_resp;
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
|
||||
if (http_resp != HTTP_OK) {
|
||||
fprintf(stderr, "link.c: LinkTable_new() cannot retrive the base URL, \
|
||||
URL: %s, HTTP %ld\n", url, http_resp);
|
||||
|
||||
LinkTable_free(linktbl);
|
||||
linktbl = NULL;
|
||||
return linktbl;
|
||||
};
|
||||
curl_easy_getinfo(curl, CURLINFO_FILETIME, &(head_link->time));
|
||||
curl_easy_cleanup(curl);
|
||||
|
||||
/* Otherwise parsed the received data */
|
||||
GumboOutput* output = gumbo_parse(buf.memory);
|
||||
HTML_to_LinkTable(output->root, linktbl);
|
||||
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
||||
free(buf.memory);
|
||||
|
||||
/* Fill in the link table */
|
||||
LinkTable_fill(linktbl);
|
||||
return linktbl;
|
||||
}
|
||||
|
||||
/** \brief print a LinkTable */
|
||||
static void LinkTable_print(LinkTable *linktbl)
|
||||
{
|
||||
fprintf(stderr, "--------------------------------------------\n");
|
||||
fprintf(stderr, " LinkTable %p for %s\n", linktbl,
|
||||
linktbl->links[0]->f_url);
|
||||
fprintf(stderr, "--------------------------------------------\n");
|
||||
for (int i = 0; i < linktbl->num; i++) {
|
||||
Link *this_link = linktbl->links[i];
|
||||
fprintf(stderr, "%d %c %lu %s %s\n",
|
||||
i,
|
||||
this_link->type,
|
||||
this_link->content_length,
|
||||
this_link->p_url,
|
||||
this_link->f_url
|
||||
);
|
||||
|
||||
}
|
||||
fprintf(stderr, "--------------------------------------------\n");
|
||||
}
|
||||
|
||||
void network_init(const char *url)
|
||||
{
|
||||
|
||||
|
@ -532,7 +248,7 @@ void network_init(const char *url)
|
|||
ROOT_LINK_TBL = LinkTable_new(url);
|
||||
}
|
||||
|
||||
static void transfer_blocking(CURL *curl)
|
||||
void transfer_blocking(CURL *curl)
|
||||
{
|
||||
/*
|
||||
* We don't need to malloc here, as the transfer is finished before
|
||||
|
@ -559,7 +275,7 @@ static void transfer_blocking(CURL *curl)
|
|||
}
|
||||
}
|
||||
|
||||
static void transfer_nonblocking(CURL *curl)
|
||||
void transfer_nonblocking(CURL *curl)
|
||||
{
|
||||
pthread_mutex_lock(&transfer_lock);
|
||||
CURLMcode res = curl_multi_add_handle(curl_multi, curl);
|
||||
|
@ -572,144 +288,6 @@ static void transfer_nonblocking(CURL *curl)
|
|||
}
|
||||
}
|
||||
|
||||
Link *path_to_Link(const char *path)
|
||||
{
|
||||
char *new_path = strndup(path, URL_LEN_MAX);
|
||||
if (!new_path) {
|
||||
fprintf(stderr, "path_to_Link(): cannot allocate memory\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
Link *link = path_to_Link_recursive(new_path, ROOT_LINK_TBL);
|
||||
free(new_path);
|
||||
return link;
|
||||
}
|
||||
|
||||
static Link *path_to_Link_recursive(char *path, LinkTable *linktbl)
|
||||
{
|
||||
/* skip the leading '/' if it exists */
|
||||
if (*path == '/') {
|
||||
path++;
|
||||
}
|
||||
|
||||
/* remove the last '/' if it exists */
|
||||
char *slash = &(path[strnlen(path, URL_LEN_MAX) - 1]);
|
||||
if (*slash == '/') {
|
||||
*slash = '\0';
|
||||
}
|
||||
|
||||
slash = strchr(path, '/');
|
||||
if ( slash == NULL ) {
|
||||
/* We cannot find another '/', we have reached the last level */
|
||||
for (int i = 1; i < linktbl->num; i++) {
|
||||
if (!strncmp(path, linktbl->links[i]->p_url, LINK_LEN_MAX)) {
|
||||
/* We found our link */
|
||||
return linktbl->links[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* We can still find '/', time to consume the path and traverse
|
||||
* the tree structure
|
||||
*/
|
||||
|
||||
/*
|
||||
* add termination mark to the current string,
|
||||
* effective create two substrings
|
||||
*/
|
||||
*slash = '\0';
|
||||
/* move the pointer past the '/' */
|
||||
char *next_path = slash + 1;
|
||||
for (int i = 1; i < linktbl->num; i++) {
|
||||
if (!strncmp(path, linktbl->links[i]->p_url, LINK_LEN_MAX)) {
|
||||
/* The next sub-directory exists */
|
||||
if (!(linktbl->links[i]->next_table)) {
|
||||
linktbl->links[i]->next_table = LinkTable_new(
|
||||
linktbl->links[i]->f_url);
|
||||
fprintf(stderr, "Created new link table for %s\n",
|
||||
linktbl->links[i]->f_url);
|
||||
LinkTable_print(linktbl->links[i]->next_table);
|
||||
}
|
||||
|
||||
return path_to_Link_recursive(next_path,
|
||||
linktbl->links[i]->next_table);
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
long path_download(const char *path, char *output_buf, size_t size,
|
||||
off_t offset)
|
||||
{
|
||||
Link *link;
|
||||
link = path_to_Link(path);
|
||||
if (!link) {
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
size_t start = offset;
|
||||
size_t end = start + size;
|
||||
char range_str[64];
|
||||
snprintf(range_str, sizeof(range_str), "%lu-%lu", start, end);
|
||||
|
||||
MemoryStruct buf;
|
||||
buf.size = 0;
|
||||
buf.memory = NULL;
|
||||
|
||||
fprintf(stderr, "path_download(%s, %s);\n",
|
||||
path, range_str);
|
||||
|
||||
CURL *curl = Link_to_curl(link);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&buf);
|
||||
curl_easy_setopt(curl, CURLOPT_RANGE, range_str);
|
||||
|
||||
transfer_blocking(curl);
|
||||
|
||||
long http_resp;
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp);
|
||||
if ( !(
|
||||
(http_resp != HTTP_OK) ||
|
||||
(http_resp != HTTP_PARTIAL_CONTENT) ||
|
||||
(http_resp != HTTP_RANGE_NOT_SATISFIABLE)
|
||||
)) {
|
||||
fprintf(stderr, "path_download(): Could not download %s, HTTP %ld\n",
|
||||
link->f_url, http_resp);
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
double dl;
|
||||
curl_easy_getinfo(curl, CURLINFO_SIZE_DOWNLOAD, &dl);
|
||||
|
||||
size_t recv = dl;
|
||||
if (recv > size) {
|
||||
recv = size;
|
||||
}
|
||||
|
||||
memmove(output_buf, buf.memory, recv);
|
||||
curl_easy_cleanup(curl);
|
||||
free(buf.memory);
|
||||
return recv;
|
||||
}
|
||||
|
||||
static LinkType p_url_type(const char *p_url)
|
||||
{
|
||||
/* The link name has to start with alphanumerical character */
|
||||
if (!isalnum(p_url[0])) {
|
||||
return LINK_INVALID;
|
||||
}
|
||||
|
||||
/* check for http:// and https:// */
|
||||
if ( !strncmp(p_url, "http://", 7) || !strncmp(p_url, "https://", 8) ) {
|
||||
return LINK_INVALID;
|
||||
}
|
||||
|
||||
if ( p_url[strlen(p_url) - 1] == '/' ) {
|
||||
return LINK_DIR;
|
||||
}
|
||||
|
||||
return LINK_FILE;
|
||||
}
|
||||
|
||||
static unsigned long thread_id(void)
|
||||
{
|
||||
unsigned long ret;
|
||||
|
@ -718,30 +296,7 @@ static unsigned long thread_id(void)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static char *url_append(const char *url, const char *sublink)
|
||||
{
|
||||
int needs_separator = 0;
|
||||
if (url[strlen(url)-1] != '/') {
|
||||
needs_separator = 1;
|
||||
}
|
||||
|
||||
char *str;
|
||||
size_t ul = strlen(url);
|
||||
size_t sl = strlen(sublink);
|
||||
str = calloc(ul + sl + needs_separator + 1, sizeof(char));
|
||||
if (!str) {
|
||||
fprintf(stderr, "url_append(): calloc failure!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
strncpy(str, url, ul);
|
||||
if (needs_separator) {
|
||||
str[ul] = '/';
|
||||
}
|
||||
strncat(str, sublink, sl);
|
||||
return str;
|
||||
}
|
||||
|
||||
static size_t
|
||||
size_t
|
||||
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
|
||||
{
|
||||
size_t realsize = size * nmemb;
|
||||
|
|
66
network.h
66
network.h
|
@ -1,61 +1,41 @@
|
|||
#ifndef NETWORK_H
|
||||
#define NETWORK_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "link.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
#define URL_LEN_MAX 2048
|
||||
#define LINK_LEN_MAX 255
|
||||
#define CURL_MULTI_MAX_CONNECTION 20
|
||||
|
||||
/** \brief the link type */
|
||||
typedef struct {
|
||||
char *memory;
|
||||
size_t size;
|
||||
} MemoryStruct;
|
||||
|
||||
typedef enum {
|
||||
LINK_HEAD = 'H',
|
||||
LINK_DIR = 'D',
|
||||
LINK_FILE = 'F',
|
||||
LINK_INVALID = '\0'
|
||||
} LinkType;
|
||||
FILESTAT = 's',
|
||||
DATA = 'd'
|
||||
} TransferType;
|
||||
|
||||
/**
|
||||
* \brief link table type
|
||||
* \details index 0 contains the Link for the base URL
|
||||
*/
|
||||
typedef struct LinkTable LinkTable;
|
||||
typedef struct {
|
||||
TransferType type;
|
||||
int transferring;
|
||||
Link *link;
|
||||
} TransferStruct;
|
||||
|
||||
/** \brief link data type */
|
||||
typedef struct Link Link;
|
||||
/** \brief curl shared interface */
|
||||
extern CURLSH *curl_share;
|
||||
|
||||
|
||||
struct Link {
|
||||
char p_url[LINK_LEN_MAX];
|
||||
char f_url[URL_LEN_MAX];
|
||||
LinkType type;
|
||||
size_t content_length;
|
||||
LinkTable *next_table;
|
||||
long time;
|
||||
};
|
||||
|
||||
struct LinkTable {
|
||||
int num;
|
||||
Link **links;
|
||||
};
|
||||
|
||||
/** \brief root link table */
|
||||
extern LinkTable *ROOT_LINK_TBL;
|
||||
int curl_multi_perform_once();
|
||||
|
||||
/** \brief Initialise the network module */
|
||||
void network_init(const char *url);
|
||||
|
||||
/**
|
||||
* \brief download a link */
|
||||
/* \return the number of bytes downloaded
|
||||
*/
|
||||
long path_download(const char *path, char *output_buf, size_t size,
|
||||
off_t offset);
|
||||
void transfer_blocking(CURL *curl);
|
||||
|
||||
/** \brief create a new LinkTable */
|
||||
LinkTable *LinkTable_new(const char *url);
|
||||
void transfer_nonblocking(CURL *curl);
|
||||
|
||||
/** \brief find the link associated with a path */
|
||||
Link *path_to_Link(const char *path);
|
||||
size_t
|
||||
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp);
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue