diff --git a/Makefile b/Makefile index c0cbef7..854d56d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CC=gcc CFLAGS= -g -O2 -Wall -Wextra -lgumbo -lcurl -lfuse -lcrypto \ -D_FILE_OFFSET_BITS=64 -OBJ = main.o network.o fuse_local.o +OBJ = main.o network.o fuse_local.o link.o %.o: %.c $(CC) -c -o $@ $< $(CFLAGS) diff --git a/fuse_local.c b/fuse_local.c index f2b34e0..ba5bb84 100644 --- a/fuse_local.c +++ b/fuse_local.c @@ -1,5 +1,6 @@ #include "fuse_local.h" +#include "link.h" #include "network.h" #include diff --git a/link.c b/link.c new file mode 100644 index 0000000..62199cc --- /dev/null +++ b/link.c @@ -0,0 +1,434 @@ +#include "link.h" + +#include "network.h" + +#include + +#include +#include +#include +#include + + +#define HTTP_OK 200 +#define HTTP_PARTIAL_CONTENT 206 +#define HTTP_RANGE_NOT_SATISFIABLE 416 + +/* ---------------- External variables -----------------------*/ +LinkTable *ROOT_LINK_TBL; + +static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl); +static Link *Link_new(const char *p_url, LinkType type); +static CURL *Link_to_curl(Link *link); +static void Link_get_stat(Link *this_link); +static void LinkTable_add(LinkTable *linktbl, Link *link); +void LinkTable_fill(LinkTable *linktbl); +static void LinkTable_free(LinkTable *linktbl); +static void LinkTable_print(LinkTable *linktbl); +static Link *path_to_Link_recursive(char *path, LinkTable *linktbl); +static LinkType p_url_type(const char *p_url); +static char *url_append(const char *url, const char *sublink); + +/** + * Shamelessly copied and pasted from: + * https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc + */ +static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl) +{ + if (node->type != GUMBO_NODE_ELEMENT) { + return; + } + GumboAttribute* href; + + if (node->v.element.tag == GUMBO_TAG_A && + (href = gumbo_get_attribute(&node->v.element.attributes, "href"))) { + /* if it is valid, copy the link onto the heap */ + LinkType type = p_url_type(href->value); + if (type) { + LinkTable_add(linktbl, Link_new(href->value, type)); + } + } + /* Note the recursive call, lol. */ + GumboVector *children = &node->v.element.children; + for (size_t i = 0; i < children->length; ++i) { + HTML_to_LinkTable((GumboNode*)children->data[i], linktbl); + } + return; +} + +static Link *Link_new(const char *p_url, LinkType type) +{ + Link *link = calloc(1, sizeof(Link)); + if (!link) { + fprintf(stderr, "Link_new(): calloc failure!\n"); + exit(EXIT_FAILURE); + } + strncpy(link->p_url, p_url, LINK_LEN_MAX); + link->type = type; + + /* remove the '/' from p_url if it exists */ + char *c = &(link->p_url[strnlen(link->p_url, LINK_LEN_MAX) - 1]); + if ( *c == '/') { + *c = '\0'; + } + + return link; +} + +static CURL *Link_to_curl(Link *link) +{ + CURL *curl = curl_easy_init(); + if (!curl) { + fprintf(stderr, "Link_to_curl(): curl_easy_init() failed!\n"); + } + + /* set up some basic curl stuff */ + curl_easy_setopt(curl, CURLOPT_USERAGENT, "httpdirfs - \ + https://github.com/fangfufu/httpdirfs"); + curl_easy_setopt(curl, CURLOPT_VERBOSE, 0); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + /* for following directories without the '/' */ + curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 2); + curl_easy_setopt(curl, CURLOPT_URL, link->f_url); + curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15); + curl_easy_setopt(curl, CURLOPT_SHARE, curl_share); + /* + * The write back function pointer has to be set at curl handle creation, + * for thread safety + */ + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); + + return curl; +} + +void Link_get_stat(Link *this_link) +{ + fprintf(stderr, "Link_get_size(%s);\n", this_link->f_url); + + if (this_link->type == LINK_FILE) { + CURL *curl = Link_to_curl(this_link); + curl_easy_setopt(curl, CURLOPT_NOBODY, 1); + curl_easy_setopt(curl, CURLOPT_FILETIME, 1L); + + /* + * We need to put the variable on the heap, because otherwise the + * variable gets popped from the stack as the function returns. + * + * It gets freed in curl_multi_perform_once(); + */ + TransferStruct *transfer = malloc(sizeof(TransferStruct)); + if (!transfer) { + fprintf(stderr, "Link_get_size(): malloc failed!\n"); + } + transfer->link = this_link; + transfer->type = FILESTAT; + curl_easy_setopt(curl, CURLOPT_PRIVATE, transfer); + + transfer_nonblocking(curl); + } +} + +void Link_set_stat(Link* this_link, CURL *curl) +{ + long http_resp; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp); + if (http_resp == HTTP_OK) { + double cl = 0; + curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &cl); + curl_easy_getinfo(curl, CURLINFO_FILETIME, &(this_link->time)); + + if (cl == -1) { + /* Turns out not to be a file after all */ + this_link->content_length = 0; + this_link->type = LINK_DIR; + } else { + this_link->content_length = cl; + this_link->type = LINK_FILE; + } + } else { + this_link->type = LINK_INVALID; + } +} + +static void LinkTable_add(LinkTable *linktbl, Link *link) +{ + linktbl->num++; + linktbl->links = realloc(linktbl->links, linktbl->num * sizeof(Link *)); + if (!linktbl->links) { + fprintf(stderr, "LinkTable_add(): realloc failure!\n"); + exit(EXIT_FAILURE); + } + linktbl->links[linktbl->num - 1] = link; +} + +void LinkTable_fill(LinkTable *linktbl) +{ + Link *head_link = linktbl->links[0]; + for (int i = 0; i < linktbl->num; i++) { + Link *this_link = linktbl->links[i]; + if (this_link->type) { + char *url; + url = url_append(head_link->f_url, this_link->p_url); + strncpy(this_link->f_url, url, URL_LEN_MAX); + free(url); + + char *unescaped_p_url; + unescaped_p_url = curl_easy_unescape(NULL, this_link->p_url, 0, + NULL); + strncpy(this_link->p_url, unescaped_p_url, LINK_LEN_MAX); + curl_free(unescaped_p_url); + + if (this_link->type == LINK_FILE && !(this_link->content_length)) { + Link_get_stat(this_link); + } else if (this_link->type == LINK_DIR) { + this_link->time = head_link->time; + } + } + } + /* Block until the LinkTable is filled up */ + while (curl_multi_perform_once()) { + usleep(1000); + } +} + +static void LinkTable_free(LinkTable *linktbl) +{ + for (int i = 0; i < linktbl->num; i++) { + free(linktbl->links[i]); + } + free(linktbl->links); + free(linktbl); +} + +LinkTable *LinkTable_new(const char *url) +{ + fprintf(stderr, "LinkTable_new(%s);\n", url); + + LinkTable *linktbl = calloc(1, sizeof(LinkTable)); + if (!linktbl) { + fprintf(stderr, "LinkTable_new(): calloc failure!\n"); + exit(EXIT_FAILURE); + } + + /* populate the base URL */ + LinkTable_add(linktbl, Link_new("/", LINK_HEAD)); + Link *head_link = linktbl->links[0]; + head_link->type = LINK_HEAD; + strncpy(head_link->f_url, url, URL_LEN_MAX); + + /* start downloading the base URL */ + CURL *curl = Link_to_curl(head_link); + MemoryStruct buf; + buf.size = 0; + buf.memory = NULL; + curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&buf); + + transfer_blocking(curl); + + /* if downloading base URL failed */ + long http_resp; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp); + if (http_resp != HTTP_OK) { + fprintf(stderr, "link.c: LinkTable_new() cannot retrive the base URL, \ + URL: %s, HTTP %ld\n", url, http_resp); + + LinkTable_free(linktbl); + linktbl = NULL; + return linktbl; + }; + curl_easy_getinfo(curl, CURLINFO_FILETIME, &(head_link->time)); + curl_easy_cleanup(curl); + + /* Otherwise parsed the received data */ + GumboOutput* output = gumbo_parse(buf.memory); + HTML_to_LinkTable(output->root, linktbl); + gumbo_destroy_output(&kGumboDefaultOptions, output); + free(buf.memory); + + /* Fill in the link table */ + LinkTable_fill(linktbl); + return linktbl; +} + +/** \brief print a LinkTable */ +static void LinkTable_print(LinkTable *linktbl) +{ + fprintf(stderr, "--------------------------------------------\n"); + fprintf(stderr, " LinkTable %p for %s\n", linktbl, + linktbl->links[0]->f_url); + fprintf(stderr, "--------------------------------------------\n"); + for (int i = 0; i < linktbl->num; i++) { + Link *this_link = linktbl->links[i]; + fprintf(stderr, "%d %c %lu %s %s\n", + i, + this_link->type, + this_link->content_length, + this_link->p_url, + this_link->f_url + ); + + } + fprintf(stderr, "--------------------------------------------\n"); +} + +Link *path_to_Link(const char *path) +{ + char *new_path = strndup(path, URL_LEN_MAX); + if (!new_path) { + fprintf(stderr, "path_to_Link(): cannot allocate memory\n"); + exit(EXIT_FAILURE); + } + Link *link = path_to_Link_recursive(new_path, ROOT_LINK_TBL); + free(new_path); + return link; +} + +static Link *path_to_Link_recursive(char *path, LinkTable *linktbl) +{ + /* skip the leading '/' if it exists */ + if (*path == '/') { + path++; + } + + /* remove the last '/' if it exists */ + char *slash = &(path[strnlen(path, URL_LEN_MAX) - 1]); + if (*slash == '/') { + *slash = '\0'; + } + + slash = strchr(path, '/'); + if ( slash == NULL ) { + /* We cannot find another '/', we have reached the last level */ + for (int i = 1; i < linktbl->num; i++) { + if (!strncmp(path, linktbl->links[i]->p_url, LINK_LEN_MAX)) { + /* We found our link */ + return linktbl->links[i]; + } + } + } else { + /* + * We can still find '/', time to consume the path and traverse + * the tree structure + */ + + /* + * add termination mark to the current string, + * effective create two substrings + */ + *slash = '\0'; + /* move the pointer past the '/' */ + char *next_path = slash + 1; + for (int i = 1; i < linktbl->num; i++) { + if (!strncmp(path, linktbl->links[i]->p_url, LINK_LEN_MAX)) { + /* The next sub-directory exists */ + if (!(linktbl->links[i]->next_table)) { + linktbl->links[i]->next_table = LinkTable_new( + linktbl->links[i]->f_url); + fprintf(stderr, "Created new link table for %s\n", + linktbl->links[i]->f_url); + LinkTable_print(linktbl->links[i]->next_table); + } + + return path_to_Link_recursive(next_path, + linktbl->links[i]->next_table); + } + } + } + return NULL; +} + +long path_download(const char *path, char *output_buf, size_t size, + off_t offset) +{ + Link *link; + link = path_to_Link(path); + if (!link) { + return -ENOENT; + } + + size_t start = offset; + size_t end = start + size; + char range_str[64]; + snprintf(range_str, sizeof(range_str), "%lu-%lu", start, end); + + MemoryStruct buf; + buf.size = 0; + buf.memory = NULL; + + fprintf(stderr, "path_download(%s, %s);\n", + path, range_str); + + CURL *curl = Link_to_curl(link); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&buf); + curl_easy_setopt(curl, CURLOPT_RANGE, range_str); + + transfer_blocking(curl); + + long http_resp; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp); + if ( !( + (http_resp != HTTP_OK) || + (http_resp != HTTP_PARTIAL_CONTENT) || + (http_resp != HTTP_RANGE_NOT_SATISFIABLE) + )) { + fprintf(stderr, "path_download(): Could not download %s, HTTP %ld\n", + link->f_url, http_resp); + return -ENOENT; + } + + double dl; + curl_easy_getinfo(curl, CURLINFO_SIZE_DOWNLOAD, &dl); + + size_t recv = dl; + if (recv > size) { + recv = size; + } + + memmove(output_buf, buf.memory, recv); + curl_easy_cleanup(curl); + free(buf.memory); + return recv; +} + +static LinkType p_url_type(const char *p_url) +{ + /* The link name has to start with alphanumerical character */ + if (!isalnum(p_url[0])) { + return LINK_INVALID; + } + + /* check for http:// and https:// */ + if ( !strncmp(p_url, "http://", 7) || !strncmp(p_url, "https://", 8) ) { + return LINK_INVALID; + } + + if ( p_url[strlen(p_url) - 1] == '/' ) { + return LINK_DIR; + } + + return LINK_FILE; +} + +static char *url_append(const char *url, const char *sublink) +{ + int needs_separator = 0; + if (url[strlen(url)-1] != '/') { + needs_separator = 1; + } + + char *str; + size_t ul = strlen(url); + size_t sl = strlen(sublink); + str = calloc(ul + sl + needs_separator + 1, sizeof(char)); + if (!str) { + fprintf(stderr, "url_append(): calloc failure!\n"); + exit(EXIT_FAILURE); + } + strncpy(str, url, ul); + if (needs_separator) { + str[ul] = '/'; + } + strncat(str, sublink, sl); + return str; +} diff --git a/link.h b/link.h new file mode 100644 index 0000000..65e7fd2 --- /dev/null +++ b/link.h @@ -0,0 +1,62 @@ +#ifndef LINK_H +#define LINK_H + +#include + +#include + +#define URL_LEN_MAX 2048 +#define LINK_LEN_MAX 255 + +/** \brief the link type */ +typedef enum { + LINK_HEAD = 'H', + LINK_DIR = 'D', + LINK_FILE = 'F', + LINK_INVALID = '\0' +} LinkType; + +/** + * \brief link table type + * \details index 0 contains the Link for the base URL + */ +typedef struct LinkTable LinkTable; + +/** \brief link data type */ +typedef struct Link Link; + + +struct Link { + char p_url[LINK_LEN_MAX]; + char f_url[URL_LEN_MAX]; + LinkType type; + size_t content_length; + LinkTable *next_table; + long time; +}; + +struct LinkTable { + int num; + Link **links; +}; + +/** \brief root link table */ +extern LinkTable *ROOT_LINK_TBL; + +void Link_set_stat(Link* this_link, CURL *curl); + +/** \brief create a new LinkTable */ +LinkTable *LinkTable_new(const char *url); + +/** + * \brief download a link */ +/* \return the number of bytes downloaded + */ +long path_download(const char *path, char *output_buf, size_t size, + off_t offset); + +/** \brief find the link associated with a path */ +Link *path_to_Link(const char *path); + + +#endif diff --git a/main.c b/main.c index 9563e32..c6b22e9 100644 --- a/main.c +++ b/main.c @@ -1,7 +1,8 @@ #include "network.h" #include "fuse_local.h" -#include +// #include +#include static void help(); diff --git a/network.c b/network.c index 0cee4b5..90b8b30 100644 --- a/network.c +++ b/network.c @@ -1,43 +1,19 @@ #include "network.h" -#include -#include +#include "link.h" + #include -#include #include #include #include #include #include -#define HTTP_OK 200 -#define HTTP_PARTIAL_CONTENT 206 -#define HTTP_RANGE_NOT_SATISFIABLE 416 - -/* ---------------- External variables -----------------------*/ -LinkTable *ROOT_LINK_TBL; - -/* ----------------- Local structs ---------------------------*/ -typedef struct { - char *memory; - size_t size; -} MemoryStruct; - -typedef enum { - FILESTAT = 's', - DATA = 'd' -} TransferType; - -typedef struct { - TransferType type; - int transferring; - Link *link; -} TransferStruct; +/* ----------------- External variables ----------------------*/ +CURLSH *curl_share; /* ----------------- Static variable ----------------------- */ -/** \brief curl shared interface */ -static CURLSH *curl_share; /** \brief curl multi interface handle */ static CURLM *curl_multi; /** \brief mutex for transfer functions */ @@ -54,25 +30,8 @@ static void curl_callback_lock(CURL *handle, curl_lock_data data, curl_lock_access access, void *userptr); static void curl_callback_unlock(CURL *handle, curl_lock_data data, void *userptr); -static int curl_multi_perform_once(); void curl_process_msgs(CURLMsg *curl_msg, int n_running_curl, int n_mesgs); -static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl); -static Link *Link_new(const char *p_url, LinkType type); -static CURL *Link_to_curl(Link *link); -void Link_get_stat(Link *this_link); -static void Link_set_stat(Link* this_link, CURL *curl); -static void LinkTable_add(LinkTable *linktbl, Link *link); -void LinkTable_fill(LinkTable *linktbl); -static void LinkTable_free(LinkTable *linktbl); -static void LinkTable_print(LinkTable *linktbl); -static void transfer_blocking(CURL *curl); -static void transfer_nonblocking(CURL *curl); -static Link *path_to_Link_recursive(char *path, LinkTable *linktbl); -static LinkType p_url_type(const char *p_url); static unsigned long thread_id(void); -static char *url_append(const char *url, const char *sublink); -static size_t -WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp); /* -------------------- Functions -------------------------- */ static void crypto_lock_callback(int mode, int type, char *file, int line) @@ -123,7 +82,7 @@ static void curl_callback_unlock(CURL *handle, curl_lock_data data, pthread_mutex_unlock(&curl_lock); } -static int curl_multi_perform_once() +int curl_multi_perform_once() { pthread_mutex_lock(&transfer_lock); /* Get curl multi interface to perform pending tasks */ @@ -231,249 +190,6 @@ void curl_process_msgs(CURLMsg *curl_msg, int n_running_curl, int n_mesgs) } } -/** - * Shamelessly copied and pasted from: - * https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc - */ -static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl) -{ - if (node->type != GUMBO_NODE_ELEMENT) { - return; - } - GumboAttribute* href; - - if (node->v.element.tag == GUMBO_TAG_A && - (href = gumbo_get_attribute(&node->v.element.attributes, "href"))) { - /* if it is valid, copy the link onto the heap */ - LinkType type = p_url_type(href->value); - if (type) { - LinkTable_add(linktbl, Link_new(href->value, type)); - } - } - /* Note the recursive call, lol. */ - GumboVector *children = &node->v.element.children; - for (size_t i = 0; i < children->length; ++i) { - HTML_to_LinkTable((GumboNode*)children->data[i], linktbl); - } - return; -} - -static Link *Link_new(const char *p_url, LinkType type) -{ - Link *link = calloc(1, sizeof(Link)); - if (!link) { - fprintf(stderr, "Link_new(): calloc failure!\n"); - exit(EXIT_FAILURE); - } - strncpy(link->p_url, p_url, LINK_LEN_MAX); - link->type = type; - - /* remove the '/' from p_url if it exists */ - char *c = &(link->p_url[strnlen(link->p_url, LINK_LEN_MAX) - 1]); - if ( *c == '/') { - *c = '\0'; - } - - return link; -} - -static CURL *Link_to_curl(Link *link) -{ - CURL *curl = curl_easy_init(); - if (!curl) { - fprintf(stderr, "Link_to_curl(): curl_easy_init() failed!\n"); - } - - /* set up some basic curl stuff */ - curl_easy_setopt(curl, CURLOPT_USERAGENT, "httpdirfs - \ - https://github.com/fangfufu/httpdirfs"); - curl_easy_setopt(curl, CURLOPT_VERBOSE, 0); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); - /* for following directories without the '/' */ - curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 2); - curl_easy_setopt(curl, CURLOPT_URL, link->f_url); - curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15); - curl_easy_setopt(curl, CURLOPT_SHARE, curl_share); - /* - * The write back function pointer has to be set at curl handle creation, - * for thread safety - */ - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); - - return curl; -} - -void Link_get_stat(Link *this_link) -{ - fprintf(stderr, "Link_get_size(%s);\n", this_link->f_url); - - if (this_link->type == LINK_FILE) { - CURL *curl = Link_to_curl(this_link); - curl_easy_setopt(curl, CURLOPT_NOBODY, 1); - curl_easy_setopt(curl, CURLOPT_FILETIME, 1L); - - /* - * We need to put the variable on the heap, because otherwise the - * variable gets popped from the stack as the function returns. - * - * It gets freed in curl_multi_perform_once(); - */ - TransferStruct *transfer = malloc(sizeof(TransferStruct)); - if (!transfer) { - fprintf(stderr, "Link_get_size(): malloc failed!\n"); - } - transfer->link = this_link; - transfer->type = FILESTAT; - curl_easy_setopt(curl, CURLOPT_PRIVATE, transfer); - - transfer_nonblocking(curl); - } -} - -static void Link_set_stat(Link* this_link, CURL *curl) -{ - long http_resp; - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp); - if (http_resp == HTTP_OK) { - double cl = 0; - curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &cl); - curl_easy_getinfo(curl, CURLINFO_FILETIME, &(this_link->time)); - - if (cl == -1) { - /* Turns out not to be a file after all */ - this_link->content_length = 0; - this_link->type = LINK_DIR; - } else { - this_link->content_length = cl; - this_link->type = LINK_FILE; - } - } else { - this_link->type = LINK_INVALID; - } -} - -static void LinkTable_add(LinkTable *linktbl, Link *link) -{ - linktbl->num++; - linktbl->links = realloc(linktbl->links, linktbl->num * sizeof(Link *)); - if (!linktbl->links) { - fprintf(stderr, "LinkTable_add(): realloc failure!\n"); - exit(EXIT_FAILURE); - } - linktbl->links[linktbl->num - 1] = link; -} - -void LinkTable_fill(LinkTable *linktbl) -{ - Link *head_link = linktbl->links[0]; - for (int i = 0; i < linktbl->num; i++) { - Link *this_link = linktbl->links[i]; - if (this_link->type) { - char *url; - url = url_append(head_link->f_url, this_link->p_url); - strncpy(this_link->f_url, url, URL_LEN_MAX); - free(url); - - char *unescaped_p_url; - unescaped_p_url = curl_easy_unescape(NULL, this_link->p_url, 0, - NULL); - strncpy(this_link->p_url, unescaped_p_url, LINK_LEN_MAX); - curl_free(unescaped_p_url); - - if (this_link->type == LINK_FILE && !(this_link->content_length)) { - Link_get_stat(this_link); - } else if (this_link->type == LINK_DIR) { - this_link->time = head_link->time; - } - } - } - /* Block until the LinkTable is filled up */ - while (curl_multi_perform_once()) { - usleep(1000); - } -} - -static void LinkTable_free(LinkTable *linktbl) -{ - for (int i = 0; i < linktbl->num; i++) { - free(linktbl->links[i]); - } - free(linktbl->links); - free(linktbl); -} - -LinkTable *LinkTable_new(const char *url) -{ - fprintf(stderr, "LinkTable_new(%s);\n", url); - - LinkTable *linktbl = calloc(1, sizeof(LinkTable)); - if (!linktbl) { - fprintf(stderr, "LinkTable_new(): calloc failure!\n"); - exit(EXIT_FAILURE); - } - - /* populate the base URL */ - LinkTable_add(linktbl, Link_new("/", LINK_HEAD)); - Link *head_link = linktbl->links[0]; - head_link->type = LINK_HEAD; - strncpy(head_link->f_url, url, URL_LEN_MAX); - - /* start downloading the base URL */ - CURL *curl = Link_to_curl(head_link); - MemoryStruct buf; - buf.size = 0; - buf.memory = NULL; - curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&buf); - - transfer_blocking(curl); - - /* if downloading base URL failed */ - long http_resp; - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp); - if (http_resp != HTTP_OK) { - fprintf(stderr, "link.c: LinkTable_new() cannot retrive the base URL, \ -URL: %s, HTTP %ld\n", url, http_resp); - - LinkTable_free(linktbl); - linktbl = NULL; - return linktbl; - }; - curl_easy_getinfo(curl, CURLINFO_FILETIME, &(head_link->time)); - curl_easy_cleanup(curl); - - /* Otherwise parsed the received data */ - GumboOutput* output = gumbo_parse(buf.memory); - HTML_to_LinkTable(output->root, linktbl); - gumbo_destroy_output(&kGumboDefaultOptions, output); - free(buf.memory); - - /* Fill in the link table */ - LinkTable_fill(linktbl); - return linktbl; -} - -/** \brief print a LinkTable */ -static void LinkTable_print(LinkTable *linktbl) -{ - fprintf(stderr, "--------------------------------------------\n"); - fprintf(stderr, " LinkTable %p for %s\n", linktbl, - linktbl->links[0]->f_url); - fprintf(stderr, "--------------------------------------------\n"); - for (int i = 0; i < linktbl->num; i++) { - Link *this_link = linktbl->links[i]; - fprintf(stderr, "%d %c %lu %s %s\n", - i, - this_link->type, - this_link->content_length, - this_link->p_url, - this_link->f_url - ); - - } - fprintf(stderr, "--------------------------------------------\n"); -} - void network_init(const char *url) { @@ -532,7 +248,7 @@ void network_init(const char *url) ROOT_LINK_TBL = LinkTable_new(url); } -static void transfer_blocking(CURL *curl) +void transfer_blocking(CURL *curl) { /* * We don't need to malloc here, as the transfer is finished before @@ -559,7 +275,7 @@ static void transfer_blocking(CURL *curl) } } -static void transfer_nonblocking(CURL *curl) +void transfer_nonblocking(CURL *curl) { pthread_mutex_lock(&transfer_lock); CURLMcode res = curl_multi_add_handle(curl_multi, curl); @@ -572,144 +288,6 @@ static void transfer_nonblocking(CURL *curl) } } -Link *path_to_Link(const char *path) -{ - char *new_path = strndup(path, URL_LEN_MAX); - if (!new_path) { - fprintf(stderr, "path_to_Link(): cannot allocate memory\n"); - exit(EXIT_FAILURE); - } - Link *link = path_to_Link_recursive(new_path, ROOT_LINK_TBL); - free(new_path); - return link; -} - -static Link *path_to_Link_recursive(char *path, LinkTable *linktbl) -{ - /* skip the leading '/' if it exists */ - if (*path == '/') { - path++; - } - - /* remove the last '/' if it exists */ - char *slash = &(path[strnlen(path, URL_LEN_MAX) - 1]); - if (*slash == '/') { - *slash = '\0'; - } - - slash = strchr(path, '/'); - if ( slash == NULL ) { - /* We cannot find another '/', we have reached the last level */ - for (int i = 1; i < linktbl->num; i++) { - if (!strncmp(path, linktbl->links[i]->p_url, LINK_LEN_MAX)) { - /* We found our link */ - return linktbl->links[i]; - } - } - } else { - /* - * We can still find '/', time to consume the path and traverse - * the tree structure - */ - - /* - * add termination mark to the current string, - * effective create two substrings - */ - *slash = '\0'; - /* move the pointer past the '/' */ - char *next_path = slash + 1; - for (int i = 1; i < linktbl->num; i++) { - if (!strncmp(path, linktbl->links[i]->p_url, LINK_LEN_MAX)) { - /* The next sub-directory exists */ - if (!(linktbl->links[i]->next_table)) { - linktbl->links[i]->next_table = LinkTable_new( - linktbl->links[i]->f_url); - fprintf(stderr, "Created new link table for %s\n", - linktbl->links[i]->f_url); - LinkTable_print(linktbl->links[i]->next_table); - } - - return path_to_Link_recursive(next_path, - linktbl->links[i]->next_table); - } - } - } - return NULL; -} - -long path_download(const char *path, char *output_buf, size_t size, - off_t offset) -{ - Link *link; - link = path_to_Link(path); - if (!link) { - return -ENOENT; - } - - size_t start = offset; - size_t end = start + size; - char range_str[64]; - snprintf(range_str, sizeof(range_str), "%lu-%lu", start, end); - - MemoryStruct buf; - buf.size = 0; - buf.memory = NULL; - - fprintf(stderr, "path_download(%s, %s);\n", - path, range_str); - - CURL *curl = Link_to_curl(link); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&buf); - curl_easy_setopt(curl, CURLOPT_RANGE, range_str); - - transfer_blocking(curl); - - long http_resp; - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_resp); - if ( !( - (http_resp != HTTP_OK) || - (http_resp != HTTP_PARTIAL_CONTENT) || - (http_resp != HTTP_RANGE_NOT_SATISFIABLE) - )) { - fprintf(stderr, "path_download(): Could not download %s, HTTP %ld\n", - link->f_url, http_resp); - return -ENOENT; - } - - double dl; - curl_easy_getinfo(curl, CURLINFO_SIZE_DOWNLOAD, &dl); - - size_t recv = dl; - if (recv > size) { - recv = size; - } - - memmove(output_buf, buf.memory, recv); - curl_easy_cleanup(curl); - free(buf.memory); - return recv; -} - -static LinkType p_url_type(const char *p_url) -{ - /* The link name has to start with alphanumerical character */ - if (!isalnum(p_url[0])) { - return LINK_INVALID; - } - - /* check for http:// and https:// */ - if ( !strncmp(p_url, "http://", 7) || !strncmp(p_url, "https://", 8) ) { - return LINK_INVALID; - } - - if ( p_url[strlen(p_url) - 1] == '/' ) { - return LINK_DIR; - } - - return LINK_FILE; -} - static unsigned long thread_id(void) { unsigned long ret; @@ -718,30 +296,7 @@ static unsigned long thread_id(void) return ret; } -static char *url_append(const char *url, const char *sublink) -{ - int needs_separator = 0; - if (url[strlen(url)-1] != '/') { - needs_separator = 1; - } - - char *str; - size_t ul = strlen(url); - size_t sl = strlen(sublink); - str = calloc(ul + sl + needs_separator + 1, sizeof(char)); - if (!str) { - fprintf(stderr, "url_append(): calloc failure!\n"); - exit(EXIT_FAILURE); - } - strncpy(str, url, ul); - if (needs_separator) { - str[ul] = '/'; - } - strncat(str, sublink, sl); - return str; -} - -static size_t +size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) { size_t realsize = size * nmemb; diff --git a/network.h b/network.h index 0af0c4f..55bdc82 100644 --- a/network.h +++ b/network.h @@ -1,61 +1,41 @@ #ifndef NETWORK_H #define NETWORK_H -#include +#include "link.h" + +#include -#define URL_LEN_MAX 2048 -#define LINK_LEN_MAX 255 #define CURL_MULTI_MAX_CONNECTION 20 -/** \brief the link type */ +typedef struct { + char *memory; + size_t size; +} MemoryStruct; + typedef enum { - LINK_HEAD = 'H', - LINK_DIR = 'D', - LINK_FILE = 'F', - LINK_INVALID = '\0' -} LinkType; + FILESTAT = 's', + DATA = 'd' +} TransferType; -/** - * \brief link table type - * \details index 0 contains the Link for the base URL - */ -typedef struct LinkTable LinkTable; +typedef struct { + TransferType type; + int transferring; + Link *link; +} TransferStruct; -/** \brief link data type */ -typedef struct Link Link; +/** \brief curl shared interface */ +extern CURLSH *curl_share; - -struct Link { - char p_url[LINK_LEN_MAX]; - char f_url[URL_LEN_MAX]; - LinkType type; - size_t content_length; - LinkTable *next_table; - long time; -}; - -struct LinkTable { - int num; - Link **links; -}; - -/** \brief root link table */ -extern LinkTable *ROOT_LINK_TBL; +int curl_multi_perform_once(); /** \brief Initialise the network module */ void network_init(const char *url); -/** - * \brief download a link */ -/* \return the number of bytes downloaded - */ -long path_download(const char *path, char *output_buf, size_t size, - off_t offset); +void transfer_blocking(CURL *curl); -/** \brief create a new LinkTable */ -LinkTable *LinkTable_new(const char *url); +void transfer_nonblocking(CURL *curl); -/** \brief find the link associated with a path */ -Link *path_to_Link(const char *path); +size_t +WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp); #endif