From 442d00141e8dd6c8cd31fdeb7899ecb73e2bcf73 Mon Sep 17 00:00:00 2001
From: Fufu Fang
Date: Fri, 20 Jul 2018 02:09:51 +0100
Subject: [PATCH] new branch - new data structure

---
 Makefile |   2 +-
 data.h   |  40 +++++++
 http.c   | 343 ------------------------------------------------------
 http.h   |  32 ------
 link.c   | 102 ++++++++++-------
 link.h   |  42 ++++---
 main.c   |   2 +-
 test.c   | 141 +----------------------
 8 files changed, 126 insertions(+), 578 deletions(-)
 create mode 100644 data.h

diff --git a/Makefile b/Makefile
index f543afc..4edcfbd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 CC=gcc
 CFLAGS= -Wall -Wextra -lgumbo -lcurl -g
-OBJ = main.o link.o test.o http.o
+OBJ = main.o link.o test.o
 
 %.o: %.c
 	$(CC) -c -o $@ $< $(CFLAGS)

diff --git a/data.h b/data.h
new file mode 100644
index 0000000..f85316c
--- /dev/null
+++ b/data.h
@@ -0,0 +1,40 @@
+#ifndef DATA_H
+#define DATA_H
+/**
+ * \file data.h
+ * \brief This header stores all the custom data type definitions
+ */
+
+#include <curl/curl.h>
+
+#define URL_LEN_MAX 2048
+#define LINK_LEN_MAX 255
+
+/** \brief the link type */
+typedef enum {
+    LINK_DIR = 'D',
+    LINK_FILE = 'F',
+    LINK_UNKNOWN = 'U'
+} LinkType;
+
+/** \brief link data type */
+typedef struct {
+    char *p_url;
+    LinkType type;
+    CURL *curl_h;
+    CURLcode res; /* initialise to -1, because a valid CURLcode is never negative */
+    char *data;
+    size_t data_sz;
+} Link;
+
+/** \brief link table type */
+typedef struct {
+    int num;
+    Link **links;
+} LinkTable;
+
+
+
+extern char *BASE_URL;
+
+#endif

diff --git a/http.c b/http.c
index d0078b3..88524a1 100644
--- a/http.c
+++ b/http.c
@@ -9,346 +9,3 @@
 
 #include "http.h"
 
-/* we use a global one for convenience */
-static CURLM *multi_handle;
-
-static int num_transfers = 0;
-
-/* curl calls this routine to get more data */
-static size_t write_callback(char *buffer, size_t size,
-                             size_t nitems, void *userp)
-{
-    char *newbuff;
-    size_t rembuff;
-
-    URL_FILE *url = (URL_FILE *)userp;
-    size *= nitems;
-
-    rembuff = url->buffer_len - url->buffer_pos; /* remaining space in buffer */
-
-    if(size > rembuff) {
-        /* not enough space in buffer */
-        newbuff = realloc(url->buffer, url->buffer_len + (size - rembuff));
-        if(newbuff == NULL) {
-            fprintf(stderr, "callback buffer grow failed\n");
-            size = rembuff;
-        } else {
-            /* realloc succeeded increase buffer size*/
-            url->buffer_len += size - rembuff;
-            url->buffer = newbuff;
-        }
-    }
-
-    memcpy(&url->buffer[url->buffer_pos], buffer, size);
-    url->buffer_pos += size;
-
-    return size;
-}
-
-static size_t header_callback(char *buffer, size_t size,
-                              size_t nitems, void *userp)
-{
-    char *newbuff;
-    size_t rembuff;
-
-    URL_FILE *url = (URL_FILE *)userp;
-    size *= nitems;
-
-    rembuff = url->header_len - url->header_pos; /* remaining space in buffer */
-
-    if(size > rembuff) {
-        /* not enough space in buffer */
-        newbuff = realloc(url->header, url->header_len + (size - rembuff));
-        if(newbuff == NULL) {
-            fprintf(stderr, "callback buffer grow failed\n");
-            size = rembuff;
-        } else {
-            /* realloc succeeded increase buffer size*/
-            url->header_len += size - rembuff;
-            url->header = newbuff;
-            url->header[url->header_len] = '\0';
-        }
-    }
-
-    memcpy(&url->header[url->header_pos], buffer, size);
-    url->header_pos += size;
-
-    char *hf;
-    hf = "Accept-Ranges:";
-    if (!strncasecmp(buffer, hf, strlen(hf))) {
-        url->accept_range = 1;
-    }
-    hf = "Content-Length: ";
-    if (!strncasecmp(buffer, hf, strlen(hf))) {
-        /*
-         * We are doing this, because libcurl documentation says
-         *"Do not assume that the header line is zero terminated!"
-         */
-        char *tmp = malloc((nitems) * sizeof(char));
-        tmp[nitems] = '\0';
-        strncpy(tmp, buffer, nitems);
-        url->content_length = atoi(strchr(tmp, ' ')+1);
-    }
-    return size;
-}
-
-/* use to attempt to fill the read buffer up to requested number of bytes */
-static int fill_buffer(URL_FILE *file, size_t want)
-{
-    fd_set fdread;
-    fd_set fdwrite;
-    fd_set fdexcep;
-    struct timeval timeout;
-    int rc;
-    CURLMcode mc; /* curl_multi_fdset() return code */
-
-    /* only attempt to fill buffer if transactions still running and buffer
-     * doesn't exceed required size already
-     */
-    if((!file->still_running) || (file->buffer_pos > want))
-        return 0;
-
-    /* attempt to fill buffer */
-    do {
-        int maxfd = -1;
-        long curl_timeo = -1;
-
-        FD_ZERO(&fdread);
-        FD_ZERO(&fdwrite);
-        FD_ZERO(&fdexcep);
-
-        /* set a suitable timeout to fail on */
-        timeout.tv_sec = 60; /* 1 minute */
-        timeout.tv_usec = 0;
-
-        curl_multi_timeout(multi_handle, &curl_timeo);
-        if(curl_timeo >= 0) {
-            timeout.tv_sec = curl_timeo / 1000;
-            if(timeout.tv_sec > 1)
-                timeout.tv_sec = 1;
-            else
-                timeout.tv_usec = (curl_timeo % 1000) * 1000;
-        }
-
-        /* get file descriptors from the transfers */
-        mc = curl_multi_fdset(multi_handle, &fdread, &fdwrite, &fdexcep, &maxfd);
-
-        if(mc != CURLM_OK) {
-            fprintf(stderr, "curl_multi_fdset() failed, code %d.\n", mc);
-            break;
-        }
-
-        /* On success the value of maxfd is guaranteed to be >= -1. We call
-           select(maxfd + 1, ...); specially in case of (maxfd == -1) there are
-           no fds ready yet so we call select(0, ...) --or Sleep() on Windows--
-           to sleep 100ms, which is the minimum suggested value in the
-           curl_multi_fdset() doc. */
-
-        if(maxfd == -1) {
-            /* Portable sleep for platforms other than Windows. */
-            struct timeval wait = { 0, 100 * 1000 }; /* 100ms */
-            rc = select(0, NULL, NULL, NULL, &wait);
-
-        } else {
-            /* Note that on some platforms 'timeout' may be modified by select().
-               If you need access to the original value save a copy beforehand. */
-            rc = select(maxfd + 1, &fdread, &fdwrite, &fdexcep, &timeout);
-        }
-
-        switch(rc) {
-        case -1:
-            /* select error */
-            break;
-
-        case 0:
-        default:
-            /* timeout or readable/writable sockets */
-            curl_multi_perform(multi_handle, &file->still_running);
-            break;
-        }
-    } while(file->still_running && (file->buffer_pos < want));
-    return 1;
-}
-
-/* use to remove want bytes from the front of a files buffer */
-static int use_buffer(URL_FILE *file, size_t want)
-{
-    /* sort out buffer */
-    if((file->buffer_pos - want) <= 0) {
-        /* ditch buffer - write will recreate */
-        free(file->buffer);
-        file->buffer = NULL;
-        file->buffer_pos = 0;
-        file->buffer_len = 0;
-    }
-    else {
-        /* move rest down make it available for later */
-        memmove(file->buffer,
-                &file->buffer[want],
-                (file->buffer_pos - want));
-
-        file->buffer_pos -= want;
-    }
-    return 0;
-}
-
-static void start_fetching(URL_FILE *file)
-{
-    /* lets start the fetch */
-    curl_multi_perform(multi_handle, &file->still_running);
-
-    /* if still_running is 0 now, we should close the file descriptor */
-    if (url_feof(file)) {
-        url_fclose(file);
-    }
-}
-
-URL_FILE *url_fopen(const char *url, const char *operation)
-{
-    URL_FILE *file;
-
-    file = calloc(1, sizeof(URL_FILE));
-    if (!file) {
-        fprintf(stderr, "url_fopen: URL_FILE memory allocation failure.");
-        return NULL;
-    }
-
-    file->handle = curl_easy_init();
-
-    curl_easy_setopt(file->handle, CURLOPT_URL, url);
-    curl_easy_setopt(file->handle, CURLOPT_VERBOSE, 0L);
-    curl_easy_setopt(file->handle, CURLOPT_TCP_KEEPALIVE, 1L);
-    /* By default we don't want to download anything */
-    curl_easy_setopt(file->handle, CURLOPT_NOBODY, 1L);
-
-    for (const char *c = operation; *c; c++) {
-        switch (*c) {
-            case 'r':
-                curl_easy_setopt(file->handle,
-                                 CURLOPT_WRITEDATA, file);
-                curl_easy_setopt(file->handle,
-                                 CURLOPT_WRITEFUNCTION, write_callback);
-                curl_easy_setopt(file->handle,
-                                 CURLOPT_NOBODY, 0L);
-                break;
-            case 'h':
-                curl_easy_setopt(file->handle,
-                                 CURLOPT_HEADERDATA, file);
-                curl_easy_setopt(file->handle,
-                                 CURLOPT_HEADERFUNCTION, header_callback);
-                break;
-            default:
-                fprintf(stderr, "url_fopen: invalid operation %c", *c);
-                break;
-        }
-    }
-
-    if (!multi_handle) {
-        multi_handle = curl_multi_init();
-    }
-
-    curl_multi_add_handle(multi_handle, file->handle);
-
-    return file;
-}
-
-CURLMcode url_fclose(URL_FILE *file)
-{
-    /* make sure the easy handle is not in the multi handle anymore */
-    CURLMcode ret = curl_multi_remove_handle(multi_handle, file->handle);
-
-    /* cleanup */
-    curl_easy_cleanup(file->handle);
-
-    free(file->buffer);/* free any allocated buffer space */
-    free(file->header);
-    free(file);
-    file = NULL;
-
-    return ret;
-}
-
-int url_feof(URL_FILE *file)
-{
-    return (!file->buffer_pos) && (!file->still_running);
-}
-
-size_t url_fread(void *ptr, size_t size, size_t nmemb, URL_FILE *file)
-{
-    size_t want = nmemb * size;
-
-    fill_buffer(file, want);
-
-    /* check if there's data in the buffer - if not fill_buffer()
-     * either errored or EOF */
-    if(!file->buffer_pos)
-        return 0;
-
-    /* ensure only available data is considered */
-    if(file->buffer_pos < want)
-        want = file->buffer_pos;
-
-    /* xfer data to caller */
-    memcpy(ptr, file->buffer, want);
-
-    use_buffer(file, want);
-
-    want = want / size; /* number of items */
-
-    return want;
-}
-
-char *url_fgets(char *ptr, size_t size, URL_FILE *file)
-{
-    size_t want = size - 1;/* always need to leave room for zero termination */
-    size_t loop;
-
-    fill_buffer(file, want);
-
-    /* check if there's data in the buffer - if not fill either errored or
-     * EOF */
-    if(!file->buffer_pos)
-        return NULL;
-
-    /* ensure only available data is considered */
-    if(file->buffer_pos < want)
-        want = file->buffer_pos;
-
-    /*buffer contains data */
-    /* look for newline or eof */
-    for(loop = 0; loop < want; loop++) {
-        if(file->buffer[loop] == '\n') {
-            want = loop + 1;/* include newline */
-            break;
-        }
-    }
-
-    /* xfer data to caller */
-    memcpy(ptr, file->buffer, want);
-    ptr[want] = 0;/* always null terminate */
-
-    use_buffer(file, want);
-
-    return ptr;/*success */
-}
-
-void url_rewind(URL_FILE *file)
-{
-    /* halt transaction */
-    curl_multi_remove_handle(multi_handle, file->handle);
-
-    /* restart */
-    curl_multi_add_handle(multi_handle, file->handle);
-
-    /* ditch buffer - write will recreate - resets stream pos*/
-    free(file->buffer);
-    file->buffer = NULL;
-    file->buffer_pos = 0;
-    file->buffer_len = 0;
-
-    free(file->header);
-    file->header = NULL;
-    file->header_len = 0;
-    file->header_pos = 0;
-}
-

diff --git a/http.h b/http.h
index 5902f3f..ca46b3b 100644
--- a/http.h
+++ b/http.h
@@ -3,37 +3,5 @@
 
 #include <curl/curl.h>
 
-typedef struct {
-    CURL *handle;           /* handle */
-
-    char *buffer;           /* buffer to store cached data*/
-    size_t buffer_len;      /* currently allocated buffers length */
-    size_t buffer_pos;      /* end of data in buffer*/
-
-    char *header;           /* character array to store the header */
-    size_t header_len;      /* the current header length */
-    size_t header_pos;      /* end of header in buffer */
-
-    int accept_range;       /* does it accept range request */
-    int content_length;     /* the length of the content */
-
-} URL_FILE;
-
-URL_FILE *url_fopen(const char *url, const char *operation);
-
-int url_fclose(URL_FILE *file);
-
-int url_feof(URL_FILE *file);
-
-size_t url_fread(void *ptr, size_t size, size_t nmemb, URL_FILE *file);
-
-/*
- * \brief fgets implemented using libcurl.
- * \details This is probably not the function that you want to use,
- * because it doesn't work well with binary!
- */
-char *url_fgets(char *ptr, size_t size, URL_FILE *file);
-
-void url_rewind(URL_FILE *file);
 
 #endif

diff --git a/link.c b/link.c
index 7088bc8..9a7e581 100644
--- a/link.c
+++ b/link.c
@@ -3,37 +3,67 @@
 #include "link.h"
 #include "string.h"
 
-static char linktype_to_char(linktype t)
+Link *Link_new(const char *p_url)
 {
-    switch (t) {
-        case LINK_DIR :
-            return 'D';
-        case LINK_FILE :
-            return 'F';
-        case LINK_UNKNOWN :
-            return 'U';
-        default :
-            return 'E';
-    }
+    Link *link = calloc(1, sizeof(Link));
+
+    size_t p_url_len = strnlen(p_url, LINK_LEN_MAX) + 1;
+    link->p_url = calloc(p_url_len, sizeof(char));
+    strncpy(link->p_url, p_url, p_url_len - 1);
+
+    link->type = LINK_UNKNOWN;
+    link->curl_h = curl_easy_init();
+    link->res = -1;
+    link->data = NULL;
+    link->data_sz = 0;
+
+    return link;
 }
 
-void linklist_print(ll_t *links)
+void Link_free(Link *link)
 {
-    for (int i = 0; i < links->num; i++) {
-        fprintf(stderr, "%d %c %s\n",
-                i,
-                linktype_to_char(links->type[i]),
-                links->link[i]);
-    }
+    free(link->p_url);
+    curl_easy_cleanup(link->curl_h);
+    free(link->data);
+    free(link);
+    link = NULL;
 }
 
-ll_t *linklist_new()
+LinkTable *LinkTable_new()
 {
-    ll_t *links = malloc(sizeof(ll_t));
-    links->num = 0;
-    links->link = NULL;
-    links->type = NULL;
-    return links;
+    LinkTable *linktbl = calloc(1, sizeof(LinkTable));
+    linktbl->num = 0;
+    linktbl->links = NULL;
+    return linktbl;
+}
+
+void LinkTable_free(LinkTable *linktbl)
+{
+    for (int i = 0; i < linktbl->num; i++) {
+        Link_free(linktbl->links[i]);
+    }
+    free(linktbl->links);
+    free(linktbl);
+    linktbl = NULL;
+}
+
+void LinkTable_add(LinkTable *linktbl, Link *link)
+{
+    linktbl->num++;
+    linktbl->links = realloc(
+        linktbl->links,
+        linktbl->num * sizeof(Link *));
+    linktbl->links[linktbl->num - 1] = link;
+}
+
+void LinkTable_print(LinkTable *linktbl)
+{
+    for (int i = 0; i < linktbl->num; i++) {
+        printf("%d %c %s\n",
+               i,
+               linktbl->links[i]->type,
+               linktbl->links[i]->p_url);
+    }
 }
 
 static int is_valid_link(const char *n)
@@ -42,8 +72,9 @@ static int is_valid_link(const char *n)
     if (!isalnum(n[0])) {
         return 0;
    }
+
     /* check for http:// and https:// */
-    int c = strlen(n);
+    int c = strnlen(n, LINK_LEN_MAX);
     if (c > 5) {
         if (n[0] == 'h' && n[1] == 't' && n[2] == 't' && n[3] == 'p') {
             if ((n[4] == ':' && n[5] == '/' && n[6] == '/') ||
@@ -59,7 +90,7 @@ static int is_valid_link(const char *n)
  * Shamelessly copied and pasted from:
  * https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
  */
-void html_to_linklist(GumboNode *node, ll_t *links)
+void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
 {
     if (node->type != GUMBO_NODE_ELEMENT) {
         return;
@@ -70,33 +101,18 @@ void html_to_linklist(GumboNode *node, ll_t *links)
         (href = gumbo_get_attribute(&node->v.element.attributes, "href"))) {
         /* if it is valid, copy the link onto the heap */
         if (is_valid_link(href->value)) {
-            links->num++;
-            links->link = realloc(links->link, links->num * sizeof(char *));
-            links->type = realloc(links->type, links->num * sizeof(linktype *));
-            int i = links->num - 1;
-            links->link[i] = malloc(strlen(href->value) * sizeof(char *));
-            strcpy(links->link[i], href->value);
-            links->type[i] = LINK_UNKNOWN;
+            LinkTable_add(linktbl, Link_new(href->value));
         }
     }
 
     /* Note the recursive call, lol. */
     GumboVector *children = &node->v.element.children;
     for (size_t i = 0; i < children->length; ++i) {
-        html_to_linklist((GumboNode*)children->data[i], links);
+        HTML_to_LinkTable((GumboNode*)children->data[i], linktbl);
     }
     return;
 }
 
-void linklist_free(ll_t *links)
-{
-    for (int i = 0; i < links->num; i++) {
-        free(links->link[i]);
-    }
-    free(links->type);
-    free(links);
-}
-
 /* the upper level */
 char *url_upper(const char *url)
 {

diff --git a/link.h b/link.h
index ede71c4..ab46183 100644
--- a/link.h
+++ b/link.h
@@ -4,37 +4,35 @@
 #include <gumbo.h>
 #include 
 
-/* \brief the link type */
-typedef enum {
-    LINK_DIR,
-    LINK_FILE,
-    LINK_UNKNOWN
-} linktype;
 
-/* \brief link list data type */
-typedef struct {
-    int num;
-    char **link;
-    linktype *type;
-} ll_t;
+#include "data.h"
 
-/* \brief make a new link list */
-ll_t *linklist_new();
+/** \brief make a new Link */
+Link *Link_new(const char *p_url);
 
-/* \brief print a link list */
-void linklist_print(ll_t *links);
+/** \brief free a Link */
+void Link_free(Link *link);
 
-/* \brief convert a html page to a link list */
-void html_to_linklist(GumboNode *node, ll_t *links);
+/** \brief make a new LinkTable */
+LinkTable *LinkTable_new();
 
-/* \brief free a link list */
-void linklist_free(ll_t *links);
+/** \brief free a LinkTable */
+void LinkTable_free(LinkTable *linktbl);
 
-/* \brief the upper level */
+/** \brief add a link to the link table */
+void LinkTable_add(LinkTable *linktbl, Link *link);
+
+/** \brief print a LinkTable */
+void LinkTable_print(LinkTable *linktbl);
+
+/** \brief convert a html page to a LinkTable */
+void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl);
+
+/** \brief the upper level */
 /* \warning does not check if you have reached the base level! */
 char *url_upper(const char *url);
 
-/* \brief append url */
+/** \brief append url */
 char *url_append(const char *url, const char *sublink);
 
 #endif

diff --git a/main.c b/main.c
index 7895336..3ae171e 100644
--- a/main.c
+++ b/main.c
@@ -9,7 +9,7 @@ int main(int argc, char** argv)
 {
     (void) argc;
     (void) argv;
-//    gumbo_test(argc, argv);
+    gumbo_test(argc, argv);
 //    url_test();
     http_test();
     return 0;

diff --git a/test.c b/test.c
index 128bbeb..cc4802b 100644
--- a/test.c
+++ b/test.c
@@ -11,138 +11,7 @@
 
 int http_test()
 {
-    URL_FILE *handle;
-    FILE *outf;
-
-    size_t nread;
-    char buffer[256];
-    const char *url;
-
-//    url = "https://www.fangfufu.co.uk/~fangfufu/Unison-Windows-2.48.4.zip";
-    /* ------------------------Test header-only--------------------------*/
-    /* open the input file */
-    URL_FILE *header_handle = url_fopen(
-        "http://ipv4.download.thinkbroadband.com/1GB.zip",
-        "rh");
-    if(!header_handle) {
-        printf("couldn't url_fopen() \n");
-        return 2;
-    }
-
-//    printf("start fgets\n");
-    /* Read 2 character seem to be enough to get the header*/
-    url_fgets(buffer, 256, header_handle);
-//    printf("end fgets\n");
-
-
-    /* Print the header */
-//    printf(header_handle->header);
-
-//    printf("accept-range: %d\n", header_handle->accept_range);
-//    printf("filesize: %d\n", header_handle->content_length);
-    printf("test fgets");
-
-
-    /* close the URL handle */
-//    url_fclose(header_handle);
-
-    /* ---------------------------Test fgets ----------------------------*/
-    /* open the input file */
-    url = "http://127.0.0.1/~fangfufu/test.txt";
-    handle = url_fopen(url, "h");
-    if(!handle) {
-        printf("couldn't url_fopen() %s\n", url);
-        return 2;
-    }
-
-    /* create the output file for fgets*/
-    outf = fopen("fgets_test.txt", "wb");
-    if(!outf) {
-        perror("couldn't open output file\n");
-        return 1;
-    }
- - /* copy from url line by line with fgets */ - while(!url_feof(handle)) { - url_fgets(buffer, sizeof(buffer), handle); - fwrite(buffer, 1, strlen(buffer), outf); - } - - /* close the handles for the fgets test*/ - url_fclose(handle); - fclose(outf); - - /* ---------------------------Test fread ----------------------------*/ - - - /* open the input file again */ - handle = url_fopen(url, "r"); - if(!handle) { - printf("couldn't url_fopen() testfile\n"); - return 2; - } - - /* create the output file for fread test*/ - outf = fopen("fread_test.txt", "wb"); - if(!outf) { - perror("couldn't open fread output file\n"); - return 1; - } - - /* Copy from url with fread */ - do { - nread = url_fread(buffer, 1, sizeof(buffer), handle); - fwrite(buffer, 1, nread, outf); - } while(nread); - - /* close the handles for the fgets test*/ - url_fclose(handle); - fclose(outf); - - /* ---------------------------Test rewind ----------------------------*/ - /* open the input file again */ - handle = url_fopen(url, "r"); - if(!handle) { - printf("couldn't url_fopen() testfile\n"); - return 2; - } - - /* create the output file for rewind test*/ - outf = fopen("rewind_test.txt", "wb"); - if(!outf) { - perror("couldn't open fread output file\n"); - return 1; - } - - /* Copy from url with fread */ - do { - nread = url_fread(buffer, 1, sizeof(buffer), handle); - fwrite(buffer, 1, nread, outf); - } while(nread); - - url_rewind(handle); - fprintf(outf, "\n-------------------\n"); - - /* - * read the URL again after rewind: - * - copy from url line by line with fgets - */ - while(!url_feof(handle)) { - url_fgets(buffer, sizeof(buffer), handle); - fwrite(buffer, 1, strlen(buffer), outf); - } - - buffer[0]='\n'; - fwrite(buffer, 1, 1, outf); - - nread = url_fread(buffer, 1, sizeof(buffer), handle); - fwrite(buffer, 1, nread, outf); - - url_fclose(handle); - - fclose(outf); - - return 0;/* all done */ + return 0; } void url_test() { @@ -182,10 +51,10 @@ void gumbo_test(int argc, char **argv) fclose(fp); GumboOutput* output = gumbo_parse(contents); - ll_t *links = linklist_new(); - html_to_linklist(output->root, links); + LinkTable *linktbl = LinkTable_new(); + HTML_to_LinkTable(output->root, linktbl); gumbo_destroy_output(&kGumboDefaultOptions, output); - linklist_print(links); - linklist_free(links); + LinkTable_print(linktbl); + LinkTable_free(linktbl); printf("--- end of gumbo_test ---\n\n"); }
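--
Usage note (commentary, not part of the patch): a minimal sketch of how the
new Link/LinkTable API introduced above is meant to be driven, mirroring the
updated gumbo_test() in test.c. The input file name "index.html" is a
placeholder assumption, and error handling is elided for brevity.

    #include <stdio.h>
    #include <stdlib.h>
    #include <gumbo.h>
    #include "link.h"

    int main(void)
    {
        /* slurp the whole HTML document into memory */
        FILE *fp = fopen("index.html", "rb"); /* placeholder file name */
        fseek(fp, 0, SEEK_END);
        long sz = ftell(fp);
        rewind(fp);
        char *contents = malloc(sz + 1);
        fread(contents, 1, sz, fp);
        contents[sz] = '\0';
        fclose(fp);

        /* same sequence as gumbo_test(): parse, collect, print, free */
        GumboOutput *output = gumbo_parse(contents);
        LinkTable *linktbl = LinkTable_new();
        HTML_to_LinkTable(output->root, linktbl);
        gumbo_destroy_output(&kGumboDefaultOptions, output);
        LinkTable_print(linktbl);
        LinkTable_free(linktbl);

        free(contents);
        return 0;
    }

Each Link owns its own CURL easy handle (curl_h), so LinkTable_free() also
releases the handles through Link_free(); nothing else needs cleaning up
after this sequence.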