/* httpdirfs/link.c */

#include <ctype.h>

#include "link.h"
#include "string.h"

/*
 * Translate a link type into the one-character tag shown by
 * linklist_print(): 'D' for LINK_DIR, 'F' for LINK_FILE, 'U' for
 * LINK_UNKNOWN, and 'E' for any other value.
 */
static char linktype_to_char(linktype t)
{
    if (t == LINK_DIR) {
        return 'D';
    }
    if (t == LINK_FILE) {
        return 'F';
    }
    if (t == LINK_UNKNOWN) {
        return 'U';
    }
    /* Not one of the recognised types. */
    return 'E';
}

/*
 * Dump every entry of the list to stderr, one per line, formatted as
 * "<index> <type character> <link string>".
 */
void linklist_print(ll_t *links)
{
    int idx = 0;

    while (idx < links->num) {
        char tag = linktype_to_char(links->type[idx]);

        fprintf(stderr, "%d %c %s\n", idx, tag, links->link[idx]);
        idx++;
    }
}

/*
 * Allocate an empty link list: num is 0 and both the link and type
 * arrays are NULL.
 *
 * Returns the new list, or NULL if the allocation fails.
 */
ll_t *linklist_new(void)
{
    ll_t *links = malloc(sizeof *links);
    if (!links) {
        /* Do not write through a failed allocation. */
        return NULL;
    }
    links->num = 0;
    links->link = NULL;
    links->type = NULL;
    return links;
}

/*
 * Decide whether an href value should be added to the link list.
 *
 * A link is accepted (returns 1) when it starts with an alphanumeric
 * character and does not begin with "http://" or "https://".
 * NULL, the empty string, and everything else are rejected (returns 0).
 */
static int is_valid_link(const char *n)
{
    if (!n) {
        return 0;
    }
    /*
     * The link name has to start with an alphanumerical character.
     * The cast matters: passing a negative char (any non-ASCII byte when
     * char is signed) to isalnum() is undefined behaviour.
     */
    if (!isalnum((unsigned char) n[0])) {
        return 0;
    }
    /* Reject links that start with the http:// or https:// scheme. */
    if (strncmp(n, "http://", 7) == 0 || strncmp(n, "https://", 8) == 0) {
        return 0;
    }
    return 1;
}

/*
 * Append a heap-allocated copy of `value` to `links`, tagged LINK_UNKNOWN.
 *
 * Returns 0 on success and -1 if any allocation fails. On failure the
 * entry count and all existing entries are left untouched.
 */
static int linklist_append_unknown(ll_t *links, const char *value)
{
    size_t count = (size_t) links->num + 1;
    size_t size = strlen(value) + 1;    /* +1 for the terminating NUL */

    char *copy = malloc(size);
    if (!copy) {
        return -1;
    }

    /*
     * Grow through a temporary so the old array is not lost when realloc
     * fails. realloc(NULL, n) behaves like malloc(n), so the first insert
     * needs no special case.
     */
    void *grown = realloc(links->link, count * sizeof *links->link);
    if (!grown) {
        free(copy);
        return -1;
    }
    links->link = grown;

    grown = realloc(links->type, count * sizeof *links->type);
    if (!grown) {
        free(copy);
        return -1;
    }
    links->type = grown;

    memcpy(copy, value, size);
    links->link[count - 1] = copy;
    links->type[count - 1] = LINK_UNKNOWN;
    /* Only publish the new entry once everything has been allocated. */
    links->num++;
    return 0;
}

/*
 * Walk the parsed HTML tree rooted at `node` and append the href of every
 * <a> element that passes is_valid_link() to `links`, with type
 * LINK_UNKNOWN. Nodes that are not elements are ignored.
 *
 * If memory runs out while storing a link, that link is dropped, a message
 * is written to stderr, and the traversal continues.
 *
 * Adapted from:
 * https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
 */
void html_to_linklist(GumboNode *node, ll_t *links)
{
    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }

    if (node->v.element.tag == GUMBO_TAG_A) {
        GumboAttribute *href =
            gumbo_get_attribute(&node->v.element.attributes, "href");

        if (href && is_valid_link(href->value) &&
            linklist_append_unknown(links, href->value) != 0) {
            fprintf(stderr,
                    "html_to_linklist(): out of memory, dropping link %s\n",
                    href->value);
        }
    }

    /* Recurse into the children to visit the whole subtree. */
    GumboVector *children = &node->v.element.children;
    for (size_t i = 0; i < children->length; ++i) {
        html_to_linklist((GumboNode *) children->data[i], links);
    }
}

/*
 * Release a link list: every stored link string, the link and type arrays,
 * and the list structure itself. Passing NULL is a no-op.
 */
void linklist_free(ll_t *links)
{
    if (!links) {
        return;
    }
    for (int i = 0; i < links->num; i++) {
        free(links->link[i]);
    }
    /* The array of string pointers is its own allocation and must go too. */
    free(links->link);
    free(links->type);
    free(links);
}

/*
 * Return the "upper level" of a URL: a newly allocated copy of `url`
 * truncated just after its last '/'. If `url` contains no '/', the result
 * is an empty string.
 *
 * The caller owns the returned buffer and must free() it.
 * Returns NULL if the allocation fails.
 */
char *url_upper(const char *url)
{
    const char *pt = strrchr(url, '/');
    /* +1 keeps the '/' itself; with no '/' there is nothing to keep */
    size_t len = pt ? (size_t) (pt - url) + 1 : 0;

    /* one extra byte for the terminating NUL written below */
    char *str = malloc(len + 1);
    if (!str) {
        return NULL;
    }
    memcpy(str, url, len);
    str[len] = '\0';
    return str;
}

/*
 * Join `url` and `sublink` into a newly allocated string, inserting a
 * single '/' between them when `url` is non-empty and does not already end
 * with one. An empty `url` yields a copy of `sublink`.
 *
 * The caller owns the returned buffer and must free() it.
 * Returns NULL if the allocation fails.
 */
char *url_append(const char *url, const char *sublink)
{
    size_t ul = strlen(url);
    size_t sl = strlen(sublink);
    /* Check the length first so an empty url never reads url[-1]. */
    int needs_separator = (ul > 0 && url[ul - 1] != '/');

    /* +1 for the terminating NUL */
    char *str = malloc(ul + (size_t) needs_separator + sl + 1);
    if (!str) {
        return NULL;
    }

    char *out = str;
    memcpy(out, url, ul);
    out += ul;
    if (needs_separator) {
        *out++ = '/';
    }
    memcpy(out, sublink, sl);
    out[sl] = '\0';
    return str;
}