diff options
| -rw-r--r-- | Makefile | 9 | ||||
| -rw-r--r-- | dom.c | 132 | ||||
| -rw-r--r-- | dom.h | 27 | ||||
| -rw-r--r-- | gentags.pl | 67 | ||||
| -rw-r--r-- | main.c | 36 | ||||
| -rw-r--r-- | mem.h | 10 | ||||
| -rw-r--r-- | parse.c | 40 | ||||
| -rw-r--r-- | tags.h | 101 | ||||
| -rw-r--r-- | tags.txt | 16 | ||||
| -rw-r--r-- | test.html | 45 | ||||
| -rw-r--r-- | vec.c | 51 | ||||
| -rw-r--r-- | vec.h | 19 |
12 files changed, 520 insertions, 33 deletions
@@ -2,13 +2,18 @@ CC = cc CFLAGS = -std=c11 -Wall -Wextra -Wpedantic -g -O0 LDFLAGS = -HDRS = mem.h parse.h dom.h -SRCS = parse.c dom.c main.c +HDRS = mem.h vec.h parse.h tags.h dom.h +SRCS = vec.c parse.c dom.c main.c OBJS = $(SRCS:.c=.o) TARGET = glacier all: $(TARGET) +tags.h: tags.txt gentags.pl + perl gentags.pl < tags.txt > tags.h + +$(OBJS): tags.h + $(TARGET): $(OBJS) $(CC) $(OBJS) -o $(TARGET) $(LDFLAGS) @@ -1,37 +1,147 @@ #include <stdio.h> +#include <err.h> +#include "mem.h" #include "dom.h" +#include "vec.h" #include "parse.h" -void init_dom(const char *html) +static struct node **nodes; +static size_t count; +static size_t capacity; + +static struct vec stack; +static struct node *root; + +static inline struct node *node_alloc(void) +{ + struct node *v; + + if (count == capacity) { + capacity = capacity ? capacity * 2 : 64; + nodes = REALLOC(nodes, capacity * sizeof(struct node *)); + } + + v = CALLOC(1, sizeof(struct node)); + nodes[count++] = v; + return v; +} + +struct node *dom_init(const char *html) { + vec_init(&stack, sizeof(struct node *)); parse(html); + return root; +} + +void dom_free(void) +{ + size_t i; + + for (i = 0; i < count; i++) + free(nodes[i]); + free(nodes); + nodes = NULL; + count = 0; + capacity = 0; + + vec_free(&stack); +} + +static inline int is_self_closing(tag_type tag) +{ + switch (tag) { + case TAG_META: + case TAG_LINK: + return 1; + default: + return 0; + } +} + +static inline void close_tag(struct node *v) +{ + void *top; + struct node *parent; + + top = vec_top(&stack); + if (!top) { + root = v; + return; + } + + parent = *(struct node **)top; + v->parent = parent; + + if (!parent->first_child) { + parent->first_child = v; + parent->last_child = v; + } else { + parent->last_child->next_sibling = v; + parent->last_child = v; + } } -/* Parser event handlers */ extern void on_open(const char *tag, size_t n) { - printf("Tag opened: %.*s\n", (int)n, tag); + struct node *v; + + v = node_alloc(); + v->tag = str_to_tag(tag, n); + vec_push(&stack, &v); +} + +extern void on_open_end(void) +{ + struct node *v; + + v = *(struct node **)vec_top(&stack); + if (is_self_closing(v->tag)) { + v = *(struct node **)vec_pop(&stack); + close_tag(v); + } } extern void on_close(const char *tag, size_t n) { - printf("Tag closed: %.*s\n", (int)n, tag); + tag_type type; + struct node *top; + + type = str_to_tag(tag, n); + top = *(struct node **)vec_top(&stack); + if (top->tag != type) + errx(1, "Unmatched closing tag: %.*s", (int)n, tag); + + top = *(struct node **)vec_pop(&stack); + close_tag(top); } extern void on_text(const char *text, size_t n) { - printf("Text: %.*s\n", (int)n, text); + struct node *v; + + v = node_alloc(); + v->tag = TAG_TEXT; + v->text = text; + v->textlen = n; + close_tag(v); } -extern void on_attr(const char *name, size_t nname, const char *val, - size_t nval) +extern void on_attr(const char *name, size_t nname, + const char *val, size_t nval) { - printf("Attribute: name=%.*s", (int)nname, name); + struct attr *a; + struct node *v; - if (val && nval > 0) - printf(", value=%.*s", (int)nval, val); + a = MALLOC(sizeof(struct attr)); + a->key = name; + a->keylen = nname; + a->val = val; + a->vallen = nval; + a->next = NULL; - printf("\n"); + v = *(struct node **)vec_top(&stack); + a->next = v->attrs; + v->attrs = a; } @@ -1,6 +1,31 @@ #ifndef DOM_H #define DOM_H -void init_dom(const char *html); +#include "tags.h" + +struct attr { + const char *key; + size_t keylen; + const char *val; + size_t vallen; + struct attr *next; +}; + +struct node { + tag_type tag; + + const char *text; + size_t textlen; + + struct attr *attrs; + + struct node *parent; + struct node *first_child; + struct node *last_child; + struct node *next_sibling; +}; + +struct node *dom_init(const char *html); +void dom_free(void); #endif /* DOM_H */ diff --git a/gentags.pl b/gentags.pl new file mode 100644 index 0000000..6c5b58b --- /dev/null +++ b/gentags.pl @@ -0,0 +1,67 @@ +#!/usr/bin/perl +use strict; +use warnings; + +my @tags = sort grep { /\S/ } map { chomp; $_ } <STDIN>; + +my $maxlen = (sort { $b <=> $a } map { length($_) } @tags)[0]; + +print "/* Generated file, do not edit */\n\n"; +print "#ifndef TAGS_H\n"; +print "#define TAGS_H\n\n"; +print "#include <string.h>\n\n"; + +print "typedef enum {\n"; +print "\tTAG_UNKNOWN,\n"; +print "\tTAG_TEXT,\n"; +for my $tag (@tags) { + printf "\tTAG_%s,\n", uc($tag); +} +print "} tag_type;\n\n"; + +print "static const struct {\n"; +print "\tconst char *name;\n"; +print "\ttag_type tag;\n"; +print "} tag_map[] = {\n"; +for my $tag (@tags) { + printf "\t{ \"%-*s\tTAG_%s },\n", $maxlen + 2, "$tag\",", uc($tag); +} +print "};\n\n"; + +my $n = scalar @tags; + +print "static inline tag_type\n"; +print "str_to_tag(const char *name, size_t len)\n"; +print "{\n"; +print "\tint lo, hi, mid, cmp;\n\n"; +print "\tlo = 0;\n"; +print "\thi = $n - 1;\n\n"; +print "\twhile (lo <= hi) {\n"; +print "\t\tmid = (lo + hi) / 2;\n"; +print "\t\tcmp = strncmp(name, tag_map[mid].name, len);\n"; +print "\t\tif (cmp == 0) {\n"; +print "\t\t\tif (tag_map[mid].name[len] == '\\0')\n"; +print "\t\t\t\treturn tag_map[mid].tag;\n"; +print "\t\t\thi = mid - 1;\n"; +print "\t\t} else if (cmp < 0)\n"; +print "\t\t\thi = mid - 1;\n"; +print "\t\telse\n"; +print "\t\t\tlo = mid + 1;\n"; +print "\t}\n\n"; +print "\treturn TAG_UNKNOWN;\n"; +print "}\n\n"; + +print "static inline const char *\n"; +print "tag_to_str(tag_type tag)\n"; +print "{\n"; +print "\tswitch (tag) {\n"; +print "\tcase TAG_UNKNOWN: return \"unknown\";\n"; +print "\tcase TAG_TEXT: return \"text\";\n"; +for my $tag (@tags) { + printf "\tcase TAG_%s: return \"%s\";\n", uc($tag), $tag; +} +print "\tdefault: return \"unknown\";\n"; +print "\t}\n"; +print "}\n\n"; + +print "#endif /* TAGS_H */\n"; @@ -3,9 +3,41 @@ #include "mem.h" #include "dom.h" +#include "tags.h" + +static void node_print(const struct node *n, const char *prefix, int last) +{ + const char *connector = last ? "└── " : "├── "; + const char *extension = last ? " " : "│ "; + char new_prefix[256]; + struct node *c; + + printf("%s%s", prefix, connector); + + if (n->tag == TAG_TEXT) + printf("%.*s\n", (int)n->textlen, n->text); + else + printf("%s\n", tag_to_str(n->tag)); + + snprintf(new_prefix, sizeof(new_prefix), "%s%s", prefix, extension); + + for (c = n->first_child; c; c = c->next_sibling) + node_print(c, new_prefix, c->next_sibling == NULL); +} + +void dom_print(const struct node *root) +{ + struct node *c; + + printf("%s\n", tag_to_str(root->tag)); + for (c = root->first_child; c; c = c->next_sibling) + node_print(c, "", c->next_sibling == NULL); +} int main(int argc, char *argv[]) { + struct node *root; + if (argc < 2) errx(1, "usage: glacier <file>"); @@ -27,8 +59,10 @@ int main(int argc, char *argv[]) html[len] = '\0'; fclose(file); - init_dom(html); + root = dom_init(html); + dom_print(root); + dom_free(); free(html); return 0; @@ -5,6 +5,7 @@ #include <stdlib.h> #define MALLOC(s) xmalloc((s), __FILE__, __LINE__) +#define CALLOC(n, s) xcalloc((n), (s), __FILE__, __LINE__) #define REALLOC(p, s) xrealloc((p), (s), __FILE__, __LINE__) static inline void *xmalloc(size_t s, const char *file, int line) @@ -16,6 +17,15 @@ static inline void *xmalloc(size_t s, const char *file, int line) return p; } +static inline void *xcalloc(size_t n, size_t s, const char *file, int line) +{ + void *p; + + if (!(p = calloc(n, s))) + err(1, "%s:%d: calloc", file, line); + return p; +} + static inline void *xrealloc(void *ptr, size_t s, const char *file, int line) { void *p; @@ -1,4 +1,5 @@ #include <ctype.h> +#include <err.h> #include <stddef.h> #include "parse.h" @@ -22,9 +23,11 @@ typedef enum { } parse_mode; extern void on_open(const char *tag, size_t n); +extern void on_open_end(void); extern void on_close(const char *tag, size_t n); extern void on_text(const char *text, size_t n); -extern void on_attr(const char *name, size_t nname, const char *val, size_t nval); +extern void on_attr(const char *name, size_t nname, + const char *val, size_t nval); void parse(const char *s) { @@ -82,33 +85,44 @@ void parse(const char *s) n = 0; while (*s) { if (*s == '>') { - mode = DATA; if (n > 0 && s[-1] == '/') n--; + + if (n > 0) + on_open(p, n); + on_open_end(); + ADVANCE(s, 1); + mode = DATA; break; } if (*s == ' ') { + if (n > 0) + on_open(p, n); + while (*s == ' ') ADVANCE(s, 1); if (isalpha((unsigned char)*s)) mode = ATTR_NAME; - else if (*s == '/' && s[1] == '>') { - ADVANCE(s, 2); // ignore self-closing tags + else if (*s == '>') { + on_open_end(); + ADVANCE(s, 1); mode = DATA; - } + } else if (*s == '/' && s[1] == '>') { + on_open_end(); + ADVANCE(s, 2); + mode = DATA; + } else + errx(1, "Invalid character in open tag: %c", *s); + break; } n++; ADVANCE(s, 1); } - - if (n > 0) - on_open(p, n); - break; case TAG_CLOSE: p = s; @@ -120,6 +134,7 @@ void parse(const char *s) ADVANCE(s, 1); break; } + n++; ADVANCE(s, 1); } @@ -131,17 +146,18 @@ void parse(const char *s) if (*s == '=') { attr_name = p; attr_name_len = n; - mode = ATTR_VALUE; ADVANCE(s, 1); if (*s == '"' || *s == '\'') ADVANCE(s, 1); + mode = ATTR_VALUE; break; } if (*s == '>') { // <input disabled> on_attr(p, n, NULL, 0); - mode = DATA; + on_open_end(); ADVANCE(s, 1); + mode = DATA; break; } @@ -159,6 +175,7 @@ void parse(const char *s) ADVANCE(s, 1); break; } + n++; ADVANCE(s, 1); } @@ -181,6 +198,7 @@ void parse(const char *s) ADVANCE(s, 1); break; } + n++; ADVANCE(s, 1); } @@ -0,0 +1,101 @@ +/* Generated file, do not edit */ + +#ifndef TAGS_H +#define TAGS_H + +#include <string.h> + +typedef enum { + TAG_UNKNOWN, + TAG_TEXT, + TAG_A, + TAG_ARTICLE, + TAG_BODY, + TAG_DIV, + TAG_FOOTER, + TAG_H1, + TAG_HEAD, + TAG_HEADER, + TAG_HTML, + TAG_LI, + TAG_LINK, + TAG_META, + TAG_P, + TAG_TIME, + TAG_TITLE, + TAG_UL, +} tag_type; + +static const struct { + const char *name; + tag_type tag; +} tag_map[] = { + { "a", TAG_A }, + { "article", TAG_ARTICLE }, + { "body", TAG_BODY }, + { "div", TAG_DIV }, + { "footer", TAG_FOOTER }, + { "h1", TAG_H1 }, + { "head", TAG_HEAD }, + { "header", TAG_HEADER }, + { "html", TAG_HTML }, + { "li", TAG_LI }, + { "link", TAG_LINK }, + { "meta", TAG_META }, + { "p", TAG_P }, + { "time", TAG_TIME }, + { "title", TAG_TITLE }, + { "ul", TAG_UL }, +}; + +static inline tag_type +str_to_tag(const char *name, size_t len) +{ + int lo, hi, mid, cmp; + + lo = 0; + hi = 16 - 1; + + while (lo <= hi) { + mid = (lo + hi) / 2; + cmp = strncmp(name, tag_map[mid].name, len); + if (cmp == 0) { + if (tag_map[mid].name[len] == '\0') + return tag_map[mid].tag; + hi = mid - 1; + } else if (cmp < 0) + hi = mid - 1; + else + lo = mid + 1; + } + + return TAG_UNKNOWN; +} + +static inline const char * +tag_to_str(tag_type tag) +{ + switch (tag) { + case TAG_UNKNOWN: return "unknown"; + case TAG_TEXT: return "text"; + case TAG_A: return "a"; + case TAG_ARTICLE: return "article"; + case TAG_BODY: return "body"; + case TAG_DIV: return "div"; + case TAG_FOOTER: return "footer"; + case TAG_H1: return "h1"; + case TAG_HEAD: return "head"; + case TAG_HEADER: return "header"; + case TAG_HTML: return "html"; + case TAG_LI: return "li"; + case TAG_LINK: return "link"; + case TAG_META: return "meta"; + case TAG_P: return "p"; + case TAG_TIME: return "time"; + case TAG_TITLE: return "title"; + case TAG_UL: return "ul"; + default: return "unknown"; + } +} + +#endif /* TAGS_H */ diff --git a/tags.txt b/tags.txt new file mode 100644 index 0000000..e9e98b4 --- /dev/null +++ b/tags.txt @@ -0,0 +1,16 @@ +html +head +meta +title +link +body +header +h1 +a +article +ul +li +time +div +p +footer @@ -1,9 +1,40 @@ -<html> - <div class="header"> - <h1>My Journal</h1> - </div> - <div class="content" disabled> - <p>Hello World</p> - </div> +<!doctype html> +<html lang="en-us"> + <head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Home</title> + <link rel="stylesheet" href="/assets/css/main.css"> +</head> + + + + <body> + + <header> + <h1><a href="/">Journal</a></h1> + + +</header> + + + <article><ul id="post-list"> + + + <li> + <time>2026-05-01</time> + <a href="/log/vcs-1/" class="link-decor-none">Built and benchmarked Urn against Git</a> + </li> + +</ul> + +<footer> + <p>Built with <a href="https://github.com/ronv/minimalist" class="external" target="_blank" rel="noopener noreferrer">Minimalist</a>. + <a href="/cgi-bin/search.cgi">Search</a> + </p> +</footer> +</article> + + </body> </html> @@ -0,0 +1,51 @@ +#include <string.h> + +#include "mem.h" +#include "vec.h" + +#define DEFAULT_LEN 8 + +void vec_init(struct vec *v, size_t unit_size) +{ + v->data = NULL; + v->len = 0; + v->cap = 0; + v->unit_size = unit_size; +} + +void vec_push(struct vec *v, const void *item) +{ + void *target; + + if (v->len == v->cap) { + v->cap = (v->cap == 0) ? DEFAULT_LEN : v->cap * 2; + v->data = REALLOC(v->data, v->cap * v->unit_size); + } + + target = (char *)v->data + (v->len * v->unit_size); + memcpy(target, item, v->unit_size); + v->len++; +} + +void *vec_pop(struct vec *v) +{ + if (v->len == 0) + return NULL; + v->len--; + return (char *)v->data + (v->len * v->unit_size); +} + +void *vec_top(struct vec *v) +{ + if (v->len == 0) + return NULL; + return (char *)v->data + ((v->len - 1) * v->unit_size); +} + +void vec_free(struct vec *v) +{ + free(v->data); + v->data = NULL; + v->len = v->cap = 0; +} + @@ -0,0 +1,19 @@ +#ifndef VEC_H +#define VEC_H + +#include <stddef.h> + +struct vec { + void *data; + size_t len; + size_t cap; + size_t unit_size; +}; + +void vec_init(struct vec *v, size_t unit_size); +void vec_push(struct vec *v, const void *item); +void *vec_pop(struct vec *v); +void *vec_top(struct vec *v); +void vec_free(struct vec *v); + +#endif /* VEC_H */ |
