summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Sadeep Madurange <sadeep@asciimx.com> 2026-05-24 18:00:38 +0800
committer: Sadeep Madurange <sadeep@asciimx.com> 2026-05-28 15:34:29 +0800
commit: 95428c41f0ee3ac108cf1a4acfaa67157ad954dc (patch)
tree: 8565027758cd931f49ebe78738fb41126254b253
parent: 7aea09077aad335ac32bfd9858ded60ffd4d8a5b (diff)
download: glacier-95428c41f0ee3ac108cf1a4acfaa67157ad954dc.tar.gz
Build DOM.
-rw-r--r--Makefile9
-rw-r--r--dom.c132
-rw-r--r--dom.h27
-rw-r--r--gentags.pl67
-rw-r--r--main.c36
-rw-r--r--mem.h10
-rw-r--r--parse.c40
-rw-r--r--tags.h101
-rw-r--r--tags.txt16
-rw-r--r--test.html45
-rw-r--r--vec.c51
-rw-r--r--vec.h19
12 files changed, 520 insertions, 33 deletions
diff --git a/Makefile b/Makefile
index c3b4451..660c67f 100644
--- a/Makefile
+++ b/Makefile
@@ -2,13 +2,18 @@ CC = cc
CFLAGS = -std=c11 -Wall -Wextra -Wpedantic -g -O0
LDFLAGS =
-HDRS = mem.h parse.h dom.h
-SRCS = parse.c dom.c main.c
+HDRS = mem.h vec.h parse.h tags.h dom.h
+SRCS = vec.c parse.c dom.c main.c
OBJS = $(SRCS:.c=.o)
TARGET = glacier
all: $(TARGET)
+tags.h: tags.txt gentags.pl
+ perl gentags.pl < tags.txt > tags.h
+
+$(OBJS): tags.h
+
$(TARGET): $(OBJS)
$(CC) $(OBJS) -o $(TARGET) $(LDFLAGS)
diff --git a/dom.c b/dom.c
index a4d75e1..71679bc 100644
--- a/dom.c
+++ b/dom.c
@@ -1,37 +1,147 @@
#include <stdio.h>
+#include <err.h>
+#include "mem.h"
#include "dom.h"
+#include "vec.h"
#include "parse.h"
-void init_dom(const char *html)
+/*
+ * Registry of every node returned by node_alloc(): `count` entries are in
+ * use out of `capacity` allocated slots. dom_free() walks it to release
+ * the nodes.
+ */
+static struct node **nodes;
+static size_t count;
+static size_t capacity;
+
+/*
+ * `stack` holds struct node * entries for the elements that are currently
+ * open; `root` is set by close_tag() when a node is closed while the stack
+ * is empty.
+ */
+static struct vec stack;
+static struct node *root;
+
+/*
+ * Allocate one zero-filled node and record it in the `nodes` registry.
+ * The registry starts at 64 slots and doubles whenever it is full.
+ * Allocation failures are handled inside the REALLOC/CALLOC wrappers.
+ */
+static inline struct node *node_alloc(void)
+{
+	struct node *fresh;
+
+	if (count == capacity) {
+		if (capacity == 0)
+			capacity = 64;
+		else
+			capacity *= 2;
+		nodes = REALLOC(nodes, capacity * sizeof(struct node *));
+	}
+
+	fresh = CALLOC(1, sizeof(struct node));
+	nodes[count] = fresh;
+	count++;
+	return fresh;
+}
+
+/*
+ * Parse `html` and build the DOM through the on_* parser callbacks.
+ *
+ * Returns the root node, or NULL when the parse never closed a top-level
+ * node. The tree stores pointers into `html` rather than copies, so the
+ * buffer must stay alive for as long as the tree is used.
+ */
+struct node *dom_init(const char *html)
{
+	/* Forget any root left over from a previous document. */
+	root = NULL;
+	vec_init(&stack, sizeof(struct node *));
parse(html);
+	return root;
+}
+
+/*
+ * Release everything owned by the DOM module: each node, the attribute
+ * list hanging off each node, the node registry and the open-element
+ * stack. Pointers previously returned by dom_init() are invalid afterwards.
+ */
+void dom_free(void)
+{
+	size_t i;
+	struct attr *a, *next;
+
+	for (i = 0; i < count; i++) {
+		/* Attributes are allocated separately in on_attr(). */
+		for (a = nodes[i]->attrs; a; a = next) {
+			next = a->next;
+			free(a);
+		}
+		free(nodes[i]);
+	}
+	free(nodes);
+	nodes = NULL;
+	count = 0;
+	capacity = 0;
+	root = NULL;
+
+	vec_free(&stack);
+}
+
+/*
+ * Return 1 for tags that never get a closing tag (currently meta and
+ * link), 0 for everything else.
+ */
+static inline int is_self_closing(tag_type tag)
+{
+	return tag == TAG_META || tag == TAG_LINK;
+}
+
+/*
+ * Link a finished node into the tree. The node is appended as the last
+ * child of the element currently on top of the open-element stack; when
+ * the stack is empty the node becomes `root` instead.
+ *
+ * NOTE(review): every node closed with an empty stack overwrites `root`,
+ * so the last such node wins -- confirm that is intended.
+ */
+static inline void close_tag(struct node *v)
+{
+	struct node **slot;
+	struct node *owner;
+
+	slot = vec_top(&stack);
+	if (slot == NULL) {
+		root = v;
+		return;
+	}
+
+	owner = *slot;
+	v->parent = owner;
+
+	if (owner->first_child)
+		owner->last_child->next_sibling = v;
+	else
+		owner->first_child = v;
+	owner->last_child = v;
}
-/* Parser event handlers */
extern void on_open(const char *tag, size_t n)
{
- printf("Tag opened: %.*s\n", (int)n, tag);
+ struct node *v;
+
+ v = node_alloc();
+ v->tag = str_to_tag(tag, n);
+ vec_push(&stack, &v);
+}
+
+/*
+ * Parser callback: the '>' ending an opening tag was reached. Elements
+ * that take no closing tag (see is_self_closing()) are popped and linked
+ * into the tree right away.
+ *
+ * The parser also calls this when the tag name was empty and on_open()
+ * never ran (e.g. "<>"), so an empty stack is tolerated rather than
+ * dereferenced.
+ */
+extern void on_open_end(void)
+{
+	void *top;
+	struct node *v;
+
+	top = vec_top(&stack);
+	if (!top)
+		return;
+
+	v = *(struct node **)top;
+	if (is_self_closing(v->tag)) {
+		/* Pop first: close_tag() looks for the parent on the stack. */
+		vec_pop(&stack);
+		close_tag(v);
+	}
}
extern void on_close(const char *tag, size_t n)
{
- printf("Tag closed: %.*s\n", (int)n, tag);
+ tag_type type;
+ struct node *top;
+
+ type = str_to_tag(tag, n);
+ top = *(struct node **)vec_top(&stack);
+ if (top->tag != type)
+ errx(1, "Unmatched closing tag: %.*s", (int)n, tag);
+
+ top = *(struct node **)vec_pop(&stack);
+ close_tag(top);
}
extern void on_text(const char *text, size_t n)
{
- printf("Text: %.*s\n", (int)n, text);
+ struct node *v;
+
+ v = node_alloc();
+ v->tag = TAG_TEXT;
+ v->text = text;
+ v->textlen = n;
+ close_tag(v);
}
-extern void on_attr(const char *name, size_t nname, const char *val,
- size_t nval)
+/*
+ * Parser callback: an attribute of the element being opened was read.
+ * `val` is NULL with `nval` 0 for a valueless attribute. Name and value
+ * are stored as pointer+length, not copied.
+ *
+ * The new attribute is pushed at the head of the element's list, so the
+ * list ends up in reverse source order. Aborts via errx() when no element
+ * is open.
+ */
+extern void on_attr(const char *name, size_t nname,
+		const char *val, size_t nval)
{
- printf("Attribute: name=%.*s", (int)nname, name);
+	void *top;
+	struct node *v;
+	struct attr *a;
- if (val && nval > 0)
- printf(", value=%.*s", (int)nval, val);
+
+	/* Check before allocating so the error path does not leak. */
+	top = vec_top(&stack);
+	if (!top)
+		errx(1, "Attribute outside of tag: %.*s", (int)nname, name);
+	v = *(struct node **)top;
+
+	a = MALLOC(sizeof(*a));
+	a->key = name;
+	a->keylen = nname;
+	a->val = val;
+	a->vallen = nval;
- printf("\n");
+	a->next = v->attrs;
+	v->attrs = a;
}
diff --git a/dom.h b/dom.h
index d336019..ae7c953 100644
--- a/dom.h
+++ b/dom.h
@@ -1,6 +1,31 @@
#ifndef DOM_H
#define DOM_H
-void init_dom(const char *html);
+#include "tags.h"
+
+/*
+ * One attribute of an element, kept in a singly linked list headed by
+ * node.attrs. key and val are pointer+length pairs; val is NULL with
+ * vallen 0 for a valueless attribute such as <input disabled>.
+ */
+struct attr {
+ const char *key;
+ size_t keylen;
+ const char *val;
+ size_t vallen;
+ struct attr *next;
+};
+
+/*
+ * One DOM node: an element (tag names a known tag or TAG_UNKNOWN) or a
+ * run of text (tag == TAG_TEXT).
+ */
+struct node {
+ tag_type tag;
+
+ /* Set for TAG_TEXT nodes only: pointer+length of the text. */
+ const char *text;
+ size_t textlen;
+
+ /* Head of the attribute list; NULL when there are none. */
+ struct attr *attrs;
+
+ /* Tree links; children form a list chained through next_sibling. */
+ struct node *parent;
+ struct node *first_child;
+ struct node *last_child;
+ struct node *next_sibling;
+};
+
+/* Parse html and return the root of the resulting tree. */
+struct node *dom_init(const char *html);
+/* Free the nodes and bookkeeping created by dom_init(). */
+void dom_free(void);
#endif /* DOM_H */
diff --git a/gentags.pl b/gentags.pl
new file mode 100644
index 0000000..6c5b58b
--- /dev/null
+++ b/gentags.pl
@@ -0,0 +1,67 @@
+#!/usr/bin/perl
+#
+# Generate tags.h from a list of tag names, one per line, read from STDIN.
+# The header, written to STDOUT, contains the tag_type enum, a name->enum
+# table, and the str_to_tag()/tag_to_str() helpers.
+#
+use strict;
+use warnings;
+
+# Drop blank lines and sort the names. The generated str_to_tag() binary
+# searches the table, so the table must be emitted in sorted order.
+my @tags = sort grep { /\S/ } map { chomp; $_ } <STDIN>;
+
+# Length of the longest name, used to align the table columns.
+my $maxlen = (sort { $b <=> $a } map { length($_) } @tags)[0];
+
+print "/* Generated file, do not edit */\n\n";
+print "#ifndef TAGS_H\n";
+print "#define TAGS_H\n\n";
+print "#include <string.h>\n\n";
+
+# Enum: the two built-in kinds first, then one constant per tag.
+print "typedef enum {\n";
+print "\tTAG_UNKNOWN,\n";
+print "\tTAG_TEXT,\n";
+for my $tag (@tags) {
+ printf "\tTAG_%s,\n", uc($tag);
+}
+print "} tag_type;\n\n";
+
+# Lookup table mapping each name to its enum constant.
+print "static const struct {\n";
+print "\tconst char *name;\n";
+print "\ttag_type tag;\n";
+print "} tag_map[] = {\n";
+for my $tag (@tags) {
+ printf "\t{ \"%-*s\tTAG_%s },\n", $maxlen + 2, "$tag\",", uc($tag);
+}
+print "};\n\n";
+
+# Number of table entries; baked into the generated search bounds.
+my $n = scalar @tags;
+
+# str_to_tag(): binary search over tag_map for a name of the given length.
+print "static inline tag_type\n";
+print "str_to_tag(const char *name, size_t len)\n";
+print "{\n";
+print "\tint lo, hi, mid, cmp;\n\n";
+print "\tlo = 0;\n";
+print "\thi = $n - 1;\n\n";
+print "\twhile (lo <= hi) {\n";
+print "\t\tmid = (lo + hi) / 2;\n";
+print "\t\tcmp = strncmp(name, tag_map[mid].name, len);\n";
+print "\t\tif (cmp == 0) {\n";
+print "\t\t\tif (tag_map[mid].name[len] == '\\0')\n";
+print "\t\t\t\treturn tag_map[mid].tag;\n";
+print "\t\t\thi = mid - 1;\n";
+print "\t\t} else if (cmp < 0)\n";
+print "\t\t\thi = mid - 1;\n";
+print "\t\telse\n";
+print "\t\t\tlo = mid + 1;\n";
+print "\t}\n\n";
+print "\treturn TAG_UNKNOWN;\n";
+print "}\n\n";
+
+# tag_to_str(): switch mapping each enum constant back to its name.
+print "static inline const char *\n";
+print "tag_to_str(tag_type tag)\n";
+print "{\n";
+print "\tswitch (tag) {\n";
+print "\tcase TAG_UNKNOWN: return \"unknown\";\n";
+print "\tcase TAG_TEXT: return \"text\";\n";
+for my $tag (@tags) {
+ printf "\tcase TAG_%s: return \"%s\";\n", uc($tag), $tag;
+}
+print "\tdefault: return \"unknown\";\n";
+print "\t}\n";
+print "}\n\n";
+
+print "#endif /* TAGS_H */\n";
diff --git a/main.c b/main.c
index 00a184a..faeb618 100644
--- a/main.c
+++ b/main.c
@@ -3,9 +3,41 @@
#include "mem.h"
#include "dom.h"
+#include "tags.h"
+
+/*
+ * Print `n` and its subtree as an indented tree. `prefix` is the text
+ * printed before this node's connector; `last` is non-zero when `n` is
+ * its parent's final child, which selects the corner connector. Text
+ * nodes print their text, other nodes print their tag name.
+ */
+static void node_print(const struct node *n, const char *prefix, int last)
+{
+	char new_prefix[256];
+	const char *connector;
+	const char *extension;
+	struct node *c;
+
+	if (last) {
+		connector = "└── ";
+		extension = " ";
+	} else {
+		connector = "├── ";
+		extension = "│ ";
+	}
+
+	printf("%s%s", prefix, connector);
+
+	if (n->tag != TAG_TEXT)
+		printf("%s\n", tag_to_str(n->tag));
+	else
+		printf("%.*s\n", (int)n->textlen, n->text);
+
+	/* Children are printed under this node's prefix plus one more level. */
+	snprintf(new_prefix, sizeof(new_prefix), "%s%s", prefix, extension);
+
+	c = n->first_child;
+	while (c) {
+		node_print(c, new_prefix, c->next_sibling == NULL);
+		c = c->next_sibling;
+	}
+}
+
+/*
+ * Print the tree rooted at `root`: the root's tag name on the first line,
+ * then each child subtree via node_print(). Prints nothing when `root` is
+ * NULL, which is what dom_init() returns if no root node was produced.
+ */
+void dom_print(const struct node *root)
+{
+	struct node *c;
+
+	if (!root)
+		return;
+
+	printf("%s\n", tag_to_str(root->tag));
+	for (c = root->first_child; c; c = c->next_sibling)
+		node_print(c, "", c->next_sibling == NULL);
+}
int main(int argc, char *argv[])
{
+ struct node *root;
+
if (argc < 2)
errx(1, "usage: glacier <file>");
@@ -27,8 +59,10 @@ int main(int argc, char *argv[])
html[len] = '\0';
fclose(file);
- init_dom(html);
+ root = dom_init(html);
+ dom_print(root);
+ dom_free();
free(html);
return 0;
diff --git a/mem.h b/mem.h
index 032740c..2cdb36e 100644
--- a/mem.h
+++ b/mem.h
@@ -5,6 +5,7 @@
#include <stdlib.h>
#define MALLOC(s) xmalloc((s), __FILE__, __LINE__)
+#define CALLOC(n, s) xcalloc((n), (s), __FILE__, __LINE__)
#define REALLOC(p, s) xrealloc((p), (s), __FILE__, __LINE__)
static inline void *xmalloc(size_t s, const char *file, int line)
@@ -16,6 +17,15 @@ static inline void *xmalloc(size_t s, const char *file, int line)
return p;
}
+/*
+ * calloc() wrapper that does not return on failure: it exits through
+ * err(), reporting the file and line of the call site. Use via the
+ * CALLOC() macro.
+ */
+static inline void *xcalloc(size_t n, size_t s, const char *file, int line)
+{
+	void *mem = calloc(n, s);
+
+	if (mem == NULL)
+		err(1, "%s:%d: calloc", file, line);
+	return mem;
+}
+
static inline void *xrealloc(void *ptr, size_t s, const char *file, int line)
{
void *p;
diff --git a/parse.c b/parse.c
index a9998e3..f204440 100644
--- a/parse.c
+++ b/parse.c
@@ -1,4 +1,5 @@
#include <ctype.h>
+#include <err.h>
#include <stddef.h>
#include "parse.h"
@@ -22,9 +23,11 @@ typedef enum {
} parse_mode;
extern void on_open(const char *tag, size_t n);
+extern void on_open_end(void);
extern void on_close(const char *tag, size_t n);
extern void on_text(const char *text, size_t n);
-extern void on_attr(const char *name, size_t nname, const char *val, size_t nval);
+extern void on_attr(const char *name, size_t nname,
+ const char *val, size_t nval);
void parse(const char *s)
{
@@ -82,33 +85,44 @@ void parse(const char *s)
n = 0;
while (*s) {
if (*s == '>') {
- mode = DATA;
if (n > 0 && s[-1] == '/')
n--;
+
+ if (n > 0)
+ on_open(p, n);
+ on_open_end();
+
ADVANCE(s, 1);
+ mode = DATA;
break;
}
if (*s == ' ') {
+ if (n > 0)
+ on_open(p, n);
+
while (*s == ' ')
ADVANCE(s, 1);
if (isalpha((unsigned char)*s))
mode = ATTR_NAME;
- else if (*s == '/' && s[1] == '>') {
- ADVANCE(s, 2); // ignore self-closing tags
+ else if (*s == '>') {
+ on_open_end();
+ ADVANCE(s, 1);
mode = DATA;
- }
+ } else if (*s == '/' && s[1] == '>') {
+ on_open_end();
+ ADVANCE(s, 2);
+ mode = DATA;
+ } else
+ errx(1, "Invalid character in open tag: %c", *s);
+
break;
}
n++;
ADVANCE(s, 1);
}
-
- if (n > 0)
- on_open(p, n);
-
break;
case TAG_CLOSE:
p = s;
@@ -120,6 +134,7 @@ void parse(const char *s)
ADVANCE(s, 1);
break;
}
+
n++;
ADVANCE(s, 1);
}
@@ -131,17 +146,18 @@ void parse(const char *s)
if (*s == '=') {
attr_name = p;
attr_name_len = n;
- mode = ATTR_VALUE;
ADVANCE(s, 1);
if (*s == '"' || *s == '\'')
ADVANCE(s, 1);
+ mode = ATTR_VALUE;
break;
}
if (*s == '>') { // <input disabled>
on_attr(p, n, NULL, 0);
- mode = DATA;
+ on_open_end();
ADVANCE(s, 1);
+ mode = DATA;
break;
}
@@ -159,6 +175,7 @@ void parse(const char *s)
ADVANCE(s, 1);
break;
}
+
n++;
ADVANCE(s, 1);
}
@@ -181,6 +198,7 @@ void parse(const char *s)
ADVANCE(s, 1);
break;
}
+
n++;
ADVANCE(s, 1);
}
diff --git a/tags.h b/tags.h
new file mode 100644
index 0000000..4889686
--- /dev/null
+++ b/tags.h
@@ -0,0 +1,101 @@
+/* Generated file, do not edit */
+
+#ifndef TAGS_H
+#define TAGS_H
+
+#include <string.h>
+
+/*
+ * Node kinds. TAG_UNKNOWN and TAG_TEXT are always present; the remaining
+ * constants correspond one-to-one to the entries of tag_map below.
+ */
+typedef enum {
+ TAG_UNKNOWN,
+ TAG_TEXT,
+ TAG_A,
+ TAG_ARTICLE,
+ TAG_BODY,
+ TAG_DIV,
+ TAG_FOOTER,
+ TAG_H1,
+ TAG_HEAD,
+ TAG_HEADER,
+ TAG_HTML,
+ TAG_LI,
+ TAG_LINK,
+ TAG_META,
+ TAG_P,
+ TAG_TIME,
+ TAG_TITLE,
+ TAG_UL,
+} tag_type;
+
+/*
+ * Tag name -> enum table. Entries are sorted by name; str_to_tag() relies
+ * on that order for its binary search.
+ */
+static const struct {
+ const char *name;
+ tag_type tag;
+} tag_map[] = {
+ { "a", TAG_A },
+ { "article", TAG_ARTICLE },
+ { "body", TAG_BODY },
+ { "div", TAG_DIV },
+ { "footer", TAG_FOOTER },
+ { "h1", TAG_H1 },
+ { "head", TAG_HEAD },
+ { "header", TAG_HEADER },
+ { "html", TAG_HTML },
+ { "li", TAG_LI },
+ { "link", TAG_LINK },
+ { "meta", TAG_META },
+ { "p", TAG_P },
+ { "time", TAG_TIME },
+ { "title", TAG_TITLE },
+ { "ul", TAG_UL },
+};
+
+/*
+ * Look up the tag whose name is exactly the first `len` bytes of `name`
+ * (`name` need not be NUL-terminated at `len`). Returns TAG_UNKNOWN when
+ * no table entry matches. The comparison is case-sensitive.
+ */
+static inline tag_type
+str_to_tag(const char *name, size_t len)
+{
+ int lo, hi, mid, cmp;
+
+ lo = 0;
+ /* 16 is the number of entries in tag_map. */
+ hi = 16 - 1;
+
+ while (lo <= hi) {
+ mid = (lo + hi) / 2;
+ cmp = strncmp(name, tag_map[mid].name, len);
+ if (cmp == 0) {
+ /* Exact match only if the table name also ends at len. */
+ if (tag_map[mid].name[len] == '\0')
+ return tag_map[mid].tag;
+ /* name is a proper prefix of the entry, so it sorts before it. */
+ hi = mid - 1;
+ } else if (cmp < 0)
+ hi = mid - 1;
+ else
+ lo = mid + 1;
+ }
+
+ return TAG_UNKNOWN;
+}
+
+/*
+ * Return the printable name for `tag`. TAG_TEXT yields "text"; TAG_UNKNOWN
+ * and any value outside the enum yield "unknown". The returned strings are
+ * literals and must not be freed.
+ */
+static inline const char *
+tag_to_str(tag_type tag)
+{
+ switch (tag) {
+ case TAG_UNKNOWN: return "unknown";
+ case TAG_TEXT: return "text";
+ case TAG_A: return "a";
+ case TAG_ARTICLE: return "article";
+ case TAG_BODY: return "body";
+ case TAG_DIV: return "div";
+ case TAG_FOOTER: return "footer";
+ case TAG_H1: return "h1";
+ case TAG_HEAD: return "head";
+ case TAG_HEADER: return "header";
+ case TAG_HTML: return "html";
+ case TAG_LI: return "li";
+ case TAG_LINK: return "link";
+ case TAG_META: return "meta";
+ case TAG_P: return "p";
+ case TAG_TIME: return "time";
+ case TAG_TITLE: return "title";
+ case TAG_UL: return "ul";
+ default: return "unknown";
+ }
+}
+
+#endif /* TAGS_H */
diff --git a/tags.txt b/tags.txt
new file mode 100644
index 0000000..e9e98b4
--- /dev/null
+++ b/tags.txt
@@ -0,0 +1,16 @@
+html
+head
+meta
+title
+link
+body
+header
+h1
+a
+article
+ul
+li
+time
+div
+p
+footer
diff --git a/test.html b/test.html
index 2642bfa..c3c4ae1 100644
--- a/test.html
+++ b/test.html
@@ -1,9 +1,40 @@
-<html>
- <div class="header">
- <h1>My Journal</h1>
- </div>
- <div class="content" disabled>
- <p>Hello World</p>
- </div>
+<!doctype html>
+<html lang="en-us">
+ <head>
+ <meta charset="utf-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1">
+ <title>Home</title>
+ <link rel="stylesheet" href="/assets/css/main.css">
+</head>
+
+
+
+ <body>
+
+ <header>
+ <h1><a href="/">Journal</a></h1>
+
+
+</header>
+
+
+ <article><ul id="post-list">
+
+
+ <li>
+ <time>2026-05-01</time>
+ <a href="/log/vcs-1/" class="link-decor-none">Built and benchmarked Urn against Git</a>
+ </li>
+
+</ul>
+
+<footer>
+ <p>Built with <a href="https://github.com/ronv/minimalist" class="external" target="_blank" rel="noopener noreferrer">Minimalist</a>.
+ <a href="/cgi-bin/search.cgi">Search</a>
+ </p>
+</footer>
+</article>
+
+ </body>
</html>
diff --git a/vec.c b/vec.c
new file mode 100644
index 0000000..719344e
--- /dev/null
+++ b/vec.c
@@ -0,0 +1,51 @@
+#include <string.h>
+
+#include "mem.h"
+#include "vec.h"
+
+#define DEFAULT_LEN 8
+
+/*
+ * Put `v` into the empty state for elements of `unit_size` bytes. No
+ * memory is allocated until the first vec_push().
+ */
+void vec_init(struct vec *v, size_t unit_size)
+{
+	v->unit_size = unit_size;
+	v->cap = 0;
+	v->len = 0;
+	v->data = NULL;
+}
+
+/*
+ * Append a copy of the unit_size bytes at `item`. Storage starts at
+ * DEFAULT_LEN elements and doubles when full; growing may move the
+ * buffer, which invalidates pointers returned earlier by vec_top() or
+ * vec_pop().
+ */
+void vec_push(struct vec *v, const void *item)
+{
+	char *slot;
+
+	if (v->len == v->cap) {
+		if (v->cap == 0)
+			v->cap = DEFAULT_LEN;
+		else
+			v->cap *= 2;
+		v->data = REALLOC(v->data, v->cap * v->unit_size);
+	}
+
+	slot = (char *)v->data + v->len * v->unit_size;
+	memcpy(slot, item, v->unit_size);
+	v->len += 1;
+}
+
+/*
+ * Remove the last element and return a pointer to its bytes inside the
+ * vector's storage, or NULL when the vector is empty. The bytes are not
+ * copied out; a later vec_push() may overwrite them.
+ */
+void *vec_pop(struct vec *v)
+{
+	if (v->len > 0) {
+		v->len -= 1;
+		return (char *)v->data + v->len * v->unit_size;
+	}
+	return NULL;
+}
+
+/*
+ * Return a pointer to the last element without removing it, or NULL when
+ * the vector is empty.
+ */
+void *vec_top(struct vec *v)
+{
+	size_t last;
+
+	if (!v->len)
+		return NULL;
+
+	last = v->len - 1;
+	return (char *)v->data + last * v->unit_size;
+}
+
+/*
+ * Release the storage and return `v` to the empty state. unit_size is
+ * left untouched, so the vector can be pushed to again.
+ */
+void vec_free(struct vec *v)
+{
+	free(v->data);
+	v->cap = 0;
+	v->len = 0;
+	v->data = NULL;
+}
+
diff --git a/vec.h b/vec.h
new file mode 100644
index 0000000..df00da2
--- /dev/null
+++ b/vec.h
@@ -0,0 +1,19 @@
+#ifndef VEC_H
+#define VEC_H
+
+#include <stddef.h>
+
+/*
+ * Growable array of fixed-size elements stored by value.
+ * data: element storage (NULL while empty); len: elements in use;
+ * cap: elements allocated; unit_size: size of one element in bytes.
+ */
+struct vec {
+ void *data;
+ size_t len;
+ size_t cap;
+ size_t unit_size;
+};
+
+/* Reset v to empty for elements of unit_size bytes. */
+void vec_init(struct vec *v, size_t unit_size);
+/* Append a copy of the unit_size bytes at item. */
+void vec_push(struct vec *v, const void *item);
+/* Remove the last element; returns a pointer to it, or NULL if empty. */
+void *vec_pop(struct vec *v);
+/* Return a pointer to the last element, or NULL if empty. */
+void *vec_top(struct vec *v);
+/* Free the storage and leave v empty. */
+void vec_free(struct vec *v);
+
+#endif /* VEC_H */