summary | refs | log | tree | commit | diff | stats
path: root/wv_parse.c
diff options
context:
space:
mode:
author: Sadeep Madurange <sadeep@asciimx.com> 2026-05-06 17:46:49 +0800
committer: Sadeep Madurange <sadeep@asciimx.com> 2026-05-06 17:46:49 +0800
commit: 8f0c3d4697742fb64cb1af8ba28fa2bb6f99de5a (patch)
tree: 1822d135ec879620361e1d80cb54a63d2d8d3602 /wv_parse.c
parent: fd2d93f4a97ab5a3bc18764c353b971b4035ac6a (diff)
download: web-view-8f0c3d4697742fb64cb1af8ba28fa2bb6f99de5a.tar.gz
Implemented tokenizer. [HEAD, master]
Diffstat (limited to 'wv_parse.c')
-rw-r--r-- wv_parse.c 207
1 files changed, 207 insertions, 0 deletions
diff --git a/wv_parse.c b/wv_parse.c
new file mode 100644
index 0000000..a94131e
--- /dev/null
+++ b/wv_parse.c
@@ -0,0 +1,207 @@
+#include <ctype.h>
+#include <string.h>
+
+#include "wv_parse.h"
+#include "wv_vec.h"
+
+/*
+ * Advance t->pos past any run of whitespace, never moving beyond t->len.
+ */
+static void skip_whitespace(struct wv_tokenizer *t)
+{
+	/*
+	 * The <ctype.h> classifiers require an argument representable as
+	 * unsigned char (or EOF). Plain char may be negative for bytes
+	 * >= 0x80, so cast before the call to avoid undefined behaviour.
+	 */
+	while (t->pos < t->len && isspace((unsigned char)t->src[t->pos]))
+		t->pos++;
+}
+
+/*
+ * Return non-zero if c ends a tag name or attribute key, i.e. it is
+ * whitespace, '=', '>' or '/'.
+ */
+static int is_delim(char c)
+{
+	/*
+	 * Cast before calling isspace(): passing a negative plain char
+	 * (any byte >= 0x80 where char is signed) is undefined behaviour.
+	 */
+	return isspace((unsigned char)c) || c == '=' || c == '>' || c == '/';
+}
+
+/*
+ * Translate a tag name of len bytes (not required to be NUL-terminated)
+ * into its wv_tag_id. Matching is exact and case-sensitive; any name that
+ * is not in the table below yields WV_TAG_UNKNOWN.
+ */
+static wv_tag_id map_tag_name(const char *name, size_t len)
+{
+	static const struct {
+		const char *text;
+		size_t size;
+		wv_tag_id id;
+	} known[] = {
+		{ "div", 3, WV_TAG_DIV },
+		{ "a",   1, WV_TAG_A   },
+		{ "li",  2, WV_TAG_LI  },
+		{ "ul",  2, WV_TAG_UL  },
+		{ "h1",  2, WV_TAG_H1  },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		/* Length must agree first so a prefix never matches. */
+		if (known[i].size != len)
+			continue;
+		if (strncmp(name, known[i].text, known[i].size) == 0)
+			return known[i].id;
+	}
+
+	return WV_TAG_UNKNOWN;
+}
+
+/*
+ * Prepare tokenizer t to scan the len bytes starting at src.
+ *
+ * The buffer is referenced, not copied: t keeps the src pointer as given.
+ * Scanning starts at offset 0 in WV_STATE_DATA with no quote character
+ * recorded.
+ */
+void wv_tokenizer_init(struct wv_tokenizer *t, const char *src,
+		       size_t len)
+{
+	/* Scanner state: outside any tag, no quote seen yet. */
+	t->state = WV_STATE_DATA;
+	t->quote_char = '\0';
+
+	/* Input window and read position. */
+	t->pos = 0;
+	t->len = len;
+	t->src = src;
+}
+
+/*
+ * Return the next token from the input and advance the tokenizer.
+ *
+ * Token text is not copied: tok.start points into t->src and tok.len is
+ * the number of bytes covered. Once the input is exhausted a token of
+ * type WV_TOK_EOF (start NULL, len 0) is returned.
+ *
+ * Tokens produced per state:
+ *   WV_STATE_DATA      WV_TOK_TAG_OPEN or WV_TOK_TAG_CLOSE carrying the tag
+ *                      name, or WV_TOK_TEXT covering everything up to the
+ *                      next '<'.
+ *   WV_STATE_TAG       WV_TOK_ATTR_KEY. A '>' (optionally preceded by '/')
+ *                      emits nothing and switches back to WV_STATE_DATA.
+ *   WV_STATE_ATTR_VAL  WV_TOK_ATTR_VAL, quoted or bare. Only entered after
+ *                      a key that is followed by '='.
+ */
+struct wv_token wv_tokenizer_next(struct wv_tokenizer *t)
+{
+	struct wv_token tok = {WV_TOK_EOF, NULL, 0};
+
+	if (t->pos >= t->len)
+		return tok;
+
+	switch (t->state) {
+	case WV_STATE_DATA:
+		if (t->src[t->pos] == '<') {
+			t->pos++;
+			skip_whitespace(t);
+
+			if (t->pos < t->len && t->src[t->pos] == '/') {
+				t->pos++;
+				tok.type = WV_TOK_TAG_CLOSE;
+			} else {
+				tok.type = WV_TOK_TAG_OPEN;
+			}
+
+			/* Tag name: everything up to the next delimiter. */
+			tok.start = &t->src[t->pos];
+			while (t->pos < t->len && !is_delim(t->src[t->pos]))
+				t->pos++;
+			tok.len = &t->src[t->pos] - tok.start;
+
+			t->state = WV_STATE_TAG;
+			return tok;
+		}
+
+		/* Extract text until the next tag starts */
+		tok.type = WV_TOK_TEXT;
+		tok.start = &t->src[t->pos];
+		while (t->pos < t->len && t->src[t->pos] != '<')
+			t->pos++;
+		tok.len = &t->src[t->pos] - tok.start;
+		return tok;
+
+	case WV_STATE_TAG:
+		for (;;) {
+			skip_whitespace(t);
+
+			if (t->pos >= t->len)
+				return tok;
+
+			if (t->src[t->pos] == '>') {
+				t->pos++;
+				t->state = WV_STATE_DATA;
+				/*
+				 * WV_STATE_DATA never recurses, so this call
+				 * nests at most one level deep.
+				 */
+				return wv_tokenizer_next(t);
+			}
+
+			if (t->src[t->pos] != '/')
+				break;
+
+			/*
+			 * '/' is a delimiter, so it can never start a key.
+			 * Consume it here: either it is the slash of "/>"
+			 * (the '>' is handled on the next pass) or it is a
+			 * stray slash. Leaving it in place would yield an
+			 * empty key without advancing, and the caller would
+			 * loop forever.
+			 */
+			t->pos++;
+		}
+
+		/* Attribute Key */
+		tok.type = WV_TOK_ATTR_KEY;
+		tok.start = &t->src[t->pos];
+		while (t->pos < t->len && !is_delim(t->src[t->pos]))
+			t->pos++;
+		tok.len = &t->src[t->pos] - tok.start;
+
+		/* Only a key followed by '=' has a value token after it. */
+		skip_whitespace(t);
+		if (t->pos < t->len && t->src[t->pos] == '=') {
+			t->pos++;
+			t->state = WV_STATE_ATTR_VAL;
+		}
+		return tok;
+
+	case WV_STATE_ATTR_VAL:
+		skip_whitespace(t);
+
+		if (t->pos < t->len && (t->src[t->pos] == '"' ||
+					t->src[t->pos] == '\'')) {
+			/* Quoted value: runs to the matching quote or EOF. */
+			t->quote_char = t->src[t->pos++];
+			tok.start = &t->src[t->pos];
+			while (t->pos < t->len &&
+			       t->src[t->pos] != t->quote_char)
+				t->pos++;
+			tok.len = &t->src[t->pos] - tok.start;
+			/* Step over the closing quote when there is one. */
+			if (t->pos < t->len)
+				t->pos++;
+		} else {
+			/*
+			 * Bare value: runs to whitespace or '>'. The cast
+			 * keeps isspace() defined for bytes >= 0x80 where
+			 * plain char is signed.
+			 */
+			tok.start = &t->src[t->pos];
+			while (t->pos < t->len &&
+			       !isspace((unsigned char)t->src[t->pos]) &&
+			       t->src[t->pos] != '>')
+				t->pos++;
+			tok.len = &t->src[t->pos] - tok.start;
+		}
+
+		tok.type = WV_TOK_ATTR_VAL;
+		t->state = WV_STATE_TAG;
+		return tok;
+	}
+
+	/* Unrecognised state: report end of input. */
+	return tok;
+}
+
+/*
+ * Parse the NUL-terminated markup in src into a node tree allocated from
+ * arena, and return a reference to the first top-level element.
+ *
+ * Returns 0 when arena or src is NULL, or when the input contains no
+ * opening tag.
+ *
+ * Behaviour worth knowing:
+ *   - A closing tag pops the innermost open element; tag names are not
+ *     compared. A closing tag with nothing open is ignored.
+ *   - Text outside any open element is dropped.
+ *   - Top-level elements after the first are created but are neither
+ *     returned nor linked to the returned root.
+ *   - An attribute without "=value" is skipped. Attributes written on a
+ *     closing tag are ignored.
+ */
+wv_ref wv_parse_document(struct wv_arena *arena, const char *src)
+{
+	struct wv_parser p;
+	struct wv_token tok;
+	wv_ref root = 0;
+	wv_ref last_el = 0;
+
+	/* strlen(NULL) is undefined; treat missing input as an empty tree. */
+	if (!arena || !src)
+		return 0;
+
+	wv_tokenizer_init(&p.tokenizer, src, strlen(src));
+	p.arena = arena;
+	wv_vec_init(&p.stack, sizeof(wv_ref));
+
+	while ((tok = wv_tokenizer_next(&p.tokenizer)).type != WV_TOK_EOF) {
+		switch (tok.type) {
+		case WV_TOK_TAG_OPEN: {
+			wv_ref new_node = wv_node_new(p.arena, WV_NODE_ELEMENT);
+			struct wv_node *n =
+				(struct wv_node *)WV_ADDR(p.arena, new_node);
+			n->u.element.tag_id = map_tag_name(tok.start, tok.len);
+
+			/* Determine parent: either top of stack or this is the root */
+			wv_ref *parent = (wv_ref *)wv_vec_last(&p.stack);
+			if (parent)
+				wv_node_append(p.arena, *parent, new_node);
+			else if (root == 0)
+				root = new_node;
+
+			wv_vec_push(&p.stack, &new_node);
+			last_el = new_node; /* Target for the attributes that follow */
+			break;
+		}
+
+		case WV_TOK_TAG_CLOSE:
+			/* A stray closing tag must not pop an empty stack. */
+			if (wv_vec_last(&p.stack))
+				wv_vec_pop(&p.stack);
+			/*
+			 * Attributes on a closing tag must not be attached
+			 * to the previously opened element.
+			 */
+			last_el = 0;
+			break;
+
+		case WV_TOK_ATTR_KEY: {
+			struct wv_token val_tok;
+
+			/*
+			 * The tokenizer enters WV_STATE_ATTR_VAL only when
+			 * the key was followed by '='. For a key without a
+			 * value the next token belongs to something else
+			 * (another key, text, a tag) and must be left for
+			 * the main loop instead of being consumed here.
+			 */
+			if (p.tokenizer.state != WV_STATE_ATTR_VAL)
+				break;
+
+			val_tok = wv_tokenizer_next(&p.tokenizer);
+			if (val_tok.type == WV_TOK_ATTR_VAL && last_el != 0) {
+				wv_ref k = wv_arena_push_string(p.arena,
+						tok.start, tok.len);
+				wv_ref v = wv_arena_push_string(p.arena,
+						val_tok.start, val_tok.len);
+				wv_attr_set(p.arena, last_el, k, v);
+			}
+			break;
+		}
+
+		case WV_TOK_TEXT: {
+			wv_ref *parent = (wv_ref *)wv_vec_last(&p.stack);
+			if (!parent)
+				break;
+
+			wv_ref txt_node = wv_node_new(p.arena, WV_NODE_TEXT);
+			wv_ref str = wv_arena_push_string(p.arena,
+					tok.start, tok.len);
+
+			/*
+			 * Resolve the node address only after the string has
+			 * been pushed. NOTE(review): this assumes an arena
+			 * allocation may relocate earlier storage, which
+			 * would leave a previously resolved pointer stale;
+			 * confirm against the arena implementation.
+			 */
+			struct wv_node *n =
+				(struct wv_node *)WV_ADDR(p.arena, txt_node);
+
+			n->u.text.str = str;
+			n->u.text.len = tok.len;
+
+			wv_node_append(p.arena, *parent, txt_node);
+			break;
+		}
+
+		default:
+			break;
+		}
+	}
+
+	wv_vec_free(&p.stack);
+
+	return root;
+}
+