#include #include #include "wv_parse.h" #include "wv_vec.h" static void skip_whitespace(struct wv_tokenizer *t) { while (t->pos < t->len && isspace(t->src[t->pos])) t->pos++; } static int is_delim(char c) { return isspace(c) || c == '=' || c == '>' || c == '/'; } static wv_tag_id map_tag_name(const char *name, size_t len) { if (len == 3 && strncmp(name, "div", 3) == 0) return WV_TAG_DIV; if (len == 1 && strncmp(name, "a", 1) == 0) return WV_TAG_A; if (len == 2 && strncmp(name, "li", 2) == 0) return WV_TAG_LI; if (len == 2 && strncmp(name, "ul", 2) == 0) return WV_TAG_UL; if (len == 2 && strncmp(name, "h1", 2) == 0) return WV_TAG_H1; return WV_TAG_UNKNOWN; } void wv_tokenizer_init(struct wv_tokenizer *t, const char *src, size_t len) { t->src = src; t->len = len; t->pos = 0; t->state = WV_STATE_DATA; t->quote_char = '\0'; } struct wv_token wv_tokenizer_next(struct wv_tokenizer *t) { struct wv_token tok = {WV_TOK_EOF, NULL, 0}; if (t->pos >= t->len) return tok; switch (t->state) { case WV_STATE_DATA: if (t->src[t->pos] == '<') { t->pos++; skip_whitespace(t); if (t->pos < t->len && t->src[t->pos] == '/') { t->pos++; tok.type = WV_TOK_TAG_CLOSE; } else { tok.type = WV_TOK_TAG_OPEN; } tok.start = &t->src[t->pos]; while (t->pos < t->len && !is_delim(t->src[t->pos])) t->pos++; tok.len = &t->src[t->pos] - tok.start; t->state = WV_STATE_TAG; return tok; } /* Extract text until the next tag starts */ tok.type = WV_TOK_TEXT; tok.start = &t->src[t->pos]; while (t->pos < t->len && t->src[t->pos] != '<') t->pos++; tok.len = &t->src[t->pos] - tok.start; return tok; case WV_STATE_TAG: skip_whitespace(t); if (t->pos >= t->len) return tok; /* End of tag or self-closing tag */ if (t->src[t->pos] == '>' || (t->src[t->pos] == '/' && t->pos + 1 < t->len && t->src[t->pos+1] == '>')) { if (t->src[t->pos] == '/') t->pos++; t->pos++; t->state = WV_STATE_DATA; return wv_tokenizer_next(t); } /* Attribute Key */ tok.type = WV_TOK_ATTR_KEY; tok.start = &t->src[t->pos]; while (t->pos < t->len && !is_delim(t->src[t->pos])) t->pos++; tok.len = &t->src[t->pos] - tok.start; skip_whitespace(t); if (t->pos < t->len && t->src[t->pos] == '=') { t->pos++; t->state = WV_STATE_ATTR_VAL; } return tok; case WV_STATE_ATTR_VAL: skip_whitespace(t); if (t->pos < t->len && (t->src[t->pos] == '"' || t->src[t->pos] == '\'')) { t->quote_char = t->src[t->pos++]; tok.start = &t->src[t->pos]; while (t->pos < t->len && t->src[t->pos] != t->quote_char) t->pos++; tok.len = &t->src[t->pos] - tok.start; if (t->pos < t->len) t->pos++; } else { tok.start = &t->src[t->pos]; while (t->pos < t->len && !isspace(t->src[t->pos]) && t->src[t->pos] != '>') t->pos++; tok.len = &t->src[t->pos] - tok.start; } tok.type = WV_TOK_ATTR_VAL; t->state = WV_STATE_TAG; return tok; } return tok; } wv_ref wv_parse_document(struct wv_arena *arena, const char *src) { struct wv_parser p; struct wv_token tok; wv_ref root = 0; wv_ref last_el = 0; wv_tokenizer_init(&p.tokenizer, src, strlen(src)); p.arena = arena; wv_vec_init(&p.stack, sizeof(wv_ref)); while ((tok = wv_tokenizer_next(&p.tokenizer)).type != WV_TOK_EOF) { switch (tok.type) { case WV_TOK_TAG_OPEN: { wv_ref new_node = wv_node_new(p.arena, WV_NODE_ELEMENT); struct wv_node *n = (struct wv_node *)WV_ADDR(p.arena, new_node); n->u.element.tag_id = map_tag_name(tok.start, tok.len); /* Determine parent: either top of stack or this is the root */ wv_ref *parent = (wv_ref *)wv_vec_last(&p.stack); if (parent) wv_node_append(p.arena, *parent, new_node); else if (root == 0) root = new_node; wv_vec_push(&p.stack, &new_node); last_el = new_node; /* Store for potential attribute keys */ break; } case WV_TOK_TAG_CLOSE: wv_vec_pop(&p.stack); break; case WV_TOK_ATTR_KEY: { /* Next token MUST be ATTR_VAL if state changed */ struct wv_token val_tok = wv_tokenizer_next(&p.tokenizer); if (val_tok.type == WV_TOK_ATTR_VAL && last_el != 0) { wv_ref k = wv_arena_push_string(p.arena, tok.start, tok.len); wv_ref v = wv_arena_push_string(p.arena, val_tok.start, val_tok.len); wv_attr_set(p.arena, last_el, k, v); } break; } case WV_TOK_TEXT: { wv_ref *parent = (wv_ref *)wv_vec_last(&p.stack); if (!parent) break; wv_ref txt_node = wv_node_new(p.arena, WV_NODE_TEXT); struct wv_node *n = (struct wv_node *)WV_ADDR(p.arena, txt_node); n->u.text.str = wv_arena_push_string(p.arena, tok.start, tok.len); n->u.text.len = tok.len; wv_node_append(p.arena, *parent, txt_node); break; } default: break; } } wv_vec_free(&p.stack); return root; }