diff options
| author | Sadeep Madurange <sadeep@asciimx.com> | 2026-05-06 17:46:49 +0800 |
|---|---|---|
| committer | Sadeep Madurange <sadeep@asciimx.com> | 2026-05-06 17:46:49 +0800 |
| commit | 8f0c3d4697742fb64cb1af8ba28fa2bb6f99de5a (patch) | |
| tree | 1822d135ec879620361e1d80cb54a63d2d8d3602 /wv_parse.c | |
| parent | fd2d93f4a97ab5a3bc18764c353b971b4035ac6a (diff) | |
| download | web-view-master.tar.gz | |
Diffstat (limited to 'wv_parse.c')
| -rw-r--r-- | wv_parse.c | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/wv_parse.c b/wv_parse.c new file mode 100644 index 0000000..a94131e --- /dev/null +++ b/wv_parse.c @@ -0,0 +1,207 @@ +#include <ctype.h> +#include <string.h> + +#include "wv_parse.h" +#include "wv_vec.h" + +static void skip_whitespace(struct wv_tokenizer *t) +{ + while (t->pos < t->len && isspace(t->src[t->pos])) + t->pos++; +} + +static int is_delim(char c) +{ + return isspace(c) || c == '=' || c == '>' || c == '/'; +} + +static wv_tag_id map_tag_name(const char *name, size_t len) +{ + if (len == 3 && strncmp(name, "div", 3) == 0) + return WV_TAG_DIV; + if (len == 1 && strncmp(name, "a", 1) == 0) + return WV_TAG_A; + if (len == 2 && strncmp(name, "li", 2) == 0) + return WV_TAG_LI; + if (len == 2 && strncmp(name, "ul", 2) == 0) + return WV_TAG_UL; + if (len == 2 && strncmp(name, "h1", 2) == 0) + return WV_TAG_H1; + + return WV_TAG_UNKNOWN; +} + +void wv_tokenizer_init(struct wv_tokenizer *t, const char *src, + size_t len) +{ + t->src = src; + t->len = len; + t->pos = 0; + t->state = WV_STATE_DATA; + t->quote_char = '\0'; +} + +struct wv_token wv_tokenizer_next(struct wv_tokenizer *t) +{ + struct wv_token tok = {WV_TOK_EOF, NULL, 0}; + + if (t->pos >= t->len) + return tok; + + switch (t->state) { + case WV_STATE_DATA: + if (t->src[t->pos] == '<') { + t->pos++; + skip_whitespace(t); + + if (t->pos < t->len && t->src[t->pos] == '/') { + t->pos++; + tok.type = WV_TOK_TAG_CLOSE; + } else { + tok.type = WV_TOK_TAG_OPEN; + } + + tok.start = &t->src[t->pos]; + while (t->pos < t->len && !is_delim(t->src[t->pos])) + t->pos++; + tok.len = &t->src[t->pos] - tok.start; + + t->state = WV_STATE_TAG; + return tok; + } + + /* Extract text until the next tag starts */ + tok.type = WV_TOK_TEXT; + tok.start = &t->src[t->pos]; + while (t->pos < t->len && t->src[t->pos] != '<') + t->pos++; + tok.len = &t->src[t->pos] - tok.start; + return tok; + + case WV_STATE_TAG: + skip_whitespace(t); + + if (t->pos >= t->len) + return tok; + + /* End of tag or self-closing tag */ + if (t->src[t->pos] == '>' || (t->src[t->pos] == '/' && + t->pos + 1 < t->len && t->src[t->pos+1] == '>')) { + + if (t->src[t->pos] == '/') + t->pos++; + t->pos++; + t->state = WV_STATE_DATA; + return wv_tokenizer_next(t); + } + + /* Attribute Key */ + tok.type = WV_TOK_ATTR_KEY; + tok.start = &t->src[t->pos]; + while (t->pos < t->len && !is_delim(t->src[t->pos])) + t->pos++; + tok.len = &t->src[t->pos] - tok.start; + + skip_whitespace(t); + if (t->pos < t->len && t->src[t->pos] == '=') { + t->pos++; + t->state = WV_STATE_ATTR_VAL; + } + return tok; + + case WV_STATE_ATTR_VAL: + skip_whitespace(t); + + if (t->pos < t->len && (t->src[t->pos] == '"' || + t->src[t->pos] == '\'')) { + t->quote_char = t->src[t->pos++]; + tok.start = &t->src[t->pos]; + while (t->pos < t->len && t->src[t->pos] != t->quote_char) + t->pos++; + tok.len = &t->src[t->pos] - tok.start; + if (t->pos < t->len) t->pos++; + } else { + tok.start = &t->src[t->pos]; + while (t->pos < t->len && !isspace(t->src[t->pos]) && + t->src[t->pos] != '>') + t->pos++; + tok.len = &t->src[t->pos] - tok.start; + } + + tok.type = WV_TOK_ATTR_VAL; + t->state = WV_STATE_TAG; + return tok; + } + + return tok; +} + +wv_ref wv_parse_document(struct wv_arena *arena, const char *src) +{ + struct wv_parser p; + struct wv_token tok; + wv_ref root = 0; + wv_ref last_el = 0; + + wv_tokenizer_init(&p.tokenizer, src, strlen(src)); + p.arena = arena; + wv_vec_init(&p.stack, sizeof(wv_ref)); + + while ((tok = wv_tokenizer_next(&p.tokenizer)).type != WV_TOK_EOF) { + switch (tok.type) { + case WV_TOK_TAG_OPEN: { + wv_ref new_node = wv_node_new(p.arena, WV_NODE_ELEMENT); + struct wv_node *n = (struct wv_node *)WV_ADDR(p.arena, new_node); + n->u.element.tag_id = map_tag_name(tok.start, tok.len); + + /* Determine parent: either top of stack or this is the root */ + wv_ref *parent = (wv_ref *)wv_vec_last(&p.stack); + if (parent) + wv_node_append(p.arena, *parent, new_node); + else if (root == 0) + root = new_node; + + wv_vec_push(&p.stack, &new_node); + last_el = new_node; /* Store for potential attribute keys */ + break; + } + + case WV_TOK_TAG_CLOSE: + wv_vec_pop(&p.stack); + break; + + case WV_TOK_ATTR_KEY: { + /* Next token MUST be ATTR_VAL if state changed */ + struct wv_token val_tok = wv_tokenizer_next(&p.tokenizer); + if (val_tok.type == WV_TOK_ATTR_VAL && last_el != 0) { + wv_ref k = wv_arena_push_string(p.arena, tok.start, tok.len); + wv_ref v = wv_arena_push_string(p.arena, val_tok.start, val_tok.len); + wv_attr_set(p.arena, last_el, k, v); + } + break; + } + + case WV_TOK_TEXT: { + wv_ref *parent = (wv_ref *)wv_vec_last(&p.stack); + if (!parent) + break; + + wv_ref txt_node = wv_node_new(p.arena, WV_NODE_TEXT); + struct wv_node *n = (struct wv_node *)WV_ADDR(p.arena, txt_node); + + n->u.text.str = wv_arena_push_string(p.arena, tok.start, tok.len); + n->u.text.len = tok.len; + + wv_node_append(p.arena, *parent, txt_node); + break; + } + + default: break; + } + } + + wv_vec_free(&p.stack); + + return root; +} + |
