From 7aea09077aad335ac32bfd9858ded60ffd4d8a5b Mon Sep 17 00:00:00 2001
From: Sadeep Madurange
Date: Sun, 17 May 2026 13:25:17 +0800
Subject: Parse HTML.

---
Review notes on this revision:
- main.c: open argv[1] (the only path unveiled) instead of the hard-coded
  "test.html"; check unveil, pledge, fopen, fseek, ftell and fread.
- parse.c: reaching the end of input no longer jumps out of parse() before
  the pending on_text()/on_open() event is reported; any whitespace (not
  only ' ') separates attributes; a valueless attribute ends at whitespace;
  a quoted value ends at its own quote, an unquoted one at whitespace or '>'.
- parse.h: declare the on_*() handlers so dom.c is checked against them.
- System header names (lost in the copy this was taken from) are filled in
  from what each file uses; index lines still name the original blobs.

 Makefile  |   4 +-
 dom.c     |  42 ++++++++++++
 dom.h     |   6 ++
 main.c    |  45 ++++++++++++-
 parse.c   | 227 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 parse.h   |  19 ++++++
 test.html |   9 +++
 7 files changed, 349 insertions(+), 3 deletions(-)
 create mode 100644 dom.c
 create mode 100644 dom.h
 create mode 100644 parse.c
 create mode 100644 parse.h
 create mode 100644 test.html

diff --git a/Makefile b/Makefile
index 35d0540..c3b4451 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@ CC = cc
 CFLAGS = -std=c11 -Wall -Wextra -Wpedantic -g -O0
 LDFLAGS =
 
-HDRS = mem.h
-SRCS = main.c
+HDRS = mem.h parse.h dom.h
+SRCS = parse.c dom.c main.c
 OBJS = $(SRCS:.c=.o)
 
 TARGET = glacier
diff --git a/dom.c b/dom.c
new file mode 100644
index 0000000..a4d75e1
--- /dev/null
+++ b/dom.c
@@ -0,0 +1,42 @@
+#include <stdio.h>
+
+#include "dom.h"
+#include "parse.h"
+
+/* Run the parser over html; the handlers below receive its events. */
+void init_dom(const char *html)
+{
+	parse(html);
+}
+
+/*
+ * Parser event handlers.  Each one prints its event to stdout.  The strings
+ * arrive as (pointer, length) pairs that are not NUL-terminated, hence the
+ * "%.*s" conversions.
+ */
+void on_open(const char *tag, size_t n)
+{
+	printf("Tag opened: %.*s\n", (int)n, tag);
+}
+
+void on_close(const char *tag, size_t n)
+{
+	printf("Tag closed: %.*s\n", (int)n, tag);
+}
+
+void on_text(const char *text, size_t n)
+{
+	printf("Text: %.*s\n", (int)n, text);
+}
+
+/* val is NULL for an attribute that has no value. */
+void on_attr(const char *name, size_t nname, const char *val,
+    size_t nval)
+{
+	printf("Attribute: name=%.*s", (int)nname, name);
+
+	if (val && nval > 0)
+		printf(", value=%.*s", (int)nval, val);
+
+	printf("\n");
+}
diff --git a/dom.h b/dom.h
new file mode 100644
index 0000000..d336019
--- /dev/null
+++ b/dom.h
@@ -0,0 +1,6 @@
+#ifndef DOM_H
+#define DOM_H
+
+void init_dom(const char *html);
+
+#endif /* DOM_H */
diff --git a/main.c b/main.c
index 31dbf45..00a184a 100644
--- a/main.c
+++ b/main.c
@@ -1,4 +1,47 @@
-int main(void)
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "mem.h"
+#include "dom.h"
+
+int main(int argc, char *argv[])
 {
+	FILE *file;
+	char *html;
+	long len;
+
+	if (argc < 2)
+		errx(1, "usage: glacier <file>");
+
+	/* Sandbox (OpenBSD): only the input file is visible, read-only. */
+	if (unveil(argv[1], "r") == -1)
+		err(1, "unveil");
+	if (unveil(NULL, NULL) == -1)
+		err(1, "unveil");
+	if (pledge("stdio rpath", NULL) == -1)
+		err(1, "pledge");
+
+	/* Read the file named on the command line, the only path unveiled. */
+	file = fopen(argv[1], "rb");
+	if (!file)
+		err(1, "%s", argv[1]);
+
+	if (fseek(file, 0, SEEK_END) == -1 || (len = ftell(file)) == -1 ||
+	    fseek(file, 0, SEEK_SET) == -1)
+		err(1, "%s", argv[1]);
+
+	/* NOTE(review): confirm that mem.h's MALLOC never returns NULL. */
+	html = MALLOC((size_t)len + 1);
+	if (fread(html, 1, (size_t)len, file) != (size_t)len)
+		errx(1, "%s: short read", argv[1]);
+	html[len] = '\0';
+	fclose(file);
+
+	init_dom(html);
+
+	free(html);
+
 	return 0;
 }
diff --git a/parse.c b/parse.c
new file mode 100644
index 0000000..a9998e3
--- /dev/null
+++ b/parse.c
@@ -0,0 +1,227 @@
+#include <ctype.h>
+#include <stddef.h>
+
+#include "parse.h"
+
+typedef enum {
+	DATA = 1,
+	TAG_OPEN,
+	TAG_CLOSE,
+	ATTR_NAME,
+	ATTR_VALUE,
+	DOCTYPE,
+	COMMENT
+} parse_mode;
+
+/*
+ * Step s forward by at most n characters, stopping early at the terminating
+ * NUL so that the caller's loops end normally and still report what they
+ * have scanned.
+ */
+static const char *advance(const char *s, size_t n)
+{
+	while (n-- > 0 && *s)
+		s++;
+	return s;
+}
+
+/*
+ * Return nonzero if c ends an attribute value: the matching quote for a
+ * quoted value (quote != 0), otherwise '>' or whitespace.
+ */
+static int ends_value(char c, char quote)
+{
+	if (quote)
+		return c == quote;
+	return c == '>' || isspace((unsigned char)c);
+}
+
+/*
+ * Scan the NUL-terminated HTML text s and report tags, attributes and text
+ * through on_open(), on_close(), on_attr() and on_text().  A NULL s is
+ * ignored.  Doctype declarations and comments are skipped, and text runs
+ * consisting only of whitespace are not reported.
+ */
+void parse(const char *s)
+{
+	int blank;
+	char quote;
+	parse_mode mode;
+	size_t n, attr_name_len;
+	const char *p, *attr_name;
+
+	if (!s)
+		return;
+
+	mode = DATA;
+	quote = '\0';
+	attr_name = NULL;
+	attr_name_len = 0;
+
+	while (*s) {
+		switch (mode) {
+		case DATA:
+			p = s;
+			n = 0;
+			blank = 1;
+			while (*s) {
+				if (*s == '<') {
+					if (isalpha((unsigned char)s[1])) {
+						mode = TAG_OPEN;
+						s = advance(s, 1);
+						break;
+					} else if (s[1] == '/') {
+						mode = TAG_CLOSE;
+						s = advance(s, 2);
+						break;
+					} else if (s[1] == '!') {
+						if (s[2] == '-' && s[3] == '-') {
+							mode = COMMENT;
+							s = advance(s, 3);
+						} else {
+							mode = DOCTYPE;
+							s = advance(s, 2);
+						}
+						break;
+					}
+				}
+
+				if (!isspace((unsigned char)*s))
+					blank = 0;
+
+				n++;
+				s++;
+			}
+
+			if (n > 0 && !blank)
+				on_text(p, n);
+
+			break;
+		case TAG_OPEN:
+			/* Also re-entered between attributes, with n == 0. */
+			p = s;
+			n = 0;
+			while (*s) {
+				if (*s == '>') {
+					mode = DATA;
+					if (n > 0 && s[-1] == '/')
+						n--;	/* the '/' of "<br/>" */
+					s++;
+					break;
+				}
+
+				if (isspace((unsigned char)*s)) {
+					while (isspace((unsigned char)*s))
+						s++;
+
+					if (isalpha((unsigned char)*s))
+						mode = ATTR_NAME;
+					else if (*s == '/' && s[1] == '>') {
+						s += 2;	/* ignore self-closing tags */
+						mode = DATA;
+					}
+					break;
+				}
+
+				n++;
+				s++;
+			}
+
+			if (n > 0)
+				on_open(p, n);
+
+			break;
+		case TAG_CLOSE:
+			p = s;
+			n = 0;
+			while (*s) {
+				if (*s == '>') {
+					on_close(p, n);
+					mode = DATA;
+					s++;
+					break;
+				}
+				n++;
+				s++;
+			}
+			break;
+		case ATTR_NAME:
+			p = s;
+			n = 0;
+			while (*s) {
+				if (*s == '=') {
+					attr_name = p;
+					attr_name_len = n;
+					mode = ATTR_VALUE;
+					s++;
+					quote = '\0';
+					if (*s == '"' || *s == '\'')
+						quote = *s++;
+					break;
+				}
+
+				/* Attribute without a value, e.g. "disabled". */
+				if (*s == '>') {
+					on_attr(p, n, NULL, 0);
+					mode = DATA;
+					s++;
+					break;
+				}
+				if (isspace((unsigned char)*s)) {
+					/* TAG_OPEN skips the whitespace. */
+					on_attr(p, n, NULL, 0);
+					mode = TAG_OPEN;
+					break;
+				}
+
+				n++;
+				s++;
+			}
+			break;
+		case ATTR_VALUE:
+			p = s;
+			n = 0;
+			while (*s) {
+				if (ends_value(*s, quote)) {
+					on_attr(attr_name, attr_name_len, p, n);
+					if (*s == '>') {
+						mode = DATA;
+						s++;
+					} else {
+						mode = TAG_OPEN;
+						if (quote)
+							s++;	/* closing quote */
+					}
+					break;
+				}
+				n++;
+				s++;
+			}
+			break;
+		case DOCTYPE:
+			while (*s) {
+				if (*s == '>') {
+					mode = DATA;
+					s++;
+					break;
+				}
+				s++;
+			}
+			break;
+		case COMMENT:
+			n = 0;
+			while (*s) {
+				if (*s == '>' && n >= 2 && s[-1] == '-' && s[-2] == '-') {
+					mode = DATA;
+					s++;
+					break;
+				}
+				n++;
+				s++;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+}
diff --git a/parse.h b/parse.h
new file mode 100644
index 0000000..33d7cf4
--- /dev/null
+++ b/parse.h
@@ -0,0 +1,19 @@
+#ifndef PARSE_H
+#define PARSE_H
+
+#include <stddef.h>
+
+/* Scan the NUL-terminated HTML text s, reporting events to on_*(). */
+void parse(const char *s);
+
+/*
+ * Event handlers called by parse(); they must be defined by the program
+ * (dom.c does).  Each (pointer, length) pair is a slice of the text passed
+ * to parse() and is not NUL-terminated.
+ */
+void on_open(const char *tag, size_t n);
+void on_close(const char *tag, size_t n);
+void on_text(const char *text, size_t n);
+void on_attr(const char *name, size_t nname, const char *val, size_t nval);
+
+#endif /* PARSE_H */
diff --git a/test.html b/test.html
new file mode 100644
index 0000000..2642bfa
--- /dev/null
+++ b/test.html
@@ -0,0 +1,9 @@
+
+

My Journal

+
+
+

Hello World

+
+ + -- cgit v1.2.3