summaryrefslogtreecommitdiffstats
path: root/parse.c
diff options
context:
space:
mode:
authorSadeep Madurange <sadeep@asciimx.com>2026-05-17 13:25:17 +0800
committerSadeep Madurange <sadeep@asciimx.com>2026-05-23 14:09:19 +0800
commit7aea09077aad335ac32bfd9858ded60ffd4d8a5b (patch)
treecdca24a43f348afca8cbb5fbc43db95e91a1b066 /parse.c
parente1a2e4150e1b1e6ddedde72dcb9c7146f6eaf85a (diff)
downloadglacier-7aea09077aad335ac32bfd9858ded60ffd4d8a5b.tar.gz
Parse HTML.
Diffstat (limited to 'parse.c')
-rw-r--r--parse.c195
1 files changed, 195 insertions, 0 deletions
diff --git a/parse.c b/parse.c
new file mode 100644
index 0000000..a9998e3
--- /dev/null
+++ b/parse.c
@@ -0,0 +1,195 @@
+#include <ctype.h>
+#include <stddef.h>
+
+#include "parse.h"
+
+/*
+ * ADVANCE(s, n) - step the cursor s forward by up to n characters.
+ *
+ * s is pre-incremented, so the character it currently points at is skipped.
+ * As soon as the cursor lands on the string terminator the macro jumps to a
+ * label named `end`, which the enclosing function has to provide.
+ * Both arguments are evaluated more than once: pass a plain pointer variable
+ * and a side-effect-free count.
+ */
+#define ADVANCE(s, n) do { \
+	size_t done_ = 0; \
+	while (done_ < (n)) { \
+		if (*++(s) == '\0') \
+			goto end; \
+		done_++; \
+	} \
+} while (0)
+
+/* Tokenizer states of parse(). Numbering starts at 1, so 0 is not a state. */
+typedef enum {
+	DATA       = 1,	/* character data outside of any markup */
+	TAG_OPEN   = 2,	/* after '<' followed by a letter */
+	TAG_CLOSE  = 3,	/* after "</" */
+	ATTR_NAME  = 4,	/* attribute name inside an opening tag */
+	ATTR_VALUE = 5,	/* attribute value, after the '=' */
+	DOCTYPE    = 6,	/* after "<!" that does not start a comment */
+	COMMENT    = 7	/* after "<!--" */
+} parse_mode;
+
+/*
+ * Callbacks invoked by parse(). They are only declared here; no definition
+ * is visible in this file, so the program using parse() has to supply them.
+ *
+ * Every pointer/length pair describes a span inside the string given to
+ * parse(). The span is not NUL-terminated: use the length.
+ */
+
+/* An opening tag was found; tag/n is its name. */
+void on_open(const char *tag, size_t n);
+
+/* A closing tag was found; tag/n is its name. */
+void on_close(const char *tag, size_t n);
+
+/* A run of text that is not whitespace-only was found. */
+void on_text(const char *text, size_t n);
+
+/*
+ * An attribute of the most recently opened tag was found.
+ * For an attribute without a value, val is NULL and nval is 0.
+ */
+void on_attr(const char *name, size_t nname,
+    const char *val, size_t nval);
+
+/*
+ * parse - minimal, allocation-free HTML tokenizer.
+ *
+ * Walks the NUL-terminated string s once and reports what it finds through
+ * the on_text/on_open/on_attr/on_close callbacks. Every pointer handed to a
+ * callback points into s and is not NUL-terminated; the length that comes
+ * with it tells how many bytes belong to the token.
+ *
+ *   text          on_text(), unless the run consists of whitespace only
+ *   <name ...>    on_open(name), then one on_attr() per attribute;
+ *                 a trailing "/>" is accepted and ignored
+ *   </name>       on_close(name); an empty name ("</>") is ignored
+ *   <!-- ... -->  skipped
+ *   <! ... >      skipped (doctype and other declarations)
+ *
+ * Attribute values may be double-quoted, single-quoted or unquoted. A quoted
+ * value ends at the matching quote only, so it may contain '>' and the other
+ * quote character. An attribute without a value is reported with a NULL
+ * value of length 0.
+ *
+ * A construct that is cut off by the end of the input (for example
+ * "<a href=") is dropped. Everything that was complete before it, including
+ * trailing text, is still reported.
+ *
+ * s may be NULL, in which case nothing happens.
+ */
+void parse(const char *s)
+{
+	parse_mode mode = DATA;
+	const char *p;
+	const char *attr_name = NULL;
+	size_t n;
+	size_t attr_name_len = 0;
+	char quote = '\0';	/* opening quote of the current value, or 0 */
+	int blank;
+
+	if (!s)
+		return;
+
+	/*
+	 * Every inner loop tests *s itself, so the cursor never moves past
+	 * the terminator and no jump out of the state machine is needed:
+	 * pending callbacks always get their chance to run.
+	 */
+	while (*s) {
+		switch (mode) {
+		case DATA:
+			p = s;
+			n = 0;
+			blank = 1;
+			while (*s) {
+				if (*s == '<') {
+					/*
+					 * s[1] may be the terminator, but
+					 * it is always readable. The skips
+					 * below only cover bytes that were
+					 * just compared against non-NUL
+					 * characters.
+					 */
+					if (isalpha((unsigned char)s[1])) {
+						mode = TAG_OPEN;
+						s += 1;
+						break;
+					}
+					if (s[1] == '/') {
+						mode = TAG_CLOSE;
+						s += 2;
+						break;
+					}
+					if (s[1] == '!') {
+						if (s[2] == '-' && s[3] == '-') {
+							/*
+							 * Stop on the second
+							 * '-' so that "<!--->"
+							 * closes the comment.
+							 */
+							mode = COMMENT;
+							s += 3;
+						} else {
+							mode = DOCTYPE;
+							s += 2;
+						}
+						break;
+					}
+					/* any other '<' is plain text */
+				}
+
+				if (!isspace((unsigned char)*s))
+					blank = 0;
+
+				n++;
+				s++;
+			}
+
+			if (n > 0 && !blank)
+				on_text(p, n);
+
+			break;
+		case TAG_OPEN:
+			/* s is on the first letter of the tag name. */
+			p = s;
+			while (*s && *s != '>' && *s != '/' &&
+			    !isspace((unsigned char)*s))
+				s++;
+
+			if (!*s)
+				break;	/* input ended inside the name */
+
+			on_open(p, (size_t)(s - p));
+			mode = ATTR_NAME;
+			break;
+		case TAG_CLOSE:
+			/* The name ends at whitespace; the tag ends at '>'. */
+			p = s;
+			while (*s && *s != '>' && !isspace((unsigned char)*s))
+				s++;
+			n = (size_t)(s - p);
+
+			while (*s && *s != '>')
+				s++;
+
+			if (!*s)
+				break;	/* unterminated closing tag */
+
+			if (n > 0)
+				on_close(p, n);
+
+			mode = DATA;
+			s++;
+			break;
+		case ATTR_NAME:
+			/*
+			 * Between attributes: skip separators and the '/'
+			 * of a self-closing tag, then look for the end of
+			 * the tag or the next attribute.
+			 */
+			while (isspace((unsigned char)*s) || *s == '/')
+				s++;
+
+			if (!*s)
+				break;
+
+			if (*s == '>') {
+				mode = DATA;
+				s++;
+				break;
+			}
+
+			/*
+			 * The first byte always belongs to the name, which
+			 * guarantees progress even on stray punctuation.
+			 */
+			p = s++;
+			while (*s && *s != '=' && *s != '>' && *s != '/' &&
+			    !isspace((unsigned char)*s))
+				s++;
+			n = (size_t)(s - p);
+
+			/* whitespace is allowed on both sides of '=' */
+			while (isspace((unsigned char)*s))
+				s++;
+
+			if (!*s)
+				break;	/* cannot tell whether a value follows */
+
+			if (*s != '=') {
+				on_attr(p, n, NULL, 0);	/* <input disabled> */
+				break;
+			}
+
+			s++;
+			while (isspace((unsigned char)*s))
+				s++;
+
+			attr_name = p;
+			attr_name_len = n;
+			quote = (*s == '"' || *s == '\'') ? *s : '\0';
+			if (quote)
+				s++;
+
+			mode = ATTR_VALUE;
+			break;
+		case ATTR_VALUE:
+			p = s;
+			if (quote) {
+				/* only the matching quote ends the value */
+				while (*s && *s != quote)
+					s++;
+			} else {
+				while (*s && *s != '>' &&
+				    !isspace((unsigned char)*s))
+					s++;
+			}
+
+			if (!*s)
+				break;	/* unterminated value */
+
+			on_attr(attr_name, attr_name_len, p, (size_t)(s - p));
+
+			if (quote)
+				s++;	/* closing quote */
+
+			/* a '>' is left in place for ATTR_NAME to consume */
+			mode = ATTR_NAME;
+			break;
+		case DOCTYPE:
+			while (*s && *s != '>')
+				s++;
+
+			if (*s) {
+				mode = DATA;
+				s++;
+			}
+			break;
+		case COMMENT:
+			/*
+			 * The comment ends at "-->". Both dashes must lie at
+			 * or after the position where this state was
+			 * entered, hence the distance check before looking
+			 * back.
+			 */
+			p = s;
+			while (*s && !(*s == '>' && s - p >= 2 &&
+			    s[-1] == '-' && s[-2] == '-'))
+				s++;
+
+			if (*s) {
+				mode = DATA;
+				s++;
+			}
+			break;
+		default:
+			/* not reachable; avoid spinning on a bad state */
+			return;
+		}
+	}
+}