diff options
| author | Sadeep Madurange <sadeep@asciimx.com> | 2026-01-01 17:18:27 +0800 |
|---|---|---|
| committer | Sadeep Madurange <sadeep@asciimx.com> | 2026-01-01 17:18:27 +0800 |
| commit | 5d84833e25ed23e40ac5703527469101448b9b66 (patch) | |
| tree | 09d8fd57597bdc1c86856ab55e31fe15505552e5 /cgi-bin/indexer.pl | |
| parent | 2609fa6fe825c1c56e7c2fa11c3a28e93be8ad4a (diff) | |
| download | www-5d84833e25ed23e40ac5703527469101448b9b66.tar.gz | |
Perl script for indexing site.
Diffstat (limited to 'cgi-bin/indexer.pl')
| -rw-r--r-- | cgi-bin/indexer.pl | 55 |
1 files changed, 55 insertions, 0 deletions
diff --git a/cgi-bin/indexer.pl b/cgi-bin/indexer.pl new file mode 100644 index 0000000..d0314a1 --- /dev/null +++ b/cgi-bin/indexer.pl @@ -0,0 +1,55 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Storable qw(nstore); +use HTML::Entities qw(decode_entities); + +my $built_site_dir = '../_site/log'; +my $output_file = 'search_index.dat'; +my %index; + +print "Building search index from $built_site_dir...\n"; + +# glob finds every index.html in subdirectories of /log/ +foreach my $path (glob("$built_site_dir/*/index.html")) { + next unless open(my $fh, '<:utf8', $path); + my $html = do { local $/; <$fh> }; + close($fh); + + # Extract Title and Main Content + my ($title) = $html =~ m|<title>(.*?)</title>|is; + my ($main) = $html =~ m|<main>(.*?)</main>|is; + $main //= ''; + + # Remove code and pre blocks to keep index prose-only + $main =~ s|<pre[^>]*>.*?</pre>| |gs; + $main =~ s|<code[^>]*>.*?</code>| |gs; + + # Strip all remaining HTML tags + $main =~ s|<[^>]+>| |g; + + # Decode entities (e.g., & -> &) for accurate searching + $main = decode_entities($main); + + # Normalize whitespace (squash multiple spaces/newlines) + $main =~ s|\s+| |g; + $main =~ s/^\s+|\s+$//g; + + # Map file path to the final web URL + # Example: ../_site/log/arduino/index.html -> /log/arduino/index.html + (my $url = $path) =~ s|^\.\./_site/|/|; + + $index{$url} = { + t => $title || "Untitled", + c => $main + }; +} + +# Save using network-order binary (nstore) for portability +nstore(\%index, $output_file); + +my $count = scalar(keys %index); +my $size = -s $output_file; +printf("Index complete: %d files (%.2f KB)\n", $count, $size / 1024); + |
