#!/usr/bin/perl
use strict;
use warnings;

use Storable qw(nstore);
use HTML::Entities qw(decode_entities);

# Build a prose-only search index from the generated site.
#
# For every <dir>/index.html below $built_site_dir this script extracts the
# page <title> and the text inside <main>, strips code samples and markup,
# and stores the result in %index keyed by the page's site-relative URL:
#
#     $index{$url} = { t => <title>, c => <plain text of main content> }
#
# The finished hash is written to $output_file with Storable's nstore().

my $built_site_dir = '../_site/log';
my $output_file    = 'search_index.dat';
my %index;

print "Building search index from $built_site_dir...\n";

# glob finds every index.html in subdirectories of /log/
foreach my $path (glob("$built_site_dir/*/index.html")) {
    my $fh;
    unless (open($fh, '<:utf8', $path)) {
        # Best effort: one unreadable page must not abort the whole build,
        # but it should not vanish from the index without a trace either.
        warn "Skipping $path: $!\n";
        next;
    }
    my $html = do { local $/; <$fh> };
    close($fh);
    $html //= '';

    # Extract Title and Main Content
    my ($title) = $html =~ m|<title(?:\s[^>]*)?>(.*?)</title>|is;
    my ($main)  = $html =~ m|<main(?:\s[^>]*)?>(.*?)</main>|is;
    $main //= '';

    # Remove code and pre blocks to keep index prose-only
    $main =~ s|<pre[^>]*>.*?</pre>| |gis;
    $main =~ s|<code[^>]*>.*?</code>| |gis;

    # Strip all remaining HTML tags
    $main =~ s|<[^>]+>| |g;

    # Decode entities (e.g., &amp; -> &) for accurate searching.
    # This runs after tag stripping so that escaped markup in the prose
    # (e.g. &lt;b&gt;) is kept as text instead of being removed as a tag.
    $main = decode_entities($main);

    # Normalize whitespace (squash multiple spaces/newlines)
    $main =~ s|\s+| |g;
    $main =~ s/^\s+|\s+$//g;

    # Tidy the title's whitespace the same way; its entities are left as
    # found in the markup.
    if (defined $title) {
        $title =~ s|\s+| |g;
        $title =~ s/^\s+|\s+$//g;
    }
    # Test for "missing or empty" explicitly so a title of "0" survives.
    $title = "Untitled" unless defined $title && length $title;

    # Map file path to the final web URL
    # Example: ../_site/log/arduino/index.html -> /log/arduino/index.html
    (my $url = $path) =~ s|^\.\./_site/|/|;

    $index{$url} = {
        t => $title,
        c => $main,
    };
}

# Save using network-order binary (nstore) for portability.
# nstore() returns undef on I/O errors, so check it before reporting success.
nstore(\%index, $output_file)
    or die "Could not write $output_file: $!\n";

my $count = scalar(keys %index);
my $size  = (-s $output_file) // 0;
printf("Index complete: %d files (%.2f KB)\n", $count, $size / 1024);