#!/usr/bin/perl
#
# indexer.pl - build the site search index.
#
# Scans every "<section>/index.html" page under the built site's log/
# directory, reduces each page to its title plus the prose inside its
# <main> element, and stores the result as a Storable file keyed by URL:
#
#     { "/log/<section>/index.html" => { t => <title>, c => <plain text> } }
#
# Provenance: recovered from commit 5d84833e25ed23e40ac5703527469101448b9b66
# by Sadeep Madurange (Thu, 1 Jan 2026 17:18:27 +0800), subject
# "Perl script for indexing site.", which created cgi-bin/indexer.pl.
#
# NOTE(review): the tag names in the patterns below (<title>, <main>, <pre>,
# <code>) were reconstructed from the script's own comments because the
# recovered listing had its markup stripped -- confirm against the
# repository copy.

use strict;
use warnings;
use Storable qw(nstore);
use HTML::Entities qw(decode_entities);

# Paths are relative to the directory the script is run from.
my $site_root      = '../_site';
my $built_site_dir = "$site_root/log";
my $output_file    = 'search_index.dat';
my %index;

print "Building search index from $built_site_dir...\n";

# glob finds every index.html in the immediate subdirectories of log/.
foreach my $path (glob("$built_site_dir/*/index.html")) {
    my $fh;
    unless (open($fh, '<:encoding(UTF-8)', $path)) {
        # Indexing is best-effort: report the page and carry on.
        warn "Skipping $path: $!\n";
        next;
    }
    my $html = do { local $/; <$fh> };
    close($fh);
    $html //= '';    # a failed read yields undef; treat it as an empty page

    # Extract the title and the main content.
    my ($title) = $html =~ m{<title\b[^>]*>(.*?)</title>}is;
    my ($main)  = $html =~ m{<main\b[^>]*>(.*?)</main>}is;
    $main //= '';

    # Remove pre and code blocks to keep the index prose-only.
    $main =~ s{<pre\b[^>]*>.*?</pre>}{ }gis;
    $main =~ s{<code\b[^>]*>.*?</code>}{ }gis;

    # Strip all remaining HTML tags.
    $main =~ s{<[^>]+>}{ }g;

    # Decode entities (e.g., &amp; -> &) for accurate searching.
    $main = decode_entities($main);

    # Normalize whitespace: squash runs of spaces/newlines, then trim.
    $main =~ s/\s+/ /g;
    $main =~ s/\A\s+|\s+\z//g;

    # Map the file path to the final web URL.
    # Example: ../_site/log/arduino/index.html -> /log/arduino/index.html
    (my $url = $path) =~ s{\A\Q$site_root\E/}{/};

    $index{$url} = {
        # The title is stored exactly as captured (entities are NOT decoded).
        # Test length rather than truth so a title of "0" is kept.
        t => (defined $title && length $title) ? $title : "Untitled",
        c => $main,
    };
}

# Save using network-order binary (nstore) for portability.
# nstore returns undef on I/O errors, so fail loudly instead of reporting
# success for an index that was never written.
nstore(\%index, $output_file)
    or die "Cannot write $output_file: $!\n";

my $count = scalar(keys %index);
my $size  = (-s $output_file) // 0;
printf("Index complete: %d files (%.2f KB)\n", $count, $size / 1024);