cgi-bin/indexer.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

#!/usr/bin/perl

use strict;
use warnings;
use Storable qw(nstore);
use HTML::Entities qw(decode_entities);

# Build a plain-text search index for the generated site.
#
# Every "<built_site_dir>/<entry>/index.html" page is reduced to its title and
# the prose found inside its <main> element, and the result is serialised with
# Storable to $output_file as:
#
#     { '/log/<entry>/index.html' => { t => <title>, c => <plain text> }, ... }
#
# Both paths below are relative to the current working directory.
my $built_site_dir = '../_site/log';
my $output_file    = 'search_index.dat';
my %index;

print "Building search index from $built_site_dir...\n";

# glob finds every index.html in the immediate subdirectories of /log/
foreach my $path (glob("$built_site_dir/*/index.html")) {
    # :encoding(UTF-8) validates the input, unlike the lax :utf8 layer.
    my $fh;
    unless (open($fh, '<:encoding(UTF-8)', $path)) {
        # Best effort: report the unreadable page and carry on with the rest.
        warn "Skipping $path: $!\n";
        next;
    }
    my $html = do { local $/; <$fh> };
    close($fh);
    $html //= '';

    # Extract title and main content. Attributes on the opening tags
    # (e.g. <main class="post">) are tolerated.
    my ($title) = $html =~ m{<title\b[^>]*>(.*?)</title\s*>}is;
    my ($main)  = $html =~ m{<main\b[^>]*>(.*?)</main\s*>}is;
    $main //= '';

    # Drop elements whose content is not prose. <pre> is handled before
    # <code> so that a nested <pre><code>...</code></pre> goes in one piece.
    for my $tag (qw(pre code script style)) {
        $main =~ s{<$tag\b[^>]*>.*?</$tag\s*>}{ }gis;
    }

    # Strip all remaining HTML tags. This must happen before entity decoding,
    # otherwise an escaped "&lt;b&gt;" in the text would be mistaken for a tag.
    $main =~ s|<[^>]+>| |g;

    # Decode entities (e.g., &amp; -> &) for accurate searching
    $main = decode_entities($main);

    # Normalize whitespace (squash multiple spaces/newlines, then trim)
    $main =~ s|\s+| |g;
    $main =~ s/^\s+|\s+$//g;

    # Give the title the same decoding and whitespace treatment as the body
    # so that searches and result listings see the text as rendered.
    $title = defined $title ? decode_entities($title) : '';
    $title =~ s|\s+| |g;
    $title =~ s/^\s+|\s+$//g;

    # Map file path to the final web URL
    # Example: ../_site/log/arduino/index.html -> /log/arduino/index.html
    (my $url = $path) =~ s|^\.\./_site/|/|;

    $index{$url} = {
        # length() rather than truthiness, so a title of "0" is kept.
        t => length($title) ? $title : "Untitled",
        c => $main
    };
}

my $count = scalar(keys %index);
warn "No pages found under $built_site_dir; writing an empty index.\n"
    unless $count;

# Save using network-order binary (nstore) for portability.
# nstore returns undef on I/O failure, so the result has to be checked.
nstore(\%index, $output_file)
    or die "Cannot write $output_file: $!\n";

my $size = (-s $output_file) // 0;
printf("Index complete: %d files (%.2f KB)\n", $count, $size / 1024);