blob: 38a918ecc9ba1ca201751833a8c6983d4c3a6b70 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
#!/usr/bin/perl
use strict;
use warnings;
use Storable qw(nstore);
use HTML::Entities qw(decode_entities);
# --- Configuration ---
# Root of the generated site. Indexed URLs are stored relative to this
# directory, so it is the single place the build prefix is defined.
my $site_root = '../_site';
# Directory whose immediate sub-directories each hold one index.html page.
my $built_site_dir = "$site_root/log";
# Destination of the serialized index (Storable, network byte order).
# Presumably read by a search script under cgi-bin -- confirm with the consumer.
my $output_file = "$site_root/cgi-bin/search_index.dat";

# Index layout: relative URL => { t => page title, c => plain-text content }.
my %index;

print "Building search index from $built_site_dir...\n";

my @pages = glob("$built_site_dir/*/index.html");
warn "No pages matched $built_site_dir/*/index.html\n" unless @pages;

foreach my $path (@pages) {
    # Report unreadable pages instead of dropping them silently; the build
    # still continues so one bad file does not prevent indexing the rest.
    # :encoding(UTF-8) validates the input, unlike the lax :utf8 layer.
    my $fh;
    unless (open($fh, '<:encoding(UTF-8)', $path)) {
        warn "Skipping $path: $!\n";
        next;
    }
    my $html = do { local $/; <$fh> };
    close($fh);
    $html //= '';

    # Extract Title and Main Content.
    # <main> may carry attributes (class, id, ...), so allow them.
    my ($title) = $html =~ m|<title>(.*?)</title>|is;
    my ($main)  = $html =~ m|<main\b[^>]*>(.*?)</main>|is;
    $main //= '';

    # Tidy whitespace in the title (it may span several lines in the markup).
    # NOTE(review): the title is intentionally left entity-encoded, as in the
    # original; confirm whether the consumer expects raw or decoded text.
    if (defined $title) {
        $title =~ s|\s+| |g;
        $title =~ s/^\s+|\s+$//g;
    }

    # Strip HTML and clean prose.
    # Code samples are dropped entirely so that only prose is indexed. The
    # patterns are case-insensitive (matching the title/main extraction) and
    # use \b so that e.g. <preview> is not mistaken for <pre>.
    $main =~ s|<pre\b[^>]*>.*?</pre>| |gis;
    $main =~ s|<code\b[^>]*>.*?</code>| |gis;
    $main =~ s|<[^>]+>| |g;            # remaining tags
    $main = decode_entities($main);
    $main =~ s|\s+| |g;                # collapse whitespace runs
    $main =~ s/^\s+|\s+$//g;           # trim both ends

    # Normalize path into a site-relative URL.
    my $url = $path;
    $url =~ s|^\Q$site_root\E/||;      # Remove local build directory
    $url =~ s|^\.\./||;                # Remove one leading "../", if any remains
    $url =~ s|^/+||;                   # Remove leading slashes

    $index{$url} = {
        # Keep any non-empty title (including "0"); fall back only when the
        # page has no usable <title>.
        t => (defined $title && length $title) ? $title : "Untitled",
        c => $main,
    };
}

# nstore returns undef on I/O errors (and dies on serious ones); do not
# report success unless the index was actually written.
nstore(\%index, $output_file)
    or die "Could not write index to $output_file: $!\n";

my $index_bytes = (-s $output_file) // 0;
printf("Index complete: %d files (%.2f KB)\n", scalar(keys %index), $index_bytes / 1024);
|