diff options
| author | Sadeep Madurange <sadeep@asciimx.com> | 2026-01-03 12:58:01 +0800 |
|---|---|---|
| committer | Sadeep Madurange <sadeep@asciimx.com> | 2026-01-03 12:58:01 +0800 |
| commit | 8a4da6809cf9368cd6a5dd7351181ea4256453f9 (patch) | |
| tree | 77b2e109ba979332d81799a957bbfa86d010b81b /_site/cgi-bin/sa_indexer.pl | |
| download | site-search-bm-8a4da6809cf9368cd6a5dd7351181ea4256453f9.tar.gz | |
Diffstat (limited to '_site/cgi-bin/sa_indexer.pl')
| -rw-r--r-- | _site/cgi-bin/sa_indexer.pl | 86 |
1 files changed, 86 insertions, 0 deletions
#!/usr/bin/perl
#
# sa_indexer.pl - build a suffix-array search index over the site's pages.
# (Recovered from the diff of _site/cgi-bin/sa_indexer.pl, new file, blob 2395dac.)
#
# Walks $directory for files named index.html, extracts the text of each
# page, and writes three index files into the current directory:
#
#   corpus.bin    - lower-cased, tag-stripped text of every page as UTF-8
#                   bytes, each page terminated by a NUL byte
#   sa.bin        - suffix array: byte offsets into corpus.bin sorted by the
#                   suffix starting at that offset, packed as native
#                   unsigned 32-bit integers (pack "L*")
#   file_map.dat  - Storable array of { start, end, path } records mapping
#                   byte ranges of corpus.bin back to the source file
#
# All offsets are BYTE offsets, so they index directly into corpus.bin as it
# exists on disk.

use strict;
use warnings;
use File::Find;
use Storable qw(store);
use Time::HiRes qw(gettimeofday tv_interval);

# Number of leading bytes compared first when sorting suffixes; only ties on
# this prefix fall back to comparing the full suffixes.
use constant PREFIX_LEN => 64;

# Configuration
my $directory   = '../log';
my $corpus_file = 'corpus.bin';
my $sa_file     = 'sa.bin';
my $map_file    = 'file_map.dat';

# Run the indexer only when executed as a script, so the helpers below can be
# loaded without side effects.
main() unless caller;

# extract_text($html)
#
# Takes the decoded (character) contents of an HTML page and returns the
# searchable text as a UTF-8 encoded byte string: the contents of the first
# <main>...</main> element (or the whole page if there is none), with tags
# replaced by spaces, whitespace runs collapsed to one space, and
# lower-cased.
sub extract_text {
    my ($html) = @_;

    my ($text) = $html =~ m|<main>(.*?)</main>|is;
    $text //= $html;
    $text =~ s|<[^>]+>| |g;
    $text =~ s|\s+| |g;

    # Lower-case while still in character form, then convert to bytes so
    # that every offset derived from the corpus matches the file on disk.
    $text = lc $text;
    utf8::encode($text);

    return $text;
}

# build_suffix_array($corpus)
#
# Returns an array reference holding every offset 0 .. length($corpus) - 1,
# ordered by the suffix of $corpus beginning at that offset. An empty corpus
# yields an empty array.
sub build_suffix_array {
    my ($corpus) = @_;

    my @sa = sort {
        # Cheap bounded-prefix comparison first; the full-suffix comparison
        # (which copies the rest of the corpus) only runs on prefix ties.
        (substr($corpus, $a, PREFIX_LEN) cmp substr($corpus, $b, PREFIX_LEN))
            ||
        (substr($corpus, $a) cmp substr($corpus, $b))
    } 0 .. (length($corpus) - 1);

    return \@sa;
}

# write_binary_file($path, $data)
#
# Writes $data verbatim (binmode) to $path, replacing any existing file.
# Dies if the file cannot be opened, written, or closed, so a truncated
# index is never reported as a success.
sub write_binary_file {
    my ($path, $data) = @_;

    open my $fh, '>', $path or die "Cannot open $path for writing: $!";
    binmode($fh);
    print {$fh} $data or die "Cannot write to $path: $!";
    close $fh or die "Cannot close $path: $!";

    return;
}

# main()
#
# Builds the corpus, sorts the suffix array, writes the index files and
# prints a timing/size report to STDOUT.
sub main {
    # Start timing
    my $t0 = [gettimeofday];

    my $corpus = "";
    my @file_map;

    print "1. Building Case-Insensitive Corpus...\n";
    find({
        wanted => sub {
            # File::Find has chdir'ed into the directory (no_chdir => 0), so
            # $_ is the bare file name.
            return unless -f $_ && $_ eq 'index.html';

            my $fh;
            unless (open $fh, '<:encoding(UTF-8)', $_) {
                warn "Skipping $File::Find::name: $!\n";
                return;
            }
            my $content = do { local $/; <$fh> };
            close $fh;
            return unless defined $content;

            my $start = length($corpus);
            # NUL terminates each document so suffixes never compare equal
            # across a document boundary.
            $corpus .= extract_text($content) . "\0";
            push @file_map, {
                start => $start,
                end   => length($corpus),    # one past the NUL terminator
                path  => $File::Find::name,
            };
        },
        no_chdir => 0,
    }, $directory);

    print "2. Sorting Suffixes (Two-Pass Cache-Optimized)...\n";
    my $sa = build_suffix_array($corpus);

    print "3. Writing Index Files to Disk...\n";
    write_binary_file($corpus_file, $corpus);
    write_binary_file($sa_file, pack("L*", @$sa));
    store(\@file_map, $map_file) or die "Cannot store $map_file: $!";

    # End timing
    my $elapsed = tv_interval($t0);

    # Calculate Sizes (-s yields a false value for an empty file)
    my $c_size = (-s $corpus_file) // 0;
    my $s_size = (-s $sa_file)     // 0;
    my $m_size = (-s $map_file)    // 0;
    my $total  = $c_size + $s_size + $m_size;

    # --- Final Report ---
    print "\n" . "="x35 . "\n";
    print " INDEX BUILDING COMPLETE\n";
    print "="x35 . "\n";
    printf "Total Time: %.4f seconds\n", $elapsed;
    print "Files Processed: " . scalar(@file_map) . "\n";
    print "-"x35 . "\n";
    print "File Sizes (KB):\n";
    printf " %-14s %10.2f KB\n", $corpus_file, $c_size / 1024;
    printf " %-14s %10.2f KB\n", $sa_file, $s_size / 1024;
    printf " %-14s %10.2f KB\n", $map_file, $m_size / 1024;
    print "-"x35 . "\n";
    printf " TOTAL INDEX: %10.2f KB\n", $total / 1024;
    print "="x35 . "\n";

    return;
}

1;
