summary refs log tree commit diff stats
path: root/_site/cgi-bin/sa_indexer.pl
diff options
context:
space:
mode:
author Sadeep Madurange <sadeep@asciimx.com> 2026-01-03 12:58:01 +0800
committer Sadeep Madurange <sadeep@asciimx.com> 2026-01-03 12:58:01 +0800
commit 8a4da6809cf9368cd6a5dd7351181ea4256453f9 (patch)
tree 77b2e109ba979332d81799a957bbfa86d010b81b /_site/cgi-bin/sa_indexer.pl
download site-search-bm-8a4da6809cf9368cd6a5dd7351181ea4256453f9.tar.gz
Perl scripts, shell script, benchmarks. HEAD master
Diffstat (limited to '_site/cgi-bin/sa_indexer.pl')
-rw-r--r-- _site/cgi-bin/sa_indexer.pl 86
1 files changed, 86 insertions, 0 deletions
diff --git a/_site/cgi-bin/sa_indexer.pl b/_site/cgi-bin/sa_indexer.pl
new file mode 100644
index 0000000..2395dac
--- /dev/null
+++ b/_site/cgi-bin/sa_indexer.pl
@@ -0,0 +1,99 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use File::Find;
+use Storable qw(store);
+use Time::HiRes qw(gettimeofday tv_interval);
+
+# Builds a suffix-array search index over every index.html below $directory.
+# Outputs: $corpus_file (lower-cased UTF-8 bytes, one NUL-terminated record
+# per page), $sa_file (suffix offsets packed with "L*", i.e. native 32-bit
+# unsigned) and $map_file (Storable list of { start, end, path } ranges).
+my $directory   = '../log';
+my $corpus_file = 'corpus.bin';
+my $sa_file     = 'sa.bin';
+my $map_file    = 'file_map.dat';
+
+my $t0 = [gettimeofday];    # start of the timed section
+
+my $corpus = "";    # concatenated page text; offsets below index into it
+my @file_map;       # one { start, end, path } entry per indexed page
+
+print "1. Building Case-Insensitive Corpus...\n";
+find({
+    wanted => sub {
+        # With no_chdir => 0, File::Find chdirs so $_ is the bare file name.
+        return unless -f $_ && $_ eq 'index.html';
+
+        my $fh;
+        unless (open $fh, '<:encoding(UTF-8)', $_) {
+            warn "Skipping $File::Find::name: $!\n";
+            return;
+        }
+        my $content = do { local $/; <$fh> } // '';
+        close $fh;
+
+        # Index only the <main> element when present, else the whole page.
+        my ($text) = $content =~ m|<main>(.*?)</main>|is;
+        $text //= $content;
+        $text =~ s|<[^>]+>| |g;    # replace tags with a space
+        $text =~ s|\s+| |g;        # collapse whitespace runs
+
+        # Lower-case as characters, then encode to UTF-8 bytes so that the
+        # offsets in @file_map, the suffix array and the bytes written to
+        # disk all agree, even for pages containing non-ASCII text.
+        # NOTE(review): readers must treat $corpus_file as raw bytes.
+        my $record = lc($text);
+        utf8::encode($record);
+
+        my $start = length($corpus);
+        $corpus .= $record . "\0";    # NUL terminates each page's record
+        push @file_map,
+            { start => $start, end => length($corpus), path => $File::Find::name };
+    },
+    no_chdir => 0,
+}, $directory);
+
+print "2. Sorting Suffixes (Two-Pass Cache-Optimized)...\n";
+my @sa = sort {
+    # Compare a cheap 64-byte prefix first; full suffixes only break ties.
+    (substr($corpus, $a, 64) cmp substr($corpus, $b, 64))
+        ||
+    (substr($corpus, $a) cmp substr($corpus, $b))
+} 0 .. (length($corpus) - 1);
+
+print "3. Writing Index Files to Disk...\n";
+open my $cfh, '>:raw', $corpus_file or die "Cannot open $corpus_file: $!\n";
+print {$cfh} $corpus;
+close $cfh or die "Cannot write $corpus_file: $!\n";    # buffered errors surface here
+
+open my $sfh, '>:raw', $sa_file or die "Cannot open $sa_file: $!\n";
+print {$sfh} pack("L*", @sa);
+close $sfh or die "Cannot write $sa_file: $!\n";
+
+store(\@file_map, $map_file) or die "Cannot write $map_file: $!\n";
+
+my $elapsed = tv_interval($t0);
+
+# On-disk sizes in bytes (0 if a file is unexpectedly missing).
+my $c_size = (-s $corpus_file) // 0;
+my $s_size = (-s $sa_file)     // 0;
+my $m_size = (-s $map_file)    // 0;
+my $total  = $c_size + $s_size + $m_size;
+
+# --- Final Report ---
+print "\n" . "="x35 . "\n";
+print " INDEX BUILDING COMPLETE\n";
+print "="x35 . "\n";
+printf "Total Time: %.4f seconds\n", $elapsed;
+print "Files Processed: " . scalar(@file_map) . "\n";
+print "-"x35 . "\n";
+print "File Sizes (KB):\n";
+printf " %-14s %10.2f KB\n", $corpus_file, $c_size / 1024;
+printf " %-14s %10.2f KB\n", $sa_file, $s_size / 1024;
+printf " %-14s %10.2f KB\n", $map_file, $m_size / 1024;
+print "-"x35 . "\n";
+printf " TOTAL INDEX: %10.2f KB\n", $total / 1024;
+print "="x35 . "\n";
+