#!/usr/bin/perl
#
# Build the search index for the pages under $dir.
#
# Every <folder>/index.html below $dir (except the paths listed in
# %excluded_files) is reduced to lower-cased plain text and appended to one
# corpus string, each entry terminated by a NUL byte.  A suffix array over the
# corpus bytes is then sorted and three files are written into $cgi_dir:
#
#   corpus.bin    - the raw corpus bytes (UTF-8 encoded)
#   sa.bin        - the suffix array, packed as 32-bit unsigned integers
#   file_map.dat  - Storable dump of [{ start, end, title, path }, ...]
#
# NOTE(review): the copy of this script that was reviewed had the HTML tag
# literals stripped out of its regexes.  <title>, <script> and <style> are
# unambiguous from the surrounding residue; the content container element
# (<main> below) is an assumption -- confirm against the site templates.

use strict;
use warnings;
use File::Find;
use Storable qw(store);
use Encode qw(encode_utf8);
use HTML::Entities qw(decode_entities);
use Time::HiRes qw(gettimeofday tv_interval);

my $dir         = '../_site/log';
my $cgi_dir     = '../_site/cgi-bin/';
my $corpus_file = "${cgi_dir}corpus.bin";
my $sa_file     = "${cgi_dir}sa.bin";
my $map_file    = "${cgi_dir}file_map.dat";

# Paths relative to $dir that must not be indexed.
my %excluded_files = (
    'index.html' => 1,    # /log/index.html
);

# Refuse to run against a missing source tree: File::Find would only warn,
# and the existing index files would then be overwritten with empty ones.
-d $dir or die "Source directory $dir does not exist\n";

# Start timing
my $t0 = [gettimeofday];

my $corpus = "";
my @file_map;

print "Building corpus...\n";

find({
    wanted => sub {
        # Only index index.html files.  With no_chdir off, $_ is the bare
        # file name and the current directory is the one containing it.
        return unless -f $_ && $_ eq 'index.html';

        my $full_path = $File::Find::name;

        my $rel_path = $full_path;
        $rel_path =~ s|^\Q$dir\E/?||;
        return if $excluded_files{$rel_path};

        my $fh;
        unless (open $fh, '<:encoding(UTF-8)', $_) {
            # Previously a silent skip; report it so missing pages are noticed.
            warn "Skipping $full_path: $!\n";
            return;
        }
        my $content = do { local $/; <$fh> };
        close $fh;
        return unless defined $content;

        # Page title, falling back to the containing folder name when the
        # page has no <title> or the title is blank.
        my ($title) = $content =~ m|<title>(.*?)</title>|is;
        $title //= '';
        $title =~ s/^\s+|\s+$//g;
        $title = (split('/', $full_path))[-2] if $title eq '';

        # Extract content from <main> or use whole file.
        # NOTE(review): element name reconstructed -- confirm.
        my ($text) = $content =~ m|<main>(.*?)</main>|is;
        $text //= $content;

        # Strip tags and normalize whitespace.  Script and style elements are
        # removed together with their bodies before the generic tag strip.
        $text =~ s|<script[^>]*>.*?</script>| |gs;
        $text =~ s|<style[^>]*>.*?</style>| |gs;
        $text =~ s|<[^>]+>| |g;
        $text = decode_entities($text);
        $text =~ s|\s+| |g;
        $text =~ s/^\s+|\s+$//g;

        # CRITICAL: Convert to lowercase and then to raw bytes.
        # This ensures length() and substr() work on byte offsets for seek().
        # The trailing NUL separates this entry from the next one.
        my $raw_entry = encode_utf8(lc($text) . "\0");

        my $start = length($corpus);
        $corpus .= $raw_entry;

        push @file_map, {
            start => $start,
            end   => length($corpus),
            title => $title,
            path  => $full_path,
        };
    },
    no_chdir => 0,
}, $dir);

print "Sorting suffixes...\n";

# Initialize the array of indices: one suffix per corpus byte.
my @sa = 0 .. (length($corpus) - 1);

# Use a block that forces byte-level comparison
{
    use bytes;
    @sa = sort {
        # First 64 bytes check (fast path)
        (substr($corpus, $a, 64) cmp substr($corpus, $b, 64))
            ||
        # Full string fallback (required for correctness)
        (substr($corpus, $a) cmp substr($corpus, $b))
    } @sa;
}

print "Writing index files to disk...\n";

# Write errors on buffered handles may only surface at close, so both the
# print and the close are checked.
open my $cfh, '>', $corpus_file or die "Cannot write $corpus_file: $!";
binmode($cfh);    # Raw byte mode
print {$cfh} $corpus or die "Cannot write $corpus_file: $!";
close $cfh or die "Cannot write $corpus_file: $!";

open my $sfh, '>', $sa_file or die "Cannot write $sa_file: $!";
binmode($sfh);
# Pack as 32-bit unsigned integers (standard 'L', native byte order)
print {$sfh} pack("L*", @sa) or die "Cannot write $sa_file: $!";
close $sfh or die "Cannot write $sa_file: $!";

# store() returns undef on I/O errors instead of dying.
store(\@file_map, $map_file) or die "Cannot write $map_file: $!";

my $elapsed = tv_interval($t0);
my $c_size  = -s $corpus_file;
my $s_size  = -s $sa_file;

printf "\nIndexing Complete!\n";
printf "Total Time: %.4f seconds\n", $elapsed;
printf "Corpus Size: %.2f KB\n", $c_size / 1024;
printf "Suffix Array: %.2f KB\n", $s_size / 1024;
printf "Files Processed: %d\n", scalar(@file_map);