#!/usr/bin/perl
#
# Builds a case-insensitive suffix-array index over the text of every
# index.html found below $directory. Three files are produced:
#
#   $corpus_file  lower-cased, tag-stripped text of all pages; each page is
#                 terminated by a "\0" byte. Written as UTF-8.
#   $sa_file      the suffix array: corpus offsets packed with pack "L*"
#                 (32-bit unsigned, native byte order).
#   $map_file     Storable dump of a list of { start, end, path } records
#                 giving each page's [start, end) span in the corpus.
#
# All offsets are CHARACTER positions in the decoded corpus string, so a
# consumer must decode $corpus_file as UTF-8 before applying them.
use strict;
use warnings;
use File::Find;
use Storable qw(store);
use Time::HiRes qw(gettimeofday tv_interval);

# Configuration
my $directory   = '../log';
my $corpus_file = 'corpus.bin';
my $sa_file     = 'sa.bin';
my $map_file    = 'file_map.dat';

# Start timing
my $t0 = [gettimeofday];

# Fail before touching any output: File::Find only warns about a missing
# root, which would otherwise overwrite an existing index with an empty one.
-d $directory
    or die "Source directory '$directory' does not exist or is not a directory\n";

print "1. Building Case-Insensitive Corpus...\n";
my ($corpus_ref, $file_map) = build_corpus($directory);

print "2. Sorting Suffixes (Two-Pass Cache-Optimized)...\n";
my $sa = build_suffix_array($corpus_ref);

print "3. Writing Index Files to Disk...\n";
# The corpus holds decoded characters, so it needs an explicit output
# encoding; a bare handle would emit Latin-1 or UTF-8 depending on the data.
write_file($corpus_file, ':encoding(UTF-8)', $corpus_ref);
write_file($sa_file, ':raw', \pack('L*', @$sa));
store($file_map, $map_file)
    or die "Cannot store file map in $map_file: $!\n";

# End timing
my $elapsed = tv_interval($t0);

# Calculate Sizes
my $c_size = size_of($corpus_file);
my $s_size = size_of($sa_file);
my $m_size = size_of($map_file);
my $total  = $c_size + $s_size + $m_size;

# --- Final Report ---
my $heavy_rule = "=" x 35;
my $light_rule = "-" x 35;

print "\n" . $heavy_rule . "\n";
print " INDEX BUILDING COMPLETE\n";
print $heavy_rule . "\n";
printf "Total Time: %.4f seconds\n", $elapsed;
print "Files Processed: " . scalar(@$file_map) . "\n";
print $light_rule . "\n";
print "File Sizes (KB):\n";
printf " %-14s %10.2f KB\n", $corpus_file, $c_size / 1024;
printf " %-14s %10.2f KB\n", $sa_file,     $s_size / 1024;
printf " %-14s %10.2f KB\n", $map_file,    $m_size / 1024;
print $light_rule . "\n";
printf " TOTAL INDEX: %10.2f KB\n", $total / 1024;
print $heavy_rule . "\n";

# --- Helpers ---

# Reduce one HTML page to lower-cased plain text with tags replaced by
# spaces and every whitespace run collapsed to a single space.
sub html_to_text {
    my ($html) = @_;

    # Prefer the contents of the <body> element; pages without one are
    # indexed whole.
    # NOTE(review): the tag literals here were reconstructed. The reviewed
    # copy had only bare newlines around the capture (plus a then-meaningless
    # /i flag), which captured just the second line of each page. Confirm
    # that <body> is the intended container.
    my ($text) = $html =~ m{<body\b[^>]*>(.*?)</body\s*>}is;
    $text //= $html;

    $text =~ s|<[^>]+>| |g;
    $text =~ s|\s+| |g;

    return lc $text;
}

# Walk $root and concatenate the text of every regular file named
# index.html into one corpus string, each page followed by "\0".
# Returns (\$corpus, \@file_map); every file_map entry is a hash with the
# page's start offset, end offset (one past its "\0") and its path.
sub build_corpus {
    my ($root) = @_;

    my $corpus = "";
    my @file_map;

    find({
        wanted => sub {
            # With no_chdir => 0, $_ is the bare file name and the current
            # directory is the one containing it.
            return unless $_ eq 'index.html' && -f $_;

            my $fh;
            unless (open $fh, '<:encoding(UTF-8)', $_) {
                # Keep going, but do not hide the fact that a page was missed.
                warn "Skipping $File::Find::name: $!\n";
                return;
            }
            my $content = do { local $/; <$fh> } // '';
            close $fh;

            my $start = length($corpus);
            $corpus .= html_to_text($content) . "\0";
            push @file_map, {
                start => $start,
                end   => length($corpus),
                path  => $File::Find::name,
            };
        },
        no_chdir => 0,
    }, $root);

    return (\$corpus, \@file_map);
}

# Return a reference to the suffix array of the referenced string: every
# offset 0 .. length-1, ordered by the suffix starting at that offset.
sub build_suffix_array {
    my ($text_ref) = @_;

    my @suffixes = sort {
        # First pass compares a short 64-character prefix; the full
        # suffixes are only compared when the prefixes tie.
        (substr($$text_ref, $a, 64) cmp substr($$text_ref, $b, 64))
            || (substr($$text_ref, $a) cmp substr($$text_ref, $b))
    } 0 .. length($$text_ref) - 1;

    return \@suffixes;
}

# Write the referenced string to $path through the given PerlIO layer
# (e.g. ':raw'). Dies if the file cannot be opened, written or flushed.
sub write_file {
    my ($path, $layer, $data_ref) = @_;

    open my $fh, ">$layer", $path
        or die "Cannot open $path for writing: $!\n";
    print {$fh} $$data_ref
        or die "Cannot write to $path: $!\n";
    # Buffered write errors (disk full, quota) only surface at close.
    close $fh
        or die "Cannot finish writing $path: $!\n";

    return;
}

# Size of $path in bytes, or 0 when the file cannot be stat'ed.
sub size_of {
    my ($path) = @_;
    return (-s $path) // 0;
}