summaryrefslogtreecommitdiffstats
path: root/_site/cgi-bin/sa_indexer.pl
blob: 2395dac1da26b98cb40dfa1294afa36c86d16025 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/perl

use strict;
use warnings;
use File::Find;
use Storable qw(store);
use Time::HiRes qw(gettimeofday tv_interval);

# Configuration
my $directory   = '../log';          # tree searched for index.html pages
my $corpus_file = 'corpus.bin';      # concatenated page text
my $sa_file     = 'sa.bin';          # suffix array over the corpus
my $map_file    = 'file_map.dat';    # corpus range -> source path records

# Start timing
my $t0 = [gettimeofday];

my $corpus = "";    # lower-cased text of every page, each followed by "\0"
my @file_map;       # one { start, end, path } record per indexed page

# Reduce one HTML page to the text that goes into the corpus.
#
# Takes the page content as a string and returns it lower-cased, with every
# tag replaced by a space and runs of whitespace collapsed to one space.
# When the page contains a <main> element only the contents of the first
# one are used; otherwise the whole page is used.
sub page_text {
    my ($content) = @_;

    # Accept attributes on the opening tag (<main class="...">); a bare
    # "<main>" match would miss such pages and index their navigation too.
    my ($text) = $content =~ m{<main\b[^>]*>(.*?)</main>}is;
    $text //= $content;
    $text =~ s{<[^>]+>}{ }g;
    $text =~ s{\s+}{ }g;

    return lc $text;
}

print "1. Building Case-Insensitive Corpus...\n";
find({
    wanted => sub {
        # no_chdir is off, so $_ is the bare file name and the working
        # directory is the one that contains it.  Compare the name first
        # so that only candidate entries are stat-ed.
        return unless $_ eq 'index.html' && -f $_;

        my $fh;
        unless (open $fh, '<:encoding(UTF-8)', $_) {
            # Report the page instead of dropping it from the index silently.
            warn "Skipping " . $File::Find::name . ": $!\n";
            return;
        }
        my $content = do { local $/; <$fh> };
        close $fh;

        # "start" is the offset of the page's first character in the corpus;
        # "end" is taken after the "\0" separator has been appended, so it
        # is exclusive and covers the separator.
        my $start = length($corpus);
        $corpus .= page_text($content) . "\0";
        push @file_map, {
            start => $start,
            end   => length($corpus),
            path  => $File::Find::name,
        };
    },
    no_chdir => 0,
}, $directory);

print "2. Sorting Suffixes (Two-Pass Cache-Optimized)...\n";

# Build the suffix array: every character offset of the corpus, ordered by
# the suffix that starts there.  Each comparison looks at the first 64
# characters of both suffixes and only compares the complete suffixes when
# those prefixes are equal.
my $last_offset = length($corpus) - 1;
my @sa = sort {
    my $order = substr($corpus, $a, 64) cmp substr($corpus, $b, 64);
    $order = substr($corpus, $a) cmp substr($corpus, $b) if $order == 0;
    $order;
} 0 .. $last_offset;

print "3. Writing Index Files to Disk...\n";

# The corpus is a string of decoded characters (the pages are read through
# an :encoding(UTF-8) layer), so it has to be encoded again on output.
# Without a layer Perl warns about wide characters and the bytes written are
# Latin-1 or UTF-8 depending on what the pages happen to contain.
# NOTE(review): offsets in the suffix array and the file map are character
# offsets into $corpus, so a reader must decode this file as UTF-8 before
# applying them -- confirm against the search script.
open my $cfh, '>:encoding(UTF-8)', $corpus_file
    or die "Cannot open $corpus_file for writing: $!";
print {$cfh} $corpus or die "Cannot write $corpus_file: $!";
# Buffered write errors only surface at close, so check it.
close $cfh or die "Cannot close $corpus_file: $!";

# Suffix array: one unsigned 32-bit integer per offset, in the byte order of
# the machine running this script (pack template "L").
open my $sfh, '>', $sa_file
    or die "Cannot open $sa_file for writing: $!";
binmode($sfh);
print {$sfh} pack("L*", @sa) or die "Cannot write $sa_file: $!";
close $sfh or die "Cannot close $sa_file: $!";

# Per-page corpus ranges, serialised with Storable.
store(\@file_map, $map_file)
    or die "Cannot store $map_file: $!";

# End timing
my $elapsed = tv_interval($t0);

# Calculate Sizes (bytes on disk of the three files written above)
my ($c_size, $s_size, $m_size) = ((-s $corpus_file), (-s $sa_file), (-s $map_file));
my $total = $c_size + $s_size + $m_size;

# --- Final Report ---
my $heavy_rule = ("=" x 35) . "\n";
my $light_rule = ("-" x 35) . "\n";
my $file_count = scalar(@file_map);

print "\n" . $heavy_rule;
print "      INDEX BUILDING COMPLETE\n";
print $heavy_rule;
printf("Total Time:      %.4f seconds\n", $elapsed);
print "Files Processed: $file_count\n";
print $light_rule;
print "File Sizes (KB):\n";

# One line per output file, size converted from bytes to kilobytes.
for my $row ([$corpus_file, $c_size], [$sa_file, $s_size], [$map_file, $m_size]) {
    my ($name, $bytes) = @{$row};
    printf("  %-14s %10.2f KB\n", $name, $bytes / 1024);
}

print $light_rule;
printf("  TOTAL INDEX:   %10.2f KB\n", $total / 1024);
print $heavy_rule;