#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use Storable qw(store);
use Time::HiRes qw(gettimeofday tv_interval);
# Configuration
# $directory is walked recursively for index.html files; the three output
# files are created relative to the current working directory.
my $directory   = '../log';
my $corpus_file = 'corpus.bin';
my $sa_file     = 'sa.bin';
my $map_file    = 'file_map.dat';

# Start timing
my $t0 = [gettimeofday];

# $corpus accumulates the normalised text of every page as UTF-8 bytes, each
# page terminated by a NUL byte. @file_map holds one hash per page with the
# byte range it occupies in $corpus (start inclusive, end exclusive and
# covering the NUL terminator) and the path reported by File::Find.
my $corpus = "";
my @file_map;

# Reduce one decoded HTML document to the searchable text that goes into the
# corpus.
#   $content - the document as a character string
# Returns the contents of the first <main>...</main> element (or the whole
# document when there is none) with tags replaced by spaces, whitespace runs
# collapsed to a single space, lowercased, and encoded to UTF-8 bytes.
sub _extract_text {
    my ($content) = @_;

    my ($text) = $content =~ m|<main>(.*?)</main>|is;
    $text //= $content;
    $text =~ s|<[^>]+>| |g;
    $text =~ s|\s+| |g;

    # Lowercase while the text is still characters, then switch to bytes so
    # that every offset taken from the corpus is a byte offset that matches
    # what ends up on disk, even for non-ASCII pages.
    $text = lc $text;
    utf8::encode($text);

    return $text;
}

print "1. Building Case-Insensitive Corpus...\n";
find({
    wanted => sub {
        # With no_chdir => 0, File::Find has changed into the directory
        # being visited, so $_ is the bare file name.
        return unless $_ eq 'index.html' && -f $_;

        my $fh;
        unless (open $fh, '<:encoding(UTF-8)', $_) {
            # Best effort: report the unreadable page and keep indexing.
            warn "Skipping $File::Find::name: $!\n";
            return;
        }
        my $content = do { local $/; <$fh> };
        close $fh;
        unless (defined $content) {
            warn "Skipping $File::Find::name: read failed\n";
            return;
        }

        my $start = length($corpus);
        $corpus .= _extract_text($content) . "\0";
        push @file_map, {
            start => $start,
            end   => length($corpus),
            path  => $File::Find::name,
        };
    },
    no_chdir => 0,
}, $directory);
print "2. Sorting Suffixes (Two-Pass Cache-Optimized)...\n";
# Build the suffix array: every offset into $corpus, ordered by the suffix
# that starts there. Each comparison first looks at a 64-character prefix of
# both suffixes and only compares the full suffixes when those prefixes tie.
my @sa = sort {
    my $by_prefix = substr($corpus, $a, 64) cmp substr($corpus, $b, 64);
    $by_prefix || (substr($corpus, $a) cmp substr($corpus, $b));
} 0 .. length($corpus) - 1;
print "3. Writing Index Files to Disk...\n";

# Corpus: written through a binmode'd handle, like the suffix array below.
# print and close are checked because buffered write errors (e.g. a full
# disk) may only surface when the handle is flushed at close.
open my $cfh, '>', $corpus_file or die "Cannot open $corpus_file for writing: $!";
binmode($cfh);
print {$cfh} $corpus or die "Cannot write $corpus_file: $!";
close $cfh or die "Cannot close $corpus_file: $!";

# Suffix array: one "L" (unsigned 32-bit, native byte order) value per offset.
# NOTE(review): offsets of 2**32 or more do not fit this format, and the file
# is only portable between machines with the same byte order.
open my $sfh, '>', $sa_file or die "Cannot open $sa_file for writing: $!";
binmode($sfh);
print {$sfh} pack("L*", @sa) or die "Cannot write $sa_file: $!";
close $sfh or die "Cannot close $sa_file: $!";

# File map: serialised with Storable; store() returns undef on I/O errors.
store \@file_map, $map_file or die "Cannot store $map_file: $!";
# End timing: seconds elapsed since $t0 was taken.
my $elapsed = tv_interval($t0);

# Calculate Sizes: on-disk size in bytes of each index file, plus their sum.
my @sizes;
my $total = 0;
for my $file ($corpus_file, $sa_file, $map_file) {
    my $bytes = -s $file;
    push @sizes, [ $file, $bytes ];
    $total += $bytes;
}

# --- Final Report ---
my $heavy_rule = '=' x 35;
my $light_rule = '-' x 35;

print "\n$heavy_rule\n";
print " INDEX BUILDING COMPLETE\n";
print "$heavy_rule\n";
printf "Total Time: %.4f seconds\n", $elapsed;
printf "Files Processed: %d\n", scalar @file_map;
print "$light_rule\n";
print "File Sizes (KB):\n";
for my $row (@sizes) {
    my ($file, $bytes) = @$row;
    printf " %-14s %10.2f KB\n", $file, $bytes / 1024;
}
print "$light_rule\n";
printf " TOTAL INDEX: %10.2f KB\n", $total / 1024;
print "$heavy_rule\n";
# End of index-builder script.