summaryrefslogtreecommitdiffstats
path: root/benchmark.pl
blob: 3d0d855791e2ce88138c37c884e74cb1a79047a4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/perl
use strict;
use warnings;
use Time::HiRes qw(gettimeofday tv_interval);

# Enable autoflush so the progress messages, which are printed without a
# trailing newline, appear immediately.
$| = 1;

# Corpus sizes (number of files) to benchmark: taken from the command line,
# falling back to a default ladder when no arguments are given.
my @test_counts = @ARGV ? @ARGV : (500, 1000, 10000);

# Each count is later interpolated into a shell command line and formatted
# with %d, so reject anything that is not a plain non-negative integer.
# This runs before the report file is opened, so a typo on the command line
# does not truncate an existing report.
for my $requested (@test_counts) {
    die "Invalid file count '$requested': expected a non-negative integer\n"
        unless $requested =~ /\A[0-9]+\z/;
}

my $report_file = "result.txt";

# Open report file for writing (any previous report is overwritten)
open(my $rfh, '>', $report_file) or die "Could not open $report_file: $!";

# Configuration: the external programs driven by this benchmark
my $seed_script  = "./seed.sh";
my $indexer      = "./indexer.pl";
my $sa_cgi       = "./find_sa.cgi";
my $regex_cgi    = "./find_regex.cgi";

# Banner written both to the report file and to the terminal
my $header = "=============================================================\n"
           . "SEARCH BENCHMARK: Suffix array vs. Linear regex\n"
           . "ARTICLE SIZE: 8 KB\n"
           . "=============================================================\n\n";

print $rfh $header;
print $header;

# Benchmark each requested corpus size in turn: reseed the corpus, rebuild the
# suffix-array index, run both search CGIs, and append a results table to the
# report. A failure while reseeding or indexing aborts the run, because every
# figure gathered afterwards would describe a stale or missing corpus. A
# failing search script only produces a warning, since whatever output it did
# print is still parsed.
foreach my $count (@test_counts) {
    my $search_query = "keyword_-1"; # Likely not in corpus
    my $problem;

    # Progress tracking to STDOUT
    print "--> Processing batch: $count files\n";

    print "    [1/4] Reseeding _site/log... ";
    my $rm_status = system("rm -rf _site/log/*");
    $problem = _child_failure($rm_status, $!);
    die "FAILED.\nrm -rf _site/log/* $problem\n" if defined $problem;

    my $seed_status = system("$seed_script $count > /dev/null 2>&1");
    $problem = _child_failure($seed_status, $!);
    die "FAILED.\n$seed_script $problem\n" if defined $problem;
    print "Done.\n";

    print "    [2/4] Indexing (Suffix array)... ";
    # Remove the previous index files so the size reported below can only
    # come from this run.
    unlink('sa.bin', 'corpus.bin', 'file_map.dat');
    my $idx_start  = [gettimeofday];
    my $idx_status = system("perl $indexer > /dev/null 2>&1");
    # Stop the clock before doing any status bookkeeping.
    my $idx_time   = tv_interval($idx_start);
    $problem = _child_failure($idx_status, $!);
    die "FAILED.\n$indexer $problem\n" if defined $problem;
    print "Done.\n";

    # Combined on-disk size of the index files, in KB (0 if either is missing)
    my $idx_size = 0;
    if (-f 'sa.bin' && -f 'corpus.bin') {
        $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024;
    }

    print "    [3/4] Testing SA search... ";
    my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`;
    $problem = _child_failure($?, $!);
    warn "\nWarning: $sa_cgi $problem; missing metrics are reported as 0\n"
        if defined $problem;
    my ($sa_time, $sa_ram) = parse_metrics($sa_out);
    print "Done.\n";

    print "    [4/4] Testing regex search... ";
    my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`;
    $problem = _child_failure($?, $!);
    warn "\nWarning: $regex_cgi $problem; missing metrics are reported as 0\n"
        if defined $problem;
    my ($reg_time, $reg_ram) = parse_metrics($reg_out);
    print "Done.\n\n";

    # Format the table for result.txt
    my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query);
    $table .= "----------------+----------------------+---------------------\n";
    $table .= sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX");
    $table .= "----------------+----------------------+---------------------\n";
    $table .= sprintf("%-15s | %-20s | %-20s\n", "Search time", sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time));
    $table .= sprintf("%-15s | %-20s | %-20s\n", "Peak RAM", sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram));
    $table .= sprintf("%-15s | %-20s | %-20s\n", "Indexing time", sprintf("%.4fs", $idx_time), "N/A");
    $table .= sprintf("%-15s | %-20s | %-20s\n", "Index size", sprintf("%.2f KB", $idx_size), "N/A");
    $table .= "----------------+----------------------+---------------------\n\n";

    print $rfh $table;
}

# Describe why a child process failed.
#
# Takes the wait status (as returned by system() or left in $?) and the OS
# error text (as left in $!). Returns a short description suitable for
# appending to the command name, or nothing (undef in scalar context) when
# the child exited cleanly.
sub _child_failure {
    my ($status, $os_error) = @_;

    return if $status == 0;
    return "could not be started: $os_error" if $status == -1;
    return sprintf("was killed by signal %d", $status & 127) if $status & 127;
    return sprintf("exited with code %d", $status >> 8);
}

# Buffered write errors (for example a full disk) only surface when the handle
# is closed, so a failed close means the report may be incomplete.
close $rfh or die "Could not finish writing $report_file: $!";
print "All tests finished. Results written to $report_file.\n\n";

# Show the report through the user's pager, defaulting to 'more'. The pager is
# run as a single command string so that a PAGER value carrying options
# (e.g. "less -R") keeps working.
my $pager = $ENV{PAGER} || 'more';
system("$pager $report_file") == 0
    or warn "Could not display $report_file with '$pager'; open the file directly.\n";

# Pull the timing and memory figures out of a search script's output.
#
# Takes the captured output text (undef or empty is allowed). Returns a
# two-element list: the run of digits and dots following "Total Time:", and
# the integer following "Peak RAM:". Either value is 0 when its label is not
# found in the text.
sub parse_metrics {
    my ($output) = @_;
    $output ||= "";

    my $elapsed  = 0;
    my $peak_ram = 0;

    if ($output =~ /Total Time:\s+([\d.]+)/) {
        $elapsed = $1;
    }
    if ($output =~ /Peak RAM:\s+(\d+)/) {
        $peak_ram = $1;
    }

    return ($elapsed, $peak_ram);
}