From f6d7c3fdbecbcb880c0c02fdffefa1f467c46b03 Mon Sep 17 00:00:00 2001
From: Sadeep Madurange
Date: Thu, 7 May 2026 14:18:23 +0800
Subject: Allow specifying file size via CLI.

---
Review notes (free text between the three-dash line and the diff is not
part of the commit message):
 * The usage message now names the required size_kb argument; the
   original text listed only the optional counts.
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy of this patch, so the exact leading whitespace on each line is a
   reconstruction. TODO: confirm against the repository before applying.
 * The blob ids on the "index" line are those of the original commit.

 benchmark.pl | 107 ++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 54 insertions(+), 53 deletions(-)

(limited to 'benchmark.pl')

diff --git a/benchmark.pl b/benchmark.pl
index 3d0d855..27277d7 100755
--- a/benchmark.pl
+++ b/benchmark.pl
@@ -3,16 +3,19 @@ use strict;
 use warnings;
 use Time::HiRes qw(gettimeofday tv_interval);
 
-# Enable autoflush for live status updates
 $| = 1;
 
+# Args: size_kb [count1 count2 ...]  (size_kb is required; counts default to 500, 1000, 10000)
+my $size_kb = shift @ARGV;
+if (!$size_kb) {
+    die "Usage: $0 size_kb [count1 count2 ...]\n";
+}
+
 my @test_counts = @ARGV ? @ARGV : (500, 1000, 10000);
 my $report_file = "result.txt";
 
-# Open report file for writing
 open(my $rfh, '>', $report_file) or die "Could not open $report_file: $!";
 
-# Configuration
 my $seed_script = "./seed.sh";
 my $indexer = "./indexer.pl";
 my $sa_cgi = "./find_sa.cgi";
@@ -20,57 +23,55 @@ my $regex_cgi = "./find_regex.cgi";
 
 my $header = "=============================================================\n"
            . "SEARCH BENCHMARK: Suffix array vs. Linear regex\n"
-           . "ARTICLE SIZE: 8 KB\n"
+           . "ARTICLE SIZE: $size_kb KB\n"
            . "=============================================================\n\n";
 
 print $rfh $header;
-print $header;
 
 foreach my $count (@test_counts) {
-  my $search_query = "keyword_-1"; # Likely not in corpus
-
-  # Progress tracking to STDOUT
-  print "--> Processing batch: $count files\n";
-
-  print " [1/4] Reseeding _site/log... ";
-  system("rm -rf _site/log/*");
-  system("$seed_script $count > /dev/null 2>&1");
-  print "Done.\n";
-
-  print " [2/4] Indexing (Suffix array)... ";
-  unlink('sa.bin', 'corpus.bin', 'file_map.dat');
-  my $idx_start = [gettimeofday];
-  system("perl $indexer > /dev/null 2>&1");
-  my $idx_time = tv_interval($idx_start);
-  print "Done.\n";
-
-  my $idx_size = 0;
-  if (-f 'sa.bin' && -f 'corpus.bin') {
-    $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024;
-  }
-
-  print " [3/4] Testing SA search... ";
-  my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`;
-  my ($sa_time, $sa_ram) = parse_metrics($sa_out);
-  print "Done.\n";
-
-  print " [4/4] Testing regex search... ";
-  my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`;
-  my ($reg_time, $reg_ram) = parse_metrics($reg_out);
-  print "Done.\n\n";
-
-  # Format the table for result.txt
-  my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query);
-  $table .= "----------------+----------------------+---------------------\n";
-  $table .= sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX");
-  $table .= "----------------+----------------------+---------------------\n";
-  $table .= sprintf("%-15s | %-20s | %-20s\n", "Search time", sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time));
-  $table .= sprintf("%-15s | %-20s | %-20s\n", "Peak RAM", sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram));
-  $table .= sprintf("%-15s | %-20s | %-20s\n", "Indexing time", sprintf("%.4fs", $idx_time), "N/A");
-  $table .= sprintf("%-15s | %-20s | %-20s\n", "Index size", sprintf("%.2f KB", $idx_size), "N/A");
-  $table .= "----------------+----------------------+---------------------\n\n";
-
-  print $rfh $table;
+    my $search_query = "keyword_-1";
+
+    print "--> Processing batch: $count files\n";
+
+    print " [1/4] Reseeding _site/log... ";
+    system("rm -rf _site/log/*");
+    # Passing size first, then count to match seed.sh logic
+    system("$seed_script $size_kb $count > /dev/null 2>&1");
+    print "Done.\n";
+
+    print " [2/4] Indexing (Suffix array)... ";
+    unlink('sa.bin', 'corpus.bin', 'file_map.dat');
+    my $idx_start = [gettimeofday];
+    system("perl $indexer > /dev/null 2>&1");
+    my $idx_time = tv_interval($idx_start);
+    print "Done.\n";
+
+    my $idx_size = 0;
+    if (-f 'sa.bin' && -f 'corpus.bin') {
+        $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024;
+    }
+
+    print " [3/4] Testing SA search... ";
+    my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`;
+    my ($sa_time, $sa_ram) = parse_metrics($sa_out);
+    print "Done.\n";
+
+    print " [4/4] Testing regex search... ";
+    my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`;
+    my ($reg_time, $reg_ram) = parse_metrics($reg_out);
+    print "Done.\n\n";
+
+    my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query);
+    $table .= "----------------+----------------------+---------------------\n";
+    $table .= sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX");
+    $table .= "----------------+----------------------+---------------------\n";
+    $table .= sprintf("%-15s | %-20s | %-20s\n", "Search time", sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time));
+    $table .= sprintf("%-15s | %-20s | %-20s\n", "Peak RAM", sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram));
+    $table .= sprintf("%-15s | %-20s | %-20s\n", "Indexing time", sprintf("%.4fs", $idx_time), "N/A");
+    $table .= sprintf("%-15s | %-20s | %-20s\n", "Index size", sprintf("%.2f KB", $idx_size), "N/A");
+    $table .= "----------------+----------------------+---------------------\n\n";
+
+    print $rfh $table;
 }
 
 close $rfh;
@@ -80,8 +81,8 @@ my $pager = $ENV{PAGER} || 'more';
 system("$pager $report_file");
 
 sub parse_metrics {
-  my $text = shift || "";
-  my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0;
-  my $ram = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0;
-  return ($time, $ram);
+    my $text = shift || "";
+    my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0;
+    my $ram = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0;
+    return ($time, $ram);
 }
-- 
cgit v1.2.3