#!/usr/bin/perl
# NOTE(review): the shebang above is assumed -- the patch hunk starts at file
# line 3, so the first two lines of benchmark.pl are not visible here.
#
# benchmark.pl -- compare suffix-array search against linear regex search.
#
# Provenance: post-change side of commit
#   de9d82e8074c9b67a04989f9b6be62890b7c95bb
#   Author:  Sadeep Madurange
#   Date:    Wed, 6 May 2026 21:27:51 +0800
#   Subject: Run benchmark.
#
# Usage: benchmark.pl [FILE_COUNT ...]
#
# For each FILE_COUNT (default: 500, 1000, 10000) the script
#   1. empties _site/log and runs the seed script with that count,
#   2. deletes the old index files and times the indexer,
#   3. runs the suffix-array search CGI,
#   4. runs the regex search CGI,
# and appends one results table per count to result.txt.  Progress goes to
# STDOUT; the finished report is shown through $PAGER (default: more).
#
# Search time and peak RAM are whatever the CGI scripts print as
# "Total Time:" / "Peak RAM:"; only the indexing time is measured here.
use strict;
use warnings;
use File::Path qw(remove_tree);
use Time::HiRes qw(gettimeofday tv_interval);

# Enable autoflush so the "[n/4] ... " progress prefixes appear before the
# (possibly slow) step they announce has finished.
$| = 1;

my @test_counts = @ARGV ? @ARGV : (500, 1000, 10000);

# Each count is interpolated into a shell command line and formatted with
# %d, so reject anything that is not a plain non-negative integer before any
# file is touched.
for my $count (@test_counts) {
    die "Usage: $0 [file_count ...] (invalid file count '$count')\n"
        unless $count =~ /\A[0-9]+\z/;
}

my $report_file = "result.txt";

# Open report file for writing
open(my $rfh, '>', $report_file) or die "Could not open $report_file: $!";

# Configuration
my $seed_script = "./seed.sh";
my $indexer     = "./indexer.pl";
my $sa_cgi      = "./find_sa.cgi";
my $regex_cgi   = "./find_regex.cgi";

# Table layout shared by every batch (hoisted out of the loop).
my $rule       = "----------------+----------------------+---------------------\n";
my $row_format = "%-15s | %-20s | %-20s\n";

my $header = "=============================================================\n"
           . "SEARCH BENCHMARK: Suffix array vs. Linear regex\n"
           . "ARTICLE SIZE: 8 KB\n"
           . "=============================================================\n\n";

print {$rfh} $header or die "Could not write to $report_file: $!";
print $header;

foreach my $count (@test_counts) {
    my $search_query = "keyword_-1";    # Likely not in corpus

    # Progress tracking to STDOUT
    print "--> Processing batch: $count files\n";

    print " [1/4] Reseeding _site/log... ";
    # Same effect as `rm -rf _site/log/*`: glob skips dotfiles just like the
    # shell does, and an empty directory simply yields nothing to remove.
    my @stale_entries = glob('_site/log/*');
    remove_tree(@stale_entries) if @stale_entries;
    my $seeded = run_quietly("$seed_script $count");
    print $seeded ? "Done.\n" : "FAILED.\n";

    print " [2/4] Indexing (Suffix array)... ";
    unlink('sa.bin', 'corpus.bin', 'file_map.dat');
    # Only the indexer run itself is inside the timed region.
    my $idx_start = [gettimeofday];
    my $indexed   = run_quietly("perl $indexer");
    my $idx_time  = tv_interval($idx_start);
    print $indexed ? "Done.\n" : "FAILED.\n";

    # Index size in KB; stays 0 when the indexer produced no files.
    my $idx_size = 0;
    if (-f 'sa.bin' && -f 'corpus.bin') {
        $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024;
    }

    print " [3/4] Testing SA search... ";
    my ($sa_time, $sa_ram) = run_search($sa_cgi, $search_query);
    print "Done.\n";

    print " [4/4] Testing regex search... ";
    my ($reg_time, $reg_ram) = run_search($regex_cgi, $search_query);
    print "Done.\n\n";

    # Format the table for result.txt
    my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query)
        . $rule
        . sprintf($row_format, "METRIC", "SA", "REGEX")
        . $rule
        . sprintf($row_format, "Search time",
            sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time))
        . sprintf($row_format, "Peak RAM",
            sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram))
        . sprintf($row_format, "Indexing time",
            sprintf("%.4fs", $idx_time), "N/A")
        . sprintf($row_format, "Index size",
            sprintf("%.2f KB", $idx_size), "N/A")
        . $rule . "\n";

    print {$rfh} $table or die "Could not write to $report_file: $!";
}

# Buffered write errors only surface at close, so check it.
close $rfh or die "Could not close $report_file: $!";
print "All tests finished. Results written to $report_file.\n\n";

# $PAGER may carry its own options (e.g. "less -R"), so it deliberately goes
# through the shell rather than list-form system.
my $pager = $ENV{PAGER} || 'more';
system("$pager $report_file") == 0
    or warn "benchmark: pager '$pager' did not exit cleanly\n";

# run_quietly($command)
#   Runs $command through the shell with STDOUT and STDERR discarded.
#   Returns 1 when it exited with status 0, otherwise warns and returns 0.
#   A failure is reported but never fatal, so the remaining batches still run.
sub run_quietly {
    my ($command) = @_;
    my $status = system("$command > /dev/null 2>&1");
    return 1 if $status == 0;

    my $reason = $status == -1 ? "could not be started: $!"
               : $status & 127 ? sprintf("died from signal %d", $status & 127)
               :                 sprintf("exited with code %d", $status >> 8);
    warn "benchmark: '$command' $reason\n";
    return 0;
}

# run_search($cgi_script, $query)
#   Runs a search CGI under perl with QUERY_STRING set to "q=$query" and
#   returns the ($time, $ram) pair parse_metrics extracts from its output.
#   Warns when the CGI exits non-zero; the metrics then usually come back 0.
sub run_search {
    my ($cgi_script, $query) = @_;

    # The child inherits %ENV, so no shell quoting of the query is needed;
    # `local` restores the previous value on return.
    local $ENV{QUERY_STRING} = "q=$query";
    my $output = `perl $cgi_script`;
    warn "benchmark: 'perl $cgi_script' ended with wait status $?\n"
        if $? != 0;

    return parse_metrics($output);
}

# parse_metrics($text)
#   Extracts the number after "Total Time:" and the integer after
#   "Peak RAM:" from $text.
#   Returns ($time, $ram); a metric that is absent (or an undefined/empty
#   $text) is returned as 0.
sub parse_metrics {
    my $text = shift || "";
    my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0;
    my $ram  = ($text =~ /Peak RAM:\s+(\d+)/)      ? $1 : 0;
    return ($time, $ram);
}