From f6d7c3fdbecbcb880c0c02fdffefa1f467c46b03 Mon Sep 17 00:00:00 2001 From: Sadeep Madurange Date: Thu, 7 May 2026 14:18:23 +0800 Subject: Allow specifying file size via CLI. --- benchmark.pl | 107 ++++++++++++++++++++++++++++++----------------------------- result.txt | 58 ++++++++++++++++++++++++-------- seed.sh | 58 +++++++++++++++++++------------- 3 files changed, 133 insertions(+), 90 deletions(-) diff --git a/benchmark.pl b/benchmark.pl index 3d0d855..27277d7 100755 --- a/benchmark.pl +++ b/benchmark.pl @@ -3,16 +3,19 @@ use strict; use warnings; use Time::HiRes qw(gettimeofday tv_interval); -# Enable autoflush for live status updates $| = 1; +# Args: size_kb [count1 count2 ...] +my $size_kb = shift @ARGV; +if (!$size_kb) { + die "Usage: $0 [count1 count2 ...]\n"; +} + my @test_counts = @ARGV ? @ARGV : (500, 1000, 10000); my $report_file = "result.txt"; -# Open report file for writing open(my $rfh, '>', $report_file) or die "Could not open $report_file: $!"; -# Configuration my $seed_script = "./seed.sh"; my $indexer = "./indexer.pl"; my $sa_cgi = "./find_sa.cgi"; @@ -20,57 +23,55 @@ my $regex_cgi = "./find_regex.cgi"; my $header = "=============================================================\n" . "SEARCH BENCHMARK: Suffix array vs. Linear regex\n" - . "ARTICLE SIZE: 8 KB\n" + . "ARTICLE SIZE: $size_kb KB\n" . "=============================================================\n\n"; print $rfh $header; -print $header; foreach my $count (@test_counts) { - my $search_query = "keyword_-1"; # Likely not in corpus - - # Progress tracking to STDOUT - print "--> Processing batch: $count files\n"; - - print " [1/4] Reseeding _site/log... "; - system("rm -rf _site/log/*"); - system("$seed_script $count > /dev/null 2>&1"); - print "Done.\n"; - - print " [2/4] Indexing (Suffix array)... "; - unlink('sa.bin', 'corpus.bin', 'file_map.dat'); - my $idx_start = [gettimeofday]; - system("perl $indexer > /dev/null 2>&1"); - my $idx_time = tv_interval($idx_start); - print "Done.\n"; - - my $idx_size = 0; - if (-f 'sa.bin' && -f 'corpus.bin') { - $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024; - } - - print " [3/4] Testing SA search... "; - my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`; - my ($sa_time, $sa_ram) = parse_metrics($sa_out); - print "Done.\n"; - - print " [4/4] Testing regex search... "; - my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`; - my ($reg_time, $reg_ram) = parse_metrics($reg_out); - print "Done.\n\n"; - - # Format the table for result.txt - my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query); - $table .= "----------------+----------------------+---------------------\n"; - $table .= sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX"); - $table .= "----------------+----------------------+---------------------\n"; - $table .= sprintf("%-15s | %-20s | %-20s\n", "Search time", sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time)); - $table .= sprintf("%-15s | %-20s | %-20s\n", "Peak RAM", sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram)); - $table .= sprintf("%-15s | %-20s | %-20s\n", "Indexing time", sprintf("%.4fs", $idx_time), "N/A"); - $table .= sprintf("%-15s | %-20s | %-20s\n", "Index size", sprintf("%.2f KB", $idx_size), "N/A"); - $table .= "----------------+----------------------+---------------------\n\n"; - - print $rfh $table; + my $search_query = "keyword_-1"; + + print "--> Processing batch: $count files\n"; + + print " [1/4] Reseeding _site/log... "; + system("rm -rf _site/log/*"); + # Passing size first, then count to match seed.sh logic + system("$seed_script $size_kb $count > /dev/null 2>&1"); + print "Done.\n"; + + print " [2/4] Indexing (Suffix array)... "; + unlink('sa.bin', 'corpus.bin', 'file_map.dat'); + my $idx_start = [gettimeofday]; + system("perl $indexer > /dev/null 2>&1"); + my $idx_time = tv_interval($idx_start); + print "Done.\n"; + + my $idx_size = 0; + if (-f 'sa.bin' && -f 'corpus.bin') { + $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024; + } + + print " [3/4] Testing SA search... "; + my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`; + my ($sa_time, $sa_ram) = parse_metrics($sa_out); + print "Done.\n"; + + print " [4/4] Testing regex search... "; + my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`; + my ($reg_time, $reg_ram) = parse_metrics($reg_out); + print "Done.\n\n"; + + my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query); + $table .= "----------------+----------------------+---------------------\n"; + $table .= sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX"); + $table .= "----------------+----------------------+---------------------\n"; + $table .= sprintf("%-15s | %-20s | %-20s\n", "Search time", sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time)); + $table .= sprintf("%-15s | %-20s | %-20s\n", "Peak RAM", sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram)); + $table .= sprintf("%-15s | %-20s | %-20s\n", "Indexing time", sprintf("%.4fs", $idx_time), "N/A"); + $table .= sprintf("%-15s | %-20s | %-20s\n", "Index size", sprintf("%.2f KB", $idx_size), "N/A"); + $table .= "----------------+----------------------+---------------------\n\n"; + + print $rfh $table; } close $rfh; @@ -80,8 +81,8 @@ my $pager = $ENV{PAGER} || 'more'; system("$pager $report_file"); sub parse_metrics { - my $text = shift || ""; - my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0; - my $ram = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0; - return ($time, $ram); + my $text = shift || ""; + my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0; + my $ram = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0; + return ($time, $ram); } diff --git a/result.txt b/result.txt index b9407ce..4842d9c 100644 --- a/result.txt +++ b/result.txt @@ -1,35 +1,65 @@ ============================================================= SEARCH BENCHMARK: Suffix array vs. Linear regex -ARTICLE SIZE: 8 KB +ARTICLE SIZE: 4.1 KB ============================================================= +100 files (Targeting: keyword_-1): +----------------+----------------------+--------------------- +METRIC | SA | REGEX +----------------+----------------------+--------------------- +Search time | 0.0009s | 0.0084s +Peak RAM | 7968 KB | 9676 KB +Indexing time | 1.3332s | N/A +Index size | 2070.38 KB | N/A +----------------+----------------------+--------------------- + +200 files (Targeting: keyword_-1): +----------------+----------------------+--------------------- +METRIC | SA | REGEX +----------------+----------------------+--------------------- +Search time | 0.0007s | 0.0161s +Peak RAM | 7984 KB | 9532 KB +Indexing time | 2.8687s | N/A +Index size | 4141.11 KB | N/A +----------------+----------------------+--------------------- + +300 files (Targeting: keyword_-1): +----------------+----------------------+--------------------- +METRIC | SA | REGEX +----------------+----------------------+--------------------- +Search time | 0.0009s | 0.0242s +Peak RAM | 8024 KB | 9680 KB +Indexing time | 4.5658s | N/A +Index size | 6211.76 KB | N/A +----------------+----------------------+--------------------- + 500 files (Targeting: keyword_-1): ----------------+----------------------+--------------------- METRIC | SA | REGEX ----------------+----------------------+--------------------- -Search time | 0.0014s | 0.0451s -Peak RAM | 8124 KB | 9612 KB -Indexing time | 18.1865s | N/A -Index size | 19610.39 KB | N/A +Search time | 0.0013s | 0.0455s +Peak RAM | 8116 KB | 9728 KB +Indexing time | 9.8992s | N/A +Index size | 10353.31 KB | N/A ----------------+----------------------+--------------------- 1000 files (Targeting: keyword_-1): ----------------+----------------------+--------------------- METRIC | SA | REGEX ----------------+----------------------+--------------------- -Search time | 0.0021s | 0.0918s -Peak RAM | 8280 KB | 9960 KB -Indexing time | 43.1748s | N/A -Index size | 39225.06 KB | N/A +Search time | 0.0021s | 0.0957s +Peak RAM | 8304 KB | 9972 KB +Indexing time | 21.2007s | N/A +Index size | 20707.28 KB | N/A ----------------+----------------------+--------------------- -10000 files (Targeting: keyword_-1): +5000 files (Targeting: keyword_-1): ----------------+----------------------+--------------------- METRIC | SA | REGEX ----------------+----------------------+--------------------- -Search time | 0.0173s | 1.1275s -Peak RAM | 11848 KB | 13392 KB -Indexing time | 663.3909s | N/A -Index size | 392263.01 KB | N/A +Search time | 0.0088s | 0.4937s +Peak RAM | 9948 KB | 11436 KB +Indexing time | 138.7510s | N/A +Index size | 103557.18 KB | N/A ----------------+----------------------+--------------------- diff --git a/seed.sh b/seed.sh index bdfe71e..04412cf 100755 --- a/seed.sh +++ b/seed.sh @@ -1,36 +1,48 @@ #!/bin/ksh -# Accept directory count as an argument, default to 500 -TOTAL=${1:-500} +# Usage: ./seed.sh size filecount +# Example: ./seed.sh 4.1 500 + +INPUT_SIZE=$1 +TOTAL=$2 + +if [[ -z "$INPUT_SIZE" || -z "$TOTAL" ]]; then + echo "Usage: $0 " + exit 1 +fi # Define the base path relative to the script location SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) BASE_DIR="$SCRIPT_DIR/_site/log" -CONTENT_SIZE=8000 - # Ensure the target directory exists mkdir -p "$BASE_DIR" -for i in $(seq 1 $TOTAL); do - DIR="$BASE_DIR/site_$i" - mkdir -p "$DIR" - - # Start the file structure - echo "Site $i

" > "$DIR/index.html" - - # Generate random text using dd for byte-level precision - # We read 32KB of raw data to account for characters filtered out by tr, - # then use dd again to trim the result to exactly CONTENT_SIZE bytes. - dd if=/dev/urandom bs=32768 count=1 2>/dev/null | tr -dc 'a-zA-Z0-9 ' | dd bs=1 count=$CONTENT_SIZE 2>/dev/null >> "$DIR/index.html" - - # Append keyword and close tags - echo " Searchable content here for keyword_$i.

" >> "$DIR/index.html" - - # Print progress every 100 files - if [ $((i % 100)) -eq 0 ]; then - echo "Created $i files..." - fi +# Convert KB to raw bytes for dd precision +CONTENT_SIZE=$(awk -v kb="$INPUT_SIZE" 'BEGIN { printf "%.0f", kb * 1024 }') + +echo "Generating $TOTAL directories in $BASE_DIR (File size: $INPUT_SIZE KB)..." + +i=1 +while [[ $i -le $TOTAL ]]; do + DIR="$BASE_DIR/site_$i" + mkdir -p "$DIR" + + # Start the file structure + echo "Site $i

" > "$DIR/index.html" + + # Generate random text using dd for byte-level precision + dd if=/dev/urandom bs=32768 count=1 2>/dev/null | tr -dc 'a-zA-Z0-9 ' | dd bs=1 count=$CONTENT_SIZE 2>/dev/null >> "$DIR/index.html" + + # Append keyword and close tags + echo " Searchable content here for keyword_$i.

" >> "$DIR/index.html" + + # Print progress every 100 files + if [ $((i % 100)) -eq 0 ]; then + echo "Created $i files..." + fi + + (( i += 1 )) done echo "Done! $TOTAL directories created in $BASE_DIR." -- cgit v1.2.3