From f6d7c3fdbecbcb880c0c02fdffefa1f467c46b03 Mon Sep 17 00:00:00 2001
From: Sadeep Madurange <sadeep@asciimx.com>
Date: Thu, 7 May 2026 14:18:23 +0800
Subject: Allow specifying file size via CLI.

---
 benchmark.pl | 107 ++++++++++++++++++++++++++++++-----------------------------
 result.txt   |  58 ++++++++++++++++++++++++--------
 seed.sh      |  58 +++++++++++++++++++-------------
 3 files changed, 133 insertions(+), 90 deletions(-)

diff --git a/benchmark.pl b/benchmark.pl
index 3d0d855..27277d7 100755
--- a/benchmark.pl
+++ b/benchmark.pl
@@ -3,16 +3,19 @@ use strict;
 use warnings;
 use Time::HiRes qw(gettimeofday tv_interval);
 
-# Enable autoflush for live status updates
 $| = 1; 
 
+# Args: size_kb [count1 count2 ...]; size_kb must be a positive decimal number (it is handed to a shell).
+my $size_kb = shift @ARGV;
+if (!defined $size_kb || $size_kb !~ /\A\d*\.?\d+\z/ || $size_kb <= 0) {
+	die "Usage: $0 <size_kb> [count1 count2 ...]\n";
+}
+
 my @test_counts = @ARGV ? @ARGV : (500, 1000, 10000);
 my $report_file = "result.txt";
 
-# Open report file for writing
 open(my $rfh, '>', $report_file) or die "Could not open $report_file: $!";
 
-# Configuration
 my $seed_script  = "./seed.sh";
 my $indexer      = "./indexer.pl";
 my $sa_cgi       = "./find_sa.cgi";
@@ -20,57 +23,55 @@ my $regex_cgi    = "./find_regex.cgi";
 
 my $header = "=============================================================\n"
            . "SEARCH BENCHMARK: Suffix array vs. Linear regex\n"
-           . "ARTICLE SIZE: 8 KB\n"
+           . "ARTICLE SIZE: $size_kb KB\n"
            . "=============================================================\n\n";
 
 print $rfh $header;
-print $header;
 
 foreach my $count (@test_counts) {
-    my $search_query = "keyword_-1"; # Likely not in corpus
-
-    # Progress tracking to STDOUT
-    print "--> Processing batch: $count files\n";
-    
-    print "    [1/4] Reseeding _site/log... ";
-    system("rm -rf _site/log/*"); 
-    system("$seed_script $count > /dev/null 2>&1");
-    print "Done.\n";
-
-    print "    [2/4] Indexing (Suffix array)... ";
-    unlink('sa.bin', 'corpus.bin', 'file_map.dat');
-    my $idx_start = [gettimeofday];
-    system("perl $indexer > /dev/null 2>&1");
-    my $idx_time = tv_interval($idx_start);
-    print "Done.\n";
-    
-    my $idx_size = 0;
-    if (-f 'sa.bin' && -f 'corpus.bin') {
-        $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024; 
-    }
-
-    print "    [3/4] Testing SA search... ";
-    my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`;
-    my ($sa_time, $sa_ram) = parse_metrics($sa_out);
-    print "Done.\n";
-
-    print "    [4/4] Testing regex search... ";
-    my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`;
-    my ($reg_time, $reg_ram) = parse_metrics($reg_out);
-    print "Done.\n\n";
-
-    # Format the table for result.txt
-    my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query);
-    $table .= "----------------+----------------------+---------------------\n";
-    $table .= sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX");
-    $table .= "----------------+----------------------+---------------------\n";
-    $table .= sprintf("%-15s | %-20s | %-20s\n", "Search time", sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time));
-    $table .= sprintf("%-15s | %-20s | %-20s\n", "Peak RAM", sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram));
-    $table .= sprintf("%-15s | %-20s | %-20s\n", "Indexing time", sprintf("%.4fs", $idx_time), "N/A");
-    $table .= sprintf("%-15s | %-20s | %-20s\n", "Index size", sprintf("%.2f KB", $idx_size), "N/A");
-    $table .= "----------------+----------------------+---------------------\n\n";
-
-    print $rfh $table;
+	my $search_query = "keyword_-1";
+
+	print "--> Processing batch: $count files\n";
+	
+	print "    [1/4] Reseeding _site/log... ";
+	system("rm -rf _site/log/*"); 
+	# seed.sh takes <size_kb> <file_count>; shell-escape both (counts are raw @ARGV) and abort if seeding fails.
+	system("$seed_script \Q$size_kb\E \Q$count\E > /dev/null 2>&1") == 0 or die "seed.sh failed for $count files (status $?)\n";
+	print "Done.\n";
+
+	print "    [2/4] Indexing (Suffix array)... ";
+	unlink('sa.bin', 'corpus.bin', 'file_map.dat');
+	my $idx_start = [gettimeofday];
+	system("perl $indexer > /dev/null 2>&1");
+	my $idx_time = tv_interval($idx_start);
+	print "Done.\n";
+	
+	my $idx_size = 0;
+	if (-f 'sa.bin' && -f 'corpus.bin') {
+		$idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024; 
+	}
+
+	print "    [3/4] Testing SA search... ";
+	my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`;
+	my ($sa_time, $sa_ram) = parse_metrics($sa_out);
+	print "Done.\n";
+
+	print "    [4/4] Testing regex search... ";
+	my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`;
+	my ($reg_time, $reg_ram) = parse_metrics($reg_out);
+	print "Done.\n\n";
+
+	my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query);
+	$table .= "----------------+----------------------+---------------------\n";
+	$table .= sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX");
+	$table .= "----------------+----------------------+---------------------\n";
+	$table .= sprintf("%-15s | %-20s | %-20s\n", "Search time", sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time));
+	$table .= sprintf("%-15s | %-20s | %-20s\n", "Peak RAM", sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram));
+	$table .= sprintf("%-15s | %-20s | %-20s\n", "Indexing time", sprintf("%.4fs", $idx_time), "N/A");
+	$table .= sprintf("%-15s | %-20s | %-20s\n", "Index size", sprintf("%.2f KB", $idx_size), "N/A");
+	$table .= "----------------+----------------------+---------------------\n\n";
+
+	print $rfh $table;
 }
 
 close $rfh;
@@ -80,8 +81,8 @@ my $pager = $ENV{PAGER} || 'more';
 system("$pager $report_file");
 
 sub parse_metrics {
-    my $text = shift || "";
-    my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0;
-    my $ram  = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0;
-    return ($time, $ram);
+	my $text = shift || "";
+	my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0;
+	my $ram  = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0;
+	return ($time, $ram);
 }
diff --git a/result.txt b/result.txt
index b9407ce..4842d9c 100644
--- a/result.txt
+++ b/result.txt
@@ -1,35 +1,65 @@
 =============================================================
 SEARCH BENCHMARK: Suffix array vs. Linear regex
-ARTICLE SIZE: 8 KB
+ARTICLE SIZE: 4.1 KB
 =============================================================
 
+100 files (Targeting: keyword_-1):
+----------------+----------------------+---------------------
+METRIC          | SA                   | REGEX               
+----------------+----------------------+---------------------
+Search time     | 0.0009s              | 0.0084s             
+Peak RAM        | 7968 KB              | 9676 KB             
+Indexing time   | 1.3332s              | N/A                 
+Index size      | 2070.38 KB           | N/A                 
+----------------+----------------------+---------------------
+
+200 files (Targeting: keyword_-1):
+----------------+----------------------+---------------------
+METRIC          | SA                   | REGEX               
+----------------+----------------------+---------------------
+Search time     | 0.0007s              | 0.0161s             
+Peak RAM        | 7984 KB              | 9532 KB             
+Indexing time   | 2.8687s              | N/A                 
+Index size      | 4141.11 KB           | N/A                 
+----------------+----------------------+---------------------
+
+300 files (Targeting: keyword_-1):
+----------------+----------------------+---------------------
+METRIC          | SA                   | REGEX               
+----------------+----------------------+---------------------
+Search time     | 0.0009s              | 0.0242s             
+Peak RAM        | 8024 KB              | 9680 KB             
+Indexing time   | 4.5658s              | N/A                 
+Index size      | 6211.76 KB           | N/A                 
+----------------+----------------------+---------------------
+
 500 files (Targeting: keyword_-1):
 ----------------+----------------------+---------------------
 METRIC          | SA                   | REGEX               
 ----------------+----------------------+---------------------
-Search time     | 0.0014s              | 0.0451s             
-Peak RAM        | 8124 KB              | 9612 KB             
-Indexing time   | 18.1865s             | N/A                 
-Index size      | 19610.39 KB          | N/A                 
+Search time     | 0.0013s              | 0.0455s             
+Peak RAM        | 8116 KB              | 9728 KB             
+Indexing time   | 9.8992s              | N/A                 
+Index size      | 10353.31 KB          | N/A                 
 ----------------+----------------------+---------------------
 
 1000 files (Targeting: keyword_-1):
 ----------------+----------------------+---------------------
 METRIC          | SA                   | REGEX               
 ----------------+----------------------+---------------------
-Search time     | 0.0021s              | 0.0918s             
-Peak RAM        | 8280 KB              | 9960 KB             
-Indexing time   | 43.1748s             | N/A                 
-Index size      | 39225.06 KB          | N/A                 
+Search time     | 0.0021s              | 0.0957s             
+Peak RAM        | 8304 KB              | 9972 KB             
+Indexing time   | 21.2007s             | N/A                 
+Index size      | 20707.28 KB          | N/A                 
 ----------------+----------------------+---------------------
 
-10000 files (Targeting: keyword_-1):
+5000 files (Targeting: keyword_-1):
 ----------------+----------------------+---------------------
 METRIC          | SA                   | REGEX               
 ----------------+----------------------+---------------------
-Search time     | 0.0173s              | 1.1275s             
-Peak RAM        | 11848 KB             | 13392 KB            
-Indexing time   | 663.3909s            | N/A                 
-Index size      | 392263.01 KB         | N/A                 
+Search time     | 0.0088s              | 0.4937s             
+Peak RAM        | 9948 KB              | 11436 KB            
+Indexing time   | 138.7510s            | N/A                 
+Index size      | 103557.18 KB         | N/A                 
 ----------------+----------------------+---------------------
 
diff --git a/seed.sh b/seed.sh
index bdfe71e..04412cf 100755
--- a/seed.sh
+++ b/seed.sh
@@ -1,36 +1,48 @@
 #!/bin/ksh
 
-# Accept directory count as an argument, default to 500
-TOTAL=${1:-500}
+# Usage: ./seed.sh <size_kb> <file_count>
+# Example: ./seed.sh 4.1 500
+
+INPUT_SIZE=$1
+TOTAL=$2
+
+if [[ -z "$INPUT_SIZE" || -z "$TOTAL" ]]; then
+	echo "Usage: $0 <size_kb> <file_count>"
+	exit 1
+fi
 
 # Define the base path relative to the script location
 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 BASE_DIR="$SCRIPT_DIR/_site/log"
 
-CONTENT_SIZE=8000 
-
 # Ensure the target directory exists
 mkdir -p "$BASE_DIR"
 
-for i in $(seq 1 $TOTAL); do
-    DIR="$BASE_DIR/site_$i"
-    mkdir -p "$DIR"
-    
-    # Start the file structure
-    echo "<html><head><title>Site $i</title></head><body><main><p>" > "$DIR/index.html"
-    
-    # Generate random text using dd for byte-level precision
-    # We read 32KB of raw data to account for characters filtered out by tr,
-    # then use dd again to trim the result to exactly CONTENT_SIZE bytes.
-    dd if=/dev/urandom bs=32768 count=1 2>/dev/null | tr -dc 'a-zA-Z0-9 ' | dd bs=1 count=$CONTENT_SIZE 2>/dev/null >> "$DIR/index.html"
-    
-    # Append keyword and close tags
-    echo " Searchable content here for keyword_$i. </p></main></body></html>" >> "$DIR/index.html"
-    
-    # Print progress every 100 files
-    if [ $((i % 100)) -eq 0 ]; then
-        echo "Created $i files..."
-    fi
+# Convert KB to raw bytes for dd precision
+CONTENT_SIZE=$(awk -v kb="$INPUT_SIZE" 'BEGIN { printf "%.0f", kb * 1024 }')
+
+echo "Generating $TOTAL directories in $BASE_DIR (File size: $INPUT_SIZE KB)..."
+
+i=1
+while [[ $i -le $TOTAL ]]; do
+	DIR="$BASE_DIR/site_$i"
+	mkdir -p "$DIR"
+	
+	# Start the file structure
+	echo "<html><head><title>Site $i</title></head><body><main><p>" > "$DIR/index.html"
+	
+	# Stream filtered random bytes until dd has taken exactly CONTENT_SIZE; one fixed 32KB read yields only ~8KB after tr.
+	LC_ALL=C tr -dc 'a-zA-Z0-9 ' < /dev/urandom 2>/dev/null | dd bs=1 count=$CONTENT_SIZE 2>/dev/null >> "$DIR/index.html"
+	
+	# Append keyword and close tags
+	echo " Searchable content here for keyword_$i. </p></main></body></html>" >> "$DIR/index.html"
+	
+	# Print progress every 100 files
+	if [ $((i % 100)) -eq 0 ]; then
+		echo "Created $i files..."
+	fi
+	
+	(( i += 1 ))
 done
 
 echo "Done! $TOTAL directories created in $BASE_DIR."
-- 
cgit v1.2.3