summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-xbenchmark.pl107
-rw-r--r--result.txt58
-rwxr-xr-xseed.sh58
3 files changed, 133 insertions, 90 deletions
diff --git a/benchmark.pl b/benchmark.pl
index 3d0d855..27277d7 100755
--- a/benchmark.pl
+++ b/benchmark.pl
@@ -3,16 +3,19 @@ use strict;
use warnings;
use Time::HiRes qw(gettimeofday tv_interval);
-# Enable autoflush for live status updates
$| = 1;
+# Args: size_kb [count1 count2 ...]
+my $size_kb = shift @ARGV;
+if (!$size_kb) {
+ die "Usage: $0 <size_kb> [count1 count2 ...]\n";
+}
+
my @test_counts = @ARGV ? @ARGV : (500, 1000, 10000);
my $report_file = "result.txt";
-# Open report file for writing
open(my $rfh, '>', $report_file) or die "Could not open $report_file: $!";
-# Configuration
my $seed_script = "./seed.sh";
my $indexer = "./indexer.pl";
my $sa_cgi = "./find_sa.cgi";
@@ -20,57 +23,55 @@ my $regex_cgi = "./find_regex.cgi";
my $header = "=============================================================\n"
. "SEARCH BENCHMARK: Suffix array vs. Linear regex\n"
- . "ARTICLE SIZE: 8 KB\n"
+ . "ARTICLE SIZE: $size_kb KB\n"
. "=============================================================\n\n";
print $rfh $header;
-print $header;
foreach my $count (@test_counts) {
- my $search_query = "keyword_-1"; # Likely not in corpus
-
- # Progress tracking to STDOUT
- print "--> Processing batch: $count files\n";
-
- print " [1/4] Reseeding _site/log... ";
- system("rm -rf _site/log/*");
- system("$seed_script $count > /dev/null 2>&1");
- print "Done.\n";
-
- print " [2/4] Indexing (Suffix array)... ";
- unlink('sa.bin', 'corpus.bin', 'file_map.dat');
- my $idx_start = [gettimeofday];
- system("perl $indexer > /dev/null 2>&1");
- my $idx_time = tv_interval($idx_start);
- print "Done.\n";
-
- my $idx_size = 0;
- if (-f 'sa.bin' && -f 'corpus.bin') {
- $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024;
- }
-
- print " [3/4] Testing SA search... ";
- my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`;
- my ($sa_time, $sa_ram) = parse_metrics($sa_out);
- print "Done.\n";
-
- print " [4/4] Testing regex search... ";
- my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`;
- my ($reg_time, $reg_ram) = parse_metrics($reg_out);
- print "Done.\n\n";
-
- # Format the table for result.txt
- my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query);
- $table .= "----------------+----------------------+---------------------\n";
- $table .= sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX");
- $table .= "----------------+----------------------+---------------------\n";
- $table .= sprintf("%-15s | %-20s | %-20s\n", "Search time", sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time));
- $table .= sprintf("%-15s | %-20s | %-20s\n", "Peak RAM", sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram));
- $table .= sprintf("%-15s | %-20s | %-20s\n", "Indexing time", sprintf("%.4fs", $idx_time), "N/A");
- $table .= sprintf("%-15s | %-20s | %-20s\n", "Index size", sprintf("%.2f KB", $idx_size), "N/A");
- $table .= "----------------+----------------------+---------------------\n\n";
-
- print $rfh $table;
+ my $search_query = "keyword_-1";
+
+ print "--> Processing batch: $count files\n";
+
+ print " [1/4] Reseeding _site/log... ";
+ system("rm -rf _site/log/*");
+ # Passing size first, then count to match seed.sh logic
+ system("$seed_script $size_kb $count > /dev/null 2>&1");
+ print "Done.\n";
+
+ print " [2/4] Indexing (Suffix array)... ";
+ unlink('sa.bin', 'corpus.bin', 'file_map.dat');
+ my $idx_start = [gettimeofday];
+ system("perl $indexer > /dev/null 2>&1");
+ my $idx_time = tv_interval($idx_start);
+ print "Done.\n";
+
+ my $idx_size = 0;
+ if (-f 'sa.bin' && -f 'corpus.bin') {
+ $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024;
+ }
+
+ print " [3/4] Testing SA search... ";
+ my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`;
+ my ($sa_time, $sa_ram) = parse_metrics($sa_out);
+ print "Done.\n";
+
+ print " [4/4] Testing regex search... ";
+ my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`;
+ my ($reg_time, $reg_ram) = parse_metrics($reg_out);
+ print "Done.\n\n";
+
+ my $table = sprintf("%d files (Targeting: %s):\n", $count, $search_query);
+ $table .= "----------------+----------------------+---------------------\n";
+ $table .= sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX");
+ $table .= "----------------+----------------------+---------------------\n";
+ $table .= sprintf("%-15s | %-20s | %-20s\n", "Search time", sprintf("%.4fs", $sa_time), sprintf("%.4fs", $reg_time));
+ $table .= sprintf("%-15s | %-20s | %-20s\n", "Peak RAM", sprintf("%d KB", $sa_ram), sprintf("%d KB", $reg_ram));
+ $table .= sprintf("%-15s | %-20s | %-20s\n", "Indexing time", sprintf("%.4fs", $idx_time), "N/A");
+ $table .= sprintf("%-15s | %-20s | %-20s\n", "Index size", sprintf("%.2f KB", $idx_size), "N/A");
+ $table .= "----------------+----------------------+---------------------\n\n";
+
+ print $rfh $table;
}
close $rfh;
@@ -80,8 +81,8 @@ my $pager = $ENV{PAGER} || 'more';
system("$pager $report_file");
sub parse_metrics {
- my $text = shift || "";
- my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0;
- my $ram = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0;
- return ($time, $ram);
+ my $text = shift || "";
+ my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0;
+ my $ram = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0;
+ return ($time, $ram);
}
diff --git a/result.txt b/result.txt
index b9407ce..4842d9c 100644
--- a/result.txt
+++ b/result.txt
@@ -1,35 +1,65 @@
=============================================================
SEARCH BENCHMARK: Suffix array vs. Linear regex
-ARTICLE SIZE: 8 KB
+ARTICLE SIZE: 4.1 KB
=============================================================
+100 files (Targeting: keyword_-1):
+----------------+----------------------+---------------------
+METRIC | SA | REGEX
+----------------+----------------------+---------------------
+Search time | 0.0009s | 0.0084s
+Peak RAM | 7968 KB | 9676 KB
+Indexing time | 1.3332s | N/A
+Index size | 2070.38 KB | N/A
+----------------+----------------------+---------------------
+
+200 files (Targeting: keyword_-1):
+----------------+----------------------+---------------------
+METRIC | SA | REGEX
+----------------+----------------------+---------------------
+Search time | 0.0007s | 0.0161s
+Peak RAM | 7984 KB | 9532 KB
+Indexing time | 2.8687s | N/A
+Index size | 4141.11 KB | N/A
+----------------+----------------------+---------------------
+
+300 files (Targeting: keyword_-1):
+----------------+----------------------+---------------------
+METRIC | SA | REGEX
+----------------+----------------------+---------------------
+Search time | 0.0009s | 0.0242s
+Peak RAM | 8024 KB | 9680 KB
+Indexing time | 4.5658s | N/A
+Index size | 6211.76 KB | N/A
+----------------+----------------------+---------------------
+
500 files (Targeting: keyword_-1):
----------------+----------------------+---------------------
METRIC | SA | REGEX
----------------+----------------------+---------------------
-Search time | 0.0014s | 0.0451s
-Peak RAM | 8124 KB | 9612 KB
-Indexing time | 18.1865s | N/A
-Index size | 19610.39 KB | N/A
+Search time | 0.0013s | 0.0455s
+Peak RAM | 8116 KB | 9728 KB
+Indexing time | 9.8992s | N/A
+Index size | 10353.31 KB | N/A
----------------+----------------------+---------------------
1000 files (Targeting: keyword_-1):
----------------+----------------------+---------------------
METRIC | SA | REGEX
----------------+----------------------+---------------------
-Search time | 0.0021s | 0.0918s
-Peak RAM | 8280 KB | 9960 KB
-Indexing time | 43.1748s | N/A
-Index size | 39225.06 KB | N/A
+Search time | 0.0021s | 0.0957s
+Peak RAM | 8304 KB | 9972 KB
+Indexing time | 21.2007s | N/A
+Index size | 20707.28 KB | N/A
----------------+----------------------+---------------------
-10000 files (Targeting: keyword_-1):
+5000 files (Targeting: keyword_-1):
----------------+----------------------+---------------------
METRIC | SA | REGEX
----------------+----------------------+---------------------
-Search time | 0.0173s | 1.1275s
-Peak RAM | 11848 KB | 13392 KB
-Indexing time | 663.3909s | N/A
-Index size | 392263.01 KB | N/A
+Search time | 0.0088s | 0.4937s
+Peak RAM | 9948 KB | 11436 KB
+Indexing time | 138.7510s | N/A
+Index size | 103557.18 KB | N/A
----------------+----------------------+---------------------
diff --git a/seed.sh b/seed.sh
index bdfe71e..04412cf 100755
--- a/seed.sh
+++ b/seed.sh
@@ -1,36 +1,48 @@
#!/bin/ksh
-# Accept directory count as an argument, default to 500
-TOTAL=${1:-500}
+# Usage: ./seed.sh size filecount
+# Example: ./seed.sh 4.1 500
+
+INPUT_SIZE=$1
+TOTAL=$2
+
+if [[ -z "$INPUT_SIZE" || -z "$TOTAL" ]]; then
+ echo "Usage: $0 <size_kb> <file_count>"
+ exit 1
+fi
# Define the base path relative to the script location
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
BASE_DIR="$SCRIPT_DIR/_site/log"
-CONTENT_SIZE=8000
-
# Ensure the target directory exists
mkdir -p "$BASE_DIR"
-for i in $(seq 1 $TOTAL); do
- DIR="$BASE_DIR/site_$i"
- mkdir -p "$DIR"
-
- # Start the file structure
- echo "<html><head><title>Site $i</title></head><body><main><p>" > "$DIR/index.html"
-
- # Generate random text using dd for byte-level precision
- # We read 32KB of raw data to account for characters filtered out by tr,
- # then use dd again to trim the result to exactly CONTENT_SIZE bytes.
- dd if=/dev/urandom bs=32768 count=1 2>/dev/null | tr -dc 'a-zA-Z0-9 ' | dd bs=1 count=$CONTENT_SIZE 2>/dev/null >> "$DIR/index.html"
-
- # Append keyword and close tags
- echo " Searchable content here for keyword_$i. </p></main></body></html>" >> "$DIR/index.html"
-
- # Print progress every 100 files
- if [ $((i % 100)) -eq 0 ]; then
- echo "Created $i files..."
- fi
+# Convert KB to raw bytes for dd precision
+CONTENT_SIZE=$(awk -v kb="$INPUT_SIZE" 'BEGIN { printf "%.0f", kb * 1024 }')
+
+echo "Generating $TOTAL directories in $BASE_DIR (File size: $INPUT_SIZE KB)..."
+
+i=1
+while [[ $i -le $TOTAL ]]; do
+ DIR="$BASE_DIR/site_$i"
+ mkdir -p "$DIR"
+
+ # Start the file structure
+ echo "<html><head><title>Site $i</title></head><body><main><p>" > "$DIR/index.html"
+
+ # Generate random text using dd for byte-level precision
+ dd if=/dev/urandom bs=32768 count=1 2>/dev/null | tr -dc 'a-zA-Z0-9 ' | dd bs=1 count=$CONTENT_SIZE 2>/dev/null >> "$DIR/index.html"
+
+ # Append keyword and close tags
+ echo " Searchable content here for keyword_$i. </p></main></body></html>" >> "$DIR/index.html"
+
+ # Print progress every 100 files
+ if [ $((i % 100)) -eq 0 ]; then
+ echo "Created $i files..."
+ fi
+
+ (( i += 1 ))
done
echo "Done! $TOTAL directories created in $BASE_DIR."