diff options
| author | Sadeep Madurange <sadeep@asciimx.com> | 2026-05-06 19:42:33 +0800 |
|---|---|---|
| committer | Sadeep Madurange <sadeep@asciimx.com> | 2026-05-06 19:42:33 +0800 |
| commit | 819bf74c2841fabdcc481e12e13615d48a92cb7f (patch) | |
| tree | 00ba8c9105a96d88536f50f8ef96e838c04408e3 | |
| parent | eddb76ad8c6e850c7e24f97ff27a185d48b104ee (diff) | |
| download | site-search-bm-819bf74c2841fabdcc481e12e13615d48a92cb7f.tar.gz | |
Change directory structure and add benchmark runner.
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | README.txt | 15 | ||||
| -rw-r--r-- | _site/cgi-bin/find_file.cgi | 143 | ||||
| -rw-r--r-- | _site/cgi-bin/find_one_file.cgi | 184 | ||||
| -rw-r--r-- | _site/cgi-bin/indexer.pl | 34 | ||||
| -rwxr-xr-x | _site/cgi-bin/seed.sh | 28 | ||||
| -rwxr-xr-x | benchmark.pl | 86 | ||||
| -rw-r--r-- | bm_10k.txt | 34 | ||||
| -rw-r--r-- | bm_1k.txt | 34 | ||||
| -rw-r--r-- | bm_500.txt | 34 | ||||
| -rwxr-xr-x[-rw-r--r--] | find_glob.cgi (renamed from _site/cgi-bin/find_glob.cgi) | 0 | ||||
| -rwxr-xr-x | find_regex.cgi | 144 | ||||
| -rwxr-xr-x[-rw-r--r--] | find_sa.cgi (renamed from _site/cgi-bin/find_sa.cgi) | 0 | ||||
| -rwxr-xr-x[-rw-r--r--] | find_sa_mmap.cgi (renamed from _site/cgi-bin/find_sa_mmap.cgi) | 0 | ||||
| -rwxr-xr-x[-rw-r--r--] | indexer.pl (renamed from _site/cgi-bin/sa_indexer.pl) | 2 | ||||
| -rw-r--r-- | sa_stats.txt | 29 |
16 files changed, 248 insertions, 521 deletions
@@ -1,3 +1,5 @@ _site/log/ **/*.dat **/*.bin +**/*.swp +**/*.core @@ -6,3 +6,18 @@ HOW TO USE 4. In _site/cgi_bin/ execute indexer script (e.g., perl sa_indexer.pl) 5. Run search query: QUERY_STRING="q=ard" perl find_sa_mmap.cgi +Directory structure: + +. +|-- seed.sh (Shell script) +|-- benchmark.pl (The runner we just wrote) +|-- _site +| |-- cgi-bin +| | |-- indexer.pl (Creates the SA index) +| | |-- find_sa.cgi (Suffix Array search) +| | `-- find_regex.cgi (Regex search) +| `-- log/ (Created by seed.sh) + +chmod +x seed.sh benchmark.pl _site/cgi-bin/*.cgi _site/cgi-bin/*.pl + + diff --git a/_site/cgi-bin/find_file.cgi b/_site/cgi-bin/find_file.cgi deleted file mode 100644 index 2ffb808..0000000 --- a/_site/cgi-bin/find_file.cgi +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use File::Find; -use Time::HiRes qw(gettimeofday tv_interval); -use BSD::Resource; # For memory and granular CPU usage - -# 1. Start Benchmark Timer -my $start_time = [gettimeofday]; - -sub escape_html { - my $str = shift; - return "" unless defined $str; - $str =~ s/&/&/g; - $str =~ s/</</g; - $str =~ s/>/>/g; - $str =~ s/"/"/g; - $str =~ s/'/'/g; - return $str; -} - -my %params; -if ($ENV{QUERY_STRING}) { - foreach my $pair (split /&/, $ENV{QUERY_STRING}) { - my ($key, $value) = split /=/, $pair; - $value =~ tr/+/ /; - $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; - $params{$key} = $value; - } -} - -my $search_text = $params{'q'} || ''; -$search_text = substr($search_text, 0, 64); -$search_text =~ s/[^a-zA-Z0-9 ]//g; - -my $directory = '../log/'; -my @results; - -my %excluded_files = ( - 'index.html' => 1, -); - -# Track IO operations (simple count) -my $files_read = 0; - -if ($search_text =~ /\S/) { - find({ - wanted => sub { - return unless -f $_ && $_ eq 'index.html'; - my $rel_path = $File::Find::name; - $rel_path =~ s|^\Q$directory\E/?||; - return if $excluded_files{$rel_path}; - - if (open my $fh, '<', $_) { - $files_read++; # Tracking IO - my $content = do { local $/; <$fh> }; - close $fh; - - if ($content =~ /\Q$search_text\E/i) { - my ($title) = $content =~ /<title>(.*?)<\/title>/is; - $title = $title ? escape_html($title) : $rel_path; - my ($p_content) = $content =~ /<p[^>]*>(.*?)<\/p>/is; - my $snippet = $p_content || ""; - $snippet =~ s/<[^>]*>//g; - $snippet =~ s/\s+/ /g; - - $snippet = escape_html(substr($snippet, 0, 50)); - $snippet .= "..." if length($p_content || "") > 50; - - push @results, { - path => $File::Find::name, - title => $title, - snippet => $snippet - }; - } - } - }, - no_chdir => 0, - follow => 0, - }, $directory); -} - -# --- 2. Calculate Metrics --- -my $end_time = [gettimeofday]; -my $elapsed = tv_interval($start_time, $end_time); - -# CPU & RAM Usage via BSD::Resource -my $rusage = getrusage(); -my $user_cpu = $rusage->utime; # User CPU time -my $system_cpu = $rusage->stime; # System CPU time -my $max_rss = $rusage->maxrss; # Peak RAM (usually in KB on Linux) - -# --- 3. Output --- -print "Content-Type: text/html\n\n"; - -my $list; -if ($search_text eq '') { - $list = "<p>Please enter a search term above.</p>"; -} elsif (@results == 0) { - $list = "<p>No results found for \"<b>$search_text</b>\".</p>"; -} else { - $list = "<ul>"; - foreach my $res (@results) { - my $url = $res->{path}; - $list .= "<li><a href=\"/$url\">$res->{title}</a><br><small>$res->{snippet}</small></li>"; - } - $list .= "</ul>"; -} - -my $safe_search_text = escape_html($search_text); - -print <<"HTML"; -<!DOCTYPE html> -<html lang="en-us"> -<head> - <meta charset="utf-8"> - <title>Search Results</title> - <style> - .stats { background: #f4f4f4; padding: 10px; border-radius: 5px; font-family: monospace; font-size: 0.85em; margin-top: 20px; border: 1px solid #ddd; } - </style> -</head> -<body> - <div class="container"> - <h2>Search</h2> - <form action="" method="GET"> - <input type="text" name="q" value="$safe_search_text"> - <input type="submit" value="Search"> - </form> - $list - - <div class="stats"> - <strong>Performance Metrics:</strong><br> - Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds<br> - User CPU: $user_cpu s<br> - System CPU: $system_cpu s<br> - Peak RAM: $max_rss KB<br> - Files Read: $files_read (IO Activity) - </div> - </div> -</body> -</html> -HTML diff --git a/_site/cgi-bin/find_one_file.cgi b/_site/cgi-bin/find_one_file.cgi deleted file mode 100644 index a28f8c4..0000000 --- a/_site/cgi-bin/find_one_file.cgi +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Storable qw(retrieve); -use Encode qw(decode_utf8); -use HTML::Escape qw(escape_html); -use Time::HiRes qw(gettimeofday tv_interval); -use BSD::Resource; - -# 1. Start Benchmark Timer -my $start_time = [gettimeofday]; -my $files_read = 0; # Track IO Activity - -# Configuration -my $max_parallel = 100; -my $lock_timeout = 30; -my $max_results = 1000; -my $min_query_len = 3; -my $index_file = 'search_index.dat'; -my $lock_dir = '/tmp/search_locks'; - -# Concurrency control -mkdir $lock_dir, 0777 unless -d $lock_dir; -my $active_count = 0; -my $now = time(); - -opendir(my $dh, $lock_dir); -while (my $file = readdir($dh)) { - next unless $file =~ /\.lock$/; - my $path = "$lock_dir/$file"; - my $mtime = (stat($path))[9] || 0; - ( $now - $mtime > $lock_timeout ) ? unlink($path) : $active_count++; -} -closedir($dh); - -# Too many search requests -if ($active_count >= $max_parallel) { - print "Content-Type: text/html\n\n"; - render_html("<p>Server busy. Please try again in a few seconds.</p>", "", (localtime)[5]+1900); - exit; -} - -my $lock_file = "$lock_dir/$$.lock"; -open(my $fh_lock, '>', $lock_file); -$files_read++; # IO for lock creation - -my $search_text = ''; -if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { - $search_text = decode_utf8($1 // ""); - $search_text =~ s/\P{Print}//g; - $search_text = substr($search_text, 0, 64); - $search_text =~ s/^\s+|\s+$//g; -} - -my $safe_search_text = escape_html($search_text); -my $year = (localtime)[5] + 1900; - -print "Content-Type: text/html\n\n"; - -if ($search_text eq '') { - final_output("<p>Please enter a search term above.</p>"); -} - -if (length($search_text) < $min_query_len) { - final_output("<p>Search term is too short. Please enter at least $min_query_len characters.</p>"); -} - -if (!-f $index_file) { - final_output("<p>Search temporarily unavailable.</p>"); -} - -# IO for index retrieval -my $index = retrieve($index_file); -$files_read++; - -my @results; -my $found = 0; - -foreach my $url (sort keys %$index) { - last if $found >= $max_results; - my $data = $index->{$url}; - - next unless $data->{c} =~ /(.{0,40})(\Q$search_text\E)(.{0,40})/is; - my ($before, $actual, $after) = ($1, $2, $3); - $found++; - - $after =~ s/\s\S*$// if length($after) > 25; - $before =~ s/^.*?\s// if length($before) > 25; - - $before = ($before =~ /\S/) ? ucfirst($before) : ""; - $actual = ($before eq "") ? ucfirst($actual) : $actual; - - my $snippet = escape_html($before) . "<b>" . escape_html($actual) . "</b>" . escape_html($after) . "..."; - - push @results, { - path => $url, - title => escape_html($data->{t}), - snippet => $snippet - }; -} - -my $list_html = ""; -if (@results == 0) { - $list_html = "<p>No results found for \"<b>$safe_search_text</b>\".</p>"; -} else { - $list_html = "<ul>" . join('', map { - "<li><a href=\"/$_->{path}\">$_->{title}</a><br><small>$_->{snippet}</small></li>" - } @results) . "</ul>"; -} - -final_output($list_html); - -sub final_output { - my ($content) = @_; - - # 2. Calculate Metrics just before rendering - my $elapsed = tv_interval($start_time, [gettimeofday]); - my $rusage = getrusage(); - my $user_cpu = $rusage->utime; - my $system_cpu = $rusage->stime; - my $max_rss = $rusage->maxrss; - - my $bench_html = <<"BENCH"; -<div style="background: #f4f4f4; padding: 10px; border-radius: 5px; font-family: monospace; font-size: 0.85em; margin-top: 20px; border: 1px solid #ddd;"> - <strong>Performance Metrics:</strong><br> - Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds<br> - User CPU: $user_cpu s<br> - System CPU: $system_cpu s<br> - Peak RAM: $max_rss KB<br> - Files Read: $files_read (IO Activity) -</div> -BENCH - - render_html($content . $bench_html, $safe_search_text, $year); - close($fh_lock) if $fh_lock; - unlink($lock_file) if -f $lock_file; - exit; -} - -sub render_html { - my ($content, $q_val, $yr) = @_; - print <<"HTML"; -<!DOCTYPE html> -<html lang="en-us"> -<head> - <meta charset="utf-8"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>Search</title> - <link rel="stylesheet" href="/assets/css/main.css"> - <link rel="stylesheet" href="/assets/css/skeleton.css"> -</head> -<body> - <div id="nav-container" class="container"> - <ul id="navlist" class="left"> - <li><a href="/" class="link-decor-none">hme</a></li> - <li><a href="/log/" class="link-decor-none">log</a></li> - <li><a href="/projects/" class="link-decor-none">poc</a></li> - <li><a href="/about/" class="link-decor-none">abt</a></li> - <li class="active"><a href="/cgi-bin/find.cgi" class="link-decor-none">sws</a></li> - <li><a href="/feed.xml" class="link-decor-none">rss</a></li> - </ul> - </div> - <main class="container" id="main"> - <div class="container"> - <h2>Search</h2> - <form action="" method="GET"> - <input id="search-box" type="text" name="q" value="$q_val"> - <input id="search-btn" type="submit" value="Search"> - </form> - $content - </div> - </main> - <div class="footer"> - <div class="container"> - <div class="twelve columns right container-2"> - <p id="footer-text">© ASCIIMX - $yr</p> - </div> - </div> - </div> -</body> -</html> -HTML -} diff --git a/_site/cgi-bin/indexer.pl b/_site/cgi-bin/indexer.pl deleted file mode 100644 index 0dcd7e2..0000000 --- a/_site/cgi-bin/indexer.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Storable qw(nstore); -use HTML::Entities qw(decode_entities); - -# --- Configuration --- -my $built_site_dir = '../log'; -my $output_file = 'search_index.dat'; -my %index; - -print "Building search index from $built_site_dir...\n"; - -foreach my $path (glob("$built_site_dir/*/index.html")) { - next unless open(my $fh, '<:utf8', $path); - my $html = do { local $/; <$fh> }; - close($fh); - - # Extract Title and Main Content - my ($title) = $html =~ m|<title>(.*?)</title>|is || "Unknown"; - my ($main) = $html; - - # Normalize path - my $url = $path; - - $index{$url} = { - t => $title || "Untitled", - c => $main - }; -} - -nstore(\%index, $output_file); -printf("Index complete: %d files (%.2f KB)\n", scalar(keys %index), (-s $output_file) / 1024); diff --git a/_site/cgi-bin/seed.sh b/_site/cgi-bin/seed.sh deleted file mode 100755 index 5ae14df..0000000 --- a/_site/cgi-bin/seed.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/ksh - -# Set the number of files/dirs -TOTAL=500 -# Approximate size in blocks -BLOCK_SIZE=16 -COUNT=1 - -for i in $(seq 1 $TOTAL); do - # Create a unique directory name - DIR="site_$i" - mkdir -p "$DIR" - - # 1. Generate random valid ASCII (valid UTF-8) text - # We read more from urandom than needed because tr will filter some out - dd if=/dev/urandom bs=1024 count=$BLOCK_SIZE 2>/dev/null | tr -dc 'a-zA-Z0-9 \n' > "$DIR/index.html" - - # 2. Append the necessary HTML structure so your Perl regexes work - # This adds the <title> and <main> tags your script looks for - echo "<html><head><title>Site $i</title></head><body><main><p>Searchable content here for keyword_$i. Lorem ipsum text follows.</p></main></body></html>" >> "$DIR/index.html" - - # Optional: print progress every 100 files - if [ $((i % 100)) -eq 0 ]; then - echo "Created $i files..." - fi -done - -echo "Done! 10000 directories created with valid text." diff --git a/benchmark.pl b/benchmark.pl new file mode 100755 index 0000000..8c1b4ea --- /dev/null +++ b/benchmark.pl @@ -0,0 +1,86 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Time::HiRes qw(gettimeofday tv_interval); + +# 1. Accept directory counts from @ARGV, or use defaults +my @test_counts = @ARGV ? @ARGV : (500, 1000, 10000); + +# Configuration - All scripts are now in the root +my $seed_script = "./seed.sh"; +my $indexer = "./indexer.pl"; +my $sa_cgi = "./find_sa.cgi"; +my $regex_cgi = "./find_regex.cgi"; + +print "=============================================================\n"; +print "SEARCH BENCHMARK: Suffix array vs. Linear regex\n"; +print "ARTICLE SIZE: 16 KB\n"; +print "=============================================================\n\n"; + +foreach my $count (@test_counts) { + my $search_query = "keyword_$count"; + + print "$count files (Targeting: $search_query):\n"; + print "-------------------------------------------------------------\n"; + print sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX"); + print "----------------+----------------------+---------------------\n"; + + # 1. Seed + system("$seed_script $count > /dev/null 2>&1"); + + # 2. Cleanup old index files + unlink('sa.bin', 'corpus.bin', 'file_map.dat'); + + # 3. Indexing + my $idx_start = [gettimeofday]; + system("perl $indexer > /dev/null 2>&1"); + my $idx_time = tv_interval($idx_start); + + my $idx_size = 0; + if (-f 'sa.bin' && -f 'corpus.bin') { + $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024; + } + + # 4. SA Search + my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`; + my ($sa_time, $sa_ram) = parse_metrics($sa_out); + + # 5. Regex Search + my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`; + my ($reg_time, $reg_ram) = parse_metrics($reg_out); + + # 6. Final Output Table + print sprintf("%-15s | %-20s | %-20s\n", + "Search time", + sprintf("%.4fs", $sa_time), + sprintf("%.4fs", $reg_time) + ); + + print sprintf("%-15s | %-20s | %-20s\n", + "Peak RAM", + sprintf("%d KB", $sa_ram), + sprintf("%d KB", $reg_ram) + ); + + print sprintf("%-15s | %-20s | %-20s\n", + "Indexing time", + sprintf("%.4fs", $idx_time), + "N/A" + ); + + print sprintf("%-15s | %-20s | %-20s\n", + "Index size", + sprintf("%.2f KB", $idx_size), + "N/A" + ); + + print "----------------+----------------------+---------------------\n\n"; +} + +sub parse_metrics { + my $text = shift || ""; + my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0; + my $ram = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0; + return ($time, $ram); +} + diff --git a/bm_10k.txt b/bm_10k.txt deleted file mode 100644 index fbb0932..0000000 --- a/bm_10k.txt +++ /dev/null @@ -1,34 +0,0 @@ -Benchmarks with 10000 16KB files - -Crawl directory at query time using File::Find - -Total Time: 0.9120 s -User CPU: 0.4 s -System CPU: 0.54 s -Peak RAM: 12804 KB -Files Read: 10000 (IO Activity) - -Glob files at query time - -Total Time: 0.9786 s -User CPU: 0.7 s -System CPU: 0.3 s -Peak RAM: 10216 KB -Files Read: 10000 (IO Activity) - -Single file (10000 files (41991.79 KB)) - -Total Time: 15.0889 s -User CPU: 15.06 s -System CPU: 0.06 s -Peak RAM: 101988 KB -Files Read: 2 (IO Activity) - -SA index - -Total Time: 0.0161 seconds<br> -User CPU: 0.03 s<br> -System CPU: 0.03 s<br> -Peak RAM: 12504 KB<br> -Files Read: 3 (IO Activity) - diff --git a/bm_1k.txt b/bm_1k.txt deleted file mode 100644 index f443e2e..0000000 --- a/bm_1k.txt +++ /dev/null @@ -1,34 +0,0 @@ -Benchmarks with 1000 16KB files - -Crawl directory at query time using File::Find - -Total Time: 0.0795 seconds<br> -User CPU: 0.05 s<br> -System CPU: 0.05 s<br> -Peak RAM: 9460 KB<br> -Files Read: 1000 (IO Activity) - -Glob files at query time - -Total Time: 0.0740 seconds<br> -User CPU: 0.09 s<br> -System CPU: 0.02 s<br> -Peak RAM: 8952 KB<br> -Files Read: 1000 (IO Activity) - -Single file (1000 files (4196.95 KB)) - -Total Time: 1.4814 seconds<br> -User CPU: 1.49 s<br> -System CPU: 0.02 s<br> -Peak RAM: 20264 KB<br> -Files Read: 2 (IO Activity) - -SA index - -Total Time: 0.0019 seconds<br> -User CPU: 0.02 s<br> -System CPU: 0.01 s<br> -Peak RAM: 8980 KB<br> -Files Read: 3 (IO Activity) - diff --git a/bm_500.txt b/bm_500.txt deleted file mode 100644 index f7b82e9..0000000 --- a/bm_500.txt +++ /dev/null @@ -1,34 +0,0 @@ -Benchmarks with 500 16KB files - -Crawl directory at query time using File::Find - -Total Time: 0.0407 seconds<br> -User CPU: 0.06 s<br> -System CPU: 0.02 s<br> -Peak RAM: 9136 KB<br> -Files Read: 500 (IO Activity) - -Glob files at query time - -Total Time: 0.0373 seconds<br> -User CPU: 0.06 s<br> -System CPU: 0.01 s<br> -Peak RAM: 8932 KB<br> -Files Read: 500 (IO Activity) - -Single file (500 files (2099.46 KB)) - -Total Time: 0.7322 seconds<br> -User CPU: 0.73 s<br> -System CPU: 0.02 s<br> -Peak RAM: 14564 KB<br> -Files Read: 2 (IO Activity) - -SA index - -Total Time: 0.0012 seconds<br> -User CPU: 0.01 s<br> -System CPU: 0.02 s<br> -Peak RAM: 8828 KB<br> -Files Read: 3 (IO Activity) - diff --git a/_site/cgi-bin/find_glob.cgi b/find_glob.cgi index db03bd5..db03bd5 100644..100755 --- a/_site/cgi-bin/find_glob.cgi +++ b/find_glob.cgi diff --git a/find_regex.cgi b/find_regex.cgi new file mode 100755 index 0000000..d826c12 --- /dev/null +++ b/find_regex.cgi @@ -0,0 +1,144 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use File::Find; +use Time::HiRes qw(gettimeofday tv_interval); +use BSD::Resource; +use Encode qw(decode_utf8); + +# 1. Start Benchmark Timer +my $start_time = [gettimeofday]; + +# Helper to keep HTML output safe +sub escape_html { + my $str = shift; + return "" unless defined $str; + $str =~ s/&/&/g; + $str =~ s/</</g; + $str =~ s/>/>/g; + $str =~ s/"/"/g; + $str =~ s/'/'/g; + return $str; +} + +# Parse Query String (q=keyword) +my %params; +if ($ENV{QUERY_STRING}) { + foreach my $pair (split /&/, $ENV{QUERY_STRING}) { + my ($key, $value) = split /=/, $pair; + $value //= ''; + $value =~ tr/+/ /; + $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; + $params{$key} = decode_utf8($value); + } +} + +my $search_text = $params{'q'} || ''; +$search_text = substr($search_text, 0, 64); +$search_text =~ s/[^a-zA-Z0-9 ]//g; + +# Configuration +my $directory = '_site/log/'; +my @results; +my $files_read = 0; + +# 2. The Linear Search (Crawl) +if ($search_text =~ /\S/) { + find({ + wanted => sub { + # Only look at index.html files inside the subdirectories + return unless -f $_ && $_ eq 'index.html'; + + if (open my $fh, '<', $_) { + $files_read++; + # Slurp the entire file (approx 16KB per your seed script) + my $content = do { local $/; <$fh> }; + close $fh; + + # Regex match (Case Insensitive) + if ($content =~ /\Q$search_text\E/i) { + my ($title) = $content =~ /<title>(.*?)<\/title>/is; + my ($p_content) = $content =~ /<p[^>]*>(.*?)<\/p>/is; + + # Clean up snippet + my $snippet = $p_content || ""; + $snippet =~ s/<[^>]*>//g; # Strip internal tags + $snippet =~ s/\s+/ /g; + $snippet = substr($snippet, 0, 100); + + push @results, { + path => $File::Find::name, + title => $title || $File::Find::name, + snippet => $snippet . "..." + }; + } + } + # Stop collecting after 20 results for display, + # but the benchmark usually looks for unique keywords + # where only 1 result exists. + }, + no_chdir => 0, + }, $directory); +} + +# 3. Calculate Performance Metrics +my $end_time = [gettimeofday]; +my $elapsed = tv_interval($start_time, $end_time); + +my $rusage = getrusage(); +my $user_cpu = $rusage->utime; +my $system_cpu = $rusage->stime; +my $max_rss = $rusage->maxrss; + +# 4. Generate Output +print "Content-Type: text/html\n\n"; + +my $list_html = ""; +if ($search_text eq '') { + $list_html = "<p>Please enter a search term.</p>"; +} elsif (@results == 0) { + $list_html = "<p>No results found for \"<b>" . escape_html($search_text) . "</b>\".</p>"; +} else { + $list_html = "<ul>"; + foreach my $res (@results) { + $list_html .= sprintf('<li><a href="/%s">%s</a><br><small>%s</small></li>', + $res->{path}, escape_html($res->{title}), escape_html($res->{snippet})); + } + $list_html .= "</ul>"; +} + +my $safe_q = escape_html($search_text); + +print <<"HTML"; +<!DOCTYPE html> +<html> +<head> + <meta charset="utf-8"> + <title>Regex Search Results</title> + <style> + body { font-family: sans-serif; line-height: 1.5; padding: 20px; } + .stats { background: #f4f4f4; padding: 15px; border-radius: 5px; + font-family: monospace; font-size: 0.9em; border: 1px solid #ddd; margin-top: 20px; } + </style> +</head> +<body> + <h2>Regex Search (Linear Crawl)</h2> + <form method="GET"> + <input type="text" name="q" value="$safe_q"> + <input type="submit" value="Search"> + </form> + + $list_html + + <div class="stats"> + <strong>Performance Metrics:</strong><br> + Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds<br> + User CPU: $user_cpu s<br> + System CPU: $system_cpu s<br> + Peak RAM: $max_rss KB<br> + Files Read: $files_read (IO Activity) + </div> +</body> +</html> +HTML diff --git a/_site/cgi-bin/find_sa.cgi b/find_sa.cgi index 6b608a7..6b608a7 100644..100755 --- a/_site/cgi-bin/find_sa.cgi +++ b/find_sa.cgi diff --git a/_site/cgi-bin/find_sa_mmap.cgi b/find_sa_mmap.cgi index 979f4d5..979f4d5 100644..100755 --- a/_site/cgi-bin/find_sa_mmap.cgi +++ b/find_sa_mmap.cgi diff --git a/_site/cgi-bin/sa_indexer.pl b/indexer.pl index 2395dac..0b197ff 100644..100755 --- a/_site/cgi-bin/sa_indexer.pl +++ b/indexer.pl @@ -7,7 +7,7 @@ use Storable qw(store); use Time::HiRes qw(gettimeofday tv_interval); # Configuration -my $directory = '../log'; +my $directory = '_site/log'; my $corpus_file = 'corpus.bin'; my $sa_file = 'sa.bin'; my $map_file = 'file_map.dat'; diff --git a/sa_stats.txt b/sa_stats.txt deleted file mode 100644 index 15217d6..0000000 --- a/sa_stats.txt +++ /dev/null @@ -1,29 +0,0 @@ -500 article index stats: - -Total Time: 0.1475 seconds -Files Processed: 500 -File Sizes (KB): - corpus.bin 33.59 KB - sa.bin 134.34 KB - file_map.dat 37.01 KB - TOTAL INDEX: 204.94 KB - -1K article index stats: - -Total Time: 0.3101 seconds -Files Processed: 1000 -File Sizes (KB): - corpus.bin 67.28 KB - sa.bin 269.11 KB - file_map.dat 74.12 KB - TOTAL INDEX: 410.51 KB - -10K article index stats: - -Total Time: 10.9661 seconds -Files Processed: 10000 -File Sizes (KB): - corpus.bin 682.51 KB - sa.bin 2730.05 KB - file_map.dat 750.88 KB - TOTAL INDEX: 4163.44 KB |
