diff options
| author | Sadeep Madurange <sadeep@asciimx.com> | 2026-01-03 12:58:01 +0800 |
|---|---|---|
| committer | Sadeep Madurange <sadeep@asciimx.com> | 2026-01-03 12:58:01 +0800 |
| commit | 8a4da6809cf9368cd6a5dd7351181ea4256453f9 (patch) | |
| tree | 77b2e109ba979332d81799a957bbfa86d010b81b | |
| download | site-search-bm-8a4da6809cf9368cd6a5dd7351181ea4256453f9.tar.gz | |
| -rw-r--r-- | .gitignore | 3 | ||||
| -rw-r--r-- | _site/cgi-bin/find_file.cgi | 143 | ||||
| -rw-r--r-- | _site/cgi-bin/find_glob.cgi | 147 | ||||
| -rw-r--r-- | _site/cgi-bin/find_one_file.cgi | 184 | ||||
| -rw-r--r-- | _site/cgi-bin/find_sa.cgi | 173 | ||||
| -rw-r--r-- | _site/cgi-bin/indexer.pl | 34 | ||||
| -rw-r--r-- | _site/cgi-bin/sa_indexer.pl | 86 | ||||
| -rwxr-xr-x | _site/cgi-bin/seed.sh | 28 | ||||
| -rw-r--r-- | bm_10k.txt | 34 | ||||
| -rw-r--r-- | bm_1k.txt | 34 | ||||
| -rw-r--r-- | bm_500.txt | 34 | ||||
| -rw-r--r-- | sa_stats.txt | 29 |
12 files changed, 929 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ab21b1a --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +_site/log/ +**/*.dat +**/*.bin diff --git a/_site/cgi-bin/find_file.cgi b/_site/cgi-bin/find_file.cgi new file mode 100644 index 0000000..2ffb808 --- /dev/null +++ b/_site/cgi-bin/find_file.cgi @@ -0,0 +1,143 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use File::Find; +use Time::HiRes qw(gettimeofday tv_interval); +use BSD::Resource; # For memory and granular CPU usage + +# 1. Start Benchmark Timer +my $start_time = [gettimeofday]; + +sub escape_html { + my $str = shift; + return "" unless defined $str; + $str =~ s/&/&/g; + $str =~ s/</</g; + $str =~ s/>/>/g; + $str =~ s/"/"/g; + $str =~ s/'/'/g; + return $str; +} + +my %params; +if ($ENV{QUERY_STRING}) { + foreach my $pair (split /&/, $ENV{QUERY_STRING}) { + my ($key, $value) = split /=/, $pair; + $value =~ tr/+/ /; + $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; + $params{$key} = $value; + } +} + +my $search_text = $params{'q'} || ''; +$search_text = substr($search_text, 0, 64); +$search_text =~ s/[^a-zA-Z0-9 ]//g; + +my $directory = '../log/'; +my @results; + +my %excluded_files = ( + 'index.html' => 1, +); + +# Track IO operations (simple count) +my $files_read = 0; + +if ($search_text =~ /\S/) { + find({ + wanted => sub { + return unless -f $_ && $_ eq 'index.html'; + my $rel_path = $File::Find::name; + $rel_path =~ s|^\Q$directory\E/?||; + return if $excluded_files{$rel_path}; + + if (open my $fh, '<', $_) { + $files_read++; # Tracking IO + my $content = do { local $/; <$fh> }; + close $fh; + + if ($content =~ /\Q$search_text\E/i) { + my ($title) = $content =~ /<title>(.*?)<\/title>/is; + $title = $title ? escape_html($title) : $rel_path; + my ($p_content) = $content =~ /<p[^>]*>(.*?)<\/p>/is; + my $snippet = $p_content || ""; + $snippet =~ s/<[^>]*>//g; + $snippet =~ s/\s+/ /g; + + $snippet = escape_html(substr($snippet, 0, 50)); + $snippet .= "..." if length($p_content || "") > 50; + + push @results, { + path => $File::Find::name, + title => $title, + snippet => $snippet + }; + } + } + }, + no_chdir => 0, + follow => 0, + }, $directory); +} + +# --- 2. Calculate Metrics --- +my $end_time = [gettimeofday]; +my $elapsed = tv_interval($start_time, $end_time); + +# CPU & RAM Usage via BSD::Resource +my $rusage = getrusage(); +my $user_cpu = $rusage->utime; # User CPU time +my $system_cpu = $rusage->stime; # System CPU time +my $max_rss = $rusage->maxrss; # Peak RAM (usually in KB on Linux) + +# --- 3. Output --- +print "Content-Type: text/html\n\n"; + +my $list; +if ($search_text eq '') { + $list = "<p>Please enter a search term above.</p>"; +} elsif (@results == 0) { + $list = "<p>No results found for \"<b>$search_text</b>\".</p>"; +} else { + $list = "<ul>"; + foreach my $res (@results) { + my $url = $res->{path}; + $list .= "<li><a href=\"/$url\">$res->{title}</a><br><small>$res->{snippet}</small></li>"; + } + $list .= "</ul>"; +} + +my $safe_search_text = escape_html($search_text); + +print <<"HTML"; +<!DOCTYPE html> +<html lang="en-us"> +<head> + <meta charset="utf-8"> + <title>Search Results</title> + <style> + .stats { background: #f4f4f4; padding: 10px; border-radius: 5px; font-family: monospace; font-size: 0.85em; margin-top: 20px; border: 1px solid #ddd; } + </style> +</head> +<body> + <div class="container"> + <h2>Search</h2> + <form action="" method="GET"> + <input type="text" name="q" value="$safe_search_text"> + <input type="submit" value="Search"> + </form> + $list + + <div class="stats"> + <strong>Performance Metrics:</strong><br> + Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds<br> + User CPU: $user_cpu s<br> + System CPU: $system_cpu s<br> + Peak RAM: $max_rss KB<br> + Files Read: $files_read (IO Activity) + </div> + </div> +</body> +</html> +HTML diff --git a/_site/cgi-bin/find_glob.cgi b/_site/cgi-bin/find_glob.cgi new file mode 100644 index 0000000..db03bd5 --- /dev/null +++ b/_site/cgi-bin/find_glob.cgi @@ -0,0 +1,147 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Encode qw(decode_utf8); +use HTML::Escape qw(escape_html); +use Time::HiRes qw(gettimeofday tv_interval); +use BSD::Resource; + +# 1. Start Benchmark Timer +my $start_time = [gettimeofday]; + +my $search_text = ''; +if ($ENV{QUERY_STRING} && $ENV{QUERY_STRING} =~ /^q=([^&]*)/) { + $search_text = decode_utf8($1 // ""); + $search_text =~ s/\P{Print}//g; + $search_text = substr($search_text, 0, 64); + $search_text =~ s/^\s+|\s+$//g; +} + +my @results; +my $files_read = 0; # Track IO Activity + +my $start_dir = '../log'; +my @files = glob("$start_dir/*/index.html"); + +foreach my $path (@files) { + next if -l $path || ! -f $path; + + # Using :encoding(UTF-8) to handle the valid text files + next unless open(my $fh, "<:encoding(UTF-8)", $path); + $files_read++; + my $html = do { local $/; <$fh> }; + close($fh); + + my ($text) = $html =~ m|<main>(.*?)</main>|is; + $text =~ s|<[^>]+>| |g; + $text =~ s|\s+| |g; + + next unless $text =~ /(.{0,40})(\Q$search_text\E)(.{0,40})/is; + my ($before, $actual, $after) = ($1, $2, $3); + + $after =~ s/\s\S*$// if length($after) > 25; + $before =~ s/^.*?\s// if length($before) > 25; + + if ($before =~ /\S/) { + $before = ucfirst($before); + } else { + $before = ""; + $actual = ucfirst($actual); + } + + my $safe_before = escape_html($before); + my $safe_actual = escape_html($actual); + my $safe_after = escape_html($after); + my $snippet = "${safe_before}<b>${safe_actual}</b>${safe_after}..."; + + my ($title) = $html =~ m|<title>(.*?)</title>|is; + my $safe_title = escape_html($title || "No Title"); + + push @results, { + path => $path, + title => $safe_title, + snippet => $snippet + }; +} + +# 2. Calculate Metrics +my $end_time = [gettimeofday]; +my $elapsed = tv_interval($start_time, $end_time); + +my $rusage = getrusage(); +my $user_cpu = $rusage->utime; +my $system_cpu = $rusage->stime; +my $max_rss = $rusage->maxrss; + +# 3. Output +print "Content-Type: text/html\n\n"; + +my $list; +if ($search_text eq '') { + $list = "<p>Please enter a search term above.</p>"; +} elsif (@results == 0) { + $list = "<p>No results found for \"<b>$search_text</b>\".</p>"; +} else { + $list = "<ul>"; + foreach my $res (@results) { + my $url = $res->{path}; + $list .= "<li><a href=\"/$url\">$res->{title}</a><br><small>$res->{snippet}</small></li>"; + } + $list .= "</ul>"; +} + +my $safe_search_text = escape_html($search_text); +my $year = (localtime)[5] + 1900; + +print <<"HTML"; +<!DOCTYPE html> +<html lang="en-us"> +<head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Search</title> + <link rel="stylesheet" href="/assets/css/main.css"> + <link rel="stylesheet" href="/assets/css/skeleton.css"> +</head> +<body> + <div id="nav-container" class="container"> + <ul id="navlist" class="left"> + <li><a href="/" class="link-decor-none">hme</a></li> + <li><a href="/log/" class="link-decor-none">log</a></li> + <li><a href="/projects/" class="link-decor-none">poc</a></li> + <li><a href="/about/" class="link-decor-none">abt</a></li> + <li class="active"><a href="/cgi-bin/find.cgi" class="link-decor-none">sws</a></li> + <li><a href="/feed.xml" class="link-decor-none">rss</a></li> + </ul> + </div> + <main class="container" id="main"> + <div class="container"> + <h2>Search</h2> + <form action="" method="GET"> + <input id="search-box" type="text" name="q" value="$safe_search_text"> + <input id="search-btn" type="submit" value="Search"> + </form> + $list + + <div style="background: #f4f4f4; padding: 10px; border-radius: 5px; font-family: monospace; font-size: 0.85em; margin-top: 20px; border: 1px solid #ddd;"> + <strong>Performance Metrics:</strong><br> + Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds<br> + User CPU: $user_cpu s<br> + System CPU: $system_cpu s<br> + Peak RAM: $max_rss KB<br> + Files Read: $files_read (IO Activity) + </div> + </div> + </main> + <div class="footer"> + <div class="container"> + <div class="twelve columns right container-2"> + <p id="footer-text">© ASCIIMX - $year</p> + </div> + </div> + </div> +</body> +</html> +HTML + diff --git a/_site/cgi-bin/find_one_file.cgi b/_site/cgi-bin/find_one_file.cgi new file mode 100644 index 0000000..a28f8c4 --- /dev/null +++ b/_site/cgi-bin/find_one_file.cgi @@ -0,0 +1,184 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Storable qw(retrieve); +use Encode qw(decode_utf8); +use HTML::Escape qw(escape_html); +use Time::HiRes qw(gettimeofday tv_interval); +use BSD::Resource; + +# 1. Start Benchmark Timer +my $start_time = [gettimeofday]; +my $files_read = 0; # Track IO Activity + +# Configuration +my $max_parallel = 100; +my $lock_timeout = 30; +my $max_results = 1000; +my $min_query_len = 3; +my $index_file = 'search_index.dat'; +my $lock_dir = '/tmp/search_locks'; + +# Concurrency control +mkdir $lock_dir, 0777 unless -d $lock_dir; +my $active_count = 0; +my $now = time(); + +opendir(my $dh, $lock_dir); +while (my $file = readdir($dh)) { + next unless $file =~ /\.lock$/; + my $path = "$lock_dir/$file"; + my $mtime = (stat($path))[9] || 0; + ( $now - $mtime > $lock_timeout ) ? unlink($path) : $active_count++; +} +closedir($dh); + +# Too many search requests +if ($active_count >= $max_parallel) { + print "Content-Type: text/html\n\n"; + render_html("<p>Server busy. Please try again in a few seconds.</p>", "", (localtime)[5]+1900); + exit; +} + +my $lock_file = "$lock_dir/$$.lock"; +open(my $fh_lock, '>', $lock_file); +$files_read++; # IO for lock creation + +my $search_text = ''; +if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { + $search_text = decode_utf8($1 // ""); + $search_text =~ s/\P{Print}//g; + $search_text = substr($search_text, 0, 64); + $search_text =~ s/^\s+|\s+$//g; +} + +my $safe_search_text = escape_html($search_text); +my $year = (localtime)[5] + 1900; + +print "Content-Type: text/html\n\n"; + +if ($search_text eq '') { + final_output("<p>Please enter a search term above.</p>"); +} + +if (length($search_text) < $min_query_len) { + final_output("<p>Search term is too short. Please enter at least $min_query_len characters.</p>"); +} + +if (!-f $index_file) { + final_output("<p>Search temporarily unavailable.</p>"); +} + +# IO for index retrieval +my $index = retrieve($index_file); +$files_read++; + +my @results; +my $found = 0; + +foreach my $url (sort keys %$index) { + last if $found >= $max_results; + my $data = $index->{$url}; + + next unless $data->{c} =~ /(.{0,40})(\Q$search_text\E)(.{0,40})/is; + my ($before, $actual, $after) = ($1, $2, $3); + $found++; + + $after =~ s/\s\S*$// if length($after) > 25; + $before =~ s/^.*?\s// if length($before) > 25; + + $before = ($before =~ /\S/) ? ucfirst($before) : ""; + $actual = ($before eq "") ? ucfirst($actual) : $actual; + + my $snippet = escape_html($before) . "<b>" . escape_html($actual) . "</b>" . escape_html($after) . "..."; + + push @results, { + path => $url, + title => escape_html($data->{t}), + snippet => $snippet + }; +} + +my $list_html = ""; +if (@results == 0) { + $list_html = "<p>No results found for \"<b>$safe_search_text</b>\".</p>"; +} else { + $list_html = "<ul>" . join('', map { + "<li><a href=\"/$_->{path}\">$_->{title}</a><br><small>$_->{snippet}</small></li>" + } @results) . "</ul>"; +} + +final_output($list_html); + +sub final_output { + my ($content) = @_; + + # 2. Calculate Metrics just before rendering + my $elapsed = tv_interval($start_time, [gettimeofday]); + my $rusage = getrusage(); + my $user_cpu = $rusage->utime; + my $system_cpu = $rusage->stime; + my $max_rss = $rusage->maxrss; + + my $bench_html = <<"BENCH"; +<div style="background: #f4f4f4; padding: 10px; border-radius: 5px; font-family: monospace; font-size: 0.85em; margin-top: 20px; border: 1px solid #ddd;"> + <strong>Performance Metrics:</strong><br> + Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds<br> + User CPU: $user_cpu s<br> + System CPU: $system_cpu s<br> + Peak RAM: $max_rss KB<br> + Files Read: $files_read (IO Activity) +</div> +BENCH + + render_html($content . $bench_html, $safe_search_text, $year); + close($fh_lock) if $fh_lock; + unlink($lock_file) if -f $lock_file; + exit; +} + +sub render_html { + my ($content, $q_val, $yr) = @_; + print <<"HTML"; +<!DOCTYPE html> +<html lang="en-us"> +<head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Search</title> + <link rel="stylesheet" href="/assets/css/main.css"> + <link rel="stylesheet" href="/assets/css/skeleton.css"> +</head> +<body> + <div id="nav-container" class="container"> + <ul id="navlist" class="left"> + <li><a href="/" class="link-decor-none">hme</a></li> + <li><a href="/log/" class="link-decor-none">log</a></li> + <li><a href="/projects/" class="link-decor-none">poc</a></li> + <li><a href="/about/" class="link-decor-none">abt</a></li> + <li class="active"><a href="/cgi-bin/find.cgi" class="link-decor-none">sws</a></li> + <li><a href="/feed.xml" class="link-decor-none">rss</a></li> + </ul> + </div> + <main class="container" id="main"> + <div class="container"> + <h2>Search</h2> + <form action="" method="GET"> + <input id="search-box" type="text" name="q" value="$q_val"> + <input id="search-btn" type="submit" value="Search"> + </form> + $content + </div> + </main> + <div class="footer"> + <div class="container"> + <div class="twelve columns right container-2"> + <p id="footer-text">© ASCIIMX - $yr</p> + </div> + </div> + </div> +</body> +</html> +HTML +} diff --git a/_site/cgi-bin/find_sa.cgi b/_site/cgi-bin/find_sa.cgi new file mode 100644 index 0000000..6b608a7 --- /dev/null +++ b/_site/cgi-bin/find_sa.cgi @@ -0,0 +1,173 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Storable qw(retrieve); +use Encode qw(decode_utf8); +use HTML::Escape qw(escape_html); +use Time::HiRes qw(gettimeofday tv_interval); +use BSD::Resource; + +# 1. Start Benchmark Timer +my $start_time = [gettimeofday]; +my $files_read = 0; # Track IO Activity + +# Decode search text +my $search_text = ''; +if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { + $search_text = decode_utf8($1 // ""); + $search_text =~ s/\P{Print}//g; + $search_text = substr($search_text, 0, 64); + $search_text =~ s/^\s+|\s+$//g; +} + +# We search using lowercase for the case-insensitive index +my $query = lc($search_text); +my $query_len = length($query); +my @results; + +if ($query_len >= 3 && -f 'sa.bin' && -f 'corpus.bin') { + open(my $fh_sa, '<', 'sa.bin') or die $!; + open(my $fh_cp, '<', 'corpus.bin') or die $!; + my $file_map = retrieve('file_map.dat'); + $files_read += 3; + + my $total_suffixes = (-s 'sa.bin') / 4; + + # Helper for binary search comparisons + sub compare_at { + my ($idx, $fh_sa, $fh_cp, $q, $len) = @_; + seek($fh_sa, $idx * 4, 0); + read($fh_sa, my $bin_off, 4); + my $off = unpack("L", $bin_off); + + seek($fh_cp, $off, 0); + read($fh_cp, my $text, $len); + return ($text cmp $q, $off); + } + + # Range Search: Find Left and Right boundaries + my ($low, $high) = (0, $total_suffixes - 1); + my $first_hit = -1; + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + my ($cmp) = compare_at($mid, $fh_sa, $fh_cp, $query, $query_len); + if ($cmp >= 0) { $first_hit = $mid if $cmp == 0; $high = $mid - 1; } + else { $low = $mid + 1; } + } + + if ($first_hit != -1) { + ($low, $high) = ($first_hit, $total_suffixes - 1); + my $last_hit = $first_hit; + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + my ($cmp) = compare_at($mid, $fh_sa, $fh_cp, $query, $query_len); + if ($cmp <= 0) { $last_hit = $mid if $cmp == 0; $low = $mid + 1; } + else { $high = $mid - 1; } + } + + # Collect unique file results + my %seen; + for my $i ($first_hit .. $last_hit) { + my (undef, $offset) = compare_at($i, $fh_sa, $fh_cp, $query, $query_len); + foreach my $m (@$file_map) { + if ($offset >= $m->{start} && $offset < $m->{end}) { + if (!$seen{$m->{path}}++) { + my $snip_start = ($offset - 30 < $m->{start}) ? $m->{start} : $offset - 30; + seek($fh_cp, $snip_start, 0); + read($fh_cp, my $raw_snip, 80); + + push @results, { + path => $m->{path}, + title => "Result: " . (split('/', $m->{path}))[-2], + snippet => "..." . escape_html($raw_snip) . "..." + }; + } + last; + } + } + last if @results >= 20; + } + } + close($fh_sa); + close($fh_cp); +} + +# 2. Calculate Metrics +my $end_time = [gettimeofday]; +my $elapsed = tv_interval($start_time, $end_time); + +my $rusage = getrusage(); +my $user_cpu = $rusage->utime; +my $system_cpu = $rusage->stime; +my $max_rss = $rusage->maxrss; + +# 3. Output +print "Content-Type: text/html\n\n"; + +my $list; +if ($search_text eq '') { + $list = "<p>Please enter a search term above.</p>"; +} elsif (@results == 0) { + $list = "<p>No results found for \"<b>" . escape_html($search_text) . "</b>\".</p>"; +} else { + $list = "<ul>"; + foreach my $res (@results) { + $list .= "<li><a href=\"/$res->{path}\">$res->{title}</a><br><small>$res->{snippet}</small></li>"; + } + $list .= "</ul>"; +} + +my $safe_search_text = escape_html($search_text); +my $year = (localtime)[5] + 1900; + +print <<"HTML"; +<!DOCTYPE html> +<html lang="en-us"> +<head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Search</title> + <link rel="stylesheet" href="/assets/css/main.css"> + <link rel="stylesheet" href="/assets/css/skeleton.css"> +</head> +<body> + <div id="nav-container" class="container"> + <ul id="navlist" class="left"> + <li><a href="/" class="link-decor-none">hme</a></li> + <li><a href="/log/" class="link-decor-none">log</a></li> + <li><a href="/projects/" class="link-decor-none">poc</a></li> + <li><a href="/about/" class="link-decor-none">abt</a></li> + <li class="active"><a href="/cgi-bin/find.cgi" class="link-decor-none">sws</a></li> + <li><a href="/feed.xml" class="link-decor-none">rss</a></li> + </ul> + </div> + <main class="container" id="main"> + <div class="container"> + <h2>Search</h2> + <form action="" method="GET"> + <input id="search-box" type="text" name="q" value="$safe_search_text"> + <input id="search-btn" type="submit" value="Search"> + </form> + $list + + <div style="background: #f4f4f4; padding: 10px; border-radius: 5px; font-family: monospace; font-size: 0.85em; margin-top: 20px; border: 1px solid #ddd;"> + <strong>Performance Metrics:</strong><br> + Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds<br> + User CPU: $user_cpu s<br> + System CPU: $system_cpu s<br> + Peak RAM: $max_rss KB<br> + Files Read: $files_read (IO Activity) + </div> + </div> + </main> + <div class="footer"> + <div class="container"> + <div class="twelve columns right container-2"> + <p id="footer-text">© ASCIIMX - $year</p> + </div> + </div> + </div> +</body> +</html> +HTML diff --git a/_site/cgi-bin/indexer.pl b/_site/cgi-bin/indexer.pl new file mode 100644 index 0000000..0dcd7e2 --- /dev/null +++ b/_site/cgi-bin/indexer.pl @@ -0,0 +1,34 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Storable qw(nstore); +use HTML::Entities qw(decode_entities); + +# --- Configuration --- +my $built_site_dir = '../log'; +my $output_file = 'search_index.dat'; +my %index; + +print "Building search index from $built_site_dir...\n"; + +foreach my $path (glob("$built_site_dir/*/index.html")) { + next unless open(my $fh, '<:utf8', $path); + my $html = do { local $/; <$fh> }; + close($fh); + + # Extract Title and Main Content + my ($title) = $html =~ m|<title>(.*?)</title>|is || "Unknown"; + my ($main) = $html; + + # Normalize path + my $url = $path; + + $index{$url} = { + t => $title || "Untitled", + c => $main + }; +} + +nstore(\%index, $output_file); +printf("Index complete: %d files (%.2f KB)\n", scalar(keys %index), (-s $output_file) / 1024); diff --git a/_site/cgi-bin/sa_indexer.pl b/_site/cgi-bin/sa_indexer.pl new file mode 100644 index 0000000..2395dac --- /dev/null +++ b/_site/cgi-bin/sa_indexer.pl @@ -0,0 +1,86 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use File::Find; +use Storable qw(store); +use Time::HiRes qw(gettimeofday tv_interval); + +# Configuration +my $directory = '../log'; +my $corpus_file = 'corpus.bin'; +my $sa_file = 'sa.bin'; +my $map_file = 'file_map.dat'; + +# Start timing +my $t0 = [gettimeofday]; + +my $corpus = ""; +my @file_map; + +print "1. Building Case-Insensitive Corpus...\n"; +find({ + wanted => sub { + return unless -f $_ && $_ eq 'index.html'; + if (open my $fh, '<:encoding(UTF-8)', $_) { + my $content = do { local $/; <$fh> }; + close $fh; + + my ($text) = $content =~ m|<main>(.*?)</main>|is; + $text //= $content; + $text =~ s|<[^>]+>| |g; + $text =~ s|\s+| |g; + + my $start = length($corpus); + $corpus .= lc($text) . "\0"; + push @file_map, { start => $start, end => length($corpus), path => $File::Find::name }; + } + }, + no_chdir => 0, +}, $directory); + +print "2. Sorting Suffixes (Two-Pass Cache-Optimized)...\n"; +my @sa = 0 .. (length($corpus) - 1); + +@sa = sort { + (substr($corpus, $a, 64) cmp substr($corpus, $b, 64)) + || + (substr($corpus, $a) cmp substr($corpus, $b)) +} @sa; + +print "3. Writing Index Files to Disk...\n"; +open my $cfh, '>', $corpus_file or die $!; +print $cfh $corpus; +close $cfh; + +open my $sfh, '>', $sa_file or die $!; +binmode($sfh); +print $sfh pack("L*", @sa); +close $sfh; + +store \@file_map, $map_file; + +# End timing +my $elapsed = tv_interval($t0); + +# Calculate Sizes +my $c_size = -s $corpus_file; +my $s_size = -s $sa_file; +my $m_size = -s $map_file; +my $total = $c_size + $s_size + $m_size; + +# --- Final Report --- +print "\n" . "="x35 . "\n"; +print " INDEX BUILDING COMPLETE\n"; +print "="x35 . "\n"; +printf "Total Time: %.4f seconds\n", $elapsed; +print "Files Processed: " . scalar(@file_map) . "\n"; +print "-"x35 . "\n"; +print "File Sizes (KB):\n"; +printf " %-14s %10.2f KB\n", $corpus_file, $c_size / 1024; +printf " %-14s %10.2f KB\n", $sa_file, $s_size / 1024; +printf " %-14s %10.2f KB\n", $map_file, $m_size / 1024; +print "-"x35 . "\n"; +printf " TOTAL INDEX: %10.2f KB\n", $total / 1024; +print "="x35 . "\n"; + diff --git a/_site/cgi-bin/seed.sh b/_site/cgi-bin/seed.sh new file mode 100755 index 0000000..9d85108 --- /dev/null +++ b/_site/cgi-bin/seed.sh @@ -0,0 +1,28 @@ +#!/bin/ksh + +# Set the number of files/dirs +TOTAL=10000 +# Approximate size in blocks +BLOCK_SIZE=16 +COUNT=1 + +for i in $(seq 1 $TOTAL); do + # Create a unique directory name + DIR="site_$i" + mkdir -p "$DIR" + + # 1. Generate random valid ASCII (valid UTF-8) text + # We read more from urandom than needed because tr will filter some out + dd if=/dev/urandom bs=1024 count=$BLOCK_SIZE 2>/dev/null | tr -dc 'a-zA-Z0-9 \n' > "$DIR/index.html" + + # 2. Append the necessary HTML structure so your Perl regexes work + # This adds the <title> and <main> tags your script looks for + echo "<html><head><title>Site $i</title></head><body><main><p>Searchable content here for keyword_$i. Lorem ipsum text follows.</p></main></body></html>" >> "$DIR/index.html" + + # Optional: print progress every 100 files + if [ $((i % 100)) -eq 0 ]; then + echo "Created $i files..." + fi +done + +echo "Done! 10000 directories created with valid text." diff --git a/bm_10k.txt b/bm_10k.txt new file mode 100644 index 0000000..fbb0932 --- /dev/null +++ b/bm_10k.txt @@ -0,0 +1,34 @@ +Benchmarks with 10000 16KB files + +Crawl directory at query time using File::Find + +Total Time: 0.9120 s +User CPU: 0.4 s +System CPU: 0.54 s +Peak RAM: 12804 KB +Files Read: 10000 (IO Activity) + +Glob files at query time + +Total Time: 0.9786 s +User CPU: 0.7 s +System CPU: 0.3 s +Peak RAM: 10216 KB +Files Read: 10000 (IO Activity) + +Single file (10000 files (41991.79 KB)) + +Total Time: 15.0889 s +User CPU: 15.06 s +System CPU: 0.06 s +Peak RAM: 101988 KB +Files Read: 2 (IO Activity) + +SA index + +Total Time: 0.0161 seconds<br> +User CPU: 0.03 s<br> +System CPU: 0.03 s<br> +Peak RAM: 12504 KB<br> +Files Read: 3 (IO Activity) + diff --git a/bm_1k.txt b/bm_1k.txt new file mode 100644 index 0000000..f443e2e --- /dev/null +++ b/bm_1k.txt @@ -0,0 +1,34 @@ +Benchmarks with 1000 16KB files + +Crawl directory at query time using File::Find + +Total Time: 0.0795 seconds<br> +User CPU: 0.05 s<br> +System CPU: 0.05 s<br> +Peak RAM: 9460 KB<br> +Files Read: 1000 (IO Activity) + +Glob files at query time + +Total Time: 0.0740 seconds<br> +User CPU: 0.09 s<br> +System CPU: 0.02 s<br> +Peak RAM: 8952 KB<br> +Files Read: 1000 (IO Activity) + +Single file (1000 files (4196.95 KB)) + +Total Time: 1.4814 seconds<br> +User CPU: 1.49 s<br> +System CPU: 0.02 s<br> +Peak RAM: 20264 KB<br> +Files Read: 2 (IO Activity) + +SA index + +Total Time: 0.0019 seconds<br> +User CPU: 0.02 s<br> +System CPU: 0.01 s<br> +Peak RAM: 8980 KB<br> +Files Read: 3 (IO Activity) + diff --git a/bm_500.txt b/bm_500.txt new file mode 100644 index 0000000..f7b82e9 --- /dev/null +++ b/bm_500.txt @@ -0,0 +1,34 @@ +Benchmarks with 500 16KB files + +Crawl directory at query time using File::Find + +Total Time: 0.0407 seconds<br> +User CPU: 0.06 s<br> +System CPU: 0.02 s<br> +Peak RAM: 9136 KB<br> +Files Read: 500 (IO Activity) + +Glob files at query time + +Total Time: 0.0373 seconds<br> +User CPU: 0.06 s<br> +System CPU: 0.01 s<br> +Peak RAM: 8932 KB<br> +Files Read: 500 (IO Activity) + +Single file (500 files (2099.46 KB)) + +Total Time: 0.7322 seconds<br> +User CPU: 0.73 s<br> +System CPU: 0.02 s<br> +Peak RAM: 14564 KB<br> +Files Read: 2 (IO Activity) + +SA index + +Total Time: 0.0012 seconds<br> +User CPU: 0.01 s<br> +System CPU: 0.02 s<br> +Peak RAM: 8828 KB<br> +Files Read: 3 (IO Activity) + diff --git a/sa_stats.txt b/sa_stats.txt new file mode 100644 index 0000000..15217d6 --- /dev/null +++ b/sa_stats.txt @@ -0,0 +1,29 @@ +500 article index stats: + +Total Time: 0.1475 seconds +Files Processed: 500 +File Sizes (KB): + corpus.bin 33.59 KB + sa.bin 134.34 KB + file_map.dat 37.01 KB + TOTAL INDEX: 204.94 KB + +1K article index stats: + +Total Time: 0.3101 seconds +Files Processed: 1000 +File Sizes (KB): + corpus.bin 67.28 KB + sa.bin 269.11 KB + file_map.dat 74.12 KB + TOTAL INDEX: 410.51 KB + +10K article index stats: + +Total Time: 10.9661 seconds +Files Processed: 10000 +File Sizes (KB): + corpus.bin 682.51 KB + sa.bin 2730.05 KB + file_map.dat 750.88 KB + TOTAL INDEX: 4163.44 KB |
