From 819bf74c2841fabdcc481e12e13615d48a92cb7f Mon Sep 17 00:00:00 2001 From: Sadeep Madurange Date: Wed, 6 May 2026 19:42:33 +0800 Subject: Change directory structure and add benchmark runner. --- .gitignore | 2 + README.txt | 15 +++ _site/cgi-bin/find_file.cgi | 143 --------------------------- _site/cgi-bin/find_glob.cgi | 147 ---------------------------- _site/cgi-bin/find_one_file.cgi | 184 ---------------------------------- _site/cgi-bin/find_sa.cgi | 173 -------------------------------- _site/cgi-bin/find_sa_mmap.cgi | 212 ---------------------------------------- _site/cgi-bin/indexer.pl | 34 ------- _site/cgi-bin/sa_indexer.pl | 86 ---------------- _site/cgi-bin/seed.sh | 28 ------ benchmark.pl | 86 ++++++++++++++++ bm_10k.txt | 34 ------- bm_1k.txt | 34 ------- bm_500.txt | 34 ------- find_glob.cgi | 147 ++++++++++++++++++++++++++++ find_regex.cgi | 144 +++++++++++++++++++++++++++ find_sa.cgi | 173 ++++++++++++++++++++++++++++++++ find_sa_mmap.cgi | 212 ++++++++++++++++++++++++++++++++++++++++ indexer.pl | 86 ++++++++++++++++ sa_stats.txt | 29 ------ 20 files changed, 865 insertions(+), 1138 deletions(-) delete mode 100644 _site/cgi-bin/find_file.cgi delete mode 100644 _site/cgi-bin/find_glob.cgi delete mode 100644 _site/cgi-bin/find_one_file.cgi delete mode 100644 _site/cgi-bin/find_sa.cgi delete mode 100644 _site/cgi-bin/find_sa_mmap.cgi delete mode 100644 _site/cgi-bin/indexer.pl delete mode 100644 _site/cgi-bin/sa_indexer.pl delete mode 100755 _site/cgi-bin/seed.sh create mode 100755 benchmark.pl delete mode 100644 bm_10k.txt delete mode 100644 bm_1k.txt delete mode 100644 bm_500.txt create mode 100755 find_glob.cgi create mode 100755 find_regex.cgi create mode 100755 find_sa.cgi create mode 100755 find_sa_mmap.cgi create mode 100755 indexer.pl delete mode 100644 sa_stats.txt diff --git a/.gitignore b/.gitignore index ab21b1a..bda10d2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ _site/log/ **/*.dat **/*.bin +**/*.swp +**/*.core diff --git 
a/README.txt b/README.txt index c0e54ea..0bcf66a 100644 --- a/README.txt +++ b/README.txt @@ -6,3 +6,18 @@ HOW TO USE 4. In _site/cgi_bin/ execute indexer script (e.g., perl sa_indexer.pl) 5. Run search query: QUERY_STRING="q=ard" perl find_sa_mmap.cgi +Directory structure: + +. +|-- seed.sh (Shell script) +|-- benchmark.pl (The runner we just wrote) +|-- _site +| |-- cgi-bin +| | |-- indexer.pl (Creates the SA index) +| | |-- find_sa.cgi (Suffix Array search) +| | `-- find_regex.cgi (Regex search) +| `-- log/ (Created by seed.sh) + +chmod +x seed.sh benchmark.pl _site/cgi-bin/*.cgi _site/cgi-bin/*.pl + + diff --git a/_site/cgi-bin/find_file.cgi b/_site/cgi-bin/find_file.cgi deleted file mode 100644 index 2ffb808..0000000 --- a/_site/cgi-bin/find_file.cgi +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use File::Find; -use Time::HiRes qw(gettimeofday tv_interval); -use BSD::Resource; # For memory and granular CPU usage - -# 1. Start Benchmark Timer -my $start_time = [gettimeofday]; - -sub escape_html { - my $str = shift; - return "" unless defined $str; - $str =~ s/&/&/g; - $str =~ s//>/g; - $str =~ s/"/"/g; - $str =~ s/'/'/g; - return $str; -} - -my %params; -if ($ENV{QUERY_STRING}) { - foreach my $pair (split /&/, $ENV{QUERY_STRING}) { - my ($key, $value) = split /=/, $pair; - $value =~ tr/+/ /; - $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; - $params{$key} = $value; - } -} - -my $search_text = $params{'q'} || ''; -$search_text = substr($search_text, 0, 64); -$search_text =~ s/[^a-zA-Z0-9 ]//g; - -my $directory = '../log/'; -my @results; - -my %excluded_files = ( - 'index.html' => 1, -); - -# Track IO operations (simple count) -my $files_read = 0; - -if ($search_text =~ /\S/) { - find({ - wanted => sub { - return unless -f $_ && $_ eq 'index.html'; - my $rel_path = $File::Find::name; - $rel_path =~ s|^\Q$directory\E/?||; - return if $excluded_files{$rel_path}; - - if (open my $fh, '<', $_) { - $files_read++; # 
Tracking IO - my $content = do { local $/; <$fh> }; - close $fh; - - if ($content =~ /\Q$search_text\E/i) { - my ($title) = $content =~ /(.*?)<\/title>/is; - $title = $title ? escape_html($title) : $rel_path; - my ($p_content) = $content =~ /<p[^>]*>(.*?)<\/p>/is; - my $snippet = $p_content || ""; - $snippet =~ s/<[^>]*>//g; - $snippet =~ s/\s+/ /g; - - $snippet = escape_html(substr($snippet, 0, 50)); - $snippet .= "..." if length($p_content || "") > 50; - - push @results, { - path => $File::Find::name, - title => $title, - snippet => $snippet - }; - } - } - }, - no_chdir => 0, - follow => 0, - }, $directory); -} - -# --- 2. Calculate Metrics --- -my $end_time = [gettimeofday]; -my $elapsed = tv_interval($start_time, $end_time); - -# CPU & RAM Usage via BSD::Resource -my $rusage = getrusage(); -my $user_cpu = $rusage->utime; # User CPU time -my $system_cpu = $rusage->stime; # System CPU time -my $max_rss = $rusage->maxrss; # Peak RAM (usually in KB on Linux) - -# --- 3. Output --- -print "Content-Type: text/html\n\n"; - -my $list; -if ($search_text eq '') { - $list = "<p>Please enter a search term above.</p>"; -} elsif (@results == 0) { - $list = "<p>No results found for \"<b>$search_text</b>\".</p>"; -} else { - $list = "<ul>"; - foreach my $res (@results) { - my $url = $res->{path}; - $list .= "<li><a href=\"/$url\">$res->{title}</a><br><small>$res->{snippet}</small></li>"; - } - $list .= "</ul>"; -} - -my $safe_search_text = escape_html($search_text); - -print <<"HTML"; -<!DOCTYPE html> -<html lang="en-us"> -<head> - <meta charset="utf-8"> - <title>Search Results - - - -
-

Search

-
- - -
- $list - -
- Performance Metrics:
- Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds
- User CPU: $user_cpu s
- System CPU: $system_cpu s
- Peak RAM: $max_rss KB
- Files Read: $files_read (IO Activity) -
-
- - -HTML diff --git a/_site/cgi-bin/find_glob.cgi b/_site/cgi-bin/find_glob.cgi deleted file mode 100644 index db03bd5..0000000 --- a/_site/cgi-bin/find_glob.cgi +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Encode qw(decode_utf8); -use HTML::Escape qw(escape_html); -use Time::HiRes qw(gettimeofday tv_interval); -use BSD::Resource; - -# 1. Start Benchmark Timer -my $start_time = [gettimeofday]; - -my $search_text = ''; -if ($ENV{QUERY_STRING} && $ENV{QUERY_STRING} =~ /^q=([^&]*)/) { - $search_text = decode_utf8($1 // ""); - $search_text =~ s/\P{Print}//g; - $search_text = substr($search_text, 0, 64); - $search_text =~ s/^\s+|\s+$//g; -} - -my @results; -my $files_read = 0; # Track IO Activity - -my $start_dir = '../log'; -my @files = glob("$start_dir/*/index.html"); - -foreach my $path (@files) { - next if -l $path || ! -f $path; - - # Using :encoding(UTF-8) to handle the valid text files - next unless open(my $fh, "<:encoding(UTF-8)", $path); - $files_read++; - my $html = do { local $/; <$fh> }; - close($fh); - - my ($text) = $html =~ m|
(.*?)
|is; - $text =~ s|<[^>]+>| |g; - $text =~ s|\s+| |g; - - next unless $text =~ /(.{0,40})(\Q$search_text\E)(.{0,40})/is; - my ($before, $actual, $after) = ($1, $2, $3); - - $after =~ s/\s\S*$// if length($after) > 25; - $before =~ s/^.*?\s// if length($before) > 25; - - if ($before =~ /\S/) { - $before = ucfirst($before); - } else { - $before = ""; - $actual = ucfirst($actual); - } - - my $safe_before = escape_html($before); - my $safe_actual = escape_html($actual); - my $safe_after = escape_html($after); - my $snippet = "${safe_before}${safe_actual}${safe_after}..."; - - my ($title) = $html =~ m|(.*?)|is; - my $safe_title = escape_html($title || "No Title"); - - push @results, { - path => $path, - title => $safe_title, - snippet => $snippet - }; -} - -# 2. Calculate Metrics -my $end_time = [gettimeofday]; -my $elapsed = tv_interval($start_time, $end_time); - -my $rusage = getrusage(); -my $user_cpu = $rusage->utime; -my $system_cpu = $rusage->stime; -my $max_rss = $rusage->maxrss; - -# 3. Output -print "Content-Type: text/html\n\n"; - -my $list; -if ($search_text eq '') { - $list = "

Please enter a search term above.

"; -} elsif (@results == 0) { - $list = "

No results found for \"$search_text\".

"; -} else { - $list = ""; -} - -my $safe_search_text = escape_html($search_text); -my $year = (localtime)[5] + 1900; - -print <<"HTML"; - - - - - - Search - - - - - -
-
-

Search

-
- - -
- $list - -
- Performance Metrics:
- Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds
- User CPU: $user_cpu s
- System CPU: $system_cpu s
- Peak RAM: $max_rss KB
- Files Read: $files_read (IO Activity) -
-
-
- - - -HTML - diff --git a/_site/cgi-bin/find_one_file.cgi b/_site/cgi-bin/find_one_file.cgi deleted file mode 100644 index a28f8c4..0000000 --- a/_site/cgi-bin/find_one_file.cgi +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Storable qw(retrieve); -use Encode qw(decode_utf8); -use HTML::Escape qw(escape_html); -use Time::HiRes qw(gettimeofday tv_interval); -use BSD::Resource; - -# 1. Start Benchmark Timer -my $start_time = [gettimeofday]; -my $files_read = 0; # Track IO Activity - -# Configuration -my $max_parallel = 100; -my $lock_timeout = 30; -my $max_results = 1000; -my $min_query_len = 3; -my $index_file = 'search_index.dat'; -my $lock_dir = '/tmp/search_locks'; - -# Concurrency control -mkdir $lock_dir, 0777 unless -d $lock_dir; -my $active_count = 0; -my $now = time(); - -opendir(my $dh, $lock_dir); -while (my $file = readdir($dh)) { - next unless $file =~ /\.lock$/; - my $path = "$lock_dir/$file"; - my $mtime = (stat($path))[9] || 0; - ( $now - $mtime > $lock_timeout ) ? unlink($path) : $active_count++; -} -closedir($dh); - -# Too many search requests -if ($active_count >= $max_parallel) { - print "Content-Type: text/html\n\n"; - render_html("

Server busy. Please try again in a few seconds.

", "", (localtime)[5]+1900); - exit; -} - -my $lock_file = "$lock_dir/$$.lock"; -open(my $fh_lock, '>', $lock_file); -$files_read++; # IO for lock creation - -my $search_text = ''; -if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { - $search_text = decode_utf8($1 // ""); - $search_text =~ s/\P{Print}//g; - $search_text = substr($search_text, 0, 64); - $search_text =~ s/^\s+|\s+$//g; -} - -my $safe_search_text = escape_html($search_text); -my $year = (localtime)[5] + 1900; - -print "Content-Type: text/html\n\n"; - -if ($search_text eq '') { - final_output("

Please enter a search term above.

"); -} - -if (length($search_text) < $min_query_len) { - final_output("

Search term is too short. Please enter at least $min_query_len characters.

"); -} - -if (!-f $index_file) { - final_output("

Search temporarily unavailable.

"); -} - -# IO for index retrieval -my $index = retrieve($index_file); -$files_read++; - -my @results; -my $found = 0; - -foreach my $url (sort keys %$index) { - last if $found >= $max_results; - my $data = $index->{$url}; - - next unless $data->{c} =~ /(.{0,40})(\Q$search_text\E)(.{0,40})/is; - my ($before, $actual, $after) = ($1, $2, $3); - $found++; - - $after =~ s/\s\S*$// if length($after) > 25; - $before =~ s/^.*?\s// if length($before) > 25; - - $before = ($before =~ /\S/) ? ucfirst($before) : ""; - $actual = ($before eq "") ? ucfirst($actual) : $actual; - - my $snippet = escape_html($before) . "" . escape_html($actual) . "" . escape_html($after) . "..."; - - push @results, { - path => $url, - title => escape_html($data->{t}), - snippet => $snippet - }; -} - -my $list_html = ""; -if (@results == 0) { - $list_html = "

No results found for \"$safe_search_text\".

"; -} else { - $list_html = ""; -} - -final_output($list_html); - -sub final_output { - my ($content) = @_; - - # 2. Calculate Metrics just before rendering - my $elapsed = tv_interval($start_time, [gettimeofday]); - my $rusage = getrusage(); - my $user_cpu = $rusage->utime; - my $system_cpu = $rusage->stime; - my $max_rss = $rusage->maxrss; - - my $bench_html = <<"BENCH"; -
- Performance Metrics:
- Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds
- User CPU: $user_cpu s
- System CPU: $system_cpu s
- Peak RAM: $max_rss KB
- Files Read: $files_read (IO Activity) -
-BENCH - - render_html($content . $bench_html, $safe_search_text, $year); - close($fh_lock) if $fh_lock; - unlink($lock_file) if -f $lock_file; - exit; -} - -sub render_html { - my ($content, $q_val, $yr) = @_; - print <<"HTML"; - - - - - - Search - - - - - -
-
-

Search

-
- - -
- $content -
-
- - - -HTML -} diff --git a/_site/cgi-bin/find_sa.cgi b/_site/cgi-bin/find_sa.cgi deleted file mode 100644 index 6b608a7..0000000 --- a/_site/cgi-bin/find_sa.cgi +++ /dev/null @@ -1,173 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Storable qw(retrieve); -use Encode qw(decode_utf8); -use HTML::Escape qw(escape_html); -use Time::HiRes qw(gettimeofday tv_interval); -use BSD::Resource; - -# 1. Start Benchmark Timer -my $start_time = [gettimeofday]; -my $files_read = 0; # Track IO Activity - -# Decode search text -my $search_text = ''; -if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { - $search_text = decode_utf8($1 // ""); - $search_text =~ s/\P{Print}//g; - $search_text = substr($search_text, 0, 64); - $search_text =~ s/^\s+|\s+$//g; -} - -# We search using lowercase for the case-insensitive index -my $query = lc($search_text); -my $query_len = length($query); -my @results; - -if ($query_len >= 3 && -f 'sa.bin' && -f 'corpus.bin') { - open(my $fh_sa, '<', 'sa.bin') or die $!; - open(my $fh_cp, '<', 'corpus.bin') or die $!; - my $file_map = retrieve('file_map.dat'); - $files_read += 3; - - my $total_suffixes = (-s 'sa.bin') / 4; - - # Helper for binary search comparisons - sub compare_at { - my ($idx, $fh_sa, $fh_cp, $q, $len) = @_; - seek($fh_sa, $idx * 4, 0); - read($fh_sa, my $bin_off, 4); - my $off = unpack("L", $bin_off); - - seek($fh_cp, $off, 0); - read($fh_cp, my $text, $len); - return ($text cmp $q, $off); - } - - # Range Search: Find Left and Right boundaries - my ($low, $high) = (0, $total_suffixes - 1); - my $first_hit = -1; - while ($low <= $high) { - my $mid = int(($low + $high) / 2); - my ($cmp) = compare_at($mid, $fh_sa, $fh_cp, $query, $query_len); - if ($cmp >= 0) { $first_hit = $mid if $cmp == 0; $high = $mid - 1; } - else { $low = $mid + 1; } - } - - if ($first_hit != -1) { - ($low, $high) = ($first_hit, $total_suffixes - 1); - my $last_hit = $first_hit; - while ($low <= $high) { - my $mid = int(($low + $high) / 2); - my ($cmp) = 
compare_at($mid, $fh_sa, $fh_cp, $query, $query_len); - if ($cmp <= 0) { $last_hit = $mid if $cmp == 0; $low = $mid + 1; } - else { $high = $mid - 1; } - } - - # Collect unique file results - my %seen; - for my $i ($first_hit .. $last_hit) { - my (undef, $offset) = compare_at($i, $fh_sa, $fh_cp, $query, $query_len); - foreach my $m (@$file_map) { - if ($offset >= $m->{start} && $offset < $m->{end}) { - if (!$seen{$m->{path}}++) { - my $snip_start = ($offset - 30 < $m->{start}) ? $m->{start} : $offset - 30; - seek($fh_cp, $snip_start, 0); - read($fh_cp, my $raw_snip, 80); - - push @results, { - path => $m->{path}, - title => "Result: " . (split('/', $m->{path}))[-2], - snippet => "..." . escape_html($raw_snip) . "..." - }; - } - last; - } - } - last if @results >= 20; - } - } - close($fh_sa); - close($fh_cp); -} - -# 2. Calculate Metrics -my $end_time = [gettimeofday]; -my $elapsed = tv_interval($start_time, $end_time); - -my $rusage = getrusage(); -my $user_cpu = $rusage->utime; -my $system_cpu = $rusage->stime; -my $max_rss = $rusage->maxrss; - -# 3. Output -print "Content-Type: text/html\n\n"; - -my $list; -if ($search_text eq '') { - $list = "

Please enter a search term above.

"; -} elsif (@results == 0) { - $list = "

No results found for \"" . escape_html($search_text) . "\".

"; -} else { - $list = ""; -} - -my $safe_search_text = escape_html($search_text); -my $year = (localtime)[5] + 1900; - -print <<"HTML"; - - - - - - Search - - - - - -
-
-

Search

-
- - -
- $list - -
- Performance Metrics:
- Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds
- User CPU: $user_cpu s
- System CPU: $system_cpu s
- Peak RAM: $max_rss KB
- Files Read: $files_read (IO Activity) -
-
-
- - - -HTML diff --git a/_site/cgi-bin/find_sa_mmap.cgi b/_site/cgi-bin/find_sa_mmap.cgi deleted file mode 100644 index 979f4d5..0000000 --- a/_site/cgi-bin/find_sa_mmap.cgi +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Storable qw(retrieve); -use Encode qw(decode_utf8); -use HTML::Escape qw(escape_html); -use Time::HiRes qw(gettimeofday tv_interval); -use BSD::Resource; -use Sys::Mmap; - -my $sa_file = 'sa.bin'; # Suffix Array index -my $cp_file = 'corpus.bin'; # Raw text corpus -my $map_file = 'file_map.dat'; # File metadata - -# 1. Start Benchmark Timer -my $start_time = [gettimeofday]; -my $files_read = 0; # Track IO Activity - -# Decode search text -my $search_text = ''; -if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { - $search_text = decode_utf8($1 // ""); - $search_text =~ s/\P{Print}//g; - $search_text = substr($search_text, 0, 64); - $search_text =~ s/^\s+|\s+$//g; -} - -# We search using lowercase for the case-insensitive index -my $query = lc($search_text); -my $query_len = length($query); -my @results; - -if ($query_len >= 3 && -f 'sa.bin' && -f 'corpus.bin') { - open(my $fh_sa, '<', $sa_file) or die $!; - open(my $fh_cp, '<', $cp_file) or die $!; - binmode($fh_sa); - binmode($fh_cp); - - # Memory map files - my ($sa_mapped, $cp_mapped); - mmap($sa_mapped, 0, PROT_READ, MAP_SHARED, $fh_sa) or die "Could not map SA: $!"; - mmap($cp_mapped, 0, PROT_READ, MAP_SHARED, $fh_cp) or die "Could not map Corpus: $!"; - - my $file_map = retrieve($map_file); - $files_read += 3; - my $total_suffixes = (-s $sa_file) / 4; - - # Range Search: Find Left and Right boundaries - my ($low, $high) = (0, $total_suffixes - 1); - my $first_hit = -1; - while ($low <= $high) { - my $mid = int(($low + $high) / 2); - my $off = unpack("L", substr($sa_mapped, $mid * 4, 4)); - my $text = substr($cp_mapped, $off, $query_len); - my $cmp = $text cmp $query; - if ($cmp >= 0) { - $first_hit = $mid if $cmp == 0; - $high = $mid - 1; - } else { - $low = 
$mid + 1; - } - } - - if ($first_hit != -1) { - ($low, $high) = ($first_hit, $total_suffixes - 1); - my $last_hit = $first_hit; - while ($low <= $high) { - my $mid = int(($low + $high) / 2); - my $off = unpack("L", substr($sa_mapped, $mid * 4, 4)); - my $text = substr($cp_mapped, $off, $query_len); - if (($text cmp $query) <= 0) { - $last_hit = $mid if $text eq $query; - $low = $mid + 1; - } else { - $high = $mid - 1; - } - } - - # Collect unique file results - my %seen; - for my $i ($first_hit .. $last_hit) { - my $offset = unpack("L", substr($sa_mapped, $i * 4, 4)); - foreach my $m (@$file_map) { - if ($offset >= $m->{start} && $offset < $m->{end}) { - if (!$seen{$m->{path}}++) { - # Capture more than 50 chars for trimming - my $snip_start = ($offset - 30 < $m->{start}) ? $m->{start} : $offset - 30; - my $max_len = $m->{end} - $snip_start; - my $read_len = ($max_len > 120) ? 120 : $max_len; - my $raw_snip = substr($cp_mapped, $snip_start, $read_len); - my $snippet = decode_utf8($raw_snip, Encode::FB_QUIET) // $raw_snip; - $snippet =~ s/\s+/ /g; # Normalize whitespace - - # Trim start: Partial word removal - if ($snip_start > $m->{start}) { - $snippet =~ s/^[^\s]*\s//; - } - - # Trim end: Length limit and partial word removal - my $has_more = 0; - if (length($snippet) > 50) { - $snippet = substr($snippet, 0, 50); - $has_more = 1 if $snippet =~ s/\s+[^\s]*$//; - } - elsif ($snip_start + $read_len < $m->{end}) { - # This check handles snippets that are naturally short but - # there's still more text in the article we didn't read - $has_more = 1; - } - - # Cleanup & capitalize - $snippet = ucfirst($snippet); - $snippet = escape_html($snippet) . ($has_more ? "..." : ""); - - my $clean_path = $m->{path}; - $clean_path =~ s|^\.\./_site/||; - - push @results, { - path => $clean_path, - title => $m->{title},, - snippet => $snippet - }; - } - last; - } - } - last if scalar @results >= 1000; - } - } - close($fh_sa); - close($fh_cp); -} - -# 2. 
Calculate Metrics -my $end_time = [gettimeofday]; -my $elapsed = tv_interval($start_time, $end_time); - -my $rusage = getrusage(); -my $user_cpu = $rusage->utime; -my $system_cpu = $rusage->stime; -my $max_rss = $rusage->maxrss; - -# 3. Output -print "Content-Type: text/html\n\n"; - -my $list; -if ($search_text eq '') { - $list = "

Please enter a search term above.

"; -} elsif (@results == 0) { - $list = "

No results found for \"" . escape_html($search_text) . "\".

"; -} else { - $list = ""; -} - -my $safe_search_text = escape_html($search_text); -my $year = (localtime)[5] + 1900; - -print <<"HTML"; - - - - - - Search - - - - - -
-
-

Search

-
- - -
- $list - -
- Performance Metrics:
- Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds
- User CPU: $user_cpu s
- System CPU: $system_cpu s
- Peak RAM: $max_rss KB
- Files Read: $files_read (IO Activity) -
-
-
- - - -HTML diff --git a/_site/cgi-bin/indexer.pl b/_site/cgi-bin/indexer.pl deleted file mode 100644 index 0dcd7e2..0000000 --- a/_site/cgi-bin/indexer.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Storable qw(nstore); -use HTML::Entities qw(decode_entities); - -# --- Configuration --- -my $built_site_dir = '../log'; -my $output_file = 'search_index.dat'; -my %index; - -print "Building search index from $built_site_dir...\n"; - -foreach my $path (glob("$built_site_dir/*/index.html")) { - next unless open(my $fh, '<:utf8', $path); - my $html = do { local $/; <$fh> }; - close($fh); - - # Extract Title and Main Content - my ($title) = $html =~ m|(.*?)|is || "Unknown"; - my ($main) = $html; - - # Normalize path - my $url = $path; - - $index{$url} = { - t => $title || "Untitled", - c => $main - }; -} - -nstore(\%index, $output_file); -printf("Index complete: %d files (%.2f KB)\n", scalar(keys %index), (-s $output_file) / 1024); diff --git a/_site/cgi-bin/sa_indexer.pl b/_site/cgi-bin/sa_indexer.pl deleted file mode 100644 index 2395dac..0000000 --- a/_site/cgi-bin/sa_indexer.pl +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use File::Find; -use Storable qw(store); -use Time::HiRes qw(gettimeofday tv_interval); - -# Configuration -my $directory = '../log'; -my $corpus_file = 'corpus.bin'; -my $sa_file = 'sa.bin'; -my $map_file = 'file_map.dat'; - -# Start timing -my $t0 = [gettimeofday]; - -my $corpus = ""; -my @file_map; - -print "1. Building Case-Insensitive Corpus...\n"; -find({ - wanted => sub { - return unless -f $_ && $_ eq 'index.html'; - if (open my $fh, '<:encoding(UTF-8)', $_) { - my $content = do { local $/; <$fh> }; - close $fh; - - my ($text) = $content =~ m|
(.*?)
|is; - $text //= $content; - $text =~ s|<[^>]+>| |g; - $text =~ s|\s+| |g; - - my $start = length($corpus); - $corpus .= lc($text) . "\0"; - push @file_map, { start => $start, end => length($corpus), path => $File::Find::name }; - } - }, - no_chdir => 0, -}, $directory); - -print "2. Sorting Suffixes (Two-Pass Cache-Optimized)...\n"; -my @sa = 0 .. (length($corpus) - 1); - -@sa = sort { - (substr($corpus, $a, 64) cmp substr($corpus, $b, 64)) - || - (substr($corpus, $a) cmp substr($corpus, $b)) -} @sa; - -print "3. Writing Index Files to Disk...\n"; -open my $cfh, '>', $corpus_file or die $!; -print $cfh $corpus; -close $cfh; - -open my $sfh, '>', $sa_file or die $!; -binmode($sfh); -print $sfh pack("L*", @sa); -close $sfh; - -store \@file_map, $map_file; - -# End timing -my $elapsed = tv_interval($t0); - -# Calculate Sizes -my $c_size = -s $corpus_file; -my $s_size = -s $sa_file; -my $m_size = -s $map_file; -my $total = $c_size + $s_size + $m_size; - -# --- Final Report --- -print "\n" . "="x35 . "\n"; -print " INDEX BUILDING COMPLETE\n"; -print "="x35 . "\n"; -printf "Total Time: %.4f seconds\n", $elapsed; -print "Files Processed: " . scalar(@file_map) . "\n"; -print "-"x35 . "\n"; -print "File Sizes (KB):\n"; -printf " %-14s %10.2f KB\n", $corpus_file, $c_size / 1024; -printf " %-14s %10.2f KB\n", $sa_file, $s_size / 1024; -printf " %-14s %10.2f KB\n", $map_file, $m_size / 1024; -print "-"x35 . "\n"; -printf " TOTAL INDEX: %10.2f KB\n", $total / 1024; -print "="x35 . "\n"; - diff --git a/_site/cgi-bin/seed.sh b/_site/cgi-bin/seed.sh deleted file mode 100755 index 5ae14df..0000000 --- a/_site/cgi-bin/seed.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/ksh - -# Set the number of files/dirs -TOTAL=500 -# Approximate size in blocks -BLOCK_SIZE=16 -COUNT=1 - -for i in $(seq 1 $TOTAL); do - # Create a unique directory name - DIR="site_$i" - mkdir -p "$DIR" - - # 1. 
Generate random valid ASCII (valid UTF-8) text - # We read more from urandom than needed because tr will filter some out - dd if=/dev/urandom bs=1024 count=$BLOCK_SIZE 2>/dev/null | tr -dc 'a-zA-Z0-9 \n' > "$DIR/index.html" - - # 2. Append the necessary HTML structure so your Perl regexes work - # This adds the and <main> tags your script looks for - echo "<html><head><title>Site $i

Searchable content here for keyword_$i. Lorem ipsum text follows.

" >> "$DIR/index.html" - - # Optional: print progress every 100 files - if [ $((i % 100)) -eq 0 ]; then - echo "Created $i files..." - fi -done - -echo "Done! 10000 directories created with valid text." diff --git a/benchmark.pl b/benchmark.pl new file mode 100755 index 0000000..8c1b4ea --- /dev/null +++ b/benchmark.pl @@ -0,0 +1,86 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Time::HiRes qw(gettimeofday tv_interval); + +# 1. Accept directory counts from @ARGV, or use defaults +my @test_counts = @ARGV ? @ARGV : (500, 1000, 10000); + +# Configuration - All scripts are now in the root +my $seed_script = "./seed.sh"; +my $indexer = "./indexer.pl"; +my $sa_cgi = "./find_sa.cgi"; +my $regex_cgi = "./find_regex.cgi"; + +print "=============================================================\n"; +print "SEARCH BENCHMARK: Suffix array vs. Linear regex\n"; +print "ARTICLE SIZE: 16 KB\n"; +print "=============================================================\n\n"; + +foreach my $count (@test_counts) { + my $search_query = "keyword_$count"; + + print "$count files (Targeting: $search_query):\n"; + print "-------------------------------------------------------------\n"; + print sprintf("%-15s | %-20s | %-20s\n", "METRIC", "SA", "REGEX"); + print "----------------+----------------------+---------------------\n"; + + # 1. Seed + system("$seed_script $count > /dev/null 2>&1"); + + # 2. Cleanup old index files + unlink('sa.bin', 'corpus.bin', 'file_map.dat'); + + # 3. Indexing + my $idx_start = [gettimeofday]; + system("perl $indexer > /dev/null 2>&1"); + my $idx_time = tv_interval($idx_start); + + my $idx_size = 0; + if (-f 'sa.bin' && -f 'corpus.bin') { + $idx_size = ((-s 'sa.bin') + (-s 'corpus.bin')) / 1024; + } + + # 4. SA Search + my $sa_out = `QUERY_STRING="q=$search_query" perl $sa_cgi`; + my ($sa_time, $sa_ram) = parse_metrics($sa_out); + + # 5. 
Regex Search + my $reg_out = `QUERY_STRING="q=$search_query" perl $regex_cgi`; + my ($reg_time, $reg_ram) = parse_metrics($reg_out); + + # 6. Final Output Table + print sprintf("%-15s | %-20s | %-20s\n", + "Search time", + sprintf("%.4fs", $sa_time), + sprintf("%.4fs", $reg_time) + ); + + print sprintf("%-15s | %-20s | %-20s\n", + "Peak RAM", + sprintf("%d KB", $sa_ram), + sprintf("%d KB", $reg_ram) + ); + + print sprintf("%-15s | %-20s | %-20s\n", + "Indexing time", + sprintf("%.4fs", $idx_time), + "N/A" + ); + + print sprintf("%-15s | %-20s | %-20s\n", + "Index size", + sprintf("%.2f KB", $idx_size), + "N/A" + ); + + print "----------------+----------------------+---------------------\n\n"; +} + +sub parse_metrics { + my $text = shift || ""; + my $time = ($text =~ /Total Time:\s+([\d.]+)/) ? $1 : 0; + my $ram = ($text =~ /Peak RAM:\s+(\d+)/) ? $1 : 0; + return ($time, $ram); +} + diff --git a/bm_10k.txt b/bm_10k.txt deleted file mode 100644 index fbb0932..0000000 --- a/bm_10k.txt +++ /dev/null @@ -1,34 +0,0 @@ -Benchmarks with 10000 16KB files - -Crawl directory at query time using File::Find - -Total Time: 0.9120 s -User CPU: 0.4 s -System CPU: 0.54 s -Peak RAM: 12804 KB -Files Read: 10000 (IO Activity) - -Glob files at query time - -Total Time: 0.9786 s -User CPU: 0.7 s -System CPU: 0.3 s -Peak RAM: 10216 KB -Files Read: 10000 (IO Activity) - -Single file (10000 files (41991.79 KB)) - -Total Time: 15.0889 s -User CPU: 15.06 s -System CPU: 0.06 s -Peak RAM: 101988 KB -Files Read: 2 (IO Activity) - -SA index - -Total Time: 0.0161 seconds
-User CPU: 0.03 s<br>
-System CPU: 0.03 s<br>
-Peak RAM: 12504 KB<br>
-Files Read: 3 (IO Activity) - diff --git a/bm_1k.txt b/bm_1k.txt deleted file mode 100644 index f443e2e..0000000 --- a/bm_1k.txt +++ /dev/null @@ -1,34 +0,0 @@ -Benchmarks with 1000 16KB files - -Crawl directory at query time using File::Find - -Total Time: 0.0795 seconds
-User CPU: 0.05 s<br>
-System CPU: 0.05 s<br>
-Peak RAM: 9460 KB<br>
-Files Read: 1000 (IO Activity) - -Glob files at query time - -Total Time: 0.0740 seconds
-User CPU: 0.09 s<br>
-System CPU: 0.02 s<br>
-Peak RAM: 8952 KB<br>
-Files Read: 1000 (IO Activity) - -Single file (1000 files (4196.95 KB)) - -Total Time: 1.4814 seconds
-User CPU: 1.49 s<br>
-System CPU: 0.02 s<br>
-Peak RAM: 20264 KB<br>
-Files Read: 2 (IO Activity) - -SA index - -Total Time: 0.0019 seconds
-User CPU: 0.02 s<br>
-System CPU: 0.01 s<br>
-Peak RAM: 8980 KB<br>
-Files Read: 3 (IO Activity) - diff --git a/bm_500.txt b/bm_500.txt deleted file mode 100644 index f7b82e9..0000000 --- a/bm_500.txt +++ /dev/null @@ -1,34 +0,0 @@ -Benchmarks with 500 16KB files - -Crawl directory at query time using File::Find - -Total Time: 0.0407 seconds
-User CPU: 0.06 s<br>
-System CPU: 0.02 s<br>
-Peak RAM: 9136 KB<br>
-Files Read: 500 (IO Activity) - -Glob files at query time - -Total Time: 0.0373 seconds
-User CPU: 0.06 s<br>
-System CPU: 0.01 s<br>
-Peak RAM: 8932 KB<br>
-Files Read: 500 (IO Activity) - -Single file (500 files (2099.46 KB)) - -Total Time: 0.7322 seconds
-User CPU: 0.73 s<br>
-System CPU: 0.02 s<br>
-Peak RAM: 14564 KB<br>
-Files Read: 2 (IO Activity) - -SA index - -Total Time: 0.0012 seconds
-User CPU: 0.01 s<br>
-System CPU: 0.02 s<br>
-Peak RAM: 8828 KB<br>
-Files Read: 3 (IO Activity) - diff --git a/find_glob.cgi b/find_glob.cgi new file mode 100755 index 0000000..db03bd5 --- /dev/null +++ b/find_glob.cgi @@ -0,0 +1,147 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Encode qw(decode_utf8); +use HTML::Escape qw(escape_html); +use Time::HiRes qw(gettimeofday tv_interval); +use BSD::Resource; + +# 1. Start Benchmark Timer +my $start_time = [gettimeofday]; + +my $search_text = ''; +if ($ENV{QUERY_STRING} && $ENV{QUERY_STRING} =~ /^q=([^&]*)/) { + $search_text = decode_utf8($1 // ""); + $search_text =~ s/\P{Print}//g; + $search_text = substr($search_text, 0, 64); + $search_text =~ s/^\s+|\s+$//g; +} + +my @results; +my $files_read = 0; # Track IO Activity + +my $start_dir = '../log'; +my @files = glob("$start_dir/*/index.html"); + +foreach my $path (@files) { + next if -l $path || ! -f $path; + + # Using :encoding(UTF-8) to handle the valid text files + next unless open(my $fh, "<:encoding(UTF-8)", $path); + $files_read++; + my $html = do { local $/; <$fh> }; + close($fh); + + my ($text) = $html =~ m|
(.*?)
|is; + $text =~ s|<[^>]+>| |g; + $text =~ s|\s+| |g; + + next unless $text =~ /(.{0,40})(\Q$search_text\E)(.{0,40})/is; + my ($before, $actual, $after) = ($1, $2, $3); + + $after =~ s/\s\S*$// if length($after) > 25; + $before =~ s/^.*?\s// if length($before) > 25; + + if ($before =~ /\S/) { + $before = ucfirst($before); + } else { + $before = ""; + $actual = ucfirst($actual); + } + + my $safe_before = escape_html($before); + my $safe_actual = escape_html($actual); + my $safe_after = escape_html($after); + my $snippet = "${safe_before}${safe_actual}${safe_after}..."; + + my ($title) = $html =~ m|(.*?)|is; + my $safe_title = escape_html($title || "No Title"); + + push @results, { + path => $path, + title => $safe_title, + snippet => $snippet + }; +} + +# 2. Calculate Metrics +my $end_time = [gettimeofday]; +my $elapsed = tv_interval($start_time, $end_time); + +my $rusage = getrusage(); +my $user_cpu = $rusage->utime; +my $system_cpu = $rusage->stime; +my $max_rss = $rusage->maxrss; + +# 3. Output +print "Content-Type: text/html\n\n"; + +my $list; +if ($search_text eq '') { + $list = "

Please enter a search term above.

"; +} elsif (@results == 0) { + $list = "

No results found for \"$search_text\".

"; +} else { + $list = ""; +} + +my $safe_search_text = escape_html($search_text); +my $year = (localtime)[5] + 1900; + +print <<"HTML"; + + + + + + Search + + + + + +
+
+

Search

+
+ + +
+ $list + +
+ Performance Metrics:
+ Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds
+ User CPU: $user_cpu s
+ System CPU: $system_cpu s
+ Peak RAM: $max_rss KB
+ Files Read: $files_read (IO Activity) +
+
+
+ + + +HTML + diff --git a/find_regex.cgi b/find_regex.cgi new file mode 100755 index 0000000..d826c12 --- /dev/null +++ b/find_regex.cgi @@ -0,0 +1,144 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use File::Find; +use Time::HiRes qw(gettimeofday tv_interval); +use BSD::Resource; +use Encode qw(decode_utf8); + +# 1. Start Benchmark Timer +my $start_time = [gettimeofday]; + +# Helper to keep HTML output safe +sub escape_html { + my $str = shift; + return "" unless defined $str; + $str =~ s/&/&/g; + $str =~ s//>/g; + $str =~ s/"/"/g; + $str =~ s/'/'/g; + return $str; +} + +# Parse Query String (q=keyword) +my %params; +if ($ENV{QUERY_STRING}) { + foreach my $pair (split /&/, $ENV{QUERY_STRING}) { + my ($key, $value) = split /=/, $pair; + $value //= ''; + $value =~ tr/+/ /; + $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; + $params{$key} = decode_utf8($value); + } +} + +my $search_text = $params{'q'} || ''; +$search_text = substr($search_text, 0, 64); +$search_text =~ s/[^a-zA-Z0-9 ]//g; + +# Configuration +my $directory = '_site/log/'; +my @results; +my $files_read = 0; + +# 2. The Linear Search (Crawl) +if ($search_text =~ /\S/) { + find({ + wanted => sub { + # Only look at index.html files inside the subdirectories + return unless -f $_ && $_ eq 'index.html'; + + if (open my $fh, '<', $_) { + $files_read++; + # Slurp the entire file (approx 16KB per your seed script) + my $content = do { local $/; <$fh> }; + close $fh; + + # Regex match (Case Insensitive) + if ($content =~ /\Q$search_text\E/i) { + my ($title) = $content =~ /(.*?)<\/title>/is; + my ($p_content) = $content =~ /<p[^>]*>(.*?)<\/p>/is; + + # Clean up snippet + my $snippet = $p_content || ""; + $snippet =~ s/<[^>]*>//g; # Strip internal tags + $snippet =~ s/\s+/ /g; + $snippet = substr($snippet, 0, 100); + + push @results, { + path => $File::Find::name, + title => $title || $File::Find::name, + snippet => $snippet . "..." 
+ }; + } + } + # Stop collecting after 20 results for display, + # but the benchmark usually looks for unique keywords + # where only 1 result exists. + }, + no_chdir => 0, + }, $directory); +} + +# 3. Calculate Performance Metrics +my $end_time = [gettimeofday]; +my $elapsed = tv_interval($start_time, $end_time); + +my $rusage = getrusage(); +my $user_cpu = $rusage->utime; +my $system_cpu = $rusage->stime; +my $max_rss = $rusage->maxrss; + +# 4. Generate Output +print "Content-Type: text/html\n\n"; + +my $list_html = ""; +if ($search_text eq '') { + $list_html = "<p>Please enter a search term.</p>"; +} elsif (@results == 0) { + $list_html = "<p>No results found for \"<b>" . escape_html($search_text) . "</b>\".</p>"; +} else { + $list_html = "<ul>"; + foreach my $res (@results) { + $list_html .= sprintf('<li><a href="/%s">%s</a><br><small>%s</small></li>', + $res->{path}, escape_html($res->{title}), escape_html($res->{snippet})); + } + $list_html .= "</ul>"; +} + +my $safe_q = escape_html($search_text); + +print <<"HTML"; +<!DOCTYPE html> +<html> +<head> + <meta charset="utf-8"> + <title>Regex Search Results + + + +

Regex Search (Linear Crawl)

+
+ + +
+ + $list_html + +
+ Performance Metrics:
+ Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds
+ User CPU: $user_cpu s
+ System CPU: $system_cpu s
+ Peak RAM: $max_rss KB
+ Files Read: $files_read (IO Activity) +
+ + +HTML diff --git a/find_sa.cgi b/find_sa.cgi new file mode 100755 index 0000000..6b608a7 --- /dev/null +++ b/find_sa.cgi @@ -0,0 +1,173 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Storable qw(retrieve); +use Encode qw(decode_utf8); +use HTML::Escape qw(escape_html); +use Time::HiRes qw(gettimeofday tv_interval); +use BSD::Resource; + +# 1. Start Benchmark Timer +my $start_time = [gettimeofday]; +my $files_read = 0; # Track IO Activity + +# Decode search text +my $search_text = ''; +if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { + $search_text = decode_utf8($1 // ""); + $search_text =~ s/\P{Print}//g; + $search_text = substr($search_text, 0, 64); + $search_text =~ s/^\s+|\s+$//g; +} + +# We search using lowercase for the case-insensitive index +my $query = lc($search_text); +my $query_len = length($query); +my @results; + +if ($query_len >= 3 && -f 'sa.bin' && -f 'corpus.bin') { + open(my $fh_sa, '<', 'sa.bin') or die $!; + open(my $fh_cp, '<', 'corpus.bin') or die $!; + my $file_map = retrieve('file_map.dat'); + $files_read += 3; + + my $total_suffixes = (-s 'sa.bin') / 4; + + # Helper for binary search comparisons + sub compare_at { + my ($idx, $fh_sa, $fh_cp, $q, $len) = @_; + seek($fh_sa, $idx * 4, 0); + read($fh_sa, my $bin_off, 4); + my $off = unpack("L", $bin_off); + + seek($fh_cp, $off, 0); + read($fh_cp, my $text, $len); + return ($text cmp $q, $off); + } + + # Range Search: Find Left and Right boundaries + my ($low, $high) = (0, $total_suffixes - 1); + my $first_hit = -1; + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + my ($cmp) = compare_at($mid, $fh_sa, $fh_cp, $query, $query_len); + if ($cmp >= 0) { $first_hit = $mid if $cmp == 0; $high = $mid - 1; } + else { $low = $mid + 1; } + } + + if ($first_hit != -1) { + ($low, $high) = ($first_hit, $total_suffixes - 1); + my $last_hit = $first_hit; + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + my ($cmp) = compare_at($mid, $fh_sa, $fh_cp, $query, $query_len); 
+ if ($cmp <= 0) { $last_hit = $mid if $cmp == 0; $low = $mid + 1; } + else { $high = $mid - 1; } + } + + # Collect unique file results + my %seen; + for my $i ($first_hit .. $last_hit) { + my (undef, $offset) = compare_at($i, $fh_sa, $fh_cp, $query, $query_len); + foreach my $m (@$file_map) { + if ($offset >= $m->{start} && $offset < $m->{end}) { + if (!$seen{$m->{path}}++) { + my $snip_start = ($offset - 30 < $m->{start}) ? $m->{start} : $offset - 30; + seek($fh_cp, $snip_start, 0); + read($fh_cp, my $raw_snip, 80); + + push @results, { + path => $m->{path}, + title => "Result: " . (split('/', $m->{path}))[-2], + snippet => "..." . escape_html($raw_snip) . "..." + }; + } + last; + } + } + last if @results >= 20; + } + } + close($fh_sa); + close($fh_cp); +} + +# 2. Calculate Metrics +my $end_time = [gettimeofday]; +my $elapsed = tv_interval($start_time, $end_time); + +my $rusage = getrusage(); +my $user_cpu = $rusage->utime; +my $system_cpu = $rusage->stime; +my $max_rss = $rusage->maxrss; + +# 3. Output +print "Content-Type: text/html\n\n"; + +my $list; +if ($search_text eq '') { + $list = "

Please enter a search term above.

"; +} elsif (@results == 0) { + $list = "

No results found for \"" . escape_html($search_text) . "\".

"; +} else { + $list = ""; +} + +my $safe_search_text = escape_html($search_text); +my $year = (localtime)[5] + 1900; + +print <<"HTML"; + + + + + + Search + + + + + +
+
+

Search

+
+ + +
+ $list + +
+ Performance Metrics:
+ Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds
+ User CPU: $user_cpu s
+ System CPU: $system_cpu s
+ Peak RAM: $max_rss KB
+ Files Read: $files_read (IO Activity) +
+
+
+ + + +HTML diff --git a/find_sa_mmap.cgi b/find_sa_mmap.cgi new file mode 100755 index 0000000..979f4d5 --- /dev/null +++ b/find_sa_mmap.cgi @@ -0,0 +1,212 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Storable qw(retrieve); +use Encode qw(decode_utf8); +use HTML::Escape qw(escape_html); +use Time::HiRes qw(gettimeofday tv_interval); +use BSD::Resource; +use Sys::Mmap; + +my $sa_file = 'sa.bin'; # Suffix Array index +my $cp_file = 'corpus.bin'; # Raw text corpus +my $map_file = 'file_map.dat'; # File metadata + +# 1. Start Benchmark Timer +my $start_time = [gettimeofday]; +my $files_read = 0; # Track IO Activity + +# Decode search text +my $search_text = ''; +if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { + $search_text = decode_utf8($1 // ""); + $search_text =~ s/\P{Print}//g; + $search_text = substr($search_text, 0, 64); + $search_text =~ s/^\s+|\s+$//g; +} + +# We search using lowercase for the case-insensitive index +my $query = lc($search_text); +my $query_len = length($query); +my @results; + +if ($query_len >= 3 && -f 'sa.bin' && -f 'corpus.bin') { + open(my $fh_sa, '<', $sa_file) or die $!; + open(my $fh_cp, '<', $cp_file) or die $!; + binmode($fh_sa); + binmode($fh_cp); + + # Memory map files + my ($sa_mapped, $cp_mapped); + mmap($sa_mapped, 0, PROT_READ, MAP_SHARED, $fh_sa) or die "Could not map SA: $!"; + mmap($cp_mapped, 0, PROT_READ, MAP_SHARED, $fh_cp) or die "Could not map Corpus: $!"; + + my $file_map = retrieve($map_file); + $files_read += 3; + my $total_suffixes = (-s $sa_file) / 4; + + # Range Search: Find Left and Right boundaries + my ($low, $high) = (0, $total_suffixes - 1); + my $first_hit = -1; + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + my $off = unpack("L", substr($sa_mapped, $mid * 4, 4)); + my $text = substr($cp_mapped, $off, $query_len); + my $cmp = $text cmp $query; + if ($cmp >= 0) { + $first_hit = $mid if $cmp == 0; + $high = $mid - 1; + } else { + $low = $mid + 1; + } + } + + if ($first_hit != -1) { 
+ ($low, $high) = ($first_hit, $total_suffixes - 1); + my $last_hit = $first_hit; + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + my $off = unpack("L", substr($sa_mapped, $mid * 4, 4)); + my $text = substr($cp_mapped, $off, $query_len); + if (($text cmp $query) <= 0) { + $last_hit = $mid if $text eq $query; + $low = $mid + 1; + } else { + $high = $mid - 1; + } + } + + # Collect unique file results + my %seen; + for my $i ($first_hit .. $last_hit) { + my $offset = unpack("L", substr($sa_mapped, $i * 4, 4)); + foreach my $m (@$file_map) { + if ($offset >= $m->{start} && $offset < $m->{end}) { + if (!$seen{$m->{path}}++) { + # Capture more than 50 chars for trimming + my $snip_start = ($offset - 30 < $m->{start}) ? $m->{start} : $offset - 30; + my $max_len = $m->{end} - $snip_start; + my $read_len = ($max_len > 120) ? 120 : $max_len; + my $raw_snip = substr($cp_mapped, $snip_start, $read_len); + my $snippet = decode_utf8($raw_snip, Encode::FB_QUIET) // $raw_snip; + $snippet =~ s/\s+/ /g; # Normalize whitespace + + # Trim start: Partial word removal + if ($snip_start > $m->{start}) { + $snippet =~ s/^[^\s]*\s//; + } + + # Trim end: Length limit and partial word removal + my $has_more = 0; + if (length($snippet) > 50) { + $snippet = substr($snippet, 0, 50); + $has_more = 1 if $snippet =~ s/\s+[^\s]*$//; + } + elsif ($snip_start + $read_len < $m->{end}) { + # This check handles snippets that are naturally short but + # there's still more text in the article we didn't read + $has_more = 1; + } + + # Cleanup & capitalize + $snippet = ucfirst($snippet); + $snippet = escape_html($snippet) . ($has_more ? "..." : ""); + + my $clean_path = $m->{path}; + $clean_path =~ s|^\.\./_site/||; + + push @results, { + path => $clean_path, + title => $m->{title}, + snippet => $snippet + }; + } + last; + } + } + last if scalar @results >= 1000; + } + } + close($fh_sa); + close($fh_cp); +} + +# 2.
Calculate Metrics +my $end_time = [gettimeofday]; +my $elapsed = tv_interval($start_time, $end_time); + +my $rusage = getrusage(); +my $user_cpu = $rusage->utime; +my $system_cpu = $rusage->stime; +my $max_rss = $rusage->maxrss; + +# 3. Output +print "Content-Type: text/html\n\n"; + +my $list; +if ($search_text eq '') { + $list = "

Please enter a search term above.

"; +} elsif (@results == 0) { + $list = "

No results found for \"" . escape_html($search_text) . "\".

"; +} else { + $list = ""; +} + +my $safe_search_text = escape_html($search_text); +my $year = (localtime)[5] + 1900; + +print <<"HTML"; + + + + + + Search + + + + + +
+
+

Search

+
+ + +
+ $list + +
+ Performance Metrics:
+ Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds
+ User CPU: $user_cpu s
+ System CPU: $system_cpu s
+ Peak RAM: $max_rss KB
+ Files Read: $files_read (IO Activity) +
+
+
+ + + +HTML diff --git a/indexer.pl b/indexer.pl new file mode 100755 index 0000000..0b197ff --- /dev/null +++ b/indexer.pl @@ -0,0 +1,86 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use File::Find; +use Storable qw(store); +use Time::HiRes qw(gettimeofday tv_interval); + +# Configuration +my $directory = '_site/log'; +my $corpus_file = 'corpus.bin'; +my $sa_file = 'sa.bin'; +my $map_file = 'file_map.dat'; + +# Start timing +my $t0 = [gettimeofday]; + +my $corpus = ""; +my @file_map; + +print "1. Building Case-Insensitive Corpus...\n"; +find({ + wanted => sub { + return unless -f $_ && $_ eq 'index.html'; + if (open my $fh, '<:encoding(UTF-8)', $_) { + my $content = do { local $/; <$fh> }; + close $fh; + + my ($text) = $content =~ m|
(.*?)
|is; + $text //= $content; + $text =~ s|<[^>]+>| |g; + $text =~ s|\s+| |g; + + my $start = length($corpus); + $corpus .= lc($text) . "\0"; + push @file_map, { start => $start, end => length($corpus), path => $File::Find::name }; + } + }, + no_chdir => 0, +}, $directory); + +print "2. Sorting Suffixes (Two-Pass Cache-Optimized)...\n"; +my @sa = 0 .. (length($corpus) - 1); + +@sa = sort { + (substr($corpus, $a, 64) cmp substr($corpus, $b, 64)) + || + (substr($corpus, $a) cmp substr($corpus, $b)) +} @sa; + +print "3. Writing Index Files to Disk...\n"; +open my $cfh, '>', $corpus_file or die $!; +print $cfh $corpus; +close $cfh; + +open my $sfh, '>', $sa_file or die $!; +binmode($sfh); +print $sfh pack("L*", @sa); +close $sfh; + +store \@file_map, $map_file; + +# End timing +my $elapsed = tv_interval($t0); + +# Calculate Sizes +my $c_size = -s $corpus_file; +my $s_size = -s $sa_file; +my $m_size = -s $map_file; +my $total = $c_size + $s_size + $m_size; + +# --- Final Report --- +print "\n" . "="x35 . "\n"; +print " INDEX BUILDING COMPLETE\n"; +print "="x35 . "\n"; +printf "Total Time: %.4f seconds\n", $elapsed; +print "Files Processed: " . scalar(@file_map) . "\n"; +print "-"x35 . "\n"; +print "File Sizes (KB):\n"; +printf " %-14s %10.2f KB\n", $corpus_file, $c_size / 1024; +printf " %-14s %10.2f KB\n", $sa_file, $s_size / 1024; +printf " %-14s %10.2f KB\n", $map_file, $m_size / 1024; +print "-"x35 . "\n"; +printf " TOTAL INDEX: %10.2f KB\n", $total / 1024; +print "="x35 . 
"\n"; + diff --git a/sa_stats.txt b/sa_stats.txt deleted file mode 100644 index 15217d6..0000000 --- a/sa_stats.txt +++ /dev/null @@ -1,29 +0,0 @@ -500 article index stats: - -Total Time: 0.1475 seconds -Files Processed: 500 -File Sizes (KB): - corpus.bin 33.59 KB - sa.bin 134.34 KB - file_map.dat 37.01 KB - TOTAL INDEX: 204.94 KB - -1K article index stats: - -Total Time: 0.3101 seconds -Files Processed: 1000 -File Sizes (KB): - corpus.bin 67.28 KB - sa.bin 269.11 KB - file_map.dat 74.12 KB - TOTAL INDEX: 410.51 KB - -10K article index stats: - -Total Time: 10.9661 seconds -Files Processed: 10000 -File Sizes (KB): - corpus.bin 682.51 KB - sa.bin 2730.05 KB - file_map.dat 750.88 KB - TOTAL INDEX: 4163.44 KB -- cgit v1.2.3