| -rw-r--r-- | .gitignore             |   1 |
| -rw-r--r-- | _config.yml            |   6 |
| -rw-r--r-- | _log/site-search.md    |  58 |
| -rw-r--r-- | cgi-bin/find.cgi       | 230 |
| -rw-r--r-- | cgi-bin/rgx-search.cgi | 207 |
| -rw-r--r-- | cgi-bin/search.cgi     | 288 |
| -rwxr-xr-x | deploy.sh              |  29 |
| -rw-r--r-- | util/article_stats.sh  |  61 |
8 files changed, 495 insertions, 385 deletions
@@ -1,3 +1,4 @@ +.env .jekyll-cache/ **/*.swp diff --git a/_config.yml b/_config.yml index a700d6a..c78fa88 100644 --- a/_config.yml +++ b/_config.yml @@ -26,4 +26,8 @@ exclude: - Gemfile.lock - README.txt - cgi-bin/indexer.pl - - cgi-bin/find.cgi + - cgi-bin/rgx_search.cgi + - util + - deploy.sh + - .env + diff --git a/_log/site-search.md b/_log/site-search.md index c1d4c12..e25c0fc 100644 --- a/_log/site-search.md +++ b/_log/site-search.md @@ -1,10 +1,10 @@ --- -title: Under-engineered search +title: Overengineered search date: 2026-01-03 layout: post --- -Developed a suffix-array-based search engine for the site today. While a simple +Developed a suffix-array-based search engine my personal site. While a simple regex search was enough, couldn't resist the technical elegance of a proper index. @@ -28,8 +28,12 @@ my @sa = 0 .. (length($corpus) - 1); 32-bit offsets provide a 4 GB ceiling—overkill for a personal site, but comforting to have. -Search: Textbook range query with two binary searches hosted in a FastCGI -process. Fixed-width offsets enable fast random access to the index: +O(L⋅N log N) sort is slow. 100 4.1 KB articles took 97.9s to index. L=64 fast +path reduces that to 1.31s (L=16, 32, 64: 1.29-1.31s; 128, 256: 1.33-1.35s). +Even with fast path optimization, indexer is unusable beyond 300 articles. + +Search: Textbook range query with two binary searches, hosted in a FastCGI +process. Fixed-width offsets allow fast random access to the index: ``` seek($fh_sa, $mid * 4, 0); @@ -43,43 +47,44 @@ Seek + read outperformed mmap for <1k files. At 10k, mmap was occasionally faster (~200 µs), but consumed more memory—possibly due to OpenBSD’s VM security trade-offs. Results may vary by OS. -Benchmarked on T490 (i7-10510U, OpenBSD 7.8, article size: 16 KB) against +Benchmarks: My articles have a 3.42 KB median, 3.43 KB mean, and 5.39 KB max. +Benchmarked on T490 (i7-10510U, OpenBSD 7.8, article size: 4.1 KB) against linear regex search: <pre class="pre-no-style"> ============================================================= SEARCH BENCHMARK: Suffix array vs. 
Linear regex -ARTICLE SIZE: 8 KB +ARTICLE SIZE: 4.1 KB ============================================================= -500 files (Targeting: keyword_-1): +100 files (Targeting: keyword_-1): ----------------+----------------------+--------------------- METRIC | SA | REGEX ----------------+----------------------+--------------------- -Search time | 0.0014s | 0.0451s -Peak RAM | 8124 KB | 9612 KB -Indexing time | 18.1865s | N/A -Index size | 19610.39 KB | N/A +Search time | 0.0009s | 0.0084s +Peak RAM | 7968 KB | 9676 KB +Indexing time | 1.3332s | N/A +Index size | 2070.38 KB | N/A ----------------+----------------------+--------------------- -1000 files (Targeting: keyword_-1): +300 files (Targeting: keyword_-1): ----------------+----------------------+--------------------- METRIC | SA | REGEX ----------------+----------------------+--------------------- -Search time | 0.0021s | 0.0918s -Peak RAM | 8280 KB | 9960 KB -Indexing time | 43.1748s | N/A -Index size | 39225.06 KB | N/A +Search time | 0.0009s | 0.0242s +Peak RAM | 8024 KB | 9680 KB +Indexing time | 4.5658s | N/A +Index size | 6211.76 KB | N/A ----------------+----------------------+--------------------- -10000 files (Targeting: keyword_-1): +5000 files (Targeting: keyword_-1): ----------------+----------------------+--------------------- METRIC | SA | REGEX ----------------+----------------------+--------------------- -Search time | 0.0173s | 1.1275s -Peak RAM | 11848 KB | 13392 KB -Indexing time | 663.3909s | N/A -Index size | 392263.01 KB | N/A +Search time | 0.0088s | 0.4937s +Peak RAM | 9948 KB | 11436 KB +Indexing time | 138.7510s | N/A +Index size | 103557.18 KB | N/A ----------------+----------------------+--------------------- </pre> @@ -90,17 +95,12 @@ Resource exhaustion and XSS attacks are inherent. Limited concurrent searches using lock-file semaphores, and capped the query length (64 B) and the result set (20). Mitigated XSS by HTML-escaping all output using HTML::Escape. -Performance: Without SA-IS, indexing is slow. With O(L⋅N log N) naive sort, 100 -8 KB articles took 6.58 minutes to index. L=64 fast path reduces that to 2.69 -seconds (L=16, 32, 64: 2.68-2.69s; 128, 256: 2.75-2.77s). Even so, 43.1748s to -index 500 articles is untenable. - -I under-engineered search. +Next release: Incremental indexing + SA-IS, Anno Domini 2076. 
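The indexer itself (cgi-bin/indexer.pl) is excluded from the build and does not appear in this diff, so the following is only a sketch of what the L=64 fast path described above could look like: bound every comparison in the naive sort to the first 64 bytes of each suffix, then write the offsets as fixed-width 32-bit integers. The toy corpus and file names are illustrative assumptions, not the actual indexer code.

```
#!/usr/bin/perl
# Sketch only: prefix-bounded naive suffix sort (not the real indexer.pl).
use strict;
use warnings;

my $corpus = "a toy stand-in for the concatenated article corpus";
my $L      = 64;   # comparison bound; matches the 64 B query cap in search.cgi

# One suffix per byte offset, ordered by the first $L bytes only.
# Ties beyond $L bytes stay in arbitrary order, which is fine as long as
# queries never exceed $L bytes.
my @sa = sort {
    substr($corpus, $a, $L) cmp substr($corpus, $b, $L)
} 0 .. (length($corpus) - 1);

# Fixed-width 32-bit offsets: the 4 GB ceiling mentioned above.
open(my $fh_sa, '>', 'sa.bin') or die $!;
binmode($fh_sa);
print {$fh_sa} pack("L*", @sa);
close($fh_sa);
```

Bounding the comparison is what keeps the sort at O(L·N log N) rather than comparing full suffixes, and since search.cgi truncates queries to 64 bytes, a 64-byte bound still yields a contiguous match range for every query the CGI accepts.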
Commit: <a href="https://git.asciimx.com/www/commit/?h=term&id=6da102d6e0494a3eac3f05fa3b2cdcc25ba2754e" class="external" target="_blank" rel="noopener noreferrer">6da102d</a> | Benchmarks: <a -href="https://git.asciimx.com/site-search-bm/commit/?id=de9d82e8074c9b67a04989f9b6be62890b7c95bb" -class="external" target="_blank" rel="noopener noreferrer">de9d82e</a> +href="https://git.asciimx.com/site-search-bm/commit/?id=f6d7c3fdbecbcb880c0c02fdffefa1f467c46b03" +class="external" target="_blank" rel="noopener noreferrer">f6d7c3f</a> diff --git a/cgi-bin/find.cgi b/cgi-bin/find.cgi deleted file mode 100644 index 9b1f913..0000000 --- a/cgi-bin/find.cgi +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Storable qw(retrieve); -use Encode qw(decode_utf8 encode_utf8); -use URI::Escape qw(uri_unescape); -use HTML::Escape qw(escape_html); - -# Configuration -my $max_parallel = 50; # Max parallel search requests -my $lock_timeout = 30; # Seconds before dropping stale locks -my $max_results = 20; # Max search results to display -my $sa_file = 'sa.bin'; # Suffix Array index -my $cp_file = 'corpus.bin'; # Raw text corpus -my $map_file = 'file_map.dat'; # File metadata -my $lock_dir = '/tmp/search_locks'; # Semaphore directory - -# Concurrency control -mkdir $lock_dir, 0777 unless -d $lock_dir; -my $active_count = 0; -my $now = time(); - -opendir(my $dh, $lock_dir); -while (my $file = readdir($dh)) { - next unless $file =~ /\.lock$/; - my $path = "$lock_dir/$file"; - my $mtime = (stat($path))[9] || 0; - ($now - $mtime > $lock_timeout) ? unlink($path) : $active_count++; -} -closedir($dh); - -# Template variables -my $year = (localtime)[5] + 1900; -my $search_text = ''; - -# Busy check -if ($active_count >= $max_parallel) { - print "Content-Type: text/html\n\n"; - render_html("<p>Server busy. 
Please try again in a few seconds.</p>", "", $year); - exit; -} - -# Create semaphore lock -my $lock_file = "$lock_dir/$$.lock"; -open(my $fh_lock, '>', $lock_file); - -# Query decoding -if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { - my $raw_q = $1; - $raw_q =~ tr/+/ /; - $search_text = uri_unescape($raw_q); - $search_text = decode_utf8($search_text // ""); - $search_text =~ s/\P{Print}//g; - $search_text = substr($search_text, 0, 64); - $search_text =~ s/^\s+|\s+$//g; -} - -my $safe_search_text = escape_html($search_text); - -print "Content-Type: text/html\n\n"; - -if ($search_text eq '') { - final_output("<p>Please enter a search term above.</p>"); -} - -# Binary search -my @results; -my $query = encode_utf8(lc($search_text)); -my $query_len = length($query); - -if (-f $sa_file && -f $cp_file) { - open(my $fh_sa, '<', $sa_file) or die $!; - open(my $fh_cp, '<', $cp_file) or die $!; - binmode($fh_sa); - binmode($fh_cp); - - my $file_map = retrieve($map_file); - my $total_suffixes = (-s $sa_file) / 4; - - # Find left boundary - my ($low, $high) = (0, $total_suffixes - 1); - my $first_hit = -1; - - while ($low <= $high) { - my $mid = int(($low + $high) / 2); - seek($fh_sa, $mid * 4, 0); - read($fh_sa, my $bin_off, 4); - my $off = unpack("L", $bin_off); - seek($fh_cp, $off, 0); - read($fh_cp, my $text, $query_len); - - my $cmp = $text cmp $query; - if ($cmp >= 0) { - $first_hit = $mid if $cmp == 0; - $high = $mid - 1; - } else { - $low = $mid + 1; - } - } - - # Collect results if found - if ($first_hit != -1) { - my $last_hit = $first_hit; - ($low, $high) = ($first_hit, $total_suffixes - 1); - - # Find right boundary - while ($low <= $high) { - my $mid = int(($low + $high) / 2); - seek($fh_sa, $mid * 4, 0); - read($fh_sa, my $bin_off, 4); - my $off = unpack("L", $bin_off); - seek($fh_cp, $off, 0); - read($fh_cp, my $text, $query_len); - - if (($text cmp $query) <= 0) { - $last_hit = $mid if $text eq $query; - $low = $mid + 1; - } else { - $high = $mid - 1; - } - } - - my %seen; - for my $i ($first_hit .. $last_hit) { - seek($fh_sa, $i * 4, 0); - read($fh_sa, my $bin_off, 4); - my $offset = unpack("L", $bin_off); - - foreach my $m (@$file_map) { - if ($offset >= $m->{start} && $offset < $m->{end}) { - if (!$seen{$m->{path}}++) { - # Capture more than 50 chars for trimming - my $snip_start = ($offset - 30 < $m->{start}) ? $m->{start} : $offset - 30; - my $max_len = $m->{end} - $snip_start; - my $read_len = ($max_len > 120) ? 120 : $max_len; - seek($fh_cp, $snip_start, 0); - read($fh_cp, my $raw_snip, $read_len); - - my $snippet = decode_utf8($raw_snip, Encode::FB_QUIET) // $raw_snip; - $snippet =~ s/\s+/ /g; # Normalize whitespace - - # Trim start: Partial word removal - if ($snip_start > $m->{start}) { - $snippet =~ s/^[^\s]*\s//; - } - - # Trim end: Length limit and partial word removal - my $has_more = 0; - if (length($snippet) > 50) { - $snippet = substr($snippet, 0, 50); - $has_more = 1 if $snippet =~ s/\s+[^\s]*$//; - } - elsif ($snip_start + $read_len < $m->{end}) { - # This check handles snippets that are naturally short but - # there's still more text in the article we didn't read - $has_more = 1; - } - - # Cleanup & capitalize - $snippet = ucfirst($snippet); - $snippet = escape_html($snippet) . ($has_more ? "..." 
: ""); - - my $clean_path = $m->{path}; - $clean_path =~ s|^\.\./_site/||; - - push @results, { - path => $clean_path, - title => $m->{title},, - snippet => $snippet - }; - } - last; - } - } - last if scalar @results >= $max_results; - } - } - close($fh_sa); - close($fh_cp); -} - -# --- Formatting & Output --- -my $list_html = ""; -if (@results == 0) { - $list_html = "<p>No results found for \"<b>$safe_search_text</b>\".</p>"; -} else { - $list_html = "<ul>" . join('', map { - "<li><a href=\"/$_->{path}\">$_->{title}</a><br><small>$_->{snippet}</small></li>" - } @results) . "</ul>"; -} - -final_output($list_html); - -# --- Helpers --- -sub final_output { - my ($content) = @_; - render_html($content, $safe_search_text, $year); - if ($fh_lock) { close($fh_lock); unlink($lock_file); } - exit; -} - -sub render_html { - my ($content, $q_val, $yr) = @_; - print <<"HTML"; -<!DOCTYPE html> -<html lang="en-us"> -<head> - <meta charset="utf-8"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>Journal | Search</title> - <link rel="stylesheet" href="/assets/css/main.css"> -</head> -<body> - <header> - <h1><a href="/">Journal</a></h1> / - <h1><a href="/cgi-bin/find.cgi">Search</a></h1> - </header> - <article> - <form id="search-bar" action="" method="GET"> - <input id="search-box" type="text" name="q" value="$q_val"> - <input id="search-btn" type="submit" value="Search"> - </form> - $content - </article> -</body> -</html> -HTML -} - diff --git a/cgi-bin/rgx-search.cgi b/cgi-bin/rgx-search.cgi new file mode 100644 index 0000000..67815c8 --- /dev/null +++ b/cgi-bin/rgx-search.cgi @@ -0,0 +1,207 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use File::Spec; +use Encode qw(decode_utf8); +use URI::Escape qw(uri_unescape); +use HTML::Escape qw(escape_html); + +binmode(STDOUT, ":utf8"); + +# --- Configuration --- +my $max_parallel = 50; +my $lock_timeout = 30; +my $max_results = 20; +my $lock_dir = '/tmp/search_locks'; +my $directory = '../log/'; + +# --- Concurrency Control --- +mkdir $lock_dir, 0777 unless -d $lock_dir; +my $active_count = 0; +my $now = time(); + +opendir(my $dh_lock, $lock_dir); +while (my $file = readdir($dh_lock)) { + next unless $file =~ /\.lock$/; + my $path = "$lock_dir/$file"; + my $mtime = (stat($path))[9] || 0; + ($now - $mtime > $lock_timeout) ? unlink($path) : $active_count++; +} +closedir($dh_lock); + +if ($active_count >= $max_parallel) { + render_html("<p>Server busy. 
Please try again in a few seconds.</p>", ""); + exit; +} + +my $lock_file = "$lock_dir/$$.lock"; +open(my $fh_lock, '>', $lock_file) or die "Cannot create lock: $!"; + +# --- Query Decoding --- +my $search_text = ''; +if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { + my $raw_q = $1; + $raw_q =~ tr/+/ /; + $search_text = uri_unescape($raw_q); + $search_text = decode_utf8($search_text // ""); + $search_text =~ s/\P{Print}//g; + $search_text = substr($search_text, 0, 64); + $search_text =~ s/^\s+|\s+$//g; +} + +if ($search_text eq '') { + final_output("<p>Please enter a search term above.</p>"); +} + +# --- Search --- +my @results; +my $files_read = 0; + +if ($search_text =~ /\S/) { + my @stack = ($directory); + + while (@stack) { + # Exit search immediately if limit reached + last if scalar @results >= $max_results; + + my $current_path = pop @stack; + + if (-d $current_path) { + if (opendir(my $dh, $current_path)) { + while (my $entry = readdir($dh)) { + next if $entry =~ /^\.\.?$/; + push @stack, File::Spec->catfile($current_path, $entry); + } + closedir($dh); + } + } + elsif (-f $current_path && $current_path =~ /index\.html$/) { + if (open my $fh, '<:utf8', $current_path) { + $files_read++; + my $raw_content = do { local $/; <$fh> }; + close $fh; + + my ($article) = $raw_content =~ /<article[^>]*>(.*?)<\/article>/is; + $article =~ s/<pre.*?>.*?<\/pre>//isg; + $article =~ s/<code.*?>.*?<\/code>//isg; + next unless $article; + + # Clean for accurate regex offsets + my $clean_text = $article; + $clean_text =~ s/<[^>]*>/ /g; # Tags to space to prevent word mashing + $clean_text =~ s/\s+/ /g; + $clean_text =~ s/^\s+|\s+$//g; + + if ($clean_text =~ /(\Q$search_text\E)/i) { + my $match_pos = "$-[0]"; + my $match_end = "$+[0]"; + + # Grab a context window (120 chars padding) + my $grab_start = ($match_pos > 120) ? $match_pos - 120 : 0; + my $grab_len = ($match_end - $grab_start) + 120; + my $raw_chunk = substr($clean_text, $grab_start, $grab_len); + + my $is_start = ($grab_start == 0); + my $is_end = ($grab_start + $grab_len >= length($clean_text)); + + my $snippet = trim_and_clean_snippet($raw_chunk, $is_start, $is_end); + + my ($title) = $raw_content =~ /<title>(.*?)<\/title>/is; + push @results, { + path => $current_path, + title => $title || $current_path, + snippet => $snippet + }; + } + } + } + } +} + +my $safe_search_text = escape_html($search_text); + +my $list_html = ""; +if (@results == 0) { + $list_html = "<p>No results found for \"<b>$safe_search_text</b>\".</p>"; +} else { + $list_html = "<ul>" . join('', map { + "<li><a href=\"/$_->{path}\">$_->{title}</a><br><small>$_->{snippet}</small></li>" + } @results) . "</ul>"; +} + +final_output($list_html); + +sub final_output { + my ($content) = @_; + render_html($content, $safe_search_text); + if ($fh_lock) { close($fh_lock); unlink($lock_file); } + exit; +} + +sub trim_and_clean_snippet { + my ($raw_chunk, $is_start, $is_end) = @_; + + # Start check: + # If we aren't at the very start of the article, we likely have a leading fragment. + # We search for the first space to drop the partial word. + if (!$is_start) { + $raw_chunk =~ s/^[^\s]*\s//; + } + + # Length and end Check: + my $show_ellipsis = !$is_end; + + if (length($raw_chunk) > 160) { + $raw_chunk = substr($raw_chunk, 0, 160); + + # Look for the last space within our 160 chars to avoid cutting a word in half. + # If we find a space and trim, we add the ellipsis. 
+ if ($raw_chunk =~ s/\s+[^\s]*$//) { + $show_ellipsis = 1; + } + } + + # Sometimes leading punctuation or weird fragments remain after the trim. + $raw_chunk =~ s/^[:;,.?!\s]+//; # Remove leading punctuation/space + + # Final polish + $raw_chunk = ucfirst($raw_chunk); + + # Ensure it ends cleanly: if it doesn't end in terminal punctuation, add ellipsis + my $final_text = escape_html($raw_chunk); + if ($show_ellipsis && $raw_chunk !~ /[.!?]$/) { + $final_text .= "..."; + } + + return $final_text; +} + +sub render_html { + my ($content, $q_val) = @_; + print "Content-Type: text/html; charset=UTF-8\n\n"; + print <<"HTML"; +<!DOCTYPE html> +<html lang="en-us"> +<head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Journal | Search</title> + <link rel="stylesheet" href="/assets/css/main.css"> +</head> +<body> + <header> + <h1><a href="/">Journal</a></h1> / + <h1><a href="/cgi-bin/search.cgi">Search</a></h1> + </header> + <article> + <form id="search-bar" action="" method="GET"> + <input id="search-box" type="text" name="q" value="$q_val"> + <input id="search-btn" type="submit" value="Search"> + </form> + $content + </article> +</body> +</html> +HTML +} diff --git a/cgi-bin/search.cgi b/cgi-bin/search.cgi index 67815c8..c5ea024 100644 --- a/cgi-bin/search.cgi +++ b/cgi-bin/search.cgi @@ -2,43 +2,47 @@ use strict; use warnings; -use File::Spec; -use Encode qw(decode_utf8); +use Storable qw(retrieve); +use Encode qw(decode_utf8 encode_utf8); use URI::Escape qw(uri_unescape); use HTML::Escape qw(escape_html); -binmode(STDOUT, ":utf8"); - # --- Configuration --- my $max_parallel = 50; my $lock_timeout = 30; my $max_results = 20; +my $sa_file = 'sa.bin'; +my $cp_file = 'corpus.bin'; +my $map_file = 'file_map.dat'; my $lock_dir = '/tmp/search_locks'; -my $directory = '../log/'; -# --- Concurrency Control --- +binmode(STDOUT, ":utf8"); + +# --- Concurrency control --- mkdir $lock_dir, 0777 unless -d $lock_dir; my $active_count = 0; my $now = time(); -opendir(my $dh_lock, $lock_dir); -while (my $file = readdir($dh_lock)) { +opendir(my $dh, $lock_dir); +while (my $file = readdir($dh)) { next unless $file =~ /\.lock$/; my $path = "$lock_dir/$file"; my $mtime = (stat($path))[9] || 0; ($now - $mtime > $lock_timeout) ? unlink($path) : $active_count++; } -closedir($dh_lock); +closedir($dh); +my $lock_file = "$lock_dir/$$.lock"; +my $fh_lock; + +# Busy check if ($active_count >= $max_parallel) { - render_html("<p>Server busy. Please try again in a few seconds.</p>", ""); - exit; + render_html("<p>Server busy. 
Please try again in a few seconds.</p>", ""); } -my $lock_file = "$lock_dir/$$.lock"; -open(my $fh_lock, '>', $lock_file) or die "Cannot create lock: $!"; +# Create semaphore lock +open($fh_lock, '>', $lock_file) or die "Cannot create lock: $!"; -# --- Query Decoding --- my $search_text = ''; if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { my $raw_q = $1; @@ -50,158 +54,192 @@ if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { $search_text =~ s/^\s+|\s+$//g; } +my $safe_search_text = escape_html($search_text); + if ($search_text eq '') { - final_output("<p>Please enter a search term above.</p>"); + render_html("<p>Please enter a search term above.</p>", ""); } -# --- Search --- +# --- Suffix array binary search --- my @results; -my $files_read = 0; - -if ($search_text =~ /\S/) { - my @stack = ($directory); - - while (@stack) { - # Exit search immediately if limit reached - last if scalar @results >= $max_results; - - my $current_path = pop @stack; +my $query = encode_utf8(lc($search_text)); +my $query_len = length($query); + +if (-f $sa_file && -f $cp_file) { + open(my $fh_sa, '<', $sa_file) or die $!; + open(my $fh_cp, '<', $cp_file) or die $!; + binmode($fh_sa); + binmode($fh_cp); + + my $file_map = retrieve($map_file); + my $total_suffixes = (-s $sa_file) / 4; + + # Find left boundary + my ($low, $high) = (0, $total_suffixes - 1); + my $first_hit = -1; + + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + seek($fh_sa, $mid * 4, 0); + read($fh_sa, my $bin_off, 4); + my $off = unpack("L", $bin_off); + seek($fh_cp, $off, 0); + read($fh_cp, my $text, $query_len); + + my $cmp = $text cmp $query; + if ($cmp >= 0) { + $first_hit = $mid if $cmp == 0; + $high = $mid - 1; + } else { + $low = $mid + 1; + } + } - if (-d $current_path) { - if (opendir(my $dh, $current_path)) { - while (my $entry = readdir($dh)) { - next if $entry =~ /^\.\.?$/; - push @stack, File::Spec->catfile($current_path, $entry); - } - closedir($dh); + # Collect results + if ($first_hit != -1) { + my $last_hit = $first_hit; + ($low, $high) = ($first_hit, $total_suffixes - 1); + + # Find right boundary + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + seek($fh_sa, $mid * 4, 0); + read($fh_sa, my $bin_off, 4); + my $off = unpack("L", $bin_off); + seek($fh_cp, $off, 0); + read($fh_cp, my $text, $query_len); + + if (($text cmp $query) <= 0) { + $last_hit = $mid if $text eq $query; + $low = $mid + 1; + } else { + $high = $mid - 1; } } - elsif (-f $current_path && $current_path =~ /index\.html$/) { - if (open my $fh, '<:utf8', $current_path) { - $files_read++; - my $raw_content = do { local $/; <$fh> }; - close $fh; - - my ($article) = $raw_content =~ /<article[^>]*>(.*?)<\/article>/is; - $article =~ s/<pre.*?>.*?<\/pre>//isg; - $article =~ s/<code.*?>.*?<\/code>//isg; - next unless $article; - - # Clean for accurate regex offsets - my $clean_text = $article; - $clean_text =~ s/<[^>]*>/ /g; # Tags to space to prevent word mashing - $clean_text =~ s/\s+/ /g; - $clean_text =~ s/^\s+|\s+$//g; - - if ($clean_text =~ /(\Q$search_text\E)/i) { - my $match_pos = "$-[0]"; - my $match_end = "$+[0]"; - - # Grab a context window (120 chars padding) - my $grab_start = ($match_pos > 120) ? 
$match_pos - 120 : 0; - my $grab_len = ($match_end - $grab_start) + 120; - my $raw_chunk = substr($clean_text, $grab_start, $grab_len); - - my $is_start = ($grab_start == 0); - my $is_end = ($grab_start + $grab_len >= length($clean_text)); - - my $snippet = trim_and_clean_snippet($raw_chunk, $is_start, $is_end); - - my ($title) = $raw_content =~ /<title>(.*?)<\/title>/is; - push @results, { - path => $current_path, - title => $title || $current_path, - snippet => $snippet - }; + + my %seen; + for my $i ($first_hit .. $last_hit) { + seek($fh_sa, $i * 4, 0); + read($fh_sa, my $bin_off, 4); + my $offset = unpack("L", $bin_off); + + foreach my $m (@$file_map) { + if ($offset >= $m->{start} && $offset < $m->{end}) { + if (!$seen{$m->{path}}++) { + # Grab context window around the match byte offset + my $grab_start = ($offset > 150) ? $offset - 150 : $m->{start}; + if ($grab_start < $m->{start}) { $grab_start = $m->{start}; } + + my $grab_len = ($offset - $grab_start) + 300; + if ($grab_start + $grab_len > $m->{end}) { + $grab_len = $m->{end} - $grab_start; + } + + seek($fh_cp, $grab_start, 0); + read($fh_cp, my $raw_chunk, $grab_len); + + my $is_start = ($grab_start == $m->{start}); + my $is_end = ($grab_start + $grab_len >= $m->{end}); + + my $snippet = trim_and_clean_snippet($raw_chunk, $is_start, $is_end); + + my $clean_path = $m->{path}; + $clean_path =~ s|^\.\./_site/||; + + push @results, { + path => $clean_path, + title => $m->{title}, + snippet => $snippet + }; + } + last; } } + last if scalar @results >= $max_results; } } + close($fh_sa); + close($fh_cp); } -my $safe_search_text = escape_html($search_text); - +# Prepare output my $list_html = ""; if (@results == 0) { - $list_html = "<p>No results found for \"<b>$safe_search_text</b>\".</p>"; + $list_html = "<p>No results found for \"<b>$safe_search_text</b>\".</p>"; } else { - $list_html = "<ul>" . join('', map { - "<li><a href=\"/$_->{path}\">$_->{title}</a><br><small>$_->{snippet}</small></li>" - } @results) . "</ul>"; + $list_html = "<ul>" . join('', map { + "<li><a href=\"/$_->{path}\">$_->{title}</a><br><small>$_->{snippet}</small></li>" + } @results) . "</ul>"; } -final_output($list_html); - -sub final_output { - my ($content) = @_; - render_html($content, $safe_search_text); - if ($fh_lock) { close($fh_lock); unlink($lock_file); } - exit; -} +render_html($list_html, $safe_search_text); sub trim_and_clean_snippet { my ($raw_chunk, $is_start, $is_end) = @_; - # Start check: - # If we aren't at the very start of the article, we likely have a leading fragment. - # We search for the first space to drop the partial word. + # Decode and normalize + my $text = decode_utf8($raw_chunk, Encode::FB_QUIET) // $raw_chunk; + $text =~ s/\s+/ /g; + $text =~ s/^\s+|\s+$//g; + + # Front-end trim: remove partial leading word if (!$is_start) { - $raw_chunk =~ s/^[^\s]*\s//; + $text =~ s/^[^\s]*\s//; } - # Length and end Check: + # Length control: target ~160 characters my $show_ellipsis = !$is_end; - - if (length($raw_chunk) > 160) { - $raw_chunk = substr($raw_chunk, 0, 160); - - # Look for the last space within our 160 chars to avoid cutting a word in half. - # If we find a space and trim, we add the ellipsis. - if ($raw_chunk =~ s/\s+[^\s]*$//) { + if (length($text) > 160) { + $text = substr($text, 0, 160); + if ($text =~ s/\s+[^\s]*$//) { $show_ellipsis = 1; } } - # Sometimes leading punctuation or weird fragments remain after the trim. 
- $raw_chunk =~ s/^[:;,.?!\s]+//; # Remove leading punctuation/space - - # Final polish - $raw_chunk = ucfirst($raw_chunk); - - # Ensure it ends cleanly: if it doesn't end in terminal punctuation, add ellipsis - my $final_text = escape_html($raw_chunk); - if ($show_ellipsis && $raw_chunk !~ /[.!?]$/) { - $final_text .= "..."; - } + # Clean leading punctuation, then capitalize + $text =~ s/^[:;,.?!\s]+//; + $text = ucfirst($text); - return $final_text; + my $html = escape_html($text); + if ($show_ellipsis && $text !~ /[.!?]$/) { + $html .= "..."; + } + return $html; } sub render_html { - my ($content, $q_val) = @_; + my ($content, $q_val) = @_; + print "Content-Type: text/html; charset=UTF-8\n\n"; - print <<"HTML"; + print <<"HTML"; <!DOCTYPE html> <html lang="en-us"> <head> - <meta charset="utf-8"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>Journal | Search</title> - <link rel="stylesheet" href="/assets/css/main.css"> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Journal | Search</title> + <link rel="stylesheet" href="/assets/css/main.css"> </head> <body> - <header> - <h1><a href="/">Journal</a></h1> / - <h1><a href="/cgi-bin/search.cgi">Search</a></h1> - </header> - <article> - <form id="search-bar" action="" method="GET"> - <input id="search-box" type="text" name="q" value="$q_val"> - <input id="search-btn" type="submit" value="Search"> - </form> - $content - </article> + <header> + <h1><a href="/">Journal</a></h1> / + <h1><a href="/cgi-bin/search.cgi">Search</a></h1> + </header> + <article> + <form id="search-bar" action="" method="GET"> + <input id="search-box" type="text" name="q" value="$q_val"> + <input id="search-btn" type="submit" value="Search"> + </form> + $content + </article> </body> </html> HTML + + # Final cleanup and exit + if (defined $fh_lock) { + close($fh_lock); + unlink($lock_file) if -e $lock_file; + } + exit; } diff --git a/deploy.sh b/deploy.sh new file mode 100755 index 0000000..8f7fa52 --- /dev/null +++ b/deploy.sh @@ -0,0 +1,29 @@ +#!/bin/ksh + +set -e + +if [ -f .env ]; then + . ./.env +else + echo ".env file not found" + exit 1 +fi + +SSH_CMD="ssh -t -i $SSH_KEY $SERVER_USER@$SERVER_IP" + +echo "Building Jekyll site..." +JEKYLL_ENV=production bundle exec jekyll build + +echo "Running indexer..." +(cd cgi-bin && perl indexer.pl) + +echo "Running remote presh..." +$SSH_CMD "doas $REMOTE_PATH/presh" + +echo "Syncing files..." +rsync --rsync-path=openrsync -a --delete -e "ssh -i $SSH_KEY" $HOME/www/_site/ $SERVER_USER@$SERVER_IP:$REMOTE_PATH/$SITE + +echo "Running remote sh..." +$SSH_CMD "doas $REMOTE_PATH/sh" + +echo "Deployment complete" diff --git a/util/article_stats.sh b/util/article_stats.sh new file mode 100644 index 0000000..c91afa1 --- /dev/null +++ b/util/article_stats.sh @@ -0,0 +1,61 @@ +#!/bin/ksh + +# Find all index.html files and get sizes in bytes +# Using ls -ln for portability across AIX, Solaris, and BSD +tmp_file=$(mktemp /tmp/sizes.XXXXXX) + +find . -type f -name "index.html" -exec ls -ln {} + | awk '{print $5 / 1024}' | sort -n > "$tmp_file" + +count=$(wc -l < "$tmp_file") + +if [ "$count" -eq 0 ]; then + echo "No index.html files found." 
+ rm -f "$tmp_file" + exit 0 +fi + +# Process statistics using awk +stats=$(awk ' + { + a[NR] = $1; + sum += $1; + count[$1]++; + if ($1 > max) max = $1; + } + END { + # Calculate Average + avg = sum / NR; + + # Calculate Median + if (NR % 2 == 1) { + med = a[int(NR/2) + 1]; + } else { + med = (a[NR/2] + a[NR/2 + 1]) / 2; + } + + # Calculate Mode + max_freq = 0; + mode = "N/A"; + for (val in count) { + if (count[val] > max_freq) { + max_freq = count[val]; + mode = val; + } + } + + printf "%.2f|%.2f|%.2f|%d|%.2f", avg, med, mode, max_freq, max; + }' "$tmp_file") + +# Parse the awk results into shell variables +IFS='|' read average median mode freq maximum <<EOF +$stats +EOF + +echo "--- Statistics for index.html (KB) ---" +echo "Files found: $count" +echo "Average: $average KB" +echo "Median: $median KB" +echo "Mode: $mode KB (appears $freq times)" +echo "Maximum: $maximum KB" + +rm -f "$tmp_file" |