summaryrefslogtreecommitdiffstats
path: root/_site/cgi-bin/find.cgi
diff options
context:
space:
mode:
Diffstat (limited to '_site/cgi-bin/find.cgi')
-rw-r--r--  _site/cgi-bin/find.cgi  34
1 files changed, 20 insertions, 14 deletions
diff --git a/_site/cgi-bin/find.cgi b/_site/cgi-bin/find.cgi
index 5f95e3a..ab066dd 100644
--- a/_site/cgi-bin/find.cgi
+++ b/_site/cgi-bin/find.cgi
@@ -7,7 +7,7 @@ use Encode qw(decode_utf8 encode_utf8);
use URI::Escape qw(uri_unescape);
use HTML::Escape qw(escape_html);
-# --- Configuration ---
+# Configuration
my $max_parallel = 50; # Max parallel search requests
my $lock_timeout = 30; # Seconds before dropping stale locks
my $max_results = 20; # Max search results to display
@@ -16,7 +16,7 @@ my $cp_file = 'corpus.bin'; # Raw text corpus
my $map_file = 'file_map.dat'; # File metadata
my $lock_dir = '/tmp/search_locks'; # Semaphore directory
-# --- Concurrency Control ---
+# Concurrency control
mkdir $lock_dir, 0777 unless -d $lock_dir;
my $active_count = 0;
my $now = time();
@@ -45,7 +45,7 @@ if ($active_count >= $max_parallel) {
my $lock_file = "$lock_dir/$$.lock";
open(my $fh_lock, '>', $lock_file);
-# --- Query Decoding ---
+# Query decoding
if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) {
my $raw_q = $1;
$raw_q =~ tr/+/ /;
@@ -64,7 +64,7 @@ if ($search_text eq '') {
final_output("<p>Please enter a search term above.</p>");
}
-# --- Binary Search Logic ---
+# Binary search
my @results;
my $query = encode_utf8(lc($search_text));
my $query_len = length($query);
@@ -130,38 +130,44 @@ if (-f $sa_file && -f $cp_file) {
foreach my $m (@$file_map) {
if ($offset >= $m->{start} && $offset < $m->{end}) {
if (!$seen{$m->{path}}++) {
- # 1. Capture slightly more than 50 chars for trimming
+ # Capture more than 50 chars for trimming
my $snip_start = ($offset - 30 < $m->{start}) ? $m->{start} : $offset - 30;
+ my $max_len = $m->{end} - $snip_start;
+ my $read_len = ($max_len > 120) ? 120 : $max_len;
seek($fh_cp, $snip_start, 0);
- read($fh_cp, my $raw_snip, 120);
+ read($fh_cp, my $raw_snip, $read_len);
my $snippet = decode_utf8($raw_snip, Encode::FB_QUIET) // $raw_snip;
$snippet =~ s/\s+/ /g; # Normalize whitespace
- # 2. Trim Start: Partial word removal
+ # Trim start: Partial word removal
if ($snip_start > $m->{start}) {
$snippet =~ s/^[^\s]*\s//;
}
- # 3. Trim End: Length limit and partial word removal
+ # Trim end: Length limit and partial word removal
my $has_more = 0;
if (length($snippet) > 50) {
$snippet = substr($snippet, 0, 50);
$has_more = 1 if $snippet =~ s/\s+[^\s]*$//;
}
+ elsif ($snip_start + $read_len < $m->{end}) {
+ # This check handles snippets that are naturally short but
+ # there's still more text in the article we didn't read
+ $has_more = 1;
+ }
- # 4. Cleanup & Capitalize
+ # Cleanup & capitalize
$snippet = ucfirst($snippet);
+ $snippet = escape_html($snippet) . ($has_more ? "..." : "");
+
my $clean_path = $m->{path};
$clean_path =~ s|^\.\./_site/||;
- # 5. Build Final Snippet
- my $display_snippet = escape_html($snippet) . ($has_more ? "..." : "");
-
push @results, {
path => $clean_path,
- title => (split('/', $m->{path}))[-2],
- snippet => $display_snippet
+ title => $m->{title},
+ snippet => $snippet
};
}
last;