diff options
| author | Sadeep Madurange <sadeep@asciimx.com> | 2026-05-07 16:04:43 +0800 |
|---|---|---|
| committer | Sadeep Madurange <sadeep@asciimx.com> | 2026-05-07 16:26:55 +0800 |
| commit | 0e0d336d7ec80ff00e0e4d9acf00267ad3faa214 (patch) | |
| tree | 29ad91e993f7fe4b226a8e43c20e20558e40f466 /cgi-bin/search.cgi | |
| parent | 46d0d9de4485c152310957ebb1ec9b2d8af135e6 (diff) | |
| download | www-0e0d336d7ec80ff00e0e4d9acf00267ad3faa214.tar.gz | |
Deployment script and replace regex search with SA search.
Diffstat (limited to 'cgi-bin/search.cgi')
| -rw-r--r-- | cgi-bin/search.cgi | 288 |
1 files changed, 163 insertions, 125 deletions
diff --git a/cgi-bin/search.cgi b/cgi-bin/search.cgi index 67815c8..c5ea024 100644 --- a/cgi-bin/search.cgi +++ b/cgi-bin/search.cgi @@ -2,43 +2,47 @@ use strict; use warnings; -use File::Spec; -use Encode qw(decode_utf8); +use Storable qw(retrieve); +use Encode qw(decode_utf8 encode_utf8); use URI::Escape qw(uri_unescape); use HTML::Escape qw(escape_html); -binmode(STDOUT, ":utf8"); - # --- Configuration --- my $max_parallel = 50; my $lock_timeout = 30; my $max_results = 20; +my $sa_file = 'sa.bin'; +my $cp_file = 'corpus.bin'; +my $map_file = 'file_map.dat'; my $lock_dir = '/tmp/search_locks'; -my $directory = '../log/'; -# --- Concurrency Control --- +binmode(STDOUT, ":utf8"); + +# --- Concurrency control --- mkdir $lock_dir, 0777 unless -d $lock_dir; my $active_count = 0; my $now = time(); -opendir(my $dh_lock, $lock_dir); -while (my $file = readdir($dh_lock)) { +opendir(my $dh, $lock_dir); +while (my $file = readdir($dh)) { next unless $file =~ /\.lock$/; my $path = "$lock_dir/$file"; my $mtime = (stat($path))[9] || 0; ($now - $mtime > $lock_timeout) ? unlink($path) : $active_count++; } -closedir($dh_lock); +closedir($dh); +my $lock_file = "$lock_dir/$$.lock"; +my $fh_lock; + +# Busy check if ($active_count >= $max_parallel) { - render_html("<p>Server busy. Please try again in a few seconds.</p>", ""); - exit; + render_html("<p>Server busy. Please try again in a few seconds.</p>", ""); } -my $lock_file = "$lock_dir/$$.lock"; -open(my $fh_lock, '>', $lock_file) or die "Cannot create lock: $!"; +# Create semaphore lock +open($fh_lock, '>', $lock_file) or die "Cannot create lock: $!"; -# --- Query Decoding --- my $search_text = ''; if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { my $raw_q = $1; @@ -50,158 +54,192 @@ if (($ENV{QUERY_STRING} || '') =~ /^q=([^&]*)/) { $search_text =~ s/^\s+|\s+$//g; } +my $safe_search_text = escape_html($search_text); + if ($search_text eq '') { - final_output("<p>Please enter a search term above.</p>"); + render_html("<p>Please enter a search term above.</p>", ""); } -# --- Search --- +# --- Suffix array binary search --- my @results; -my $files_read = 0; - -if ($search_text =~ /\S/) { - my @stack = ($directory); - - while (@stack) { - # Exit search immediately if limit reached - last if scalar @results >= $max_results; - - my $current_path = pop @stack; +my $query = encode_utf8(lc($search_text)); +my $query_len = length($query); + +if (-f $sa_file && -f $cp_file) { + open(my $fh_sa, '<', $sa_file) or die $!; + open(my $fh_cp, '<', $cp_file) or die $!; + binmode($fh_sa); + binmode($fh_cp); + + my $file_map = retrieve($map_file); + my $total_suffixes = (-s $sa_file) / 4; + + # Find left boundary + my ($low, $high) = (0, $total_suffixes - 1); + my $first_hit = -1; + + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + seek($fh_sa, $mid * 4, 0); + read($fh_sa, my $bin_off, 4); + my $off = unpack("L", $bin_off); + seek($fh_cp, $off, 0); + read($fh_cp, my $text, $query_len); + + my $cmp = $text cmp $query; + if ($cmp >= 0) { + $first_hit = $mid if $cmp == 0; + $high = $mid - 1; + } else { + $low = $mid + 1; + } + } - if (-d $current_path) { - if (opendir(my $dh, $current_path)) { - while (my $entry = readdir($dh)) { - next if $entry =~ /^\.\.?$/; - push @stack, File::Spec->catfile($current_path, $entry); - } - closedir($dh); + # Collect results + if ($first_hit != -1) { + my $last_hit = $first_hit; + ($low, $high) = ($first_hit, $total_suffixes - 1); + + # Find right boundary + while ($low <= $high) { + my $mid = int(($low + $high) / 2); + seek($fh_sa, $mid * 4, 0); + read($fh_sa, my $bin_off, 4); + my $off = unpack("L", $bin_off); + seek($fh_cp, $off, 0); + read($fh_cp, my $text, $query_len); + + if (($text cmp $query) <= 0) { + $last_hit = $mid if $text eq $query; + $low = $mid + 1; + } else { + $high = $mid - 1; } } - elsif (-f $current_path && $current_path =~ /index\.html$/) { - if (open my $fh, '<:utf8', $current_path) { - $files_read++; - my $raw_content = do { local $/; <$fh> }; - close $fh; - - my ($article) = $raw_content =~ /<article[^>]*>(.*?)<\/article>/is; - $article =~ s/<pre.*?>.*?<\/pre>//isg; - $article =~ s/<code.*?>.*?<\/code>//isg; - next unless $article; - - # Clean for accurate regex offsets - my $clean_text = $article; - $clean_text =~ s/<[^>]*>/ /g; # Tags to space to prevent word mashing - $clean_text =~ s/\s+/ /g; - $clean_text =~ s/^\s+|\s+$//g; - - if ($clean_text =~ /(\Q$search_text\E)/i) { - my $match_pos = "$-[0]"; - my $match_end = "$+[0]"; - - # Grab a context window (120 chars padding) - my $grab_start = ($match_pos > 120) ? $match_pos - 120 : 0; - my $grab_len = ($match_end - $grab_start) + 120; - my $raw_chunk = substr($clean_text, $grab_start, $grab_len); - - my $is_start = ($grab_start == 0); - my $is_end = ($grab_start + $grab_len >= length($clean_text)); - - my $snippet = trim_and_clean_snippet($raw_chunk, $is_start, $is_end); - - my ($title) = $raw_content =~ /<title>(.*?)<\/title>/is; - push @results, { - path => $current_path, - title => $title || $current_path, - snippet => $snippet - }; + + my %seen; + for my $i ($first_hit .. $last_hit) { + seek($fh_sa, $i * 4, 0); + read($fh_sa, my $bin_off, 4); + my $offset = unpack("L", $bin_off); + + foreach my $m (@$file_map) { + if ($offset >= $m->{start} && $offset < $m->{end}) { + if (!$seen{$m->{path}}++) { + # Grab context window around the match byte offset + my $grab_start = ($offset > 150) ? $offset - 150 : $m->{start}; + if ($grab_start < $m->{start}) { $grab_start = $m->{start}; } + + my $grab_len = ($offset - $grab_start) + 300; + if ($grab_start + $grab_len > $m->{end}) { + $grab_len = $m->{end} - $grab_start; + } + + seek($fh_cp, $grab_start, 0); + read($fh_cp, my $raw_chunk, $grab_len); + + my $is_start = ($grab_start == $m->{start}); + my $is_end = ($grab_start + $grab_len >= $m->{end}); + + my $snippet = trim_and_clean_snippet($raw_chunk, $is_start, $is_end); + + my $clean_path = $m->{path}; + $clean_path =~ s|^\.\./_site/||; + + push @results, { + path => $clean_path, + title => $m->{title}, + snippet => $snippet + }; + } + last; } } + last if scalar @results >= $max_results; } } + close($fh_sa); + close($fh_cp); } -my $safe_search_text = escape_html($search_text); - +# Prepare output my $list_html = ""; if (@results == 0) { - $list_html = "<p>No results found for \"<b>$safe_search_text</b>\".</p>"; + $list_html = "<p>No results found for \"<b>$safe_search_text</b>\".</p>"; } else { - $list_html = "<ul>" . join('', map { - "<li><a href=\"/$_->{path}\">$_->{title}</a><br><small>$_->{snippet}</small></li>" - } @results) . "</ul>"; + $list_html = "<ul>" . join('', map { + "<li><a href=\"/$_->{path}\">$_->{title}</a><br><small>$_->{snippet}</small></li>" + } @results) . "</ul>"; } -final_output($list_html); - -sub final_output { - my ($content) = @_; - render_html($content, $safe_search_text); - if ($fh_lock) { close($fh_lock); unlink($lock_file); } - exit; -} +render_html($list_html, $safe_search_text); sub trim_and_clean_snippet { my ($raw_chunk, $is_start, $is_end) = @_; - # Start check: - # If we aren't at the very start of the article, we likely have a leading fragment. - # We search for the first space to drop the partial word. + # Decode and normalize + my $text = decode_utf8($raw_chunk, Encode::FB_QUIET) // $raw_chunk; + $text =~ s/\s+/ /g; + $text =~ s/^\s+|\s+$//g; + + # Front-end trim: remove partial leading word if (!$is_start) { - $raw_chunk =~ s/^[^\s]*\s//; + $text =~ s/^[^\s]*\s//; } - # Length and end Check: + # Length control: target ~160 characters my $show_ellipsis = !$is_end; - - if (length($raw_chunk) > 160) { - $raw_chunk = substr($raw_chunk, 0, 160); - - # Look for the last space within our 160 chars to avoid cutting a word in half. - # If we find a space and trim, we add the ellipsis. - if ($raw_chunk =~ s/\s+[^\s]*$//) { + if (length($text) > 160) { + $text = substr($text, 0, 160); + if ($text =~ s/\s+[^\s]*$//) { $show_ellipsis = 1; } } - # Sometimes leading punctuation or weird fragments remain after the trim. - $raw_chunk =~ s/^[:;,.?!\s]+//; # Remove leading punctuation/space - - # Final polish - $raw_chunk = ucfirst($raw_chunk); - - # Ensure it ends cleanly: if it doesn't end in terminal punctuation, add ellipsis - my $final_text = escape_html($raw_chunk); - if ($show_ellipsis && $raw_chunk !~ /[.!?]$/) { - $final_text .= "..."; - } + # Clean leading punctuation, then capitalize + $text =~ s/^[:;,.?!\s]+//; + $text = ucfirst($text); - return $final_text; + my $html = escape_html($text); + if ($show_ellipsis && $text !~ /[.!?]$/) { + $html .= "..."; + } + return $html; } sub render_html { - my ($content, $q_val) = @_; + my ($content, $q_val) = @_; + print "Content-Type: text/html; charset=UTF-8\n\n"; - print <<"HTML"; + print <<"HTML"; <!DOCTYPE html> <html lang="en-us"> <head> - <meta charset="utf-8"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>Journal | Search</title> - <link rel="stylesheet" href="/assets/css/main.css"> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Journal | Search</title> + <link rel="stylesheet" href="/assets/css/main.css"> </head> <body> - <header> - <h1><a href="/">Journal</a></h1> / - <h1><a href="/cgi-bin/search.cgi">Search</a></h1> - </header> - <article> - <form id="search-bar" action="" method="GET"> - <input id="search-box" type="text" name="q" value="$q_val"> - <input id="search-btn" type="submit" value="Search"> - </form> - $content - </article> + <header> + <h1><a href="/">Journal</a></h1> / + <h1><a href="/cgi-bin/search.cgi">Search</a></h1> + </header> + <article> + <form id="search-bar" action="" method="GET"> + <input id="search-box" type="text" name="q" value="$q_val"> + <input id="search-btn" type="submit" value="Search"> + </form> + $content + </article> </body> </html> HTML + + # Final cleanup and exit + if (defined $fh_lock) { + close($fh_lock); + unlink($lock_file) if -e $lock_file; + } + exit; } |
