#!/usr/bin/perl
#
# find_regex.cgi - benchmark CGI that answers ?q=keyword by linearly
# crawling the generated site and regex-matching every index.html.
#
# Provenance: added in commit 819bf74c2841fabdcc481e12e13615d48a92cb7f
# ("Change directory structure and add benchmark runner.").
#
# NOTE(review): this text was recovered from a rendered patch view in which
# HTML entities had been decoded and some tags stripped.  The entity
# replacements in escape_html() were restored by hand -- confirm them
# against the repository copy.

use strict;
use warnings;
use File::Find;
use Time::HiRes qw(gettimeofday tv_interval);
use BSD::Resource;
use Encode qw(decode_utf8);

# 1. Start Benchmark Timer
# Taken before any request parsing so the reported total covers the whole run.
my $start_time = [gettimeofday];

# Helper to keep HTML output safe.
#
# Takes one string and returns a copy with & < > " ' replaced by HTML
# entities, so it is safe in both element content and quoted attribute
# values.  An undefined argument yields the empty string.
sub escape_html {
    my $str = shift;
    return "" unless defined $str;
    $str =~ s/&/&amp;/g;     # must run first, or the entities below get re-escaped
    $str =~ s/</&lt;/g;
    $str =~ s/>/&gt;/g;
    $str =~ s/"/&quot;/g;
    $str =~ s/'/&#39;/g;
    return $str;
}

# Decode one application/x-www-form-urlencoded component: '+' becomes a
# space, %XX becomes the corresponding byte, and the resulting byte string
# is decoded as UTF-8.  An undefined argument is treated as ''.
sub url_decode {
    my ($encoded) = @_;
    $encoded //= '';
    $encoded =~ tr/+/ /;
    $encoded =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
    return decode_utf8($encoded);
}

# Parse a raw query string ("a=1&b=2") into a key => value list.
#
# - Empty pairs (as in "a=1&&b=2") are skipped instead of producing an
#   undefined key.
# - Only the first '=' separates key from value, so values may contain '='.
# - A key without '=' maps to ''.
# - When a key repeats, the last occurrence wins.
sub parse_query_string {
    my ($query) = @_;
    my %param_for;
    return %param_for unless defined $query && length $query;

    for my $pair (split /&/, $query) {
        next unless length $pair;
        my ($key, $value) = split /=/, $pair, 2;
        $param_for{ url_decode($key) } = url_decode($value);
    }
    return %param_for;
}

# Reduce a raw search term to what the crawler accepts: at most the first
# 64 characters, then only ASCII letters, digits and spaces.  An undefined
# term becomes ''.
sub sanitize_search_term {
    my ($raw) = @_;
    my $term = substr($raw // '', 0, 64);
    $term =~ s/[^a-zA-Z0-9 ]//g;
    return $term;
}

# Parse Query String (q=keyword)
my %params = parse_query_string($ENV{QUERY_STRING});

# Defined-or semantics inside sanitize_search_term keep a literal "0" as a
# valid search term.
my $search_text = sanitize_search_term($params{'q'});

# Configuration
my $directory  = '_site/log/';    # relative path, resolved against the process's working directory
my @results;                      # one hashref (path, title, snippet) per matching file
my $files_read = 0;               # number of files successfully opened by the crawl
# 2. The Linear Search (Crawl)
#
# Every index.html below $directory is read and matched against the search
# text.  Only the first $max_results matches are kept for display; the
# crawl itself always runs to completion, so the timing and "Files Read"
# figures do not depend on the cap.
my $max_results = 20;

if ($search_text =~ /\S/) {
    find({
        wanted => sub {
            # Only look at index.html files inside the subdirectories.
            # With no_chdir => 0, File::Find has already chdir'd into the
            # directory being visited, so $_ is the bare file name.
            return unless -f $_ && $_ eq 'index.html';

            # Best effort: an unreadable file is skipped, not fatal.
            open my $fh, '<', $_ or return;
            $files_read++;

            # Slurp the entire file (approx 16KB per the seed script).
            my $content = do { local $/; <$fh> } // '';
            close $fh;

            # Regex match (Case Insensitive); \Q..\E keeps the term literal.
            return unless $content =~ /\Q$search_text\E/i;

            # Still matched above so the benchmark cost is unchanged, but
            # do not collect more than the display cap.
            return if @results >= $max_results;

            my ($title)     = $content =~ m{<title>(.*?)</title>}is;
            my ($p_content) = $content =~ m{<p\b[^>]*>(.*?)</p>}is;   # \b: do not match <pre>, <param>, ...

            # Files are read as raw bytes; decode the pieces that get
            # displayed so truncation works on characters, not bytes.
            my $snippet = decode_utf8($p_content // '');
            $snippet =~ s/<[^>]*>//g;    # Strip internal tags
            $snippet =~ s/\s+/ /g;
            $snippet = substr($snippet, 0, 100);

            my $path = $File::Find::name;
            $path = decode_utf8($path);

            # Fall back to the path when the page has no usable title.
            my $display_title = defined $title && $title =~ /\S/
                              ? decode_utf8($title)
                              : $path;

            push @results, {
                path    => $path,
                title   => $display_title,
                snippet => $snippet . "...",
            };
        },
        no_chdir => 0,
    }, $directory);
}

# 3. Calculate Performance Metrics
my $end_time = [gettimeofday];
my $elapsed  = tv_interval($start_time, $end_time);

my $rusage     = getrusage();
my $user_cpu   = $rusage->utime;
my $system_cpu = $rusage->stime;
my $max_rss    = $rusage->maxrss;   # NOTE(review): shown as KB below; the unit is platform-dependent -- confirm

# 4. Generate Output
# Titles and snippets are decoded character strings, so encode on the way out.
binmode STDOUT, ':encoding(UTF-8)';
print "Content-Type: text/html; charset=utf-8\n\n";

my $list_html = "";
if ($search_text !~ /\S/) {
    # Covers both an empty and a whitespace-only query: no crawl ran for
    # either, so "no results" would be misleading.
    $list_html = "<p>Please enter a search term.</p>";
} elsif (@results == 0) {
    $list_html = "<p>No results found for \"<b>" . escape_html($search_text) . "</b>\".</p>";
} else {
    $list_html = "<ul>";
    foreach my $res (@results) {
        # The path comes from the file system; escape it like any other
        # value placed inside a quoted attribute.
        $list_html .= sprintf('<li><a href="/%s">%s</a><br><small>%s</small></li>',
            escape_html($res->{path}), escape_html($res->{title}), escape_html($res->{snippet}));
    }
    $list_html .= "</ul>";
}

my $safe_q = escape_html($search_text);

# NOTE(review): the markup between <title> and the end of the page was
# stripped in the copy this was recovered from; the tags below (heading,
# search form using $safe_q, metrics container) are a reconstruction around
# the surviving text -- confirm against the repository copy.
print <<"HTML";
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Regex Search Results</title>
</head>
<body>
    <h1>Regex Search (Linear Crawl)</h1>

    <form method="get">
        <input type="text" name="q" value="$safe_q">
        <input type="submit" value="Search">
    </form>

    $list_html

    <div>
        <strong>Performance Metrics:</strong><br>
        Total Time: @{[ sprintf("%.4f", $elapsed) ]} seconds<br>
        User CPU: $user_cpu s<br>
        System CPU: $system_cpu s<br>
        Peak RAM: $max_rss KB<br>
        Files Read: $files_read (IO Activity)
    </div>
</body>
</html>
HTML