diff options
| -rw-r--r-- | bm/BM_COMMIT_1000_100.txt (renamed from bm/BM_HISTORY_1000_100.txt) | 40 | ||||
| -rw-r--r-- | bm/BM_REBASE_1000_100.txt | 61 | ||||
| -rw-r--r-- | bm/BM_REPO_SIZE_200_20.txt | 16 | ||||
| -rw-r--r-- | bm/BM_REPO_SIZE_5000_50.txt | 14 | ||||
| -rw-r--r-- | bm/bm_commit.pl (renamed from bm/bm_rebase.pl) | 151 | ||||
| -rw-r--r-- | bm/bm_history.pl | 152 | ||||
| -rw-r--r-- | bm/bm_size.pl | 2 | ||||
| -rw-r--r-- | bm/seed.pl | 69 | ||||
| -rw-r--r-- | urn | 91 |
9 files changed, 250 insertions, 346 deletions
diff --git a/bm/BM_HISTORY_1000_100.txt b/bm/BM_COMMIT_1000_100.txt index 2885405..73a2b5a 100644 --- a/bm/BM_HISTORY_1000_100.txt +++ b/bm/BM_COMMIT_1000_100.txt @@ -1,60 +1,68 @@ ============================================================= - HISTORY BENCHMARK: 1000 files (100 commits) + COMMIT BENCHMARK: 1000 files (100 commits) + CONDITIONS: Depth=2, Files Mod=0.5%, Line Mod=5% + INITIAL REPO SIZE: 17332 KB ============================================================= SNAPSHOT: Commit #20 ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 0.35s | 0.09s +Time | 0.29s | 0.03s Max RSS | 0.02 MB | 0.01 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 1301 | 2121 -Repo size | 18868 KB | 22076 KB +Inodes | 1300 | 1425 +Repo size | 6836 KB | 8296 KB ------------------------------------------------------------- SNAPSHOT: Commit #40 ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 0.46s | 0.11s +Time | 0.29s | 0.03s Max RSS | 0.02 MB | 0.01 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 1341 | 2929 -Repo size | 19028 KB | 29136 KB +Inodes | 1340 | 1566 +Repo size | 7332 KB | 9268 KB ------------------------------------------------------------- SNAPSHOT: Commit #60 ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 0.43s | 0.12s +Time | 0.35s | 0.03s Max RSS | 0.02 MB | 0.01 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 1381 | 3732 -Repo size | 19188 KB | 36088 KB +Inodes | 1381 | 1706 +Repo size | 7896 KB | 10236 KB ------------------------------------------------------------- SNAPSHOT: Commit #80 ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 0.45s | 0.08s +Time | 0.35s | 0.03s Max RSS | 0.02 MB | 0.01 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 1421 | 4538 -Repo size | 19348 KB | 43104 KB +Inodes | 1421 | 1847 +Repo size | 8456 KB | 11200 KB ------------------------------------------------------------- SNAPSHOT: Commit #100 ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 0.44s | 0.10s +Time | 0.35s | 0.03s Max RSS | 0.02 MB | 0.01 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 1462 | 5346 -Repo size | 19512 KB | 49980 KB +Inodes | 1462 | 1987 +Repo size | 9020 KB | 12168 KB +------------------------------------------------------------- + +AFTER GIT GC +------------------------------------------------------------- +Final Size | 9020 KB | 3812 KB +Final Inodes | 1462 | 41 ------------------------------------------------------------- TOTAL URN REBASES: 0 diff --git a/bm/BM_REBASE_1000_100.txt b/bm/BM_REBASE_1000_100.txt deleted file mode 100644 index 7550570..0000000 --- a/bm/BM_REBASE_1000_100.txt +++ /dev/null @@ -1,61 +0,0 @@ -============================================================= - REBASE BENCHMARK: 1000 files (100 commits) - CONDITIONS: Depth=2, Files Mod=5%, Change=50% -============================================================= - -SNAPSHOT: Commit #20 -------------------------------------------------------------- -METRIC | URN | GIT -----------------+----------------------+--------------------- -Time | 0.65s | 0.08s -Max RSS | 0.02 MB | 0.01 MB -Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 2272 | 2282 -Repo size | 38504 KB | 22700 KB -------------------------------------------------------------- - -SNAPSHOT: Commit #40 -------------------------------------------------------------- -METRIC | URN | GIT -----------------+----------------------+--------------------- -Time | 0.59s | 0.08s -Max RSS | 0.02 MB | 0.01 MB -Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 3332 | 3322 -Repo size | 59384 KB | 31188 KB -------------------------------------------------------------- - -SNAPSHOT: Commit #60 -------------------------------------------------------------- -METRIC | URN | GIT -----------------+----------------------+--------------------- -Time | 0.57s | 0.08s -Max RSS | 0.02 MB | 0.01 MB -Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 4392 | 4362 -Repo size | 80264 KB | 39676 KB -------------------------------------------------------------- - -SNAPSHOT: Commit #80 -------------------------------------------------------------- -METRIC | URN | GIT -----------------+----------------------+--------------------- -Time | 0.57s | 0.08s -Max RSS | 0.02 MB | 0.01 MB -Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 5452 | 5402 -Repo size | 101144 KB | 48156 KB -------------------------------------------------------------- - -SNAPSHOT: Commit #100 -------------------------------------------------------------- -METRIC | URN | GIT -----------------+----------------------+--------------------- -Time | 0.57s | 0.08s -Max RSS | 0.02 MB | 0.01 MB -Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 6512 | 6442 -Repo size | 122024 KB | 56644 KB -------------------------------------------------------------- - -TOTAL URN REBASES: 4950 diff --git a/bm/BM_REPO_SIZE_200_20.txt b/bm/BM_REPO_SIZE_200_20.txt index 69531c5..6c055bd 100644 --- a/bm/BM_REPO_SIZE_200_20.txt +++ b/bm/BM_REPO_SIZE_200_20.txt @@ -17,22 +17,22 @@ ACTION: Add ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 0.16s | 0.17s +Time | 0.17s | 0.17s Max RSS | 0.02 MB | 0.00 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 225 | 360 -Repo size | 3700 KB | 3348 KB +Inodes | 225 | 370 +Repo size | 3608 KB | 3472 KB ------------------------------------------------------------- ACTION: Commit ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 0.17s | 0.04s +Time | 0.17s | 0.05s Max RSS | 0.02 MB | 0.01 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 347 | 397 -Repo size | 4212 KB | 3496 KB +Inodes | 352 | 408 +Repo size | 4140 KB | 3624 KB ------------------------------------------------------------- ACTION: Status(Clean) @@ -42,7 +42,7 @@ METRIC | URN | GIT Time | 0.10s | 0.01s Max RSS | 0.02 MB | 0.00 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 -Inodes | 347 | 397 -Repo size | 4212 KB | 3496 KB +Inodes | 352 | 408 +Repo size | 4140 KB | 3624 KB ------------------------------------------------------------- diff --git a/bm/BM_REPO_SIZE_5000_50.txt b/bm/BM_REPO_SIZE_5000_50.txt index cd90798..e194153 100644 --- a/bm/BM_REPO_SIZE_5000_50.txt +++ b/bm/BM_REPO_SIZE_5000_50.txt @@ -6,7 +6,7 @@ ACTION: Status ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 0.26s | 0.00s +Time | 0.27s | 0.00s Max RSS | 0.02 MB | 0.00 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 Inodes | 6 | 27 @@ -17,32 +17,32 @@ ACTION: Add ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 2.82s | 4.62s +Time | 2.90s | 4.54s Max RSS | 0.02 MB | 0.01 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 Inodes | 5055 | 5284 -Repo size | 89444 KB | 70360 KB +Repo size | 89996 KB | 69844 KB ------------------------------------------------------------- ACTION: Commit ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 1.18s | 0.93s +Time | 1.60s | 0.93s Max RSS | 0.03 MB | 0.01 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 Inodes | 5264 | 5342 -Repo size | 91620 KB | 70592 KB +Repo size | 92172 KB | 70076 KB ------------------------------------------------------------- ACTION: Status(Clean) ------------------------------------------------------------- METRIC | URN | GIT ----------------+----------------------+--------------------- -Time | 0.34s | 0.10s +Time | 0.28s | 0.11s Max RSS | 0.02 MB | 0.01 MB Page faults | Maj:0 / Min:0 | Maj:0 / Min:0 Inodes | 5264 | 5342 -Repo size | 91620 KB | 70592 KB +Repo size | 92172 KB | 70076 KB ------------------------------------------------------------- diff --git a/bm/bm_rebase.pl b/bm/bm_commit.pl index e89e7f5..841325d 100644 --- a/bm/bm_rebase.pl +++ b/bm/bm_commit.pl @@ -1,4 +1,6 @@ #!/usr/bin/perl +# Usage: perl bm_commit.pl <file_count> <depth> <total_commits> <file_perc> <line_perc> +# Example: perl bm_commit.pl 100 5 50 10 2 use strict; use warnings; use File::Copy; @@ -9,9 +11,9 @@ use File::Basename; use Cwd qw(getcwd abs_path); use Time::HiRes qw(time); -my ($files, $depth, $total_commits, $file_perc, $change_perc) = @ARGV; -if (!defined $files || !defined $depth || !defined $total_commits || !defined $file_perc || !defined $change_perc) { - die "Usage: perl bm_rebase.pl <file_count> <depth> <total_commits> <file_perc> <change_perc>\n"; +my ($files, $depth, $total_commits, $file_perc, $line_perc) = @ARGV; +if (!defined $line_perc) { + die "Usage: perl bm_commit.pl <file_count> <depth> <total_commits> <file_perc> <line_perc>\n"; } my $base_dir = getcwd(); @@ -21,12 +23,11 @@ my $bm_repo = "sandbox"; my $sample_rate = int($total_commits / 5) || 1; my %results; +my %final_stats = ( URN => { size => "N/A", inodes => 0 }, GIT => { size => "N/A", inodes => 0 } ); my $initial_repo_size = "0 KB"; my $rebase_count = 0; my %last_base_hashes; -my $global_tick = 0; - sub get_size { my $dir = shift; return "0 KB" unless -d $dir; @@ -38,35 +39,56 @@ sub get_size { sub count_inodes { my $dir = shift; return 0 unless -d $dir; - my $count = `find $dir 2>/dev/null | wc -l`; - $count =~ s/\s+//g; - return $count || 0; + my %inodes; + find(sub { my @s = lstat($_); $inodes{$s[1]} = 1 if @s; }, $dir); + return scalar(keys %inodes); } sub track_rebases { return unless -f ".urn/index"; open(my $fh, '<', ".urn/index") or return; while (<$fh>) { - chomp; - my @cols = split(/\t/); - next unless @cols >= 6; + chomp; my @cols = split(/\t/); next unless @cols >= 6; my ($b_hash, $path) = ($cols[2], $cols[5]); - if (exists $last_base_hashes{$path} && $last_base_hashes{$path} ne $b_hash) { - $rebase_count++; - } + $rebase_count++ if exists $last_base_hashes{$path} && $last_base_hashes{$path} ne $b_hash; $last_base_hashes{$path} = $b_hash; } close($fh); } -sub run_rebase_benchmark { +sub generate_surgical_line { + my ($original_line) = @_; + my @types = qw(int char float double bool uint32_t); + my @vars = qw(offset limit buffer status count ptr index); + + $original_line =~ /^(\s*)/; + my $indent = $1 || " "; + + my $type = $types[rand @types]; + my $var = $vars[rand @vars] . "_" . int(rand(100)); + my $val = int(rand(1000)); + + my $new_line = "$indent$type $var = $val;\n"; + + # Byte matching: pad with spaces before the newline to keep file size identical + my $target_len = length($original_line); + if (length($new_line) < $target_len) { + substr($new_line, -1, 0, " " x ($target_len - length($new_line))); + } elsif (length($new_line) > $target_len) { + $new_line = substr($new_line, 0, $target_len - 2) . ";\n"; + } + + return $new_line; +} + +sub run_commit_benchmark { my ($tool_name) = @_; - print ">>> Starting REBASE BENCHMARK: $tool_name\n"; + print ">>> Starting COMMIT BENCHMARK: $tool_name\n"; remove_tree($bm_repo) if -d $bm_repo; system("perl $seed_bin $files $depth > /dev/null 2>&1"); - if ($initial_repo_size eq "0 KB") { $initial_repo_size = get_size($bm_repo); } + $initial_repo_size = get_size($bm_repo) if $initial_repo_size eq "0 KB"; chdir($bm_repo) or die $!; @@ -86,36 +108,31 @@ sub run_rebase_benchmark { my @target_files = @file_list[0 .. $num_to_change - 1]; for my $i (2 .. $total_commits) { - my $debug_dir = "/tmp/urn/commit_$i"; - make_path($debug_dir) unless -d $debug_dir; - for my $target (@target_files) { - open(my $fh, '<:raw', $target) or die "Read fail: $target - $!"; - my $content = do { local $/; <$fh> }; - close($fh); - - $global_tick++; - my $ts = time(); - my $header = "C$i-T$ts-N$global_tick "; - - my $total_bytes = length($content); - my $min_len = length($header); - my $to_change = int($total_bytes * ($change_perc / 100)); - - $to_change = $min_len if $to_change < $min_len; - $to_change = $total_bytes if $to_change > $total_bytes; - - my $new_segment = substr($header . ("." x $to_change), 0, $to_change); - substr($content, 0, $to_change, $new_segment); - - open(my $out, '>:raw', $target) or die "Write fail: $target - $!"; - print $out $content; - close($out); - - my $target_name = basename($target); - copy($target, "$debug_dir/$target_name") or warn "Backup failed: $!"; - - utime(undef, undef, $target); + open(my $fh, '<', $target) or die "Read fail: $target - $!"; + my @lines = <$fh>; + close($fh); + + # Find lines that are simple variable assignments to maintain surgical diffs + my @eligible = grep { $lines[$_] =~ /^\s*\w+ \w+ = \d+;/ } (0 .. $#lines); + + if (@eligible) { + # Calculate count based on percentage of eligible lines + my $to_mod = int(scalar(@eligible) * ($line_perc / 100)); + $to_mod = 1 if $to_mod == 0; + + my @indices = (sort { rand() <=> rand() } @eligible)[0 .. $to_mod - 1]; + + for my $idx (@indices) { + next unless defined $idx; + $lines[$idx] = generate_surgical_line($lines[$idx]); + } + + open(my $out, '>', $target) or die "Write fail: $target - $!"; + print $out join('', @lines); + close($out); + utime(undef, undef, $target); + } } my $cmd = ($tool_name eq "URN") @@ -124,10 +141,7 @@ sub run_rebase_benchmark { if ($i % $sample_rate == 0 || $i == $total_commits) { my $raw_output = `/usr/bin/time -l sh -c "$cmd" 2>&1`; - - if ($tool_name eq "URN") { - track_rebases(); - } + track_rebases() if $tool_name eq "URN"; my ($real, $rss, $maj, $min) = (0, 0, 0, 0); $real = $1 if $raw_output =~ /(\d+\.\d+)\s+real/; @@ -138,7 +152,7 @@ sub run_rebase_benchmark { my $meta = ($tool_name eq "URN") ? ".urn" : ".git"; $results{$i}{$tool_name} = { real => $real . "s", - rss => $rss, + rss => $rss || "0 MB", faults => "Maj:$maj / Min:$min", inodes => count_inodes($meta), size => get_size($meta), @@ -149,24 +163,35 @@ sub run_rebase_benchmark { track_rebases() if $tool_name eq "URN"; } } + + system("git gc --prune=now --quiet") if $tool_name eq "GIT"; + + my $final_meta = ($tool_name eq "URN") ? ".urn" : ".git"; + $final_stats{$tool_name} = { + size => get_size($final_meta) || "0 KB", + inodes => count_inodes($final_meta) || 0, + }; + chdir($base_dir); + remove_tree($bm_repo); } -$rebase_count = 0; -%last_base_hashes = (); -run_rebase_benchmark("URN"); -run_rebase_benchmark("GIT"); +run_commit_benchmark("URN"); +run_commit_benchmark("GIT"); -my $out_file = "BM_REBASE_${files}_${total_commits}.txt"; +my $out_file = "BM_COMMIT_${files}_${total_commits}.txt"; open(my $res, '>', $out_file) or die $!; print $res "=============================================================\n"; -print $res " REBASE BENCHMARK: $files files ($total_commits commits)\n"; -print $res " CONDITIONS: Depth=$depth, Files Mod=$file_perc%, Change=$change_perc%\n"; +print $res " COMMIT BENCHMARK: $files files ($total_commits commits)\n"; +print $res " CONDITIONS: Depth=$depth, Files Mod=$file_perc%, Line Mod=$line_perc%\n"; +print $res " INITIAL REPO SIZE: $initial_repo_size\n"; print $res "=============================================================\n\n"; foreach my $i (sort { $a <=> $b } keys %results) { my $u = $results{$i}{"URN"}; my $g = $results{$i}{"GIT"}; + next unless defined $u && defined $g; + print $res "SNAPSHOT: Commit #$i\n"; print $res "-------------------------------------------------------------\n"; printf $res "%-15s | %-20s | %-20s\n", "METRIC", "URN", "GIT"; @@ -178,7 +203,15 @@ foreach my $i (sort { $a <=> $b } keys %results) { printf $res "%-15s | %20s | %20s\n", "Repo size", $u->{size}, $g->{size}; print $res "-------------------------------------------------------------\n\n"; } + +print $res "AFTER GIT GC\n"; +print $res "-------------------------------------------------------------\n"; +printf $res "%-15s | %20s | %20s\n", "Final Size", $final_stats{URN}{size}, $final_stats{GIT}{size}; +printf $res "%-15s | %20s | %20s\n", "Final Inodes", $final_stats{URN}{inodes}, $final_stats{GIT}{inodes}; +print $res "-------------------------------------------------------------\n\n"; + print $res "TOTAL URN REBASES: $rebase_count\n"; close($res); -exec "less $out_file"; +my $pager = $ENV{PAGER} || 'less'; +system("$pager $out_file"); diff --git a/bm/bm_history.pl b/bm/bm_history.pl deleted file mode 100644 index 5f2ea4a..0000000 --- a/bm/bm_history.pl +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use File::Spec; -use File::Path qw(remove_tree); -use File::Find; -use Cwd qw(getcwd abs_path); - -my ($files, $depth, $total_commits) = @ARGV; -if (!defined $files || !defined $depth || !defined $total_commits) { - die "Usage: perl bm_history.pl <file_count> <depth> <total_commits>\n"; -} - -my $base_dir = getcwd(); -my $urn_bin = abs_path(File::Spec->catfile("..", "urn")); -my $seed_bin = abs_path("seed.pl"); -my $bm_repo = "bm_repo"; - -my $sample_rate = int($total_commits / 5) || 1; -my %results; - -my $rebase_count = 0; -my %last_base_hashes; - -sub get_size { - my $dir = shift; - return "0 KB" unless -d $dir; - my $size = `du -sk $dir 2>/dev/null`; - $size =~ /^(\d+)/; - return ($1 || 0) . " KB"; -} - -sub count_inodes { - my $dir = shift; - return 0 unless -d $dir; - my $count = `find $dir 2>/dev/null | wc -l`; - $count =~ s/\s+//g; - return $count || 0; -} - -sub track_rebases { - return unless -f ".urn/index"; - open(my $fh, '<', ".urn/index") or return; - while (<$fh>) { - chomp; - my @cols = split(/\t/); - next unless @cols >= 6; - my ($b_hash, $path) = ($cols[2], $cols[5]); - if (exists $last_base_hashes{$path} && $last_base_hashes{$path} ne $b_hash) { - $rebase_count++; - } - $last_base_hashes{$path} = $b_hash; - } - close($fh); -} - -sub run_history_benchmark { - my ($tool_name) = @_; - print ">>> Starting History Benchmark: $tool_name\n"; - - remove_tree($bm_repo) if -d $bm_repo; - system("perl $seed_bin $files $depth > /dev/null 2>&1"); - - chdir($bm_repo) or die $!; - - my $init_cmd = ($tool_name eq "URN") ? "perl $urn_bin init" : "git init"; - my $add_cmd = ($tool_name eq "URN") ? "perl $urn_bin add ." : "git add ."; - - system("$init_cmd > /dev/null 2>&1"); - system("$add_cmd > /dev/null 2>&1"); - system(($tool_name eq "URN" ? "perl $urn_bin" : "git") . " commit -m 'initial' > /dev/null 2>&1"); - - track_rebases() if $tool_name eq "URN"; - - my @file_list; - find(sub { push @file_list, $File::Find::name if -f $_ && $File::Find::name !~ /\.(git|urn)/ }, "."); - - for my $i (1 .. $total_commits) { - my $to_modify = int($files * 0.02) || 1; - for (1 .. $to_modify) { - my $target = $file_list[rand @file_list]; - if (open(my $fh, '>>', $target)) { - print $fh "Churn $i\n"; - close($fh); - } - } - - my $cmd = ($tool_name eq "URN") - ? "perl $urn_bin add . && perl $urn_bin commit -m 'c$i'" - : "git add . && git commit -m 'c$i'"; - - if ($i % $sample_rate == 0 || $i == $total_commits) { - # Capture hardware metrics via /usr/bin/time -l - my $stats = `/usr/bin/time -l sh -c "$cmd" 2>&1 > /dev/null`; - - my ($real, $rss, $maj, $min) = (0, 0, 0, 0); - if ($stats =~ /(\d+\.\d+)\s+real/) { $real = $1; } - if ($stats =~ /(\d+)\s+maximum resident set size/) { $rss = sprintf("%.2f MB", $1 / 1024 / 1024); } - if ($stats =~ /(\d+)\s+page reclaims/) { $min = $1; } - if ($stats =~ /(\d+)\s+page faults/) { $maj = $1; } - - if ($tool_name eq "URN") { track_rebases(); } - - my $meta = ($tool_name eq "URN") ? ".urn" : ".git"; - $results{$i}{$tool_name} = { - real => $real . "s", - rss => $rss, - faults => "Maj:$maj / Min:$min", - inodes => count_inodes($meta), - size => get_size($meta), - }; - print " [Commit $i] $tool_name sampled.\n"; - } else { - system("$cmd > /dev/null 2>&1"); - if ($tool_name eq "URN") { track_rebases(); } - } - } - chdir($base_dir); - remove_tree($bm_repo); -} - -$rebase_count = 0; -%last_base_hashes = (); -run_history_benchmark("URN"); -run_history_benchmark("GIT"); - -my $out_file = "BM_HISTORY_${files}_${total_commits}.txt"; -open(my $res, '>', $out_file) or die $!; -print $res "=============================================================\n"; -print $res " HISTORY BENCHMARK: $files files ($total_commits commits)\n"; -print $res "=============================================================\n\n"; - -foreach my $i (sort { $a <=> $b } keys %results) { - my $u = $results{$i}{"URN"}; - my $g = $results{$i}{"GIT"}; - print $res "SNAPSHOT: Commit #$i\n"; - print $res "-------------------------------------------------------------\n"; - printf $res "%-15s | %-20s | %-20s\n", "METRIC", "URN", "GIT"; - print $res "----------------+----------------------+---------------------\n"; - printf $res "%-15s | %20s | %20s\n", "Time", $u->{real}, $g->{real}; - printf $res "%-15s | %20s | %20s\n", "Max RSS", $u->{rss}, $g->{rss}; - printf $res "%-15s | %20s | %20s\n", "Page faults", $u->{faults}, $g->{faults}; - printf $res "%-15s | %20s | %20s\n", "Inodes", $u->{inodes}, $g->{inodes}; - printf $res "%-15s | %20s | %20s\n", "Repo size", $u->{size}, $g->{size}; - print $res "-------------------------------------------------------------\n\n"; -} -print $res "TOTAL URN REBASES: $rebase_count\n"; -close($res); - -my $pager = $ENV{PAGER} || 'less'; -exec $pager, $out_file; - diff --git a/bm/bm_size.pl b/bm/bm_size.pl index d21cfd6..d3d10d7 100644 --- a/bm/bm_size.pl +++ b/bm/bm_size.pl @@ -13,7 +13,7 @@ if (!defined $files || !defined $depth) { my $base_dir = getcwd(); my $urn_bin = abs_path(File::Spec->catfile("..", "urn")); my $seed_bin = abs_path("seed.pl"); -my $bm_repo = "bm_repo"; +my $bm_repo = "sandbox"; my %results; my @actions = qw(Status Add Commit Status(Clean)); @@ -13,20 +13,53 @@ if (!defined $total_files || !defined $max_depth) { my $target_root = "sandbox"; my $files_created = 0; -# Helper to generate random "code-like" text sub generate_content { - my $size_kb = 5 + int(rand(21)); # 5-25 KB - my $bytes = $size_kb * 1024; + my $size_kb = 5 + int(rand(21)); + my $target_bytes = $size_kb * 1024; my $content = ""; - my @chars = ('a'..'z', 'A'..'Z', '0'..'9'); - - while (length($content) < $bytes) { - my $line_len = 20 + int(rand(60)); - $content .= "\t" if rand() > 0.5; # Random indentation - for (1..$line_len) { $content .= $chars[rand @chars]; } - $content .= (rand() > 0.7) ? " " : ""; # Random spaces - $content .= "\n"; + + my @types = qw(int char void float double uint32_t bool); + my @ops = qw(= += -= == != < >); + my @vars = qw(count index status buffer ptr limit offset); + + $content .= "#include <stdio.h>\n#include <stdlib.h>\n\n"; + + while (length($content) < $target_bytes) { + my $roll = rand(); + + if ($roll < 0.15) { + # Function definition (no indent) + my $type = $types[rand @types]; + my $name = "process_data_" . int(rand(100)); + $content .= "\n$type $name() {\n"; + } + elsif ($roll < 0.60) { + # Standard assignment (1 tab) + my $type = $types[rand @types]; + my $var = $vars[rand @vars] . "_" . int(rand(50)); + my $val = int(rand(1000)); + $content .= "\t$type $var = $val;\n"; + } + elsif ($roll < 0.80) { + # If block with nested logic (1-2 tabs) + my $var = $vars[rand @vars]; + $content .= "\tif ($var > " . int(rand(100)) . ") {\n"; + $content .= "\t\t$var " . $ops[rand @ops] . " 1;\n"; + $content .= "\t\treturn $var;\n"; + $content .= "\t}\n"; + } + elsif ($roll < 0.95) { + # Closing brace (no indent) + $content .= "}\n\n"; + } + else { + # Comment line (1 tab) + $content .= "\t/* Update state " . int(rand(1000)) . " */\n"; + } } + + # Cleanup: Ensure we close the last function block + $content .= "\n}\n" unless $content =~ /}\n\s*$/; return $content; } @@ -34,8 +67,6 @@ sub seed_tree { my ($current_path, $depth, $files_left) = @_; return if $files_left <= 0; - # Determine how many files to put in THIS directory - # Heuristic: Spread them out, but ensure at least 1 if depth remains my $files_here = ($depth == $max_depth) ? $files_left : int($files_left / ($max_depth - $depth + 1)) + int(rand(3)); @@ -45,20 +76,20 @@ sub seed_tree { make_path($current_path) unless -d $current_path; for (1..$files_here) { - my $filename = "file_" . sprintf("%04d", ++$files_created) . ".txt"; + my $filename = "src_" . sprintf("%04d", ++$files_created) . ".c"; my $full_path = File::Spec->catfile($current_path, $filename); - open(my $fh, '>', $full_path) or die $!; + open(my $fh, '>', $full_path) or die "Could not open $full_path: $!"; print $fh generate_content(); close($fh); } - # If we have depth and files remaining, go deeper if ($depth < $max_depth && ($files_left - $files_here) > 0) { - my $next_dir = File::Spec->catdir($current_path, "depth_" . ($depth + 1)); + my $next_dir = File::Spec->catdir($current_path, "subdir_" . ($depth + 1)); seed_tree($next_dir, $depth + 1, $files_left - $files_here); } } -print "Seeding $total_files files across $max_depth levels...\n"; +print "Seeding $total_files files (C-style) with tab indents...\n"; seed_tree($target_root, 1, $total_files); -print "Done. Created $files_created files in '$target_root'.\n"; +print "Success. Created $files_created files in '$target_root'.\n"; + @@ -3,6 +3,8 @@ use strict; use warnings; use IO::Handle; +use IO::Compress::Gzip qw(gzip $GzipError); +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); use File::Path qw(make_path); use File::Copy qw(copy); use File::Basename; @@ -26,6 +28,10 @@ use constant CHUNK_LEN => 8192; use constant MEM_LIMIT => 64 * 1024 * 1024; use constant IO_LAYER => ":raw:perlio(layer=" . CHUNK_LEN . ")"; +use constant GZIP_LEVEL => 6; +use constant GZIP_THRESHOLD => 256; +use constant REBASE_THRESHOLD => 1.4; + Getopt::Long::Configure("bundling"); my $cmd = shift @ARGV // ''; @@ -261,6 +267,8 @@ sub run_commit { my $use_disk_patch = 0; my ($pt_fh, $pt_path); + my $tar_size = 0; + my ($tmp_idx_fh, $tmp_idx_path) = tempfile(DIR => TMP_DIR, UNLINK => 0); binmode $tmp_idx_fh, ":raw"; @@ -276,26 +284,26 @@ sub run_commit { $out_c = $out_s; $out_b = $out_s; - my $obj_path = get_obj_path($out_b); my $stg_file = File::Spec->catfile(TMP_DIR, $out_p); - if (!-e $obj_path) { - rename($stg_file, $obj_path) or copy($stg_file, $obj_path); - } + store_base_file($stg_file, $out_b); $idx = $it_idx->(); } elsif ($cmp == 0) { # Modified or Unchanged ($out_p, $out_s, $out_m, $out_z) = ($idx->{path}, $idx->{s_hash}, $idx->{mtime}, $idx->{size}); if ($idx->{s_hash} ne ($idx->{c_hash} // "-")) { - my $base_obj = get_obj_path($old->{hash}); + my $base_obj = get_base_file($old->{hash}); my $stg_file = File::Spec->catfile(TMP_DIR, $out_p); my $patch = (-T $stg_file) ? qx(diff '$base_obj' '$stg_file') : make_bin_patch($stg_file, $base_obj); - if (defined $patch && length($patch) <= $out_z) { - if (!$use_disk_patch && ($patch_mem_size + length($patch)) > MEM_LIMIT) { + my $patch_len = length($patch); + $tar_size += $patch_len; + + if (defined $patch && $patch_len <= ($out_z * REBASE_THRESHOLD)) { + if (!$use_disk_patch && ($patch_mem_size + $patch_len) > MEM_LIMIT) { ($pt_fh, $pt_path) = tempfile(DIR => TMP_DIR, UNLINK => 0); my $tar = Archive::Tar->new; $tar->add_data($_, $patches{$_}) for keys %patches; @@ -309,16 +317,13 @@ sub run_commit { $tar->write($pt_path); } else { $patches{"$out_p.patch"} = $patch; - $patch_mem_size += length($patch); + $patch_mem_size += $patch_len; } $out_b = $old->{hash}; unlink($stg_file); } else { $out_b = $out_s; - my $obj_path = get_obj_path($out_b); - if (!-e $obj_path) { - rename($stg_file, $obj_path) or copy($stg_file, $obj_path); - } + store_object_file($stg_file, $out_b); } $out_c = $out_s; } else { @@ -364,7 +369,7 @@ sub run_commit { my $tar = Archive::Tar->new; if ($use_disk_patch) { $tar->read($pt_path); unlink $pt_path; } $tar->add_data($_, $patches{$_}) for keys %patches; - $tar->write($bundle_tmp, COMPRESS_GZIP); + $tar->write($bundle_tmp, ($tar_size >= GZIP_THRESHOLD ? GZIP_LEVEL : 0)); $patch_bundle_hash = hash_file_content($bundle_tmp); rename($bundle_tmp, get_obj_path($patch_bundle_hash)); } @@ -437,11 +442,9 @@ sub run_show { open(my $pipe, "| $pager") or die "Can't pipe to $pager: $!"; my $old_fh = select($pipe); - # 2. Use existing logic to get the content my $v = get_file_version($rev_id, $file_path); die "Error: Could not resolve '$file_path' at revision $rev_id.\n" unless defined $v; - # 3. Output content if (ref($v) eq 'SCALAR') { binmode STDOUT, ":raw"; print $$v; @@ -493,7 +496,7 @@ sub run_diff { } } } else { - # Full Tree Walk + # Full tree walk my $s_id = (lc($src // '') eq 'head') ? read_head() : ($src // ''); my $th; if (open my $rf, '<', File::Spec->catfile(REV_DIR, $s_id)) { while (<$rf>) { $th = $1 if /^tree:(.*)$/ } close $rf; @@ -815,7 +818,7 @@ sub is_revision { sub get_file_version { my ($source, $path) = @_; - # Handle Workspace + # Handle workspace if (!defined $source) { return undef unless -e $path; if ((-s $path // 0) > MEM_LIMIT) { @@ -828,7 +831,7 @@ sub get_file_version { return \$data; } - # Resolve Revision + # Resolve revision my $rev_id = (lc($source) eq 'head') ? read_head() : $source; my $rev_file = File::Spec->catfile(REV_DIR, $rev_id); return undef unless -f $rev_file; @@ -852,10 +855,10 @@ sub get_file_version { } return undef unless $node; - my $obj_path = get_obj_path($node->{hash} // ''); + my $obj_path = get_base_file($node->{hash}); return undef unless -f $obj_path; - # Extract Base Object to Temp File + # Extract base object to temp file my ($tfh, $tpath) = tempfile(DIR => TMP_DIR, UNLINK => 1); binmode $tfh, IO_LAYER; @@ -863,9 +866,9 @@ sub get_file_version { binmode $ofh, IO_LAYER; while (read($ofh, my $buf, CHUNK_LEN)) { print $tfh $buf; } close $ofh; - close $tfh; + close $tfh; - # Apply Patch via Streaming Bundle + # Apply patch via streaming bundle if ($patch_bundle_hash) { my $bundle_path = get_obj_path($patch_bundle_hash); if (-f $bundle_path && -s $bundle_path > 0) { @@ -891,7 +894,7 @@ sub get_file_version { } } - # Final Output decision based on result size + # Final output decision based on result size if ((-s $tpath // 0) > MEM_LIMIT) { return $tpath; } else { @@ -900,3 +903,45 @@ sub get_file_version { } } +sub get_base_file { + my ($obj_hash) = @_; + my $obj_path = get_obj_path($obj_hash); + return undef unless -f $obj_path; + + # Peek at the first two bytes for GZIP header + open my $fh, '<:raw', $obj_path or return $obj_path; + read($fh, my $header, 2); + close $fh; + + # If it's not gzipped, return the original path in the object store + return $obj_path unless (defined $header && $header eq "\x1f\x8b"); + + # If gzipped, decompress to a temporary file in the staging area + my ($tfh, $tpath) = tempfile(DIR => TMP_DIR, UNLINK => 1); + close $tfh; + + gunzip $obj_path => $tpath + or die "Failed to decompress base $obj_path: $GunzipError"; + + return $tpath; +} + +sub store_base_file { + my ($stg_file, $obj_hash) = @_; + my $obj_path = get_obj_path($obj_hash); + + # If the object already exists, we don't need to do anything + return $obj_path if -e $obj_path; + + if (!-l $stg_file && -s $stg_file > GZIP_THRESHOLD) { + gzip $stg_file => $obj_path, Level => GZIP_LEVEL + or die "gzip failed for $obj_path: $GzipError"; + unlink($stg_file); + } + else { + rename($stg_file, $obj_path) or copy($stg_file, $obj_path); + } + + return $obj_path; +} + |
