From 35b9a686cb8c5473ad756f49ee293d8ed23e34e8 Mon Sep 17 00:00:00 2001
From: Sadeep Madurange
Date: Fri, 10 Apr 2026 21:41:43 +0800
Subject: wip: workspace scanner.

---
 README.txt |  11 ++++
 vcx        | 173 +++++++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 134 insertions(+), 50 deletions(-)
 create mode 100644 README.txt

Review notes (placed before the first "diff --git", so not part of the
commit message):

* This patch was reflowed from a copy whose line breaks and indentation had
  been collapsed.  Indentation of context and removed lines is assumed to
  be four spaces; use "git apply --ignore-whitespace" if it differs.  The
  commit id and the vcx index line predate the changes listed below.
* The last vcx hunk now removes scan_tree wholesale and adds a corrected
  get_dir_scanner.  Fixes relative to the original draft:
  - terminate the "my $flush = sub { ... }" statement with ";"
  - use $chunk_size, $path, $line and @buf where the draft referred to the
    undeclared chunk_size, $clean_path, $record and @buffer
  - write the defaulted $size/$mtime into the record, not $st[7]/$st[9]
  - compare directory entries against the REPO constant, not 'REPO'
  - drop setvbuf/_IOFBF (not exported by default, and setvbuf is not
    available on PerlIO builds); @buf already batches the writes
  - keep the temp file and sort pipe in raw byte mode instead of :utf8
  - run sort(1) with LC_ALL=C so both routes collate byte-wise
  - check print/close on the temp file; stop reading once the sort pipe
    has been closed
  - skip names containing a tab or newline (not representable as a record)
  - resolve MEM_LIMIT at run time with a placeholder fallback

diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..b5383d5
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,11 @@
+FUNCTIONAL
+
+Common ops: status / add / commit / log / checkout / diff
+File integrity
+
+CONSTRAINTS
+
+SSD: TBW / append-only
+File systems: inode count, file descriptors
+System: CPU / memory
+
diff --git a/vcx b/vcx
index 8682be8..b68d28d 100644
--- a/vcx
+++ b/vcx
@@ -2,6 +2,7 @@
 use strict;
 use warnings;
 
+use IO::Handle;
 use File::Path qw(make_path);
 use File::Copy qw(copy);
 use File::Find;
@@ -9,6 +10,7 @@ use File::Compare;
 use File::Basename;
 use File::Glob qw(:bsd_glob);
 use File::Spec;
+use File::Temp qw(tempfile);
 use Getopt::Long;
 use Archive::Tar;
 use Compress::Zlib;
@@ -327,54 +329,125 @@ sub hash_file_content {
     return $sha->hexdigest;
 }
 
-sub scan_tree {
-    my $cb = pop @_;
-
-    my @stack;
-
-    my $collect = sub {
-        my ($path, $files) = @_;
-        my @stat = lstat($path) or (warn("lstat '$path': $!\n") and return);
-        if (-d _) {
-            push @stack, $path;
-        } elsif (-f _ || -l _) {
-            push @$files, {
-                path => $path =~ s|^\./||r,
-                size => $stat[7],
-                mtime => $stat[9],
-            };
-        }
-    };
-
-    my @input_files;
-    $collect->($_, \@input_files) for @_;
-    if (@input_files) {
-        @input_files = sort { $a->{path} cmp $b->{path} } @input_files;
-        $cb->('.', \@input_files);
-    }
-
-    while (@stack) {
-        my $dir = pop @stack;
-        my $dh;
-        unless (opendir($dh, $dir)) {
-            warn "Can't open $dir\n";
-            next;
-        }
-
-        my @files;
-        my $subdir_idx = @stack; # Track where this level's subdirs start
-        while (my $ent = readdir($dh)) {
-            next if $ent eq '.' or $ent eq '..' or $ent eq REPO;
-            $collect->(File::Spec->catfile($dir, $ent), \@files);
-        }
-
-        closedir($dh);
-        @files = sort { $a->{path} cmp $b->{path} } @files;
-        $cb->($dir, \@files) if @files;
-
-        if (@stack > $subdir_idx) {
-            my @subdirs = splice(@stack, $subdir_idx);
-            push @stack, sort { $b cmp $a } @subdirs;
-        }
-    }
+# Walk @paths (files and/or directories, symlinks are not followed) and
+# return an iterator over every regular file and symlink found, ordered
+# by path.  Each call returns a hashref with the keys path, size and
+# mtime, or nothing once the listing is exhausted.  Paths that cannot be
+# lstat'ed (e.g. deleted files) are reported with size and mtime 0.
+#
+# Records stay in memory until their total size exceeds the memory limit;
+# after that they are spilled to a temp file in $chunk_size batches and
+# ordered with the external sort(1) utility.
+sub get_dir_scanner {
+    my (@paths) = @_;
+
+    my $chunk_size = 1024 * 64; # Spill to disk in 64 KB batches
+
+    # MEM_LIMIT is not introduced by this patch.  Use it when the file
+    # defines it; the 64 MB fallback is a placeholder -- TODO confirm.
+    my $mem_limit = __PACKAGE__->can('MEM_LIMIT')
+        ? MEM_LIMIT()
+        : 64 * 1024 * 1024;
+
+    my @buf;
+    my $buf_size = 0;
+    my $tot_size = 0;
+    my $use_disk = 0;
+
+    my ($tmp_fh, $tmp_path);
+
+    my $flush = sub {
+        if (!$use_disk) {
+            ($tmp_fh, $tmp_path) = tempfile(UNLINK => 1);
+            # Paths are the bytes readdir returned; write them verbatim so
+            # the in-memory and on-disk routes yield identical strings.
+            binmode $tmp_fh;
+            $use_disk = 1;
+        }
+
+        print {$tmp_fh} @buf
+            or die "Could not write to '$tmp_path': $!\n";
+        @buf = ();
+        $buf_size = 0;
+    };
+
+    my @stack = @paths;
+    while (@stack) {
+        my $path = (pop @stack) =~ s|^\./||r;
+
+        # Records are tab-separated lines, so a name containing a tab or
+        # a newline cannot be represented.
+        if ($path =~ /[\t\n]/) {
+            warn "Skipping '$path': name contains a tab or newline\n";
+            next;
+        }
+
+        my @st = lstat($path);
+
+        if (@st && -d _) {
+            if (opendir(my $dh, $path)) {
+                push @stack, map { File::Spec->catfile($path, $_) }
+                    grep { $_ ne '.' && $_ ne '..' && $_ ne REPO }
+                    readdir($dh);
+                closedir($dh);
+            } else {
+                warn "Could not open '$path': $!\n";
+            }
+        } elsif (!@st || -f _ || -l _) {
+            # Use 0 as a default for size and mtime for deleted files.
+            my $size = $st[7] // 0;
+            my $mtime = $st[9] // 0;
+            my $line = "$path\t$size\t$mtime\n";
+            my $len = length($line);
+
+            push @buf, $line;
+            $buf_size += $len;
+            $tot_size += $len;
+
+            if ((!$use_disk && $tot_size > $mem_limit) ||
+                ($use_disk && $buf_size > $chunk_size)) {
+                $flush->();
+            }
+        }
+    }
+
+    if (!$use_disk) {
+        @buf = sort @buf;
+        return sub {
+            my $line = shift @buf;
+            return unless defined $line;
+            chomp $line;
+            my ($p, $s, $m) = split(/\t/, $line);
+            return { path => $p, size => $s, mtime => $m };
+        };
+    }
+
+    $flush->() if @buf; # Write out whatever is still buffered
+    close $tmp_fh
+        or die "Could not close '$tmp_path': $!\n";
+
+    # Collate byte-wise so the order matches Perl's sort on the in-memory
+    # route regardless of the user's locale.
+    local $ENV{LC_ALL} = 'C';
+    open(my $sort_fh, "-|", "sort", "-t", "\t", "-k1,1", $tmp_path)
+        or die "Could not open sort pipe: $!";
+    binmode $sort_fh;
+
+    return sub {
+        return unless $sort_fh;
+
+        my $line = <$sort_fh>;
+        unless (defined $line) {
+            # Closing reaps the sort process; drop the handle so later
+            # calls do not read from a closed pipe.
+            close $sort_fh
+                or warn "sort(1) failed: ", ($! || "exit status $?"), "\n";
+            undef $sort_fh;
+            return;
+        }
+
+        chomp $line;
+        my ($p, $s, $m) = split(/\t/, $line);
+        return { path => $p, size => $s, mtime => $m };
+    };
 }
-- 
cgit v1.2.3