diff options
| author | Sadeep Madurange <sadeep@asciimx.com> | 2026-04-10 21:41:43 +0800 |
|---|---|---|
| committer | Sadeep Madurange <sadeep@asciimx.com> | 2026-04-10 21:41:43 +0800 |
| commit | 35b9a686cb8c5473ad756f49ee293d8ed23e34e8 (patch) | |
| tree | 146918918ff4e32b2651434f9a55c7dc21906511 | |
| parent | 8a848fa2b1c67829c69001bbe5bff2cb182c3588 (diff) | |
| download | cvn-35b9a686cb8c5473ad756f49ee293d8ed23e34e8.tar.gz | |
wip: workspace scanner.
| -rw-r--r-- | README.txt | 11 | ||||
| -rw-r--r-- | vcx | 113 |
2 files changed, 86 insertions, 38 deletions
diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..b5383d5 --- /dev/null +++ b/README.txt @@ -0,0 +1,11 @@ +FUNCTIONAL + +Common ops: status / add / commit / log / checkout / diff +File integrity + +CONSTRAINTS + +SSD: TBW / append-only +File systems: inode count, file descriptors +System: CPU / memory + @@ -2,6 +2,7 @@ use strict; use warnings; +use IO::Handle; use File::Path qw(make_path); use File::Copy qw(copy); use File::Find; @@ -9,6 +10,7 @@ use File::Compare; use File::Basename; use File::Glob qw(:bsd_glob); use File::Spec; +use File::Temp qw(tempfile); use Getopt::Long; use Archive::Tar; use Compress::Zlib; @@ -327,54 +329,89 @@ sub hash_file_content { return $sha->hexdigest; } -sub scan_tree { - my $cb = pop @_; +sub get_dir_scanner { + my (@paths) = @_; - my @stack; + my $chunk_size = 1024 * 64; # 64 KB chunks for IO buffering - my $collect = sub { - my ($path, $files) = @_; - my @stat = lstat($path) or (warn("lstat '$path': $!\n") and return); - if (-d _) { - push @stack, $path; - } elsif (-f _ || -l _) { - push @$files, { - path => $path =~ s|^\./||r, - size => $stat[7], - mtime => $stat[9], - }; + my @buf; + my $buf_size = 0; + my $tot_size = 0; + my $use_disk = 0; + + my ($tmp_fh, $tmp_path); + + my $flush = sub { + if (!$use_disk) { + ($tmp_fh, $tmp_path) = tempfile(UNLINK => 1); + $tmp_fh->setvbuf(undef, _IOFBF, chunk_size); + binmode $tmp_fh, ":utf8"; + $use_disk = 1; } - }; - my @input_files; - $collect->($_, \@input_files) for @_; - if (@input_files) { - @input_files = sort { $a->{path} cmp $b->{path} } @input_files; - $cb->('.', \@input_files); + print $tmp_fh @buf; + @buf = (); + $buf_size = 0; } + my @stack = @paths; while (@stack) { - my $dir = pop @stack; - my $dh; - unless (opendir($dh, $dir)) { - warn "Can't open $dir\n"; - next; - } + my $path = (pop @stack) =~ s|^\./||r; + my @st = lstat($path); - my @files; - my $subdir_idx = @stack; # Track where this level's subdirs start - while (my $ent = readdir($dh)) { - next if $ent eq '.' or $ent eq '..' or $ent eq REPO; - $collect->(File::Spec->catfile($dir, $ent), \@files); + if (-d _) { + if (opendir(my $dh, $path)) { + push @stack, map { File::Spec->catfile($path, $_) } + grep { $_ ne '.' && $_ ne '..' && $_ ne 'REPO' } + readdir($dh); + closedir($dh); + } else { + warn "Could not open '$path': $!\n"; + } + } elsif (-f _ || -l _ || !-e $path) { + # Use 0 as a default for size and mtime for deleted files. + my $size = $st[7] // 0; + my $mtime = $st[9] // 0; + my $line = "$clean_path\t$st[7]\t$st[9]\n"; + my $len = length($record); + + push @buf, $line; + $buf_size += $len; + $tot_size += $len; + + if ((!$use_disk && $tot_size > MEM_LIMIT) || + ($use_disk && $buf_size > $chunk_size)) { + $flush->(); + } } + } - closedir($dh); - @files = sort { $a->{path} cmp $b->{path} } @files; - $cb->($dir, \@files) if @files; - - if (@stack > $subdir_idx) { - my @subdirs = splice(@stack, $subdir_idx); - push @stack, sort { $b cmp $a } @subdirs; + if (!$use_disk) { + @buf = sort @buf; + return sub { + my $line = shift @buffer; + return unless $line; + chomp $line; + my ($p, $s, $m) = split(/\t/, $line); + return { path => $p, size => $s, mtime => $m }; + } + } else { + $flush->() if @buffer; # Clear remaining + close $tmp_fh; + + open(my $sort_fh, "-|", "sort", "-t", "\t", "-k1,1", $tmp_path) + or die "Could not open sort pipe: $!"; + + return sub { + my $line = <$sort_fh>; + unless ($line) { + close $sort_fh; # Reap the sort process + return; + } + + chomp $line; + my ($p, $s, $m) = split(/\t/, $line); + return { path => $p, size => $s, mtime => $m }; } } } |
