From 35b9a686cb8c5473ad756f49ee293d8ed23e34e8 Mon Sep 17 00:00:00 2001
From: Sadeep Madurange <sadeep@asciimx.com>
Date: Fri, 10 Apr 2026 21:41:43 +0800
Subject: wip: workspace scanner.

---
 README.txt |  11 ++++++
 vcx        | 113 ++++++++++++++++++++++++++++++++++++++++---------------------
 2 files changed, 86 insertions(+), 38 deletions(-)
 create mode 100644 README.txt

diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..b5383d5
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,11 @@
+FUNCTIONAL
+
+Common ops: status / add / commit / log / checkout / diff
+File integrity
+
+CONSTRAINTS
+
+SSD: TBW / append-only
+File systems: inode count, file descriptors
+System: CPU / memory
+
diff --git a/vcx b/vcx
index 8682be8..b68d28d 100644
--- a/vcx
+++ b/vcx
@@ -2,6 +2,7 @@
 
 use strict;
 use warnings;
+use IO::Handle;
 use File::Path qw(make_path);
 use File::Copy qw(copy);
 use File::Find;
@@ -9,6 +10,7 @@ use File::Compare;
 use File::Basename;
 use File::Glob qw(:bsd_glob);
 use File::Spec;
+use File::Temp qw(tempfile);
 use Getopt::Long;
 use Archive::Tar;
 use Compress::Zlib;
@@ -327,54 +329,89 @@ sub hash_file_content {
 	return $sha->hexdigest;
 }
 
-sub scan_tree {
-	my $cb = pop @_; 
+# Scan @paths, descending into directories (entries named REPO are skipped),
+# and return an iterator yielding { path, size, mtime } hash refs in path
+# order, then nothing. Missing paths get size/mtime 0. Once records exceed
+# MEM_LIMIT bytes they spill to a temp file that sort(1) orders externally.
+sub get_dir_scanner {
+	my (@paths) = @_;
 
-	my @stack;
+	my $chunk_size = 1024 * 64; # Flush to disk in ~64 KB batches
 
-	my $collect = sub {
-		my ($path, $files) = @_;
-		my @stat = lstat($path) or (warn("lstat '$path': $!\n") and return);
-		if (-d _) {
-			push @stack, $path;
-		} elsif (-f _ || -l _) {
-			push @$files, {
-				path  => $path =~ s|^\./||r,
-				size  => $stat[7],
-				mtime => $stat[9],
-			};
+	my @buf; # Pending "path\tsize\tmtime\n" records
+	my $buf_size = 0; # Bytes currently held in @buf
+	my $tot_size = 0; # Bytes recorded since the scan started
+	my $use_disk = 0; # True once records have spilled to the temp file
+	my ($tmp_fh, $tmp_path);
+
+	my $flush = sub {
+		if (!$use_disk) {
+			($tmp_fh, $tmp_path) = tempfile(UNLINK => 1);
+			binmode $tmp_fh; # Raw bytes: an encoding layer would alter paths
+			$use_disk = 1;
 		}
-	};
 
-	my @input_files;
-	$collect->($_, \@input_files) for @_;
-	if (@input_files) {
-		@input_files = sort { $a->{path} cmp $b->{path} } @input_files;	
-		$cb->('.', \@input_files);
-	}
+		print {$tmp_fh} @buf or die "Could not write '$tmp_path': $!\n";
+		@buf = ();
+		$buf_size = 0;
+	};
 
+	my @stack = @paths;
 	while (@stack) {
-		my $dir = pop @stack;
-		my $dh;
-		unless (opendir($dh, $dir)) { 
-			warn "Can't open $dir\n"; 
-			next; 
-		}
+		my $path = (pop @stack) =~ s|^\./||r;
+		my @st = lstat($path);
 
-		my @files;
-		my $subdir_idx = @stack; # Track where this level's subdirs start
-		while (my $ent = readdir($dh)) {
-			next if $ent eq '.' or $ent eq '..' or $ent eq REPO;
-			$collect->(File::Spec->catfile($dir, $ent), \@files);
+		if (-d _) {
+			if (opendir(my $dh, $path)) {
+				push @stack, map { File::Spec->catfile($path, $_) }
+					grep { $_ ne '.' && $_ ne '..' && $_ ne REPO }
+					readdir($dh);
+				closedir($dh);
+			} else {
+				warn "Could not open '$path': $!\n";
+			}
+		} elsif (-f _ || -l _ || !-e $path) {
+			# Use 0 as a default for size and mtime for deleted files.
+			my $line = join("\t", $path, $st[7] // 0, $st[9] // 0) . "\n";
+			my $len = length($line);
+
+			push @buf, $line;
+			$buf_size += $len;
+			$tot_size += $len;
+
+			if ((!$use_disk && $tot_size > MEM_LIMIT) ||
+			    ($use_disk && $buf_size > $chunk_size)) {
+				$flush->();
+			}
 		}
+	}
 
-		closedir($dh);
-		@files = sort { $a->{path} cmp $b->{path} } @files;
-		$cb->($dir, \@files) if @files;
+	if (!$use_disk) {
+		@buf = sort @buf;
+		return sub {
+			my $line = shift @buf;
+			return unless defined $line;
+			chomp $line;
+			my ($p, $s, $m) = split(/\t/, $line);
+			return { path => $p, size => $s, mtime => $m };
+		}
+	} else {
+		$flush->() if @buf; # Clear remaining
+		close $tmp_fh or die "Could not close '$tmp_path': $!\n";
 
-		if (@stack > $subdir_idx) {
-			my @subdirs = splice(@stack, $subdir_idx);
-			push @stack, sort { $b cmp $a } @subdirs;
+		local $ENV{LC_ALL} = 'C'; # Byte order, matching Perl's sort above
+		open(my $sort_fh, "-|", "sort", "-t", "\t", "-k1,1", $tmp_path)
+			or die "Could not open sort pipe: $!";
+
+		return sub {
+			my $line = <$sort_fh>;
+			unless (defined $line) {
+				close $sort_fh; # Reap the sort process
+				return;
+			}
+			chomp $line;
+			my ($p, $s, $m) = split(/\t/, $line);
+			return { path => $p, size => $s, mtime => $m };
 		}
 	}
 }
-- 
cgit v1.2.3