2.9BSD/bin/rebuild_tables
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
# Move to the base directory. Every relative path used below (tmp/, db/,
# Ctf/) is resolved against it, so stop immediately if the chdir fails
# rather than deleting or creating files somewhere else.
my $basedir = '/usr/local/unixtree';
chdir($basedir) || die("Cannot chdir to $basedir: $!\n");
# Zap and rebuild the filename/id database table: the old database file is
# removed, the schema is reloaded from db/schema.sql, and one row is inserted
# for every regular file that find reports.
print("Rebuilding the filename sqlite3 table ... ");

# Rebuild the table from scratch. The unlink result is deliberately not
# checked, because the database may simply not exist yet.
unlink("tmp/filename.db");
if ( system("sqlite3 tmp/filename.db < db/schema.sql") != 0 ) {
    die("Cannot load db/schema.sql into tmp/filename.db (status $?)\n");
}

# Now find and insert all the files in all the releases. The insert
# statements are piped to sqlite3 inside a single transaction.
open( my $OUT, "|-", "sqlite3 tmp/filename.db" )
  || die("Cannot start sqlite3 on tmp/filename.db: $!\n");
print( $OUT "begin transaction;\n" );
print( $OUT "insert into filename values(NULL,'dummy_record');\n" );

# The sed command keeps the text before the first colon of each line of
# db/releases.txt; those words become the starting points for find.
open( my $IN, "-|", "find `sed 's/:.*//' db/releases.txt` -type f" )
  || die("Cannot start find over the releases: $!\n");
while ( my $path = <$IN> ) {
    chomp($path);

    # Double every single quote so that the name remains one valid SQL
    # string literal instead of terminating it early.
    ( my $quoted = $path ) =~ s/'/''/g;
    print( $OUT "insert into filename values(NULL,'$quoted');\n" );
}
print( $OUT "commit transaction;\n" );

# Closing a pipe handle waits for the child and reports its exit status.
# A find failure only warns; a sqlite3 failure means the table is incomplete.
close($IN)  || warn("find reported a problem (status $?)\n");
close($OUT) || die("sqlite3 failed while inserting filenames (status $?)\n");
print("done\n");
# Load the filename/id pairs from the database into %Fileid (name => id).
# $maxfid ends up holding the id of the last row read; because the query is
# ordered by id, that is the highest id in the table.
my %Fileid;
print("Loading the filename/id pairs ... ");
open( $IN, "-|",
    "echo 'select * from filename order by id;' | sqlite3 tmp/filename.db" )
  || die("Cannot query tmp/filename.db: $!\n");

# Start from 0 so that an empty table yields "0 files" instead of an
# undefined value.
my $maxfid = 0;
while ( my $row = <$IN> ) {
    chomp($row);

    # Split on the first '|' only, so that a filename which itself contains
    # a '|' is kept whole.
    my ( $id, $name ) = split( m{\|}, $row, 2 );
    next if ( !defined($name) );
    $Fileid{$name} = $id;
    $maxfid = $id;
}
close($IN);
print("$maxfid files\n");
# Now run ctcompare and get the raw list of similarities.
# %Rawsimlist maps fileid => { other fileid => summed run length }, and is
# filled in symmetrically for both members of each pair.
use File::Temp qw(tempfile);
my %Rawsimlist;
print("Running ctcompare (will take a while) ... ");

# Create the scratch file with an unpredictable name instead of a name
# built from the process id, so that nobody else can prepare a file or
# symlink at that path in advance. UNLINK removes it at program exit even
# if the script dies before reaching the unlink below.
my ( $tmpfh, $tmpfile ) =
  tempfile( "rebuild.XXXXXX", DIR => "/tmp", UNLINK => 1 );
close($tmpfh);
if ( system("ctcompare Ctf/* > $tmpfile") != 0 ) {

    # Only warn: whatever output was produced is still processed below.
    warn("ctcompare exited with status $?\n");
}
print("done\n");

print("Reading the ctcompare results ... ");
open( $IN, "<", $tmpfile ) || die("Cannot read $tmpfile: $!\n");
while ( my $line = <$IN> ) {

    # Only the 3-column header lines (run length, file1, file2) matter;
    # every other line is skipped.
    my ( $runlength, $file1, $file2 ) =
      ( $line =~ /^(\d+)\s+(\S+)\s+(\S+)/ );
    next if ( !defined($file2) );

    # Drop everything from the first colon onwards in each file field
    $file1 =~ s/:.*//;
    $file2 =~ s/:.*//;

    # Convert to fileids
    my $f1id = $Fileid{$file1};
    if ( !defined($f1id) ) {
        print("No fileid for $file1\n");
        next;
    }
    my $f2id = $Fileid{$file2};
    if ( !defined($f2id) ) {
        print("No fileid for $file2\n");
        next;
    }

    # Increment the similarity count for the pair, in both directions
    $Rawsimlist{$f1id}{$f2id} += $runlength;
    $Rawsimlist{$f2id}{$f1id} += $runlength;
}
close($IN);
unlink($tmpfile);
print("done\n");
# Find the top 50 similarities for each fileid and write them out.
# tmp/similarities receives one fixed-size record per fileid from 0 up to
# $maxfid, in order; each record is 50 unsigned 32-bit integers in native
# byte order (pack "L"), padded with 0 when a file has fewer similar files.
print("Writing the similarities out ... ");
my $topcount = 50;
open( $OUT, ">", "tmp/similarities" )
  || die("Cannot write tmp/similarities: $!\n");
binmode($OUT);
for my $fid ( 0 .. $maxfid ) {
    my @simidlist;

    # Not all files actually have similarities
    my $href = $Rawsimlist{$fid};
    if ( defined($href) ) {

        # Get a list of similar files, sorted by descending similarity.
        # Equal scores are ordered by ascending fileid so that the output
        # does not depend on hash ordering and is the same on every run.
        @simidlist =
          sort { $href->{$b} <=> $href->{$a} || $a <=> $b } keys( %{$href} );
    }

    # Make sure that we have exactly 50 entries: cut off the excess, then
    # pad with zeroes.
    splice( @simidlist, $topcount ) if ( @simidlist > $topcount );
    push( @simidlist, (0) x ( $topcount - @simidlist ) );

    # Save the sorted list to the file as 50 binary ints
    print( $OUT pack( "L$topcount", @simidlist ) )
      || die("Cannot write to tmp/similarities: $!\n");
}

# Buffered write errors only show up when the handle is closed
close($OUT) || die("Cannot finish writing tmp/similarities: $!\n");
print("done\n");
exit(0);