2.9BSD/bin/rebuild_tables

#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;

# Move to the base directory. Every path used below is relative to it, so
# stop here rather than unlink and create files somewhere else if the
# directory cannot be entered.
my $basedir = '/usr/local/unixtree';
chdir($basedir) or die("Cannot chdir to $basedir: $!\n");

# Zap and rebuild the filename/id database table
print("Rebuilding the filename sqlite3 table ... ");

# Rebuild the table from scratch. The old database may legitimately not
# exist, so the result of unlink() is deliberately ignored.
unlink("tmp/filename.db");
system("sqlite3 tmp/filename.db < db/schema.sql") == 0
  or die("Cannot load db/schema.sql into tmp/filename.db (status $?)\n");

# Now find and insert all the files in all the releases. The directories
# handed to find are the text before the first ':' on each line of
# db/releases.txt. All inserts are fed to one sqlite3 process inside a
# single transaction.
open( my $OUT, "| sqlite3 tmp/filename.db" )
  or die("Cannot start sqlite3 on tmp/filename.db: $!\n");
print( $OUT "begin transaction;\n" );
print( $OUT "insert into filename values(NULL,'dummy_record');\n" );
open( my $IN, "find `sed 's/:.*//' db/releases.txt` -type f |" )
  or die("Cannot run find over the release directories: $!\n");
while ( my $filename = <$IN> ) {
    chomp($filename);

    # Double any single quote so the name remains one SQL string literal;
    # sqlite3 stores it back as a single quote.
    $filename =~ s/'/''/g;
    print( $OUT "insert into filename values(NULL,'$filename');\n" );
}
print( $OUT "commit transaction;\n" );

# close() on a pipe reports the child's exit status. A non-zero exit from
# find is only reported, so the files it did list are kept; a failing
# sqlite3 means the table cannot be trusted by the later stages.
close($IN) or warn("find did not exit cleanly (status $?)\n");
close($OUT)
  or die("sqlite3 failed while filling tmp/filename.db (status $?)\n");
print("done\n");

# Load the filename/id pairs from the database into %Fileid (name => id)
my %Fileid;
print("Loading the filename/id pairs ... ");
open( $IN,
    "echo 'select * from filename order by id;' | sqlite3 tmp/filename.db |" )
  or die("Cannot query tmp/filename.db: $!\n");

# Highest id seen so far. Starts at 0 so that an empty table reports
# "0 files" instead of an undefined value.
my $maxfid = 0;
while ( my $row = <$IN> ) {
    chomp($row);

    # Each row is read as "id|name" (assumes id is the first column of the
    # filename table). Split on the first '|' only, so a filename that
    # itself contains '|' is kept whole.
    my ( $id, $name ) = split( m{\|}, $row, 2 );

    # Skip blank or malformed rows rather than storing an undefined name
    next if ( !defined($name) );

    $Fileid{$name} = $id;
    $maxfid = $id;    # Rows arrive in ascending id order
}
close($IN);
print("$maxfid files\n");

# Now run ctcompare and get the raw list of similarities.
# %Rawsimlist maps fileid => { other fileid => summed run lengths }.
my %Rawsimlist;
print("Running ctcompare (will take a while) ... ");

# Let File::Temp create the scratch file: it chooses an unpredictable name
# and creates the file itself, unlike a guessable /tmp/rebuild.<pid> path
# that another user could pre-create or symlink. The file is removed when
# the $tmpfh object is destroyed.
use File::Temp ();
my $tmpfh   = File::Temp->new( TEMPLATE => 'rebuild.XXXXXX', TMPDIR => 1 );
my $tmpfile = $tmpfh->filename;

# ctcompare's exit-code conventions are not known here, so a non-zero
# status is reported but whatever output was produced is still read.
if ( system("ctcompare Ctf/* > $tmpfile") != 0 ) {
    warn("ctcompare did not exit cleanly (status $?)\n");
}
print("done\n");
print("Reading the ctcompare results ... ");
open( $IN, "<", $tmpfile ) or die("Cannot read $tmpfile: $!\n");
while ( my $line = <$IN> ) {

    # Only the ctcompare 3-column header lines are of interest:
    # a run length followed by two file fields
    next if ( $line !~ /^(\d+)\s+(\S+)\s+(\S+)/ );
    my ( $runlength, $file1, $file2 ) = ( $1, $2, $3 );

    # Keep only the part of each file field before the first ':'
    $file1 =~ s/:.*//;
    $file2 =~ s/:.*//;

    # Convert to fileids; pairs naming an unknown file are reported and
    # skipped
    my $f1id = $Fileid{$file1};
    if ( !defined($f1id) ) {
        print("No fileid for $file1\n");
        next;
    }
    my $f2id = $Fileid{$file2};
    if ( !defined($f2id) ) {
        print("No fileid for $file2\n");
        next;
    }

    # Increment the similarity count for the pair, in both directions
    $Rawsimlist{$f1id}{$f2id} += $runlength;
    $Rawsimlist{$f2id}{$f1id} += $runlength;
}
close($IN);

# Dropping the File::Temp object deletes the scratch file now rather than
# at program exit
undef($tmpfh);
print("done\n");

# Find the top 50 similarities for each fileid and write them to
# tmp/similarities. The file holds one fixed-size record per fileid, for
# ids 0 up to the highest id in order; each record is 50 unsigned 32-bit
# integers in native byte order (pack "L").
print("Writing the similarities out ... ");
my $topcount = 50;    # Entries stored per fileid
open( $OUT, ">", "tmp/similarities" )
  or die("Cannot write tmp/similarities: $!\n");
binmode($OUT);        # The records are packed binary, not text

# With no usable $maxfid (empty filename table) only the record for id 0
# is written
my $lastfid = defined($maxfid) ? $maxfid : 0;
for my $fid ( 0 .. $lastfid ) {
    my @simidlist;

    # Not all files actually have similarities; those get an all-zero record
    my $href = $Rawsimlist{$fid};
    if ( defined($href) ) {

        # Similar file ids sorted by descending similarity. Equal scores
        # are ordered by ascending id so the output is identical from one
        # run to the next instead of following hash order.
        @simidlist =
          sort( { $href->{$b} <=> $href->{$a} || $a <=> $b } keys( %{$href} ) );
    }

    # Make sure that we have exactly $topcount entries: drop the weakest
    # matches, or pad with id 0
    splice( @simidlist, $topcount ) if ( @simidlist > $topcount );
    push( @simidlist, (0) x ( $topcount - @simidlist ) );

    # Save the sorted list to the file as $topcount binary ints
    print( {$OUT} pack( "L*", @simidlist ) )
      or die("Cannot write to tmp/similarities: $!\n");
}

# Buffered write errors only surface at close, so check it
close($OUT) or die("Cannot finish writing tmp/similarities: $!\n");
print("done\n");
exit(0);