#!/usr/bin/perl
#
# Rebuild the filename/id database and the per-file similarity table.
#
# Steps:
#   1. Recreate tmp/filename.db from db/schema.sql and insert every regular
#      file found under the directories listed in db/releases.txt.
#   2. Read the filename/id pairs back from the database.
#   3. Run ctcompare over Ctf/* and accumulate, for every pair of files, the
#      total run length reported for that pair.
#   4. Write tmp/similarities: for every fileid from 0 to the highest id, a
#      fixed-size record of the ids of the most similar files.
#
use strict;
use warnings;
use Data::Dumper;
use File::Temp qw(tempfile);

# Number of similar-file slots stored for every fileid
my $TOPN = 50;

# Location of the sqlite3 database, relative to the base directory
my $dbfile = 'tmp/filename.db';

# Unbuffer STDOUT so the "... " progress messages show up before the
# (possibly long) step they announce, not after it
$| = 1;

# Return $text with single quotes doubled so that it can be embedded
# in a single-quoted SQL string literal.
sub sql_quote {
    my ($text) = @_;
    $text =~ s/'/''/g;
    return $text;
}

# Run $cmd through the shell and return its exit status.
# Dies if the command could not be started or was killed by a signal.
sub run_shell {
    my ($cmd) = @_;
    my $rc = system($cmd);
    die "Cannot run '$cmd': $!" if ( $rc == -1 );
    die sprintf( "'%s' was killed by signal %d", $cmd, $? & 127 )
      if ( $? & 127 );
    return $? >> 8;
}

# Move to the base directory. Everything below uses relative paths, so
# carrying on after a failed chdir would operate on the wrong files.
my $basedir = '/usr/local/unixtree';
chdir($basedir) or die "Cannot chdir to $basedir: $!";

# Zap and rebuild the filename/id database table
print("Rebuilding the filename sqlite3 table ... ");

# Rebuild the table from scratch
if ( -e $dbfile ) {
    unlink($dbfile) or die "Cannot remove $dbfile: $!";
}
run_shell("sqlite3 $dbfile < db/schema.sql") == 0
  or die "Cannot create $dbfile from db/schema.sql";

# Now find and insert all the files in all the releases
open( my $sqlout, '|-', 'sqlite3', $dbfile )
  or die "Cannot start sqlite3 on $dbfile: $!";
print( $sqlout "begin transaction;\n" );

# NOTE(review): the dummy record is inserted before any real file, so it
# presumably takes the first id; the reason for it is not visible here.
print( $sqlout "insert into filename values(NULL,'dummy_record');\n" );

# The release directories are the part of each db/releases.txt line that
# precedes the first colon; the shell splits them into arguments for find.
open( my $findin, '-|', "find `sed 's/:.*//' db/releases.txt` -type f" )
  or die "Cannot start find: $!";
while ( my $path = <$findin> ) {
    chomp($path);
    my $quoted = sql_quote($path);
    print( $sqlout "insert into filename values(NULL,'$quoted');\n" );
}
print( $sqlout "commit transaction;\n" );

# find's exit-status was ignored before; report a problem but carry on
close($findin) or warn "find reported a problem (status $?)\n";
close($sqlout) or die "sqlite3 failed while loading $dbfile (status $?)";
print("done\n");

# Load the filename/id pairs from the database
my %Fileid;
print("Loading the filename/id pairs ... ");
open( my $sqlin, '-|', 'sqlite3', $dbfile,
    'select * from filename order by id;' )
  or die "Cannot start sqlite3 on $dbfile: $!";
my $maxfid = 0;
while ( my $row = <$sqlin> ) {
    chomp($row);

    # Limit the split to two fields so a '|' inside a filename is kept
    my ( $id, $name ) = split( m{\|}, $row, 2 );
    next if ( !defined($name) );
    $Fileid{$name} = $id;

    # Rows arrive ordered by id, so the last one seen is the highest
    $maxfid = $id;
}
close($sqlin) or die "sqlite3 failed while reading $dbfile (status $?)";
print("$maxfid files\n");

# Now run ctcompare and get the raw list of similarities
my %Rawsimlist;
print("Running ctcompare (will take a while) ... ");

# Use an unpredictable temporary name; UNLINK removes the file at exit
# even if the script dies before the explicit unlink below
my ( $tmpfh, $tmpfile ) =
  tempfile( 'rebuild.XXXXXX', DIR => '/tmp', UNLINK => 1 );
close($tmpfh);

# ctcompare's exit-status convention is not visible here, so a non-zero
# status is only reported, not treated as fatal
my $ctstatus = run_shell("ctcompare Ctf/* > $tmpfile");
warn "ctcompare exited with status $ctstatus\n" if ( $ctstatus != 0 );
print("done\n");

print("Reading the ctcompare results ... ");
open( my $ctin, '<', $tmpfile ) or die "Cannot read $tmpfile: $!";
while ( my $line = <$ctin> ) {
    next if ( $line !~ /^(\d+)\s+(\S+)\s+(\S+)/ );

    # We found a ctcompare 3-column header
    my ( $runlength, $file1, $file2 ) = ( $1, $2, $3 );

    # Keep only the part of each name before the first colon
    $file1 =~ s/:.*//;
    $file2 =~ s/:.*//;

    # Convert to fileids
    my $f1id = $Fileid{$file1};
    if ( !defined($f1id) ) {
        print("No fileid for $file1\n");
        next;
    }
    my $f2id = $Fileid{$file2};
    if ( !defined($f2id) ) {
        print("No fileid for $file2\n");
        next;
    }

    # Increment the similarity count for the pair, in both directions
    $Rawsimlist{$f1id}{$f2id} += $runlength;
    $Rawsimlist{$f2id}{$f1id} += $runlength;
}
close($ctin);
unlink($tmpfile);
print("done\n");

# Find the top similarities for each fileid
print("Writing the similarities out ... ");
my $simfile = 'tmp/similarities';
open( my $simout, '>', $simfile ) or die "Cannot write $simfile: $!";
binmode($simout);
for my $fid ( 0 .. $maxfid ) {
    my @simidlist;

    # Not all files actually have similarities
    my $href = $Rawsimlist{$fid};
    if ( defined($href) ) {

        # Get a list of similar files, sorted by descending similarity.
        # Ties are broken by ascending fileid so that the output does
        # not depend on hash ordering.
        @simidlist =
          sort { $href->{$b} <=> $href->{$a} || $a <=> $b } keys( %{$href} );
    }

    # Make sure that we have exactly $TOPN entries: drop the weakest
    # ones, or pad with fileid 0
    $#simidlist = $TOPN - 1 if ( @simidlist > $TOPN );
    push( @simidlist, (0) x ( $TOPN - @simidlist ) );

    # Save the sorted list to the file as $TOPN binary ints
    # ("L" is an unsigned 32-bit value in native byte order)
    print( $simout pack( "L*", @simidlist ) )
      or die "Cannot write to $simfile: $!";
}
close($simout) or die "Cannot finish writing $simfile: $!";
print("done\n");
exit(0);