Numerically equivalent.pl

From Colettapedia
Jump to navigation Jump to search
#!/usr/bin/perl
use warnings;
use strict;

# Function prototypes
sub LoadOldToNewHash;
sub ParseSigFile($$);
sub uniq;
sub feature_sort;
sub main;
sub Comparison_Of_Rank_Ordered_Group_Values($$$);

# Globals
my %old_featurenames_mapped_to_new;
my %unrecognized_feature_names;

&LoadOldToNewHash;

# Run the program
&main(@ARGV);

#=======================================================================
sub main {

	# Load the two files and their 

	my $file1 = shift;
	my $file2 = shift;

	print "Comparing file \"$file1\" and \"$file2\"\n";

	my %file1_values;
	&ParseSigFile( $file1, \%file1_values );

	my %file2_values;
	&ParseSigFile( $file2, \%file2_values );

	my ($file1_val, $file2_val);
	my $index;
	my $feature_name;
	my @new_feature_names =  values %old_featurenames_mapped_to_new;
  push( @new_feature_names, keys %unrecognized_feature_names );
	my @sorted_feature_names = uniq sort feature_sort @new_feature_names;
	my $matched = 0;
	my $unmatched = 0;
	my $missing = 0;
	my $feature_count = 0;
	my %groups_that_have_unmatched;
	my %groups_that_have_missing;
	foreach $feature_name ( @sorted_feature_names )
	{
		++$feature_count;
		$file1_val = $file1_values{ $feature_name };
		$file2_val = $file2_values{ $feature_name };
		if( $file1_val && $file2_val  )
		{
			if( $file1_val == $file2_val ) {
				++$matched;
				#print "Feature \"$feature_name\" matches\n";
			}
			else
			{
				++$unmatched;
				print "Feature \"$feature_name\" doesn't match:\n";
				print "\tVal $file1_val in file $file1\n";
				print "\tVal $file2_val in file $file2\n";
				my $feature_group = $feature_name;
				# strip off the bin number if it exists
				$feature_group =~ s/ \[\d+\]//;
				if( defined $groups_that_have_unmatched{ $feature_group } ) {
					$groups_that_have_unmatched{ $feature_group }++;
				}
				else
				{
					$groups_that_have_unmatched{ $feature_group } = 1;
				}
			}
		}
		elsif( $file1_val xor $file2_val )
		{
			++$missing;
			if( $file1_val ) {
				print "***Feature \"$feature_name\"\n\tappears in file \"$file1\"\n\tbut not file \"$file2\"\n";
			}
			else
			{
				print "****Feature \"$feature_name\"\n\tappears in file \"$file2\"\n\tbut not file \"$file1\"\n";
			}
			my $feature_group = $feature_name;
			# strip off the bin number if it exists
			$feature_group =~ s/ \[\d+\]//;
			if( defined $groups_that_have_missing{ $feature_group } ) {
				$groups_that_have_missing{ $feature_group }++;
			}
			else
			{
				$groups_that_have_missing{ $feature_group } = 1;
			}
		}
		else
		{
			# neither file had this particular feature
			--$feature_count;
			#print "Neither file had feature $feature_name\n";

		}
	}
	print "\n\n********************************\nSUMMARY:\n";
	print "Features examined: $feature_count\n";
	print "Matching features: $matched\n";
	print "Total unmatched features: $unmatched\n";
	print "Features in one sig file but not the other: $missing\n";
	my $group_count = 1;
	print "Groups with unmatched features:\n";
	foreach my $group_name (keys %groups_that_have_unmatched) {
		print "\t$group_count.\t".$groups_that_have_unmatched{$group_name}."\t$group_name:\n";
		#&Comparison_Of_Rank_Ordered_Group_Values( $group_name, \%file1_values, \%file2_values );
		$group_count++;
	}
	$group_count = 1;
	print "Groups with missing features:\n";
	foreach my $group_name (keys %groups_that_have_missing) {
		print "\t$group_count.\t".$groups_that_have_missing{$group_name}."\t$group_name:\n";
		$group_count++;
	}
	return 1;
}

#################################################################

sub uniq {
    my %seen = ();
    my @r = ();
    foreach my $a (@_) {
        unless ($seen{$a}) {
            push @r, $a;
            $seen{$a} = 1;
        }
    }
    return @r;
}

#################################################################

sub feature_sort {

	my ($a_stem, $a_bin, $b_stem, $b_bin );

	if( $a =~ /(.*) \[(\d+)\]/ ) {
		$a_stem = $1;
		$a_bin = $2;
	}
	if( $b =~ /(.*) \[(\d+)\]/ ) {
		$b_stem = $1;
		$b_bin = $2;
	}

	if ( $a_stem && $a_bin && $b_stem && $b_bin ) {
		if ($a_stem ne $b_stem ) { return $a cmp $b } 
		else { return $a_bin <=> $b_bin; } 
	}
	else {
		return $a cmp $b;
	}
}

#################################################################
sub ParseSigFile($$) {
	my $filename = shift;
	my $values_ref = shift;

	print "Parsing file \"$filename\"\n";

	my $value;
	my $feature;
	my $line_count = 0;
	open( THE_FILE, $filename ) or die "can't open $filename";

	while (<THE_FILE>)
	{
		++$line_count;
		if( $line_count != 1 and $line_count != 2)
		{
			if( /^(-?\d+\.\d+) (.+)$/ ) 
			{
				$value = $1;
				$feature = $2;
				chomp $feature;
				#print "\tline $line_count feature \"$feature\"; value $value\n";
				if( defined $old_featurenames_mapped_to_new{ $feature } )
				{
					$feature = $old_featurenames_mapped_to_new{ $feature };
				}
				else
				{
					$unrecognized_feature_names{$feature} = 1;
				}
				if( defined $$values_ref{ $feature } )
				{
					die "The feature $feature has already been defined for file $filename.\n";
				}
				else
				{
					$$values_ref{ $feature } = $value;
				}
			} # end if file 1 line correctly pattern matches
			else
			{
				die "Error procesing $filename: line doesn't match pattern of <#.#####> <space> <feature name>:\nHere is offending line: $_\n";
			}
		} # end processing file1 lines
	} # end if line_count
	close THE_FILE;

	print "Total lines read in from file \"$filename\" : $line_count\n";
} #end sub



#################################################################
sub Comparison_Of_Rank_Ordered_Group_Values ($$$) {
	my $group_name = shift;
	my $file1_values_ref = shift;
	my $file2_values_ref = shift;

	my @file1_group_values;
	my @file2_group_values;

	my $feature_name;
	my $meta_escaped_group_name;
	foreach $feature_name ( keys %{ $file1_values_ref } ) {
		$meta_escaped_group_name = quotemeta($group_name);
		#print "*** groupname \"$group_name\" ... quotemeta: \"$meta_escaped_group_name\"\n";
		if( $feature_name =~ /$meta_escaped_group_name/ ) {
			push @file1_group_values, $$file1_values_ref{$feature_name};
		}
	}
	foreach $feature_name ( keys %{ $file2_values_ref } ) {
		$meta_escaped_group_name = quotemeta($group_name);
		if( $feature_name =~ /$meta_escaped_group_name/ ) {
			push @file2_group_values, $$file2_values_ref{$feature_name};
		}
	}

	@file1_group_values = sort { $a <=> $b } @file1_group_values;
	@file2_group_values = sort { $a <=> $b } @file2_group_values;

	my $max = $#file1_group_values;
	my $min = $#file2_group_values;
	my $short = 2;
	if( $#file2_group_values > $max ) {
		$max = $#file2_group_values;
		$min = $#file1_group_values;
		$short = 1;
	}
	print "\nGroup analysis: $group_name:\n";
	my $i;
	for( $i = 0; $i <= $min; ++$i ) {
		print $file1_group_values[$i] . "\t" . $file2_group_values[$i] . "\n";
	}

	for( $i = ($min +1); $i <= $max; ++$i ) {
		if( $short == 2) {
			print $file1_group_values[$i] . "\t" . "N/A" . "\n";
		}
		else
		{
			print "N/A" . "\t" . $file2_group_values[$i] . "\n";
		}
	}


}