Numerically equivalent.pl
Jump to navigation
Jump to search
#!/usr/bin/perl use warnings; use strict; # Function prototypes sub LoadOldToNewHash; sub ParseSigFile($$); sub uniq; sub feature_sort; sub main; sub Comparison_Of_Rank_Ordered_Group_Values($$$); # Globals my %old_featurenames_mapped_to_new; my %unrecognized_feature_names; &LoadOldToNewHash; # Run the program &main(@ARGV); #======================================================================= sub main { # Load the two files and their my $file1 = shift; my $file2 = shift; print "Comparing file \"$file1\" and \"$file2\"\n"; my %file1_values; &ParseSigFile( $file1, \%file1_values ); my %file2_values; &ParseSigFile( $file2, \%file2_values ); my ($file1_val, $file2_val); my $index; my $feature_name; my @new_feature_names = values %old_featurenames_mapped_to_new; push( @new_feature_names, keys %unrecognized_feature_names ); my @sorted_feature_names = uniq sort feature_sort @new_feature_names; my $matched = 0; my $unmatched = 0; my $missing = 0; my $feature_count = 0; my %groups_that_have_unmatched; my %groups_that_have_missing; foreach $feature_name ( @sorted_feature_names ) { ++$feature_count; $file1_val = $file1_values{ $feature_name }; $file2_val = $file2_values{ $feature_name }; if( $file1_val && $file2_val ) { if( $file1_val == $file2_val ) { ++$matched; #print "Feature \"$feature_name\" matches\n"; } else { ++$unmatched; print "Feature \"$feature_name\" doesn't match:\n"; print "\tVal $file1_val in file $file1\n"; print "\tVal $file2_val in file $file2\n"; my $feature_group = $feature_name; # strip off the bin number if it exists $feature_group =~ s/ \[\d+\]//; if( defined $groups_that_have_unmatched{ $feature_group } ) { $groups_that_have_unmatched{ $feature_group }++; } else { $groups_that_have_unmatched{ $feature_group } = 1; } } } elsif( $file1_val xor $file2_val ) { ++$missing; if( $file1_val ) { print "***Feature \"$feature_name\"\n\tappears in file \"$file1\"\n\tbut not file \"$file2\"\n"; } else { print "****Feature \"$feature_name\"\n\tappears in file \"$file2\"\n\tbut not file \"$file1\"\n"; } my $feature_group = $feature_name; # strip off the bin number if it exists $feature_group =~ s/ \[\d+\]//; if( defined $groups_that_have_missing{ $feature_group } ) { $groups_that_have_missing{ $feature_group }++; } else { $groups_that_have_missing{ $feature_group } = 1; } } else { # neither file had this particular feature --$feature_count; #print "Neither file had feature $feature_name\n"; } } print "\n\n********************************\nSUMMARY:\n"; print "Features examined: $feature_count\n"; print "Matching features: $matched\n"; print "Total unmatched features: $unmatched\n"; print "Features in one sig file but not the other: $missing\n"; my $group_count = 1; print "Groups with unmatched features:\n"; foreach my $group_name (keys %groups_that_have_unmatched) { print "\t$group_count.\t".$groups_that_have_unmatched{$group_name}."\t$group_name:\n"; #&Comparison_Of_Rank_Ordered_Group_Values( $group_name, \%file1_values, \%file2_values ); $group_count++; } $group_count = 1; print "Groups with missing features:\n"; foreach my $group_name (keys %groups_that_have_missing) { print "\t$group_count.\t".$groups_that_have_missing{$group_name}."\t$group_name:\n"; $group_count++; } return 1; } ################################################################# sub uniq { my %seen = (); my @r = (); foreach my $a (@_) { unless ($seen{$a}) { push @r, $a; $seen{$a} = 1; } } return @r; } ################################################################# sub feature_sort { my ($a_stem, $a_bin, $b_stem, $b_bin ); if( $a =~ /(.*) \[(\d+)\]/ ) { $a_stem = $1; $a_bin = $2; } if( $b =~ /(.*) \[(\d+)\]/ ) { $b_stem = $1; $b_bin = $2; } if ( $a_stem && $a_bin && $b_stem && $b_bin ) { if ($a_stem ne $b_stem ) { return $a cmp $b } else { return $a_bin <=> $b_bin; } } else { return $a cmp $b; } } ################################################################# sub ParseSigFile($$) { my $filename = shift; my $values_ref = shift; print "Parsing file \"$filename\"\n"; my $value; my $feature; my $line_count = 0; open( THE_FILE, $filename ) or die "can't open $filename"; while (<THE_FILE>) { ++$line_count; if( $line_count != 1 and $line_count != 2) { if( /^(-?\d+\.\d+) (.+)$/ ) { $value = $1; $feature = $2; chomp $feature; #print "\tline $line_count feature \"$feature\"; value $value\n"; if( defined $old_featurenames_mapped_to_new{ $feature } ) { $feature = $old_featurenames_mapped_to_new{ $feature }; } else { $unrecognized_feature_names{$feature} = 1; } if( defined $$values_ref{ $feature } ) { die "The feature $feature has already been defined for file $filename.\n"; } else { $$values_ref{ $feature } = $value; } } # end if file 1 line correctly pattern matches else { die "Error procesing $filename: line doesn't match pattern of <#.#####> <space> <feature name>:\nHere is offending line: $_\n"; } } # end processing file1 lines } # end if line_count close THE_FILE; print "Total lines read in from file \"$filename\" : $line_count\n"; } #end sub ################################################################# sub Comparison_Of_Rank_Ordered_Group_Values ($$$) { my $group_name = shift; my $file1_values_ref = shift; my $file2_values_ref = shift; my @file1_group_values; my @file2_group_values; my $feature_name; my $meta_escaped_group_name; foreach $feature_name ( keys %{ $file1_values_ref } ) { $meta_escaped_group_name = quotemeta($group_name); #print "*** groupname \"$group_name\" ... quotemeta: \"$meta_escaped_group_name\"\n"; if( $feature_name =~ /$meta_escaped_group_name/ ) { push @file1_group_values, $$file1_values_ref{$feature_name}; } } foreach $feature_name ( keys %{ $file2_values_ref } ) { $meta_escaped_group_name = quotemeta($group_name); if( $feature_name =~ /$meta_escaped_group_name/ ) { push @file2_group_values, $$file2_values_ref{$feature_name}; } } @file1_group_values = sort { $a <=> $b } @file1_group_values; @file2_group_values = sort { $a <=> $b } @file2_group_values; my $max = $#file1_group_values; my $min = $#file2_group_values; my $short = 2; if( $#file2_group_values > $max ) { $max = $#file2_group_values; $min = $#file1_group_values; $short = 1; } print "\nGroup analysis: $group_name:\n"; my $i; for( $i = 0; $i <= $min; ++$i ) { print $file1_group_values[$i] . "\t" . $file2_group_values[$i] . "\n"; } for( $i = ($min +1); $i <= $max; ++$i ) { if( $short == 2) { print $file1_group_values[$i] . "\t" . "N/A" . "\n"; } else { print "N/A" . "\t" . $file2_group_values[$i] . "\n"; } } }