Create master dend file

From Colettapedia
Jump to navigation Jump to search
#!/usr/bin/perl
#
# Phase 3.pl
# Custom script written for John Delaney's pairwise wndchrm distance gene experiment
#
# Pseudocode:
# 1. scoop up all the dendfiles
# 2. Parse the filename to ascertain which pairwise experiment it is
# 3. Parse the file itself for the information
# 4. Perform the calculation, and store in an orderly way
# 5. After all files have been parsed, generate master dendfile from data

use strict;
use warnings;

# Steps 1-4 of the pseudocode: find every dend file below the current
# directory, work out which gene pair each one describes, and record that
# pair's distance symmetrically in %class_info.
my @dendfile_list = split( "\n", `find . -name "*dend_file*"` );
my %class_info; # Two dimensional hash stores the names and pairwise distances

my $DEBUG0 = 1; # true => print progress messages to STDOUT

for my $dend_path (@dendfile_list) {
	# Get rid of path
	( my $dend_file = $dend_path ) =~ s/.+\///g;

	# The find pattern above also matches the master file this script writes.
	# Skip it so that re-running in the same directory doesn't die on the
	# naming-convention check below.
	next if $dend_file eq 'master_dend_file.txt';

	my ( $gene1, $gene2 ) = _genes_from_dend_filename($dend_file);
	print "dend file $dend_file has genes $gene1 and $gene2.\n" if( $DEBUG0 );

	# Store the distance under both key orders so the matrix is symmetric.
	my $distance = _pairwise_distance_from_dend_file($dend_path);
	$class_info{$gene1}{$gene2} = $distance;
	$class_info{$gene2}{$gene1} = $distance;
}

# Pull the two gene names out of a dend file's base name.
#   $dend_file - file name with any directory part already removed
# Returns the list ( gene1, gene2 ).
# Dies if the name doesn't contain "_<gene1>-D_VS_<gene2>-D_dend_file.txt".
sub _genes_from_dend_filename {
	my ($dend_file) = @_;
	if( $dend_file =~ /_(.+)-D_VS_(.+)-D_dend_file\.txt/ ) {
		return ( $1, $2 );
	}
	die "$dend_file doesn't follow naming convention of \"...<gene1>-D_VS_<gene2>-D\"\n";
}

# Read one pairwise dend file and reduce it to a single distance.
#   $dend_path - path of the dend file to read
# Returns sqrt(d1^2 + d2^2), where d1 is the third number of matrix row one
# and d2 is the fourth number of matrix row two.
# Dies if the file can't be opened, if either row doesn't have the expected
# layout, or if the file ends before both rows have been read.
sub _pairwise_distance_from_dend_file {
	my ($dend_path) = @_;

	# Three-argument open: the path is never interpreted as a mode or pipe.
	open my $dend_fh, '<', $dend_path or die "Can't open $dend_path: $!\n";

	my $distance1;
	my $distance2;
	my $count = 0;
	while ( my $line = <$dend_fh> ) {
		$count++;
		next if $count == 1; # skip first line in file, we know it's a "4"
		if( $count == 2 ) {
			# Arrangement should be Genename   #.##   #.##   #.##   #.##
			# Capture the third number
			if( $line =~ /\S+\s+\S+\s+\S+\s+(\S+)\s+\S+/ ) {
				$distance1 = $1;
				print "\tPulled $distance1 from row one.\n" if( $DEBUG0 );
			} else {
				die "Wasn't able to pull the third number from row 1 of dendrogram.\n";
			}
		} elsif ( $count == 3 ) {
			# Here we capture the fourth number
			if( $line =~ /\S+\s+\S+\s+\S+\s+\S+\s+(\S+)/ ) {
				$distance2 = $1;
				print "\tPulled $distance2 from row two.\n" if( $DEBUG0 );
			} else {
				die "Wasn't able to pull the fourth number from row 2 of dendrogram.\n";
			}
			last; # nothing after row two is used
		}
	}
	close $dend_fh;

	# A truncated file would otherwise be recorded as a distance of 0.
	if( !defined $distance1 || !defined $distance2 ) {
		die "$dend_path has fewer than three lines; can't pull both distances.\n";
	}

	return sqrt( $distance1*$distance1 + $distance2*$distance2 );
}
# Step 5 of the pseudocode: after all files have been parsed, generate the
# master dend file from the collected pairwise distances.
_write_master_dend_file( 'master_dend_file.txt', \%class_info );

# Write a square, tab-separated distance matrix.
#   $output_path  - file to create (an existing file is overwritten)
#   $distance_for - reference to a two-level hash: {gene}{gene} => distance
# Layout: the first line is the number of genes (rows), mirroring the count
# line that heads each input dend file. Then comes one row per gene in sorted
# order: the gene name followed by one "%0.3f" distance per gene, each
# followed by a tab. Pairs with no recorded distance (including a gene
# against itself) are written as 0.000.
# Dies if the file can't be opened or the final close reports an error.
sub _write_master_dend_file {
	my ( $output_path, $distance_for ) = @_;

	open my $output_fh, '>', $output_path or die "Can't open output file: $!\n";

	my @master_gene_list = sort keys %{$distance_for};

	# scalar(@list) is the element count; $#list is the last index (count - 1).
	print {$output_fh} scalar(@master_gene_list) . "\n";

	foreach my $row ( @master_gene_list ) {
		print {$output_fh} $row . "\t";
		foreach my $col ( @master_gene_list ) {
			if( defined $distance_for->{$row}{$col} ) {
				printf {$output_fh} "%0.3f\t", $distance_for->{$row}{$col};
			} else {
				print {$output_fh} "0.000\t";
			}
		}
		print {$output_fh} "\n";
	}

	# Buffered write errors only surface at close.
	close $output_fh or die "Can't close output file: $!\n";
	return;
}