Mds.pl

From Colettapedia
Jump to navigation Jump to search
#!/usr/bin/perl

use strict;
use warnings;

sub CalculateMDS(\@);
sub PrintOutStatistics(\@);
sub main();

#run program
&main;


############################################################


sub main {

	open IN, "/Users/chris/projects/John_gene_MDS/average_class_probabilities.tsv" or die;

	my $linecount = 0;
	my @mds_result_array;
	my $line;
	my @column_averages = ();

	# read in file line by line
	while( $line = <IN>) {
		++$linecount;
		# skip introductory lines
		if( $linecount < 5 ){
			next;
		}
		# strip off the carriage return at the end of the line
		chomp $line;

		my @columns = split( "\t", $line );

		if( $#columns < 16 ) {
			die "incorrect number of columns in avg class prob file. check file and try again.\n";
		}

		# column 0 is a label, get rid of it
		shift @columns;

		# add each column to a master list
		for( my $i = 0; $i <= $#columns; $i++ ) {
			$column_averages[$i] += $columns[$i];
		}
		
		# calculate the morphological divergence score and store it in an array
		push @mds_result_array, &CalculateMDS( \@columns );
	}
	close IN;

	
	&PrintOutStatistics( \@mds_result_array );

	for(my $i = 0; $i <= $#column_averages; $i++) {
		$column_averages[$i] /= $linecount - 5;
	}

	print "method 2:\n";
	&CalculateMDS( \@column_averages );

	return 1;
}

###################################################################

sub CalculateMDS(\@) {
	
	# adding doxycyclin keeps ESC in embryonic state, therefore dox+ is gene OFF
	
	my @columns = @{ $_[0] };
	my @gene1_OFF = (); # line 1 - gene1+dox
	my @gene1_ON = (); # line 2 - gene1-dox
	my @gene2_OFF = (); # line 3 - gene2+dox
	my @gene2_ON = (); #line 4 - gene2-dox

	$gene1_OFF[0] = $columns[0];
	$gene1_OFF[1] = $columns[1];
	$gene1_OFF[2] = $columns[2];
	$gene1_OFF[3] = $columns[3];
	
	$gene1_ON[0] = $columns[4];
	$gene1_ON[1] = $columns[5];
	$gene1_ON[2] = $columns[6];
	$gene1_ON[3] = $columns[7];
	
	$gene2_OFF[0] = $columns[8];
	$gene2_OFF[1] = $columns[9];
	$gene2_OFF[2] = $columns[10];
	$gene2_OFF[3] = $columns[11];

	$gene2_ON[0] = $columns[12];
	$gene2_ON[1] = $columns[13];
	$gene2_ON[2] = $columns[14];
	$gene2_ON[3] = $columns[15];

	# vectorON is line 4 - line 2 (gene2-dox - gene1-dox)
	my @vectorON = ( ($gene2_ON[0]-$gene1_ON[0]), ($gene2_ON[1]-$gene1_ON[1]), ($gene2_ON[2]-$gene1_ON[2]), ($gene2_ON[3]-$gene1_ON[3]) );
	# vectorOFF is line 3 - line 1 (gene2+dox - gene1+dox)
	my @vectorOFF = ( ($gene2_OFF[0]-$gene1_OFF[0]), ($gene2_OFF[1]-$gene1_OFF[1]), ($gene2_OFF[2]-$gene1_OFF[2]), ($gene2_OFF[3]-$gene1_OFF[3]) );
	my @cv = ( ($vectorON[0]-$vectorOFF[0]), ($vectorON[1]-$vectorOFF[1]), ($vectorON[2]-$vectorOFF[2]), ($vectorON[3]-$vectorOFF[3]) );
	print "\tresult vector (ON-OFF): "; foreach (@cv) { print $_ . ", "; } print "\n";
	my $correctedvector_MAG = sqrt( $cv[0]**2 + $cv[1]**2 + $cv[2]**2 + $cv[3]**2 );
	print "\tmagnitude of result vector: " . $correctedvector_MAG . "\n";
	return $correctedvector_MAG;
}
###################################################################

sub PrintOutStatistics(\@) {

	my @result_array = @{ $_[0] };
	my $mean = 0;
	foreach ( @result_array ) {
		$mean += $_;
	}

	$mean /= ($#result_array +1 );

	my $variance = 0;
	foreach ( @result_array ) {
		$variance += ($_ - $mean)**2;
	}

	$variance /= ($#result_array+1);

	my $std_dev = sqrt($variance);
	my $std_error = $std_dev / sqrt( $#result_array+1 );
# z-score for 95% interval is ~ 1.96
	my $confidence_interval = 1.95966 * $std_error;

	print "\n===============================\n";
	print "count\tmean\tvar\tstd dev\tstd err\tconfidence interval\n";
	printf( "%d\t%0.4f\t%0.4f\t%0.4f\t%0.4f\t%0.4f\n",  ($#result_array +1 ), $mean,
																										$variance, $std_dev, $std_error, $confidence_interval);
}

__END__