Create probability density points from html.pl
Jump to navigation
Jump to search
#!/usr/bin/perl use strict; use warnings; require HTML::Tree; require Statistics::Descriptive; require Statistics::KernelEstimation; use Getopt::Long; my $num_bins = undef; #my $bin_width = undef; my $normalize = undef; my $num_classes = undef; my $flags = undef; my $bandwidth = undef; my $input_file = undef; my $interp_val_dump_file = undef; GetOptions( "bins=i" => \$num_bins, # "window_size=f" => \$bin_width, "normalize=s" => \$normalize, "num_classes=i" => \$num_classes, "flags=i" => \$flags, "bandwidth=s" => \$bandwidth, "input_file=s" => \$input_file, "dump_interp_vals=s" => \$interp_val_dump_file ); print "Histogram will be created with number of bins: $num_bins\n" if( defined $num_bins ); #print "Window size : $bin_width\n" if( defined $bin_width ); print "Age scores will be normalized\n" if( defined $normalize ); my %results_hash; my $min = -1; my $max = -1; my $interpolated_value; my $predicted_class; my $split_number; my $normalized_distances; if( !defined $input_file ) { # Takes a wndchrm output html file as input. # HTML file should have been procured as part of a wndchrm test operation # where the operation derives interpolated values # This code parses html output file, and counts how many images in the test set were # sorted into the given bins. my $output_file = shift; print "Loading $output_file\n"; my $tree = HTML::TreeBuilder->new_from_file( $output_file ); print "Done loading $output_file\n"; #$tree->dump; my @table_elements = $tree->look_down("_tag", "TABLE", sub { if( defined $_[0]->attr("ID") ) { if( $_[0]->attr("ID") =~ /IndividualImages_split/ ) { # print "Got one!\n"; return 1; } } # print "DON' GOT ONE!!\n"; return 0; } ); die "Couldn't find the test results table element in file $output_file\n" if( !@table_elements ); # print "Number of splits found: $#table_elements\n"; # step one: figure out the number of classes if( !defined $num_classes ) { my $class_structure_elem = $tree->look_down("_tag", "table"); die "Couldn't derive number of classes from html file $output_file.\n" if( !$class_structure_elem ); # print $class_structure_elem->as_text . "\n"; my $first_row = $class_structure_elem->look_down("_tag", "tr"); #grab the first one it sees die "Couldn't derive number of classes from html file $output_file.\n" if( !$first_row ); # print $first_row->as_text . "\n"; my @class_rows = $first_row->look_down("_tag", "td"); $num_classes = $#class_rows - 1; } print "Number of classes used: $num_classes\n"; my @rows; my @row; # my %results_hash; my $val; my $img_link_element; my $fullpath; my $filename; my $actual_class; my $DEBUG1 = 0; #my $min = -1; #my $max = -1; my $image_column; foreach my $split_table_element (@table_elements) { @rows = $split_table_element->look_down("_tag", "tr"); # print "Parsing $output_file, " . $split_table_element->attr("ID") ."\n"; if( $split_table_element->attr("ID") =~ /IndividualImages_split(\d+)/ ) { $split_number = $1; } else { $split_number = -1; } # Parse the first "Caption" row @row = $rows[0]->look_down("_tag", "td"); my $caption_text = $row[$#row]->as_text; if( $caption_text =~ /Most similar image/ ) { $image_column = $#row - 1; } else { $image_column= $#row; } if( defined $flags && $flags =~ /1/ ) { # weird wndchrm report discrepancy, use second row when column headers don't match up. @row = $rows[1]->look_down("_tag", "td"); $image_column= $#row; } if( defined $flags && $flags =~ /2/ ) { # weird wndchrm report discrepancy, use second row when column headers don't match up. @row = $rows[1]->look_down("_tag", "td"); $image_column= $#row - 1; } print "Image column is $image_column\n\n" if( $DEBUG1 ); # The first row is the heading row, so skip it by starting at 1 instead of 0 for( my $i = 1; $i <= $#rows; $i++) { $val = 0; @row = (); $img_link_element = undef; @row = $rows[$i]->look_down("_tag","td"); if( $DEBUG1 ) { foreach (@row) { print " " . $_->as_text; } print "\n"; } $img_link_element = $row[$image_column]->look_down( "_tag", "A" ); $fullpath = $img_link_element->attr("HREF"); if( $fullpath =~ /\S*\/(\S+)/ ) { $filename = $1; # print "\tFound file $filename\n"; $normalized_distances = ""; for( my $j = 3; $j <= ( 3 + $num_classes ); $j++ ) { $normalized_distances .= $row[$j]->as_text . " "; } $actual_class = $row[ $image_column - 4 ]->as_text; $predicted_class = $row[ $image_column - 3 ]->as_text; $interpolated_value = $row[ $image_column - 1 ]->as_text; if( $min == -1 ) { $min = $interpolated_value; } $min = $interpolated_value if( $interpolated_value < $min ); if( $max == -1 ) { $max = $interpolated_value; } $max = $interpolated_value if( $interpolated_value > $max ); print "\t\tactual: $actual_class, predicted: $predicted_class, interp val: $interpolated_value\n" if( $DEBUG1 ); push @{ $results_hash{ $actual_class }->{ $filename } }, { "split_num" => $split_number, "val" => $interpolated_value, "class" => $predicted_class, "dists" => $normalized_distances }; } } } $tree->delete(); } else # if input file is specified, we're skipping over the wndchrm html part and inputting # marginal probabilities directly, in the form of some tab separated file { open INPUT, $input_file; my $num_images_per_class = 180; # my $num_classes = 7; my $current_class; my $fake_filename; my @normalized_distances_ary; my @coeff_ary = (3.19, 5.53, 11.42, 13.01, 19.22, 21.40, 25.12); my $count = 0; while (<INPUT>) { @normalized_distances_ary = (); $interpolated_value = 0; $current_class = "class" . int( $count / $num_images_per_class ); $fake_filename = "image" . $count; @normalized_distances_ary = split /\s+/, $_; die "num classes ($num_classes) does not equal num distances ($#normalized_distances_ary)\n" if( $#normalized_distances_ary != $num_classes -1 ); for( my $j = 0; $j <= $#normalized_distances_ary; $j++) { print "\t\tinterp val $interpolated_value += $normalized_distances_ary[$j] * $coeff_ary[$j]\n"; $interpolated_value += $normalized_distances_ary[$j] * $coeff_ary[$j]; } print "$current_class $fake_filename interp val $interpolated_value\n"; if( $min == -1 ) { $min = $interpolated_value; } $min = $interpolated_value if( $interpolated_value < $min ); if( $max == -1 ) { $max = $interpolated_value; } $max = $interpolated_value if( $interpolated_value > $max ); push @{ $results_hash{ $current_class }->{ $fake_filename} }, { "split_num" => 1, "dists" => $_, "val" => $interpolated_value, "predicted_class" => -1 }; $count++; } close INPUT; } my $range = $max - $min; my @interp_vals; my $stat = Statistics::Descriptive::Sparse->new(); my $norm_stat = Statistics::Descriptive::Sparse->new(); my $class_stat = Statistics::Descriptive::Full->new(); my $ks_class_stat = Statistics::KernelEstimation->new(); my $report; my $histogram; my $PDF; my $distribution_hash; open( INTERP_VAL_DUMP_FILE, '>', $interp_val_dump_file ) if( defined $interp_val_dump_file ); #print "RESULTS:\n"; foreach my $class ( sort keys %results_hash ) { #print "\tClass \"$class\"\n"; foreach my $file (keys %{ $results_hash{ $class } }) { @interp_vals = (); # print "\t\tFile \"$file\"\n"; $stat->clear; $norm_stat->clear; foreach my $hash_ref ( @{ $results_hash{ $class }->{ $file } } ) { $interpolated_value = $hash_ref->{ "val" }; $stat->add_data( $interpolated_value ); $norm_stat->add_data( ( $interpolated_value - $min ) / $range ); $predicted_class = $hash_ref->{ "class" }; $split_number = $hash_ref->{ "split_num" }; $normalized_distances = $hash_ref->{ "dists" }; # printf "\t\t\tsplit %2.d - predicted: $predicted_class, actual: $class. Normalized dists: ( $normalized_distances ) Interp val: $interpolated_value\n", $split_number; } # end iterating over each image's split result # printf "\t\t\t---> Tested %d times, mean %.3f, std dev %.3f. Normalized to [0,1]: mean: %.4f, std_dev: %.4f\n\n", $stat->count, $stat->mean, $stat->standard_deviation, $norm_stat->mean, $norm_stat->standard_deviation; if( defined $normalize ) { $class_stat->add_data( $norm_stat->mean ); $ks_class_stat->add_data( $norm_stat->mean ); print INTERP_VAL_DUMP_FILE $class . "," . $norm_stat->mean . "\n" if( defined $interp_val_dump_file ); } else { $class_stat->add_data( $stat->mean ); $ks_class_stat->add_data( $stat->mean ); print INTERP_VAL_DUMP_FILE $class . "," . $stat->mean . "\n" if( defined $interp_val_dump_file ); } } # end iterating over each image $class_stat->sort_data; $report .= sprintf "Class $class: count= %3d, min=%.4f, max=%.4f, mean=%.4f, std dev=%.4f\n", $class_stat->count, $class_stat->min, $class_stat->max, $class_stat->mean, $class_stat->standard_deviation; # Here is the old code which simply constructs a histogram if( defined $num_bins ) { $distribution_hash = $class_stat->frequency_distribution_ref($num_bins); $histogram .= "Class $class\n"; foreach( sort {$a <=> $b} keys %$distribution_hash ) { $histogram .= "$_\t$distribution_hash->{$_}\n"; } # End generation of histogram for this class } else { # Begin new code generating PDF using kernal smoothing for this class print "\nClass $class\n"; $PDF .= "\nClass $class\n"; my $bw; my $obw; # always output default bandwidth, even if it's specified on command line $bw = $ks_class_stat->default_bandwidth(); print "\tDefault bandwidth is $bw\n"; if( defined $bandwidth && $bandwidth eq "optimal" ) { $obw = $ks_class_stat->optimal_bandwidth(); print "\tOptimal bandwidth is $obw\n"; } my ( $class_min, $class_max ) = $ks_class_stat->extended_range(); print "\tClass min: $class_min, Class max: $class_max\n"; for( my $x=$class_min; $x<=$class_max; $x+=($class_max-$class_min)/100 ) { if( !defined $bandwidth ) { # use default $PDF .= $x . "\t" . $ks_class_stat->pdf( $x, $bw ) . "\n"; } elsif( defined $bandwidth && $bandwidth eq "optimal" ) { $PDF .= $x . "\t" . $ks_class_stat->pdf( $x, $obw ) . "\n"; } else { $PDF .= $x . "\t" . $ks_class_stat->pdf( $x, $bandwidth ) . "\n"; } } # end generation of PDF using kernal smoothing for this class } # reset statistical vehicles for the next class's data $class_stat->clear; $distribution_hash = undef; my $ks_class_stat_ref = ref($ks_class_stat); undef $ks_class_stat; $ks_class_stat = new $ks_class_stat_ref; } #end iterating over all classes close INTERP_VAL_DUMP_FILE if( defined $interp_val_dump_file ); print "\nGlobal min val: $min, global max val: $max\n"; print "\n\n***********REPORT********\n\n" . $report . "\n"; if( defined $num_bins ) { print "\n\n**********PLOT THESE********\n\n" . $histogram . "\n\n\n"; } else { print "\n\n**********PLOT THESE********\n\n" . $PDF . "\n\n\n"; }