#!/usr/bin/perl -w
#
# Given a 'results' dir from a bayes-10pcv-driver run,
# graph a histogram of the score ranges using GNUPlot.
#
# usage: graph-bayes-histogram [--buckets=100] ...dir/results .../dir2/results ...
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>


use Getopt::Long;
use strict;
use warnings;

# Main driver: parse options, build the bucket edges, let dofile() write
# one gnuplot data set per results directory into plot.data, then pipe a
# plot script into gnuplot which renders graph.png.

my $usage = "usage: $0 [--buckets=N] dir/results [dir2/results ...]\n";

# Kept as a package variable (the name Getopt::Long would have picked
# implicitly), but now bound explicitly so the destination is visible.
our $opt_buckets;
GetOptions("buckets=i" => \$opt_buckets) or die $usage;

my $buckets = $opt_buckets || 100;
# A negative count would make $step negative and the edge list nonsensical.
$buckets > 0 or die "$0: --buckets must be a positive integer\n";
@ARGV or die $usage;

my $range_lo = 0.0;
my $range_hi = 1.0;

# State shared with dofile(): per-bucket counters for spam and ham, the
# bucket width, and the ascending list of lower bucket edges.
my %bux_sp = ();
my %bux_ns = ();

my $step = ($range_hi - $range_lo) / $buckets;

# Derive every edge from its index instead of repeatedly adding $step:
# accumulated rounding error otherwise decides whether the final edge at
# $range_hi exists at all. This form yields exactly $range_hi for the
# last edge.
my @buckets = map { $range_lo + ($range_hi - $range_lo) * $_ / $buckets }
                  (0 .. $buckets);

open(DATA, '>', 'plot.data') or die "cannot write plot.data: $!";
my @dirs = ();
foreach my $dir (@ARGV) {
  # start every data set from zeroed counters
  %bux_ns = map { $_ => 0 } @buckets;
  %bux_sp = map { $_ => 0 } @buckets;

  # the data set index in plot.data is the position of $dir in @dirs
  dofile(scalar(@dirs), "$dir/spam_all.log", "$dir/nonspam_all.log");
  push (@dirs, $dir);
}
# buffered write errors only surface at close time
close(DATA) or die "cannot finish writing plot.data: $!";

open (OUT, "| gnuplot -") or die "cannot run gnuplot: $!";

# Print to the pipe explicitly rather than select()ing it, so the default
# output handle is left alone.
print OUT "
set xlabel 'P(spam)'
set ylabel 'Frequency'
set logscale y 2
set xrange [0.0:1.01]
set yrange []
set xtics 0,0.1,0.99
set terminal png crop
set out 'graph.png'

plot ";

# Two curves (ham, spam) per directory, each with its own line/point type.
# Indexing by position rather than by directory name keeps the right data
# set attached to each curve even if a directory is listed twice.
my @text = ();
my $t = 0;
foreach my $s (0 .. $#dirs) {
  my $dir = $dirs[$s];
  $t++; push (@text, "  'plot.data' using 1:2 index $s with linesp lt $t pt $t t 'ham, $dir'");
  $t++; push (@text, "  'plot.data' using 1:3 index $s with linesp lt $t pt $t t 'spam, $dir'");
}

print OUT join(", \\\n", @text);
print OUT "\n";

# Closing a pipe waits for the child; a false return with $! unset means
# gnuplot itself exited with a non-zero status (available in $?).
close(OUT)
  or die($! ? "error closing pipe to gnuplot: $!\n"
            : "gnuplot exited with status " . ($? >> 8) . "\n");
exit;


# dofile($setcount, $spam, $nonspam)
#
# Tally the bayes scores found in one pair of log files into the file-level
# %bux_sp / %bux_ns counters, then append them to the DATA filehandle as one
# gnuplot data set: a "x ham spam" line per bucket, followed by two blank
# lines.
#
#   $setcount  zero-based number of this data set; each set's x positions
#              are shifted by 0.001 * $setcount
#   $spam      path of the spam log
#   $nonspam   path of the nonspam (ham) log
#
# Relies on file-level state prepared by the caller: @buckets (lower bucket
# edges, in ascending order), $step (bucket width), the two counter hashes,
# and an open DATA handle.
#
# Dies if either log cannot be opened. Scores that fall outside the bucket
# range are skipped and reported once per file on STDERR.
sub dofile {
  my ($setcount, $spam, $nonspam) = @_;

  foreach my $file ($spam, $nonspam) {
    # 3-arg open with a lexical handle: the file name can no longer be
    # misread as part of the open mode.
    open (my $in, '<', $file) or die "Could not open file '$file': $!";

    my $isspam = ($file eq $spam) ? 1 : 0;
    my $skipped = 0;

    while (my $line = <$in>) {
      # only lines starting with '.' or 'Y' that carry a bayes= value
      $line =~ /^[.Y]\s.+bayes=([^\s,]+)/ or next;
      my $score = $1+0;

      # Pick the last bucket whose lower edge is <= $score. Comparing only
      # against lower edges avoids rounding gaps between "edge + step" and
      # the next edge; the upper limit is checked once, on the last bucket.
      my $bucket_id;
      if (@buckets && $score < $buckets[-1] + $step) {
        foreach my $bucket (@buckets) {
          last if $score < $bucket;
          $bucket_id = $bucket;
        }
      }

      # No matching bucket: skip the line instead of counting it under an
      # undefined hash key.
      if (!defined $bucket_id) {
        $skipped++;
        next;
      }

      if ($isspam) {
        $bux_sp{$bucket_id}++;
      } else {
        $bux_ns{$bucket_id}++;
      }
    }
    close ($in);

    warn "$file: ignored $skipped score(s) outside the bucket range\n"
      if $skipped;
  }

  my $sideoffset = 0.001*$setcount;
  foreach my $bucket (@buckets) {
    # fall back to 0 so a bucket the caller did not zero still prints a number
    my $ns = $bux_ns{$bucket} || 0;
    my $sp = $bux_sp{$bucket} || 0;
    my $xpos = $bucket + $sideoffset;
    print DATA "$xpos $ns $sp\n";
  }
  # two blank lines end the data set
  print DATA "\n\n";
}

