# USAGE perl allelicBinPtrVAR.pl TrSNPCounts3.txt pCutoff minSNPcount [transcriptlist]
use strict;
open (IN,"<".$ARGV[0])||die "cannot open ".$ARGV[0]."\n";

#CUTOFFS
my $pCutoff = $ARGV[1];
my $fSd = (4/5); #fraction same direction;
my $dirTrRatio = (3/4); #required decimal SNP ratio to name it imprinted
my $minSNPcount = $ARGV[2];
my $production = 1; #if defined then pBinom are recalculated for ratios
my $optTrList = $ARGV[3];

#pBinom vs N table
my %pn=( "0.01","7","0.001","10","0.0001","14","0.00001","17" );

my %optional;
if ($optTrList){
    open (OPTIONAL,"<".$optTrList)||die "cannot open ".$optTrList."\n";
    while (<OPTIONAL>){
	~m/^(\w+)\t(.+)$/;
	my ($tr, $data)=($1,$2);
	$optional{$tr}=$data;
    }
    close (OPTIONAL);
}
my %uRatios; #unique collection of ratios for which binomPs are calculated
while (<IN>){
    chomp;
    my @SNPratios;
    my ($tr,$trRatio,$exSNPratio)=split ("\t",$_);
    my ($aCount,$bCount)=split ("/",$trRatio);
    next if ((($aCount+$bCount)<$pn{$pCutoff})||((! $optional{$tr})&&($optTrList)));
    Uratio ($exSNPratio,\%uRatios,"-1");
}
close (IN);
if ($production){
    open (OUT,">Uratios_".$ARGV[0]);
    print OUT "0\t",$pn{$pCutoff},"\t0.5\n";
    for my $uRatio (keys %uRatios){
	my @ratio =split (" ",$uRatio);
	print OUT @ratio[0],"\t",(@ratio[0]+@ratio[1]),"\t0.5\n";
    }
    close (OUT);
    
    Rpbinom ("Uratios_".$ARGV[0]); #R calculates binomP
}
open (IN,"<Uratios_".$ARGV[0]."P"); #Add binomP as values to hash of Uratios
while (<IN>){
    ~m/(\d+)\s(\d+)\s(\d+\.*\d*)\s(\d+\.*\d*e*\-*\d*)/;
    Uratio ($1."/".($2-$1),\%uRatios,$4);
} 
close (IN);

# Now analyse transcripts for SNP directions
my (%dExons,%dTRs); #collect some statistics
open (IN,"<".$ARGV[0]);
open (OUT,">".$ARGV[0].".DIRpCutoff".$pCutoff."minSNPcount".$minSNPcount);
while (<IN>){
    chomp;
    my @SNPratios;
    my ($tr,$trRatio,$exSNPratio)=split ("\t",$_);
    my ($aCount,$bCount)=split ("/",$trRatio);
    my ($aSNP,$bSNP,$stddev,$stdem,%trSNPdirect) = trSNPdirect ($exSNPratio,\%uRatios); #abSNP number of SNPs hit by allele A and by allele B, stdev stdem over SNPs that meet p-value 
    if ((($aCount<2) && ($bCount<2))||((! $optional{$tr})&&($ARGV[3]))){
	print OUT $tr,"\t",$optional{$tr},"\ttoo less transcript-SNP coverage\t-\t",$aSNP,"\t",$bSNP,"\n" if ($optional{$tr});
	next;
    }
    print $tr," ",$exSNPratio,"\n";
    #my ($aSNP,$bSNP,%trSNPdirect) = trSNPdirect ($exSNPratio,\%uRatios); #abSNP number of SNPs hit by allele A and by allele B

    if (! %trSNPdirect){ #no SNPs with sufficient counts
	print OUT $tr,"\t",$optional{$tr},"\ttoo less coverage per SNP\t-\t",$aSNP,"\t",$bSNP,"\n" if ($optional{$tr});;
	print "to less counts according to pCutoff ",$pCutoff, "n=",$pn{$pCutoff},"\n";
	next;
    }
    my ($trTOPdir,$trTOTcount);
    foreach my $direct (sort {$trSNPdirect{$b}<=>$trSNPdirect{$a}} keys %trSNPdirect){
	$trTOPdir=$direct if (! $trTOPdir);
	$trTOTcount+=$trSNPdirect{$direct};
	print "direct ",$direct," SNPs ",$trSNPdirect{$direct}," directions ",scalar (keys %trSNPdirect),"\t";
    }
    if ($trTOTcount < $minSNPcount){
	print "too less SNPs (< ",$minSNPcount,") to estimate direction";
	print OUT $tr,"\t",$optional{$tr},"\ttoo less SNPs (< ",$minSNPcount,") to estimate direction\t-\t",$aSNP,"\t",$bSNP,"\n";
	$dTRs{du}{tooLesSNPs}{na}++;
	next;
	}
    my $imprinted = checkDirTrRatio ($trTOPdir,$aCount,$bCount,$dirTrRatio);    
    if (scalar (keys %trSNPdirect) == 1){
	$dTRs{d}{$trTOPdir}{$imprinted}++;
	print "\ntrRatio is ",$trRatio," transcript direction: ",$trTOPdir," imprinting state: ",$imprinted,"\n\n";
	print OUT $tr,"\t",$optional{$tr},"\t",$trTOPdir,"\t",$imprinted,"\t",$aSNP,"\t",$bSNP,"\t",$trTOTcount,"\t",$stdem,"\n" if ($optional{$tr});
	next;
    }
    if (($trSNPdirect{$trTOPdir}/$trTOTcount)>=$fSd){
	$dTRs{dl}{$trTOPdir}{$imprinted}++;
	print "\ntrRatio is ",$trRatio," transcript direction likely is: ",$trTOPdir," imprinting state: ",$imprinted,"\n\n";
	print OUT $tr,"\t",$optional{$tr},"\t",$trTOPdir,"l\t",$imprinted,"\t",$aSNP,"\t",$bSNP,"\t",$trTOTcount,"\t",$stdem,"\n" if ($optional{$tr});
	next;
    }
    print OUT $tr,"\t",$optional{$tr},"\tambiguous\t-\t",$aSNP,"\t",$bSNP,"\t",$trTOTcount,"\t",$stdem,"\n" if ($optional{$tr});
    print "direction is ambiguous\n\n";
    $dTRs{du}{ambiguous}{$imprinted}++;
}
close (IN);

print "pCutoff: ",$pCutoff,"\n";
print "minimum number of informative SNPs on transcript: ",$minSNPcount,"\n";
print "fraction same direction: ",$fSd,"\n"; #fraction same direction
print "minimum transcript SNP ratio to name it imprinted: ",($dirTrRatio/(1-$dirTrRatio)),":1\n";
print "pBinom are recalculated for ratios: ",$production,"\n"; 
printer (\%dTRs,"Transcripts");


sub checkDirTrRatio{
    my ($dir,$aCount,$bCount,$dirTrRatio)=@_;
    my $trRatio=($aCount/($aCount+$bCount));
    if ((($trRatio > 0.5) && ($dir eq 'B'))||(($trRatio < 0.5) && ($dir eq 'A'))) { #trRatio inconsistent with direction
	print "DEBUG $trRatio,$dir,$aCount,$bCount,$dirTrRatio\n";
	return "trRatio inconsistent with pBinom SNP direction";
    }

    if (abs($trRatio-0.5) >= ($dirTrRatio-0.5)) { #trRatio consistent with direction and suffcient ratio to name it imprinted
	if ($dir eq 'N'){
	    return "neutral";
	}
	else{
	    return "imprinted";
	}
    }
    else {
	if ($dir eq 'N'){
            return "neutral";
        }
	return "not imprinted due to low trRatio";
    }
}

sub printer {
    my ($hash2print,$title)=@_;
    print $title,"\n";
    foreach my $key1 (sort keys %{$hash2print}){
	print $key1,"\n";
	foreach my $key2 (sort keys %{$$hash2print{$key1}}){
	    print "\t",$key2,"\n";
	    foreach my $key3 (sort keys %{$$hash2print{$key1}{$key2}}){
		print "\t",$key3,"\t",$$hash2print{$key1}{$key2}{$key3},"\n";
	    }
	}
	print "\n";
    }
}

sub trSNPdirect {
    my ($ratios,$uRatios) = @_;
    my %trSNPdirect;
    my @dirSNPratios;
    my ($aSNP,$bSNP)=(0,0);
    foreach my $SNPperEx (split ("#",$ratios)){
	my ($exon,$snpRatios) = split ("_",$SNPperEx);
	while ($snpRatios =~m/(\d+)\/(\d+)/g){
	    $aSNP++ if ($1>0);
	    $bSNP++ if ($2>0);
	    my @Scalls=sort {$a<=>$b} ($1,$2);
	    my $p = 1;
	    $p = $uRatios{@Scalls[0]." ".@Scalls[1]} if ($uRatios{@Scalls[0]." ".@Scalls[1]});
	    if (($1 > $2)&& ($p < $pCutoff)){
		$trSNPdirect{A}++;
		push (@dirSNPratios,"$1/$2");
		next;
	    }
	    if (($1 < $2) && ($p < $pCutoff)){
		$trSNPdirect{B}++;
		push (@dirSNPratios,"$1/$2");
		next;
	    }
	    if (($1+$2) >= $pn{$pCutoff}){
		$trSNPdirect{N}++;
		push (@dirSNPratios,"$1/$2");
	    }
	}
    }
    my ($stddev, $stdem)=("-","-");
    if (scalar(@dirSNPratios)){
	my @dDirSNPratios= decRatio (join (" ",@dirSNPratios));
	($stddev, $stdem) = statistics (@dDirSNPratios);
    }
    return ($aSNP,$bSNP,$stddev,$stdem,%trSNPdirect);
}



sub Rpbinom {
    my ($file)=@_;
    open (R, ">pbnom.R");
    print R "data<-read.table(\"".$file."\",colClasses=c(\"numeric\",\"numeric\",\"numeric\"))\n
    for (i in 1:nrow(data)) { data[i,\"V4\"]<- (pbinom (data[i,\"V1\"],data[i,\"V2\"],data[i,\"V3\"]))}\n
    write.table(data,file=\"".$file."P\",row.names=FALSE,col.names=FALSE)\n";
    close (R);
    my $result= `R --no-save <pbnom.R`;
}




sub Uratio {
    my ($ratios,$uRatios,$pbinom) = @_;
    while ($ratios =~m/(\d+)\/(\d+)/g){
	if (($1+$2)>=$pn{$pCutoff}){
	    my @Scalls=sort {$a<=>$b} ($1,$2);
	    $uRatios{@Scalls[0]." ".@Scalls[1]}=$pbinom;
	}
    }
}

sub decRatio {
    my ($ratios) = @_;
    my @dSNPratios;
    while ($ratios =~m/(\d+)\/(\d+)/g){
        my @Scalls=sort {$a<=>$b} ($1,$2);
	my $dSNPratio = sprintf ("%.3f", @Scalls[1]/(@Scalls[0]+@Scalls[1]));
	$dSNPratio = (1-$dSNPratio) if ($2 > $1);
	push (@dSNPratios,$dSNPratio);
    }
    return @dSNPratios;
}



sub statistics {
    my (@numbers) = @_;
    my $stddev = standard_deviation(@numbers);
    my $stdem = $stddev / sqrt(scalar @numbers);
    return (sprintf ("%.3f",$stddev), sprintf ("%.3f", $stdem));
}

sub standard_deviation {
    my(@numbers) = @_;
    #Prevent division by 0 error in case you get junk data                                                                                                                                           
    return undef unless(scalar(@numbers));

    # Step 1, find the mean of the numbers                                                                                                                                                           
    my $total1 = 0;
    foreach my $num (@numbers) {
	$total1 += $num;
    }
    my $mean1 = $total1 / (scalar @numbers);

    # Step 2, find the mean of the squares of the differences                                                                                                                                        
    # between each number and the mean                                                                                                                                                               
    my $total2 = 0;
    foreach my $num (@numbers) {
	$total2 += ($mean1-$num)**2;
    }
    my $mean2 = $total2 / (scalar @numbers);#this is not (n-1) method used in office aplications                                                                                                     

    # Step 3, standard deviation is the square root of the                                                                                                                                           
    # above mean                                                                                                                                                                                     
    my $std_dev = sqrt($mean2);
    return $std_dev;
}
