# USAGE perl allelicBinPtrVAR.pl TrSNPCounts3.txt pCutoff minSNPcount [transcriptlist] use strict; open (IN,"<".$ARGV[0])||die "cannot open ".$ARGV[0]."\n"; #CUTOFFS my $pCutoff = $ARGV[1]; my $fSd = (4/5); #fraction same direction; my $dirTrRatio = (3/4); #required decimal SNP ratio to name it imprinted my $minSNPcount = $ARGV[2]; my $production = 1; #if defined then pBinom are recalculated for ratios my $optTrList = $ARGV[3]; #pBinom vs N table my %pn=( "0.01","7","0.001","10","0.0001","14","0.00001","17" ); my %optional; if ($optTrList){ open (OPTIONAL,"<".$optTrList)||die "cannot open ".$optTrList."\n"; while (){ ~m/^(\w+)\t(.+)$/; my ($tr, $data)=($1,$2); $optional{$tr}=$data; } close (OPTIONAL); } my %uRatios; #unique collection of ratios for which binomPs are calculated while (){ chomp; my @SNPratios; my ($tr,$trRatio,$exSNPratio)=split ("\t",$_); my ($aCount,$bCount)=split ("/",$trRatio); next if ((($aCount+$bCount)<$pn{$pCutoff})||((! $optional{$tr})&&($optTrList))); Uratio ($exSNPratio,\%uRatios,"-1"); } close (IN); if ($production){ open (OUT,">Uratios_".$ARGV[0]); print OUT "0\t",$pn{$pCutoff},"\t0.5\n"; for my $uRatio (keys %uRatios){ my @ratio =split (" ",$uRatio); print OUT @ratio[0],"\t",(@ratio[0]+@ratio[1]),"\t0.5\n"; } close (OUT); Rpbinom ("Uratios_".$ARGV[0]); #R calculates binomP } open (IN,"){ ~m/(\d+)\s(\d+)\s(\d+\.*\d*)\s(\d+\.*\d*e*\-*\d*)/; Uratio ($1."/".($2-$1),\%uRatios,$4); } close (IN); # Now analyse transcripts for SNP directions my (%dExons,%dTRs); #collect some statistics open (IN,"<".$ARGV[0]); open (OUT,">".$ARGV[0].".DIRpCutoff".$pCutoff."minSNPcount".$minSNPcount); while (){ chomp; my @SNPratios; my ($tr,$trRatio,$exSNPratio)=split ("\t",$_); my ($aCount,$bCount)=split ("/",$trRatio); my ($aSNP,$bSNP,$stddev,$stdem,%trSNPdirect) = trSNPdirect ($exSNPratio,\%uRatios); #abSNP number of SNPs hit by allele A and by allele B, stdev stdem over SNPs that meet p-value if ((($aCount<2) && ($bCount<2))||((! $optional{$tr})&&($ARGV[3]))){ print OUT $tr,"\t",$optional{$tr},"\ttoo less transcript-SNP coverage\t-\t",$aSNP,"\t",$bSNP,"\n" if ($optional{$tr}); next; } print $tr," ",$exSNPratio,"\n"; #my ($aSNP,$bSNP,%trSNPdirect) = trSNPdirect ($exSNPratio,\%uRatios); #abSNP number of SNPs hit by allele A and by allele B if (! %trSNPdirect){ #no SNPs with sufficient counts print OUT $tr,"\t",$optional{$tr},"\ttoo less coverage per SNP\t-\t",$aSNP,"\t",$bSNP,"\n" if ($optional{$tr});; print "to less counts according to pCutoff ",$pCutoff, "n=",$pn{$pCutoff},"\n"; next; } my ($trTOPdir,$trTOTcount); foreach my $direct (sort {$trSNPdirect{$b}<=>$trSNPdirect{$a}} keys %trSNPdirect){ $trTOPdir=$direct if (! $trTOPdir); $trTOTcount+=$trSNPdirect{$direct}; print "direct ",$direct," SNPs ",$trSNPdirect{$direct}," directions ",scalar (keys %trSNPdirect),"\t"; } if ($trTOTcount < $minSNPcount){ print "too less SNPs (< ",$minSNPcount,") to estimate direction"; print OUT $tr,"\t",$optional{$tr},"\ttoo less SNPs (< ",$minSNPcount,") to estimate direction\t-\t",$aSNP,"\t",$bSNP,"\n"; $dTRs{du}{tooLesSNPs}{na}++; next; } my $imprinted = checkDirTrRatio ($trTOPdir,$aCount,$bCount,$dirTrRatio); if (scalar (keys %trSNPdirect) == 1){ $dTRs{d}{$trTOPdir}{$imprinted}++; print "\ntrRatio is ",$trRatio," transcript direction: ",$trTOPdir," imprinting state: ",$imprinted,"\n\n"; print OUT $tr,"\t",$optional{$tr},"\t",$trTOPdir,"\t",$imprinted,"\t",$aSNP,"\t",$bSNP,"\t",$trTOTcount,"\t",$stdem,"\n" if ($optional{$tr}); next; } if (($trSNPdirect{$trTOPdir}/$trTOTcount)>=$fSd){ $dTRs{dl}{$trTOPdir}{$imprinted}++; print "\ntrRatio is ",$trRatio," transcript direction likely is: ",$trTOPdir," imprinting state: ",$imprinted,"\n\n"; print OUT $tr,"\t",$optional{$tr},"\t",$trTOPdir,"l\t",$imprinted,"\t",$aSNP,"\t",$bSNP,"\t",$trTOTcount,"\t",$stdem,"\n" if ($optional{$tr}); next; } print OUT $tr,"\t",$optional{$tr},"\tambiguous\t-\t",$aSNP,"\t",$bSNP,"\t",$trTOTcount,"\t",$stdem,"\n" if ($optional{$tr}); print "direction is ambiguous\n\n"; $dTRs{du}{ambiguous}{$imprinted}++; } close (IN); print "pCutoff: ",$pCutoff,"\n"; print "minimum number of informative SNPs on transcript: ",$minSNPcount,"\n"; print "fraction same direction: ",$fSd,"\n"; #fraction same direction print "minimum transcript SNP ratio to name it imprinted: ",($dirTrRatio/(1-$dirTrRatio)),":1\n"; print "pBinom are recalculated for ratios: ",$production,"\n"; printer (\%dTRs,"Transcripts"); sub checkDirTrRatio{ my ($dir,$aCount,$bCount,$dirTrRatio)=@_; my $trRatio=($aCount/($aCount+$bCount)); if ((($trRatio > 0.5) && ($dir eq 'B'))||(($trRatio < 0.5) && ($dir eq 'A'))) { #trRatio inconsistent with direction print "DEBUG $trRatio,$dir,$aCount,$bCount,$dirTrRatio\n"; return "trRatio inconsistent with pBinom SNP direction"; } if (abs($trRatio-0.5) >= ($dirTrRatio-0.5)) { #trRatio consistent with direction and suffcient ratio to name it imprinted if ($dir eq 'N'){ return "neutral"; } else{ return "imprinted"; } } else { if ($dir eq 'N'){ return "neutral"; } return "not imprinted due to low trRatio"; } } sub printer { my ($hash2print,$title)=@_; print $title,"\n"; foreach my $key1 (sort keys %{$hash2print}){ print $key1,"\n"; foreach my $key2 (sort keys %{$$hash2print{$key1}}){ print "\t",$key2,"\n"; foreach my $key3 (sort keys %{$$hash2print{$key1}{$key2}}){ print "\t",$key3,"\t",$$hash2print{$key1}{$key2}{$key3},"\n"; } } print "\n"; } } sub trSNPdirect { my ($ratios,$uRatios) = @_; my %trSNPdirect; my @dirSNPratios; my ($aSNP,$bSNP)=(0,0); foreach my $SNPperEx (split ("#",$ratios)){ my ($exon,$snpRatios) = split ("_",$SNPperEx); while ($snpRatios =~m/(\d+)\/(\d+)/g){ $aSNP++ if ($1>0); $bSNP++ if ($2>0); my @Scalls=sort {$a<=>$b} ($1,$2); my $p = 1; $p = $uRatios{@Scalls[0]." ".@Scalls[1]} if ($uRatios{@Scalls[0]." ".@Scalls[1]}); if (($1 > $2)&& ($p < $pCutoff)){ $trSNPdirect{A}++; push (@dirSNPratios,"$1/$2"); next; } if (($1 < $2) && ($p < $pCutoff)){ $trSNPdirect{B}++; push (@dirSNPratios,"$1/$2"); next; } if (($1+$2) >= $pn{$pCutoff}){ $trSNPdirect{N}++; push (@dirSNPratios,"$1/$2"); } } } my ($stddev, $stdem)=("-","-"); if (scalar(@dirSNPratios)){ my @dDirSNPratios= decRatio (join (" ",@dirSNPratios)); ($stddev, $stdem) = statistics (@dDirSNPratios); } return ($aSNP,$bSNP,$stddev,$stdem,%trSNPdirect); } sub Rpbinom { my ($file)=@_; open (R, ">pbnom.R"); print R "data<-read.table(\"".$file."\",colClasses=c(\"numeric\",\"numeric\",\"numeric\"))\n for (i in 1:nrow(data)) { data[i,\"V4\"]<- (pbinom (data[i,\"V1\"],data[i,\"V2\"],data[i,\"V3\"]))}\n write.table(data,file=\"".$file."P\",row.names=FALSE,col.names=FALSE)\n"; close (R); my $result= `R --no-save =$pn{$pCutoff}){ my @Scalls=sort {$a<=>$b} ($1,$2); $uRatios{@Scalls[0]." ".@Scalls[1]}=$pbinom; } } } sub decRatio { my ($ratios) = @_; my @dSNPratios; while ($ratios =~m/(\d+)\/(\d+)/g){ my @Scalls=sort {$a<=>$b} ($1,$2); my $dSNPratio = sprintf ("%.3f", @Scalls[1]/(@Scalls[0]+@Scalls[1])); $dSNPratio = (1-$dSNPratio) if ($2 > $1); push (@dSNPratios,$dSNPratio); } return @dSNPratios; } sub statistics { my (@numbers) = @_; my $stddev = standard_deviation(@numbers); my $stdem = $stddev / sqrt(scalar @numbers); return (sprintf ("%.3f",$stddev), sprintf ("%.3f", $stdem)); } sub standard_deviation { my(@numbers) = @_; #Prevent division by 0 error in case you get junk data return undef unless(scalar(@numbers)); # Step 1, find the mean of the numbers my $total1 = 0; foreach my $num (@numbers) { $total1 += $num; } my $mean1 = $total1 / (scalar @numbers); # Step 2, find the mean of the squares of the differences # between each number and the mean my $total2 = 0; foreach my $num (@numbers) { $total2 += ($mean1-$num)**2; } my $mean2 = $total2 / (scalar @numbers);#this is not (n-1) method used in office aplications # Step 3, standard deviation is the square root of the # above mean my $std_dev = sqrt($mean2); return $std_dev; }