use strict;
my ($hitLine,$splicedLine,$tag,$id,$mapq,%mapq,$match,$snps,%snps,$subs,%snpsPerRead,%subs,@segSE,@stats,$segs,%segs); #$snps known subs, %snpsPerRead snps based on extra SNP-info
my ($score,$length,$rmInconsist,$totSnpCov,$minMapQual)=('[0-2]','35','','0','33');
my $basename=$ARGV[2];
$score=$ARGV[0] if ($ARGV[0]);
$length=$ARGV[3] if ($ARGV[3]);
$minMapQual=$ARGV[1] if ($ARGV[1]);

#open (OUT,">".$basename.".debugAndSNPS");
#open (STATS,">".$basename.".bed");
#open (ERR,">".$basename.".GSNAPERR");
open (STATS,">".$basename.".bedfull");

while (<STDIN>){

    if ($_=~m/^>([GATC]{$length})\t1\t[!-~]{$length}\t([!-?A-~]{1,255})\s*$/){
	($tag,$id,$match)=($1,$2,0);
	%snpsPerRead = ();
	@segSE = ();
	$hitLine=$_;
	$subs=0;
	next;
    }
    if (($_=~m/^\s([GATCgatc-]+)\t(\d+)..(\d+)\t([+-])(\w+):(\d+)\.\.(\d+).+sub\:(\d+)\+(\d+)\=\d+.+segs:(\d+),align_score:$score,mapq:(\d+)$/)&&($_!~m/term:/)&&($hitLine)){
	my ($refseq,$strand,$chrom,$start,$end,$knownsubs)=($1,$4,$5,$6,$7,$9);
	$subs+=$8;
	$segs=$10;
	$mapq=$11;

	if ((($1=~m/-/)&&($_!~m/splice/))||($mapq < $minMapQual)){ #Breakout if aligments pass that we dont want but meet check 
	    print OUT "ERR \n$id $1 $_";
	    $hitLine ="";
	    next;
	}

	if (consistCheck ($_,$knownsubs)){ #check inconsistent (to many) SNP_ids printed compared to reported number of SNPs
	    $hitLine ="";	
	    next;
	}
	
	#OK lets go on
	@stats=($id,$chrom,$start,$end,$strand);	

	while ($_ =~ m/(\d+)\@(\d+\w+)/g) {
	    $snpsPerRead{$1}=$2;
	}

	if ($segs==1){ #read is not spliced aligned so we can print now
	    printAndCount (\@stats,\%snpsPerRead);
	    $totSnpCov+=$knownsubs;
	    $subs{$subs}++;
	    $mapq{$subs}{$mapq}++;
	    print $hitLine,$_,"\n";
	    $hitLine ="";
	    next;
	}
	#for spliced alignments we store some data
	$snps=$knownsubs;
	@segSE=($start,$end);
	$segs{$start."_".$end}=$_;
	next;
    }
    
    if (($_=~m/^,[GATCgatc-]+\t\d+..\d+\t([+-])(\w+):(\d+)\.\.(\d+).+sub\:(\d+)\+(\d+)\=\d+.+splice/)&&(keys(%segs))){
	my ($start,$end,$knownsubs)=($3,$4,$6);
	$subs+=$5;
	
	if (consistCheck ($_,$knownsubs)){
	    $hitLine ="";
	    $splicedLine="";
	    next;
	}
	while ($_ =~ m/(\d+)\@(\d+\w+)/g) {
	    $snpsPerRead{$1}=$2;
	}
	#check for segment length
	if (abs($start-$end)>abs(@segSE[0]-@segSE[1])){
	    @segSE=($start,$end); 
	    @stats[2]=$start;
	    @stats[3]=$end;
	}

	if ($segs == (keys(%segs)+1)){
	    $segs{$start."_".$end}=$_;
	    printAndCount (\@stats,\%snpsPerRead,\%segs);
	    $totSnpCov+=$knownsubs;
	    $totSnpCov+=$snps;
	    $subs{$subs}++;
	    $mapq{$subs}{$mapq}++;
	    print $hitLine;
	    foreach my $splicedLine (sort {$a<=>$b} keys %segs){
		print $segs{$splicedLine};
	    }
	    print "\n";
	    $hitLine ="";
	    %segs=();
	    next;
	}
	$snps+=$knownsubs;
        $segs{$start."_".$end}=$_;
	next;
    }
    # non matching lines come here;
    $hitLine ="";
    %segs=();       
}
print OUT "\nTotal SNPcov = ",$totSnpCov,"\n"; 
#close (OUT);
close (STATS);
open (HITS,">".$basename.".HITSNPS");

foreach my $snp (sort { $a<=>$b } keys %snps){
    print HITS $snp," ",$snps{$snp},"\n";
}
foreach my $subn (sort { $a<=>$b } keys %subs){
    print HITS $subn," ",$subs{$subn},"\n";
}
foreach my $subn (sort { $a<=>$b } keys %mapq){
    print HITS $subn,"\n";
    foreach my $mapq (sort  { $a<=>$b } keys %{$mapq{$subn}}){
	print HITS "\t",$mapq," ",$mapq{$subn}{$mapq},"\n";
    }
}



close (HITS);
#close (ERR);

sub consistCheck{
    my ($map,$rSNPs)=@_;
    if ($rmInconsist){#check inconsistent (to many) SNP_ids printed compared to reported number of SNPs                                              
	my $pSNPs=0;
	$pSNPs++ while ($map =~ m/@/g);
	
	if ($rSNPs ne $pSNPs){
	    print ERR "i",$tag,"\n",$_;
	    return 1;
	}
    }
    return 0;
}

sub printAndCount{
    my ($stats,$snpsPerRead,$segs)=@_;
    my @sSE=sort {$a<=>$b} @{$stats}[2,3];
    if (! $segs ){ #non-spliced
	print STATS @{$stats}[1],"\t",join("\t",@sSE[0,1]),"\t",@{$stats}[0],"\t0\t",@{$stats}[4],"\n";
    }
    else{ #spliced
	foreach my $seg (sort {$a<=>$b} keys %{$segs}){
	    my @startStop = split ("_",$seg);
	    my @SstartStop =sort {$a<=>$b} @startStop;
	    print STATS @{$stats}[1],"\t",@SstartStop[0],"\t",@SstartStop[1],"\t",@{$stats}[0],"\t0\t",@{$stats}[4],"\n";
	}
    }
    foreach my $pos (sort {$a<=>$b} keys %{$snpsPerRead}){
	#print STATS "\t",$pos,"\t",$$snpsPerRead{$pos};
	print OUT " ",$$snpsPerRead{$pos};
	$snps{$$snpsPerRead{$pos}}++;
    }
#    print STATS "\n";
}
