#!/usr/bin/perl
use Getopt::Long;
$|=1;
$script=$0;
$script=~s/^.+[\\\/]//;
$info_text="
===============================   PHASER   ==============================
Version: 1.0.2
LAST MODIFIED: 02. September 2016

perl $script -i input.map [-option value]

Before you can use PHASER you must map your sequence reads to a genome or
reference sequence. For this you can use the sRNAmapper tool provided at:
http://www.smallrnagroup.uni-mainz.de/software.html
Alternatively you can use SeqMap (Yiang and Wong 2008. Bioinformatics 24
(20):2395-2396) with the option /output_all_matches. The output file
produced by sRNAmapper or SeqMap is the input file for PHASER. You can
optionally apply the reallocate tool to apportion read counts according
to estimated local transcription rates before using PHASER (Rosenkranz
2015. Nucleic Acids Res. Epub ahead of print. doi: 10.1093/nar/gkv1265).

OPTION            INFO
-------------------------------------------------------------------------
-i (-input)       Specifiy your input file. Input files must be in ELAND3
                  format (see above).
-o (-output)      Specify a name for the output file. Per default PHASER
                  will print the results to STDOUT.
-r (-range)       Specify the range of interest [bp] (default=100) which
                  is the maximum distance of mapped sequence reads to be
                  reported.
-h (-? -help)     Will print this information.


Contact:
David Rosenkranz
Institute of Anthropology
Johannes Gutenberg University Mainz, Germany
email: rosenkranz\@uni-mainz.de

";

$map="";
$range=100;
GetOptions
	(
	"input|i=s"=>\$map,
	"output|o=s"=>\$output,
	"range|r=s"=>\$range,
	"help|h|?"=>\$show_help,
	);

# check options
if($show_help==1)
	{
	print$info_text;
	exit;
	}
if($output)
	{
	unless(open OUT,">$output")
		{
		print"\nUnable to open output file.\n$!\n\n";exit;
		}
	}
$range=~s/^\s*//;
$range=~s/\s*$//;
if($range!~/^\d+$/)
	{
	print"\nRange [bp] specified by option -r or -range is not numeric!\n\n";exit;
	}

print"\nRead map file...";
open(MAP,$map)||die print"\nUnable to open map file $map.\n$!\n\n";
$max_length=0;
%hits_per_seq=();
%reads_per_seq=();
while(<MAP>)
	{
	next if($_=~/^trans_id/||$_=~/^#/);
	$h++;
	$_=~s/\s*$//;
	@d=split("\t",$_);
	$hits_per_seq{$d[4]}++;
	$reads_per_seq{$d[4]}=$d[3];
	if($max_length<length$d[4])
		{
		$max_length=length$d[4];
		# random test map file format
		if($d[1]!~/^\d+$/||$d[2]!~/^[ATGCUN]+$/||$d[4]!~/^[ATGCUN]+$/||$d[5]!~/^\d+$/||$d[6]!~/^[\+-]$/)
			{
			print"\nBad input file format!\n\n";
			exit;
			}
		}
	}
close MAP;
$map_type="This is a standard ELAND3 map file";
if($d[8])
	{
	$map_type="This is a post processed ELAND3 map file";
	}
print" done.\n$map_type with $h genomic hits.

Scan map file for 3' - 5' distances:
0%                                   50%                                   100%
|-------------------------------------|-------------------------------------|
";


$prev_loc="INIT";
$prev_coord=-1;
%recurse=();
%sp=();
%sm=();
%ep=();
%em=();
open(MAP,$map)||die print"\nUnable to open map file $map.\n$!\n\n";
$c=0;
$p=0;
$pp=0;
while(<MAP>)
	{
	next if($_=~/^trans_id/||$_=~/^#/);
	$c++;
	$p++;
	# show progress
	if($p>=$h/77)
		{
		$p=0;
		$pp++;
		print".";
		}
	
	$_=~s/\s*$//;
	@d=split("\t",$_);
	if($prev_loc ne $d[0]&&$prev_loc ne "INIT")
		{
		compare();
		}
	elsif($prev_coord+$range+$max_length<$d[1]&&$prev_coord!=-1)
		{
		compare();
		}
	elsif($c==100000) # avoid overkill
		{
		compare();
		}
	if($d[8])
		{
		$hitcount=$d[8];
		}
	else
		{
		$hitcount=$reads_per_seq{$d[4]}/$hits_per_seq{$d[4]};
		}
	
	if($d[6]=~/\+/)
		{
		$sp{$d[1]}+=$hitcount;
		$ep{$d[1]+(length$d[4])-1}+=$hitcount;
		}
	elsif($d[6]=~/-/)
		{
		$sm{$d[1]}+=$hitcount;
		$em{$d[1]-((length$d[4])-1)}+=$hitcount;
		}
	$recurse{1-length$d[4]}+=($reads_per_seq{$d[4]}/$hits_per_seq{$d[4]})*2;
	$prev_loc=$d[0];
	$prev_coord=$d[1];
	}
close MAP;
compare();
foreach($pp..76)
	{
	print".";
	}
print"\n\n";

# print results to output file or stdout
if($output)
	{
	print OUT"Distance [bp]\tread counts\n";
	foreach$d(-$range..$range)
		{
		$val=$distances{$d}-$recurse{$d};
		print OUT"\n$d\t$val";
		}
	close OUT;
	print"\nResults saved in file $output.\n\n";
	}
else
	{
	print"\nDistance [bp]\tread counts\n";
	foreach$d(-$range..$range)
		{
		$val=$distances{$d}-$recurse{$d};
		print"\n$d\t$val";
		}
	}
exit;

# subroutine
sub compare
	{
	foreach$coord_s(keys%sp)
		{
		foreach$coord_e(keys%ep)
			{
			$distances{$coord_s-$coord_e}+=$sp{$coord_s}+$ep{$coord_e};
			}
		}
	foreach$coord_s(keys%sm)
		{
		foreach$coord_e(keys%em)
			{
			$distances{$coord_e-$coord_s}+=$sm{$coord_s}+$em{$coord_e};
			}
		}
	undef%sp;undef%ep;undef%sm;undef%em;
	$c=0;
	}