#!/usr/bin/perl -w

use strict;
use warnings;
#use File::Path qw(remove_tree make_path);
use File::Copy;
use Getopt::Long;
use Time::HiRes qw(gettimeofday tv_interval);

sub print_usage();
sub create_outputDIR($$);
sub run_cmd($$);
sub assign_ID($$$);
sub read_fasta($$$);
sub check_primer($);
sub gen_map($);
sub invmap_annoind($$);

#TODO: change the decription about the pipeline
# Add decription here too
# Add an option to accept BLAST archive file and convert
# the archive into pipeline acceptable format using blast_formatter


#===============Command line options===========================#

my $inputFile = '';		# Input FASTA files
my $outputDIR = '';		# Output dir.
my $ESPT = 'ESPRIT_Tree';       # ESPRIT-Tree dir
my $PIPE = '.';                 # Pipeline installation path
my $allow_taxGap = 1;           # keep sequence if its subject's taxonomy contains gaps.

# preproc options
# preproc 	[-e] [-p <primer_file>] [-m mis_allowed] [-w] [-v var_allowed]
#		[-l <minlen>] [-u <maxlen>] <input.fas> 

my $mergeIden = ''; 		# -e only merge identical 
my $deleteAmb = ''; 		# -w remove seqs with ambiguous characters
my $primerFile = '';		# -p primer file 
my $primerMisAllow = 0;		# -m maximum allowed mistmatches with primer sequences (Default 1)
my $varAllow = 0; 		# -v maximum allowed deviations from seqs average length (Default 1.0)
my $minLen = '';		# -l remove seqs with length < minLen
my $maxLen = '';		# -u remove seqs with length > maxLen

# search engine options and database options
#my $search = 'blast'; 		# Search engine used in searching against databse (Default: BLAST)
# May include USEARCH in future version (USEARCH has no public licence for multiple platforms, thus not include in this pipeline).
my $threads = 1;                # Num of threads to use in BLAST
my $db = 'RDP';			# Database used to BLAST against (Default: RDP)
                                # supprot Greengenes in future version
my $source = 'isolated';	# Database source: isolated, uncultured or both (Default isolated)

# cutoff value for selecting annotated seqs
my $cut_identity = 97;		# cut-off value of alignment identity 
my $cut_aligned = 97;		# cut-off value of aligned region
my $cut_evalue = 1e-20 + 0;	# cut-off evalue for BLAST hits (Default 1e-20)

# pbpcluster options
# pbpcluster [-p] [-l lower] [-s intv] [-u upper] [-g gap_open] 
#	     [-e gap_ext] [-m meml] [-r errorrate] [-a <alfile>] 
#	     [-o output] [-f <freqfile>] <seqfile>  [<kfile>]

my $kmerFile = '';		# kfile k-mer configure file.
my $kmerLen = 5;                # Kmer length. Default 5.
my $bottomLevel = 0.01; 	# -l lowest distance level for clustering. Default 0.01 . 
my $interval = 0.01;		# -s interval of distance levels to show. Default 0.01 .
my $topLevel = 0.15;		# -u top distance level for clustering. Default 0.15 .
my $gap_open = 10; 		# -g gap open penalty of NW algorithm. Default -10
my $gap_ext = 0.5;		# -e gap extension penalty of NW algorithm. Default -0.5.
my $meml = 1.0; 		# -m maximum allowed memory in gigabytes. Default 1.0 .
my $errorate = 0; 		# -r base-wise sequencing error rate for single-base correcting. Default 0(No correcting)
my $preCluster = '';		# -p use pre-clustering

# For interested taxonomic level at which to obtain OTUs (Default species level)
my %taxLevels = ( domain => 0,
                  phylum => 0,
                  class  => 0,
                  order  => 0,
                  family => 0,
                  genus  => 0,
                  species=> 0);
my @targetLevels = ();

my $gen_figure = '';
my $verbose = '';
my $help = '';

# Process cmd line options				
GetOptions (
    'input=s' 		=> \$inputFile,
    'i=s' 		=> \$inputFile,
    'output=s' 		=> \$outputDIR,
    'o=s' 		=> \$outputDIR,
    'p=s'               => \$PIPE,
    'pipe=s'            => \$PIPE,
    'taxgap=i'          => \$allow_taxGap,
    # prepro cmd options
    'merge_iden' 	=> \$mergeIden,
    'delete_amb'	=> \$deleteAmb,
    'primer=s' 		=> \$primerFile,
    'primer_mis=i'	=> \$primerMisAllow,
    'variance=f'	=> \$varAllow,
    'min_len=i'		=> \$minLen,
    'max_len=i'		=> \$maxLen,
    # database cmd options
    #'engine=s'		=> \$search,
    'core=i'            => \$threads,
    'db=s'		=> \$db,
    'source=s'		=> \$source,
    't_identity=f'      => \$cut_identity,
    't_aligned=f'	=> \$cut_aligned,
    't_evalue=f'	=> \$cut_evalue,
    # pbpcluster cmd options
    'kfile=s' 		=> \$kmerFile,
    'kmer=i'            => \$kmerLen,
    'bottom=f'		=> \$bottomLevel,
    'top=f' 		=> \$topLevel,
    'interval=f'	=> \$interval,
    'gap_open=f'	=> \$gap_open,
    'gap_ext=f'		=> \$gap_ext,
    'max_mem=f'		=> \$meml,
    'error=f'		=> \$errorate,
    'precluster'	=> \$preCluster,
    # Other cmd options
    'target=s' 		=> \@targetLevels,
    'figure'            => \$gen_figure,
    'v' 		=> \$verbose,
    'verbose' 		=> \$verbose,
    'h' 		=> \$help,
    'help' 		=> \$help);

if( $inputFile eq '') {
    print_usage();
    die "ERROR:Input File is required.\n\n";}
elsif ($verbose) {
    print "Input sequence file:\t$inputFile\n\n";  
} 

# target level not specified, default species level
if(scalar(@targetLevels) == 0) {
    push(@targetLevels, 'species');
} else {
    # Check if valid target taxonomy level is entered
    foreach my $level (@targetLevels) {
        if (exists($taxLevels{lc($level)})) {
            $taxLevels{ lc($level) } += 1;
            print "Target taxonomic level:\t$level\n" if $verbose;
        }
    }
    my $validEntry = 0;
    foreach my $i (values(%taxLevels)) {
        $validEntry+= $i;
    }
    if ( $validEntry == 0) { die "Invalid targeted taxonomic level\n" ;}
    
}

#===============Beginning of pipeline==============================#

#Create output directory

print "\nStep 1:\tCreate an output directory.\n\n" if $verbose;
print "\n++++++++++++++++++ESPRIT-Tree Pipeline+++++++++++++++++++++++\n" unless $verbose;

$outputDIR = create_outputDIR($outputDIR, $verbose);

#Preprocessing input seqs file

# Replace each sequence header with an unique ID
# save the modified file in OUTPUTDIR
assign_ID($inputFile, $outputDIR, $verbose);

my $preproCMD = $outputDIR."\/Allseqs.fasta ";
if ($mergeIden)  { $preproCMD = $preproCMD." -e "; }
if ($deleteAmb)  { $preproCMD = $preproCMD." -w "; }
if ($primerFile) { $preproCMD = $preproCMD." -p $primerFile "; }
if ($varAllow)   { $preproCMD = $preproCMD." -v $varAllow "; }
if ($minLen)     { $preproCMD = $preproCMD." -l $minLen "; }
if ($maxLen)     { $preproCMD = $preproCMD." -u $maxLen "; }
if ($primerMisAllow)   { $preproCMD = $preproCMD." -m $primerMisAllow "; }
$preproCMD = $preproCMD."$outputDIR/preprocessed ";


print "\nStep 2:\tPreprocess sequences data running command line:\n\n" if $verbose;
print "\n+++++++++++++++++++Preprocessing++++++++++++++++++++++++++++++\n" unless $verbose;

# Check primer file format
if ($primerFile) {
    unless (check_primer($primerFile)) {
        die "Invalid file format, expected '>' to start FASTA label. Reading Primer File Failed\n";
    }   
}

run_cmd("$PIPE/$ESPT/preproc $preproCMD", $verbose);

 # BLAST preprocessed seqs data against database
my $queryCMD = "-query $outputDIR/preprocessed -task megablast ";
my $dbCMD = "-db $PIPE/$db/$source/Database ";
my $formatCMD = '-outfmt "6 qseqid qlen sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -max_target_seqs 1 ';
my $evalueCMD = "-evalue $cut_evalue ";

my $threadsCMD = "-num_threads $threads ";

print "\nStep 3:\tBLAST preprocessed sequences data running command line:\n\n" if $verbose;
print "\n+++++++++++++++++++++BLAST++++++++++++++++++++++++++++++++++++\n" unless $verbose;

my $start = [gettimeofday];
#run_cmd("$PIPE/blastn -out $outputDIR/preprocessed.blast ".$queryCMD.$dbCMD.$formatCMD.$evalueCMD.$threadsCMD, $verbose);
run_cmd("/home3/ijbcborg/bin/blast/ncbi/ncbi-blast-2.2.25+/bin/blastn -out $outputDIR/preprocessed.blast ".$queryCMD.$dbCMD.$formatCMD.$evalueCMD.$threadsCMD, $verbose);
my $exec_time = tv_interval($start);


print "BLAST takes $exec_time seconds\n";

# Pick seqs for clutering 
print "\nStep 4:\tPick eligible sequences for clustering.\n\n" if $verbose;
print "\n++++++++++++++++++++Select eligbile sequences++++++++++++++++++\n" unless $verbose;

my $pickseqsCMD = "-b $outputDIR/preprocessed.blast -s $outputDIR/preprocessed -q $outputDIR/preprocessed.frq -id $cut_identity -al $cut_aligned -o $outputDIR ";

if ($allow_taxGap) {
    $pickseqsCMD = $pickseqsCMD.'-g ';
}

run_cmd("perl $PIPE/pickSeqs.pl $pickseqsCMD", $verbose);


# Clustering obtained seqs
print "\nStep 5.1: Use ESPRIT-Tree to cluster selected sequences.\n\n" if $verbose;
print "\n+++++++++++++++++++Clustering++++++++++++++++++++++++++++++++++\n\n" unless $verbose;

my $esptCMD = "-f $outputDIR/subset.frq $outputDIR/subset.fasta -l $bottomLevel -s $interval -k $kmerLen -u $topLevel -g $gap_open -e $gap_ext -m $meml -o $outputDIR/subset ";

if ($kmerFile) {
    $esptCMD = $esptCMD."$kmerFile ";
}
if ($preCluster) {
    $esptCMD = $esptCMD.'-p ';
}
if ($errorate) {
    $esptCMD = $esptCMD.'-r $errorate ';
}

run_cmd("$PIPE/$ESPT/pbpcluster $esptCMD", $verbose);


# Obtain selected seq's taxon index
print "\nStep 5.2: Assign taxon indices for selected sequences.\n\n" if $verbose;

# Generate an annotation index matrix
run_cmd("perl $PIPE/gen_annoind.pl $outputDIR/subset.blast", $verbose);


# Map cluster results and taxon indices back to the original seq
print "\nStep 6: Map cluster result and taxon indices back to include frequency information.\n\n" if $verbose;

# Generate a mapping file from freq file
gen_map("$outputDIR/subset.frq");

# Map taxon indices back
invmap_annoind("$outputDIR/subset.annoind", "$outputDIR/subset.map");

# Map cluster results back
run_cmd("perl $PIPE/$ESPT/invmap.pl  $outputDIR/subset.Clusters  $outputDIR/subset.map  $outputDIR/subset\_org.Clusters", $verbose);

# Evaluate clustering results using the taxa at the targeted level 
print "\nStep 7: Calculate NMI score and report the distance level with NMI peak.\n\n" if $verbose;
print "\n++++++++++++++++Pipeline report+++++++++++++++++++++\n" unless $verbose;

# Consider each taxonomic level sparately

foreach my $cur_level (@targetLevels) {  
    print "\nTargeted level: $cur_level\n\n";
    run_cmd("perl $PIPE/evalNMI.pl -c $outputDIR/subset\_org.Clusters -t $outputDIR/subset\_org.annoind -l $cur_level -o $outputDIR/$cur_level -u $outputDIR/subset.OTU ", $verbose);
    # python plotting script
    # run_cmd("python $PIPE/plot_nmi.py $outputDIR/$cur_level.nmi $outputDIR $cur_level ", $verbose) if $gen_figure;
    # R plotting script
    run_cmd("Rscript --vanilla $PIPE/plot_nmi.r $outputDIR/$cur_level.nmi $outputDIR $cur_level ", $verbose) if $gen_figure;
}


#====================Subroutins==================================#

sub print_usage()
    {
        my $usage ="\nUsage: perl NMI_peak.pl [OPTIONS] <-i input_file>

DESCRIPTIONS:

   NMI_peak.pl select input sequences that are fully annotated at/above
   user-defined taxonomic level. Then cluster the selected sequences
   using ESPRIT-Tree. The clustering result is further used to evaluate
   the NMI score at user-defined taxonomic level and the distance level
   of NMI peak is reported.

OPTIONS:

   -i, --input FILE\tSequences file in FASTA format.
   -o, --output DIR\tOutput directory of the cluster results.
                   \t(Default current_directory\/pipeline_output)
   -p, --pipe DIR\tPipleline installation path (Default current path).
   --target LEVEL\tInterested taxonomic level at which to obtain OTUs.
                 \tLEVEL can be from 'domain' to 'species'(Default).
   --taxgap      \tKeep sequence if its subject's taxonomy contains gaps.
                 \tTaxon gap is inferred from sequence's species level
                 \tassuming sequences with the same species share the same
                 \tancestor, i.e. no horizontal gene transfer.
   --figure      \tGenerate an NMI score plot at user's interested
                 \ttaxonomic level. Needs python and matplotlib.
   -h, --help\t\tPrint out usage information.
   -v, --verbose\tPrint out pipeline messages.

  Preprocessing options:
   --merge_iden\t\tOnly merge identical sequences.
   --delete_amb\t\tRemove seqs with ambiguous characters
   --primer FILE\tPrimer sequence file in FASTA format.
   --primer_mis NUM\tMaximum allowed mistmatches with primer sequences.
   --variance NUM\tMaximum allowed deviations from sequence average length.
   --min_len NUM\tRemove sequences with length < NUM.
   --max_len NUM\tRemove sequences with length > NUM.

  Database options:
   --db DB\t\tChoose a database to search against.
          \t\tDB can be 'RDP'(Default) or 'greengenes' (support in future).
   --core NUM\t\tNum of cores to use in BLAST search. (Default 1).
   --source SOURCE\tChoose the source of database sequences. Source can be
                  \t'isolated' (Default) or 'both', which includes both
                  \tisolated and uncultured sequences.
   --t_identity NUM\tCut-off identity value. Keep the sequence's
                   \tannotation if its identity with top-hit subject
                   \tsequence is > NUM% (Default 97).
   --t_aligned NUM\tCut-off value of aligned percentage. Keep the sequence's
                  \tannotation if its aligned region is > NUM% of its length
                  \t(Default 97)
   --t_evalue NUM\tCut-off evalue. Keep the sequence's annotation if its
                 \tevalue is < NUM (Default 1e-20).

  Clustering options:
   --kfile FILE\t\tChoose a k-mer configure file.
   --kmer NUM\t\tKmer length (Default 5).
   --bottom NUM\t\tLowest distance level for clustering (Default 0.01).
   --top NUM\t\tHighest distance level for clustering (Default 0.15).
   --interval NUM\tInterval of distance levels to show (Default 0.01).
   --gap_open NUM\tGap open penalty of NW algorithm (Default 10).
   --gap_ext NUM\tGap extension penalty of NW algorithm (Default 0.5).
   --max_mem NUM\tMaximum allowed memory in gigabytes (Default 1.0).	
   --error NUM\t\tBase-wise sequencing error rate for single-base
              \t\tcorrecting (Default 0, no correcting).
   --precluster\t\tUse pre-clustering.";
        print $usage."\n\n";
        
        exit();
    }
    
sub create_outputDIR($$)
    {
        my ($dir, $verbose) = @_;
        $outputDIR =~ s{^./}{};
                               
        if ($dir eq '' || $dir eq '.') {
            $dir = "pipeline_output";
        }

        #make_path($dir, {verbose => $verbose});
        run_cmd("mkdir $dir", $verbose) unless ( -d "$dir");
        
        print "Output directory is: $dir/\n" if $verbose;
               
        return $dir;
                    
    }    
        
sub run_cmd($$)
     {
         my ($command, $verbose) = @_;
         
         print $command, "\n\n" if $verbose;
                           
         my $exit_code = system($command);
         if($exit_code != 0)
             {
                 print "Error:$exit_code when running\n$command\n";
                 exit;
             }
     }
     
sub check_primer($)
    {
        my $file = shift(@_);

        open(my $primer, "<", $file) or die "Can't open $file :!\n";

        my $firstLine = '';
        
        while (<$primer>) {
            $firstLine = $_;
            last;          
        }

        if ( $firstLine !~ m{^>}) {
            return 0;
        }
        return 1;
    }


sub assign_ID($$$)
#  Replace each sequence header with an unique ID and
#  save the modified file in OUTPUTDIR
    {
        my ($file, $outputDIR, $verbose) = @_;
        my $outputFile = $outputDIR."\/Allseqs.fasta";
        my $outputMap = $outputDIR."\/Allseqs.map";
                
        open(my $input, "<", $file) or die "Can't open $file :$!\n";

        my @seqs = ();
        my @header = ();

        read_fasta($input,\@seqs, \@header);
        
        open(my $output, ">",$outputFile) or die "Can't open $outputFile: !\n";
        open(my $map, ">",$outputMap) or die "Can't open $outputMap: !\n";
        for my $i (0 .. $#seqs) {
            printf $output ">%d\n%s\n", $i, $seqs[$i];
            printf $map "%s\n", $header[$i];
        }

        print "Sequences with unique ID: $outputFile\nThe mapping info:$outputMap\n" if $verbose;                 
     }


sub read_fasta($$$)
    {
        my ($file, $ref_seqs, $ref_ID) = @_;
        my $seqBuffer = '';
        my $seqCount = 0;
        my $seqs;
        my $header;

        while (<$file>) { # Check file format
            die "Invalid FASTA file, expected '>' to start\n" unless ( $_ =~ m{^>});
            my $line = $_;
            chomp($line);
            ${$ref_ID}[$seqCount] = substr($line, 1);
            last;
        }
        
        while(<$file>) {
            my $line = $_;
            chomp($line);
            
            if($line =~ m{^>}) {                  
                if( $seqBuffer ne '') {
                    ${$ref_seqs}[$seqCount++] = $seqBuffer;
                    $seqBuffer = '';
                }
                $header = substr($line, 1);
                ${$ref_ID}[$seqCount] = $header;
            } elsif (uc($line) =~ m{[AGCTUN]+}) {
                $seqs = uc($line);
                $seqs =~ s/\s//g; #remove whitespaces
                $seqBuffer = $seqBuffer.$seqs;
            } else { next; }
        }
        ${$ref_seqs}[$seqCount] = $seqBuffer; # Last seqs
    }

    
sub gen_map($)
    {
        # Generate a mapping file from seqs' frequency file to incorporate seqs'
        # abundance info into cluster results. In the freq file, each row corresponding
        # to one merged sequence generated from preprocessing step, one entry is a line index
        # into the original seqs file before preprocessing step.
        # In the generated mapping file, each sequence is assigned a new index since the sequence
        # used for clustering is a subset of the original seqs, while the mapping file generated
        # from preprocessing step is based on the whole original seqs.

        # Input:  *.frq  Frequency info of seqs 
        # Output: *.map  Mapping file where each seq is re-assigned an index
        
        my $freqFile = shift(@_);  
        my $mapFile = substr($freqFile, 0, -3)."map";
        
        open(my $freq,  "<", $freqFile)  or die "Can't open $freqFile: $!";
        open(my $map,  ">", $mapFile)  or die "Can't open $mapFile: $!";
        
        my $i = 0;
        my $index = 0;
        
        while (<$freq>) {
            my $temp = $_;
            chomp($temp);
            my @line = split(/\s/,$temp);
            
            printf $map "%d", $index++; # save the first index
            
            for (my $j = $line[1] - 1; $j > 0; $j--) {
                printf $map " %d", $index++;
            }
            printf $map "\n";
        }

    }

sub invmap_annoind($$)
    {
        # Include sequence frequency info in the annotation matrix
        # Merged sequences have the same annotation indices in the output matrix
        # Input: 
        #	*.annoind: Merged sequence's annotation index
        #                  (starting from 0) at each taxonomic level
        #       *.map:     Mapping information b/w merged sequence file and
        #                  the original sequence file, where each row is one
        #                  merged sequence
        # Output:
        #	*_org.annoind: Generated annotation matrix with frequency info.

        my ($input1, $input2) =  @_;
               
        open(my $inputAnnoFile,  "<",   $input1)  or die "Can't open annotation indices file: $!";
        open(my $mapFile,  "<",  $input2)  or die "Can't open mapping file: $!";
        
        #Import annotation indices of preprocessed seqs and mapping info
        my @inputAnno = <$inputAnnoFile>;
        my $i = 0;
        my @map = ();
        while (<$mapFile>) {
            my $temp = $_;
            chomp($temp);
            $map[$i++] = [split(/\s/, $temp)];
        }
        
        my @outputAnno = ();
        
        for my $i (0 .. $#map) {
            foreach my $index (@{$map[$i]}) {
                $outputAnno[$index] = $inputAnno[$i];       
            }   
        }
        
        my $outputFile = substr($input1, 0, -8);
        open(my $output,  ">", $outputFile."_org.annoind")  or die "Can't open output file: $!";
        foreach my $line (@outputAnno) {
            printf $output "%s", $line;
        }
        
        
    }