#!/usr/bin/perl -w use strict; use warnings; #use File::Path qw(remove_tree make_path); use File::Copy; use Getopt::Long; use Time::HiRes qw(gettimeofday tv_interval); sub print_usage(); sub create_outputDIR($$); sub run_cmd($$); sub assign_ID($$$); sub read_fasta($$$); sub check_primer($); sub gen_map($); sub invmap_annoind($$); #TODO: change the decription about the pipeline # Add decription here too # Add an option to accept BLAST archive file and convert # the archive into pipeline acceptable format using blast_formatter #===============Command line options===========================# my $inputFile = ''; # Input FASTA files my $outputDIR = ''; # Output dir. my $ESPT = 'ESPRIT_Tree'; # ESPRIT-Tree dir my $PIPE = '.'; # Pipeline installation path my $allow_taxGap = 1; # keep sequence if its subject's taxonomy contains gaps. # preproc options # preproc [-e] [-p ] [-m mis_allowed] [-w] [-v var_allowed] # [-l ] [-u ] my $mergeIden = ''; # -e only merge identical my $deleteAmb = ''; # -w remove seqs with ambiguous characters my $primerFile = ''; # -p primer file my $primerMisAllow = 0; # -m maximum allowed mistmatches with primer sequences (Default 1) my $varAllow = 0; # -v maximum allowed deviations from seqs average length (Default 1.0) my $minLen = ''; # -l remove seqs with length < minLen my $maxLen = ''; # -u remove seqs with length > maxLen # search engine options and database options #my $search = 'blast'; # Search engine used in searching against databse (Default: BLAST) # May include USEARCH in future version (USEARCH has no public licence for multiple platforms, thus not include in this pipeline). my $threads = 1; # Num of threads to use in BLAST my $db = 'RDP'; # Database used to BLAST against (Default: RDP) # supprot Greengenes in future version my $source = 'isolated'; # Database source: isolated, uncultured or both (Default isolated) # cutoff value for selecting annotated seqs my $cut_identity = 97; # cut-off value of alignment identity my $cut_aligned = 97; # cut-off value of aligned region my $cut_evalue = 1e-20 + 0; # cut-off evalue for BLAST hits (Default 1e-20) # pbpcluster options # pbpcluster [-p] [-l lower] [-s intv] [-u upper] [-g gap_open] # [-e gap_ext] [-m meml] [-r errorrate] [-a ] # [-o output] [-f ] [] my $kmerFile = ''; # kfile k-mer configure file. my $kmerLen = 5; # Kmer length. Default 5. my $bottomLevel = 0.01; # -l lowest distance level for clustering. Default 0.01 . my $interval = 0.01; # -s interval of distance levels to show. Default 0.01 . my $topLevel = 0.15; # -u top distance level for clustering. Default 0.15 . my $gap_open = 10; # -g gap open penalty of NW algorithm. Default -10 my $gap_ext = 0.5; # -e gap extension penalty of NW algorithm. Default -0.5. my $meml = 1.0; # -m maximum allowed memory in gigabytes. Default 1.0 . my $errorate = 0; # -r base-wise sequencing error rate for single-base correcting. Default 0(No correcting) my $preCluster = ''; # -p use pre-clustering # For interested taxonomic level at which to obtain OTUs (Default species level) my %taxLevels = ( domain => 0, phylum => 0, class => 0, order => 0, family => 0, genus => 0, species=> 0); my @targetLevels = (); my $gen_figure = ''; my $verbose = ''; my $help = ''; # Process cmd line options GetOptions ( 'input=s' => \$inputFile, 'i=s' => \$inputFile, 'output=s' => \$outputDIR, 'o=s' => \$outputDIR, 'p=s' => \$PIPE, 'pipe=s' => \$PIPE, 'taxgap=i' => \$allow_taxGap, # prepro cmd options 'merge_iden' => \$mergeIden, 'delete_amb' => \$deleteAmb, 'primer=s' => \$primerFile, 'primer_mis=i' => \$primerMisAllow, 'variance=f' => \$varAllow, 'min_len=i' => \$minLen, 'max_len=i' => \$maxLen, # database cmd options #'engine=s' => \$search, 'core=i' => \$threads, 'db=s' => \$db, 'source=s' => \$source, 't_identity=f' => \$cut_identity, 't_aligned=f' => \$cut_aligned, 't_evalue=f' => \$cut_evalue, # pbpcluster cmd options 'kfile=s' => \$kmerFile, 'kmer=i' => \$kmerLen, 'bottom=f' => \$bottomLevel, 'top=f' => \$topLevel, 'interval=f' => \$interval, 'gap_open=f' => \$gap_open, 'gap_ext=f' => \$gap_ext, 'max_mem=f' => \$meml, 'error=f' => \$errorate, 'precluster' => \$preCluster, # Other cmd options 'target=s' => \@targetLevels, 'figure' => \$gen_figure, 'v' => \$verbose, 'verbose' => \$verbose, 'h' => \$help, 'help' => \$help); if( $inputFile eq '') { print_usage(); die "ERROR:Input File is required.\n\n";} elsif ($verbose) { print "Input sequence file:\t$inputFile\n\n"; } # target level not specified, default species level if(scalar(@targetLevels) == 0) { push(@targetLevels, 'species'); } else { # Check if valid target taxonomy level is entered foreach my $level (@targetLevels) { if (exists($taxLevels{lc($level)})) { $taxLevels{ lc($level) } += 1; print "Target taxonomic level:\t$level\n" if $verbose; } } my $validEntry = 0; foreach my $i (values(%taxLevels)) { $validEntry+= $i; } if ( $validEntry == 0) { die "Invalid targeted taxonomic level\n" ;} } #===============Beginning of pipeline==============================# #Create output directory print "\nStep 1:\tCreate an output directory.\n\n" if $verbose; print "\n++++++++++++++++++ESPRIT-Tree Pipeline+++++++++++++++++++++++\n" unless $verbose; $outputDIR = create_outputDIR($outputDIR, $verbose); #Preprocessing input seqs file # Replace each sequence header with an unique ID # save the modified file in OUTPUTDIR assign_ID($inputFile, $outputDIR, $verbose); my $preproCMD = $outputDIR."\/Allseqs.fasta "; if ($mergeIden) { $preproCMD = $preproCMD." -e "; } if ($deleteAmb) { $preproCMD = $preproCMD." -w "; } if ($primerFile) { $preproCMD = $preproCMD." -p $primerFile "; } if ($varAllow) { $preproCMD = $preproCMD." -v $varAllow "; } if ($minLen) { $preproCMD = $preproCMD." -l $minLen "; } if ($maxLen) { $preproCMD = $preproCMD." -u $maxLen "; } if ($primerMisAllow) { $preproCMD = $preproCMD." -m $primerMisAllow "; } $preproCMD = $preproCMD."$outputDIR/preprocessed "; print "\nStep 2:\tPreprocess sequences data running command line:\n\n" if $verbose; print "\n+++++++++++++++++++Preprocessing++++++++++++++++++++++++++++++\n" unless $verbose; # Check primer file format if ($primerFile) { unless (check_primer($primerFile)) { die "Invalid file format, expected '>' to start FASTA label. Reading Primer File Failed\n"; } } run_cmd("$PIPE/$ESPT/preproc $preproCMD", $verbose); # BLAST preprocessed seqs data against database my $queryCMD = "-query $outputDIR/preprocessed -task megablast "; my $dbCMD = "-db $PIPE/$db/$source/Database "; my $formatCMD = '-outfmt "6 qseqid qlen sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -max_target_seqs 1 '; my $evalueCMD = "-evalue $cut_evalue "; my $threadsCMD = "-num_threads $threads "; print "\nStep 3:\tBLAST preprocessed sequences data running command line:\n\n" if $verbose; print "\n+++++++++++++++++++++BLAST++++++++++++++++++++++++++++++++++++\n" unless $verbose; my $start = [gettimeofday]; #run_cmd("$PIPE/blastn -out $outputDIR/preprocessed.blast ".$queryCMD.$dbCMD.$formatCMD.$evalueCMD.$threadsCMD, $verbose); run_cmd("/home3/ijbcborg/bin/blast/ncbi/ncbi-blast-2.2.25+/bin/blastn -out $outputDIR/preprocessed.blast ".$queryCMD.$dbCMD.$formatCMD.$evalueCMD.$threadsCMD, $verbose); my $exec_time = tv_interval($start); print "BLAST takes $exec_time seconds\n"; # Pick seqs for clutering print "\nStep 4:\tPick eligible sequences for clustering.\n\n" if $verbose; print "\n++++++++++++++++++++Select eligbile sequences++++++++++++++++++\n" unless $verbose; my $pickseqsCMD = "-b $outputDIR/preprocessed.blast -s $outputDIR/preprocessed -q $outputDIR/preprocessed.frq -id $cut_identity -al $cut_aligned -o $outputDIR "; if ($allow_taxGap) { $pickseqsCMD = $pickseqsCMD.'-g '; } run_cmd("perl $PIPE/pickSeqs.pl $pickseqsCMD", $verbose); # Clustering obtained seqs print "\nStep 5.1: Use ESPRIT-Tree to cluster selected sequences.\n\n" if $verbose; print "\n+++++++++++++++++++Clustering++++++++++++++++++++++++++++++++++\n\n" unless $verbose; my $esptCMD = "-f $outputDIR/subset.frq $outputDIR/subset.fasta -l $bottomLevel -s $interval -k $kmerLen -u $topLevel -g $gap_open -e $gap_ext -m $meml -o $outputDIR/subset "; if ($kmerFile) { $esptCMD = $esptCMD."$kmerFile "; } if ($preCluster) { $esptCMD = $esptCMD.'-p '; } if ($errorate) { $esptCMD = $esptCMD.'-r $errorate '; } run_cmd("$PIPE/$ESPT/pbpcluster $esptCMD", $verbose); # Obtain selected seq's taxon index print "\nStep 5.2: Assign taxon indices for selected sequences.\n\n" if $verbose; # Generate an annotation index matrix run_cmd("perl $PIPE/gen_annoind.pl $outputDIR/subset.blast", $verbose); # Map cluster results and taxon indices back to the original seq print "\nStep 6: Map cluster result and taxon indices back to include frequency information.\n\n" if $verbose; # Generate a mapping file from freq file gen_map("$outputDIR/subset.frq"); # Map taxon indices back invmap_annoind("$outputDIR/subset.annoind", "$outputDIR/subset.map"); # Map cluster results back run_cmd("perl $PIPE/$ESPT/invmap.pl $outputDIR/subset.Clusters $outputDIR/subset.map $outputDIR/subset\_org.Clusters", $verbose); # Evaluate clustering results using the taxa at the targeted level print "\nStep 7: Calculate NMI score and report the distance level with NMI peak.\n\n" if $verbose; print "\n++++++++++++++++Pipeline report+++++++++++++++++++++\n" unless $verbose; # Consider each taxonomic level sparately foreach my $cur_level (@targetLevels) { print "\nTargeted level: $cur_level\n\n"; run_cmd("perl $PIPE/evalNMI.pl -c $outputDIR/subset\_org.Clusters -t $outputDIR/subset\_org.annoind -l $cur_level -o $outputDIR/$cur_level -u $outputDIR/subset.OTU ", $verbose); # python plotting script # run_cmd("python $PIPE/plot_nmi.py $outputDIR/$cur_level.nmi $outputDIR $cur_level ", $verbose) if $gen_figure; # R plotting script run_cmd("Rscript --vanilla $PIPE/plot_nmi.r $outputDIR/$cur_level.nmi $outputDIR $cur_level ", $verbose) if $gen_figure; } #====================Subroutins==================================# sub print_usage() { my $usage ="\nUsage: perl NMI_peak.pl [OPTIONS] <-i input_file> DESCRIPTIONS: NMI_peak.pl select input sequences that are fully annotated at/above user-defined taxonomic level. Then cluster the selected sequences using ESPRIT-Tree. The clustering result is further used to evaluate the NMI score at user-defined taxonomic level and the distance level of NMI peak is reported. OPTIONS: -i, --input FILE\tSequences file in FASTA format. -o, --output DIR\tOutput directory of the cluster results. \t(Default current_directory\/pipeline_output) -p, --pipe DIR\tPipleline installation path (Default current path). --target LEVEL\tInterested taxonomic level at which to obtain OTUs. \tLEVEL can be from 'domain' to 'species'(Default). --taxgap \tKeep sequence if its subject's taxonomy contains gaps. \tTaxon gap is inferred from sequence's species level \tassuming sequences with the same species share the same \tancestor, i.e. no horizontal gene transfer. --figure \tGenerate an NMI score plot at user's interested \ttaxonomic level. Needs python and matplotlib. -h, --help\t\tPrint out usage information. -v, --verbose\tPrint out pipeline messages. Preprocessing options: --merge_iden\t\tOnly merge identical sequences. --delete_amb\t\tRemove seqs with ambiguous characters --primer FILE\tPrimer sequence file in FASTA format. --primer_mis NUM\tMaximum allowed mistmatches with primer sequences. --variance NUM\tMaximum allowed deviations from sequence average length. --min_len NUM\tRemove sequences with length < NUM. --max_len NUM\tRemove sequences with length > NUM. Database options: --db DB\t\tChoose a database to search against. \t\tDB can be 'RDP'(Default) or 'greengenes' (support in future). --core NUM\t\tNum of cores to use in BLAST search. (Default 1). --source SOURCE\tChoose the source of database sequences. Source can be \t'isolated' (Default) or 'both', which includes both \tisolated and uncultured sequences. --t_identity NUM\tCut-off identity value. Keep the sequence's \tannotation if its identity with top-hit subject \tsequence is > NUM% (Default 97). --t_aligned NUM\tCut-off value of aligned percentage. Keep the sequence's \tannotation if its aligned region is > NUM% of its length \t(Default 97) --t_evalue NUM\tCut-off evalue. Keep the sequence's annotation if its \tevalue is < NUM (Default 1e-20). Clustering options: --kfile FILE\t\tChoose a k-mer configure file. --kmer NUM\t\tKmer length (Default 5). --bottom NUM\t\tLowest distance level for clustering (Default 0.01). --top NUM\t\tHighest distance level for clustering (Default 0.15). --interval NUM\tInterval of distance levels to show (Default 0.01). --gap_open NUM\tGap open penalty of NW algorithm (Default 10). --gap_ext NUM\tGap extension penalty of NW algorithm (Default 0.5). --max_mem NUM\tMaximum allowed memory in gigabytes (Default 1.0). --error NUM\t\tBase-wise sequencing error rate for single-base \t\tcorrecting (Default 0, no correcting). --precluster\t\tUse pre-clustering."; print $usage."\n\n"; exit(); } sub create_outputDIR($$) { my ($dir, $verbose) = @_; $outputDIR =~ s{^./}{}; if ($dir eq '' || $dir eq '.') { $dir = "pipeline_output"; } #make_path($dir, {verbose => $verbose}); run_cmd("mkdir $dir", $verbose) unless ( -d "$dir"); print "Output directory is: $dir/\n" if $verbose; return $dir; } sub run_cmd($$) { my ($command, $verbose) = @_; print $command, "\n\n" if $verbose; my $exit_code = system($command); if($exit_code != 0) { print "Error:$exit_code when running\n$command\n"; exit; } } sub check_primer($) { my $file = shift(@_); open(my $primer, "<", $file) or die "Can't open $file :!\n"; my $firstLine = ''; while (<$primer>) { $firstLine = $_; last; } if ( $firstLine !~ m{^>}) { return 0; } return 1; } sub assign_ID($$$) # Replace each sequence header with an unique ID and # save the modified file in OUTPUTDIR { my ($file, $outputDIR, $verbose) = @_; my $outputFile = $outputDIR."\/Allseqs.fasta"; my $outputMap = $outputDIR."\/Allseqs.map"; open(my $input, "<", $file) or die "Can't open $file :$!\n"; my @seqs = (); my @header = (); read_fasta($input,\@seqs, \@header); open(my $output, ">",$outputFile) or die "Can't open $outputFile: !\n"; open(my $map, ">",$outputMap) or die "Can't open $outputMap: !\n"; for my $i (0 .. $#seqs) { printf $output ">%d\n%s\n", $i, $seqs[$i]; printf $map "%s\n", $header[$i]; } print "Sequences with unique ID: $outputFile\nThe mapping info:$outputMap\n" if $verbose; } sub read_fasta($$$) { my ($file, $ref_seqs, $ref_ID) = @_; my $seqBuffer = ''; my $seqCount = 0; my $seqs; my $header; while (<$file>) { # Check file format die "Invalid FASTA file, expected '>' to start\n" unless ( $_ =~ m{^>}); my $line = $_; chomp($line); ${$ref_ID}[$seqCount] = substr($line, 1); last; } while(<$file>) { my $line = $_; chomp($line); if($line =~ m{^>}) { if( $seqBuffer ne '') { ${$ref_seqs}[$seqCount++] = $seqBuffer; $seqBuffer = ''; } $header = substr($line, 1); ${$ref_ID}[$seqCount] = $header; } elsif (uc($line) =~ m{[AGCTUN]+}) { $seqs = uc($line); $seqs =~ s/\s//g; #remove whitespaces $seqBuffer = $seqBuffer.$seqs; } else { next; } } ${$ref_seqs}[$seqCount] = $seqBuffer; # Last seqs } sub gen_map($) { # Generate a mapping file from seqs' frequency file to incorporate seqs' # abundance info into cluster results. In the freq file, each row corresponding # to one merged sequence generated from preprocessing step, one entry is a line index # into the original seqs file before preprocessing step. # In the generated mapping file, each sequence is assigned a new index since the sequence # used for clustering is a subset of the original seqs, while the mapping file generated # from preprocessing step is based on the whole original seqs. # Input: *.frq Frequency info of seqs # Output: *.map Mapping file where each seq is re-assigned an index my $freqFile = shift(@_); my $mapFile = substr($freqFile, 0, -3)."map"; open(my $freq, "<", $freqFile) or die "Can't open $freqFile: $!"; open(my $map, ">", $mapFile) or die "Can't open $mapFile: $!"; my $i = 0; my $index = 0; while (<$freq>) { my $temp = $_; chomp($temp); my @line = split(/\s/,$temp); printf $map "%d", $index++; # save the first index for (my $j = $line[1] - 1; $j > 0; $j--) { printf $map " %d", $index++; } printf $map "\n"; } } sub invmap_annoind($$) { # Include sequence frequency info in the annotation matrix # Merged sequences have the same annotation indices in the output matrix # Input: # *.annoind: Merged sequence's annotation index # (starting from 0) at each taxonomic level # *.map: Mapping information b/w merged sequence file and # the original sequence file, where each row is one # merged sequence # Output: # *_org.annoind: Generated annotation matrix with frequency info. my ($input1, $input2) = @_; open(my $inputAnnoFile, "<", $input1) or die "Can't open annotation indices file: $!"; open(my $mapFile, "<", $input2) or die "Can't open mapping file: $!"; #Import annotation indices of preprocessed seqs and mapping info my @inputAnno = <$inputAnnoFile>; my $i = 0; my @map = (); while (<$mapFile>) { my $temp = $_; chomp($temp); $map[$i++] = [split(/\s/, $temp)]; } my @outputAnno = (); for my $i (0 .. $#map) { foreach my $index (@{$map[$i]}) { $outputAnno[$index] = $inputAnno[$i]; } } my $outputFile = substr($input1, 0, -8); open(my $output, ">", $outputFile."_org.annoind") or die "Can't open output file: $!"; foreach my $line (@outputAnno) { printf $output "%s", $line; } }