El código que publico hoy permite calcular COGs a partir de un directorio con archivos FASTA con secuencias de proteínas, utilizando los binarios del algoritmo COGtriangles, que se pueden obtener de SourceForge, y los de BLAST, que podemos descargar del NCBI.
#!/usr/bin/perl -w
# prog5.pl
# Runs the COGtriangles algorithm over the FASTA files found in directory -d,
# based on the original code by David Kristensen (Jun 2010).
# The script assumes that each .faa file contains CDS from a distinct genome,
# and may place the same sequence in more than one cluster (multidomain proteins).
use strict;    # full strictures; the original enabled only 'subs'
use warnings;
use Getopt::Std;
use File::Basename;
# Package globals shared across the script, declared so 'use strict' compiles.
our (%opts, @faafiles, %sequence_data);
## 0) global variables, including binaries to be installed
## and parameters that affect the whole algorithm
# URL: ftp://ftp.ncbi.nih.gov/blast/executables/
my $BLASTPATH = '/home/whatever/blast-2.2.18/';
my $FORMATDBEXE = $BLASTPATH . 'bin/formatdb';
my $BLASTPGPEXE = $BLASTPATH . 'bin/blastpgp -I T -m 9 -b 1000 -v 1000 -z 100000000';
# URL: http://sourceforge.net/projects/cogtriangles/files/
my $COGPATH = '/home/soft/COGsoft/';
my $MAKEHASHEXE = $COGPATH . "COGmakehash/COGmakehash -s=',' -n=1";
my $READBLASTEXE= $COGPATH . 'COGreadblast/COGreadblast -e=10 -q=2 -t=2';
my $LSEEXE = $COGPATH . 'COGlse/COGlse';
my $COGTRIANGLESEXE =
$COGPATH . "COGtriangles/COGtriangles -t=0.5 -e=10 -n='cluster' -s=1";
# the default COGtriangle algorithm requires two BLAST searches
my $TWOBLASTRUNS = 1; # set to 0 to perform only a filtered BLAST search
# extension of valid input files inside the -d directory
my $FAAEXT = '.faa';
## 1) process command-line arguments ##############################
my ($INP_FASTAdir,$INP_skipBLAST,$INP_BLASToptions,$newDIR,$id) = ('',0,'');
my ($n_of_sequences,$pwd,$genome,$clusterid,%sequences,%genomeused,%COGs) = (0);
my ($allfaafile,$p2ofilename,$lsefilename,$lseoutfilename,$coglogfilename) =
('all.faa','all.p2o.csv','all.lsejob.csv','all.lse.csv','all.cog.clusters.log');
my ($cogedgesfilename,$edgesfilename) = ('cog-edges.txt','all-edges.txt');
my %opts;
getopts('hsb:d:', \%opts);
# show usage and exit cleanly; the original die'd the last usage line,
# sending it to STDERR and exiting non-zero for a plain help request
if($opts{'h'} || scalar(keys(%opts)) == 0)
{
print "usage: $0 [options]\n";
print " -h this message\n";
print " -d input directory with FASTA .faa files (required, example -d genomes)\n";
print " -s skip BLAST and re-use previous searches (optional)\n";
print " -b extra blastpgp options (optional, example -b '-a 2' to use 2 CPUs)\n";
exit(0);
}
if(!defined($opts{'d'}) || !-d $opts{'d'})
{
die "# $0 : need a valid FASTA directory, exit\n";
}
$INP_FASTAdir = $opts{'d'};
$INP_FASTAdir =~ s{/$}{}; # drop one trailing slash, as the original chop did
$pwd = `pwd`; chomp $pwd; $pwd .= '/';
$newDIR = $pwd . basename($INP_FASTAdir) . "_COGs";
if(!-d $newDIR)
{
# fail early if the results directory cannot be created (original ignored errors)
mkdir($newDIR) || die "# $0 : cannot create $newDIR : $!\n";
}
if(defined($opts{'s'})){ $INP_skipBLAST = 1 }
if(defined($opts{'b'}) && !$INP_skipBLAST){ $INP_BLASToptions = $opts{'b'} }
print "# results_directory=$newDIR\n\n";
## 2) concatenate $FAAEXT files and create csv file with protein2organism indexing
opendir(FAADIR,$INP_FASTAdir) || die "# $0 : cannot list $INP_FASTAdir, exit\n";
# quote and anchor the extension: the original /$FAAEXT/ treated '.' as a regex
# wildcard and matched the extension anywhere in the name (e.g. 'Xfaa.txt')
@faafiles = grep {/\Q$FAAEXT\E$/} readdir(FAADIR);
closedir(FAADIR); # original called close() on a directory handle, leaking it
die "# $0 : need at least two $FAAEXT files in $INP_FASTAdir\n"
if(scalar(@faafiles) < 2);
open(my $allfaa_fh, '>', "$newDIR/$allfaafile")
|| die "# $0 : cannot create $newDIR/$allfaafile\n";
open(my $p2o_fh, '>', "$newDIR/$p2ofilename")
|| die "# $0 : cannot create $newDIR/$p2ofilename, exit\n";
# sort filenames so protein numbering is reproducible (readdir order is arbitrary)
foreach my $faafile (sort @faafiles)
{
print "# reading $INP_FASTAdir/$faafile\n";
my $rhash_FASTA = read_FASTA_sequence("$INP_FASTAdir/$faafile");
foreach my $seq (sort {$a<=>$b} (keys(%$rhash_FASTA)))
{
$n_of_sequences++;
print {$p2o_fh} "$n_of_sequences,$faafile\n"; # csv file: protein -> organism
# concatenated FASTA with numbers as headers
print {$allfaa_fh} ">$n_of_sequences\n$rhash_FASTA->{$seq}{'SEQ'}\n";
# store number -> real header equivalence, could be done with BerkeleyDB
$sequence_data{$n_of_sequences} = $rhash_FASTA->{$seq}{'NAME'};
$sequence_data{$n_of_sequences.'PROT'} = $rhash_FASTA->{$seq}{'SEQ'};
}
$genomeused{$faafile} = 1;
}
# check writes on close: buffered write errors only surface here
close($p2o_fh) || die "# $0 : cannot close $newDIR/$p2ofilename : $!\n";
close($allfaa_fh) || die "# $0 : cannot close $newDIR/$allfaafile : $!\n";
## 3) run BLAST searches #########################################
# NOTE(review): commands are built by string interpolation and run through the
# shell; paths with spaces or metacharacters would break them.
if(!$INP_skipBLAST)
{
print "# Formatting BLAST database...\n";
# everything downstream needs the formatted database, so abort on failure
system("$FORMATDBEXE -i $newDIR/$allfaafile -o T") == 0
|| die "# $0 : formatdb failed, check $BLASTPATH\n";
if(-s 'formatdb.log'){ system("mv -f formatdb.log $newDIR") }
if($TWOBLASTRUNS)
{
# unfiltered all-vs-all search (-F F: no low-complexity masking)
print "# Running unfiltered BLAST...\n";
mkdir($newDIR.'/BLASTno');
system("$BLASTPGPEXE -d $newDIR/$allfaafile -i $newDIR/$allfaafile -F F -t F ".
"-o $newDIR/BLASTno/all.tab $INP_BLASToptions 2> " .
"$newDIR/BLASTno/blastlog_unfiltered");
}
print "# Running filtered BLAST...\n"; # see PMID: 20439257
mkdir($newDIR.'/BLASTff');
system("$BLASTPGPEXE -d $newDIR/$allfaafile -i $newDIR/$allfaafile -F T -t T ".
"-o $newDIR/BLASTff/all.tab $INP_BLASToptions 2> " .
"$newDIR/BLASTff/blastlog_filtered"); # log name fixed: was 'blastlog_unfiltered'
}
## 4) calculate COG triangles ######################################
print "\n# making COG hash...\n";
mkdir($newDIR.'/BLASTconv');
system("$MAKEHASHEXE -i=$newDIR/$p2ofilename -o=$newDIR/BLASTconv");
print "# reading BLAST files...\n";
# with two BLAST runs, scores (-s) and unfiltered hits (-u) come from BLASTno;
# otherwise the filtered results are used for every input
my $readblast_cmd = $TWOBLASTRUNS
? "$READBLASTEXE -d=$newDIR/BLASTconv -u=$newDIR/BLASTno  -f=$newDIR/BLASTff -s=$newDIR/BLASTno"
: "$READBLASTEXE -d=$newDIR/BLASTconv -u=$newDIR/BLASTff -f=$newDIR/BLASTff -s=$newDIR/BLASTff";
system($readblast_cmd);
print "# checking Lineage-specific expansions...\n";
# write every ordered pair of distinct genomes as a COGlse job
open(my $lse_fh, '>', "$newDIR/$lsefilename")
|| die "# $0 : cannot create $newDIR/$lsefilename, exit\n";
my @genome_list = sort(keys(%genomeused));
foreach my $g1 (@genome_list)
{
foreach my $g2 (@genome_list)
{
print {$lse_fh} "$g1,$g2\n" unless($g1 eq $g2);
}
}
close($lse_fh);
mkdir($newDIR.'/tmp');
system("$LSEEXE -t=$newDIR/tmp -d=$newDIR/BLASTconv -j=$newDIR/$lsefilename ".
" -p=$newDIR/$p2ofilename -o=$newDIR/$lseoutfilename > /dev/null");
print "# making COGs...\n"; # generates all-edges.txt and cog-edges.txt
system("$COGTRIANGLESEXE -i=$newDIR/BLASTconv -q=$newDIR/$p2ofilename " .
" -l=$newDIR/$lseoutfilename -o=$newDIR/$coglogfilename");
# $COGTRIANGLESEXE drops its edge files in the cwd (Jun2010 behaviour),
# so move them into the results directory
foreach my $edgefile ($cogedgesfilename, $edgesfilename)
{
system("mv -f $edgefile $newDIR") if(-e $edgefile);
}
## 5) group sequences sharing COG #################################
open(my $cog_fh, '<', "$newDIR/$cogedgesfilename") ||
die "# $0 : cannot read $newDIR/$cogedgesfilename, the program failed\n";
# %seen replaces the original per-line grep(/^$genome$/,...) de-duplication,
# which was O(n^2) and interpolated filenames (containing '.') into a regex unquoted
my %seen;
while(<$cog_fh>)
{
#cluster00332,912,Buch_aph_Bp.faa,405,Buch_aphid_Sg.faa
# NOTE(review): only the first protein of each edge (fields 1,2) is stored,
# as in the original; confirm cog-edges.txt lists every member as first element
($clusterid,$id,$genome) = (split(/,/,$_))[0,1,2];
push(@{$COGs{$clusterid}{'genomes'}},$genome)
if(!$seen{"G,$clusterid,$genome"}++);
push(@{$COGs{$clusterid}{'ids'}},$id)
if(!$seen{"I,$clusterid,$id"}++);
}
close($cog_fh);
print "\n# cluster list:\n";
mkdir($newDIR.'/clusters');
# sort numerically on the digits after the 7-char 'cluster' prefix
foreach $clusterid (sort {substr($a,7)<=>substr($b,7)} (keys(%COGs)))
{
print ">$newDIR/clusters/$clusterid.faa sequences=".
scalar(@{$COGs{$clusterid}{'ids'}}).
" genomes=".scalar(@{$COGs{$clusterid}{'genomes'}})."\n";
open(my $faa_fh, '>', "$newDIR/clusters/$clusterid.faa") ||
die "# $0 : cannot create $newDIR/clusters/$clusterid.faa, exit\n"; # message path fixed: was 'cogs/'
foreach $id (@{$COGs{$clusterid}{'ids'}})
{
# restore the original FASTA header and sequence stored during step 2
print {$faa_fh} ">$sequence_data{$id} ($clusterid)\n$sequence_data{$id.'PROT'}\n";
}
close($faa_fh) || die "# $0 : cannot close $newDIR/clusters/$clusterid.faa : $!\n";
}
print "\n# total COGs = ".scalar(keys(%COGs))." in folder $newDIR/clusters\n";
exit(0);
################################################################
sub read_FASTA_sequence
{
# Parses a FASTA file and returns a hash reference keyed by input order:
# { 1 => {'NAME'=>header_without_'>', 'SEQ'=>sequence}, 2 => ... }
# Whitespace and digits are stripped from sequences. Dies if $infile
# cannot be opened.
my ($infile) = @_;
my ($n_of_sequences,%FASTA) = (0);
open(my $fasta_fh, '<', $infile)
|| die "# read_FASTA_sequence: cannot read $infile $!:\n";
while(<$fasta_fh>)
{
# header: original /^\>(.*?)\n/ missed a final line without newline
# and kept the \r of CRLF files inside NAME
if(/^>(.*)/)
{
my $header = $1;
$header =~ s/\r$//; # tolerate CRLF input
$n_of_sequences++;
$FASTA{$n_of_sequences}{'NAME'} = $header;
}
else
{
next if(!$n_of_sequences); # skip junk before the first header (was stored under key 0)
$FASTA{$n_of_sequences}{'SEQ'} .= $_;
# strip whitespace and position numbers; the original class [\s|\n|\d]
# also removed literal '|' characters by mistake
$FASTA{$n_of_sequences}{'SEQ'} =~ s/[\s\d]//g;
}
}
close($fasta_fh);
return(\%FASTA);
}
No hay comentarios:
Publicar un comentario