Tras leer el artículo que sale hoy en Nature sobre la
expansión del espacio de secuencias de proteínas, que se asemeja a la expansión del Universo tal como la imaginaba
Hubble, he vuelto a encontrarme con el algoritmo COGs (Clusters of Orthologous Groups), originalmente creado por Tatusov en 1997 y actualizado por
David Kristensen hace unas semanas. En esencia este algoritmo permite definir grupos de secuencias (potencialmente) ortólogas entre 3 o más genomas a partir de los resultados de BLAST de todos contra todos. Para ello utiliza el concepto de los 'bidirectional best hits', que se dibuja como flechas bidireccionales en esta figura, tomada del paper de Kristensen:
El código que publico hoy permite calcular COGs a partir de un directorio con archivos FASTA con secuencias de proteínas, utilizando los binarios del algoritmo COGtriangles, que se pueden obtener de
SourceForge, y los de BLAST, que podemos descargar del
NCBI.
#!/usr/bin/perl -w
# prog5.pl
# Script que ejecuta el algoritmo COGtriangle sobre los archivos FASTA contenidos
# en el directorio -d, basado en el código original de David Kristensen (Jun 2010).
# El programa asume que cada archivo .faa contiene CDS de genomas distintos
# y puede agrupar la misma secuencia en mas de un cluster, cuando sea multidominio.
use strict 'subs';
use Getopt::Std;
use File::Basename;
## 0) global variables, including binaries to be installed
## and parameters that affect the whole algorithm
# URL: ftp://ftp.ncbi.nih.gov/blast/executables/
my $BLASTPATH = '/home/whatever/blast-2.2.18/';
my $FORMATDBEXE = $BLASTPATH . 'bin/formatdb';
my $BLASTPGPEXE = $BLASTPATH . 'bin/blastpgp -I T -m 9 -b 1000 -v 1000 -z 100000000';
# URL: http://sourceforge.net/projects/cogtriangles/files/
my $COGPATH = '/home/soft/COGsoft/';
my $MAKEHASHEXE = $COGPATH . "COGmakehash/COGmakehash -s=',' -n=1";
my $READBLASTEXE= $COGPATH . 'COGreadblast/COGreadblast -e=10 -q=2 -t=2';
my $LSEEXE = $COGPATH . 'COGlse/COGlse';
my $COGTRIANGLESEXE=
$COGPATH . "COGtriangles/COGtriangles -t=0.5 -e=10 -n='cluster' -s=1";
# the default COGtriangle algorithm requires two BLAST searches
my $TWOBLASTRUNS=1; # set to 0 to perform only a filtered BLAST search
# extension of valid files in -d directories
my $FAAEXT = '.faa';
## 1) process command-line arguments ##############################
my ($INP_FASTAdir,$INP_skipBLAST,$INP_BLASToptions,$newDIR,$id) = ('',0,'');
my ($n_of_sequences,$pwd,$genome,$clusterid,%sequences,%genomeused,%COGs) = (0);
my ($allfaafile,$p2ofilename,$lsefilename,$lseoutfilename,$coglogfilename) =
('all.faa','all.p2o.csv','all.lsejob.csv','all.lse.csv','all.cog.clusters.log');
my ($cogedgesfilename,$edgesfilename) = ('cog-edges.txt','all-edges.txt');
getopts('hsb:d:', \%opts);
if(($opts{'h'})||(scalar(keys(%opts))==0))
{
print "usage: $0 [options]\n";
print " -h this message\n";
print " -d input directory with FASTA .faa files (required, example -d genomes)\n";
print " -s skip BLAST and re-use previous searches (optional)\n";
die " -b extra blastpgp options (optional, example -b '-a 2' to use 2 CPUs)\n";
}
if(!defined($opts{'d'})
|| !-d $opts{'d'}){ die "# $0 : need a valid FASTA directory, exit\n"; }
else
{
$INP_FASTAdir = $opts{'d'};
if(substr($INP_FASTAdir,length($INP_FASTAdir)-1,1) eq '/'){ chop $INP_FASTAdir }
$pwd = `pwd`; chomp $pwd; $pwd .= '/';
$newDIR = $pwd.basename($INP_FASTAdir)."_COGs";
mkdir($newDIR) if(!-d $newDIR);
}
if(defined($opts{'s'})){ $INP_skipBLAST = 1 }
if(defined($opts{'b'}) && !$INP_skipBLAST){ $INP_BLASToptions = $opts{'b'} }
print "# results_directory=$newDIR\n\n";
## 2) concatenate $FAAEXT files and create csv file with protein2organism indexing
opendir(FAADIR,$INP_FASTAdir) || die "# $0 : cannot list $INP_FASTAdir, exit\n";
@faafiles = grep {/$FAAEXT/} readdir(FAADIR);
close(FAADIR);
die "# $0 : need at least two $FAAEXT files in $INP_FASTAdir\n"
if(scalar(@faafiles)<2);
open(ALLFAA,">$newDIR/$allfaafile")
|| die "# $0 : cannot create $newDIR/$allfaafile\n";
open(PROTORG,">$newDIR/$p2ofilename")
|| die "# $0 : cannot create $newDIR/$p2ofilename, exit\n";
foreach my $faafile (@faafiles)
{
print "# reading $INP_FASTAdir/$faafile\n";
my $rhash_FASTA = read_FASTA_sequence("$INP_FASTAdir/$faafile");
foreach my $seq (sort {$a<=>$b} (keys(%$rhash_FASTA)))
{
$n_of_sequences++;
print PROTORG "$n_of_sequences,$faafile\n"; # csv file
#concatenated FASTA with numbers as headers
print ALLFAA ">$n_of_sequences\n$rhash_FASTA->{$seq}{'SEQ'}\n";
#store number - real header equivalence, could be done with BerkeleyDB
$sequence_data{$n_of_sequences} = $rhash_FASTA->{$seq}{'NAME'};
$sequence_data{$n_of_sequences.'PROT'} = $rhash_FASTA->{$seq}{'SEQ'};
}
$genomeused{$faafile} = 1;
}
close(PROTORG);
close(ALLFAA);
## 3) run BLASTs searches #########################################
if(!$INP_skipBLAST)
{
print "# Formatting BLAST database...\n";
system("$FORMATDBEXE -i $newDIR/$allfaafile -o T");
if(-s 'formatdb.log'){ system("mv -f formatdb.log $newDIR") }
if($TWOBLASTRUNS)
{
print "# Running unfiltered BLAST...\n";
mkdir($newDIR.'/BLASTno');
system("$BLASTPGPEXE -d $newDIR/$allfaafile -i $newDIR/$allfaafile -F F -t F ".
"-o $newDIR/BLASTno/all.tab $INP_BLASToptions 2> " .
"$newDIR/BLASTno/blastlog_unfiltered");
}
print "# Running filtered BLAST...\n"; # see PMID: 20439257
mkdir($newDIR.'/BLASTff');
system("$BLASTPGPEXE -d $newDIR/$allfaafile -i $newDIR/$allfaafile -F T -t T ".
"-o $newDIR/BLASTff/all.tab $INP_BLASToptions 2> " .
"$newDIR/BLASTff/blastlog_unfiltered");
}
## 4) calculate COG triangles ######################################
print "\n# making COG hash...\n";
mkdir($newDIR.'/BLASTconv');
system("$MAKEHASHEXE -i=$newDIR/$p2ofilename -o=$newDIR/BLASTconv");
print "# reading BLAST files...\n";
if($TWOBLASTRUNS)
{
system("$READBLASTEXE -d=$newDIR/BLASTconv -u=$newDIR/BLASTno " .
" -f=$newDIR/BLASTff -s=$newDIR/BLASTno");
}
else{ system("$READBLASTEXE -d=$newDIR/BLASTconv -u=$newDIR/BLASTff ".
"-f=$newDIR/BLASTff -s=$newDIR/BLASTff"); }
print "# checking Lineage-specific expansions...\n";
open (LSEIN, ">$newDIR/$lsefilename")
|| die "# $0 : cannot create $newDIR/$lsefilename, exit\n";
foreach $genome (sort(keys(%genomeused)))
{
foreach my $genome2 (sort(keys(%genomeused)))
{
next if($genome eq $genome2);
print LSEIN "$genome,$genome2\n";
}
}
close LSEIN;
mkdir($newDIR.'/tmp');
system("$LSEEXE -t=$newDIR/tmp -d=$newDIR/BLASTconv -j=$newDIR/$lsefilename ".
" -p=$newDIR/$p2ofilename -o=$newDIR/$lseoutfilename > /dev/null");
print "# making COGs...\n"; # genera all-edges.txt , cog-edges.txt
system("$COGTRIANGLESEXE -i=$newDIR/BLASTconv -q=$newDIR/$p2ofilename " .
" -l=$newDIR/$lseoutfilename -o=$newDIR/$coglogfilename");
# $COGTRIANGLESEXE puts them in cwd, Jun2010!
if(-e $cogedgesfilename){ system("mv -f $cogedgesfilename $newDIR") }
if(-e $edgesfilename){ system("mv -f $edgesfilename $newDIR") }
## 5) group sequences sharing COG #################################
open(COGS,"$newDIR/$cogedgesfilename") ||
die "# $0 : cannot read $newDIR/$cogedgesfilename, the program failed\n";
while(<COGS>)
{
#cluster00332,912,Buch_aph_Bp.faa,405,Buch_aphid_Sg.faa
my @data = split(/,/,$_);
($clusterid,$id,$genome) = @data[0,1,2];
push(@{$COGs{$clusterid}{'genomes'}},$genome)
if(!grep(/^$genome$/,@{$COGs{$clusterid}{'genomes'}}));
push(@{$COGs{$clusterid}{'ids'}},$id)
if(!grep(/^$id$/,@{$COGs{$clusterid}{'ids'}}));
}
close(COGS);
print "\n# cluster list:\n";
mkdir($newDIR.'/clusters');
foreach $clusterid (sort {substr($a,7)<=>substr($b,7)} (keys(%COGs)))
{
print ">$newDIR/clusters/$clusterid.faa sequences=".
scalar(@{$COGs{$clusterid}{'ids'}}).
" genomes=".scalar(@{$COGs{$clusterid}{'genomes'}})."\n";
open(COGFAA,">$newDIR/clusters/$clusterid.faa") ||
die "# $0 : cannot create $newDIR/cogs/$clusterid.faa, exit\n";
foreach $id (@{$COGs{$clusterid}{'ids'}})
{
print COGFAA ">$sequence_data{$id} ($clusterid)\n$sequence_data{$id.'PROT'}\n";
}
close(COGFAA);
}
print "\n# total COGs = ".scalar(keys(%COGs))." in folder $newDIR/clusters\n";
exit(0);
################################################################
sub read_FASTA_sequence
{
my ($infile) = @_;
my ($n_of_sequences,%FASTA) = (0);
open(FASTA,$infile) || die "# read_FASTA_sequence: cannot read $infile $!:\n";
while(<FASTA>)
{
if(/^\>(.*?)\n/)
{
$n_of_sequences++;
$FASTA{$n_of_sequences}{'NAME'} = $1;
}
else
{
$FASTA{$n_of_sequences}{'SEQ'} .= $_;
$FASTA{$n_of_sequences}{'SEQ'} =~ s/[\s|\n|\d]//g;
}
}
close(FASTA);
return(\%FASTA);
}