Una situación cada vez más habitual en genómica y proteómica es la redundancia de datos. Con las modernas técnicas de secuenciación de alto rendimiento se generan gigas y gigas de datos que es necesario filtrar y corregir para obtener resultados inteligibles.
El problema más habitual y sencillo es la redundancia de secuencias, cuando en un mismo fichero aparecen secuencias idénticas con identificadores diferentes. En otras ocasiones el problema es el contrario, identificadores idénticos apuntan a secuencias parciales de una misma secuencia (especialmente en datos de proteómica o transcriptómica).
El siguiente script filtra un fichero de sequencias en formato FASTA, dos tipos de filtro son posibles: 1) filtrar las secuencias redundantes (repetidas), 2) filtrar las secuencias de transcritos parciales (recuperando únicamente los transcritos de mayor secuencia en ficheros de proteínas)
#!/usr/bin/perl
my ($infile) = @ARGV;
my $outfile = filter_fasta_file($infile,['longtrans','nonredundant']);
# Create a FASTA file with custom filtered sequences
sub filter_fasta_file {
my ($infile, $filter, $outfile) = @_;
if (!defined($outfile)){$outfile=$infile.'.filtered'}
open(INFILE,"$infile");
open(OUTFILE,">$outfile");
my ($sequences,$names,$headers) = read_FASTA_file($infile);
my (@previous_sequences,@previous_names,@previous_headers);
# my (@filtered_sequences,@filtered_names,@fitered_headers);
for (my $i=0; $i<=$#{$headers}; $i++){
my $write=1;
if (in_array($filter,'nonredundant')){
my ($found,$posic)=in_array($sequences, $sequences->[$i], 1);
foreach my $pos (@{$posic}){
if ($i != $pos && $i > $pos){
$write = 0; last;
}
}
}
if (in_array($filter,'longtrans')){
$names->[$i] =~ /([^\.]+)\.?(\d?)/;
my $name = $1;
my $variant = $2;
my ($found,$posic)=in_array($names, "/$1/", 1);
foreach my $pos (@{$posic}){
if ($i != $pos){
if (length($sequences->[$i]) < length($sequences->[$pos])) {
$write = 0; last;
}
if (length($sequences->[$i]) == length($sequences->[$pos]) && $sequences->[$i] eq $sequences->[$pos]){
$names->[$pos] =~ /([^\.]+)\.?(\d?)/;
my $name2 = $1;
my $variant2 = $2;
if ($variant2 < $variant){
$write = 0; last;
}
}
}
}
}
if ($write) {
print OUTFILE $headers->[$i]."\n".$sequences->[$i]."\n";
}
}
close INFILE;
close OUTFILE;
return $outfile;
}
#################################################################################
# Create 3 array references with sequences, names and headers from a FASTA file
sub read_FASTA_file
{
my($FASTAfile,$full_header,$allow_empty_sequences,$allow_multi_rows) = @_;
if(!$full_header){ $full_header = $allow_empty_sequences = 0 }
if(!$allow_empty_sequences){ $allow_empty_sequences = 0 }
if(!$allow_multi_rows){ $allow_multi_rows = 0 }
my (@sequences,@names,@headers,$seq,$seqname,$header);
my $headerOK = 0;
open(FASTA,$FASTAfile) || die "# read_FASTA_file : cannot find $FASTAfile\n";
$seq = '';
while(<FASTA>){
next if(/^#/); # allow comments
s/\012\015?|\015\012?|^\s+|\s+$//;
if(/>/){
$headerOK++;
if($seq || ($allow_empty_sequences && $seqname)){
if(!$allow_multi_rows){
$seq =~ s/\d|\s//g;
}
push(@sequences,$seq); # store this FASTA sequence
push(@names,$seqname);
push(@headers,$header);
$seq='';
}
if($full_header){
$seqname = substr($_,1);
}else { # get next sequence name
if (/^>\s*([^\t|^\||^;]+)/){
$header = $_;
$seqname = $1;
if (length($seqname)>32 && $seqname =~ /^([^\s]+)+\s/){
$seqname = $1;
}
$seqname =~ s/^\s+|\s+$// ;
}
}
}else{
if(!$allow_multi_rows){
s/\s+//g;
}
$seq .= $_;
}
}
close(FASTA);
# take care of last FASTA sequence
push(@sequences,$seq);
push(@names,$seqname);
push(@headers,$header);
if(!$headerOK){ @sequences = @names = @headers = (); } #Aug2007
return (\@sequences,\@names,\@headers);
}
#################################################################################
# Searchs a string inside an array
sub in_array {
my ($array,$search_for,$search_posic) = @_;
# my %items = map {$_ => 1} @$array; # create a hash out of the array values
# my $exist = (exists($items{$search_for}))?1:0;
# return (exists($items{$search_for}))?1:0;
my @posic;
# If search a pattern
if ($search_for =~ /^\/.+\/$/){
$search_for =~ s/^\/|\/$//g;
@posic = grep { $array->[$_] =~ /$search_for/ } 0 .. $#{$array};
} else {
@posic = grep { $array->[$_] eq $search_for } 0 .. $#{$array};
}
if (defined($search_posic) && $search_posic){
(scalar @posic)?return (1,\@posic):return (0);
} else {
(scalar @posic)?return 1:return 0;
}
}
An example FASTA file to test the script:
>AT1G51370.2 | Symbols: | F-box family protein | chr1:19045615-19046748 FORWARD
MVGGKKKTKICDKVSHEEDRISQLPEPLISEILFHLSTKDSVRTSALSTKWRYLWQSVPG
LDLDPYASSNTNTIVSFVESFFDSHRDSWIRKLRLDLGYHHDKYDLMSWIDAATTRRIQH
LDVHCFHDNKIPLSIYTCTTLVHLRLRWAVLTNPEFVSLPCLKIMHFENVSYPNETTLQK
LISGSPVLEELILFSTMYPKGNVLQLRSDTLKRLDINEFIDVVIYAPLLQCLRAKMYSTK
NFQIISSGFPAKLDIDFVNTGGRYQKKKVIEDILIDISRVRDLVISSNTWKEFFLYSKSR
PLLQFRYISHLNARFYISDLEMLPTLLESCPKLESLILVMSSFNPS*
>AT1G70790.1 | Symbols: | C2 domain-containing protein | chr1:26700724-26702127 FORWARD
MEDKPLGILRVHVKRGINLAIRDATTSDPYVVITLANQKLKTRVINNNCNPVWNEQLTLS
IKDVNDPIRLTVFDKDRFSGDDKMGDAEIDFRPFLEAHQMELDFQKLPNGCAIKRIRPGR
TNCLAEESSITWSNGKIMQEMILRLKNVECGEVELMLEWTDGPGCKGLGREGSKKTPWMP
TKRLD*
>AT1G50920.1 | Symbols: | GTP-binding protein-related | chr1:18870555-18872570 FORWARD
MVQYNFKRITVVPNGKEFVDIILSRTQRQTPTVVHKGYKINRLRQFYMRKVKYTQTNFHA
KLSAIIDEFPRLEQIHPFYGDLLHVLYNKDHYKLALGQVNTARNLISKISKDYVKLLKYG
DSLYRCKCLKVAALGRMCTVLKRITPSLAYLEQIRQHMARLPSIDPNTRTVLICGYPNVG
KSSFMNKVTRADVDVQPYAFTTKSLFVGHTDYKYLRYQVIDTPGILDRPFEDRNIIEMCS
ITALAHLRAAVLFFLDISGSCGYTIAQQAALFHSIKSLFMNKPLVIVCNKTDLMPMENIS
EEDRKLIEEMKSEAMKTAMGASEEQVLLKMSTLTDEGVMSVKNAACERLLDQRVEAKMKS
KKINDHLNRFHVAIPKPRDSIERLPCIPQVVLEAKAKEAAAMEKRKTEKDLEEENGGAGV
YSASLKKNYILQHDEWKEDIMPEILDGHNVADFIDPDILQRLAELEREEGIREAGVEEAD
MEMDIEKLSDEQLKQLSEIRKKKAILIKNHRLKKTVAQNRSTVPRKFDKDKKYTTKRMGR
ELSAMGLDPSSAMDRARSKSRGRKRDRSEDAGNDAMDVDDEQQSNKKQRVRSKSRAMSIS
RSQSRPPAHEVVPGEGFKDSTQKLSAIKISNKSHKKRDKNARRGEADRVIPTLRPKHLFS
GKRGKGKTDRR*
>AT1G70795.1 | Symbols: | C2 domain-containing protein | chr1:26700724-26702127 FORWARD
MEDKPLGILRVHVKRGINLAIRDATTSDPYVVITLANQKLKTRVINNNCNPVWNEQLTLS
IKDVNDPIRLTVFDKDRFSGDDKMGDAEIDFRPFLEAHQMELDFQKLPNGCAIKRIRPGR
TNCLAEESSITWSNGKIMQEMILRLKNVECGEVELMLEWTDGPGCKGLGREGSKKTPWMP
TKRLD*
>AT1G70790.2 | Symbols: | C2 domain-containing protein | chr1:26700724-26702127 FORWARD
MEDKPLGILRVHVKRGINLAIRDATTSDPYVVITLANQKLKTRVINNNCNPVWNEQLTLS
IKDVNDPIRLTVFDKDRFSGDDKMGDAEIDFRPFLEAHQMELDFQKLPNGCAIKRIRPGR
TNCLAEESSITWSNGKIMQEMILRLKNVECGEVELMLEWTDGPGCKGLGREGSKKTPWMP
TKRLD*
>AT1G36960.1 | Symbols: | unknown protein | chr1:14014796-14015508 FORWARD
MTRLLPYKGGDFLGPDFLTFIDLCVQVRGIPLPYLSELTVSFIAGTLGPILEMEFNQDTS
TYVAFIRVKIRLVFIDRLRFFRREEAAASNTITDQTHMTSSNSSDISPASPISQPPLPAS
LPSHDSYFDAGIQASRLVNPRAISQHHFSSSYSDFKGKEKAKIKIGECSKRKKDKQVDSG
T*
>AT1G51370.1 | Symbols: | F-box family protein | chr1:19045615-19047141 FORWARD
MVGGKKKTKICDKVSHEEDRISQLPEPLISEILFHLSTKDSVRTSALSTKWRYLWQSVPG
LDLDPYASSNTNTIVSFVESFFDSHRDSWIRKLRLDLGYHHDKYDLMSWIDAATTRRIQH
LDVHCFHDNKIPLSIYTCTTLVHLRLRWAVLTNPEFVSLPCLKIMHFENVSYPNETTLQK
LISGSPVLEELILFSTMYPKGNVLQLRSDTLKRLDINEFIDVVIYAPLLQCLRAKMYSTK
NFQIISSGFPAKLDIDFVNTGGRYQKKKVIEDILIDISRVRDLVISSNTWKEFFLYSKSR
PLLQFRYISHLNARFYISDLEMLPTLLESCPKLESLILEMVKNQSTRRHGEKREPNVMVS
TVPWCLVSSLKFVELKRSIPRYEGEMELVRYVLTNSTVLKKLRLNVYYTKKAKCAFLTEL
VAIPRCSSTCVVLVL*