30 de agosto de 2011

Decodificando la Gene Ontology (C/C++)

Hola,
como continuación de la entrada anterior os pongo hoy código en C/C++ para hacer exactamente la misma tarea, aprovechando las estructuras de datos e iteradores de la Standard Template Library (STL). En mis pruebas, esta implementación es aproximandamente el doble de rápida que la de Perl. Por cierto, se compila con:  
$ gcc -o get_go_annotation get_go_annotation.cpp myGOparser.cpp -lstdc++

El código fuente incluye 3 ficheros:


1) get_go_annotation.cpp

 #include <stdio.h>  
 #include <stdlib.h>  
 #include <string>  
 #include <map>  
 #include "myGOparser.h"  
 using namespace std;  
   
 #define DEFFLATFILE "/path/to/gene_ontology.1_2.obo";  
   
 int main(int argc, char *argv[])  
 {   
    if(argc == 1)  
    {  
       printf("# usage: ./get_go_annotation <GO:0046983>\n\n");  
       exit(-1);  
    }  
      
    string input_term = argv[1];  
    string flatfile_name = DEFFLATFILE;   
    int n_of_records = 0;  
    map <string, GOnode> parsedGO;  
      
    printf("# parsing GO flat file ... ");  
    n_of_records = parse_GO_flat_file(flatfile_name, parsedGO);  
    printf("done (%d records)\n\n",n_of_records);  
      
    string annot = get_full_annotation_for_term(input_term,parsedGO);  
    printf("%s\n",annot.c_str());  
      
    exit(0);  
 }  

2) myGOparser.h

 /* Bruno Contreras-Moreira, 2011 EEAD/CSIC */   
 #include <stdio.h>  
 #include <stdlib.h>  
 #include <string>  
 #include <vector>  
 #include <map>  
   
 using namespace std;  
   
 #define LINE 400  
 #define FIELD 200  
   
 struct GOnode   
 {  
    string name;  
    vector <string> parents;     
 };  
   
 int parse_GO_flat_file(string &filename, map <string, GOnode> &GO);  
   
 string get_full_annotation_for_term(string &term, map <string, GOnode> &GO);  

3) myGOparser.cpp

 // Bruno Contreras-Moreira, 2011 EEAD/CSIC  
 #include <string.h>  
 #include <map>  
 #include "myGOparser.h"  
   
 using namespace std;  
   
 int parse_GO_flat_file(string &filename, map <string, GOnode> &GO)  
 {  
    // 1) parse flat GO file  
    FILE * gofile = fopen(filename.c_str(),"r");  
    if(gofile == NULL)  
    {  
       printf("# parse_GO_flat_file : cannot read %s, exit ...\n",  
          filename.c_str());  
       return 0;  
    }  
      
    int n_of_records = 0;  
    char line[LINE], field[FIELD];  
    string term,name,alt_id,parent;  
    map <string, string> synonym;  
    while( fgets(line, LINE, gofile) != NULL )  
    {  
      /*[Term]  
       id: GO:0006355  
       name: regulation of transcription, DNA-dependent  
       namespace: biological_process  
       alt_id: GO:0032583  
       is_a: GO:0010468 ! regulation of gene expression */  
         
       if(strncmp(line,"id:",3) == 0)  
       {   
          sscanf(line,"id: %s", (char *) &field);  
          term = field;   
          n_of_records++;  
       }  
       else if(strncmp(line,"name:",5) == 0)   
       {  
          strncpy(field,line+6,FIELD-1);  
          field[strcspn (field,"\n")] = '\0'; // chomp  
          name = field;   
          GO[term].name = name;  
       }  
       else if(strncmp(line,"alt_id:",7) == 0)  
       {  
          sscanf(line,"alt_id: %s",(char *) &field);  
          alt_id = field;  
          synonym[alt_id] = term;   
       }  
       else if(strncmp(line,"is_a:",4) == 0)  
       {  
          sscanf(line,"is_a: %s",(char *) &field);  
          parent = field;     
          GO[term].parents.push_back(parent);  
       }  
    }  
    fclose(gofile);  
      
    // 2) link synonims  
    map <string, string>::iterator syn;  
    for(syn = synonym.begin(); syn != synonym.end(); ++syn )   
       GO[syn->first] = GO[syn->second];  
      
    return n_of_records;  
 }  
   
 string get_full_annotation_for_term(string &term, map <string, GOnode> &GO)  
 {  
    string annot, pterm;  
    vector <string>::iterator pit;  
      
    if(GO.find(term) == GO.end())  
    {  
       annot = "[cannot find GO term ";  
       annot += term;  
       annot += "]";  
    }  
    else  
    {  
       if(GO[term].parents.size() == 0)  
       {  
          annot += GO[term].name;  
          annot += "(";  
          annot += term;  
          annot += ") | ";  
       }  
       else  
       {  
          for(pit=GO[term].parents.begin();pit != GO[term].parents.end();++pit)  
          {      
             annot += GO[term].name;  
             annot += "(";  
             annot += term;  
             annot += "), ";  
             annot += get_full_annotation_for_term(*pit,GO);  
            
          }  
       }     
    }  
      
    return annot;  
 }  

Un saludo, Bruno


No hay comentarios:

Publicar un comentario