This example shows how the (sequence) information contained in a CIF file can be readily accessed and transformed into another format. This particular example implements a FASTA converter, which reads the monomer sequences in the entity_poly_seq category and translates them into the single-letter FASTA format. As there are single-letter codifications already in the CIF file, in the entity_poly category, this example also shows how one might easily compare their translations with the codifications already in the CIF file.
Save FASTA.py and save the CIF data file. Run python FASTA.py /path/to/file.cif
from pdbx.reader.PdbxContainers import ContainerBase
from pdbx.reader.PdbxContainers import DataCategory
getObj(self, name)
Returns the DataCategory object specified by name.hasAttribute(self, attributeName)
Returns a bool indicating whether or not the category has the attribute attributeName.getValue(self, attributeName=None, rowIndex=None)
Returns the value of the attribute attributeName at row index rowIndex.getRowCount(self)
Returns the number of rows in the category table.Output of the command: python FASTA.py 5HVP.cif
For entity #1:
The codified version in the CIF file is as follows:
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT
PVNIIGRNLLTQIGCTLNF
The codified version obtained from translating entity_poly_seq is as follows:
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT
PVNIIGRNLLTQIGCTLNF
The two are equivalent.
---
For entity #2:
The codified version in the CIF file is as follows:
XVVXAX
The codified version obtained from translating entity_poly_seq is as follows:
XVVXAX
The two are equivalent.
---
more 5HVP.fasta
>HIV-1 PROTEASE
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPT
PVNIIGRNLLTQIGCTLNF
>ACETYL-*PEPSTATIN
XVVXAX
""" FASTA.py For some CIF file, generate a FASTA file based on the monomer sequences stored in the struct_poly_seq category table. Optionally compare our codification with the one stored in the struct_poly category table. Lines with superscriptions contain footnoted references or explanations. """ from os.path import splitext from pdbx.reader.PdbxReader import PdbxReader from pdbx.reader.PdbxContainers import * from sys import argv, exit # Amino acid FASTA codification dictionary codification = { "ALA" : 'A', "CYS" : 'C', "ASP" : 'D', "GLU" : 'E', "PHE" : 'F', "GLY" : 'G', "HIS" : 'H', "ILE" : 'I', "LYS" : 'K', "LEU" : 'L', "MET" : 'M', "ASN" : 'N', "PYL" : 'O', "PRO" : 'P', "GLN" : 'Q', "ARG" : 'R', "SER" : 'S', "THR" : 'T', "SEC" : 'U', "VAL" : 'V', "TRP" : 'W', "TYR" : 'Y' } # Compare our translations against the codifications stored in the CIF file compare = True # Check for improper usage if len(argv) != 2 : exit("Usage: python FASTA.py /path/to/file.cif") # Open the CIF file cif = open(argv[1]) # A list to be propagated with data blocks data = [] # Create a PdbxReader object pRd = PdbxReader(cif) # Read the CIF file, propagating the data list pRd.read(data) # Close the CIF file, as it is no longer needed cif.close() # Retrieve the first data block block = data[0] # Retrieve the entity category table, which contains information that will be used in the FASTA header1 entity = block.getObj("entity") # Holds non-mandatory entity attributes that could serve as FASTA header lines, ordered preferentially candidates = ["pdbx_description", "details", "type"] # The entity category table attribute to be used to fill the FASTA header headerDescriptor = "" # Set the header descriptor for c in candidates : if entity.hasAttribute(c) : headerDescriptor = c break # If none of the optional descriptors are present, just use the entity id if not headerDescriptor: headerDescriptor = "id" # Retrieve the entity_poly_seq category table, which containers the monomer sequences for entities2 entity_poly_seq = block.getObj("entity_poly_seq") # Use the CIF file pathname to generate the FASTA file (.fasta) pathname (file, ext) = splitext(argv[1]) fastaFileName = file + ".fasta" # Open the FASTA file for writing fasta = open(fastaFileName, "w") # A container to hold each codified entity for optional comparison codifiedEntities = [] # Track the current entity currentEntity = 1 # Track the column width columnWidth = 0 # Codification of the current entity codifiedEntity = "" # Write the first header to the FASTA file fasta.write(">%s\n" % entity.getValue(headerDescriptor, currentEntity - 1)) # Iterate over every row in entity_poly_seq, each containing an entity monomer for index in range(entity_poly_seq.getRowCount()) : # Obtain the ID of the entity monomer described by this row tempEntity = (int)(entity_poly_seq.getValue("entity_id", index)) # If we are dealing with a new entity if currentEntity != tempEntity : # Write out the current entity's FASTA codification fasta.write(codifiedEntity) # Store the current entity's FASTA codification codifiedEntities.append(codifiedEntity) # Set the new entity ID currentEntity = tempEntity # Write out the new header fasta.write("\n>%s\n" % entity.getValue(headerDescriptor, currentEntity - 1)) columnWidth = 0 codifiedEntity = "" # If we have hit the maximum column width if columnWidth == 80 : # Move to a new line and reset the width codifiedEntity += "\n" columnWidth = 0 # Retrieve the monomer stored in this row monomer = entity_poly_seq.getValue("mon_id", index) # If the monomer is an amino acid if len(monomer) == 3 : # If it's in the codification dictionary, add it if codification.has_key(monomer) : codifiedEntity += codification[monomer] # Otherwise, use the default value "X" else : codifiedEntity += "X" # If it's a nucleic acid, there is nothing to codify else : codifiedEntity += monomer columnWidth += 1 # Write out and store the last entity fasta.write(codifiedEntity) codifiedEntities.append(codifiedEntity) # Close the FASTA file fasta.close() # Optional comparison against the codified sequences stored in entity_poly if compare : # Retrieve the table corresponding to the entity_poly category, which contains one letter # codified canonical sequences for entities, against which we can compare our conversions3 entity_poly = block.getObj("entity_poly") # Compare every codification with the one in the CIF file for i in range(len(codifiedEntities)) : code = entity_poly.getValue("pdbx_seq_one_letter_code_can", i) print "For entity #%d: " % (i + 1) print "\nThe codified version in the CIF file is as follows: " print code print "\nThe codified version obtained from translating entity_poly_seq is as follows: " print codifiedEntities[i] equality = (code == codifiedEntities[i]) and "equivalent" or "inequivalent" print "\nThe two are %s." % (equality) print "---"