A more involved and extensive example that uses the PDBX parsing Python library to generate a CIF file for each biological
assembly listed in the pdbx_struct_assembly
category of a CIF file. This example synthesizes information located in the
pdbx_struct_assembly_gen,
pdbx_struct_oper_list, and
atom_site categories to accomplish this task.
For every assembly in pdbx_struct_assembly_gen
, the program retrieves the operation expression and applies the
operations it specifies (which are stored in pdbx_struct_oper_list
) to certain coordinates.
Save Assemblies.py and the CIF data file. Run python Assemblies.py /path/to/file.cif
, which generates a
/path/to/2BUK-ASSEMBLY.cif
file for each assembly listed in pdbx_struct_assembly
from pdbx.reader.PdbxContainers import ContainerBase
from pdbx.reader.PdbxContainers import DataCategory
getObj(self, name)
Returns the DataCategory object specified by name.getAttributeList(self)
Returns a list of attribute/data item names.getRowCount(self)
Returns the number of rows in the category table.getValue(self, attributeName=None, rowIndex=None)
Returns the value of the attribute attributeName at row index rowIndex.getFullRow(self, index)
Attempts to fetch the full row at index index and returns an empty list if it fails.setValue(self, value, attributeName=None, rowIndex=None)
Sets a value for the attribute specified by
attributeName at the row index rowIndex./************************* * Assemblies.py * * For some CIF file, generate a complete CIF file for every assembly * listed in the pdbx_struct_assembly category table by performing * rotation and translation matrix operations and creating a new atom_site * category table for each assembly. * * Lines with superscriptions contain footnoted references or explanations. *************************/ import copy from os.path import splitext from pdbx.reader.PdbxReader import PdbxReader from pdbx.writer.PdbxWriter import PdbxWriter from pdbx.reader.PdbxContainers import * from sys import argv def parseOperationExpression(expression) : operations = [] stops = [ "," , "-" , ")" ] currentOp = "" i = 1 # Iterate over the operation expression while i in range(1, len(expression) - 1): pos = i # Read an operation while expression[pos] not in stops and pos < len(expression) - 1 : pos += 1 currentOp = expression[i : pos] # Handle single operations if expression[pos] != "-" : operations.append(currentOp) i = pos # Handle ranges if expression[pos] == "-" : pos += 1 i = pos # Read in the range's end value while expression[pos] not in stops : pos += 1 end = int(expression[i : pos]) # Add all the operations in [currentOp, end] for val in range((int(currentOp)), end + 1) : operations.append(str(val)) i = pos i += 1 return operations def prepareOperation(oper_list, op1index, op2index) : # Prepare matrices for operations 1 & 2 op1 = [[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 1]] op2 = [[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 1]] # Fill the operation matrices for operations 1 & 2 for i in range(3) : op1[i][3] = float(oper_list.getValue("vector[" + str(i + 1) + "]", op1index)) if (op2index != -1) : op2[i][3] = float(oper_list.getValue("vector[" + str(i + 1) + "]", op2index)) for j in range(3) : op1[i][j] = float(oper_list.getValue("matrix[" + str(i + 1) + "][" + str(j + 1) + "]", op1index)) if (op2index != -1) : op2[i][j] = float(oper_list.getValue("matrix[" + str(i + 1) + "][" + str(j + 1) + "]", op2index)) # Handles non-Cartesian product expressions if (op2index == -1) : return op1 operation = [[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 1]] # Handles Cartesian product expressions (4x4 matrix multiplication) sum = 0.0 for row in range(4) : for col in range(4) : sum = 0.0 for r in range(4) : sum += (op1[row][r] * op2[r][col]) operation[row][col] = sum return operation # Open the CIF file cif = open(argv[1]) root, ext = splitext(argv[1]) # A list to be propagated with data blocks data = [] # Create a PdbxReader object with the CIF file handle pRd = PdbxReader(cif) # Read the CIF file, propagating the data list pRd.read(data) # Close the CIF file, as it is no longer needed cif.close() # Retrieve the first data block block = data[0] # Retrieve the atom_site category table, which delineates constituent atoms1 atom_site = block.getObj("atom_site") # Make a reference copy of the atom_site category table atom_site_ref = copy.copy(atom_site) # Retrieve the pdbx_struct_assembly_gen category table, which details the generation of each macromolecular assembly2 assembly_gen = block.getObj("pdbx_struct_assembly_gen") # Retrieve the pdbx_struct_oper_list category table, which details translation and rotation # operations required to generate/transform assembly coordinates3 oper_list = block.getObj("pdbx_struct_oper_list") attributes = atom_site_ref.getAttributeList() # Create a CIF file for every assembly specified in pdbx_struct_assembly_gen for index in range(assembly_gen.getRowCount()) : # Create a new atom_site category table for this assembly atom_site = DataCategory("atom_site", attributes) # Lists to hold the individual operations oper = [] oper2 = [] # Keep track of the current atom and model number atomNum = 1 modelNum = 0 # Retrieve the assembly_id attribute value for this assembly assemblyId = assembly_gen.getValue("assembly_id", index) # Retrieve the operation expression for this assembly from the oper_expression attribute oper_expression = assembly_gen.getValue("oper_expression", index) # Count the number of left parentheses in the operation expression parenCount = oper_expression.count("(") # Handles one operation assemblies (e.g., "1") if parenCount == 0 : oper.append(oper_expression) # Handles multiple operation assemblies, no Cartesian products (e.g., "(1-5)") if parenCount == 1 : oper.extend(parseOperationExpression(oper_expression)) # Handles Cartesian product expressions (e.g., "(X0)(1-60)") if parenCount == 2 : # Break the expression into two parenthesized expressions and parse them temp = oper_expression.find(")") oper.extend(parseOperationExpression(oper_expression[0:temp+1])) oper2.extend(parseOperationExpression(oper_expression[temp+1:])) # Retrieve the asym_id_list, which indicates which atoms to apply the operations to asym_id_list = assembly_gen.getValue("asym_id_list", index) temp = (1 > len(oper2)) and 1 or len(oper2) # For every operation in the first parenthesized list for op1 in oper : # Find the index of the current operation in the oper_list category table op1index = 0 for row in range(oper_list.getRowCount()) : if oper_list.getValue("id", row) == op1 : op1index = row break # For every operation in the second parenthesized list (if there is one) for i in range(temp) : # Find the index of the second operation in the oper_list category table op2index = -1 if (oper2) : for row in range(oper_list.getRowCount()) : if oper_list.getValue("id", row) == oper2[i] : op2index = row break # Prepare the operation matrix operation = prepareOperation(oper_list, op1index, op2index) # Iterate over every atom in the atom_site reference table for r in range(atom_site_ref.getRowCount()) : # If the asym_id of the current atom is not in the asym_id list, skip to the next atom if (asym_id_list.find(atom_site_ref.getValue("label_asym_id", r)) == -1) : continue # Retrieve the atom's row from the atom_site reference table atom = atom_site_ref.getFullRow(r) # Add this row to the atom_site table for this assembly for s in range(len(attributes)) : atom_site.setValue(atom[s], attributes[s], atomNum - 1) # Update the atom number and model number for this row atom_site.setValue(str(atomNum), "id", atomNum - 1) atom_site.setValue(str(modelNum), "pdbx_PDB_model_num", atomNum - 1) # Determine and set the new coordinates coords = [float(atom[10]), float(atom[11]), float(atom[12]), 1.0] sum = 0.0 xyz = ['x', 'y', 'z'] for a in range(3) : sum = 0.0 for b in range(4) : sum += (operation[a][b] * coords[b]) atom_site.setValue("%.3f" % sum, "Cartn_" + xyz[a], atomNum - 1) atomNum += 1 modelNum += 1 # Write the CIF out file block.replace(atom_site) out = open(root + "-" + assemblyId + ".cif", "w") pWt = PdbxWriter(out) pWt.writeContainer(block) out.close()