modules/chempy/cif.py (569 lines of code) (raw):

#A* ------------------------------------------------------------------- #B* This file contains source code for the PyMOL computer program #C* copyright 1998-2000 by Warren Lyford Delano of DeLano Scientific. #D* ------------------------------------------------------------------- #E* It is unlawful to modify or remove this copyright notice. #F* ------------------------------------------------------------------- #G* Please see the accompanying LICENSE file for further information. #H* ------------------------------------------------------------------- #I* Additional authors of this source file include: #-* #-* #-* #Z* ------------------------------------------------------------------- from __future__ import print_function import re from chempy import Atom, Bond from chempy.models import Indexed # OMG what a horrid file format. # set asym_id as chain AND segi ASYM_ID_AS_SEGI = True # matches any valid CIF token token_re = re.compile(r'(?:\s*(' r"'.*?'(?=\s)|" r'".*?"(?=\s)|' r'^;.*?^;|' r'#.*?$|' r'\S+' r'))', re.I | re.DOTALL | re.MULTILINE) # matches numbers in parentheses floatuncert_sub = re.compile(r'\([0-9]+\)').sub def unquote(s): ''' Remove quotes from a CIF token ''' s0 = s[0] if s0 in ('"', "'"): return s[1:-1] if s0 == ';': return s[1:-2] if s0 == '[': raise ValueError('found reserved "[" character') return s def scifloat(s): ''' Convert string to float. Accepts numbers which match the CIF regex for floats, which may contain an uncertenty in parentheses ''' # strip uncertenty notation from string: 123(4)E02 -> 123E02 return float(floatuncert_sub('', s)) class ciftokeniter(object): ''' Tokenize a CIF string. Skip comments, preserve quotes. Provides sub-iterators for loop keys and data. ''' def __init__(self, starstr): self._iter = token_re.finditer(starstr) self._prev = [] def __iter__(self): return self def next(self): if self._prev: return self._prev.pop() s = next(self._iter).group(1) if s[0] == '#': return next(self) return s __next__ = next def loopdataiter(self): for s in self: if s[0] == '_' or s[:5].lower() in ('loop_', 'data_', 'save_'): self._prev.append(s) break yield s def loopkeysiter(self): for s in self: if s[0] != '_': self._prev.append(s) break yield s def parse_cif(cifstr): ''' Parse a CIF string and return an iterator over CIFData records. ''' current_data = current_block = None token_it = ciftokeniter(cifstr) for s in token_it: s_lower = s.lower() if s[0] == '_': key = s_lower.replace('.', '_') current_block.key_value[key] = next(token_it) elif s_lower == 'loop_': loop = CIFLoop() current_block.loops.append(loop) for i, key in enumerate(token_it.loopkeysiter()): key = key.lower().replace('.', '_') loop.keys[key] = i ncols = len(loop.keys) for i, value in enumerate(token_it.loopdataiter()): if i % ncols == 0: row = [] loop.rows.append(row) row.append(value) elif s_lower[:5] == 'data_': if current_data is not None: yield current_data current_block = current_data = CIFData(s[5:]) elif s_lower[:5] == 'save_': if len(s) > 5: current_block = CIFData(s[5:]) current_data.saveframes.append(current_block) else: current_block = current_data else: raise ValueError(s) if current_data is not None: yield current_data def _row_get(row, i, d, cast): if i < 0: return d v = row[i] if v in ('.', '?'): return d return cast(v) class CIFLoop: ''' CIF loop (table) ''' def __init__(self): self.keys = {} self.rows = [] def get_col_idx_opt(self, *names): ''' Get column index for first found name in names, or -1 ''' for name in names: idx = self.keys.get(name.replace('.', '_')) if idx is not None: return idx return -1 def get_col_idx(self, *names): ''' Get column index for first found name in names, or raise KeyError ''' idx = self.get_col_idx_opt(*names) if idx == -1: raise KeyError return idx class CIFData: ''' CIF data ''' def __init__(self, name): self.name = name self.loops = [] self.key_value = {} self.saveframes = {} def __repr__(self): return '<%s:%s #kv=%d #loops=%d>' % (type(self).__name__, self.name, len(self.key_value), len(self.loops)) def _get(self, key, d, cast): v = self.key_value[key] if v in ('.', '?'): return d return cast(v) def to_float(self, key): return self._get(key, 0.0, scifloat) def to_str(self, key): return self._get(key, '', unquote) def index_to_int(self, index, value): return _row_get(value, index, 0, int) def index_to_float(self, index, value): return _row_get(value, index, 0.0, scifloat) def index_to_str(self, index, value): return _row_get(value, index, '', unquote) class CIFRec(CIFData): ''' CIF record ''' def __init__(self, datablock): self.loops = datablock.loops self.key_value = datablock.key_value self.name = datablock.name # now build the molecule record self.model = Indexed() self.model.molecule.title = datablock.name # coordinates for state 2-N self.extra_coords = [] # by default, indicate that we want PyMOL to automatically # detect bonds based on coordinates self.model.connect_mode = 3 self.read_symmetry() if self.read_atom_site(): self.read_geom_bond() self.read_struct_conn() self.read_ss() elif self.read_chem_comp_atom(): self.read_chem_comp_bond() def read_symmetry(self): try: self.model.cell = [ self.to_float("_cell_length_a"), self.to_float("_cell_length_b"), self.to_float("_cell_length_c"), self.to_float("_cell_angle_alpha"), self.to_float("_cell_angle_beta"), self.to_float("_cell_angle_gamma")] except (KeyError, ValueError): return False try: self.model.spacegroup = self.to_str('_symmetry_space_group_name_h-m') except KeyError: pass return True def read_chem_comp_atom_model_cartn(self,fields,field_dict,values): try: cartn_x = field_dict['_chem_comp_atom_model_cartn_x'] cartn_y = field_dict['_chem_comp_atom_model_cartn_y'] cartn_z = field_dict['_chem_comp_atom_model_cartn_z'] except KeyError: return False name = field_dict.get('_chem_comp_atom_atom_id',None) symbol = field_dict.get('_chem_comp_atom_type_symbol',None) resn = field_dict.get('_chem_comp_atom_comp_id',None) partial_charge = field_dict.get('_chem_comp_atom_partial_charge',None) formal_charge = field_dict.get('chem_comp_atom_charge',None) str_fields = [] if symbol != None: str_fields.append( ('symbol',symbol) ) if name != None: str_fields.append( ('name',name) ) if resn != None: str_fields.append( ('resn',resn) ) float_fields = [] if partial_charge != None: float_fields.append( ('partial_charge',partial_charge) ) int_fields = [] if formal_charge != None: int_fields.append( ('formal_charge',formal_charge) ) for value in values: atom = Atom() atom.coord = [ self.index_to_float(cartn_x,value), self.index_to_float(cartn_y,value), self.index_to_float(cartn_z,value)] self.model.atom.append(atom) for field in str_fields: setattr(atom,field[0],self.index_to_str(field[1],value)) for field in float_fields: setattr(atom,field[0],self.index_to_float(field[1],value)) for field in int_fields: setattr(atom,field[0],self.index_to_int(field[1],value)) return True def read_chem_comp_atom(self): for loop in self.loops: if self.read_chem_comp_atom_model_cartn(None, loop.keys, loop.rows): return True return False def read_atom_site_fract(self,fields,field_dict,values): try: fract_x = field_dict['_atom_site_fract_x'] fract_y = field_dict['_atom_site_fract_y'] fract_z = field_dict['_atom_site_fract_z'] except KeyError: return False self.model.fractional = 1 symbol = field_dict.get('_atom_site_type_symbol',None) name = field_dict.get('_atom_site_label',None) if name is None: name = field_dict.get('_atom_site_id',None) u = field_dict.get('_atom_site_u_iso_or_equiv',None) str_fields = [] if symbol != None: str_fields.append( ('symbol',symbol) ) if name != None: str_fields.append( ('name',name) ) float_fields = [] if u != None: float_fields.append( ('u', u)) int_fields = [] for value in values: atom = Atom() atom.coord = [ self.index_to_float(fract_x,value), self.index_to_float(fract_y,value), self.index_to_float(fract_z,value)] self.model.atom.append(atom) for field in str_fields: setattr(atom,field[0],value[field[1]]) for field in float_fields: setattr(atom,field[0],self.index_to_float(field[1],value)) for field in int_fields: setattr(atom,field[0],self.index_to_int(field[1],value)) return True def read_atom_site_cartn(self,fields,field_dict,values): try: cartn_x = field_dict['_atom_site_cartn_x'] cartn_y = field_dict['_atom_site_cartn_y'] cartn_z = field_dict['_atom_site_cartn_z'] except KeyError as e: return False group_pdb = field_dict.get('_atom_site_group_pdb',None) symbol = field_dict.get('_atom_site_type_symbol',None) name = field_dict.get('_atom_site_label_atom_id',None) resn = field_dict.get('_atom_site_label_comp_id',None) resi = field_dict.get('_atom_site_label_seq_id',None) chain = field_dict.get('_atom_site_label_asym_id',None) ins_code = field_dict.get('_atom_site_pdbx_pdb_ins_code',None) alt = field_dict.get('_atom_site_label_alt_id', None) model_num = field_dict.get('_atom_site_pdbx_pdb_model_num', None) # use auth fields preferentially, if provided auth_resn = field_dict.get('_atom_site_auth_comp_id',None) auth_resi = field_dict.get('_atom_site_auth_seq_id',None) auth_name = field_dict.get('_atom_site_auth_atom_id',None) auth_chain = field_dict.get('_atom_site_auth_asym_id',None) if auth_resn != None: resn = auth_resn if auth_resi != None: resi = auth_resi if auth_name != None: name = auth_name if auth_chain != None: chain = auth_chain b = field_dict.get('_atom_site_b_iso_or_equiv',None) q = field_dict.get('_atom_site_occupancy',None) ID = field_dict.get('_atom_site_id',None) str_fields = [] if symbol != None: str_fields.append( ('symbol',symbol) ) if name != None: str_fields.append( ('name',name) ) if resn != None: str_fields.append( ('resn',resn) ) if resi != None: str_fields.append( ('resi',resi) ) if chain != None: str_fields.append( ('chain',chain) ) if ASYM_ID_AS_SEGI: str_fields.append( ('segi',chain) ) if alt != None: str_fields.append( ('alt',alt) ) if ins_code != None: str_fields.append( ('ins_code',ins_code) ) float_fields = [] if q != None: float_fields.append( ('q',q) ) if b != None: float_fields.append( ('b',b) ) int_fields = [] if ID != None: int_fields.append( ('id',ID) ) first_model_num = self.index_to_int(model_num, values[0]) for value in values: coord = [ self.index_to_float(cartn_x,value), self.index_to_float(cartn_y,value), self.index_to_float(cartn_z,value)] if model_num is not None: v = self.index_to_int(model_num, value) if v != first_model_num: self.extra_coords.extend(coord) continue atom = Atom() atom.coord = coord self.model.atom.append(atom) if group_pdb != None: if value[group_pdb] == 'ATOM': atom.hetatm = 0 else: atom.hetatm = 1 for field in str_fields: setattr(atom,field[0],self.index_to_str(field[1],value)) for field in float_fields: setattr(atom,field[0],self.index_to_float(field[1],value)) for field in int_fields: setattr(atom,field[0],self.index_to_int(field[1],value)) return True def read_atom_site_aniso(self,fields,field_dict,values): try: label = field_dict['_atom_site_aniso_label'] u11 = field_dict['_atom_site_aniso_u_11'] u22 = field_dict['_atom_site_aniso_u_22'] u33 = field_dict['_atom_site_aniso_u_33'] u12 = field_dict['_atom_site_aniso_u_12'] u13 = field_dict['_atom_site_aniso_u_13'] u23 = field_dict['_atom_site_aniso_u_23'] except KeyError: return False cnt = 0 name_dict = {} for atom in self.model.atom: if hasattr(atom,'name'): name_dict[atom.name] = cnt cnt = cnt + 1 for value in values: try: atom = name_dict[self.index_to_str(label,value)] except KeyError: print(" CIF _atom_site_aniso_label, invalid key:", value) continue self.model.atom[atom].u_aniso = [ self.index_to_float(u11,value), self.index_to_float(u22,value), self.index_to_float(u33,value), self.index_to_float(u12,value), self.index_to_float(u13,value), self.index_to_float(u23,value) ] return True def read_atom_site(self): for loop in self.loops: if self.read_atom_site_fract(None, loop.keys, loop.rows) or \ self.read_atom_site_cartn(None, loop.keys, loop.rows): for loop in self.loops: if self.read_atom_site_aniso(None, loop.keys, loop.rows): break return True return False def read_struct_conn_(self, loop): try: type_id = loop.get_col_idx('_struct_conn.conn_type_id') asym_id_1 = loop.get_col_idx('_struct_conn.ptnr1_auth_asym_id', '_struct_conn.ptnr1_label_asym_id') comp_id_1 = loop.get_col_idx('_struct_conn.ptnr1_auth_comp_id', '_struct_conn.ptnr1_label_comp_id') seq_id_1 = loop.get_col_idx('_struct_conn.ptnr1_auth_seq_id', '_struct_conn.ptnr1_label_seq_id') atom_id_1 = loop.get_col_idx('_struct_conn.ptnr1_label_atom_id') asym_id_2 = loop.get_col_idx('_struct_conn.ptnr2_auth_asym_id', '_struct_conn.ptnr2_label_asym_id') comp_id_2 = loop.get_col_idx('_struct_conn.ptnr2_auth_comp_id', '_struct_conn.ptnr2_label_comp_id') seq_id_2 = loop.get_col_idx('_struct_conn.ptnr2_auth_seq_id', '_struct_conn.ptnr2_label_seq_id') atom_id_2 = loop.get_col_idx('_struct_conn.ptnr2_label_atom_id') except KeyError: return False alt_id_1 = loop.get_col_idx_opt('_struct_conn.pdbx_ptnr1_label_alt_id') ins_code_1 = loop.get_col_idx_opt('_struct_conn.pdbx_ptnr1_pdb_ins_code') symm_1 = loop.get_col_idx_opt('_struct_conn.ptnr1_symmetry') alt_id_2 = loop.get_col_idx_opt('_struct_conn.pdbx_ptnr2_label_alt_id') ins_code_2 = loop.get_col_idx_opt('_struct_conn.pdbx_ptnr2_pdb_ins_code') symm_2 = loop.get_col_idx_opt('_struct_conn.ptnr2_symmetry') idxs_1 = [asym_id_1, comp_id_1, seq_id_1, ins_code_1, atom_id_1, alt_id_1] idxs_2 = [asym_id_2, comp_id_2, seq_id_2, ins_code_2, atom_id_2, alt_id_2] # atoms indexed by atomic identifiers atom_dict = dict(((a.chain, a.resn, a.resi, getattr(a, 'ins_code', ''), a.name, a.alt), i) for (i, a) in enumerate(self.model.atom)) for row in loop.rows: if self.index_to_str(type_id, row).lower() != 'covale': # ignore non-covalent bonds (metalc, hydrog) continue if self.index_to_str(symm_1, row) != self.index_to_str(symm_2, row): # don't bond to symmetry mates continue key_1 = tuple(self.index_to_str(i, row) for i in idxs_1) key_2 = tuple(self.index_to_str(i, row) for i in idxs_2) try: index = [atom_dict[key_1], atom_dict[key_2]] except KeyError: print(" CIF _struct_conn, invalid keys:", row) continue bond = Bond() bond.index = index self.model.bond.append(bond) return True def read_struct_conn(self): ''' Create bonds from STRUCT_CONN category ''' for loop in self.loops: if self.read_struct_conn_(loop): return True return False def read_geom_bond_atom_site_labels(self,fields,field_dict,values): try: label_1 = field_dict.get('_geom_bond_atom_site_id_1', None) if label_1 is not None: label_2 = field_dict['_geom_bond_atom_site_id_2'] else: label_1 = field_dict['_geom_bond_atom_site_label_1'] label_2 = field_dict['_geom_bond_atom_site_label_2'] except KeyError: return False symm_1 = field_dict.get('_geom_bond_site_symmetry_1', -1) symm_2 = field_dict.get('_geom_bond_site_symmetry_2', -1) # create index of atom name cnt = 0 name_dict = {} for atom in self.model.atom: if hasattr(atom,'name'): name_dict[atom.name] = cnt cnt = cnt + 1 for value in values: if self.index_to_str(symm_1, value) != self.index_to_str(symm_2, value): # don't bond to symmetry mates continue try: index = [name_dict[self.index_to_str(label_1, value)], name_dict[self.index_to_str(label_2, value)]] except KeyError: print(" CIF _geom_bond_atom_site_label, invalid keys:", value) continue bond = Bond() bond.index = index bond.order = 1 self.model.bond.append(bond) return True def read_geom_bond(self): ''' Create bonds from GEOM_BOND category ''' for loop in self.loops: if self.read_geom_bond_atom_site_labels(None, loop.keys, loop.rows): return True return False def read_chem_comp_bond_atom_ids(self,fields,field_dict,values): try: label_1 = field_dict['_chem_comp_bond_atom_id_1'] label_2 = field_dict['_chem_comp_bond_atom_id_2'] except KeyError: return False order_table = { 'sing' : 1, 'doub' : 2, 'trip' :3, 'delo': 4 } # create index of atom name cnt = 0 name_dict = {} for atom in self.model.atom: if hasattr(atom,'name'): name_dict[atom.name] = cnt cnt = cnt + 1 order = field_dict.get('_chem_comp_bond_value_order',None) for value in values: try: index = [name_dict[self.index_to_str(label_1, value)], name_dict[self.index_to_str(label_2, value)]] except KeyError: print(" CIF _chem_comp_bond_atom_id, invalid keys:", value) continue bond = Bond() bond.index = index if order != None: order_string = self.index_to_str(order,value).lower() bond.order = order_table.get(order_string[0:4],1) else: bond.order = 1 self.model.bond.append(bond) # don't do distance-based bonding self.model.connect_mode = 1 return True def read_chem_comp_bond(self): ''' Create bonds from CHEM_COMP_BOND category. If successfull, don't do any distance-based bonding. ''' for loop in self.loops: if self.read_chem_comp_bond_atom_ids(None, loop.keys, loop.rows): return True return False def read_ss_(self, loop, ss, ssrecords): ''' Populate ssrecords dictionary with secondary structure records from STRUCT_CONF or STRUCT_SHEET_RANGE category. ''' prefix = '_struct_conf' if ss == 'H' else '_struct_sheet_range' try: Beg_NDB_Strand_ID = loop.get_col_idx(prefix + '.beg_auth_asym_id', prefix + '.beg_label_asym_id') Beg_NDB_res_num = loop.get_col_idx(prefix + '.beg_auth_seq_id', prefix + '.beg_label_seq_id') End_NDB_Strand_ID = loop.get_col_idx(prefix + '.end_auth_asym_id', prefix + '.end_label_asym_id') End_NDB_res_num = loop.get_col_idx(prefix + '.end_auth_seq_id', prefix + '.end_label_seq_id') except KeyError: return False Beg_NDB_ins_code = loop.get_col_idx_opt(prefix + '.pdbx_beg_pdb_ins_code') End_NDB_ins_code = loop.get_col_idx_opt(prefix + '.pdbx_end_pdb_ins_code') idxs_1 = [Beg_NDB_Strand_ID, Beg_NDB_res_num, Beg_NDB_ins_code] idxs_2 = [End_NDB_Strand_ID, End_NDB_res_num, End_NDB_ins_code] for row in loop.rows: key = tuple(self.index_to_str(i, row) for i in idxs_1) ssrecords[key] = [ss, tuple(self.index_to_str(i, row) for i in idxs_2)] return True def read_ss(self): ''' Read secondary structre (sheets and helices) from STRUCT_CONF and STRUCT_SHEET_RANGE categories and assign to CA atoms. ''' ssrecords = {} for loop in self.loops: if self.read_ss_(loop, 'H', ssrecords): break for loop in self.loops: if self.read_ss_(loop, 'S', ssrecords): break if not ssrecords: return False atoms = self.model.atom for i, a in enumerate(atoms): if a.name != 'CA': continue try: ss, endkey = ssrecords[a.chain, a.resi, getattr(a, 'ins_code', '')] except KeyError: continue for j in range(i, len(atoms)): aj = atoms[j] if aj.name != 'CA': continue aj.ss = ss if (aj.chain, aj.resi, getattr(aj, 'ins_code', '')) == endkey: break return True class CIF: def __init__(self, fname, mode='r'): if mode not in ('r','pf'): print(" CIF: bad mode") return None if mode=='pf': # pseudofile contents = fname.read() else: try: from pymol.internal import file_read contents = file_read(fname) except ImportError: contents = open(fname, mode).read() self.datablocks_it = parse_cif(contents) def __iter__(self): return self def next(self): rec = CIFRec(next(self.datablocks_it)) if rec.model.atom: return rec return next(self) __next__ = next def read(self): try: return next(self) except StopIteration: return None