modules/pymol/seqalign.py (134 lines of code) (raw):

''' Sequence alignment stuff. Based on psico.seqalign (c) 2018 Schrodinger, Inc. (c) 2011 Thomas Holder, MPI for Developmental Biology License: BSD-2-Clause ''' from pymol import cmd, CmdException def needle_alignment(s1, s2): ''' DESCRIPTION Does a Needleman-Wunsch Alignment of sequence s1 and s2 and returns a Bio.Align.MultipleSeqAlignment object. ''' from Bio import pairwise2 from Bio.Align import MultipleSeqAlignment from Bio.SubsMat.MatrixInfo import blosum62 def match_callback(c1, c2): return blosum62.get((c1, c2), 1 if c1 == c2 else -4) alns = pairwise2.align.globalcs(s1, s2, match_callback, -10., -.5, one_alignment_only=True) a = MultipleSeqAlignment([]) a.add_sequence("s1", alns[0][0]) a.add_sequence("s2", alns[0][1]) return a def alignment_mapping(seq1, seq2): ''' DESCRIPTION Returns an iterator with seq1 indices mapped to seq2 indices >>> mapping = dict(alignment_mapping(s1, s2)) ''' i, j = -1, -1 for a, b in zip(seq1, seq2): if a != '-': i += 1 if b != '-': j += 1 if a != '-' and b != '-': yield i, j def aln_magic_format(infile): ''' DESCRIPTION Guess alignment file format. ''' with open(infile) as handle: for line in handle: if len(line.rstrip()) > 0: break if line.startswith('CLUSTAL') or line.startswith('MUSCLE'): informat = 'clustal' elif line.startswith('>P1;'): informat = 'pir' elif line.startswith('>'): informat = 'fasta' elif line.startswith('# STOCKHOLM'): informat = 'stockholm' elif line.startswith('Align '): informat = 'fatcat' elif line.startswith('# ProSMART Alignment File'): informat = 'prosmart' else: informat = 'emboss' return informat def aln_magic_read(infile, format=None): ''' DESCRIPTION Wrapper for Bio.AlignIO.read that guesses alignment file format. ''' from Bio import AlignIO if not format: format = aln_magic_format(infile) with open(infile) as handle: return AlignIO.read(handle, format) def load_aln_multi(filename, object=None, mapping='', alignioformat='', guide='', quiet=1, _self=cmd): ''' DESCRIPTION Load a multiple sequence alignment from file and apply it to already loaded structures. Requires Biopython (conda install biopython) USAGE load_aln_multi filename [, object [, mapping [, alignioformat [, guide ]]]] ARGUMENTS filename = str: alignment file object = str: name of the new alignment object {default: filename prefix} mapping = str: space separated list of sequence-id to object-name mappings {default: assume sequence-id = object-name} alignioformat = str: file format, see http://biopython.org/wiki/AlignIO {default: guess from first line in file} guide = str: object name of "guide" object (PyMOL only knows many-to-one alignment, not true multiple alignments) {default: random object} SEE ALSO psico.importing.load_aln ''' import os quiet = int(quiet) if object is None: object = os.path.basename(filename).rsplit('.', 1)[0] # load alignment file alignment = aln_magic_read(cmd.exp_path(filename), alignioformat) # object name mapping if isinstance(mapping, str): mapping = mapping.split() mapping = dict(zip(mapping[0::2], mapping[1::2])) i2index = [None] * len(alignment) onames = [None] * len(alignment) indices = [-1] * len(alignment) # get structure models and sequences for r, record in enumerate(alignment): oname = mapping.get(record.id, record.id) if not oname: continue atoms = [] n = _self.iterate('?{} & guide'.format(oname), 'atoms.append((oneletter or "?", index))', space=locals()) if n == 0: print(" Warning: no atoms for object '{}'".format(oname)) continue # align sequences from file to structures aln = needle_alignment(str(record.seq), ''.join(a[0] for a in atoms)) # get index mappings i2index[r] = dict((i, atoms[j][1]) for (i, j) in alignment_mapping(*aln)) onames[r] = oname if len(onames) < 2: raise CmdException('Failed to map alignment to objects') # build alignment list raw = [] for c in range(len(alignment[0])): rawcol = [] for r, aa in enumerate(alignment[:, c]): if onames[r] is None: continue if aa != '-': indices[r] += 1 index = i2index[r].get(indices[r], -1) if index >= 0: rawcol.append((onames[r], index)) if len(rawcol) > 1: raw.append(rawcol) _self.set_raw_alignment(object, raw, guide=guide) return raw # vi:expandtab:smarttab