Source code for fragmenstein.m_rmsd

from __future__ import annotations
from collections import defaultdict
########################################################################################################################

__doc__ = \
    """
Combined RMSD
    """

__author__ = "Matteo Ferla. [Github](https://github.com/matteoferla)"
__email__ = "matteo.ferla@gmail.com"
__date__ = "2020 A.D."
__license__ = "MIT"
__citation__ = ""

import warnings

from .version import __version__

########################################################################################################################


from rdkit import Chem
from rdkit.Chem import rdFMCS, AllChem
from rdkit.Geometry import Point3D

from typing import Sequence, List, Optional, Tuple

import json, re
import numpy as np

from .monster import Monster
from typing import *

[docs] class mRMSD: """Mols are non-superposed ('aligned') for the RMSD and in Å. The RMSD has been calculated differently. The inbuilt RMSD calculations in RDKit (``Chem.rdMolAlign.GetBestRMS``) align the two molecules, this does not align/superpose them. This deals with the case of multiple hits. As a comparision, For euclidean distance the square root of the sum of the differences in each coordinates is taken. As a comparision, For a regular RMSD the still-squared distance is averaged before taking the root. Here the average is done across all the atom pairs between each hit and the followup. Therefore, atoms in followup that derive in the blended molecule by multiple atom are scored multiple times. .. math:: \sqrt{\frac{\sum_{i}^{N_{\rm{hits}}} (\sum_{i}^{n} (q_{i,\rm{x}} - h_{i,\rm{x}})^2 + \\ (q_{i,\rm{y}} - h_{i,\rm{y}})^2 + (q_{i,\rm{z}} - h_{i,\rm{z}})^2 }{n\cdot m}} """ # faux enum: HIT_BASED = 0 # regularly instantiated XYZ_BASED = 1 # from_internal_xyz IDENTITY_BASED = 2 # identity
[docs] def __init__(self, followup: Chem.Mol, hits: Sequence[Chem.Mol], mappings: List[List[Tuple[int, int]]]): """ This is not meant to be called directly. mappings is a list of len(hits) containing lists of tuples of atom idx that go from followup to hit The hit _Name must match that in origin! currected output of monster.origin_from_mol() or cls.get_origins(to-be-scored-mol, annotated) :param followup: the followup compounds :param hits: the fragment hits :param mappings: a complicated affair... """ self.followup = followup self.hits = hits self.mappings = mappings self.mode = self.HIT_BASED # calculate mds = [] self.rmsds = [] tatoms = 0 for hit, mapping in zip(hits, mappings): md = self.calculate_msd(self.followup, hit, mapping) mds.append(md) tatoms += len(mapping) if len(mapping): self.rmsds.append((md/ len(mapping)) ** 0.5) else: self.rmsds.append(None) if tatoms: self.mrmsd = (sum(mds) / tatoms) ** 0.5 else: self.mrmsd = None
[docs] @staticmethod def generate_overlap_mapping(mol_a: Chem.Mol, mol_b: Chem.Mol) -> List[Tuple[int, int]]: # Monster.get_positional_mapping : Dict[int, int] map_dict : Dict[int, int] = Monster.get_positional_mapping(mol_a, mol_b) return list(map_dict.items())
[docs] @classmethod def from_unannotated_mols(cls, moved_followup: Chem.Mol, hits: Sequence[Chem.Mol], placed_followup: Chem.Mol ) -> mRMSD: """ Mapping is done by positional overlap between placed_followup and hits This mapping is the applied to moved_followup. The mapping is not between placed and moved. But the former acts as a go between. :param moved_followup: The mol to be scored :param hits: the hits to score against :param placed_followup: the mol to determine how to score :return: """ placed2moved : Dict[int, int] = dict(cls.from_unannotated_same_mols(placed_followup=placed_followup, moved_followup=moved_followup ).mappings[0] ) mappings = [] for h, hit in enumerate(hits): placed2hit : List[Tuple[int, int]] = cls.generate_overlap_mapping(placed_followup, hit) mappings.append([(placed2moved[pi], hi) for pi, hi in placed2hit if pi in placed2moved]) return cls(followup=moved_followup, hits=hits, mappings=mappings)
[docs] @classmethod def from_unannotated_same_mols(cls, moved_followup: Chem.Mol, placed_followup: Chem.Mol ) -> mRMSD: """ Mapping is not done by positional overlap between placed_followup and moved_followup But by shape overlap that yields the lowest RMSD This means that every atom matches reguardless of hits. :param moved_followup: The mol to be scored :param placed_followup: the mol to determine how to score :return: """ # placed idx -> moved idx munge_mapping = lambda match: list(zip(range(placed_followup.GetNumAtoms()), match)) candidate_mappings = [munge_mapping(match) for match in moved_followup.GetSubstructMatches(placed_followup)] assert candidate_mappings, 'No matches?!' # mappings is places to moved hence the order candidate_mrmsds = [cls(followup=placed_followup, hits=[moved_followup], mappings=[mapping]) for mapping in candidate_mappings] self = sorted(candidate_mrmsds, key=lambda m: m.mrmsd)[0] # self.hits is placed_followup not hits self.mode = self.IDENTITY_BASED return self
[docs] @classmethod def from_annotated_mols(cls, annotated_followup: Chem.Mol, hits: Optional[Sequence[Chem.Mol]]=None ) -> mRMSD: """ Monster leaves a note of what it did. atom prop _Origin is a json of a list of mol _Name dot AtomIdx. This classmethod accepts a followup with has this. :param annotated_followup: :param hits: :return: """ if cls.is_xyz_annotated(annotated_followup): self = cls.from_internal_xyz(annotated_followup) self.hits = hits return self mappings = cls._mapping_from_annotated_and_hits(annotated_followup, hits) return cls(annotated_followup, hits, mappings)
[docs] @classmethod def is_origin_annotated(cls, mol: Chem.Mol) -> bool: """ This is atom not mol. """ for atom in mol.GetAtoms(): if len(cls._get_origin(atom)) > 0: return True else: return False
[docs] @classmethod def is_xyz_annotated(cls, mol: Chem.Mol) -> bool: for atom in mol.GetAtoms(): if len(cls._get_xyz(atom)) > 0: return True else: return False
@classmethod def _mapping_from_annotated_and_hits(cls, annotated_followup: Chem.Mol, hits: Sequence[Chem.Mol]): assert cls.is_origin_annotated(annotated_followup), 'This molecules is not annotated.' mappings = [] for h, hit in enumerate(hits): hname = hit.GetProp('_Name') mapping = [] if hname == '': print(f'{hit} has no name!') else: for i in range(annotated_followup.GetNumAtoms()): atom = annotated_followup.GetAtomWithIdx(i) for oel in cls._get_origin(atom): rex = re.match(hname + '\.(\d+)', oel) if rex is not None: h = int(rex.group(1)) mapping.append((i, h)) mappings.append(mapping) return mappings
[docs] @classmethod def from_other_annotated_mols(cls, followup: Chem.Mol, hits: Sequence[Chem.Mol], annotated: Chem.Mol, ) -> mRMSD: """ The two molecules are the same (atom names included) but the followup lacks annotations. """ cls.overannotate(annotated, hits=hits, priority=('atom_origin', 'mol_origin', 'xyz')) if all([cls.is_xyz_annotated(annotated), annotated.GetAtomWithIdx(0).GetPDBResidueInfo() is not None, followup.GetAtomWithIdx(0).GetPDBResidueInfo() is not None ]): # match by atom name name2xyz = {} for atom in annotated.GetAtoms(): pi = atom.GetPDBResidueInfo() if pi and atom.HasProp('_x'): name2xyz[pi.GetName()] = {cart: atom.GetDoubleProp("_" + cart) for cart in ('x', 'y', 'z')} for atom in followup.GetAtoms(): pi = atom.GetPDBResidueInfo() if pi is None: continue name = pi.GetName() if pi and name in name2xyz: for cart in ('x', 'y', 'z'): atom.SetDoubleProp('_' + cart, name2xyz[name][cart]) return cls.from_annotated_mols(followup, hits) elif (atom.GetSymbol() for atom in annotated.GetAtoms()) == (atom.GetSymbol() for atom in followup.GetAtoms()): # element sequence matches... but has no atom names for anno_atom, follow_atom in zip(annotated.GetAtoms(), followup.GetAtoms()): if anno_atom.HasProp('_Origin'): follow_atom.SetProp('_Origin', anno_atom.GetProp('_Origin')) if anno_atom.HasProp('_x'): follow_atom.SetProp('_x', anno_atom.GetProp('_x')) follow_atom.SetProp('_y', anno_atom.GetProp('_y')) follow_atom.SetProp('_z', anno_atom.GetProp('_z')) else: # has become nearly redundant with from_unannotated_mols except this expect annotated to have: # from_unannotated_mols # the former way has issues with isomorphisms # cls.copy_origins(annotated, followup) # return cls.from_annotated_mols(followup, hits) posibilities : Tuple[List[Chem.Mol], List[List[int]]]= cls.copy_all_possible_origins(annotated, followup) targets: List[Chem.Mol] = posibilities[0] assert targets, 'Molecule could not be mapped.' results = [(target, cls.from_annotated_mols(target, hits)) for target in targets] return list(sorted(results, key=lambda x: x[1].mrmsd))[0][1]
[docs] def calculate_msd(self, molA, molB, mapping) -> float: """ A nonroot rmsd. :param molA: :param molB: :param mapping: lists of tuples of atom idx that go from molA to molB :return: nonroot rmsd """ confA = molA.GetConformer() confB = molB.GetConformer() return sum([(confA.GetAtomPosition(a).x - confB.GetAtomPosition(b).x) ** 2 + (confA.GetAtomPosition(a).y - confB.GetAtomPosition(b).y) ** 2 + (confA.GetAtomPosition(a).z - confB.GetAtomPosition(b).z) ** 2 for a, b in mapping])
[docs] def calculate_rmsd(self, molA, molB, mapping) -> float: return (self.calculate_msd(molA, molB, mapping) / len(mapping)) ** 0.5
[docs] @classmethod def mock(cls): self = cls.__new__(cls) self.followup = None self.hits = None self.mappings = {} self.mrmsd = float('nan') self.rmsds = [] return self
[docs] @classmethod def copy_origins(cls, annotated: Chem.Mol, target: Chem.Mol): """ > This is no longer used by ``from_other_annotated_mols``. Monster leaves a note of what it did. atom prop _Origin is a json of a list of mol _Name dot AtomIdx. However, the atom order seems to be maintained but I dont trust it. Also dummy atoms are stripped. :param annotated: :param target: :return: a list of origins """ warnings.warn('This is no longer used by ``from_other_annotated_mols``.', DeprecationWarning) mcs = rdFMCS.FindMCS([target, annotated], atomCompare=rdFMCS.AtomCompare.CompareElements, bondCompare=rdFMCS.BondCompare.CompareAny, ringMatchesRingOnly=True) common = Chem.MolFromSmarts(mcs.smartsString) dmapping = dict(zip(target.GetSubstructMatch(common), annotated.GetSubstructMatch(common))) origins = [] cls.overannotate(annotated, [], ('atom_origin', 'mol_origin', 'surrender')) for i in range(target.GetNumAtoms()): if i in dmapping: atom = annotated.GetAtomWithIdx(dmapping[i]) tatom = target.GetAtomWithIdx(i) o = cls._get_origin(atom) tatom.SetProp('_Origin', json.dumps(o)) return origins
[docs] @classmethod def copy_all_possible_origins(cls, annotated: Chem.Mol, target: Chem.Mol) -> Tuple[List[Chem.Mol], List[List[int]]]: """ Monster leaves a note of what it did. atom prop _Origin is a json of a list of mol _Name dot AtomIdx. However, the atom order seems to be maintained but I dont trust it. Also dummy atoms are stripped. :param annotated: :param target: :return: a list of mols and a list of orgins (a list too) """ cls.overannotate(annotated, [], ('atom_origin', 'mol_origin', 'xyz')) mcs = rdFMCS.FindMCS([target, annotated], atomCompare=rdFMCS.AtomCompare.CompareElements, bondCompare=rdFMCS.BondCompare.CompareAny, ringMatchesRingOnly=True) common = Chem.MolFromSmarts(mcs.smartsString) options = [] originss = [] # adding Unique=False will make benzene isomorphism not an issue, but the code will grind to a halt. for target_match in target.GetSubstructMatches(common): for anno_match in annotated.GetSubstructMatches(common): dmapping = dict(zip(target_match, anno_match)) origins = [] option = Chem.Mol(target) for i in range(option.GetNumAtoms()): if i in dmapping: atom = annotated.GetAtomWithIdx(dmapping[i]) tatom = option.GetAtomWithIdx(i) o = cls._get_origin(atom) tatom.SetProp('_Origin', json.dumps(o)) xyz = cls._get_xyz(atom) if xyz: cls._set_xyz(tatom, xyz) options.append(option) originss.append(origins) return options, originss
[docs] @classmethod def overannotate(cls, mol: Chem.Mol, hits: List[Chem.Mol], priority=('mol_origin', 'atom_origin', 'xyz')): """ Unfortunately, in an attempt to make users happy, I have to make the code more complicated. There are three different annotation systems going on. The first is the property '_Origin' in the ``Chem.Mol``. The second is the property '_Origin' in the ``Chem.Atom``. The third is the properties '_x', '_y', '_z' in the ``Chem.Atom``. This method will take the first priority that is available and propagate it to the other two. :param mol: the annotated molecule :param hits: the hits that are used to annotate the molecule :param priority: the priority of the annotation, a sequence of up to four strings: 'mol_origin', 'atom_origin', 'xyz', 'surrender' """ if len(priority) == 0: raise ValueError('molecule is not annotated') task = priority[0] # ----------------------------------------------- if task == 'mol_origin': if not mol.HasProp('_Origin'): return cls.overannotate(mol, hits, tuple(priority[1:])) origins = json.loads(mol.GetProp('_Origin')) assert len(origins) == mol.GetNumAtoms(), f'Mismatch {len(origins)} vs. {mol.GetNumAtoms()}' for i, atom in enumerate(mol.GetAtoms()): # add origin to atom atom.SetProp('_Origin', json.dumps(origins[i])) # add xyz to atom cls._atom_origin_to_xyz(origins[i], atom, hits) return None # ----------------------------------------------- elif task == 'atom_origin': if not cls.is_origin_annotated(mol): return cls.overannotate(mol, hits, tuple(priority[1:])) origins = [cls._get_origin(atom) for atom in mol.GetAtoms()] # add origin to mol mol.SetProp('_Origin', json.dumps(origins)) # add xyz to atom for atom, origin in zip(mol.GetAtoms(), origins): cls._atom_origin_to_xyz(origin, atom, hits) # ----------------------------------------------- elif task == 'xyz': if not cls.is_xyz_annotated(mol): return cls.overannotate(mol, hits, tuple(priority[1:])) # this cannot be consolidated... pass elif task == 'surrender': return None else: raise ValueError(f'Unknown task {task}')
@classmethod def _atom_origin_to_xyz(cls, origins: List[str], atom: Chem.Atom, hits: List[Chem.Mol]): points = [] if len(hits) == 0: return # no hits hitdex: Dict[str, Chem.Mol] = {hit.GetProp('_Name'): hit for hit in hits} for origin in origins: if origin == 'none': # thank you human. continue hit_name, idx = re.match(r'(.*)\.(\d+)', origin).groups() atom.SetProp('_ori_name', str(hit_name)) if hit_name not in hitdex: return ref_conf = hitdex[hit_name].GetConformer() ref_xyz: Point3D = ref_conf.GetAtomPosition(int(idx)) points.append(list(ref_xyz)) # noqa this is okay if points: xyz = np.mean(points, axis=0) cls._set_xyz(atom, xyz)
[docs] @classmethod def migrate_origin(cls, mol: Chem.Mol, tag='_Origin') -> Chem.Mol: """ The origin list may be saved as a molecule property rather than an atom -saved as a mol say. :param mol: mol to fix :param tag: name of prop :return: the same mol """ raise NotImplementedError('This method was deprecated. Use ``overannotate`` instead. Requires HITS!')
@classmethod def _get_origin(cls, atom: Chem.Atom) -> List[str]: if atom.HasProp('_Origin'): o = atom.GetProp('_Origin') if o != 'none': return json.loads(o) else: return [] else: return [] @classmethod def _get_xyz(cls, atom: Chem.Atom) -> Tuple[float,float,float]: if atom.HasProp('_x'): return (atom.GetDoubleProp('_x'), atom.GetDoubleProp('_y'), atom.GetDoubleProp('_z')) else: return () @classmethod def _set_xyz(cls, atom: Chem.Atom, xyz): if len(xyz): atom.SetDoubleProp('_x', float(xyz[0])), atom.SetDoubleProp('_y', float(xyz[1])), atom.SetDoubleProp('_z', float(xyz[2]))
[docs] @classmethod def from_internal_xyz(cls, annotated_followup): """ This is an alternative for when the atoms have _x, _y, _z :param annotated_followup: :return: """ self = cls.__new__(cls) self.followup = annotated_followup self.hits = [] self.mappings = [] self.rmsds = [] self.mrmsd = float('nan') self.mode = self.XYZ_BASED conf = annotated_followup.GetConformer() n = 0 tatoms = 0 ns = defaultdict(int) ts = defaultdict(int) for a, atom in enumerate(annotated_followup.GetAtoms()): if not atom.HasProp('_x'): continue x, y, z = cls._get_xyz(atom) tatoms += 1 d = sum([(conf.GetAtomPosition(a).x - x) ** 2 + (conf.GetAtomPosition(a).y - y) ** 2 + (conf.GetAtomPosition(a).z - z) ** 2]) n += d if atom.HasProp('_Origin'): origins = json.loads(atom.GetProp('_Origin')) for origin in origins: if origin == 'none': continue hit_name, idx = re.match(r'(.*)\.(\d+)', origin).groups() ns[hit_name] += d ts[hit_name] += 1 elif atom.HasProp('_ori_name'): hit_name = atom.GetProp('_ori_name') ns[hit_name] += d ts[hit_name] += 1 self.mrmsd = (n / tatoms) ** 0.5 self.rmsds = [(ns[hit_name] / ts[hit_name]) ** 0.5 for hit_name in ns] return self