Source code for prolif.molecule

"""
Reading proteins and ligands --- :mod:`prolif.molecule`
=======================================================
"""
import copy
from operator import attrgetter
import MDAnalysis as mda
from rdkit import Chem
from rdkit.Chem.AllChem import AssignBondOrdersFromTemplate
from .rdkitmol import BaseRDKitMol
from .residue import Residue, ResidueGroup
from .utils import split_mol_by_residues


[docs]class Molecule(BaseRDKitMol):
    """Main molecule class that behaves like an RDKit :class:`~rdkit.Chem.rdchem.Mol`
    with extra attributes (see examples below). The main purpose of this class
    is to access residues as fragments of the molecule.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        A ligand or protein with a single conformer

    Attributes
    ----------
    residues : prolif.residue.ResidueGroup
        A dictionnary storing one/many :class:`~prolif.residue.Residue` indexed
        by :class:`~prolif.residue.ResidueId`. The residue list is sorted.
    n_residues : int
        Number of residues
    
    Examples
    --------

    .. ipython:: python
        :okwarning:

        import MDAnalysis as mda
        import prolif
        u = mda.Universe(prolif.datafiles.TOP, prolif.datafiles.TRAJ)
        mol = u.select_atoms("protein").convert_to("RDKIT")
        mol = prolif.Molecule(mol)
        mol
    
    You can also create a Molecule directly from a
    :class:`~MDAnalysis.core.universe.Universe`:

    .. ipython:: python
        :okwarning:

        mol = prolif.Molecule.from_mda(u, "protein")
        mol

    
    Notes
    -----
    Residues can be accessed easily in different ways:
        
    .. ipython:: python

        mol["TYR38.A"] # by resid string (residue name + number + chain)
        mol[42] # by index (from 0 to n_residues-1)
        mol[prolif.ResidueId("TYR", 38, "A")] # by ResidueId
    
    See :mod:`prolif.residue` for more information on residues
    """
    def __init__(self, mol):
        super().__init__(mol)
        # set mapping of atoms
        for atom in self.GetAtoms():
            atom.SetUnsignedProp("mapindex", atom.GetIdx())
        # split in residues
        residues = split_mol_by_residues(self)
        residues = [Residue(mol) for mol in residues]
        residues.sort(key=attrgetter("resid"))
        self.residues = ResidueGroup(residues)
    
[docs]    @classmethod
    def from_mda(cls, obj, selection=None, **kwargs):
        """Creates a Molecule from an MDAnalysis object
        
        Parameters
        ----------
        obj : MDAnalysis.core.universe.Universe or MDAnalysis.core.groups.AtomGroup
            The MDAnalysis object to convert
        selection : None or str
            Apply a selection to `obj` to create an AtomGroup. Uses all atoms
            in `obj` if ``selection=None``
        **kwargs : object
            Other arguments passed to the RDKitConverter of MDAnalysis

        Example
        -------
        .. ipython:: python
            :okwarning:

            mol = prolif.Molecule.from_mda(u, "protein")
            mol

        Which is equivalent to:

        .. ipython:: python
            :okwarning:

            protein = u.select_atoms("protein")
            mol = prolif.Molecule.from_mda(protein)
            mol

        """
        ag = obj.select_atoms(selection) if selection else obj.atoms
        if ag.n_atoms == 0:
            raise mda.SelectionError(
                f"AtomGroup is empty, please check your selection")
        mol = ag.convert_to.rdkit(**kwargs)
        return cls(mol)
    
[docs]    @classmethod
    def from_rdkit(cls, mol, resname="UNL", resnumber=1, chain=""):
        """Creates a Molecule from an RDKit molecule
        
        While directly instantiating a molecule with ``prolif.Molecule(mol)``
        would also work, this method insures that every atom is linked to an
        AtomPDBResidueInfo which is required by ProLIF
        
        Parameters
        ----------
        mol : rdkit.Chem.rdchem.Mol
            The input RDKit molecule
        resname : str
            The default residue name that is used if none was found
        resnumber : int
            The default residue number that is used if none was found
        chain : str
            The default chain Id that is used if none was found
        
        Notes
        -----
        This method only checks for an existing AtomPDBResidueInfo in the first
        atom. If none was found, it will patch all atoms with the one created
        from the method's arguments (resname, resnumber, chain).
        """
        if mol.GetAtomWithIdx(0).GetMonomerInfo():
            return cls(mol)
        mol = copy.deepcopy(mol)
        for atom in mol.GetAtoms():
            mi = Chem.AtomPDBResidueInfo(f" {atom.GetSymbol():<3.3}",
                                         residueName=resname,
                                         residueNumber=resnumber,
                                         chainId=chain)
            atom.SetMonomerInfo(mi)
        return cls(mol)

    def __iter__(self):
        for residue in self.residues.values():
            yield residue

    def __getitem__(self, key):
        return self.residues[key]

    def __repr__(self): # pragma: no cover
        name = ".".join([self.__class__.__module__, self.__class__.__name__])
        params = f"{self.n_residues} residues and {self.GetNumAtoms()} atoms"
        return f"<{name} with {params} at {id(self):#x}>"

    @property
    def n_residues(self):
        return len(self.residues)


[docs]def pdbqt_supplier(paths, template, **kwargs):
    """Supplies molecules, given a path to PDBQT files

    Parameters
    ----------
    paths : list
        A list (or any iterable) of PDBQT files
    template : rdkit.Chem.rdchem.Mol
        A template molecule with the correct bond orders and charges. It must
        match exactly the molecule inside the PDBQT file.
    kwargs : object
        Keyword arguments passed to the RDKitConverter of MDAnalysis

    Returns
    -------
    suppl : generator
        A generator object that will provide the :class:`Molecule` object as
        you iterate over it.

    Example
    -------
    The supplier is typically used like this::

        >>> import glob
        >>> pdbqts = glob.glob("docking/ligand1/*.pdbqt")
        >>> lig_suppl = pdbqt_supplier(pdbqts, template)
        >>> for lig in lig_suppl:
        ...     # do something with each ligand

    """
    for pdbqt_path in paths:
        pdbqt = mda.Universe(pdbqt_path)
        # set attributes needed by the converter
        elements = [mda.topology.guessers.guess_atom_element(x)
                    for x in pdbqt.atoms.names]
        pdbqt.add_TopologyAttr("elements", elements)
        pdbqt.add_TopologyAttr("chainIDs", pdbqt.atoms.segids)
        pdbqt.atoms.types = pdbqt.atoms.elements
        # convert without infering bond orders and charges
        kwargs.pop("NoImplicit", None)
        mol = pdbqt.atoms.convert_to.rdkit(NoImplicit=False, **kwargs)
        # assign BO from template then add hydrogens
        mol = Chem.RemoveHs(mol, sanitize=False)
        mol = AssignBondOrdersFromTemplate(template, mol)
        mol = Chem.AddHs(mol, addCoords=True, addResidueInfo=True)
        yield Molecule(mol)


[docs]def sdf_supplier(path, **kwargs):
    """Supplies molecules, given a path to an SDFile
    
    Parameters
    ----------
    path : str
        A path to the .sdf file
    resname : str
        Residue name for every ligand
    resnumber : int
        Residue number for every ligand
    chain : str
        Chain ID for every ligand

    Returns
    -------
    suppl : generator
        A generator object that will provide the :class:`Molecule` object as
        you iterate over it.

    Example
    -------
    The supplier is typically used like this::

        >>> lig_suppl = sdf_supplier("docking/output.sdf")
        >>> for lig in lig_suppl:
        ...     # do something with each ligand

    """
    suppl = Chem.SDMolSupplier(path, removeHs=False)
    for mol in suppl:
        yield Molecule.from_rdkit(mol, **kwargs)


[docs]def mol2_supplier(path, **kwargs):
    """Generates prolif.Molecule objects from a MOL2 file
    
    Parameters
    ----------
    path : str
        A path to the .mol2 file
    resname : str
        Residue name for every ligand
    resnumber : int
        Residue number for every ligand
    chain : str
        Chain ID for every ligand

    Returns
    -------
    suppl : generator
        A generator object that will provide the :class:`Molecule` object as
        you iterate over it.

    Example
    -------
    The supplier is typically used like this::

        >>> lig_suppl = mol2_supplier("docking/output.mol2")
        >>> for lig in lig_suppl:
        ...     # do something with each ligand

    """
    block = []
    with open(path, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            if block and line.startswith("@<TRIPOS>MOLECULE"):
                mol = Chem.MolFromMol2Block("".join(block), removeHs=False)
                yield Molecule.from_rdkit(mol, **kwargs)
                block = []
            block.append(line)
        mol = Chem.MolFromMol2Block("".join(block), removeHs=False)
        yield Molecule.from_rdkit(mol, **kwargs)