Source code for prolif.residue

"""
Residue-related classes --- :mod:`prolif.residue`
=================================================
"""

import re
from collections import UserDict
from collections.abc import Iterable, Sequence
from typing import TYPE_CHECKING, Any, cast

import numpy as np
from rdkit.Chem.rdmolops import FastFindRings

from prolif.rdkitmol import BaseRDKitMol

if TYPE_CHECKING:
    from rdkit import Chem

    from prolif.typeshed import ResidueKey

_RE_RESID = re.compile(
    r"(TIP[234]|T[234]P|H2O|[0-9][A-Z]{2}|[A-Z ]+)?(\d*)\.?([A-Z\d]{1,2})?"
)


[docs]class ResidueId:
    """Residue identifier

    Parameters
    ----------
    name : str or None, default = "UNK"
        Residue name
    number : int or None, default = 0
        Residue number
    chain : str or None, default = None
        Protein chain or segment index

    Notes
    -----
    Whitespaces are stripped from the name and chain.

    .. versionchanged:: 2.1.0
        Whitespaces are now stripped from the name and chain. Better support for water
        and monatomic ion residue names. Ability to use the segment index as chain.
    """

    def __init__(
        self,
        name: str | None = "UNK",
        number: int | None = 0,
        chain: str | None = None,
    ):
        self.name = "UNK" if not name else name.strip()
        self.number = number or 0
        self.chain = None if not chain else chain.strip()

    def __repr__(self) -> str:
        return f"ResidueId({self.name}, {self.number}, {self.chain})"

    def __str__(self) -> str:
        resid = f"{self.name}{self.number}"
        if self.chain:
            return f"{resid}.{self.chain}"
        return resid

    def __hash__(self) -> int:
        return hash((self.name, self.number, self.chain))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ResidueId):
            return NotImplemented
        return (self.name, self.number, self.chain) == (
            other.name,
            other.number,
            other.chain,
        )

    def __lt__(self, other: "ResidueId") -> bool:
        return (_chain_key(self.chain), self.number) < (
            _chain_key(other.chain),
            other.number,
        )

[docs]    @classmethod
    def from_atom(cls, atom: "Chem.Atom", use_segid: bool = False) -> "ResidueId":
        """Creates a ResidueId from an RDKit atom

        Parameters
        ----------
        atom : rdkit.Chem.rdchem.Atom
            An atom that contains an RDKit :class:`~rdkit.Chem.rdchem.AtomMonomerInfo`
        use_segid: bool, default = False
            Use the segment number rather than the chain identifier as a chain
        """
        mi = atom.GetMonomerInfo()
        if mi:
            name = mi.GetResidueName()
            number = mi.GetResidueNumber()
            chain = str(mi.GetSegmentNumber()) if use_segid else mi.GetChainId()
            return cls(name, number, chain)
        return cls()

[docs]    @classmethod
    def from_string(cls, resid_str: str) -> "ResidueId":
        """Creates a ResidueId from a string

        Parameters
        ----------
        resid_str : str
            A string in the format ``<3-letter code><residue number>.<chain>``
            All arguments are optionnal, and the dot should be present only if
            the chain identifier is also present

        Examples
        --------

        +-----------+----------------------------------+
        | string    | Corresponding ResidueId          |
        +===========+==================================+
        | "ALA10.A" | ``ResidueId("ALA", 10, "A")``    |
        +-----------+----------------------------------+
        | "GLU33"   | ``ResidueId("GLU", 33, None)``   |
        +-----------+----------------------------------+
        | "LYS.B"   | ``ResidueId("LYS", 0, "B")``     |
        +-----------+----------------------------------+
        | "ARG"     | ``ResidueId("ARG", 0, None)``    |
        +-----------+----------------------------------+
        | "5.C"     | ``ResidueId("UNK", 5, "C")``     |
        +-----------+----------------------------------+
        | "42"      | ``ResidueId("UNK", 42, None)``   |
        +-----------+----------------------------------+
        | ".D"      | ``ResidueId("UNK", 0, "D")``     |
        +-----------+----------------------------------+
        | ""        | ``ResidueId("UNK", 0, None)``    |
        +-----------+----------------------------------+

        """
        matches = cast(re.Match, _RE_RESID.search(resid_str))
        name, number, chain = matches.groups()
        number = int(number) if number else 0
        return cls(name, number, chain)


def _chain_key(chain: str | None) -> tuple[bool, str | None]:
    """Handles the case where the two chains are of different types"""
    # e.g., None from WAT123, and str from ALA42.A
    return (chain is not None, chain)


[docs]class Residue(BaseRDKitMol):
    """A class for residues as RDKit molecules

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        The residue as an RDKit molecule
    use_segid: bool, default = False
        Use the segment number rather than the chain identifier as a chain

    Attributes
    ----------
    resid : prolif.residue.ResidueId
        The residue identifier

    Notes
    -----
    The name of the residue can be converted to a string by using
    ``str(Residue)``

    .. versionchanged:: 2.1.0
        Added `use_segid`.
    """

    def __init__(self, mol: "Chem.Mol", *, use_segid: bool = False):
        super().__init__(mol)
        FastFindRings(self)
        self.resid = ResidueId.from_atom(self.GetAtomWithIdx(0), use_segid=use_segid)

    def __repr__(self) -> str:  # pragma: no cover
        name = ".".join([self.__class__.__module__, self.__class__.__name__])
        return f"<{name} {self.resid} at {id(self):#x}>"

    def __str__(self) -> str:
        return str(self.resid)


[docs]class ResidueGroup(UserDict[ResidueId, Residue]):
    """A container to store and retrieve Residue instances easily

    Parameters
    ----------
    residues : list
        A list of :class:`~prolif.residue.Residue`

    Attributes
    ----------
    n_residues : int
        Number of residues in the ResidueGroup

    Notes
    -----
    Residues in the group can be accessed by :class:`ResidueId`, string, or
    index. See the :class:`~prolif.molecule.Molecule` class for an example.
    You can also use the :meth:`~prolif.residue.ResidueGroup.select` method to
    access a subset of a ResidueGroup.
    """

    def __init__(self, residues: Iterable[Residue]):
        self._residues = cast(Sequence[Residue], np.asarray(residues, dtype=object))
        resinfo = [
            (r.resid.name, r.resid.number, r.resid.chain) for r in self._residues
        ]
        try:
            name, number, chain = zip(*resinfo, strict=True)
        except ValueError:
            self.name = np.array([], dtype=object)
            self.number = np.array([], dtype=np.uint8)
            self.chain = np.array([], dtype=object)
        else:
            self.name = np.asarray(name, dtype=object)
            self.number = np.asarray(number, dtype=np.uint16)
            self.chain = np.asarray(chain, dtype=object)
        super().__init__([(r.resid, r) for r in self._residues])

    def __getitem__(self, key: "ResidueKey") -> Residue:
        # bool is a subclass of int but shouldn't be used here
        if isinstance(key, bool):
            raise KeyError(
                f"Expected a ResidueId, int, or str, got {type(key).__name__!r}"
                " instead",
            )
        if isinstance(key, int):
            return self._residues[key]
        if isinstance(key, str):
            key = ResidueId.from_string(key)
            return self.data[key]
        if isinstance(key, ResidueId):
            return self.data[key]
        raise KeyError(
            f"Expected a ResidueId, int, or str, got {type(key).__name__!r} instead",
        )

[docs]    def select(self, mask: Any) -> "ResidueGroup":
        """Locate a subset of a ResidueGroup based on a boolean mask

        Parameters
        ----------
        mask : numpy.ndarray
            A 1D array of ``dtype=bool`` with the same length as the number of
            residues in the ResidueGroup. The mask should be constructed by
            using conditions on the "name", "number", and "chain" residue
            attributes as defined in the :class:`~prolif.residue.ResidueId`
            class

        Returns
        -------
        rg : prolif.residue.ResidueGroup
            A subset of the original ResidueGroup

        Examples
        --------
        ::

            >>> rg
            <prolif.residue.ResidueGroup with 200 residues at 0x7f9a68719ac0>
            >>> rg.select(rg.chain == "A")
            <prolif.residue.ResidueGroup with 42 residues at 0x7fe3fdb86ca0>
            >>> rg.select((10 <= rg.number) & (rg.number < 30))
            <prolif.residue.ResidueGroup with 20 residues at 0x7f5f3c69aaf0>
            >>> rg.select((rg.chain == "B") & (np.isin(rg.name, ["ASP", "GLU"])))
            <prolif.residue.ResidueGroup with 3 residues at 0x7f5f3c510c70>

        As seen in these examples, you can combine masks with different
        operators, similarly to numpy boolean indexing or pandas
        :meth:`~pandas.DataFrame.loc` method

            * AND --> ``&``
            * OR --> ``|``
            * XOR --> ``^``
            * NOT --> ``~``

        """
        return ResidueGroup(self._residues[mask])

    def __repr__(self) -> str:  # pragma: no cover
        name = ".".join([self.__class__.__module__, self.__class__.__name__])
        return f"<{name} with {self.n_residues} residues at {id(self):#x}>"

    @property
    def n_residues(self) -> int:
        return len(self)