Source code for prolif.io.protein_helper

"""
Protein helper functions --- :mod:`prolif.io.protein_helper`
============================================================
This module provides helper functions for working with
protein structures (but not limited to protein structures).

Yu-Yuan (Stuart) Yang, 2025
"""

import logging
import warnings
from pathlib import Path

from rdkit import Chem

from prolif.io.constants import (
    AMBER_POOL,
    ATOMNAME_ALIASES,
    CHARMM_POOL,
    FORMAL_CHARGE_ALIASES,
    GROMOS_POOL,
    MAX_AMIDE_LENGTH,
    OPLS_AA_POOL,
    RESNAME_ALIASES,
    STANDARD_AA,
    STANDARD_RESNAME_MAP,
    TERMINAL_OXYGEN_NAMES,
)
from prolif.molecule import Molecule, pdbqt_supplier
from prolif.residue import Residue, ResidueGroup

logger = logging.getLogger(__name__)


[docs]class ProteinHelper:
    """:class:`ProteinHelper` is a class to standardize the residue names and fix the
    bond orders when reading the non-standard residues with RDKit for a molecule.

    .. versionadded:: 2.1.0

    Parameters
    ----------
    templates : list[dict] or dict or None, optional
        The templates to use for standardizing the protein molecule.
        If `None`, the standard amino acid template is used.
        If a dict is provided, it should contain the templates for residues.
        If a list is provided, it should contain dictionaries for each template.
        Default is `None`.

    Attributes
    ----------
    templates : list[dict]
        The templates used for standardizing the protein molecule.

    Note
    ----
    This class is designed to work with ProLIF Molecule instances or PDB files.
    It reads the input topology, standardizes the residue names, and fixes the bond
    orders based on the provided templates or standard amino acid template.

    Example
    -------
    >>> import prolif as plf
    >>> from prolif.io import ProteinHelper
    >>> protein_helper = ProteinHelper(templates=[{"ALA": {"SMILES": "CC(C(=O)O)N"}}])
    >>> mol = protein_helper.standardize_protein(input_topology="path/to/protein.pdb")
    >>> plf.display_residues(mol)

    """

    def __init__(self, templates: list[dict] | dict | None = None):
        # read the templates
        if templates is None:
            self.templates = [STANDARD_AA]
        elif isinstance(templates, dict):
            # if templates is a dict, convert it to a list of dicts
            self.templates = [templates, STANDARD_AA]
        elif isinstance(templates, list):
            # if templates is a list, check if it contains dicts
            if not all(isinstance(t, dict) for t in templates):
                raise TypeError("Templates must be a dict, a list of dicts or None.")
            self.templates = [*templates, STANDARD_AA]
        else:
            raise TypeError("Templates must be a dict, a list of dicts or None.")

        # check the templates with "name" for each residue
        for template in self.templates:
            for t in template:
                # if not, set the residue name as the template name
                if "name" not in template[t]:
                    template[t]["name"] = t  # use the residue name as the template name

                elif template[t]["name"] != t:
                    warnings.warn(
                        f"Align the template name ({template[t]['name']}) with ({t}).",
                        stacklevel=2,
                    )
                    template[t]["name"] = t

        # get a dict of the number of the heavy atoms in the template residues
        self.n_template_res_hatms = self.n_template_residue_heavy_atoms(self.templates)

[docs]    def standardize_protein(self, input_topology: Molecule | str | Path) -> Molecule:
        """Standardize the protein molecule.

        This function will standardize the residue names, fix the bond orders,
        and check the residue names against the templates.

        Parameters
        ----------
        input_topology : Molecule | str | Path
            The input topology to standardize.
            It can be a ProLIF Molecule or a path to a PDB file.

        Returns
        -------
        Molecule
            The standardized protein molecule.

        Example
        -------
        >>> from prolif.io import ProteinHelper
        >>> protein_helper = ProteinHelper(
                templates=[{"ALA": {"SMILES": "CC(C(=O)O)N"}}]
            )
        >>> mol = protein_helper.standardize_protein(
               "path/to/protein.pdb"
            )

        .. important::
            If your input for `standardize_protein` is a :class:`prolif.Molecule`, it
            will modify your original molecule in place. Your residue names will be
            updated to the standardized names and residue's bond orders will be fixed
            to the corresponding protonated states.

        """

        # read as prolif molecule
        if isinstance(input_topology, Molecule):
            protein_mol = input_topology

        elif isinstance(input_topology, str | Path) and str(input_topology).endswith(
            ".pdb"
        ):
            input_protein_top = Chem.MolFromPDBFile(str(input_topology), removeHs=False)
            protein_mol = Molecule.from_rdkit(input_protein_top)

        else:
            raise TypeError(
                "input_topology must be a string (path to a PDB file) or "
                "a prolif Molecule instance."
            )

        # guess forcefield
        conv_resnames = set(protein_mol.residues.name)
        forcefield_name = self.forcefield_guesser(conv_resnames)

        # standardize the protein molecule
        new_residues = []
        for residue in protein_mol.residues.values():
            standardized_resname = self.convert_to_standard_resname(
                resname=residue.resid.name.upper(), forcefield_name=forcefield_name
            )

            # set the standard residue name
            for atom in residue.GetAtoms():
                # set new residue name for each atom (at residue level)
                atom.GetPDBResidueInfo().SetResidueName(standardized_resname)
                # set the new residue name for each atom at the molecule level
                protein_mol.GetAtomWithIdx(
                    atom.GetUnsignedProp("mapindex")
                ).GetPDBResidueInfo().SetResidueName(standardized_resname)
            # update the residue name in the Residue object
            residue.resid.name = standardized_resname

            # before fixing the bond orders: strict check with non-standard residues
            self.check_resnames({standardized_resname}, self.templates)

            # soft check the heavy atoms in the residue compared to the standard one
            if self.n_residue_heavy_atoms(residue) != self.n_template_res_hatms.get(
                standardized_resname, 0
            ):
                warnings.warn(
                    f"Residue {residue.resid} has a different number of "
                    "heavy atoms than the standard residue. "
                    "This may affect H-bond detection.",
                    stacklevel=2,
                )
            # fix the bond orders
            new_residues.append(self.fix_molecule_bond_orders(residue, self.templates))

        # update the protein molecule with the new residues
        protein_mol.residues = ResidueGroup(new_residues)

        return protein_mol

[docs]    @staticmethod
    def forcefield_guesser(
        conv_resnames: set[str],
    ) -> str:
        """Guess the forcefield based on the residue names.

        Parameters
        ----------
        conv_resnames : set[str]
            Set of residue names in the protein molecule.

        Returns
        -------
        str
            The guessed forcefield name.
        """
        if AMBER_POOL.intersection(conv_resnames):
            return "amber"
        if CHARMM_POOL.intersection(conv_resnames):
            return "charmm"
        if GROMOS_POOL.intersection(conv_resnames):
            return "gromos"
        if OPLS_AA_POOL.intersection(conv_resnames):
            return "oplsaa"

        return "unknown"

[docs]    @staticmethod
    def convert_to_standard_resname(
        resname: str, forcefield_name: str = "unknown"
    ) -> str:
        """Convert a residue name to its standard form based on the forcefield.

        Parameters
        ----------
        resname : str
            The residue name to convert.
        forcefield_name : str
            The name of the forcefield for assigning the correct standard name for CYS.
            Default is "unknown".

        Returns
        -------
        str
            The standard residue name.

        Note
        ----
        This conversion is designed to distinguish residues with
        different possible H-bond donors at side chains, instead of the
        actual protonated states of residues.

        For example, neutral and protonated arginine are both assigned to ARG,
        while neutral and deprotonated arginine in GROMOS force field
        are assigned to ARGN and ARG, respectively.
        """

        if forcefield_name == "unknown" and resname == "CYS":
            warnings.warn(
                "Could not guess the forcefield based on the residue names. "
                "CYS is assigned to neutral CYS (charge = 0).",
                stacklevel=2,
            )

        elif forcefield_name == "gromos" and resname == "CYS":
            return "CYX"

        return STANDARD_RESNAME_MAP.get(resname, resname)

[docs]    @staticmethod
    def check_resnames(
        resnames_to_check: set[str], templates: list[dict] | None = None
    ) -> None:
        """Check if the residue names are standard or in templates and
        raise a warning if not.

        Parameters
        ----------
        resnames_to_check : set[str]
            Set of residue names to check.
        templates : list[dict] or None, optional
            The templates to use for checking the residue names.
            If `None`, the standard amino acid template is used.
            Default is `None`.

        Raises
        ------
        ValueError
            If any residue name is not standard or not in the templates.

        """

        if templates is None:
            templates = [STANDARD_AA]

        all_available_resnames = set().union(*list(templates))

        non_standard_resnames = resnames_to_check - all_available_resnames
        if non_standard_resnames:
            raise ValueError(
                f"Residue {non_standard_resnames} is not a standard residue or "
                "not in the templates. Please provide a custom template."
            )

[docs]    @staticmethod
    def n_residue_heavy_atoms(residue: Residue) -> int:
        """Count the number of heavy atoms in a residue.

        Parameters
        ----------
        residue : Residue
            The residue to count the heavy atoms in.

        Returns
        -------
        int
            The number of heavy atoms in the residue.
        """

        return len(
            [
                atom
                for atom in residue.GetAtoms()
                if atom.GetAtomicNum() != 1
                and atom.GetPDBResidueInfo().GetName().strip()
                not in TERMINAL_OXYGEN_NAMES
            ]
        )

[docs]    @staticmethod
    def n_template_residue_heavy_atoms(
        templates: list[dict] | None = None,
    ) -> dict[str, int]:
        """Count the number of heavy atoms in a residue based on the templates.

        Parameters
        ----------
        templates : list or None, optional
            The templates to use for counting the heavy atoms.
            If `None`, the standard amino acid template is used.
            Default is `None`.

        Returns
        -------
        dict[str, int]
            The dictionary with residue names as keys and
            the number of heavy atoms as values.
        """
        # Check if templates are provided,
        # otherwise use the standard amino acid template
        if templates is None:
            templates = [STANDARD_AA]

        n_template_residue_heavy_atoms = {}
        for t in templates:
            for resname in t:
                if resname in n_template_residue_heavy_atoms:
                    continue

                # SMILES template
                if "SMILES" in t[resname]:
                    t_mol = Chem.MolFromSmiles(t[resname]["SMILES"])
                    n_template_residue_heavy_atoms[resname] = len(
                        [at for at in t_mol.GetAtoms() if at.GetAtomicNum() != 1]
                    )
                    continue

                # CIF template
                residue_atom_df = t[resname]["_chem_comp_atom"]
                residue_atom_df = residue_atom_df[
                    residue_atom_df["alt_atom_id"] != "OXT"
                ]

                n_template_residue_heavy_atoms[resname] = sum(
                    residue_atom_df["type_symbol"] != "H"
                )

        return n_template_residue_heavy_atoms

[docs]    @staticmethod
    def fix_molecule_bond_orders(
        residue: Residue, templates: list[dict] | None = None
    ) -> Residue:
        r"""Fix the bond orders of a molecule.

        Parameters
        ----------
        residue : Residue
            The residue to fix the bond orders for.
        templates : list[dict] or None, optional
            The templates to use for fixing the bond orders.
            If `None`, the standard amino acid template is used.
            Default is `None`. If the residue is not a standard amino acid,
            the user should provide a custom template.

            Also, note that the order of templates is relevant,
            as the first template that matches the residue name will be used.

        Note
        ----
        1\. If the user provides a SMILES template, it will be converted to an RDKit
        molecule, and the bond orders will be assigned from the template.

        2\. SMILES templates are prioritized over CIF templates.

        3\. For CIF templates, any bonds and chiral designation on the input molecule
        will be removed at the start of the process. This function is adapted from the
        `pdbinf/_pdbinf.py` module's `assign_pdb_bonds` function, which is used to
        assign bonds and aromaticity based on the standard amino acid templates.

        Source: https://github.com/OpenFreeEnergy/pdbinf/blob/c0ddf00bd068d7860b2e99b9f03847c890e3efb5/src/pdbinf/_pdbinf.py#L482
        """
        if templates is None:
            templates = [STANDARD_AA]
        resname = residue.resid.name.upper()

        # read the templates and assign correct bond orders
        for t in templates:
            # check if resname in template doc
            if resname not in t:
                continue

            # SMILES template
            if "SMILES" in t[resname]:
                new_res = assign_bond_orders_from_smiles(
                    template_smiles=t[resname]["SMILES"], mol=residue
                )
                break

            # cif template
            new_res = strip_bonds(residue)  # strip bonds and chiral tags
            new_res = assign_intra_props(new_res, t[resname])
            break  # avoid double assignment, ordering of templates is relevant

        else:
            raise ValueError(f"Failed to find template for residue: '{resname}'")

        return Residue(new_res)


[docs]def assign_bond_orders_from_smiles(
    template_smiles: str, mol: Residue | Chem.Mol
) -> Chem.Mol:
    """Assign bond orders from a SMILES template to a residue.

    Parameters
    ----------
    template_smiles : str
        The SMILES string of the template to assign bond orders from.
    residue : Residue | Chem.Mol
        The residue or molecule to assign bond orders to.

    Returns
    -------
    Chem.Mol
        The residue or molecule with assigned bond orders from the template.
    """
    for atm in mol.GetAtoms():
        # Set the necessary property for _adjust_hydrogens function (index of the atom)
        atm.SetIntProp("_MDAnalysis_index", atm.GetIdx())

    # Call template from SMILES
    mol_template = Chem.MolFromSmiles(template_smiles)

    # use the available function to assign bond orders (adjust hydrogens)
    return pdbqt_supplier._adjust_hydrogens(mol_template, mol)


# The below code contains functions for fixing molecule bond orders.
# The code is adapted from pdbinf: https://github.com/OpenFreeEnergy/pdbinf/tree/main
# Accessed on: 16 June, 2025 under MIT License.
[docs]def strip_bonds(m: Chem.Mol) -> Chem.Mol:
    """Strip all bonds and chiral tags from a molecule.

    Parameters
    ----------
    m : rdkit.Chem.Mol
        The input molecule to strip bonds from.

    Returns
    -------
    rdkit.Chem.Mol
        The modified molecule with all bonds and chiral tags removed.

    Note
    ----
    This function is adapted from the `pdbinf/_pdbinf.py` module's `strip_bonds`
    function.

    Source: https://github.com/OpenFreeEnergy/pdbinf/blob/c0ddf00bd068d7860b2e99b9f03847c890e3efb5/src/pdbinf/_pdbinf.py#L71
    """
    with Chem.RWMol(m) as em:
        for b in m.GetBonds():
            em.RemoveBond(b.GetBeginAtomIdx(), b.GetEndAtomIdx())
        # rdkit, perhaps rightfully, gets upset at chiral tags w/ no bonds

        for at in em.GetAtoms():
            at.SetChiralTag(Chem.CHI_UNSPECIFIED)

    return em.GetMol()


[docs]def assign_intra_props(mol: Chem.Mol, reference_block: dict) -> Chem.Mol:
    """Assign bonds and aromaticity based on residue and atom names
    from a reference block (cif template molecule).

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        The input molecule, this is modified in place and returned.
    reference_block : dict
        The cif template molecule from which to assign bonds.

    Returns
    -------
    rdkit.Chem.Mol
        The modified molecule with assigned bonds and aromaticity.

    Note
    ----
    This function is adapted from the `pdbinf/_pdbinf.py` module's `assign_intra_props`
    function, which is used to assign bonds and aromaticity based on
    the standard amino acid templates.

    Source: https://github.com/OpenFreeEnergy/pdbinf/blob/c0ddf00bd068d7860b2e99b9f03847c890e3efb5/src/pdbinf/_pdbinf.py#L117
    """
    nm_2_idx = {}
    fc_special_assign_nm_idx_fc_pair = {}

    # convert indices to names
    res_alias = RESNAME_ALIASES.get(reference_block["name"], reference_block["name"])
    aliases = ATOMNAME_ALIASES.get(res_alias, {})
    for atom in mol.GetAtoms():
        nm = atom.GetMonomerInfo().GetName().strip()
        nm = aliases.get(nm, nm)
        if (
            reference_block["name"] in FORMAL_CHARGE_ALIASES
            and nm in FORMAL_CHARGE_ALIASES[reference_block["name"]]
        ):
            fc_special_assign_nm_idx_fc_pair[atom.GetIdx()] = FORMAL_CHARGE_ALIASES[
                reference_block["name"]
            ][nm]
        nm_2_idx[nm] = atom.GetIdx()

    logger.debug(f"assigning intra props for {reference_block['name']}")

    with Chem.RWMol(mol) as em:
        # grab bond data from cif Block
        for _, row in reference_block["_chem_comp_bond"].iterrows():
            nm1, nm2 = row["atom_id_1"], row["atom_id_2"]
            order, arom = row["value_order"], row["pdbx_aromatic_flag"]
            try:
                idx1, idx2 = nm_2_idx[nm1], nm_2_idx[nm2]
            except KeyError:
                continue

            # [different from pdbinf] we priorituze SINGLE and DOUBLE bonds
            if order == "SING":
                order = Chem.BondType.SINGLE
            elif order == "DOUB":
                order = Chem.BondType.DOUBLE
            elif arom == "Y":
                order = Chem.BondType.AROMATIC
            else:
                order = Chem.BondType.UNSPECIFIED

            logger.debug(f"adding bond: {nm1}-{nm2} at {idx1} {idx2} {order}")

            em.AddBond(idx1, idx2, order=order)

    # find lone hydrogens, then attach to the nearest heavy atom
    em = _assign_intra_props_lone_H(em)

    # assign aromaticity for atoms based on the template
    for _, row in reference_block["_chem_comp_atom"].iterrows():
        nm, arom = row["atom_id"].strip(), row["pdbx_aromatic_flag"].strip()
        try:
            idx = nm_2_idx[nm]
        except KeyError:
            # todo: could check atom is marked as leaving atom
            continue

        atom = em.GetAtomWithIdx(idx)
        atom.SetIsAromatic(arom == "Y")

    # [different from pdbinf] assign formal charge for a specific atom
    if fc_special_assign_nm_idx_fc_pair != {}:
        for (
            fc_special_assign_nm_idx,
            fc_to_assign,
        ) in fc_special_assign_nm_idx_fc_pair.items():
            atom = em.GetAtomWithIdx(fc_special_assign_nm_idx)
            atom.SetFormalCharge(fc_to_assign)
            logger.debug(
                f"Assigned {reference_block['name']}'s formal charge {fc_to_assign} "
                f"on {atom.GetPDBResidueInfo().GetName()}."
            )

    # [different from pdbinf] sanitize the molecule
    mol = em.GetMol()
    mol.UpdatePropertyCache()
    Chem.SanitizeMol(mol)

    return mol


def _assign_intra_props_lone_H(em: Chem.RWMol) -> Chem.RWMol:
    """Assign lone hydrogens to the nearest heavy atom in the molecule.
    This is a part function for assign_intra_props.

    Parameters
    ----------
    em : rdkit.Chem.RWMol
        The input editable molecule to assign lone hydrogens to the nearest heavy atom.

    Returns
    -------
    rdkit.Chem.RWMol
        The modified molecule with lone hydrogens assigned to the nearest heavy atom.

    Note
    ----
    This function is adapted from the `pdbinf/_pdbinf.py` module's
    `assign_intra_props` function.

    Source: https://github.com/OpenFreeEnergy/pdbinf/blob/c0ddf00bd068d7860b2e99b9f03847c890e3efb5/src/pdbinf/_pdbinf.py#L167
    """
    additional_bonds = []
    heavy_atoms = []
    lone_H = []
    for atom in em.GetAtoms():
        if atom.GetAtomicNum() != 1:
            heavy_atoms.append(atom.GetIdx())
            continue
        if atom.GetBonds():  # if any bonds, we're ok
            continue
        lone_H.append(atom.GetIdx())
    if lone_H:
        logger.debug(f"found lone hydrogens: {lone_H}")
        conf = em.GetConformer()
        for idx in lone_H:
            pos = conf.GetAtomPosition(idx)
            minidx, mindist = -1, float("inf")
            for idx2 in heavy_atoms:
                pos2 = conf.GetAtomPosition(idx2)
                d = pos.Distance(pos2)
                if d > mindist:
                    continue
                minidx, mindist = idx2, d

            if mindist < MAX_AMIDE_LENGTH:
                logger.debug(
                    f"attached hydrogen {idx} to heavy atom {minidx} d={mindist}"
                )
                additional_bonds.append((idx, minidx))
    if additional_bonds:
        for i, j in additional_bonds:
            em.AddBond(i, j, order=Chem.BondType.SINGLE)

    return em