Source code for prolif.io.protein_helper

"""
Protein helper functions --- :mod:`prolif.io.protein_helper`
============================================================
This module provides helper functions for working with
protein structures (but not limited to protein structures).

Yu-Yuan (Stuart) Yang, 2025
"""

import logging
import warnings
from pathlib import Path

from rdkit import Chem

from prolif.io.constants import (
    AMBER_POOL,
    ATOMNAME_ALIASES,
    CHARMM_POOL,
    FORMAL_CHARGE_ALIASES,
    GROMOS_POOL,
    MAX_AMIDE_LENGTH,
    OPLS_AA_POOL,
    RESNAME_ALIASES,
    STANDARD_AA,
    STANDARD_RESNAME_MAP,
    TERMINAL_OXYGEN_NAMES,
)
from prolif.molecule import Molecule, pdbqt_supplier
from prolif.residue import Residue, ResidueGroup

logger = logging.getLogger(__name__)


[docs]class ProteinHelper: """:class:`ProteinHelper` is a class to standardize the residue names and fix the bond orders when reading the non-standard residues with RDKit for a molecule. .. versionadded:: 2.1.0 Parameters ---------- templates : list[dict] or dict or None, optional The templates to use for standardizing the protein molecule. If `None`, the standard amino acid template is used. If a dict is provided, it should contain the templates for residues. If a list is provided, it should contain dictionaries for each template. Default is `None`. Attributes ---------- templates : list[dict] The templates used for standardizing the protein molecule. Note ---- This class is designed to work with ProLIF Molecule instances or PDB files. It reads the input topology, standardizes the residue names, and fixes the bond orders based on the provided templates or standard amino acid template. Example ------- >>> import prolif as plf >>> from prolif.io import ProteinHelper >>> protein_helper = ProteinHelper(templates=[{"ALA": {"SMILES": "CC(C(=O)O)N"}}]) >>> mol = protein_helper.standardize_protein(input_topology="path/to/protein.pdb") >>> plf.display_residues(mol) """ def __init__(self, templates: list[dict] | dict | None = None): # read the templates if templates is None: self.templates = [STANDARD_AA] elif isinstance(templates, dict): # if templates is a dict, convert it to a list of dicts self.templates = [templates, STANDARD_AA] elif isinstance(templates, list): # if templates is a list, check if it contains dicts if not all(isinstance(t, dict) for t in templates): raise TypeError("Templates must be a dict, a list of dicts or None.") self.templates = [*templates, STANDARD_AA] else: raise TypeError("Templates must be a dict, a list of dicts or None.") # check the templates with "name" for each residue for template in self.templates: for t in template: # if not, set the residue name as the template name if "name" not in template[t]: template[t]["name"] = t # use the residue name as the template name elif template[t]["name"] != t: warnings.warn( f"Align the template name ({template[t]['name']}) with ({t}).", stacklevel=2, ) template[t]["name"] = t # get a dict of the number of the heavy atoms in the template residues self.n_template_res_hatms = self.n_template_residue_heavy_atoms(self.templates)
[docs] def standardize_protein(self, input_topology: Molecule | str | Path) -> Molecule: """Standardize the protein molecule. This function will standardize the residue names, fix the bond orders, and check the residue names against the templates. Parameters ---------- input_topology : Molecule | str | Path The input topology to standardize. It can be a ProLIF Molecule or a path to a PDB file. Returns ------- Molecule The standardized protein molecule. Example ------- >>> from prolif.io import ProteinHelper >>> protein_helper = ProteinHelper( templates=[{"ALA": {"SMILES": "CC(C(=O)O)N"}}] ) >>> mol = protein_helper.standardize_protein( "path/to/protein.pdb" ) .. important:: If your input for `standardize_protein` is a :class:`prolif.Molecule`, it will modify your original molecule in place. Your residue names will be updated to the standardized names and residue's bond orders will be fixed to the corresponding protonated states. """ # read as prolif molecule if isinstance(input_topology, Molecule): protein_mol = input_topology elif isinstance(input_topology, str | Path) and str(input_topology).endswith( ".pdb" ): input_protein_top = Chem.MolFromPDBFile(str(input_topology), removeHs=False) protein_mol = Molecule.from_rdkit(input_protein_top) else: raise TypeError( "input_topology must be a string (path to a PDB file) or " "a prolif Molecule instance." ) # guess forcefield conv_resnames = set(protein_mol.residues.name) forcefield_name = self.forcefield_guesser(conv_resnames) # standardize the protein molecule new_residues = [] for residue in protein_mol.residues.values(): standardized_resname = self.convert_to_standard_resname( resname=residue.resid.name.upper(), forcefield_name=forcefield_name ) # set the standard residue name for atom in residue.GetAtoms(): # set new residue name for each atom (at residue level) atom.GetPDBResidueInfo().SetResidueName(standardized_resname) # set the new residue name for each atom at the molecule level protein_mol.GetAtomWithIdx( atom.GetUnsignedProp("mapindex") ).GetPDBResidueInfo().SetResidueName(standardized_resname) # update the residue name in the Residue object residue.resid.name = standardized_resname # before fixing the bond orders: strict check with non-standard residues self.check_resnames({standardized_resname}, self.templates) # soft check the heavy atoms in the residue compared to the standard one if self.n_residue_heavy_atoms(residue) != self.n_template_res_hatms.get( standardized_resname, 0 ): warnings.warn( f"Residue {residue.resid} has a different number of " "heavy atoms than the standard residue. " "This may affect H-bond detection.", stacklevel=2, ) # fix the bond orders new_residues.append(self.fix_molecule_bond_orders(residue, self.templates)) # update the protein molecule with the new residues protein_mol.residues = ResidueGroup(new_residues) return protein_mol
[docs] @staticmethod def forcefield_guesser( conv_resnames: set[str], ) -> str: """Guess the forcefield based on the residue names. Parameters ---------- conv_resnames : set[str] Set of residue names in the protein molecule. Returns ------- str The guessed forcefield name. """ if AMBER_POOL.intersection(conv_resnames): return "amber" if CHARMM_POOL.intersection(conv_resnames): return "charmm" if GROMOS_POOL.intersection(conv_resnames): return "gromos" if OPLS_AA_POOL.intersection(conv_resnames): return "oplsaa" return "unknown"
[docs] @staticmethod def convert_to_standard_resname( resname: str, forcefield_name: str = "unknown" ) -> str: """Convert a residue name to its standard form based on the forcefield. Parameters ---------- resname : str The residue name to convert. forcefield_name : str The name of the forcefield for assigning the correct standard name for CYS. Default is "unknown". Returns ------- str The standard residue name. Note ---- This conversion is designed to distinguish residues with different possible H-bond donors at side chains, instead of the actual protonated states of residues. For example, neutral and protonated arginine are both assigned to ARG, while neutral and deprotonated arginine in GROMOS force field are assigned to ARGN and ARG, respectively. """ if forcefield_name == "unknown" and resname == "CYS": warnings.warn( "Could not guess the forcefield based on the residue names. " "CYS is assigned to neutral CYS (charge = 0).", stacklevel=2, ) elif forcefield_name == "gromos" and resname == "CYS": return "CYX" return STANDARD_RESNAME_MAP.get(resname, resname)
[docs] @staticmethod def check_resnames( resnames_to_check: set[str], templates: list[dict] | None = None ) -> None: """Check if the residue names are standard or in templates and raise a warning if not. Parameters ---------- resnames_to_check : set[str] Set of residue names to check. templates : list[dict] or None, optional The templates to use for checking the residue names. If `None`, the standard amino acid template is used. Default is `None`. Raises ------ ValueError If any residue name is not standard or not in the templates. """ if templates is None: templates = [STANDARD_AA] all_available_resnames = set().union(*list(templates)) non_standard_resnames = resnames_to_check - all_available_resnames if non_standard_resnames: raise ValueError( f"Residue {non_standard_resnames} is not a standard residue or " "not in the templates. Please provide a custom template." )
[docs] @staticmethod def n_residue_heavy_atoms(residue: Residue) -> int: """Count the number of heavy atoms in a residue. Parameters ---------- residue : Residue The residue to count the heavy atoms in. Returns ------- int The number of heavy atoms in the residue. """ return len( [ atom for atom in residue.GetAtoms() if atom.GetAtomicNum() != 1 and atom.GetPDBResidueInfo().GetName().strip() not in TERMINAL_OXYGEN_NAMES ] )
[docs] @staticmethod def n_template_residue_heavy_atoms( templates: list[dict] | None = None, ) -> dict[str, int]: """Count the number of heavy atoms in a residue based on the templates. Parameters ---------- templates : list or None, optional The templates to use for counting the heavy atoms. If `None`, the standard amino acid template is used. Default is `None`. Returns ------- dict[str, int] The dictionary with residue names as keys and the number of heavy atoms as values. """ # Check if templates are provided, # otherwise use the standard amino acid template if templates is None: templates = [STANDARD_AA] n_template_residue_heavy_atoms = {} for t in templates: for resname in t: if resname in n_template_residue_heavy_atoms: continue # SMILES template if "SMILES" in t[resname]: t_mol = Chem.MolFromSmiles(t[resname]["SMILES"]) n_template_residue_heavy_atoms[resname] = len( [at for at in t_mol.GetAtoms() if at.GetAtomicNum() != 1] ) continue # CIF template residue_atom_df = t[resname]["_chem_comp_atom"] residue_atom_df = residue_atom_df[ residue_atom_df["alt_atom_id"] != "OXT" ] n_template_residue_heavy_atoms[resname] = sum( residue_atom_df["type_symbol"] != "H" ) return n_template_residue_heavy_atoms
[docs] @staticmethod def fix_molecule_bond_orders( residue: Residue, templates: list[dict] | None = None ) -> Residue: r"""Fix the bond orders of a molecule. Parameters ---------- residue : Residue The residue to fix the bond orders for. templates : list[dict] or None, optional The templates to use for fixing the bond orders. If `None`, the standard amino acid template is used. Default is `None`. If the residue is not a standard amino acid, the user should provide a custom template. Also, note that the order of templates is relevant, as the first template that matches the residue name will be used. Note ---- 1\. If the user provides a SMILES template, it will be converted to an RDKit molecule, and the bond orders will be assigned from the template. 2\. SMILES templates are prioritized over CIF templates. 3\. For CIF templates, any bonds and chiral designation on the input molecule will be removed at the start of the process. This function is adapted from the `pdbinf/_pdbinf.py` module's `assign_pdb_bonds` function, which is used to assign bonds and aromaticity based on the standard amino acid templates. Source: https://github.com/OpenFreeEnergy/pdbinf/blob/c0ddf00bd068d7860b2e99b9f03847c890e3efb5/src/pdbinf/_pdbinf.py#L482 """ if templates is None: templates = [STANDARD_AA] resname = residue.resid.name.upper() # read the templates and assign correct bond orders for t in templates: # check if resname in template doc if resname not in t: continue # SMILES template if "SMILES" in t[resname]: new_res = assign_bond_orders_from_smiles( template_smiles=t[resname]["SMILES"], mol=residue ) break # cif template new_res = strip_bonds(residue) # strip bonds and chiral tags new_res = assign_intra_props(new_res, t[resname]) break # avoid double assignment, ordering of templates is relevant else: raise ValueError(f"Failed to find template for residue: '{resname}'") return Residue(new_res)
[docs]def assign_bond_orders_from_smiles( template_smiles: str, mol: Residue | Chem.Mol ) -> Chem.Mol: """Assign bond orders from a SMILES template to a residue. Parameters ---------- template_smiles : str The SMILES string of the template to assign bond orders from. residue : Residue | Chem.Mol The residue or molecule to assign bond orders to. Returns ------- Chem.Mol The residue or molecule with assigned bond orders from the template. """ for atm in mol.GetAtoms(): # Set the necessary property for _adjust_hydrogens function (index of the atom) atm.SetIntProp("_MDAnalysis_index", atm.GetIdx()) # Call template from SMILES mol_template = Chem.MolFromSmiles(template_smiles) # use the available function to assign bond orders (adjust hydrogens) return pdbqt_supplier._adjust_hydrogens(mol_template, mol)
# The below code contains functions for fixing molecule bond orders. # The code is adapted from pdbinf: https://github.com/OpenFreeEnergy/pdbinf/tree/main # Accessed on: 16 June, 2025 under MIT License.
[docs]def strip_bonds(m: Chem.Mol) -> Chem.Mol: """Strip all bonds and chiral tags from a molecule. Parameters ---------- m : rdkit.Chem.Mol The input molecule to strip bonds from. Returns ------- rdkit.Chem.Mol The modified molecule with all bonds and chiral tags removed. Note ---- This function is adapted from the `pdbinf/_pdbinf.py` module's `strip_bonds` function. Source: https://github.com/OpenFreeEnergy/pdbinf/blob/c0ddf00bd068d7860b2e99b9f03847c890e3efb5/src/pdbinf/_pdbinf.py#L71 """ with Chem.RWMol(m) as em: for b in m.GetBonds(): em.RemoveBond(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) # rdkit, perhaps rightfully, gets upset at chiral tags w/ no bonds for at in em.GetAtoms(): at.SetChiralTag(Chem.CHI_UNSPECIFIED) return em.GetMol()
[docs]def assign_intra_props(mol: Chem.Mol, reference_block: dict) -> Chem.Mol: """Assign bonds and aromaticity based on residue and atom names from a reference block (cif template molecule). Parameters ---------- mol : rdkit.Chem.Mol The input molecule, this is modified in place and returned. reference_block : dict The cif template molecule from which to assign bonds. Returns ------- rdkit.Chem.Mol The modified molecule with assigned bonds and aromaticity. Note ---- This function is adapted from the `pdbinf/_pdbinf.py` module's `assign_intra_props` function, which is used to assign bonds and aromaticity based on the standard amino acid templates. Source: https://github.com/OpenFreeEnergy/pdbinf/blob/c0ddf00bd068d7860b2e99b9f03847c890e3efb5/src/pdbinf/_pdbinf.py#L117 """ nm_2_idx = {} fc_special_assign_nm_idx_fc_pair = {} # convert indices to names res_alias = RESNAME_ALIASES.get(reference_block["name"], reference_block["name"]) aliases = ATOMNAME_ALIASES.get(res_alias, {}) for atom in mol.GetAtoms(): nm = atom.GetMonomerInfo().GetName().strip() nm = aliases.get(nm, nm) if ( reference_block["name"] in FORMAL_CHARGE_ALIASES and nm in FORMAL_CHARGE_ALIASES[reference_block["name"]] ): fc_special_assign_nm_idx_fc_pair[atom.GetIdx()] = FORMAL_CHARGE_ALIASES[ reference_block["name"] ][nm] nm_2_idx[nm] = atom.GetIdx() logger.debug(f"assigning intra props for {reference_block['name']}") with Chem.RWMol(mol) as em: # grab bond data from cif Block for _, row in reference_block["_chem_comp_bond"].iterrows(): nm1, nm2 = row["atom_id_1"], row["atom_id_2"] order, arom = row["value_order"], row["pdbx_aromatic_flag"] try: idx1, idx2 = nm_2_idx[nm1], nm_2_idx[nm2] except KeyError: continue # [different from pdbinf] we priorituze SINGLE and DOUBLE bonds if order == "SING": order = Chem.BondType.SINGLE elif order == "DOUB": order = Chem.BondType.DOUBLE elif arom == "Y": order = Chem.BondType.AROMATIC else: order = Chem.BondType.UNSPECIFIED logger.debug(f"adding bond: {nm1}-{nm2} at {idx1} {idx2} {order}") em.AddBond(idx1, idx2, order=order) # find lone hydrogens, then attach to the nearest heavy atom em = _assign_intra_props_lone_H(em) # assign aromaticity for atoms based on the template for _, row in reference_block["_chem_comp_atom"].iterrows(): nm, arom = row["atom_id"].strip(), row["pdbx_aromatic_flag"].strip() try: idx = nm_2_idx[nm] except KeyError: # todo: could check atom is marked as leaving atom continue atom = em.GetAtomWithIdx(idx) atom.SetIsAromatic(arom == "Y") # [different from pdbinf] assign formal charge for a specific atom if fc_special_assign_nm_idx_fc_pair != {}: for ( fc_special_assign_nm_idx, fc_to_assign, ) in fc_special_assign_nm_idx_fc_pair.items(): atom = em.GetAtomWithIdx(fc_special_assign_nm_idx) atom.SetFormalCharge(fc_to_assign) logger.debug( f"Assigned {reference_block['name']}'s formal charge {fc_to_assign} " f"on {atom.GetPDBResidueInfo().GetName()}." ) # [different from pdbinf] sanitize the molecule mol = em.GetMol() mol.UpdatePropertyCache() Chem.SanitizeMol(mol) return mol
def _assign_intra_props_lone_H(em: Chem.RWMol) -> Chem.RWMol: """Assign lone hydrogens to the nearest heavy atom in the molecule. This is a part function for assign_intra_props. Parameters ---------- em : rdkit.Chem.RWMol The input editable molecule to assign lone hydrogens to the nearest heavy atom. Returns ------- rdkit.Chem.RWMol The modified molecule with lone hydrogens assigned to the nearest heavy atom. Note ---- This function is adapted from the `pdbinf/_pdbinf.py` module's `assign_intra_props` function. Source: https://github.com/OpenFreeEnergy/pdbinf/blob/c0ddf00bd068d7860b2e99b9f03847c890e3efb5/src/pdbinf/_pdbinf.py#L167 """ additional_bonds = [] heavy_atoms = [] lone_H = [] for atom in em.GetAtoms(): if atom.GetAtomicNum() != 1: heavy_atoms.append(atom.GetIdx()) continue if atom.GetBonds(): # if any bonds, we're ok continue lone_H.append(atom.GetIdx()) if lone_H: logger.debug(f"found lone hydrogens: {lone_H}") conf = em.GetConformer() for idx in lone_H: pos = conf.GetAtomPosition(idx) minidx, mindist = -1, float("inf") for idx2 in heavy_atoms: pos2 = conf.GetAtomPosition(idx2) d = pos.Distance(pos2) if d > mindist: continue minidx, mindist = idx2, d if mindist < MAX_AMIDE_LENGTH: logger.debug( f"attached hydrogen {idx} to heavy atom {minidx} d={mindist}" ) additional_bonds.append((idx, minidx)) if additional_bonds: for i, j in additional_bonds: em.AddBond(i, j, order=Chem.BondType.SINGLE) return em