Source code for prolif.interactions.water_bridge

"""
Water-mediated interactions --- :mod:`prolif.interactions.water_bridge`
=======================================================================

This module contains the :class:`~prolif.interactions.water_bridge.WaterBridge` class
for analyzing water-mediated interactions. It does so by generating fingerprints for
ligand-water, water-protein, and water-water interactions. The results are then
combined to identify water-bridged interactions using a graph-based approach.

.. versionadded:: 2.1.0

"""

import itertools as it
from collections import defaultdict
from collections.abc import Iterable, Iterator
from typing import TYPE_CHECKING, Any, Optional, Union, cast

import networkx as nx
from MDAnalysis.core.groups import UpdatingAtomGroup

from prolif.ifp import IFP, InteractionData
from prolif.interactions.base import BridgedInteraction

if TYPE_CHECKING:
    from prolif.molecule import Molecule
    from prolif.residue import ResidueId
    from prolif.typeshed import IFPResults, MDAObject, Trajectory


[docs]class WaterBridge(BridgedInteraction):
    """Implementation of the water-bridge analysis.

    Parameters
    ----------
    water : Union[MDAnalysis.core.groups.AtomGroup, Iterable[prolif.molecule.Molecule]]
        An MDAnalysis AtomGroup or iterable of prolif Molecule objects containing the
        water molecules
    order : int
        Maximum number of water molecules that can be involved in a water-bridged
        interaction.
    min_order : int
        Minimum number of water molecules that can be involved in a water-bridged
        interaction.
    hbdonor : Optional[dict]
        Parameters for the HBDonor interaction passed to the underlying fingerprint
        generator. See :class:`~prolif.interactions.interactions.HBDonor` for more
        details.
    hbacceptor : Optional[dict]
        Same as above for :class:`~prolif.interactions.interactions.HBAcceptor`.
    atomgroup_converter_kwargs : Optional[dict]
        Optional parameters passed to the MDAnalysis' RDKitConverter if the
        specified `water` is an MDAnalysis AtomGroup.
    count : bool
        Whether to generate a count fingerprint or just a binary one.

    Notes
    -----
    This analysis currently only runs in serial.

    .. versionadded:: 2.1.0
    """

    def __init__(
        self,
        water: Union["MDAObject", Iterable["Molecule"]],
        order: int = 1,
        min_order: int = 1,
        hbdonor: dict | None = None,
        hbacceptor: dict | None = None,
        atomgroup_converter_kwargs: dict | None = None,
        count: bool = False,
    ) -> None:
        # circular import
        from prolif.fingerprint import Fingerprint

        if order < 1:
            raise ValueError("order must be greater than 0")
        if min_order > order:
            raise ValueError("min_order cannot be greater than order")
        self.water = water
        # handle AtomGroup generated with `updating=True` automatically
        self.water_conv_kwargs = atomgroup_converter_kwargs or (
            {"NoImplicit": False, "cache": False}
            if isinstance(water, UpdatingAtomGroup)
            else {}
        )
        self.order = order
        self.min_order = min_order
        self.water_fp = Fingerprint(
            interactions=["HBDonor", "HBAcceptor"],
            parameters={"HBDonor": hbdonor or {}, "HBAcceptor": hbacceptor or {}},
            count=count,
        )
        super().__init__()

[docs]    def setup(self, ifp_store: Optional["IFPResults"] = None, **kwargs: Any) -> None:
        super().setup(ifp_store=ifp_store, **kwargs)
        self.kwargs.pop("n_jobs", None)
        self.residues = self.kwargs.pop("residues", None)
        self.converter_kwargs = self.kwargs.pop("converter_kwargs", ({}, {}))
        self.water_fp.use_segid = self.kwargs.pop("use_segid", False)

[docs]    def run(
        self,
        traj: "Trajectory",
        lig: "MDAObject",
        prot: "MDAObject",
    ) -> "IFPResults":
        """Run the water bridge analysis for a trajectory.

        Parameters
        ----------
        traj : Union[MDAnalysis.coordinates.base.ProtoReader, MDAnalysis.coordinates.base.FrameIteratorSliced]
            Iterate over this Universe trajectory or sliced trajectory object
            to extract the frames used for the fingerprint extraction
        lig : MDAnalysis.core.groups.AtomGroup
            An MDAnalysis AtomGroup for the ligand
        prot : MDAnalysis.core.groups.AtomGroup
            An MDAnalysis AtomGroup for the protein (with multiple residues)
        """  # noqa: E501
        water_obj = cast("MDAObject", self.water)
        # Run analysis for ligand-water and water-protein interactions
        lig_water_ifp: dict[int, IFP] = self.water_fp._run_serial(
            traj,
            lig,
            water_obj,
            residues=None,
            converter_kwargs=(self.converter_kwargs[0], self.water_conv_kwargs),
            **self.kwargs,
            desc="Ligand-Water",
        )
        water_prot_ifp: dict[int, IFP] = self.water_fp._run_serial(
            traj,
            water_obj,
            prot,
            residues=self.residues,
            converter_kwargs=(self.water_conv_kwargs, self.converter_kwargs[1]),
            **self.kwargs,
            desc="Water-Protein",
        )
        if self.order >= 2:
            # Run water-water interaction analysis
            water_ifp: dict[int, IFP] | None = self.water_fp._run_serial(
                traj,
                water_obj,
                water_obj,
                residues=None,
                converter_kwargs=(self.water_conv_kwargs, self.water_conv_kwargs),
                **self.kwargs,
                desc="Water-Water",
            )
        else:
            water_ifp = None

        for frame in lig_water_ifp:
            ifp_lw = lig_water_ifp[frame]  # Ligand → Water
            ifp_wp = water_prot_ifp[frame]  # Water → Protein
            self.ifp.setdefault(frame, IFP())

            if water_ifp is not None:
                ifp_ww = water_ifp[frame]  # WaterX -> WaterY
                self._any_order(frame, ifp_lw, ifp_ww, ifp_wp)

            else:
                self._first_order_only(frame, ifp_lw, ifp_wp)

        return self.ifp

[docs]    def run_from_iterable(
        self, lig_iterable: Iterable["Molecule"], prot_mol: "Molecule"
    ) -> "IFPResults":
        """Run the water-bridge analysis for an iterable of molecules.

        Parameters
        ----------
        lig_iterable : list or generator
            An iterable yielding ligands as :class:`~prolif.molecule.Molecule`
            objects
        prot_mol : prolif.molecule.Molecule
            The protein
        """
        water_obj = cast("Molecule", self.water)
        # Run analysis for ligand-water and water-protein interactions
        lig_water_ifp: "IFPResults" = self.water_fp._run_iter_serial(
            lig_iterable, water_obj, residues=None, **self.kwargs
        )
        water_prot_ifp: "IFPResults" = self.water_fp._run_iter_serial(
            [water_obj], prot_mol, residues=self.residues, **self.kwargs
        )
        ifp_wp = water_prot_ifp[0]  # Water → Protein

        if self.order >= 2:
            # Run water-water interaction analysis
            water_ifp: "IFPResults" = self.water_fp._run_iter_serial(
                [water_obj], water_obj, residues=None, **self.kwargs
            )
            ifp_ww: IFP | None = water_ifp[0]  # WaterX -> WaterY
        else:
            ifp_ww = None

        for pose in lig_water_ifp:
            ifp_lw = lig_water_ifp[pose]  # Ligand → Water
            self.ifp.setdefault(pose, IFP())

            if ifp_ww is not None:
                self._any_order(pose, ifp_lw, ifp_ww, ifp_wp)

            else:
                self._first_order_only(pose, ifp_lw, ifp_wp)
        return self.ifp

    def _first_order_only(self, frame: int, ifp_lw: IFP, ifp_wp: IFP) -> None:
        """Iterates over all relevant combinations of ligand-water-protein"""
        # for each ligand-water interaction
        for data_lw in ifp_lw.interactions():
            # for each water-protein interaction
            for data_wp in ifp_wp.interactions():
                if data_lw.protein == data_wp.ligand:
                    self._merge_metadata(frame, data_lw, data_wp)

    def _any_order(self, frame: int, ifp_lw: IFP, ifp_ww: IFP, ifp_wp: IFP) -> None:
        """Generate results for any order of water-bridge interactions.

        Constructs a graph to represent the water network and iterates over all paths
        up to a given length (corresponding to ``order + 1``).
        """
        # MultiGraph to allow the same pair of nodes to interact as both HBA and HBD
        # and potentially multiple groups of atoms satisfying the constraints.
        # Each of these interaction will have its own edge in the graph.
        graph: nx.MultiGraph["ResidueId"] = nx.MultiGraph()
        nodes: defaultdict[str, set[ResidueId]] = defaultdict(set)

        # construct graph of water interactions
        for ifp, role in [(ifp_lw, "ligand"), (ifp_wp, "protein")]:
            for data in ifp.interactions():
                graph.add_edge(data.ligand, data.protein, int_data=data)
                # assign ligand and protein residue nodes to corresponding role
                nodes[role].add(getattr(data, role))

        # remove mirror interactions before adding water-water to the graph
        deduplicated = {
            frozenset((ligand_resid, protein_resid))
            for ligand_resid, protein_resid in ifp_ww
            if ligand_resid != protein_resid
        }
        for ligand_resid, protein_resid in deduplicated:
            ww_dict = ifp_ww.data[ligand_resid, protein_resid]
            for int_name, metadata_tuple in ww_dict.items():
                for metadata in metadata_tuple:
                    data = InteractionData(
                        ligand=ligand_resid,
                        protein=protein_resid,
                        interaction=int_name,
                        metadata=metadata,
                    )
                    graph.add_edge(
                        data.ligand, data.protein, int_data=data, water_only=True
                    )

        # find all edge paths of length up to `order + 1`
        for source in nodes["ligand"]:
            targets = (t for t in nodes["protein"] if nx.has_path(graph, source, t))
            paths = cast(
                Iterator[list[tuple["ResidueId", "ResidueId", str]]],
                nx.all_simple_edge_paths(graph, source, targets, cutoff=self.order + 1),
            )
            for path in paths:
                if len(path) <= self.min_order:
                    continue
                # path is a list[tuple[node_id1, node_id2, deduplication_key]]
                # first element in path is lig-water1, last is waterN-prot
                data_lw = cast(InteractionData, graph.edges[path[0]]["int_data"])
                data_wp = cast(InteractionData, graph.edges[path[-1]]["int_data"])
                ww_edges = [graph.edges[p] for p in path[1:-1]]
                # only include if strictly passing through water (going back through
                # ligand or protein is not a valid higher-order interaction)
                if all(e.get("water_only") for e in ww_edges):
                    # reorder ligand and protein in InteractionData to be contiguous
                    # i.e. lig-w1, w1-w2, w2-prot instead of lig-w1, w2-w1, w2-prot
                    data_ww_list = []
                    left = data_lw.protein
                    for e in ww_edges:
                        d = cast(InteractionData, e["int_data"])
                        is_sorted = d.ligand == left
                        data_ww = InteractionData(
                            ligand=d.ligand if is_sorted else d.protein,
                            protein=d.protein if is_sorted else d.ligand,
                            # interaction name is not kept in final metadata
                            # so no need to invert it
                            interaction=d.interaction,
                            # the indices of "ligand" water and "protein" water are
                            # merged in the final metadata for water mols so no need to
                            # invert the roles in indices
                            metadata=d.metadata,
                        )
                        data_ww_list.append(data_ww)
                        left = data_ww.protein
                    self._merge_metadata(frame, data_lw, data_wp, *data_ww_list)

    def _merge_metadata(
        self,
        frame: int,
        data_lw: InteractionData,
        data_wp: InteractionData,
        *data_ww_args: InteractionData,
    ) -> None:
        """Merge results from all fingerprints on matching water residues"""
        # get indices for individual water molecules
        water_indices = defaultdict(set)
        for data, role in [
            (data_lw, "protein"),
            (data_wp, "ligand"),
            *it.chain.from_iterable(
                [
                    [(data_ww, "ligand"), (data_ww, "protein")]
                    for data_ww in data_ww_args
                ]
            ),
        ]:
            resid = getattr(data, role)
            water_indices[str(resid)].update(data.metadata["indices"][role])
        # construct merged metadata
        metadata = {
            "indices": {
                "ligand": data_lw.metadata["indices"]["ligand"],
                "protein": data_wp.metadata["indices"]["protein"],
                **{key: tuple(indices) for key, indices in water_indices.items()},
            },
            "parent_indices": {
                "ligand": data_lw.metadata["parent_indices"]["ligand"],
                "protein": data_wp.metadata["parent_indices"]["protein"],
                "water": tuple(
                    set().union(
                        data_lw.metadata["parent_indices"]["protein"],
                        data_wp.metadata["parent_indices"]["ligand"],
                        *it.chain.from_iterable(
                            [
                                [
                                    data_ww.metadata["parent_indices"]["ligand"],
                                    data_ww.metadata["parent_indices"]["protein"],
                                ]
                                for data_ww in data_ww_args
                            ]
                        ),
                    )
                ),
            },
            "water_residues": tuple(
                dict.fromkeys(  # uniquify but keep order
                    [
                        data_lw.protein,
                        *it.chain.from_iterable(
                            [
                                [data_ww.ligand, data_ww.protein]
                                for data_ww in data_ww_args
                            ]
                        ),
                        data_wp.ligand,
                    ]
                )
            ),
            "order": len(data_ww_args) + 1,
            "ligand_role": data_lw.interaction,
            "protein_role": (  # invert role
                "HBDonor" if data_wp.interaction == "HBAcceptor" else "HBAcceptor"
            ),
            **{
                f"{key}{suffix}": data.metadata[key]
                for suffix, data in [
                    (f"_ligand_{data_lw.protein}", data_lw),
                    *it.chain.from_iterable(
                        [
                            [(f"_{data_ww.ligand}_{data_ww.protein}", data_ww)]
                            for data_ww in data_ww_args
                        ]
                    ),
                    (f"_{data_wp.ligand}_protein", data_wp),
                ]
                for key in ["distance", "DHA_angle"]
            },
            "distance": sum(
                data.metadata["distance"] for data in [data_lw, *data_ww_args, data_wp]
            ),
        }

        # store metadata
        self.ifp[frame].setdefault((data_lw.ligand, data_wp.protein), {}).setdefault(
            "WaterBridge", []
        ).append(metadata)  # type: ignore[attr-defined]