# Source code for prolif.io.cif

"""
I/O-related helper functions --- :mod:`prolif.io.cif`
=====================================================
This module provides a lightweight parser for the Crystallographic Information File
(CIF) format.

Yu-Yuan (Stuart) Yang, 2025
"""

import shlex
from pathlib import Path

import pandas as pd


# A user can provide a custom CIF file with the standard amino acids.
def _block_decompose(data_block: list) -> tuple:
    """
    Decomposes a CIF data block into decriptive information and tables.

    .. versionadded:: 2.1.0
    """
    descriptions: list[str] = []
    data_tables: list[list] = []
    data_table: list[str] | None = None

    for idx, block_line in enumerate(data_block):
        if block_line.startswith("#"):
            if data_table is not None:
                # save the current table
                data_tables.append(data_table)
            # reset the table
            data_table = None
        elif block_line.startswith("loop_"):
            # table format
            data_table = []
        elif data_table is not None:
            # add data to the current table
            data_table.append(block_line)
            if idx == len(data_block) - 1:  # last line of the block
                # save the final table
                data_tables.append(data_table)
        else:
            descriptions.append(block_line)

    return descriptions, data_tables


def cif_parser_lite(cif_string: str) -> dict:
    """
    Parses a CIF string and returns a dictionary of data blocks.

    .. versionadded:: 2.1.0

    Parameters
    ----------
    cif_string : str
        The CIF string to parse.

    Returns
    -------
    dict
        Maps each data block name (the text following ``data_``) to a dictionary
        holding the block ``"name"``, one nested dictionary per category of
        single-value items (``_category.item value`` lines) and one
        :class:`pandas.DataFrame` per ``loop_`` table. Categories and tables are
        keyed by the text before the first ``.`` of their item names, leading
        underscore included.

    Notes
    -----
    Each single-value item is expected to hold its name and its value on the same
    line. Lines found before the first ``data_`` header, ``##`` lines and blank
    lines are ignored.

    """
    # Split the CIF string into blocks based on 'data_' lines
    prefix = "data_"
    data_blocks: dict[str, list[str]] = {}
    data_block: list[str] | None = None
    for line in cif_string.strip().splitlines():
        if line.startswith(prefix):
            # Register the block as soon as it opens, so that it is kept even
            # when no '##' line follows it. Only the leading prefix is removed,
            # so names that contain 'data_' themselves stay intact.
            data_block = []
            data_blocks[line.removeprefix(prefix).strip()] = data_block
        elif data_block is None or line.startswith("##"):
            # text preceding the first block, or a '##' separator line
            continue
        else:
            stripped = line.strip()
            if stripped:
                data_block.append(stripped)

    # create a dictionary to hold the parsed data
    cif_dict: dict = {}
    for block_name, block_lines in data_blocks.items():
        descriptions, data_tables = _block_decompose(block_lines)
        block: dict = {"name": block_name}

        # descriptive information
        for each in descriptions:
            content = shlex.split(each)
            info_name = content[0].split(".")
            block.setdefault(info_name[0], {})[info_name[1]] = content[1]

        # data tables
        for data_table in data_tables:
            if not data_table:
                # a 'loop_' without any line has neither a name nor columns
                continue
            table_name = data_table[0].split(".")[0]
            header: list[str] = []
            data: list[list[str]] = []
            for each_line in data_table:
                if each_line.startswith("_"):
                    # header line
                    header.append(each_line.split(".")[1].strip())
                else:
                    # data line
                    # Non-POSIX shlex.split keeps quoted strings as one item but
                    # leaves the quotes in place, so double quotes are removed
                    # here.
                    data.append(
                        [
                            item.strip('"')
                            if item.startswith('"') and item.endswith('"')
                            else item
                            for item in shlex.split(each_line, posix=False)
                        ]
                    )

            block[table_name] = pd.DataFrame(data, columns=header)

        cif_dict[block_name] = block

    return cif_dict


def cif_template_reader(cif_filepath: Path | str) -> dict:
    """
    Reads a CIF file and returns a dictionary of data blocks.

    .. versionadded:: 2.1.0

    Parameters
    ----------
    cif_filepath : pathlib.Path or str
        The path to the CIF file to read.

    Returns
    -------
    dict
        A dictionary containing the parsed data blocks, as returned by
        :func:`cif_parser_lite`.

    """
    # Path() accepts both str and Path, so both input types share one code path.
    cif_string = Path(cif_filepath).read_text()

    return cif_parser_lite(cif_string)