Source code for prolif.io.cif

"""
I/O-related helper functions --- :mod:`prolif.io.cif`
=====================================================
This module provides a lightweight parser for Crystallographic Information File (CIF)
format.

Yu-Yuan (Stuart) Yang, 2025
"""

import shlex
from pathlib import Path

import pandas as pd


# A user can provide a custom CIF file with the standard amino acids.
def _block_decompose(data_block: list) -> tuple:
    """
    Decompose a CIF data block into descriptive information and tables.

    The lines are scanned in order. A line starting with ``loop_`` opens a
    table, and every following line is collected into that table until it is
    closed by a line starting with ``#``, by another ``loop_`` line, or by the
    end of the block. Lines found outside of a table (other than ``#`` lines)
    are collected as descriptive lines.

    .. versionadded:: 2.1.0

    Parameters
    ----------
    data_block : list
        The lines (strings) of a single CIF data block.

    Returns
    -------
    tuple
        ``(descriptions, data_tables)`` where ``descriptions`` is the list of
        lines found outside of any table and ``data_tables`` is a list of
        tables, each being the list of lines (headers and rows) that followed
        a ``loop_`` line. Loops without any line are not reported.
    """
    descriptions: list[str] = []
    data_tables: list[list[str]] = []
    data_table: list[str] | None = None

    for block_line in data_block:
        opens_table = block_line.startswith("loop_")
        if opens_table or block_line.startswith("#"):
            # Both markers end the table being collected. Closing it on
            # 'loop_' too keeps a table that is directly followed by another
            # loop; empty loops are dropped because they carry no data.
            if data_table:
                data_tables.append(data_table)
            data_table = [] if opens_table else None
        elif data_table is not None:
            # add data to the current table
            data_table.append(block_line)
        else:
            descriptions.append(block_line)

    # a table still open at the end of the block has no closing marker
    if data_table:
        data_tables.append(data_table)

    return descriptions, data_tables


def cif_parser_lite(cif_string: str) -> dict:
    """
    Parse a CIF string and return a dictionary of data blocks.

    .. versionadded:: 2.1.0

    Parameters
    ----------
    cif_string : str
        The CIF string to parse.

    Returns
    -------
    dict
        A dictionary mapping each data block name to a dictionary that holds:

        * ``"name"``: the name of the block,
        * one nested dictionary per category of descriptive items, mapping
          the item name to its value (as a string),
        * one :class:`pandas.DataFrame` per ``loop_`` table, with one column
          per item of the loop and string values.

        Categories and tables are keyed by the text found before the first
        ``.`` of their item names (e.g. ``_chem_comp`` for ``_chem_comp.id``).

    Notes
    -----
    This is a line-based parser: each descriptive item must have its key and
    its value on the same line. Lines found before the first ``data_`` header
    and blank lines are ignored.
    """
    # Split the CIF string into blocks based on 'data_' lines
    data_blocks: dict[str, list[str]] = {}
    block_lines: list[str] | None = None
    for line in cif_string.strip().split("\n"):
        if line.startswith("data_"):
            # removeprefix keeps block names that themselves contain 'data_'
            # intact; strip drops a trailing '\r' from Windows line endings
            block_lines = []
            # register the block right away so that it is kept even when no
            # '##' line or end of input follows it
            data_blocks[line.removeprefix("data_").strip()] = block_lines
        elif block_lines is None or line.startswith("##"):
            # '##' only marks the end of a block, and anything found before
            # the first 'data_' header does not belong to any block
            continue
        else:
            content_line = line.strip()
            if content_line:
                block_lines.append(content_line)

    # create a dictionary to hold the parsed data
    cif_dict: dict = {}
    for block_name, data_block in data_blocks.items():
        descriptions, data_tables = _block_decompose(data_block)
        block_dict: dict = {"name": block_name}
        cif_dict[block_name] = block_dict

        # descriptive information: '_category.item value'
        for each in descriptions:
            if not each:
                continue
            content = shlex.split(each)
            category, item = content[0].split(".")[:2]
            block_dict.setdefault(category, {})[item] = content[1]

        # data tables
        for data_table in data_tables:
            if not data_table:
                # a 'loop_' without any line has no name and no data
                continue
            header: list[str] = []
            data: list[list[str]] = []
            table_name = data_table[0].split(".")[0]
            for each_line in data_table:
                if each_line.startswith("_"):
                    # header line
                    header.append(each_line.split(".")[1].strip())
                else:
                    # data line: non-POSIX mode keeps quoted strings together
                    # and leaves the quotes in place, so surrounding double
                    # quotes are removed here
                    row = [
                        item.strip('"')
                        if item.startswith('"') and item.endswith('"')
                        else item
                        for item in shlex.split(each_line, posix=False)
                    ]
                    data.append(row)

            block_dict[table_name] = pd.DataFrame(data, columns=header)

    return cif_dict
def cif_template_reader(cif_filepath: Path | str) -> dict:
    """
    Load a CIF file from disk and parse it into a dictionary of data blocks.

    .. versionadded:: 2.1.0

    Parameters
    ----------
    cif_filepath : pathlib.Path or str
        Location of the CIF file to read.

    Returns
    -------
    dict
        The parsed data blocks, as returned by :func:`cif_parser_lite` for
        the text content of the file.
    """
    return cif_parser_lite(Path(cif_filepath).read_text())