modules

Top-level package for procaliper.

Protein

Source code in procaliper/_protein.py
class Protein:
    UNIPROT_SITE_PATTERNS = {
        "Active site": "ACT_SITE",
        "Binding site": "BINDING",
        "DNA binding": "DNA_BIND",
        "Disulfide bond": "DISULFID",
        "Beta strand": "STRAND",
        "Helix": "HELIX",
        "Turn": "TURN",
        "PTM": "MOD_RES",
        "Region": "REGION",
        "Domain": "DOMAIN",
    }

    UNIPROT_SITE_PATTERNS_RECTIFIED = {
        "active": "ACT_SITE",
        "binding": "BINDING",
        "dna_binding": "DNA_BIND",
        "disulfide_bond": "DISULFID",
        "beta_strand": "STRAND",
        "helix": "HELIX",
        "turn": "TURN",
        "modified_residue": "MOD_RES",
        "region": "REGION",
        "domain": "DOMAIN",
        "domain_[ft]": "DOMAIN",
    }

    UNIPROT_API_DEFAULT_FIELDS = [
        "id",
        "reviewed",
        "protein_name",
        "gene_names",
        "organism_name",
        "length",
        "sequence",
        "ft_act_site",
        "ft_binding",
        "ft_dna_bind",
        "ft_disulfid",
        "ft_strand",
        "ft_helix",
        "ft_turn",
        "ft_mod_res",
        "ft_region",
        "ft_domain",
    ]

    def __init__(self) -> None:
        self.data: dict[str, Any] = {}
        self.pdb_location_relative: str | None = None
        self.pdb_location_absolute: str | None = None

        self.site_annotations: SiteAnnotations = SiteAnnotations("")
        self.custom_site_data: CustomSiteData = CustomSiteData([], {})

        self.confidence_data: list[float] | None = None
        self.sasa_data: structure.sasa.SASAData | None = None
        self.charge_data: structure.charge.ChargeData | None = None
        self.cysteine_data: structure.cysteine_data.CysteineData | None = None
        self.titration_data: structure.titration.TitrationData | None = None
        self.structure_index: list[int] | None = None
        self.sequence_position_to_structure_index: dict[int, int] | None = None
        pass

    def _rectify_label(self, label: str) -> str:
        new_label = label.replace(" ", "_").lower()
        new_label = new_label.removesuffix("_site_sites")
        new_label = new_label.removesuffix("_site")
        return new_label

    def _rectify_data_labels(self) -> None:
        """
        Standardize the feature names in self.data.

        Replaces all spaces with underscores, lowercases the keys, and then
        strips any trailing "_site_sites" or "_site" suffix.
        """
        for k in list(self.data.keys()):
            new_key = self._rectify_label(k)
            self.data[new_key] = self.data.pop(k)

    @classmethod
    def from_uniprot_row(cls, row: dict[str, Any]) -> Protein:
        """Create a new Protein object from a row from a Uniprot table

        Args:
            row (dict[str, Any]): Contains the data from the Uniprot table. Must
                have "Sequence" or "sequence" as a key.

        Raises:
            ValueError: If "Sequence" or "sequence" is not found in the row.

        Returns:
            Protein: A processed and standardized protein object.
        """
        p = cls()
        if "Sequence" in row:
            p.data["sequence"] = row["Sequence"]
        elif "sequence" in row:
            p.data["sequence"] = row["sequence"]
        else:
            raise ValueError(f"Sequence not found in row: {row}")
        p.custom_site_data.add_residue_numbers(len(p.data["sequence"]))
        p.site_annotations = SiteAnnotations(p.data["sequence"])
        for key, value in row.items():
            key = p._rectify_label(key)
            if key in cls.UNIPROT_SITE_PATTERNS_RECTIFIED:
                uniprot_description_id = cls.UNIPROT_SITE_PATTERNS_RECTIFIED[key]
                p.site_annotations.extract_annotation(uniprot_description_id, value)
            elif key in cls.UNIPROT_SITE_PATTERNS:
                uniprot_description_id = cls.UNIPROT_SITE_PATTERNS[key]
                p.site_annotations.extract_annotation(uniprot_description_id, value)
            else:
                if value != value:  # NaN check: NaN != NaN, so missing (NaN) values become empty strings
                    value = ""
                p.data[key] = value
        return p

    @classmethod
    def from_uniprot_id(
        cls,
        uniprot_id: str,
        fields: list[str] | None = None,
        from_db: str = "UniProtKB_AC-ID",
        to_db: str = "UniProtKB-Swiss-Prot",
    ) -> Protein:
        """Create a new Protein object from a Uniprot ID (fetches with Uniprot API)

        Args:
            uniprot_id (str): The Uniprot ID of the protein.
            fields (list[str] | None, optional): The fields to retrieve from
                Uniprot. If `None`, `Protein.UNIPROT_API_DEFAULT_FIELDS` is used.
            from_db (str, optional): The database to retrieve the ID from.
                Defaults to "UniProtKB_AC-ID".
            to_db (str, optional): The database to map to.
                Defaults to "UniProtKB-Swiss-Prot".

        Raises:
            ValueError: If we cannot retrieve the Uniprot ID.

        Returns:
            Protein: A processed and standardized protein object.
        """

        if not fields:
            fields = cls.UNIPROT_API_DEFAULT_FIELDS

        mapper = ProtMapper()

        result, error = mapper.get(
            ids=[uniprot_id], fields=fields, from_db=from_db, to_db=to_db
        )
        if error:
            raise ValueError(f"Uniprot id not retrieved: {error}")
        result.rename(columns={"From": "entry"}, inplace=True)
        if "Length" in result.columns:
            result["Length"] = pd.to_numeric(result["Length"])
        return cls.from_uniprot_row(result.iloc[0].to_dict())

    @classmethod
    def list_from_uniprot_ids(
        cls,
        uniprot_ids: list[str],
        fields: list[str] | None = None,
        from_db: str = "UniProtKB_AC-ID",
        to_db: str = "UniProtKB-Swiss-Prot",
    ) -> list[Protein]:
        """Create a list of Protein objects from a list of Uniprot IDs (fetches with Uniprot API)

        Args:
            uniprot_ids (list[str]): The Uniprot IDs of the proteins.
            fields (list[str] | None, optional): The fields to retrieve from
                Uniprot. If `None`, `Protein.UNIPROT_API_DEFAULT_FIELDS` is used.
            from_db (str, optional): The database to retrieve the IDs from.
                Defaults to "UniProtKB_AC-ID".
            to_db (str, optional): The database to map to.
                Defaults to "UniProtKB-Swiss-Prot".

        Raises:
            ValueError: If we cannot retrieve the Uniprot IDs.

        Returns:
            list[Protein]: A list of processed and standardized protein objects.
        """
        if not fields:
            fields = cls.UNIPROT_API_DEFAULT_FIELDS

        mapper = ProtMapper()

        result, error = mapper.get(
            ids=uniprot_ids, fields=fields, from_db=from_db, to_db=to_db
        )
        if error:
            raise ValueError(f"Uniprot id not retrieved: {error}")
        result.rename(columns={"From": "entry"}, inplace=True)

        if "Length" in result.columns:
            result["Length"] = pd.to_numeric(result["Length"])
        return [cls.from_uniprot_row(row.to_dict()) for _, row in result.iterrows()]

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Protein):
            return False
        return (
            self.data == other.data
            and self.sasa_data == other.sasa_data
            and self.charge_data == other.charge_data
            and self.cysteine_data == other.cysteine_data
        )

    def residue_data_frame(self) -> pd.DataFrame:
        d = dict(
            chain(
                self.get_charge().items(),
                self.get_sasa().items(),
                self.get_cysteine_data().items(),
                self.get_titration().items(),
            )
        )
        d["pLDDT"] = self.get_confidence()

        return pd.DataFrame(d)

    def get_biopandas_pdb_dataframe(self) -> PandasPdb:
        """Get the PDB dataframe for the protein.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `pdb_location_absolute` is not set.

        Returns:
            PandasPdb: A biopandas dataframe that contains the PDB file information.
        """
        if not self.pdb_location_absolute:
            raise ValueError("PDB location not set; use `fetch_pdb` first")
        ppdb = PandasPdb()
        return ppdb.read_pdb(self.pdb_location_absolute)

    def get_biopython_structure(self) -> Structure:
        """Get the biopython structure for the protein.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `pdb_location_absolute` is not set.
            ValueError: If the PDB file cannot be parsed.

        Returns:
            Structure: A biopython Structure object for the protein.
        """
        if not self.pdb_location_absolute:
            raise ValueError("PDB location not set; use `fetch_pdb` first")
        p = PDBParser(QUIET=True)
        structure = p.get_structure("", self.pdb_location_absolute)
        if not isinstance(structure, Structure):
            raise ValueError("Unable to parse PDB file.")
        return structure

    def get_biopython_residues(self) -> list[Residue]:
        """Get the biopython residues for the protein.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `pdb_location_absolute` is not set.

        Returns:
            list[Residue]: A list of biopython residues for the protein.
        """
        if not self.pdb_location_absolute:
            raise ValueError("PDB location not set; use `fetch_pdb` first")
        p = PDBParser(QUIET=True)
        structure = p.get_structure("", self.pdb_location_absolute)
        reslist = [res for model in structure for chain in model for res in chain]
        return reslist

    def get_confidence(self) -> list[float]:
        """Fetches precomputed confidence data from pdb file.

        Must run `self.fetch_pdb` first or specify an abosulute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `confidence_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            list[float]: A list of confidence values for each residue.
        """
        if self.confidence_data:
            return self.confidence_data

        if self.pdb_location_absolute:
            self.confidence_data = structure.confidence.residue_pLDDT(
                self.pdb_location_absolute,
            )
            return self.confidence_data
        else:
            raise ValueError(
                "Confidence data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_sasa(self) -> structure.sasa.SASAData:
        """Fetches precomputed SASA data for the protein, or computes it.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `sasa_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.sasa.SASAData: A :class:`protein_structure.sasa.SASAData`
                object containing the SASA values for residues and atoms.
        """
        if self.sasa_data:
            return self.sasa_data

        if self.pdb_location_absolute:
            self.sasa_data = structure.sasa.calculate_sasa(
                self.pdb_location_absolute,
            )
            return self.sasa_data
        else:
            raise ValueError(
                "SASA data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_charge(self, method: str = "gasteiger") -> structure.charge.ChargeData:
        """Fetches precomputed charge data for the protein, or computes it.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Args:
            method (str, optional): The method used for the charge calculation.
                Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to
                'gasteiger'. For a full list reference
                https://open-babel.readthedocs.io/en/latest/Charges/charges.html

        Raises:
            ValueError: If `charge_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.charge.ChargeData: A :class:`protein_structure.charge.ChargeData`
                object containing the charge values for residues and atoms.
        """
        if self.charge_data:
            if self.charge_data["charge_method"]:
                if self.charge_data["charge_method"][0] == method:
                    return self.charge_data

        if self.pdb_location_absolute:
            self.charge_data = structure.charge.calculate_charge(
                self.pdb_location_absolute,
                method=method,
            )

            self.last_charge_method = method

            return self.charge_data
        else:
            raise ValueError(
                "Charge data for specified method not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_cysteine_data(self) -> structure.cysteine_data.CysteineData:
        """Fetches precomputed size data for the protein, or computes it.

        Must run `self.fetch_pdb` first or specify an abosulute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `cysteine_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.cysteine_data.CysteineData: A :class:`protein_structure.cysteine_data.CysteineData`
                object containing the size values for cysteine sites.
        """
        if self.cysteine_data:
            return self.cysteine_data

        if self.pdb_location_absolute:
            self.cysteine_data = structure.cysteine_data.calculate_cysteine_data(
                self.pdb_location_absolute,
            )
            return self.cysteine_data
        else:
            raise ValueError(
                "Size data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_titration(self) -> structure.titration.TitrationData:
        """Runs the default titration calculation for the protein.

        Equivalent to running `self.get_titration_from_propka`.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `titration_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.titration.TitrationData: A
                :class:`protein_structure.titration.TitrationData` object containing
                the titration values for residues.
        """
        return self.get_titration_from_propka()

    def get_titration_from_propka(self) -> structure.titration.TitrationData:
        """Fetches precomputed titration data for the protein, or computes it.

        Uses :func:`protein_structure.titration.calculate_titration_propka` if
        `self.titration_data` is not already stored.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `titration_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.titration.TitrationData: A
                :class:`protein_structure.titration.TitrationData` object containing
                the titration values for residues."""
        if self.titration_data:
            return self.titration_data

        if self.pdb_location_absolute:
            self.titration_data = structure.titration.calculate_titration_propka(
                self.pdb_location_absolute,
            )
            return self.titration_data
        else:
            raise ValueError(
                "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_titration_from_pypka(self) -> structure.titration.TitrationData:
        """Fetches precomputed titration data for the protein, or computes it.

        Uses :func:`protein_structure.titration.calculate_titration_pypka` if
        `self.titration_data` is not already stored. Requires pypka to be
        installed, which has dependencies that are not FOSS. Please be sure to
        verify that you are legally allowed to use pypka.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `titration_data` is not already stored and
                `pdb_location_absolute` is not set.
            ImportError: If pypka is not installed.

        Returns:
            structure.titration.TitrationData: A
                :class:`protein_structure.titration.TitrationData` object containing
                the titration values for residues."""

        if self.titration_data:
            return self.titration_data

        if self.pdb_location_absolute:
            self.titration_data = structure.titration.calculate_titration_pypka(
                self.pdb_location_absolute,
            )
            return self.titration_data
        else:
            raise ValueError(
                "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_titration_from_pkai(self) -> structure.titration.TitrationData:
        """Fetches precomputed titration data for the protein, or computes it.

        Uses :func:`protein_structure.titration.calculate_titration_pkai` if
        `self.titration_data` is not already stored. Requires pkai to be
        installed. Note that this method is a deep-learning model, not a
        physics-based calculation.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `titration_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.titration.TitrationData: A
                :class:`protein_structure.titration.TitrationData` object containing
                the titration values for residues."""
        if self.titration_data:
            return self.titration_data

        if self.pdb_location_absolute:
            self.titration_data = structure.titration.calculate_titration_pkai(
                self.pdb_location_absolute,
            )
            return self.titration_data
        else:
            raise ValueError(
                "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def add_custom_site_data_column(
        self, key: str, site_data: list[Any], overwrite: bool = False
    ) -> None:
        if not self.custom_site_data.residue_number:
            self.custom_site_data.add_residue_numbers(len(self.data["sequence"]))
        self.custom_site_data.add_site_data(key, site_data, overwrite=overwrite)

    def unravel_sites(
        self,
        selected_aas: None | set[AminoAcidLetter] = None,
        selected_keys: None | set[str] = None,
    ) -> dict[str, list[Any]]:
        """Split the protein into individual sites, recording values for each.

        Args:
            selected_aas: A set of amino acid letters to include in the output.
                If `None` (default), all amino acids will be included.
            selected_keys: A set of keys belonging to this `Protein` object's
                `data` dictionary to include in the output. If `None` (default),
                all keys are used.

        Returns:
            dict[str, list[Any]]: A dictionary mapping keys to lists of values.
                Each list is a parallel array of the same length as the protein
                sequence (after filtering out non-selected amino acids)."""
        tbl = self.site_annotations.table() | self.custom_site_data.table()
        if selected_keys is None:
            selected_keys = (set(tbl.keys()) | set(self.data.keys())) - {"sequence"}
        tbl_keys = selected_keys & set(tbl.keys())
        data_keys = selected_keys & set(self.data.keys())
        assert tbl_keys.isdisjoint(data_keys)
        res: dict[str, list[Any]] = {k: [] for k in selected_keys}
        for index, site in enumerate(self.data["sequence"]):
            if selected_aas and site not in selected_aas:
                continue
            for k in tbl_keys:
                res[k].append(tbl[k][index])
            for k in data_keys:
                res[k].append(self.data[k])  # will be the same for all sites

        return res

    def fetch_pdb(self, save_path: str | None = None, url: str | None = None) -> None:
        """Fetches the PDB file for the protein (from the AlphaFold database by default).

        Args:
            save_path (str | None, optional): The path to save the PDB file to.
                If `None`, the protein name will be used as the file name.
                Defaults to `None`.
            url (str | None, optional): The URL to fetch the PDB file from.
                Defaults to `None`, in which case the AlphaFold database is used.

        Raises:
            Exception: If the response status code is not 200, meaning we could
                not fetch the PDB from the database."""
        if not url:
            url = f"https://alphafold.ebi.ac.uk/files/AF-{self.data['entry']}-F1-model_v4.pdb"
        if not save_path:
            save_path = f"{self.data['entry']}.pdb"

        response = requests.get(url)

        if response.status_code != 200:
            raise Exception(f"Failed to fetch PDB: {response.status_code}")

        with open(save_path, "wb+") as f:
            f.write(response.content)

        self.pdb_location_relative = save_path
        self.pdb_location_absolute = os.path.abspath(save_path)
        self._build_structure_index()

    def register_local_pdb(self, path_to_pdb_file: str | None = None) -> None:
        """Sets pdb file for protein object using local pdb file.

        Args:
            path_to_pdb_file (str | None, optional): Path to local PDB file.
                Defaults to `None`, in which case a file named '<entry>.pdb' is assumed."""
        if not path_to_pdb_file:
            path_to_pdb_file = f"{self.data['entry']}.pdb"
        self.pdb_location_relative = path_to_pdb_file
        self.pdb_location_absolute = os.path.abspath(path_to_pdb_file)
        self._build_structure_index()

    def _build_structure_index(self) -> None:
        self.structure_index = (
            self.get_biopandas_pdb_dataframe().df["ATOM"]["residue_number"].unique()
        )
        assert (
            self.structure_index is not None
        ), "Structure index is not built. PDB file may not be loaded correctly."
        self.sequence_position_to_structure_index = {
            self.structure_index[i]: i for i in range(len(self.structure_index))
        }

    def _is_site_aa(self, site: int, aa: AminoAcidLetter = "C") -> bool:
        if "sequence" not in self.data:
            raise ValueError("Sequence entry not found in data")

        sequence = self.data["sequence"]

        return site <= len(sequence) and sequence[site - 1] == aa
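
A minimal end-to-end sketch of how this class is typically used, assuming Protein is re-exported from the top-level procaliper package (as the module summary above suggests) and that UniProt and AlphaFold are reachable; the accession below is an illustrative placeholder.

from procaliper import Protein

protein = Protein.from_uniprot_id("P04637")  # placeholder accession
protein.fetch_pdb()                          # downloads the AlphaFold model and indexes residues

# Per-residue structural features (charge, SASA, cysteine data, titration, pLDDT) as one DataFrame.
df = protein.residue_data_frame()
print(df.head())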

fetch_pdb(save_path=None, url=None)

Fetches the PDB file for the protein (from the AlphaFold database by default).

Parameters:

    save_path (str | None): The path to save the PDB file to. If None, the protein name will be used as the file name. Defaults to None.
    url (str | None): The URL to fetch the PDB file from. Defaults to None, in which case the AlphaFold database is used.

Raises:

    Exception: If the response status code is not 200, meaning we could not fetch the PDB from the database.

Source code in procaliper/_protein.py
def fetch_pdb(self, save_path: str | None = None, url: str | None = None) -> None:
    """Fetches the PDB file for the protein (from the AlphaFold database by default).

    Args:
        save_path (str | None, optional): The path to save the PDB file to.
            If `None`, the protein name will be used as the file name.
            Defaults to `None`.
        url (str | None, optional): The URL to fetch the PDB file from.
            Defaults to `None`, in which case the AlphaFold database is used.

    Raises:
        Exception: If the response status code is not 200, meaning we could
            not fetch the PDB from the database."""
    if not url:
        url = f"https://alphafold.ebi.ac.uk/files/AF-{self.data['entry']}-F1-model_v4.pdb"
    if not save_path:
        save_path = f"{self.data['entry']}.pdb"

    response = requests.get(url)

    if response.status_code != 200:
        raise Exception(f"Failed to fetch PDB: {response.status_code}")

    with open(save_path, "wb+") as f:
        f.write(response.content)

    self.pdb_location_relative = save_path
    self.pdb_location_absolute = os.path.abspath(save_path)
    self._build_structure_index()
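
Usage sketch: with no arguments the file is saved as '<entry>.pdb' in the working directory; both the destination and the source URL can be overridden (the file name below is illustrative).

protein.fetch_pdb(save_path="my_protein.pdb")   # protein built earlier, e.g. via from_uniprot_id
print(protein.pdb_location_absolute)            # set by fetch_pdb, used by the get_* accessors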

from_uniprot_id(uniprot_id, fields=None, from_db='UniProtKB_AC-ID', to_db='UniProtKB-Swiss-Prot') classmethod

Create a new Protein object from a Uniprot ID (fetches with Uniprot API)

Parameters:

    uniprot_id (str): The Uniprot ID of the protein. Required.
    fields (list[str] | None): The fields to retrieve from Uniprot. If None, Protein.UNIPROT_API_DEFAULT_FIELDS is used. Defaults to None.
    from_db (str): The database to retrieve the ID from. Defaults to "UniProtKB_AC-ID".
    to_db (str): The database to map to. Defaults to "UniProtKB-Swiss-Prot".

Raises:

    ValueError: If we cannot retrieve the Uniprot ID.

Returns:

    Protein: A processed and standardized protein object.

Source code in procaliper/_protein.py
@classmethod
def from_uniprot_id(
    cls,
    uniprot_id: str,
    fields: list[str] | None = None,
    from_db: str = "UniProtKB_AC-ID",
    to_db: str = "UniProtKB-Swiss-Prot",
) -> Protein:
    """Create a new Protein object from a Uniprot ID (fetches with Uniprot API)

    Args:
        uniprot_id (str): The Uniprot ID of the protein.
        fields (list[str] | None, optional): The fields to retrieve from
            Uniprot. If `None`, `Protein.UNIPROT_API_DEFAULT_FIELDS` is used.
        from_db (str, optional): The database to retrieve the ID from.
            Defaults to "UniProtKB_AC-ID".
        to_db (str, optional): The database to map to.
            Defaults to "UniProtKB-Swiss-Prot".

    Raises:
        ValueError: If we cannot retrieve the Uniprot ID.

    Returns:
        Protein: A processed and standardized protein object.
    """

    if not fields:
        fields = cls.UNIPROT_API_DEFAULT_FIELDS

    mapper = ProtMapper()

    result, error = mapper.get(
        ids=[uniprot_id], fields=fields, from_db=from_db, to_db=to_db
    )
    if error:
        raise ValueError(f"Uniprot id not retrieved: {error}")
    result.rename(columns={"From": "entry"}, inplace=True)
    if "Length" in result.columns:
        result["Length"] = pd.to_numeric(result["Length"])
    return cls.from_uniprot_row(result.iloc[0].to_dict())
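
For example (a sketch; the accession is a placeholder and the call needs network access to the UniProt ID-mapping service used by ProtMapper):

protein = Protein.from_uniprot_id("P04637")
print(protein.data["entry"])            # the mapped accession
print(protein.data["sequence"][:10])    # first ten residues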

from_uniprot_row(row) classmethod

Create a new Protein object from a row from a Uniprot table

Parameters:

    row (dict[str, Any]): Contains the data from the Uniprot table. Must have "Sequence" or "sequence" as a key. Required.

Raises:

    ValueError: If "Sequence" or "sequence" is not found in the row.

Returns:

    Protein: A processed and standardized protein object.

Source code in procaliper/_protein.py
@classmethod
def from_uniprot_row(cls, row: dict[str, Any]) -> Protein:
    """Create a new Protein object from a row from a Uniprot table

    Args:
        row (dict[str, Any]): Contains the data from the Uniprot table. Must
            have "Sequence" or "sequence" as a key.

    Raises:
        ValueError: If "Sequence" or "sequence" is not found in the row.

    Returns:
        Protein: A processed and standardized protein object.
    """
    p = cls()
    if "Sequence" in row:
        p.data["sequence"] = row["Sequence"]
    elif "sequence" in row:
        p.data["sequence"] = row["sequence"]
    else:
        raise ValueError(f"Sequence not found in row: {row}")
    p.custom_site_data.add_residue_numbers(len(p.data["sequence"]))
    p.site_annotations = SiteAnnotations(p.data["sequence"])
    for key, value in row.items():
        key = p._rectify_label(key)
        if key in cls.UNIPROT_SITE_PATTERNS_RECTIFIED:
            uniprot_description_id = cls.UNIPROT_SITE_PATTERNS_RECTIFIED[key]
            p.site_annotations.extract_annotation(uniprot_description_id, value)
        elif key in cls.UNIPROT_SITE_PATTERNS:
            uniprot_description_id = cls.UNIPROT_SITE_PATTERNS[key]
            p.site_annotations.extract_annotation(uniprot_description_id, value)
        else:
        if value != value:  # NaN check: NaN != NaN, so missing (NaN) values become empty strings
                value = ""
            p.data[key] = value
    return p
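
A sketch of the offline path: rows of a UniProt export loaded with pandas can be passed directly, as long as a "Sequence" (or "sequence") column is present. The file name is hypothetical.

import pandas as pd

table = pd.read_csv("uniprot_export.tsv", sep="\t")  # hypothetical UniProt TSV export
proteins = [Protein.from_uniprot_row(row.to_dict()) for _, row in table.iterrows()]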

get_biopandas_pdb_dataframe()

Get the PDB dataframe for the protein.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If pdb_location_absolute is not set.

Returns:

    PandasPdb: A biopandas dataframe that contains the PDB file information.

Source code in procaliper/_protein.py
def get_biopandas_pdb_dataframe(self) -> PandasPdb:
    """Get the PDB dataframe for the protein.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `pdb_location_absolute` is not set.

    Returns:
        PandasPdb: A biopandas dataframe that contains the PDB file information.
    """
    if not self.pdb_location_absolute:
        raise ValueError("PDB location not set; use `fetch_pdb` first")
    ppdb = PandasPdb()
    return ppdb.read_pdb(self.pdb_location_absolute)
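
For instance, once a PDB file is registered the per-atom table can be inspected directly (a sketch using biopandas' standard df["ATOM"] accessor):

ppdb = protein.get_biopandas_pdb_dataframe()
atoms = ppdb.df["ATOM"]
print(atoms[["residue_number", "residue_name", "b_factor"]].head())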

get_biopython_residues()

Get the biopython residues for the protein.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If pdb_location_absolute is not set.

Returns:

    list[Residue]: A list of biopython residues for the protein.

Source code in procaliper/_protein.py
def get_biopython_residues(self) -> list[Residue]:
    """Get the biopython residues for the protein.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `pdb_location_absolute` is not set.

    Returns:
        list[Residue]: A list of biopython residues for the protein.
    """
    if not self.pdb_location_absolute:
        raise ValueError("PDB location not set; use `fetch_pdb` first")
    p = PDBParser(QUIET=True)
    structure = p.get_structure("", self.pdb_location_absolute)
    reslist = [res for model in structure for chain in model for res in chain]
    return reslist

get_biopython_structure()

Get the biopython structure for the protein.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If pdb_location_absolute is not set.
    ValueError: If the PDB file cannot be parsed.

Returns:

    Structure: A biopython Structure object for the protein.

Source code in procaliper/_protein.py
def get_biopython_structure(self) -> Structure:
    """Get the biopython structure for the protein.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `pdb_location_absolute` is not set.
        ValueError: If the PDB file cannot be parsed.

    Returns:
        Structure: A biopython Structure object for the protein.
    """
    if not self.pdb_location_absolute:
        raise ValueError("PDB location not set; use `fetch_pdb` first")
    p = PDBParser(QUIET=True)
    structure = p.get_structure("", self.pdb_location_absolute)
    if not isinstance(structure, Structure):
        raise ValueError("Unable to parse PDB file.")
    return structure
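
Sketch of dropping down to Biopython for anything not covered by the helper methods:

structure = protein.get_biopython_structure()
chain_ids = [chain.id for model in structure for chain in model]
print(chain_ids)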

get_charge(method='gasteiger')

Fetches precomputed charge data for the protein, or computes it.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Parameters:

    method (str): The method used for the charge calculation. Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to 'gasteiger'. For a full list, reference https://open-babel.readthedocs.io/en/latest/Charges/charges.html

Raises:

    ValueError: If charge_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.charge.ChargeData: A ChargeData object containing the charge values for residues and atoms.

Source code in procaliper/_protein.py
def get_charge(self, method: str = "gasteiger") -> structure.charge.ChargeData:
    """Fetches precomputed charge data for the protein, or computes it.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Args:
        method (str, optional): The method used for the charge calculation.
            Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to
            'gasteiger'. For a full list reference
            https://open-babel.readthedocs.io/en/latest/Charges/charges.html

    Raises:
        ValueError: If `charge_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.charge.ChargeData: A :class:`protein_structure.charge.ChargeData`
            object containing the charge values for residues and atoms.
    """
    if self.charge_data:
        if self.charge_data["charge_method"]:
            if self.charge_data["charge_method"][0] == method:
                return self.charge_data

    if self.pdb_location_absolute:
        self.charge_data = structure.charge.calculate_charge(
            self.pdb_location_absolute,
            method=method,
        )

        self.last_charge_method = method

        return self.charge_data
    else:
        raise ValueError(
            "Charge data for specified method not stored, and PDB location not set; use `fetch_pdb` first"
        )
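
Usage sketch: the cached result is keyed on the charge method, so asking for a different Open Babel method recomputes instead of returning stale values.

gasteiger = protein.get_charge()          # default method, computed then cached
eem = protein.get_charge(method="eem")    # different method triggers a recomputation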

get_confidence()

Fetches precomputed confidence data from the PDB file.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If confidence_data is not already stored and pdb_location_absolute is not set.

Returns:

    list[float]: A list of confidence values for each residue.

Source code in procaliper/_protein.py
def get_confidence(self) -> list[float]:
    """Fetches precomputed confidence data from pdb file.

    Must run `self.fetch_pdb` first or specify an abosulute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `confidence_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        list[float]: A list of confidence values for each residue.
    """
    if self.confidence_data:
        return self.confidence_data

    if self.pdb_location_absolute:
        self.confidence_data = structure.confidence.residue_pLDDT(
            self.pdb_location_absolute,
        )
        return self.confidence_data
    else:
        raise ValueError(
            "Confidence data not stored, and PDB location not set; use `fetch_pdb` first"
        )
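
For AlphaFold models the per-residue pLDDT score is stored in the PDB B-factor field, so this returns one confidence value per residue; a quick sketch of flagging low-confidence positions:

plddt = protein.get_confidence()
low_confidence = [i + 1 for i, score in enumerate(plddt) if score < 50]  # 1-indexed residue numbers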

get_cysteine_data()

Fetches precomputed cysteine site data for the protein, or computes it.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If cysteine_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.cysteine_data.CysteineData: A CysteineData object containing the size values for cysteine sites.

Source code in procaliper/_protein.py
def get_cysteine_data(self) -> structure.cysteine_data.CysteineData:
    """Fetches precomputed size data for the protein, or computes it.

    Must run `self.fetch_pdb` first or specify an abosulute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `cysteine_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.cysteine_data.CysteineData: A :class:`protein_structure.cysteine_data.CysteineData`
            object containing the size values for cysteine sites.
    """
    if self.cysteine_data:
        return self.cysteine_data

    if self.pdb_location_absolute:
        self.cysteine_data = structure.cysteine_data.calculate_cysteine_data(
            self.pdb_location_absolute,
        )
        return self.cysteine_data
    else:
        raise ValueError(
            "Size data not stored, and PDB location not set; use `fetch_pdb` first"
        )

get_sasa()

Fetches precomputed SASA data for the protein, or computes it.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If sasa_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.sasa.SASAData: A SASAData object containing the SASA values for residues and atoms.

Source code in procaliper/_protein.py
def get_sasa(self) -> structure.sasa.SASAData:
    """Fetches precomputed SASA data for the protein, or computes it.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `sasa_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.sasa.SASAData: A :class:`protein_structure.sasa.SASAData`
            object containing the SASA values for residues and atoms.
    """
    if self.sasa_data:
        return self.sasa_data

    if self.pdb_location_absolute:
        self.sasa_data = structure.sasa.calculate_sasa(
            self.pdb_location_absolute,
        )
        return self.sasa_data
    else:
        raise ValueError(
            "SASA data not stored, and PDB location not set; use `fetch_pdb` first"
        )
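
Usage sketch showing the caching behaviour shared by the get_* accessors: the first call computes from the registered PDB file, later calls return the stored SASAData.

sasa = protein.get_sasa()    # computed from the PDB on the first call
sasa = protein.get_sasa()    # returned from self.sasa_data on subsequent calls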

get_titration()

Runs the default titration calculation for the protein.

Equivalent to running self.get_titration_from_propka.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If titration_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.titration.TitrationData: A TitrationData object containing the titration values for residues.

Source code in procaliper/_protein.py
def get_titration(self) -> structure.titration.TitrationData:
    """Runs the default titration calculation for the protein.

    Equivalent to running `self.get_titration_from_propka`.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `titration_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.titration.TitrationData: A
            :class:`protein_structure.titration.TitrationData` object containing
            the titration values for residues.
    """
    return self.get_titration_from_propka()
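
A sketch of choosing a titration backend explicitly. Whichever backend runs first populates self.titration_data, and every titration accessor returns that cached result afterwards, so clear the cache before switching backends.

pka = protein.get_titration()                 # default: propka
# protein.titration_data = None               # drop the cache to switch backends
# pka_ml = protein.get_titration_from_pkai()  # deep-learning alternative (needs pkai installed)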

get_titration_from_pkai()

Fetches precomputed titration data for the protein, or computes it.

Uses protein_structure.titration.calculate_titration_pkai if self.titration_data is not already stored. Requires pkai to be installed. Note that this method is a deep-learning model, not a physics-based calculation.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If titration_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.titration.TitrationData: A TitrationData object containing the titration values for residues.

Source code in procaliper/_protein.py
def get_titration_from_pkai(self) -> structure.titration.TitrationData:
    """Fetches precomputed titration data for the protein, or computes it.

    Uses :func:`protein_structure.titration.calculate_titration_pkai` if
    `self.titration_data` is not already stored. Requires pkai to be
    installed. Note that this method is a deep-learning model, not a
    physics-based calculation.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `titration_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.titration.TitrationData: A
            :class:`protein_structure.titration.TitrationData` object containing
            the titration values for residues."""
    if self.titration_data:
        return self.titration_data

    if self.pdb_location_absolute:
        self.titration_data = structure.titration.calculate_titration_pkai(
            self.pdb_location_absolute,
        )
        return self.titration_data
    else:
        raise ValueError(
            "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
        )

get_titration_from_propka()

Fetches precomputed titration data for the protein, or computes it.

Uses protein_structure.titration.calculate_titration_propka if self.titration_data is not already stored.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If titration_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.titration.TitrationData: A TitrationData object containing the titration values for residues.

Source code in procaliper/_protein.py
def get_titration_from_propka(self) -> structure.titration.TitrationData:
    """Fetches precomputed titration data for the protein, or computes it.

    Uses :func:`protein_structure.titration.calculate_titration_propka` if
    `self.titration_data` is not already stored.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `titration_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.titration.TitrationData: A
            :class:`protein_structure.titration.TitrationData` object containing
            the titration values for residues."""
    if self.titration_data:
        return self.titration_data

    if self.pdb_location_absolute:
        self.titration_data = structure.titration.calculate_titration_propka(
            self.pdb_location_absolute,
        )
        return self.titration_data
    else:
        raise ValueError(
            "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
        )

get_titration_from_pypka()

Fetches precomputed titration data for the protein, or computes it.

Uses protein_structure.titration.calculate_titration_pypka if self.titration_data is not already stored. Requires pypka to be installed, which has dependencies that are not FOSS. Please be sure to verify that you are legally allowed to use pypka.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If titration_data is not already stored and pdb_location_absolute is not set.
    ImportError: If pypka is not installed.

Returns:

    structure.titration.TitrationData: A TitrationData object containing the titration values for residues.

Source code in procaliper/_protein.py
def get_titration_from_pypka(self) -> structure.titration.TitrationData:
    """Fetches precomputed titration data for the protein, or computes it.

    Uses :func:`protein_structure.titration.calculate_titration_pypka` if
    `self.titration_data` is not already stored. Requires pypka to be
    installed, which has dependencies that are not FOSS. Please be sure to
    verify that you are legally allowed to use pypka.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `titration_data` is not already stored and
            `pdb_location_absolute` is not set.
        ImportError: If pypka is not installed.

    Returns:
        structure.titration.TitrationData: A
            :class:`protein_structure.titration.TitrationData` object containing
            the titration values for residues."""

    if self.titration_data:
        return self.titration_data

    if self.pdb_location_absolute:
        self.titration_data = structure.titration.calculate_titration_pypka(
            self.pdb_location_absolute,
        )
        return self.titration_data
    else:
        raise ValueError(
            "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
        )

list_from_uniprot_ids(uniprot_ids, fields=None, from_db='UniProtKB_AC-ID', to_db='UniProtKB-Swiss-Prot') classmethod

Create a list of Protein objects from a list of Uniprot IDs (fetches with Uniprot API)

Parameters:

    uniprot_ids (list[str]): The Uniprot IDs of the proteins. Required.
    fields (list[str] | None): The fields to retrieve from Uniprot. If None, Protein.UNIPROT_API_DEFAULT_FIELDS is used. Defaults to None.
    from_db (str): The database to retrieve the IDs from. Defaults to "UniProtKB_AC-ID".
    to_db (str): The database to map to. Defaults to "UniProtKB-Swiss-Prot".

Raises:

    ValueError: If we cannot retrieve the Uniprot IDs.

Returns:

    list[Protein]: A list of processed and standardized protein objects.

Source code in procaliper/_protein.py
@classmethod
def list_from_uniprot_ids(
    cls,
    uniprot_ids: list[str],
    fields: list[str] | None = None,
    from_db: str = "UniProtKB_AC-ID",
    to_db: str = "UniProtKB-Swiss-Prot",
) -> list[Protein]:
    """Create a list of Protein objects from a list of Uniprot IDs (fetches with Uniprot API)

    Args:
        uniprot_ids (list[str]): The Uniprot IDs of the proteins.
        fields (list[str] | None, optional): The fields to retrieve from
            Uniprot. If `None`, `Protein.UNIPROT_API_DEFAULT_FIELDS` is used.
        from_db (str, optional): The database to retrieve the IDs from.
            Defaults to "UniProtKB_AC-ID".
        to_db (str, optional): The database to map to.
            Defaults to "UniProtKB-Swiss-Prot".

    Raises:
        ValueError: If we cannot retrieve the Uniprot IDs.

    Returns:
        list[Protein]: A list of processed and standardized protein objects.
    """
    if not fields:
        fields = cls.UNIPROT_API_DEFAULT_FIELDS

    mapper = ProtMapper()

    result, error = mapper.get(
        ids=uniprot_ids, fields=fields, from_db=from_db, to_db=to_db
    )
    if error:
        raise ValueError(f"Uniprot id not retrieved: {error}")
    result.rename(columns={"From": "entry"}, inplace=True)

    if "Length" in result.columns:
        result["Length"] = pd.to_numeric(result["Length"])
    return [cls.from_uniprot_row(row.to_dict()) for _, row in result.iterrows()]
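
Batch sketch (placeholder accessions; a single ID-mapping request covers the whole list):

proteins = Protein.list_from_uniprot_ids(["P04637", "P68871"])
for protein in proteins:
    print(protein.data["entry"], len(protein.data["sequence"]))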

register_local_pdb(path_to_pdb_file=None)

Sets the PDB file for the protein object using a local PDB file.

Parameters:

    path_to_pdb_file (str | None): Path to the local PDB file. Defaults to None, in which case a file named '<entry>.pdb' is assumed.
Source code in procaliper/_protein.py
def register_local_pdb(self, path_to_pdb_file: str | None = None) -> None:
    """Sets pdb file for protein object using local pdb file.

    Args:
        path_to_pdb_file (str | None, optional): Path to local PDB file.
            Defaults to `None`, in which case a file named '<entry>.pdb' is assumed."""
    if not path_to_pdb_file:
        path_to_pdb_file = f"{self.data['entry']}.pdb"
    self.pdb_location_relative = path_to_pdb_file
    self.pdb_location_absolute = os.path.abspath(path_to_pdb_file)
    self._build_structure_index()
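
Sketch for working offline with a structure file that is already on disk (the file name is hypothetical):

protein.register_local_pdb("AF-P04637-F1-model_v4.pdb")  # hypothetical local file
print(protein.structure_index[:5])                       # residue numbers found in the PDB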

unravel_sites(selected_aas=None, selected_keys=None)

Split the protein into individual sites, recording values for each.

Parameters:

    selected_aas (None | set[AminoAcidLetter]): A set of amino acid letters to include in the output. If None (default), all amino acids will be included.
    selected_keys (None | set[str]): A set of keys belonging to this Protein object's data dictionary to include in the output. If None (default), all keys are used.

Returns:

    dict[str, list[Any]]: A dictionary mapping keys to lists of values. Each list is a parallel array of the same length as the protein sequence (after filtering out non-selected amino acids).

Source code in procaliper/_protein.py
def unravel_sites(
    self,
    selected_aas: None | set[AminoAcidLetter] = None,
    selected_keys: None | set[str] = None,
) -> dict[str, list[Any]]:
    """Split the protein into individual sites, recording values for each.

    Args:
        selected_aas: A set of amino acid letters to include in the output.
            If `None` (default), all amino acids will be included.
        selected_keys: A set of keys belonging to this `Protein` object's
            `data` dictionary to include in the output. If `None` (default),
            all keys are used.

    Returns:
        dict[str, list[Any]]: A dictionary mapping keys to lists of values.
            Each list is a parallel array of the same length as the protein
            sequence (after filtering out non-selected amino acids)."""
    tbl = self.site_annotations.table() | self.custom_site_data.table()
    if selected_keys is None:
        selected_keys = (set(tbl.keys()) | set(self.data.keys())) - {"sequence"}
    tbl_keys = selected_keys & set(tbl.keys())
    data_keys = selected_keys & set(self.data.keys())
    assert tbl_keys.isdisjoint(data_keys)
    res: dict[str, list[Any]] = {k: [] for k in selected_keys}
    for index, site in enumerate(self.data["sequence"]):
        if selected_aas and site not in selected_aas:
            continue
        for k in tbl_keys:
            res[k].append(tbl[k][index])
        for k in data_keys:
            res[k].append(self.data[k])  # will be the same for all sites

    return res
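
For example, restricting to cysteines and a couple of keys yields parallel per-site lists; this sketch assumes custom columns added via add_custom_site_data_column appear in the unraveled table under the same key.

n = len(protein.data["sequence"])
protein.add_custom_site_data_column("is_flagged", [False] * n)  # hypothetical per-residue column
sites = protein.unravel_sites(selected_aas={"C"}, selected_keys={"entry", "is_flagged"})
# sites["entry"] repeats the protein-level value once per cysteine;
# sites["is_flagged"] holds the per-residue values at cysteine positions.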

network

contact_network(protein, max_dist_angstroms=10.0)

Constructs a contact network from a protein.

Parameters:

    protein (Protein): Protein object. Required.
    max_dist_angstroms (float): Maximum distance between residues to be considered a contact. Defaults to 10.0.

Returns:

    nx.Graph: Contact network.

Source code in procaliper/network.py
def contact_network(protein: Protein, max_dist_angstroms: float = 10.0) -> nx.Graph:
    """Constructs a contact network from a protein.

    Args:
        protein (Protein): Protein object.
        max_dist_angstroms (float, optional): Maximum distance between residues to be considered a contact. Defaults to 10.0.

    Returns:
        nx.Graph: Contact network.
    """
    return nx.from_numpy_array(
        psd.contact_map(protein.get_biopython_structure(), max_dist_angstroms)
    )
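
Example (a minimal sketch; it assumes `protein` is a `Protein` with a structure already registered, e.g. via `register_local_pdb`, and that the module is importable as `procaliper.network`):

import procaliper.network as network

g = network.contact_network(protein, max_dist_angstroms=8.0)
print(g.number_of_nodes(), g.number_of_edges())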

distance_network(protein, max_dist_angstroms=20)

Constructs a distance network from a protein.

Parameters:

    protein (Protein): Protein object.
    max_dist_angstroms (float, optional): Maximum distance between residues. Values
        greater than this will be set to np.inf. Defaults to 20.

Returns:

    nx.Graph: Distance network.

Source code in procaliper/network.py
def distance_network(protein: Protein, max_dist_angstroms: float = 20) -> nx.Graph:
    """Constructs a distance network from a protein.

    Args:
        protein (Protein): Protein object.
        max_dist_angstroms (float, optional): Maximum distance between residues.
            Values greater than this will be set to np.inf. Defaults to 20.

    Returns:
        nx.Graph: Distance network.
    """
    g = nx.from_numpy_array(
        psd.distance_matrix(protein.get_biopython_structure(), max_dist_angstroms)
    )
    for u, v, d in list(g.edges(data=True)):
        if d["weight"] == np.inf:
            g.remove_edge(u, v)
        else:
            d["proximity"] = 1 / (d["weight"] + 1)
            d["d2"] = d["weight"] ** 2
    return g
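
Example (a minimal sketch under the same assumptions as above: `protein` already has a structure registered):

import procaliper.network as network

g = network.distance_network(protein, max_dist_angstroms=15)
# Each surviving edge carries "weight" (Angstroms), "proximity" = 1 / (weight + 1),
# and "d2" = weight ** 2; pairs beyond the cutoff have no edge.
u, v, attrs = next(iter(g.edges(data=True)))
print(attrs["weight"], attrs["proximity"], attrs["d2"])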

euclidean_backbone(g)

Returns the Euclidean backbone of a distance network.

The Euclidean backbone of a weighted graph g is the smallest subgraph of g that contains all shortest paths where a path length is determined by the square root of the sum of the squared edge weights.

This is useful for sparsifying a distance network without disconnecting it.

Parameters:

    g (nx.Graph): Distance network. Edges must have an attribute "d2" representing
        the squared edge weight. This is computed by distance_network and
        regulatory_distance_network automatically.

Returns:

    nx.Graph: Euclidean backbone.

Source code in procaliper/network.py
def euclidean_backbone(g: nx.Graph) -> nx.Graph:
    """Returns the Euclidean backbone of a distance network.

    The Euclidean backbone of a weighted graph g is the smallest subgraph of g that contains
    all shortest paths where a path length is determined by the square root of the sum of the
    squared edge weights.

    This is useful for sparsifying a distance network without disconnecting it.

    Args:
        g (nx.Graph): Distance network. Edges must have an attribute "d2" representing the
            squared edge weight. This is computed by `distance_network` and
            `regulatory_distance_network` automatically.

    Returns:
        nx.Graph: Euclidean backbone.
    """
    return dc.backbone(g, weight="d2", kind="metric")
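
Example (a minimal sketch; `protein` is assumed to have a registered structure so that `distance_network` can supply the "d2" edge attribute):

import procaliper.network as network

g = network.distance_network(protein)
backbone = network.euclidean_backbone(g)  # sparser, but still connected
print(g.number_of_edges(), "->", backbone.number_of_edges())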

regulatory_distance_network(protein)

Constructs a regulatory region distance network from a protein.

Distances are computed between PTM sites, annotated regions, binding sites, and active sites.

Node labels will be 1-indexed and inclusive (e.g., "K5..C7" refers to residues 5, 6, and 7). The letters preceding the indices are the amino acids at the first and last positions of the region.

Parameters:

    protein (Protein): Protein object.

Returns:

    nx.Graph: Distance network.

Source code in procaliper/network.py
def regulatory_distance_network(protein: Protein) -> nx.Graph:
    """Constructs a regulatory region distance network from a protein.

    Distances are computed between PTM sites, annotated regions, binding sites, and active sites.

    Node labels will be 1-indexed and inclusive (e.g., `"K5..C7"` refers to residues 5, 6, and 7).
    The letter in front of the index refers to the first and last amino acid in the region.

    Args:
        protein (Protein): Protein object.

    Returns:
        nx.Graph: Distance network.
    """
    if protein.sequence_position_to_structure_index is None:
        raise ValueError(
            "Protein structure not loaded; use `fetch_pdb`  or `register_local_pdb` first"
        )

    ptms = {f"p_{i}": [i] for i, x in enumerate(protein.site_annotations.ptm) if x}
    binding = {
        f"b_{i}": [i] for i, x in enumerate(protein.site_annotations.binding) if x
    }
    active = {f"a_{i}": [i] for i, x in enumerate(protein.site_annotations.active) if x}
    regions = protein.site_annotations.regions
    domains = protein.site_annotations.domains

    all_regs = {**ptms, **binding, **active, **regions, **domains}

    # residues, excluding heteroatoms and water
    protein_residues = [
        res for res in protein.get_biopython_residues() if res.get_id()[0] == " "
    ]

    all_regs_residues = {}
    for k, v in all_regs.items():
        structure_matched = []
        for i in v:
            if i in protein.sequence_position_to_structure_index:
                res_ind = protein.sequence_position_to_structure_index[i]
                structure_matched.append(protein_residues[res_ind])
        if structure_matched:
            all_regs_residues[k] = structure_matched

    g = nx.Graph()
    for k, v in all_regs.items():
        g.add_node(
            k,
            label=_region_label(v, protein.data["sequence"]),
            region_type=_region_type(k),
            residues=v,
        )

    for k1, v1 in all_regs_residues.items():
        for k2, v2 in all_regs_residues.items():
            if k1 == k2:
                continue
            weight = psd.region_distance(v1, v2)
            g.add_edge(k1, k2, weight=weight, d2=weight**2, proximity=1 / (weight + 1))

    return g
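
Example (a minimal sketch; it assumes `protein` has both UniProt site annotations and a registered structure, since the function raises `ValueError` otherwise):

import procaliper.network as network

reg = network.regulatory_distance_network(protein)
for node, attrs in reg.nodes(data=True):
    print(node, attrs["label"], attrs["region_type"])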

protein_structure

calculate_charge(pdb_filename, method='gasteiger')

Computes the charge of residue sites in a PDB file.

By default, the method used is 'gasteiger', but this is configurable in hyperparameters.py.

Parameters:

    pdb_filename (str): The path to the PDB file.
    method (str, optional): The method used for the charge calculation. Examples
        include 'qtpie', 'eem', 'gasteiger'. Defaults to 'gasteiger'. For a full
        list, see https://open-babel.readthedocs.io/en/latest/Charges/charges.html

Raises:

    ValueError: If the charge method is not found.

Returns:

    ChargeData: A data class holding charge data computed from a PDB file.

Source code in procaliper/protein_structure/charge.py
def calculate_charge(pdb_filename: str, method: str = "gasteiger") -> ChargeData:
    """Computes the charge of residue sites in a PDB file.

    By default, the method used is 'gasteiger', but this is configurable in
    `hyperparameters.py`.

    Args:
        pdb_filename (str): The path to the PDB file.
        method (str, optional): The method used for the charge calculation.
            Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to
            'gasteiger'. For a full list, see
            https://open-babel.readthedocs.io/en/latest/Charges/charges.html

    Raises:
        ValueError: If the charge method is not found.

    Returns:
        ChargeData: A data class holding charge data computed from a PDB file.
    """
    pbmol = next(pybel.readfile("pdb", pdb_filename))
    mol = pbmol.OBMol

    # Applies the model and computes charges.
    ob_charge_model = ob.OBChargeModel.FindType(method)

    if not ob_charge_model:
        raise ValueError("Charge method not found. Please check hyperparameters.py")
    ob_charge_model.ComputeCharges(mol)

    charges = cast(list[float], ob_charge_model.GetPartialCharges())

    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    # Set up dict
    res = ChargeData(
        {
            "charge": [],
            "charge_method": [],
        }
    )

    for _, residue in sorted(ppdb.df["ATOM"].groupby("residue_number")):
        res["charge"].append([charges[x - 1] for x in sorted(residue["atom_number"])])
        res["charge_method"].append(method)

    return res
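
Example (a minimal sketch; the PDB path is hypothetical and the function is imported from its source module):

from procaliper.protein_structure.charge import calculate_charge

charge_data = calculate_charge("example.pdb", method="gasteiger")  # hypothetical path
print(charge_data["charge"][0])         # per-atom partial charges of residue 1
print(charge_data["charge_method"][0])  # 'gasteiger'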

calculate_cysteine_data(pdb_filename)

Calculates spatial data for a protein from a PDB file.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    CysteineData: A data class holding cysteine spatial data computed from a PDB file.

Source code in procaliper/protein_structure/cysteine_data.py
def calculate_cysteine_data(pdb_filename: str) -> CysteineData:
    """Calculates spatial data for a protein from a PDB file.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        CysteineData: A data class holding cysteine spatial data computed from a PDB file.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    res = CysteineData(
        {
            "cys_ratio": [],
            "min_dist_to_closest_sulfur": [],
            "sulfur_closeness_rating_scaled": [],
        }
    )

    total_residue = cast(int, max(ppdb.df["ATOM"]["residue_number"]))

    cys_positions: list[tuple[float, float, float]] = []
    for x in range(len(ppdb.df["ATOM"])):
        if ppdb.df["ATOM"]["residue_name"][x] == "CYS":
            if ppdb.df["ATOM"]["atom_name"][x] == "SG":
                cys_positions.append(
                    (
                        ppdb.df["ATOM"]["x_coord"][x],
                        ppdb.df["ATOM"]["y_coord"][x],
                        ppdb.df["ATOM"]["z_coord"][x],
                    )
                )
    total_cys_sites = len(cys_positions)

    cys_index = 0

    for _, grp in sorted(ppdb.df["ATOM"].groupby("residue_number")):
        if grp["residue_name"].max() == "CYS":
            sg_closeness_rating_scaled = 0
            x_p, y_p, z_p = cys_positions[cys_index]
            min_distance = 1000  # Initialize with a large number

            points_excluding_index = (
                cys_positions[:cys_index] + cys_positions[cys_index + 1 :]
            )
            for point in points_excluding_index:
                x_q, y_q, z_q = point
                distance = np.sqrt(
                    (x_p - x_q) ** 2 + (y_p - y_q) ** 2 + (z_p - z_q) ** 2
                )
                if distance < min_distance:
                    min_distance = distance
                sg_closeness_rating_scaled += 10 / ((distance + 1) ** 2)

            cys_index += 1

            res["cys_ratio"].append(float(total_cys_sites) / float(total_residue))
            res["min_dist_to_closest_sulfur"].append(min_distance)
            res["sulfur_closeness_rating_scaled"].append(sg_closeness_rating_scaled)
        else:
            res["cys_ratio"].append(None)
            res["min_dist_to_closest_sulfur"].append(None)
            res["sulfur_closeness_rating_scaled"].append(None)

    return res
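
Example (a minimal sketch; the PDB path is hypothetical):

from procaliper.protein_structure.cysteine_data import calculate_cysteine_data

cys = calculate_cysteine_data("example.pdb")  # hypothetical path
# Non-CYS residues carry None in every field, so this lists the CYS residue numbers.
cys_sites = [i + 1 for i, v in enumerate(cys["min_dist_to_closest_sulfur"]) if v is not None]
print(cys_sites)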

calculate_sasa(pdb_filename)

Compute the SASA values for all residue sites in a PDB file.

Uses the ShrakeRupley algorithm implemented in Bio.PDB.SASA.ShrakeRupley with a probe radius of 1.40 and 100 points.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    SASAData: A data class holding SASA data computed from a PDB file.

Source code in procaliper/protein_structure/sasa.py
def calculate_sasa(pdb_filename: str) -> SASAData:
    """Compute the SASA values for all CYS sites in a PDB file.

    Uses the ShrakeRupley algorithm implemented in `Bio.PDB.SASA.ShrakeRupley`
    with a probe radius of 1.40 and 100 points.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        SASAData: A data class holding SASA data computed from a PDB file."""
    p = PDBParser(QUIET=True)
    struct = p.get_structure("", pdb_filename)

    sr = ShrakeRupley(probe_radius=PROBE_RADIUS, n_points=N_POINTS, radii_dict=None)

    # Calc sasa values from Residues (from atoms)
    sr.compute(struct, level="R")

    # Set up dict
    res = SASAData(
        {
            "all_sasa_value": [],
            "atom_sasa_values": [],
        }
    )

    assert isinstance(struct, Structure)
    assert struct is not None

    # Fill dict with per-residue SASA values (heteroatoms are skipped below)
    for x in struct.child_list:
        for y in x.child_list:
            for z in y.child_list:
                if z.get_id()[0] != " ":  # skips heteroatoms
                    continue
                assert hasattr(z, "sasa")
                res["all_sasa_value"].append(z.sasa)
                res["atom_sasa_values"].append([zx.sasa for zx in z.child_list])  # type: ignore

    return res
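
Example (a minimal sketch; the PDB path is hypothetical):

from procaliper.protein_structure.sasa import calculate_sasa

sasa = calculate_sasa("example.pdb")     # hypothetical path
print(sasa["all_sasa_value"][:5])        # residue-level SASA values
print(len(sasa["atom_sasa_values"][0]))  # number of atoms in the first residue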

residue_pLDDT(pdb_filename)

Extracts the pLDDT confidence for each residue in a PDB file.

We assume that the pLDDT confidences are in the B-factor entries of the PDB file. If this information is provided at the atom level, the maximum value across the residue is used.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    list[float]: The pLDDT confidence for each residue in the PDB file.

Source code in procaliper/protein_structure/confidence.py
def residue_pLDDT(pdb_filename: str) -> list[float]:
    """Extracts the pLDDT confidence for each residue in a PDB file.

    We assume that the pLDDT confidences are in the B-factor entries of the PDB
    file. If this information is provided at the atom level, the maximum value
    across the residue is used.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        list[float]: The pLDDT confidence for each residue in the PDB file.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    vals = []
    for _, res in ppdb.df["ATOM"].groupby("residue_number"):
        vals.append(res["b_factor"].max())

    return vals
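
Example (a minimal sketch; the PDB path is hypothetical and should point to a model with pLDDT stored in the B-factor column):

from procaliper.protein_structure.confidence import residue_pLDDT

plddt = residue_pLDDT("model.pdb")  # hypothetical path
low_confidence = [i + 1 for i, v in enumerate(plddt) if v < 50]
print(low_confidence)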

charge

ChargeData

Bases: TypedDict

A data class holding charge data computed from a PDB file.

Array index corresponds to residue number in the PDB. Note that Python arrays are 0-indexed and PDB files are 1-indexed, so Python index 0 corresponds to residue 1. This assumes a complete PDB. Otherwise, an object of the procaliper.Protein class that constructs this will store a variable called structure_index that maps these indices to the sequence position.

Attributes:

    charge (list[list[float]]): The charge value for atoms in the residue, ordered
        from C-terminus to N-terminus according to standard PDB order. For example,
        in CYS, the last atom is always the SG sulfur.
    charge_method (list[str]): The method used for the charge calculation.

Source code in procaliper/protein_structure/charge.py
class ChargeData(TypedDict):
    """
    A data class holding charge data computed from a PDB file.

    Array index corresponds to residue number in the PDB. Note that Python
    arrays are 0-indexed and PDB files are 1-indexed, so Python index 0
    corresponds to residue 1. This assumes a complete PDB. Otherwise,
    an object of the `procaliper.Protein` class that constructs this will
    store a variable called `structure_index` that maps these indices to the
    sequence position.

    Attributes:
        charge (list[list[float]]): The charge value for atoms in the residue,
            ordered from C-terminus to N-terminus according to standard PDB order.
            For example, in CYS, the last atom is always the SG sulfur.
        charge_method (list[str]): The method used for the charge calculation.
    """

    charge: list[list[float]]
    charge_method: list[str]

calculate_charge(pdb_filename, method='gasteiger')

Computes the charge of residue sites in a PDB file.

By default, the method used is 'gasteiger', but this is configurable in hyperparameters.py.

Parameters:

    pdb_filename (str): The path to the PDB file.
    method (str, optional): The method used for the charge calculation. Examples
        include 'qtpie', 'eem', 'gasteiger'. Defaults to 'gasteiger'. For a full
        list, see https://open-babel.readthedocs.io/en/latest/Charges/charges.html

Raises:

    ValueError: If the charge method is not found.

Returns:

    ChargeData: A data class holding charge data computed from a PDB file.

Source code in procaliper/protein_structure/charge.py
def calculate_charge(pdb_filename: str, method: str = "gasteiger") -> ChargeData:
    """Computes the charge of residue sites in a PDB file.

    By default, the method used is 'gasteiger', but this is configurable in
    `hyperparameters.py`.

    Args:
        pdb_filename (str): The path to the PDB file.
        method (str, optional): The method used for the charge calculation.
            Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to
            'gasteiger'. For a full list, see
            https://open-babel.readthedocs.io/en/latest/Charges/charges.html

    Raises:
        ValueError: If the charge method is not found.

    Returns:
        ChargeData: A data class holding charge data computed from a PDB file.
    """
    pbmol = next(pybel.readfile("pdb", pdb_filename))
    mol = pbmol.OBMol

    # Applies the model and computes charges.
    ob_charge_model = ob.OBChargeModel.FindType(method)

    if not ob_charge_model:
        raise ValueError("Charge method not found. Please check hyperparameters.py")
    ob_charge_model.ComputeCharges(mol)

    charges = cast(list[float], ob_charge_model.GetPartialCharges())

    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    # Set up dict
    res = ChargeData(
        {
            "charge": [],
            "charge_method": [],
        }
    )

    for _, residue in sorted(ppdb.df["ATOM"].groupby("residue_number")):
        res["charge"].append([charges[x - 1] for x in sorted(residue["atom_number"])])
        res["charge_method"].append(method)

    return res

confidence

residue_pLDDT(pdb_filename)

Extracts the pLDDT confidence for each residue in a PDB file.

We assume that the pLDDT confidences are in the B-factor entries of the PDB file. If this information is provided at the atom level, the maximum value across the residue is used.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    list[float]: The pLDDT confidence for each residue in the PDB file.

Source code in procaliper/protein_structure/confidence.py
def residue_pLDDT(pdb_filename: str) -> list[float]:
    """Extracts the pLDDT confidence for each residue in a PDB file.

    We assume that the pLDDT confidences are in the B-factor entries of the PDB
    file. If this information is provided at the atom level, the maximum value
    across the residue is used.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        list[float]: The pLDDT confidence for each residue in the PDB file.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    vals = []
    for _, res in ppdb.df["ATOM"].groupby("residue_number"):
        vals.append(res["b_factor"].max())

    return vals

cysteine_data

CysteineData

Bases: TypedDict

Data class holding cysteine spatial data computed from a PDB file.

Non-CYS sites are assigned None values.

Array index corresponds to residue number in the PDB. Note that Python arrays are 0-indexed and PDB files are 1-indexed, so Python index 0 corresponds to residue 1. This assumes a complete PDB. Otherwise, an object of the procaliper.Protein class that constructs this will store a variable called structure_index that maps these indices to the sequence position.

Attributes:

    cys_ratio (list[float | None]): The ratio of CYS sites to total sites.
    min_dist_to_closest_sulfur (list[float | None]): The minimum distance to the
        closest sulfur for each CYS site.
    sulfur_closeness_rating_scaled (list[float | None]): The scaled sulfur closeness
        rating for the CYS sites.

Source code in procaliper/protein_structure/cysteine_data.py
class CysteineData(TypedDict):
    """Data class for holding size data from computed from a PDB file.

    Non-CYS sites are assigned `None` values.

    Array index corresponds to residue number in the PDB. Note that Python
    arrays are 0-indexed and PDB files are 1-indexed, so Python index 0
    corresponds to residue 1. This assumes a complete PDB. Otherwise,
    an object of the `procaliper.Protein` class that constructs this will
    store a variable called `structure_index` that maps these indices to the
    sequence position.

    Attributes:
        cys_ratio (list[float | None]): The ratio of CYS sites to total sites.
        min_dist_to_closest_sulfur (list[float | None]): The minimum distance to the closest sulfur for each CYS site.
        sulfur_closeness_rating_scaled (list[float | None]): The sulfur closeness rating scaled for the CYS sites."""

    cys_ratio: list[float | None]
    min_dist_to_closest_sulfur: list[float | None]
    sulfur_closeness_rating_scaled: list[float | None]

calculate_cysteine_data(pdb_filename)

Calculates spatial data for a protein from a PDB file.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    CysteineData: A data class holding cysteine spatial data computed from a PDB file.

Source code in procaliper/protein_structure/cysteine_data.py
def calculate_cysteine_data(pdb_filename: str) -> CysteineData:
    """Calculates spatial data for a protein from a PDB file.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        CysteineData: A data class holding cysteine spatial data computed from a PDB file.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    res = CysteineData(
        {
            "cys_ratio": [],
            "min_dist_to_closest_sulfur": [],
            "sulfur_closeness_rating_scaled": [],
        }
    )

    total_residue = cast(int, max(ppdb.df["ATOM"]["residue_number"]))

    cys_positions: list[tuple[float, float, float]] = []
    for x in range(len(ppdb.df["ATOM"])):
        if ppdb.df["ATOM"]["residue_name"][x] == "CYS":
            if ppdb.df["ATOM"]["atom_name"][x] == "SG":
                cys_positions.append(
                    (
                        ppdb.df["ATOM"]["x_coord"][x],
                        ppdb.df["ATOM"]["y_coord"][x],
                        ppdb.df["ATOM"]["z_coord"][x],
                    )
                )
    total_cys_sites = len(cys_positions)

    cys_index = 0

    for _, grp in sorted(ppdb.df["ATOM"].groupby("residue_number")):
        if grp["residue_name"].max() == "CYS":
            sg_closeness_rating_scaled = 0
            x_p, y_p, z_p = cys_positions[cys_index]
            min_distance = 1000  # Initialize with a large number

            points_excluding_index = (
                cys_positions[:cys_index] + cys_positions[cys_index + 1 :]
            )
            for point in points_excluding_index:
                x_q, y_q, z_q = point
                distance = np.sqrt(
                    (x_p - x_q) ** 2 + (y_p - y_q) ** 2 + (z_p - z_q) ** 2
                )
                if distance < min_distance:
                    min_distance = distance
                sg_closeness_rating_scaled += 10 / ((distance + 1) ** 2)

            cys_index += 1

            res["cys_ratio"].append(float(total_cys_sites) / float(total_residue))
            res["min_dist_to_closest_sulfur"].append(min_distance)
            res["sulfur_closeness_rating_scaled"].append(sg_closeness_rating_scaled)
        else:
            res["cys_ratio"].append(None)
            res["min_dist_to_closest_sulfur"].append(None)
            res["sulfur_closeness_rating_scaled"].append(None)

    return res

distance

contact_map(structure, max_dist_angsrtom=10)

A contact map for a protein structure.

Parameters:

    structure (Structure): protein structure.
    max_dist_angsrtom (float, optional): Largest distance to consider a contact,
        in Angstroms. Defaults to 10.

Returns:

    npt.NDArray[np.int8]: contact map with shape nxn where n is the number of
        residues in the structure.

Source code in procaliper/protein_structure/distance.py
def contact_map(
    structure: Structure, max_dist_angsrtom: float = 10
) -> npt.NDArray[np.int8]:
    """A contact map for a protein structure.

    Args:
        structure (Structure): protein structure.
        max_dist_angsrtom (float, optional): Largest distance to consider a contact,
            in Angstroms. Defaults to 10.

    Returns:
        npt.NDArray[np.int8]: contact map with shape nxn where n is the
            number of residues in the structure.
    """
    residues = [res for model in structure for chain in model for res in chain]
    residues = list(enumerate(residues))
    adj = np.zeros((len(residues), len(residues)), dtype=np.int8)

    # a residue has zero distance to itself
    for i in range(len(residues)):
        adj[i, i] = np.int8(1)

    for (row, r1), (col, r2) in combinations(residues, 2):
        dist = residue_distance(r1, r2)
        if dist <= max_dist_angsrtom:
            adj[row, col] = np.int8(1)
            adj[col, row] = np.int8(1)
    return adj
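
Example (a minimal sketch; the PDB path is hypothetical and the structure is parsed with Biopython):

from Bio.PDB import PDBParser
from procaliper.protein_structure.distance import contact_map

structure = PDBParser(QUIET=True).get_structure("", "example.pdb")  # hypothetical path
cmap = contact_map(structure, max_dist_angsrtom=8.0)
print(cmap.shape, cmap.dtype)  # (n, n) np.int8 matrix; 1 marks CA-CA distances within 8 Angstroms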

distance_matrix(structure, thresh=np.inf)

Compute a distance matrix for a protein structure.

Parameters:

    structure (Structure): protein structure.
    thresh (float, optional): threshold for distance. Distances greater than this
        will be set to np.inf. Defaults to np.inf.

Returns:

    npt.NDArray[np.float64]: distance matrix with shape nxn where n is the number
        of residues in the structure.

Source code in procaliper/protein_structure/distance.py
def distance_matrix(
    structure: Structure, thresh: float = np.inf
) -> npt.NDArray[np.float64]:
    """Compute a distance matrix for a protein structure.

    Args:
        structure (Structure): protein structure.
        thresh (float, optional): threshold for distance. Defaults to np.inf.
            Distances greater than this will be set to np.inf.

    Returns:
        npt.NDArray[np.float64]: distance matrix with shape nxn where n is the
            number of residues in the structure.
    """
    residues = [res for model in structure for chain in model for res in chain]
    residues = list(enumerate(residues))
    adj = np.ones((len(residues), len(residues))) * np.inf

    # a residue has zero distance to itself
    for i in range(len(residues)):
        adj[i, i] = 0

    for (row, r1), (col, r2) in combinations(residues, 2):
        dist = residue_distance(r1, r2)
        if dist <= thresh:
            adj[row, col] = dist
            adj[col, row] = adj[row, col]
    return adj
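
Example (a minimal sketch; the PDB path is hypothetical):

import numpy as np
from Bio.PDB import PDBParser
from procaliper.protein_structure.distance import distance_matrix

structure = PDBParser(QUIET=True).get_structure("", "example.pdb")  # hypothetical path
dmat = distance_matrix(structure, thresh=20.0)
print(np.isfinite(dmat).mean())  # fraction of residue pairs within 20 Angstroms (diagonal included)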

proximity_matrix(structure, thresh=0)

Compute a proximity matrix for a protein structure.

Parameters:

    structure (Structure): protein structure.
    thresh (float, optional): threshold for proximity. Proximities less than this
        will be set to 0. Defaults to 0.

Returns:

    npt.NDArray[np.float64]: proximity matrix with shape nxn where n is the number
        of residues in the structure.

Source code in procaliper/protein_structure/distance.py
def proximity_matrix(
    structure: Structure, thresh: float = 0
) -> npt.NDArray[np.float64]:
    """Compute a proximity matrix for a protein structure.

    Args:
        structure (Structure): protein structure.
        thresh (float, optional): threshold for proximity. Defaults to 0. Proximity
            less than this will be set to 0.

    Returns:
        npt.NDArray[np.float64]: proximity matrix with shape nxn where n is the
            number of residues in the structure.
    """
    residues = [res for model in structure for chain in model for res in chain]
    residues = list(enumerate(residues))
    adj = np.zeros((len(residues), len(residues)))

    # a residue has proximity 1 to itself
    for i in range(len(residues)):
        adj[i, i] = 1

    for (row, r1), (col, r2) in combinations(residues, 2):
        prox = 1 / (residue_distance(r1, r2) + 1)
        if prox >= thresh:
            adj[row, col] = prox
            adj[col, row] = adj[row, col]
    return adj
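
Example (a minimal sketch; the PDB path is hypothetical):

from Bio.PDB import PDBParser
from procaliper.protein_structure.distance import proximity_matrix

structure = PDBParser(QUIET=True).get_structure("", "example.pdb")  # hypothetical path
prox = proximity_matrix(structure, thresh=0.05)  # drop proximities below 0.05 (distances above 19 Angstroms)
print(prox.diagonal()[:3])  # each residue has proximity 1.0 to itself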

region_distance(region_1, region_2)

Compute the distance between two regions of a protein, in Angstroms.

Parameters:

    region_1 (Iterable[Residue]): first region.
    region_2 (Iterable[Residue]): second region.

Returns:

    np.floating[Any]: minimum distance between the two regions.

Source code in procaliper/protein_structure/distance.py
def region_distance(
    region_1: Iterable[Residue], region_2: Iterable[Residue]
) -> np.floating[Any]:
    """Compute the distance between two regions of a protein, in Angstroms.

    Args:
        region_1 (Iterable[Residue]): first region.
        region_2 (Iterable[Residue]): second region.

    Returns:
        np.floating[Any]: minimum distance between the two regions.
    """
    return min(residue_distance(r1, r2) for r1, r2 in product(region_1, region_2))
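
Example (a minimal sketch; it assumes `protein` is a `Protein` with a structure already loaded so that `get_biopython_residues` returns Biopython residues):

from procaliper.protein_structure.distance import region_distance

residues = protein.get_biopython_residues()
d = region_distance(residues[0:3], residues[10:13])  # minimum CA-CA distance between the two stretches
print(float(d))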

region_distance_matrix(regions)

Compute a distance matrix between regions of a protein.

Parameters:

    regions (Sequence[Iterable[Residue]]): sequence of regions; each region is an
        iterable of residues.

Returns:

    npt.NDArray[np.float64]: distance matrix with shape nxn where n is the number
        of regions.

Source code in procaliper/protein_structure/distance.py
def region_distance_matrix(
    regions: Sequence[Iterable[Residue]],
) -> npt.NDArray[np.float64]:
    """Compute a distance matrix between regions of a protein.

    Args:
        regions (Sequence[Iterable[Residue]]): sequence of regions; each region is an iterable of residues.

    Returns:
        npt.NDArray[np.float64]: distance matrix with shape nxn where n is the
            number of regions.
    """
    return np.array([[region_distance(r1, r2) for r2 in regions] for r1 in regions])

region_proximity_matrix(regions)

Compute a proximity matrix between regions of a protein.

Parameters:

    regions (Sequence[Iterable[Residue]]): sequence of regions; each region is an
        iterable of residues.

Returns:

    npt.NDArray[np.float64]: proximity matrix with shape nxn where n is the number
        of regions.

Source code in procaliper/protein_structure/distance.py
def region_proximity_matrix(
    regions: Sequence[Iterable[Residue]],
) -> npt.NDArray[np.float64]:
    """Compute a proxmity matrix between regions of a protein.

    Args:
        regions (Sequence[Iterable[Residue]]): sequence of regions; each region is an iterable of residues.

    Returns:
        npt.NDArray[np.float64]: proximity matrix with shape nxn where n is the
            number of regions.
    """
    return 1 / (
        1 + np.array([[region_distance(r1, r2) for r2 in regions] for r1 in regions])
    )

residue_distance(r1, r2)

Compute the distance between two residues, in Angstroms.

Parameters:

    r1 (Residue): first residue.
    r2 (Residue): second residue.

Returns:

    np.floating[Any]: distance between the two residues.

Source code in procaliper/protein_structure/distance.py
def residue_distance(
    r1: Residue,
    r2: Residue,
) -> np.floating[Any]:
    """Compute the distance between two residues, in Angstroms.

    Args:
        r1 (Residue): first residue.
        r2 (Residue): second residue.

    Returns:
        np.floating[Any]: distance between the two residues.
    """
    dv = r1["CA"].coord - r2["CA"].coord
    return np.linalg.norm(dv)

sasa

SASAData

Bases: TypedDict

Data class holding SASA data computed from a PDB file.

Array index corresponds to residue number in the PDB. Note that Python arrays are 0-indexed and PDB files are 1-indexed, so Python index 0 corresponds to residue 1. This assumes a complete PDB. Otherwise, an object of the procaliper.Protein class that constructs this will store a variable called structure_index that maps these indices to the sequence position.

Attributes:

    all_sasa_value (list[float]): The overall SASA value for each site (computed as
        the sum of atom SASA values).
    atom_sasa_values (list[list[float]]): The SASA value for each atom in each site.
        Atoms are ordered from C-terminus to N-terminus according to standard PDB
        order. For example, in CYS, the last atom is always the SG sulfur.

Source code in procaliper/protein_structure/sasa.py
class SASAData(TypedDict):
    """Data class for holding SASA data from computed from a PDB file.

    Array index corresponds to residue number in the PDB. Note that Python
    arrays are 0-indexed and PDB files are 1-indexed, so Python index 0
    corresponds to residue 1. This assumes a complete PDB. Otherwise,
    an object of the `procaliper.Protein` class that constructs this will
    store a variable called `structure_index` that maps these indices to the
    sequence position.

    Attributes:
        all_sasa_value (list[float]): The overall SASA value for each site
            (computed as sum of atom SASA values).
        atom_sasa_values (list[list[float]]): The SASA value for each atom
            in each site. Atoms are ordered from C-terminus to N-terminus
            according to standard pdb order. For example, in CYS, the last atom
            is always the SG sulfur.
    """

    all_sasa_value: list[float]
    atom_sasa_values: list[list[float]]

calculate_sasa(pdb_filename)

Compute the SASA values for all residue sites in a PDB file.

Uses the ShrakeRupley algorithm implemented in Bio.PDB.SASA.ShrakeRupley with a probe radius of 1.40 and 100 points.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    SASAData: A data class holding SASA data computed from a PDB file.

Source code in procaliper/protein_structure/sasa.py
def calculate_sasa(pdb_filename: str) -> SASAData:
    """Compute the SASA values for all CYS sites in a PDB file.

    Uses the ShrakeRupley algorithm implemented in `Bio.PDB.SASA.ShrakeRupley`
    with a probe radius of 1.40 and 100 points.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        SASAData: A data class holding SASA data computed from a PDB file."""
    p = PDBParser(QUIET=True)
    struct = p.get_structure("", pdb_filename)

    sr = ShrakeRupley(probe_radius=PROBE_RADIUS, n_points=N_POINTS, radii_dict=None)

    # Calc sasa values from Residues (from atoms)
    sr.compute(struct, level="R")

    # Set up dict
    res = SASAData(
        {
            "all_sasa_value": [],
            "atom_sasa_values": [],
        }
    )

    assert isinstance(struct, Structure)
    assert struct is not None

    # Fill dict with per-residue SASA values (heteroatoms are skipped below)
    for x in struct.child_list:
        for y in x.child_list:
            for z in y.child_list:
                if z.get_id()[0] != " ":  # skips heteroatoms
                    continue
                assert hasattr(z, "sasa")
                res["all_sasa_value"].append(z.sasa)
                res["atom_sasa_values"].append([zx.sasa for zx in z.child_list])  # type: ignore

    return res

titration

TitrationData

Bases: TypedDict

Data class for titration data.

Array index corresponds to residue number in the PDB. Note that Python arrays are 0-indexed and PDB files are 1-indexed, so Python index 0 corresponds to residue 1. This assumes a complete PDB. Otherwise, an object of the procaliper.Protein class that constructs this will store a variable called structure_index that maps these indices to the sequence position.

Attributes:

    pKa (list[float | None]): The pKa values for the titration data. Non-titratable
        sites are assigned None values.
    protonation_state (list[tuple[str, float]]): The expected protonation states for
        the titration data. The first element of the tuple is the state of the site
        and the second element is the average protonation of the site. Non-titratable
        sites are assigned ("undefined", nan).

Source code in procaliper/protein_structure/titration.py
class TitrationData(TypedDict):
    """Data class for titration data.

    Array index corresponds to residue number in the PDB. Note that Python
    arrays are 0-indexed and PDB files are 1-indexed, so Python index 0
    corresponds to residue 1. This assumes a complete PDB. Otherwise,
    an object of the `procaliper.Protein` class that constructs this will
    store a variable called `structure_index` that maps these indices to the
    sequence position.

    Attributes:
        pKa (list[float | None]): The pKa values for the titration data.
            Non-titratable sites are assigned `None` values.
        protonation_state (list[tuple[str, float]]): The expected protonation
            states for the titration data. The first element of the tuple is the
            state of the site and the second element is the average protonation
            of the site. Non-titratable sites are assigned `("undefined", nan)`.
    """

    pKa: list[float | None]
    protonation_state: list[tuple[str, float | str]]

calculate_titration_propka(pdb_filename)

Uses propka to calculate titration data for the protein.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    TitrationData: The titration data for the protein.

Source code in procaliper/protein_structure/titration.py
def calculate_titration_propka(pdb_filename: str) -> TitrationData:
    """Uses propka to calculate titration data for the protein.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        TitrationData: The titration data for the protein.
    """
    mol = propka.run.single(pdb_filename, optargs=["--quiet"], write_pka=False)
    gs = mol.conformations["AVR"].groups

    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    seq = {
        i: res["residue_name"].iloc[0]
        for i, res in ppdb.df["ATOM"].groupby("residue_number")
    }
    pks = {group.atom.res_num: group.pka_value for group in gs}
    sv = sorted(seq.items())
    return TitrationData(
        # pKa=[group.pka_value for group in gs],
        pKa=[pks[i] if i in pks else None for i, _ in sv],
        protonation_state=[_state_from_pk(pks[i] if i in pks else 0) for i, _ in sv],
    )
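
Example (a minimal sketch; the PDB path is hypothetical):

from procaliper.protein_structure.titration import calculate_titration_propka

tit = calculate_titration_propka("example.pdb")  # hypothetical path
for number, (pka, (state, avg)) in enumerate(zip(tit["pKa"], tit["protonation_state"]), start=1):
    if pka is not None:
        print(number, pka, state, avg)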

site_metadata

CustomSiteData

Class for storing custom site-level data.

Source code in procaliper/site_metadata/custom_site_data.py
class CustomSiteData:
    """Class for storing custom site-level data."""

    def __init__(self, residue_number: list[int], data: dict[str, list[Any]]) -> None:
        self.residue_number = residue_number
        for key, value in data.items():
            setattr(self, key, value)

        self.keys = {"residue_number"} | set(data.keys())

    @classmethod
    def from_dict(
        cls,
        data: dict[str, list[Any]],
        residue_index_feature_name: str = "residue_number",
    ) -> CustomSiteData:
        """Create a CustomSiteData object from a dictionary of data.

        Args:
            data (dict[str, list[Any]]): Data dictionary indexed by feature
                name. Each value must be a list of the same length as the
                residue number feature. Must include a residue number key.
            residue_index_feature_name (str, optional): The name of the feature
                that contains the residue number. Defaults to "residue_number".

        Raises:
            ValueError: If the residue number feature is not in the data.

        Returns:
            CustomSiteData: A CustomSiteData object that contains the data.
        """
        if residue_index_feature_name not in data:
            raise ValueError("CustomSiteData must have a residue_number key.")
        return cls(data[residue_index_feature_name], data)

    def table(self) -> dict[str, list[Any]]:
        """Return a dictionary of the data in the CustomSiteData object.

        Returns:
            dict[str, list[Any]]: A dictionary of the data in the CustomSiteData
                object.
        """
        return {k: getattr(self, k) for k in self.keys}

    def add_residue_numbers(self, residue_number: list[int] | int) -> None:
        """Specify the number of residues in the CustomSiteData object.

        Args:
            residue_number (list[int] | int): If an integer, the number of
                residues. If a list of integers, the list of residue numbers.
        """
        if isinstance(residue_number, int):
            self.residue_number = list(range(1, residue_number + 1))
        else:
            self.residue_number = residue_number

    def add_site_data(self, key: str, row: list[Any], overwrite: bool = False) -> None:
        """Add a site-level feature to the CustomSiteData object.

        Args:
            key (str): The name of the feature to add.
            row (list[Any]): The values for the feature.

            overwrite (bool, optional): Whether to overwrite an existing
                feature. Defaults to False.

        Raises:
            KeyError: If overwrite is False and the feature already exists.
            ValueError: If the number of values in the feature does not match
                the number of residues.
        """
        if hasattr(self, key) and not overwrite:
            raise KeyError(
                f"CustomSiteData already has a {key} key and overwrite is False."
            )

        if len(row) != len(self.residue_number):
            raise ValueError(
                f"CustomSiteData has {len(self.residue_number)} residues, but {key} has {len(row)} values."
                " Perhaps you forgot to call add_residue_numbers?"
            )

        setattr(self, key, row)
        self.keys.add(key)
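
Example (a minimal sketch; the feature names and values are hypothetical):

from procaliper.site_metadata.custom_site_data import CustomSiteData

data = CustomSiteData.from_dict(
    {
        "residue_number": [1, 2, 3],
        "my_score": [0.2, 1.3, 0.8],  # hypothetical per-residue feature
    }
)
data.add_site_data("my_flag", [True, False, True])
print(data.table())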

add_residue_numbers(residue_number)

Specify the number of residues in the CustomSiteData object.

Parameters:

    residue_number (list[int] | int): If an integer, the number of residues. If a
        list of integers, the list of residue numbers.
Source code in procaliper/site_metadata/custom_site_data.py
def add_residue_numbers(self, residue_number: list[int] | int) -> None:
    """Specify the number of residues in the CustomSiteData object.

    Args:
        residue_number (list[int] | int): If an integer, the number of
            residues. If a list of integers, the list of residue numbers.
    """
    if isinstance(residue_number, int):
        self.residue_number = list(range(1, residue_number + 1))
    else:
        self.residue_number = residue_number

add_site_data(key, row, overwrite=False)

Add a site-level feature to the CustomSiteData object.

Parameters:

    key (str): The name of the feature to add.
    row (list[Any]): The values for the feature.
    overwrite (bool, optional): Whether to overwrite an existing feature.
        Defaults to False.

Raises:

    KeyError: If overwrite is False and the feature already exists.
    ValueError: If the number of values in the feature does not match the number
        of residues.

Source code in procaliper/site_metadata/custom_site_data.py
def add_site_data(self, key: str, row: list[Any], overwrite: bool = False) -> None:
    """Add a site-level feature to the CustomSiteData object.

    Args:
        key (str): The name of the feature to add.
        row (list[Any]): The values for the feature.

        overwrite (bool, optional): Whether to overwrite an existing
            feature. Defaults to False.

    Raises:
        KeyError: If overwrite is False and the feature already exists.
        ValueError: If the number of values in the feature does not match
            the number of residues.
    """
    if hasattr(self, key) and not overwrite:
        raise KeyError(
            f"CustomSiteData already has a {key} key and overwrite is False."
        )

    if len(row) != len(self.residue_number):
        raise ValueError(
            f"CustomSiteData has {len(self.residue_number)} residues, but {key} has {len(row)} values."
            " Perhaps you forgot to call add_residue_numbers?"
        )

    setattr(self, key, row)
    self.keys.add(key)

from_dict(data, residue_index_feature_name='residue_number') classmethod

Create a CustomSiteData object from a dictionary of data.

Parameters:

    data (dict[str, list[Any]]): Data dictionary indexed by feature name. Each value
        must be a list of the same length as the residue number feature. Must include
        a residue number key.
    residue_index_feature_name (str, optional): The name of the feature that contains
        the residue number. Defaults to "residue_number".

Raises:

    ValueError: If the residue number feature is not in the data.

Returns:

    CustomSiteData: A CustomSiteData object that contains the data.

Source code in procaliper/site_metadata/custom_site_data.py
@classmethod
def from_dict(
    cls,
    data: dict[str, list[Any]],
    residue_index_feature_name: str = "residue_number",
) -> CustomSiteData:
    """Create a CustomSiteData object from a dictionary of data.

    Args:
        data (dict[str, list[Any]]): Data dictionary indexed by feature
            name. Each value must be a list of the same length as the
            residue number feature. Must include a residue number key.
        residue_index_feature_name (str, optional): The name of the feature
            that contains the residue number. Defaults to "residue_number".

    Raises:
        ValueError: If the residue number feature is not in the data.

    Returns:
        CustomSiteData: A CustomSiteData object that contains the data.
    """
    if residue_index_feature_name not in data:
        raise ValueError("CustomSiteData must have a residue_number key.")
    return cls(data[residue_index_feature_name], data)

table()

Return a dictionary of the data in the CustomSiteData object.

Returns:

    dict[str, list[Any]]: A dictionary of the data in the CustomSiteData object.

Source code in procaliper/site_metadata/custom_site_data.py
def table(self) -> dict[str, list[Any]]:
    """Return a dictionary of the data in the CustomSiteData object.

    Returns:
        dict[str, list[Any]]: A dictionary of the data in the CustomSiteData
            object.
    """
    return {k: getattr(self, k) for k in self.keys}

SiteAnnotations

Class for parsing and storing UniProt site annotations.

An example of a UniProt site annotation:

DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; DISULFID 105; /note="Interchain (with heavy chain)"

Attributes:

    residue_letter (list[str]): A list of amino acid letters.
    residue_number (list[int]): A list of residue numbers.
    binding (list[bool]): A list of booleans indicating whether a residue is a binding site.
    active (list[bool]): A list of booleans indicating whether a residue is an active site.
    ptm (list[bool]): A list of booleans indicating whether a residue is reported to be post-translationally modified.
    dna_binding (list[bool]): A list of booleans indicating whether a residue is a DNA binding site.
    disulfide_bond (list[bool]): A list of booleans indicating whether a residue is part of a disulfide bond.
    helix (list[bool]): A list of booleans indicating whether a residue is in a helix.
    turn (list[bool]): A list of booleans indicating whether a residue is in a turn.
    beta_strand (list[bool]): A list of booleans indicating whether a residue is in a beta strand.
    binding_data (list[dict[str, str]]): A list of dictionaries containing binding site metadata.
    active_data (list[dict[str, str]]): A list of dictionaries containing active site metadata.
    ptm_data (list[dict[str, str]]): A list of dictionaries containing post-translationally modified site metadata.
    regions (dict[str, list[int]]): A dictionary mapping region names to lists of (zero-indexed) residue numbers.
    region_data (dict[str, str]): A dictionary mapping region names to annotation data.
    domains (dict[str, list[int]]): A dictionary mapping domain names to lists of (zero-indexed) residue numbers.
    domain_data (dict[str, str]): A dictionary mapping domain names to annotation data.

Source code in procaliper/site_metadata/uniprot_site_parsing.py
class SiteAnnotations:
    """Class for parsing and storing UniProt site annotations.

    An example of a UniProt site annotation:

    `DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; DISULFID 105; /note="Interchain (with heavy chain)"`

    Attributes:
        residue_letter (list[str]): A list of amino acid letters.
        residue_number (list[int]): A list of residue numbers.
        binding (list[bool]): A list of booleans indicating whether a residue
            is a binding site.
        active (list[bool]): A list of booleans indicating whether a residue
            is an active site.
        ptm (list[bool]): A list of booleans indicating whether a residue
            is reported to be post-translationally modified.
        dna_binding (list[bool]): A list of booleans indicating whether a residue
            is a DNA binding site.
        disulfide_bond (list[bool]): A list of booleans indicating whether a residue
            is part of a disulfide bond.
        helix (list[bool]): A list of booleans indicating whether a residue
            is in a helix.
        turn (list[bool]): A list of booleans indicating whether a residue
            is in a turn.
        beta_strand (list[bool]): A list of booleans indicating whether a residue
            is in a beta strand.
        binding_data (list[dict[str, str]]): A list of dictionaries containing
            binding site metadata.
        active_data (list[dict[str, str]]): A list of dictionaries containing
            active site metadata.
        ptm_data (list[dict[str, str]]): A list of dictionaries containing
            post-translationally modified site metadata.
        regions (dict[str,list[int]]): A dictionary mapping region names to lists
            of (zero-indexed) residue numbers.
        region_data (dict[str,str]): A dictionary mapping region names to annotation data.
        domains (dict[str,list[int]]): A dictionary mapping domain names to lists
            of (zero-indexed) residue numbers.
        domain_data (dict[str,str]): A dictionary mapping domain names to annotation data.
    """

    fields_by_description_type: dict[str, list[str]] = {
        "BINDING": ["ligand"],
        "ACT_SITE": ["note"],
        "MOD_RES": ["note"],
        "REGION": ["note"],
        "DOMAIN": ["note"],
        "DNA_BIND": [],
        "DISULFID": [],
        "HELIX": [],
        "TURN": [],
        "STRAND": [],
    }

    def __init__(self, sequence: str) -> None:
        """Instantiates a SiteAnnotations object from a string of amino acid letters.

        It is recommended to call `SiteAnnotations.extract_annotation` after instantiating.
        Before that, the `SiteAnnotations` object contains only default values.

        Args:
            sequence (str): A string of amino acid letters. See
                `type_aliases.AminoAcidLetter` for valid letters.
        """
        self.residue_letter: list[str] = list(sequence)
        self.residue_number: list[int] = list(range(1, len(sequence) + 1))
        self.binding: list[bool] = [False] * len(sequence)
        self.active: list[bool] = [False] * len(sequence)
        self.ptm: list[bool] = [False] * len(sequence)
        self.dna_binding: list[bool] = [False] * len(sequence)
        self.disulfide_bond: list[bool] = [False] * len(sequence)
        self.helix: list[bool] = [False] * len(sequence)
        self.turn: list[bool] = [False] * len(sequence)
        self.beta_strand: list[bool] = [False] * len(sequence)

        self.binding_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]
        self.active_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]
        self.ptm_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]

        self.regions: dict[str, list[int]] = {}
        self.region_data: dict[str, dict[str, str]] = {}

        self.domains: dict[str, list[int]] = {}
        self.domain_data: dict[str, dict[str, str]] = {}

    def table(self) -> dict[str, list[Any]]:
        """Return a dictionary of the data in the SiteAnnotations object.

        Returns:
            dict[str, list[Any]]: Each key is a site annotation feature name.
                Each value is a list of the values for that feature.
        """
        tbl: dict[str, list[Any]] = {}

        tbl["residue_letter"] = self.residue_letter
        tbl["residue_number"] = self.residue_number
        tbl["binding"] = self.binding
        tbl["active"] = self.active
        tbl["ptm"] = self.ptm
        tbl["dna_binding"] = self.dna_binding
        tbl["disulfide_bond"] = self.disulfide_bond
        tbl["helix"] = self.helix
        tbl["turn"] = self.turn
        tbl["beta_strand"] = self.beta_strand
        tbl["binding_data"] = self.binding_data
        tbl["active_data"] = self.active_data
        tbl["ptm_data"] = self.ptm_data

        return tbl

    def __len__(self) -> int:
        return len(self.residue_letter)

    def _parse_description(
        self,
        description_type: str,
        description: str,
        extract_metadata: bool | None = None,
    ) -> tuple[list[bool], list[dict[str, str]] | None]:
        # example of a description:
        # DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; DISULFID 105; /note="Interchain (with heavy chain)"

        site_matches = [False] * len(self)

        site_data: list[dict[str, str]] | None = None

        if extract_metadata is None:
            extract_metadata = bool(self.fields_by_description_type[description_type])
        if extract_metadata:
            site_data = [{} for _ in range(len(self))]

        if description_type not in self.fields_by_description_type:
            raise NotImplementedError(f"Unknown description type: {description_type}")
        if (
            not description or description != description
        ):  # not-equal check is for pandas nans
            return site_matches, site_data
        if description_type not in description:
            raise ValueError(
                f"{description_type} does not appear in the description: {description}"
            )

        stretches = description.split(description_type)

        # first stretch is always empty
        for stretch in stretches[1:]:
            fields = stretch.split(";")
            # first field is always site numbers
            se = fields[0].strip().split("..")
            start, end = len(self), len(self)
            if len(se) not in (1, 2):
                raise ValueError(
                    f"Unable to parse site numbers {se} in {stretch} from {description}"
                )
            se_start = se[0].split(":")[-1]

            if len(se) == 1:
                start, end = (
                    int(se_start) - 1,
                    int(se_start) - 1,
                )  # uniprot 1-indexes sites
            else:
                start, end = int(se_start) - 1, int(se[1]) - 1

            if start >= len(self) or end >= len(self) or start > end:
                raise ValueError(
                    f"Improperly formatted descritpion; site numbers not recognized: {stretch} in {description}"
                )

            field_sites = list(range(start, end + 1))
            for s in field_sites:
                site_matches[s] = True
                if se[0] != se_start and extract_metadata:
                    # site_data is populated if extract_metadata is True
                    # mypy does not catch this
                    site_data[s]["isoform"] = se[0].split(":")[0]  # type: ignore

            if len(fields) == 1 or site_data is None:
                continue

            for field in fields[1:]:
                field = field.strip()
                for field_id in self.fields_by_description_type[description_type]:
                    if not field.startswith(f"/{field_id}="):
                        continue
                    field_data = field.removeprefix(f"/{field_id}=")
                    for s in field_sites:
                        if field_id not in site_data[s]:
                            site_data[s][field_id] = field_data
                        else:
                            site_data[s][field_id] += "," + field_data

        return site_matches, site_data

    def _region_parsing(self, description: str) -> None:
        region_annotations = description.split("REGION ")[1:]
        self.regions = {}
        self.region_data = {}
        for region_index, x in enumerate(region_annotations):
            r = f"r_{region_index}"
            fields = x.split(";")
            self.regions[r] = list(
                range(
                    int(fields[0].split("..")[0]) - 1,
                    int(fields[0].split("..")[1]),
                )
            )
            self.region_data[r] = {}
            for field in fields[1:]:
                field = field.strip()
                for field_id in self.fields_by_description_type["REGION"]:
                    if not field.startswith(f"/{field_id}="):
                        continue
                    field_data = field.removeprefix(f"/{field_id}=")
                    if field_id not in self.region_data[r]:
                        self.region_data[r][field_id] = field_data
                    else:
                        self.region_data[r][field_id] += "," + field_data

    def _domain_parsing(self, description: str) -> None:
        domain_annotations = description.split("DOMAIN ")[1:]
        self.domains = {}
        self.domain_data = {}
        for domain_index, x in enumerate(domain_annotations):
            r = f"d_{domain_index}"
            fields = x.split(";")
            self.domains[r] = list(
                range(
                    int(fields[0].split("..")[0]) - 1,
                    int(fields[0].split("..")[1]),
                )
            )
            self.domain_data[r] = {}
            for field in fields[1:]:
                field = field.strip()
                for field_id in self.fields_by_description_type["DOMAIN"]:
                    if not field.startswith(f"/{field_id}="):
                        continue
                    field_data = field.removeprefix(f"/{field_id}=")
                    if field_id not in self.domain_data[r]:
                        self.domain_data[r][field_id] = field_data
                    else:
                        self.domain_data[r][field_id] += "," + field_data

    def extract_annotation(
        self,
        description_type: str,
        description: str,
        extract_metadata: bool | None = None,
    ) -> None:
        """Extracts the site annotations from the description.

        Args:
            description_type (str): The type of site annotation to extract. Must be
                one of the keys in `self.fields_by_description_type`.
            description (str): The UniProt site description string.
            extract_metadata (bool | None, optional): Whether to extract metadata.
                By default, this is inferred from the `description_type` parameter.

        Raises:
            NotImplementedError: From `_parse_description`. If an unknown `description_type` is provided.
            ValueError: From `_parse_description`. If the `description_type` is not found in `description`.
            AssertionError: If a `description_type` is provided that is known to `_parse_description` but
                not `extract_annotation`. This indicates an internal bug and should be reported.
        """
        # regions are a special case because they can overlap
        if description_type == "REGION":
            self._region_parsing(description)
            return
        if description_type == "DOMAIN":
            self._domain_parsing(description)
            return

        matches, data = self._parse_description(
            description_type, description, extract_metadata
        )
        if description_type == "ACT_SITE":
            self.active = matches
            if data:
                self.active_data = data
        elif description_type == "BINDING":
            self.binding = matches
            if data:
                self.binding_data = data
        elif description_type == "MOD_RES":
            self.ptm = matches
            if data:
                self.ptm_data = data
        elif description_type == "DNA_BIND":
            self.dna_binding = matches
        elif description_type == "DISULFID":
            self.disulfide_bond = matches
        elif description_type == "STRAND":
            self.beta_strand = matches
        elif description_type == "HELIX":
            self.helix = matches
        elif description_type == "TURN":
            self.turn = matches
        else:
            raise AssertionError(
                f"If this is raised, the description type {description_type} is only partially handled. Please file an issue."
            )
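
A brief usage sketch of the class above, assuming it can be imported from the path given in the source note (procaliper/site_metadata/uniprot_site_parsing.py); the 120-residue dummy sequence exists only so the positions from the docstring example are in range:

from procaliper.site_metadata.uniprot_site_parsing import SiteAnnotations

# Dummy sequence long enough to cover residues 28..87 and 105 from the example.
site = SiteAnnotations("A" * 120)

description = (
    'DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; '
    'DISULFID 105; /note="Interchain (with heavy chain)"'
)
site.extract_annotation("DISULFID", description)

# UniProt positions are 1-indexed; the boolean lists are 0-indexed.
assert site.disulfide_bond[27] and site.disulfide_bond[86] and site.disulfide_bond[104]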

__init__(sequence)

Instantiates a SiteAnnotations object from a string of amino acid letters.

It is recommended to call SiteAnnotations.extract_annotation after instantiating. Before that, the SiteAnnotations object contains only default values.

Parameters:

sequence (str, required): A string of amino acid letters. See type_aliases.AminoAcidLetter for valid letters.
Source code in procaliper/site_metadata/uniprot_site_parsing.py
def __init__(self, sequence: str) -> None:
    """Instantiates a SiteAnnotations object from a string of amino acid letters.

    It is recommended to call `SiteAnnotations.extract_annotation` after instantiating.
    Before that, the `SiteAnnotations` object contains only default values.

    Args:
        sequence (str): A string of amino acid letters. See
            `type_aliases.AminoAcidLetter` for valid letters.
    """
    self.residue_letter: list[str] = list(sequence)
    self.residue_number: list[int] = list(range(1, len(sequence) + 1))
    self.binding: list[bool] = [False] * len(sequence)
    self.active: list[bool] = [False] * len(sequence)
    self.ptm: list[bool] = [False] * len(sequence)
    self.dna_binding: list[bool] = [False] * len(sequence)
    self.disulfide_bond: list[bool] = [False] * len(sequence)
    self.helix: list[bool] = [False] * len(sequence)
    self.turn: list[bool] = [False] * len(sequence)
    self.beta_strand: list[bool] = [False] * len(sequence)

    self.binding_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]
    self.active_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]
    self.ptm_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]

    self.regions: dict[str, list[int]] = {}
    self.region_data: dict[str, dict[str, str]] = {}

    self.domains: dict[str, list[int]] = {}
    self.domain_data: dict[str, dict[str, str]] = {}

extract_annotation(description_type, description, extract_metadata=None)

Extracts the site annotations from the description.

Parameters:

description_type (str, required): The type of site annotation to extract. Must be one of the keys in self.fields_by_description_type.

description (str, required): The UniProt site description string.

extract_metadata (bool | None, default None): Whether to extract metadata. By default, this is inferred from the description_type parameter.

Raises:

NotImplementedError: From _parse_description. If an unknown description_type is provided.

ValueError: From _parse_description. If the description_type is not found in description.

AssertionError: If a description_type is provided that is known to _parse_description but not extract_annotation. This indicates an internal bug and should be reported.

Source code in procaliper/site_metadata/uniprot_site_parsing.py
def extract_annotation(
    self,
    description_type: str,
    description: str,
    extract_metadata: bool | None = None,
) -> None:
    """Extracts the site annotations from the description.

    Args:
        description_type (str): The type of site annotation to extract. Must be
            one of the keys in `self.fields_by_description_type`.
        description (str): The UniProt site description string.
        extract_metadata (bool | None, optional): Whether to extract metadata.
            By default, this is inferred from the `description_type` parameter.

    Raises:
        NotImplementedError: From `_parse_description`. If an unknown `description_type` is provided.
        ValueError: From `_parse_description`. If the `description_type` is not found in `description`.
        AssertionError: If a `description_type` is provided that is known to `_parse_description` but
            not `extract_annotation`. This indicates an internal bug and should be reported.
    """
    # regions are a special case because they can overlap
    if description_type == "REGION":
        self._region_parsing(description)
        return
    if description_type == "DOMAIN":
        self._domain_parsing(description)
        return

    matches, data = self._parse_description(
        description_type, description, extract_metadata
    )
    if description_type == "ACT_SITE":
        self.active = matches
        if data:
            self.active_data = data
    elif description_type == "BINDING":
        self.binding = matches
        if data:
            self.binding_data = data
    elif description_type == "MOD_RES":
        self.ptm = matches
        if data:
            self.ptm_data = data
    elif description_type == "DNA_BIND":
        self.dna_binding = matches
    elif description_type == "DISULFID":
        self.disulfide_bond = matches
    elif description_type == "STRAND":
        self.beta_strand = matches
    elif description_type == "HELIX":
        self.helix = matches
    elif description_type == "TURN":
        self.turn = matches
    else:
        raise AssertionError(
            f"If this is raised, the description type {description_type} is only partially handled. Please file an issue."
        )
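
Because fields_by_description_type maps "BINDING" to ["ligand"], metadata extraction is enabled by default for binding sites. A minimal sketch, using a hypothetical description string in the same format as the UniProt example above:

from procaliper.site_metadata.uniprot_site_parsing import SiteAnnotations

site = SiteAnnotations("M" * 50)
site.extract_annotation("BINDING", 'BINDING 12; /ligand="Zn(2+)"')

assert site.binding[11]          # residue 12, zero-indexed
print(site.binding_data[11])     # {'ligand': '"Zn(2+)"'}; the quotes are kept as-is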

table()

Return a dictionary of the data in the SiteAnnotations object.

Returns:

dict[str, list[Any]]: Each key is a site annotation feature name. Each value is a list of the values for that feature.

Source code in procaliper/site_metadata/uniprot_site_parsing.py
def table(self) -> dict[str, list[Any]]:
    """Return a dictionary of the data in the SiteAnnotations object.

    Returns:
        dict[str, list[Any]]: Each key is a site annotation feature name.
            Each value is a list of the values for that feature.
    """
    tbl: dict[str, list[Any]] = {}

    tbl["residue_letter"] = self.residue_letter
    tbl["residue_number"] = self.residue_number
    tbl["binding"] = self.binding
    tbl["active"] = self.active
    tbl["ptm"] = self.ptm
    tbl["dna_binding"] = self.dna_binding
    tbl["disulfide_bond"] = self.disulfide_bond
    tbl["helix"] = self.helix
    tbl["turn"] = self.turn
    tbl["beta_strand"] = self.beta_strand
    tbl["binding_data"] = self.binding_data
    tbl["active_data"] = self.active_data
    tbl["ptm_data"] = self.ptm_data

    return tbl
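
Since table() returns a plain dict of equal-length lists, it can be handed directly to a DataFrame constructor. A sketch, assuming pandas is installed (it is not required by table() itself) and that `site` is a SiteAnnotations instance such as the one built above:

import pandas as pd

df = pd.DataFrame(site.table())
print(df[["residue_number", "binding", "ptm"]].head())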

custom_site_data

CustomSiteData

Class for storing custom site-level data.

Source code in procaliper/site_metadata/custom_site_data.py
class CustomSiteData:
    """Class for storing custom site-level data."""

    def __init__(self, residue_number: list[int], data: dict[str, list[Any]]) -> None:
        self.residue_number = residue_number
        for key, value in data.items():
            setattr(self, key, value)

        self.keys = {"residue_number"} | set(data.keys())

    @classmethod
    def from_dict(
        cls,
        data: dict[str, list[Any]],
        residue_index_feature_name: str = "residue_number",
    ) -> CustomSiteData:
        """Create a CustomSiteData object from a dictionary of data.

        Args:
            data (dict[str, list[Any]]): Data dictionary indexed by feature
                name. Each value must be a list of the same length as the
                residue number feature. Must include a residue number key.
            residue_index_feature_name (str, optional): The name of the feature
                that contains the residue number. Defaults to "residue_number".

        Raises:
            ValueError: If the residue number feature is not in the data.

        Returns:
            CustomSiteData: A CustomSiteData object that contains the data.
        """
        if residue_index_feature_name not in data:
            raise ValueError("CustomSiteData must have a residue_number key.")
        return cls(data[residue_index_feature_name], data)

    def table(self) -> dict[str, list[Any]]:
        """Return a dictionary of the data in the CustomSiteData object.

        Returns:
            dict[str, list[Any]]: A dictionary of the data in the CustomSiteData
                object.
        """
        return {k: getattr(self, k) for k in self.keys}

    def add_residue_numbers(self, residue_number: list[int] | int) -> None:
        """Specify the number of residues in the CustomSiteData object.

        Args:
            residue_number (list[int] | int): If an integer, the number of
                residues. If a list of integers, the list of residue numbers.
        """
        if isinstance(residue_number, int):
            self.residue_number = list(range(1, residue_number + 1))
        else:
            self.residue_number = residue_number

    def add_site_data(self, key: str, row: list[Any], overwrite: bool = False) -> None:
        """Add a site-level feature to the CustomSiteData object.

        Args:
            key (str): The name of the feature to add.
            row (list[Any]): The values for the feature.

            overwrite (bool, optional): Whether to overwrite an existing
                feature. Defaults to False.

        Raises:
            KeyError: If overwrite is False and the feature already exists.
            ValueError: If the number of values in the feature does not match
                the number of residues.
        """
        if hasattr(self, key) and not overwrite:
            raise KeyError(
                f"CustomSiteData already has a {key} key and overwrite is False."
            )

        if len(row) != len(self.residue_number):
            raise ValueError(
                f"CustomSiteData has {len(self.residue_number)} residues, but {key} has {len(row)} values."
                " Perhaps you forgot to call add_residue_numbers?"
            )

        setattr(self, key, row)
        self.keys.add(key)
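
A short construction sketch, with the import path taken from the source note above; the "hydrophobicity" and "conserved" feature names are hypothetical:

from procaliper.site_metadata.custom_site_data import CustomSiteData

data = CustomSiteData.from_dict(
    {
        "residue_number": [1, 2, 3],
        "hydrophobicity": [0.1, -0.4, 0.7],   # hypothetical per-residue feature
    }
)
data.add_site_data("conserved", [True, False, True])
print(data.table())   # dict with residue_number, hydrophobicity, and conserved
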
add_residue_numbers(residue_number)

Specify the number of residues in the CustomSiteData object.

Parameters:

residue_number (list[int] | int, required): If an integer, the number of residues. If a list of integers, the list of residue numbers.
Source code in procaliper/site_metadata/custom_site_data.py
def add_residue_numbers(self, residue_number: list[int] | int) -> None:
    """Specify the number of residues in the CustomSiteData object.

    Args:
        residue_number (list[int] | int): If an integer, the number of
            residues. If a list of integers, the list of residue numbers.
    """
    if isinstance(residue_number, int):
        self.residue_number = list(range(1, residue_number + 1))
    else:
        self.residue_number = residue_number
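
As a sketch of the incremental path, an empty object can be sized first and then filled; the "score" feature name is hypothetical:

from procaliper.site_metadata.custom_site_data import CustomSiteData

custom = CustomSiteData([], {})
custom.add_residue_numbers(4)                         # residue_number becomes [1, 2, 3, 4]
custom.add_site_data("score", [0.2, 0.9, 0.1, 0.5])   # hypothetical per-residue values
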
add_site_data(key, row, overwrite=False)

Add a site-level feature to the CustomSiteData object.

Parameters:

key (str, required): The name of the feature to add.

row (list[Any], required): The values for the feature.

overwrite (bool, default False): Whether to overwrite an existing feature.

Raises:

KeyError: If overwrite is False and the feature already exists.

ValueError: If the number of values in the feature does not match the number of residues.

Source code in procaliper/site_metadata/custom_site_data.py
def add_site_data(self, key: str, row: list[Any], overwrite: bool = False) -> None:
    """Add a site-level feature to the CustomSiteData object.

    Args:
        key (str): The name of the feature to add.
        row (list[Any]): The values for the feature.

        overwrite (bool, optional): Whether to overwrite an existing
            feature. Defaults to False.

    Raises:
        KeyError: If overwrite is False and the feature already exists.
        ValueError: If the number of values in the feature does not match
            the number of residues.
    """
    if hasattr(self, key) and not overwrite:
        raise KeyError(
            f"CustomSiteData already has a {key} key and overwrite is False."
        )

    if len(row) != len(self.residue_number):
        raise ValueError(
            f"CustomSiteData has {len(self.residue_number)} residues, but {key} has {len(row)} values."
            " Perhaps you forgot to call add_residue_numbers?"
        )

    setattr(self, key, row)
    self.keys.add(key)
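
Continuing the sketch above, re-adding an existing feature raises KeyError unless overwrite=True is passed:

custom.add_site_data("score", [1.0, 1.0, 1.0, 1.0], overwrite=True)   # replaces the existing column
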
from_dict(data, residue_index_feature_name='residue_number') classmethod

Create a CustomSiteData object from a dictionary of data.

Parameters:

data (dict[str, list[Any]], required): Data dictionary indexed by feature name. Each value must be a list of the same length as the residue number feature. Must include a residue number key.

residue_index_feature_name (str, default 'residue_number'): The name of the feature that contains the residue number.

Raises:

ValueError: If the residue number feature is not in the data.

Returns:

CustomSiteData: A CustomSiteData object that contains the data.

Source code in procaliper/site_metadata/custom_site_data.py
@classmethod
def from_dict(
    cls,
    data: dict[str, list[Any]],
    residue_index_feature_name: str = "residue_number",
) -> CustomSiteData:
    """Create a CustomSiteData object from a dictionary of data.

    Args:
        data (dict[str, list[Any]]): Data dictionary indexed by feature
            name. Each value must be a list of the same length as the
            residue number feature. Must include a residue number key.
        residue_index_feature_name (str, optional): The name of the feature
            that contains the residue number. Defaults to "residue_number".

    Raises:
        ValueError: If the residue number feature is not in the data.

    Returns:
        CustomSiteData: A CustomSiteData object that contains the data.
    """
    if residue_index_feature_name not in data:
        raise ValueError("CustomSiteData must have a residue_number key.")
    return cls(data[residue_index_feature_name], data)
table()

Return a dictionary of the data in the CustomSiteData object.

Returns:

dict[str, list[Any]]: A dictionary of the data in the CustomSiteData object.

Source code in procaliper/site_metadata/custom_site_data.py
def table(self) -> dict[str, list[Any]]:
    """Return a dictionary of the data in the CustomSiteData object.

    Returns:
        dict[str, list[Any]]: A dictionary of the data in the CustomSiteData
            object.
    """
    return {k: getattr(self, k) for k in self.keys}

uniprot_site_parsing

SiteAnnotations

Class for parsing and storing UniProt site annotations.

An example of a UniProt site annotation:

DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; DISULFID 105; /note="Interchain (with heavy chain)"

Attributes:

residue_letter (list[str]): A list of amino acid letters.

residue_number (list[int]): A list of residue numbers.

binding (list[bool]): A list of booleans indicating whether a residue is a binding site.

active (list[bool]): A list of booleans indicating whether a residue is an active site.

ptm (list[bool]): A list of booleans indicating whether a residue is reported to be post-translationally modified.

dna_binding (list[bool]): A list of booleans indicating whether a residue is a DNA binding site.

disulfide_bond (list[bool]): A list of booleans indicating whether a residue is a disulfide bond.

helix (list[bool]): A list of booleans indicating whether a residue is in a helix.

turn (list[bool]): A list of booleans indicating whether a residue is in a turn.

beta_strand (list[bool]): A list of booleans indicating whether a residue is in a beta strand.

binding_data (list[dict[str, str]]): A list of dictionaries containing binding site metadata.

active_data (list[dict[str, str]]): A list of dictionaries containing active site metadata.

ptm_data (list[dict[str, str]]): A list of dictionaries containing post-translationally modified site metadata.

regions (dict[str, list[int]]): A dictionary mapping region names to lists of (zero-indexed) residue numbers.

region_data (dict[str, dict[str, str]]): A dictionary mapping region names to annotation data.

domains (dict[str, list[int]]): A dictionary mapping domain names to lists of (zero-indexed) residue numbers.

domain_data (dict[str, dict[str, str]]): A dictionary mapping domain names to annotation data.


view

ngl_scheme(data, float_to_hex=None, two_sided=False)

Converts a list of values to an nglview color scheme.

Parameters:

data (list[float], required): The list of values to convert.

float_to_hex (Callable[[float], str] | None, default None): Function that converts a float to a hex color in the form "#RRGGBB". If None, a default function is used that interpolates between white and green (one-sided) or red and blue (two-sided).

two_sided (bool, default False): Whether to use a two-sided color scheme. If False, data is assumed to contain only positive values.

Returns:

list[tuple[str, str]]: A list of color and residue number tuples that are compatible with nglview.

Source code in procaliper/view/nglview_utils.py
def ngl_scheme(
    data: list[float],
    float_to_hex: Callable[[float], str] | None = None,
    two_sided: bool = False,
) -> list[tuple[str, str]]:
    """Converts a list of values to an nglview color scheme.

    Args:
        data (list[float]): The list of values to convert.
        float_to_hex (Callable[[float], str] | None, optional): Function that
            converts a float to a hex color in the form `"#RRGGBB"`. If `None`,
            a default function is used that interpolates between white and green
            (one-sided) or red and blue (two-sided). Defaults to `None`.
        two_sided (bool, optional): Whether to use a two-sided color scheme. If
            `False`, we assume `data` only contains positive values. Defaults to
            `False`.

    Returns:
        list[tuple[str, str]]: A list of color and residue number tuples that
            are compatible with nglview.
    """
    if float_to_hex is None:
        if two_sided:
            float_to_hex = _default_float_to_hex_rb
        else:
            float_to_hex = _default_float_to_hex

    maxx = max(data)
    # normalize by the largest magnitude (two-sided) or by the maximum (one-sided)
    scale = max(abs(min(data)), abs(maxx)) if two_sided else maxx

    if scale == 0:
        data_scaled = [0.0] * len(data)
    else:
        data_scaled = [x / scale for x in data]

    return [(float_to_hex(x), f"{i+1}") for i, x in enumerate(data_scaled)]
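
A usage sketch, with the import path inferred from the source note above and made-up per-residue values:

from procaliper.view.nglview_utils import ngl_scheme

charges = [0.0, 0.4, -0.2, 1.0]               # hypothetical per-residue values
scheme = ngl_scheme(charges, two_sided=True)  # e.g. [("#rrggbb", "1"), ("#rrggbb", "2"), ...]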

protein_to_nglview(protein)

Generates an nglview widget from a protein that has an associated PDB file.

Must run protein.fetch_pdb first or specify an absolute path to the PDB in protein.pdb_location_absolute.

Parameters:

protein (Protein, required): The protein object to visualize.

Raises:

ValueError: If the PDB location is not set.

Returns:

nglview.NGLWidget: An nglview widget.

Source code in procaliper/view/nglview_utils.py
def protein_to_nglview(protein: Protein) -> nglview.NGLWidget:
    """Generates an nglview widget from a protein that has an associated PDB file.

    Must run `protein.fetch_pdb` first or specify an absolute path to the PDB
    in `protein.pdb_location_absolute`.

    Args:
        protein (Protein): The protein object to visualize.

    Raises:
        ValueError: If the PDB location is not set.

    Returns:
        nglview.NGLWidget: an nglview widget
    """
    if not protein.pdb_location_absolute:
        raise ValueError("PDB location not set; use `fetch_pdb` first")
    return nglview.show_file(protein.pdb_location_absolute)
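
A usage sketch for a notebook session, assuming `protein` is an existing procaliper Protein whose PDB file has already been fetched (see the docstring above):

from procaliper.view.nglview_utils import protein_to_nglview

widget = protein_to_nglview(protein)   # `protein` is assumed to have pdb_location_absolute set
widget   # rendering the widget as the last expression in a cell displays the structure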
