modules

Top-level package for procaliper.

Protein

Source code in procaliper/_protein.py
class Protein:
    UNIPROT_SITE_PATTERNS = {
        "Active site": "ACT_SITE",
        "Binding site": "BINDING",
        "DNA binding": "DNA_BIND",
        "Disulfide bond": "DISULFID",
        "Beta strand": "STRAND",
        "Helix": "HELIX",
        "Turn": "TURN",
        "PTM": "MOD_RES",
        "Region": "REGION",
        "Domain": "DOMAIN",
    }

    UNIPROT_SITE_PATTERNS_RECTIFIED = {
        "active": "ACT_SITE",
        "binding": "BINDING",
        "dna_binding": "DNA_BIND",
        "disulfide_bond": "DISULFID",
        "beta_strand": "STRAND",
        "helix": "HELIX",
        "turn": "TURN",
        "modified_residue": "MOD_RES",
        "region": "REGION",
        "domain": "DOMAIN",
        "domain_[ft]": "DOMAIN",
    }

    UNIPROT_API_DEFAULT_FIELDS = [
        "id",
        "reviewed",
        "protein_name",
        "gene_names",
        "organism_name",
        "length",
        "sequence",
        "ft_act_site",
        "ft_binding",
        "ft_dna_bind",
        "ft_disulfid",
        "ft_strand",
        "ft_helix",
        "ft_turn",
        "ft_mod_res",
        "ft_region",
        "ft_domain",
    ]

    def __init__(self) -> None:
        self.data: dict[str, Any] = {}
        self.pdb_location_relative: str | None = None
        self.pdb_location_absolute: str | None = None

        self.site_annotations: SiteAnnotations = SiteAnnotations("")
        self.custom_site_data: CustomSiteData = CustomSiteData([], {})

        self.confidence_data: list[float] | None = None
        self.sasa_data: structure.sasa.SASAData | None = None
        self.charge_data: structure.charge.ChargeData | None = None
        self.cysteine_data: structure.cysteine_data.CysteineData | None = None
        self.titration_data: structure.titration.TitrationData | None = None
        self.structure_index: list[int] | None = None
        self.sequence_position_to_structure_index: dict[int, int] | None = None
        pass

    def _rectify_label(self, label: str) -> str:
        new_label = label.replace(" ", "_").lower()
        new_label = new_label.removesuffix("_site_sites")
        new_label = new_label.removesuffix("_site")
        return new_label

    def _rectify_data_labels(self) -> None:
        """
        Standardize the feature names in self.data.

        Replaces all spaces with underscores, lowercases the keys, and then
        strips any trailing "_site_sites" or "_site" suffix.
        """
        for k in list(self.data.keys()):
            new_key = self._rectify_label(k)
            self.data[new_key] = self.data.pop(k)

    @classmethod
    def from_uniprot_row(cls, row: dict[str, Any]) -> Protein:
        """Create a new Protein object from a row from a Uniprot table

        Args:
            row (dict[str, Any]): Contains the data from the Uniprot table. Must
                have "Sequence" or "sequence" as a key.

        Raises:
            ValueError: If "Sequence" or "sequence" is not found in the row.

        Returns:
            Protein: A processed and standardized protein object.
        """
        p = cls()
        if "Sequence" in row:
            p.data["sequence"] = row["Sequence"]
        elif "sequence" in row:
            p.data["sequence"] = row["sequence"]
        else:
            raise ValueError(f"Sequence not found in row: {row}")
        p.custom_site_data.add_residue_numbers(len(p.data["sequence"]))
        p.site_annotations = SiteAnnotations(p.data["sequence"])
        for key, value in row.items():
            key = p._rectify_label(key)
            if key in cls.UNIPROT_SITE_PATTERNS_RECTIFIED:
                uniprot_description_id = cls.UNIPROT_SITE_PATTERNS_RECTIFIED[key]
                p.site_annotations.extract_annotation(uniprot_description_id, value)
            elif key in cls.UNIPROT_SITE_PATTERNS:
                uniprot_description_id = cls.UNIPROT_SITE_PATTERNS[key]
                p.site_annotations.extract_annotation(uniprot_description_id, value)
            else:
                if value != value:  # NaN check: NaN != NaN, so missing (NaN) values become empty strings
                    value = ""
                p.data[key] = value
        return p

    @classmethod
    def from_uniprot_id(
        cls,
        uniprot_id: str,
        fields: list[str] | None = None,
        from_db: str = "UniProtKB_AC-ID",
        to_db: str = "UniProtKB-Swiss-Prot",
    ) -> Protein:
        """Create a new Protein object from a Uniprot ID (fetches with Uniprot API)

        Args:
            uniprot_id (str): The Uniprot ID of the protein.
            fields (list[str] | None, optional): The fields to retrieve from
                Uniprot. If `None`, `Protein.UNIPROT_API_DEFAULT_FIELDS` is used.
            from_db (str, optional): The database to retrieve the ID from.
                Defaults to "UniProtKB_AC-ID".
            to_db (str, optional): The database to map to.
                Defaults to "UniProtKB-Swiss-Prot".

        Raises:
            ValueError: If we cannot retrieve the Uniprot ID.

        Returns:
            Protein: A processed and standardized protein object.
        """

        if not fields:
            fields = cls.UNIPROT_API_DEFAULT_FIELDS

        mapper = ProtMapper()

        result, error = mapper.get(
            ids=[uniprot_id], fields=fields, from_db=from_db, to_db=to_db
        )
        if error:
            raise ValueError(f"Uniprot id not retrieved: {error}")
        result.rename(columns={"From": "entry"}, inplace=True)
        if "Length" in result.columns:
            result["Length"] = pd.to_numeric(result["Length"])
        return cls.from_uniprot_row(result.iloc[0].to_dict())

    @classmethod
    def list_from_uniprot_ids(
        cls,
        uniprot_ids: list[str],
        fields: list[str] | None = None,
        from_db: str = "UniProtKB_AC-ID",
        to_db: str = "UniProtKB-Swiss-Prot",
    ) -> list[Protein]:
        """Create a list of Protein objects from a list of Uniprot IDs (fetches with Uniprot API)

        Args:
            uniprot_ids (list[str]): The Uniprot IDs of the proteins.
            fields (list[str] | None, optional): The fields to retrieve from
                Uniprot. If `None`, `Protein.UNIPROT_API_DEFAULT_FIELDS` is used.
            from_db (str, optional): The database to retrieve the IDs from.
                Defaults to "UniProtKB_AC-ID".
            to_db (str, optional): The database to map to.
                Defaults to "UniProtKB-Swiss-Prot".

        Raises:
            ValueError: If we cannot retrieve the Uniprot IDs.

        Returns:
            list[Protein]: A list of processed and standardized protein objects.
        """
        if not fields:
            fields = cls.UNIPROT_API_DEFAULT_FIELDS

        mapper = ProtMapper()

        result, error = mapper.get(
            ids=uniprot_ids, fields=fields, from_db=from_db, to_db=to_db
        )
        if error:
            raise ValueError(f"Uniprot id not retrieved: {error}")
        result.rename(columns={"From": "entry"}, inplace=True)

        if "Length" in result.columns:
            result["Length"] = pd.to_numeric(result["Length"])
        return [cls.from_uniprot_row(row.to_dict()) for _, row in result.iterrows()]

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Protein):
            return False
        return (
            self.data == other.data
            and self.sasa_data == other.sasa_data
            and self.charge_data == other.charge_data
            and self.cysteine_data == other.cysteine_data
        )

    def residue_data_frame(self) -> pd.DataFrame:
        d = dict(
            chain(
                self.get_charge().items(),
                self.get_sasa().items(),
                self.get_cysteine_data().items(),
                self.get_titration().items(),
            )
        )
        d["pLDDT"] = self.get_confidence()

        return pd.DataFrame(d)

    def get_biopandas_pdb_dataframe(self) -> PandasPdb:
        """Get the PDB dataframe for the protein.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `pdb_location_absolute` is not set.

        Returns:
            PandasPdb: A biopandas dataframe that contains the PDB file information.
        """
        if not self.pdb_location_absolute:
            raise ValueError("PDB location not set; use `fetch_pdb` first")
        ppdb = PandasPdb()
        return ppdb.read_pdb(self.pdb_location_absolute)

    def get_biopython_structure(self) -> Structure:
        """Get the biopython structure for the protein.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `pdb_location_absolute` is not set.
            ValueError: If the PDB file cannot be parsed.

        Returns:
            Structure: A biopython Structure object for the protein.
        """
        if not self.pdb_location_absolute:
            raise ValueError("PDB location not set; use `fetch_pdb` first")
        p = PDBParser(QUIET=True)
        structure = p.get_structure("", self.pdb_location_absolute)
        if not isinstance(structure, Structure):
            raise ValueError("Unable to parse PDB file.")
        return structure

    def get_biopython_residues(self) -> list[Residue]:
        """Get the biopython residues for the protein.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `pdb_location_absolute` is not set.

        Returns:
            list[Residue]: A list of biopython residues for the protein.
        """
        if not self.pdb_location_absolute:
            raise ValueError("PDB location not set; use `fetch_pdb` first")
        p = PDBParser(QUIET=True)
        structure = p.get_structure("", self.pdb_location_absolute)
        reslist = [res for model in structure for chain in model for res in chain]
        return reslist

    def get_confidence(self) -> list[float]:
        """Fetches precomputed confidence data from pdb file.

        Must run `self.fetch_pdb` first or specify an abosulute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `confidence_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            list[float]: A list of confidence values for each residue.
        """
        if self.confidence_data:
            return self.confidence_data

        if self.pdb_location_absolute:
            self.confidence_data = structure.confidence.residue_pLDDT(
                self.pdb_location_absolute,
            )
            return self.confidence_data
        else:
            raise ValueError(
                "Confidence data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_sasa(self) -> structure.sasa.SASAData:
        """Fetches precomputed SASA data for the protein, or computes it.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `sasa_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.sasa.SASAData: A :class:`protein_structure.sasa.SASAData`
                object containing the SASA values for residues and atoms.
        """
        if self.sasa_data:
            return self.sasa_data

        if self.pdb_location_absolute:
            self.sasa_data = structure.sasa.calculate_sasa(
                self.pdb_location_absolute,
            )
            return self.sasa_data
        else:
            raise ValueError(
                "SASA data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_charge(self, method: str = "gasteiger") -> structure.charge.ChargeData:
        """Fetches precomputed charge data for the protein, or computes it.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Args:
            method (str, optional): The method used for the charge calculation.
                Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to
                'gasteiger'. For a full list reference
                https://open-babel.readthedocs.io/en/latest/Charges/charges.html

        Raises:
            ValueError: If `charge_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.charge.ChargeData: A :class:`protein_structure.charge.ChargeData`
                object containing the charge values for residues and atoms.
        """
        if self.charge_data:
            if self.charge_data["charge_method"]:
                if self.charge_data["charge_method"][0] == method:
                    return self.charge_data

        if self.pdb_location_absolute:
            self.charge_data = structure.charge.calculate_charge(
                self.pdb_location_absolute,
                method=method,
            )

            self.last_charge_method = method

            return self.charge_data
        else:
            raise ValueError(
                "Charge data for specified method not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_cysteine_data(self) -> structure.cysteine_data.CysteineData:
        """Fetches precomputed size data for the protein, or computes it.

        Must run `self.fetch_pdb` first or specify an abosulute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `cysteine_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.cysteine_data.CysteineData: A :class:`protein_structure.cysteine_data.CysteineData`
                object containing the size values for cysteine sites.
        """
        if self.cysteine_data:
            return self.cysteine_data

        if self.pdb_location_absolute:
            self.cysteine_data = structure.cysteine_data.calculate_cysteine_data(
                self.pdb_location_absolute,
            )
            return self.cysteine_data
        else:
            raise ValueError(
                "Size data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_titration(self) -> structure.titration.TitrationData:
        """Runs the default titration calculation for the protein.

        Equivalent to running `self.get_titration_from_propka`.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `titration_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.titration.TitrationData: A
                :class:`protein_structure.titration.TitrationData` object containing
                the titration values for residues.
        """
        return self.get_titration_from_propka()

    def get_titration_from_propka(self) -> structure.titration.TitrationData:
        """Fetches precomputed titration data for the protein, or computes it.

        Uses :func:`protein_structure.titration.calculate_titration_propka` if
        `self.titration_data` is not already stored.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `titration_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.titration.TitrationData: A
                :class:`protein_structure.titration.TitrationData` object containing
                the titration values for residues."""
        if self.titration_data:
            return self.titration_data

        if self.pdb_location_absolute:
            self.titration_data = structure.titration.calculate_titration_propka(
                self.pdb_location_absolute,
            )
            return self.titration_data
        else:
            raise ValueError(
                "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_titration_from_pypka(self) -> structure.titration.TitrationData:
        """Fetches precomputed titration data for the protein, or computes it.

        Uses :func:`protein_structure.titration.calculate_titration_pypka` if
        `self.titration_data` is not already stored. Requires pypka to be
        installed, which has dependencies that are not FOSS. Please be sure to
        verify that you are legally allowed to use pypka.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `titration_data` is not already stored and
                `pdb_location_absolute` is not set.
            ImportError: If pypka is not installed.

        Returns:
            structure.titration.TitrationData: A
                :class:`protein_structure.titration.TitrationData` object containing
                the titration values for residues."""

        if self.titration_data:
            return self.titration_data

        if self.pdb_location_absolute:
            self.titration_data = structure.titration.calculate_titration_pypka(
                self.pdb_location_absolute,
            )
            return self.titration_data
        else:
            raise ValueError(
                "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def get_titration_from_pkai(self) -> structure.titration.TitrationData:
        """Fetches precomputed titration data for the protein, or computes it.

        Uses :func:`protein_structure.titration.calculate_titration_pkai` if
        `self.titration_data` is not already stored. Requires pkai to be
        installed. Note that this method is a deep-learning model, not a
        physics-based calculation.

        Must run `self.fetch_pdb` first or specify an absolute path to the PDB
        file in `self.pdb_location_absolute`.

        Raises:
            ValueError: If `titration_data` is not already stored and
                `pdb_location_absolute` is not set.

        Returns:
            structure.titration.TitrationData: A
                :class:`protein_structure.titration.TitrationData` object containing
                the titration values for residues."""
        if self.titration_data:
            return self.titration_data

        if self.pdb_location_absolute:
            self.titration_data = structure.titration.calculate_titration_pkai(
                self.pdb_location_absolute,
            )
            return self.titration_data
        else:
            raise ValueError(
                "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
            )

    def add_custom_site_data_column(
        self, key: str, site_data: list[Any], overwrite: bool = False
    ) -> None:
        if not self.custom_site_data.residue_number:
            self.custom_site_data.add_residue_numbers(len(self.data["sequence"]))
        self.custom_site_data.add_site_data(key, site_data, overwrite=overwrite)

    def unravel_sites(
        self,
        selected_aas: None | set[AminoAcidLetter] = None,
        selected_keys: None | set[str] = None,
    ) -> dict[str, list[Any]]:
        """Split the protein into individual sites, recording values for each.

        Args:
            selected_aas: A set of amino acid letters to include in the output.
                If `None` (default), all amino acids will be included.
            selected_keys: A set of keys belonging to this `Protein` object's
                `data` dictionary to include in the output. If `None` (default),
                all keys are used.

        Returns:
            dict[str, list[Any]]: A dictionary mapping keys to lists of values.
                Each list is a parallel array of the same length as the protein
                sequence (after filtering out non-selected amino acids)."""
        tbl = self.site_annotations.table() | self.custom_site_data.table()
        if selected_keys is None:
            selected_keys = (set(tbl.keys()) | set(self.data.keys())) - {"sequence"}
        tbl_keys = selected_keys & set(tbl.keys())
        data_keys = selected_keys & set(self.data.keys())
        assert tbl_keys.isdisjoint(data_keys)
        res: dict[str, list[Any]] = {k: [] for k in selected_keys}
        for index, site in enumerate(self.data["sequence"]):
            if selected_aas and site not in selected_aas:
                continue
            for k in tbl_keys:
                res[k].append(tbl[k][index])
            for k in data_keys:
                res[k].append(self.data[k])  # will be the same for all sites

        return res

    def fetch_pdb(self, save_path: str | None = None, url: str | None = None) -> None:
        """Fetches the PDB file for the protein (from the AlphaFold database by default).

        Args:
            save_path (str | None, optional): The path to save the PDB file to.
                If `None`, the protein name will be used as the file name.
                Defaults to `None`.
            url (str | None, optional): The URL to fetch the PDB file from.
                Defaults to `None`, in which case the AlphaFold database is used.

        Raises:
            Exception: If the response status code is not 200, meaning we could
                not fetch the PDB from the database."""
        if not url:
            url = f"https://alphafold.ebi.ac.uk/files/AF-{self.data['entry']}-F1-model_v4.pdb"
        if not save_path:
            save_path = f"{self.data['entry']}.pdb"

        response = requests.get(url)

        if response.status_code != 200:
            raise Exception(f"Failed to fetch PDB: {response.status_code}")

        with open(save_path, "wb+") as f:
            f.write(response.content)

        self.pdb_location_relative = save_path
        self.pdb_location_absolute = os.path.abspath(save_path)
        self._build_structure_index()

    def register_local_pdb(self, path_to_pdb_file: str | None = None) -> None:
        """Sets pdb file for protein object using local pdb file.

        Args:
            path_to_pdb_file (str | None, optional): Path to local PDB file.
                Defaults to `None`, in which case a file named '<entry>.pdb' is assumed."""
        if not path_to_pdb_file:
            path_to_pdb_file = f"{self.data['entry']}.pdb"
        self.pdb_location_relative = path_to_pdb_file
        self.pdb_location_absolute = os.path.abspath(path_to_pdb_file)
        self._build_structure_index()

    def _build_structure_index(self) -> None:
        self.structure_index = (
            self.get_biopandas_pdb_dataframe().df["ATOM"]["residue_number"].unique()
        )
        assert (
            self.structure_index is not None
        ), "Structure index is not built. PDB file may not be loaded correctly."
        self.sequence_position_to_structure_index = {
            self.structure_index[i]: i for i in range(len(self.structure_index))
        }

    def _is_site_aa(self, site: int, aa: AminoAcidLetter = "C") -> bool:
        if "sequence" not in self.data:
            raise ValueError("Sequence entry not found in data")

        sequence = self.data["sequence"]

        return site <= len(sequence) and sequence[site - 1] == aa
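
A minimal end-to-end sketch of how this class is typically used, assuming Protein is re-exported from the top-level procaliper package (as the module summary above suggests) and that UniProt and AlphaFold are reachable; the accession below is an illustrative placeholder.

from procaliper import Protein

protein = Protein.from_uniprot_id("P04637")  # placeholder accession
protein.fetch_pdb()                          # downloads the AlphaFold model and indexes residues

# Per-residue structural features (charge, SASA, cysteine data, titration, pLDDT) as one DataFrame.
df = protein.residue_data_frame()
print(df.head())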

fetch_pdb(save_path=None, url=None)

Fetches the PDB file for the protein (from the AlphaFold database by default).

Parameters:

    save_path (str | None): The path to save the PDB file to. If None, the protein name will be used as the file name. Defaults to None.
    url (str | None): The URL to fetch the PDB file from. Defaults to None, in which case the AlphaFold database is used.

Raises:

    Exception: If the response status code is not 200, meaning we could not fetch the PDB from the database.

Source code in procaliper/_protein.py
def fetch_pdb(self, save_path: str | None = None, url: str | None = None) -> None:
    """Fetches the PDB file for the protein (from the AlphaFold database by default).

    Args:
        save_path (str | None, optional): The path to save the PDB file to.
            If `None`, the protein name will be used as the file name.
            Defaults to `None`.
        url (str | None, optional): The URL to fetch the PDB file from.
            Defaults to `None`, in which case the AlphaFold database is used.

    Raises:
        Exception: If the response status code is not 200, meaning we could
            not fetch the PDB from the database."""
    if not url:
        url = f"https://alphafold.ebi.ac.uk/files/AF-{self.data['entry']}-F1-model_v4.pdb"
    if not save_path:
        save_path = f"{self.data['entry']}.pdb"

    response = requests.get(url)

    if response.status_code != 200:
        raise Exception(f"Failed to fetch PDB: {response.status_code}")

    with open(save_path, "wb+") as f:
        f.write(response.content)

    self.pdb_location_relative = save_path
    self.pdb_location_absolute = os.path.abspath(save_path)
    self._build_structure_index()
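
Usage sketch: with no arguments the file is saved as '<entry>.pdb' in the working directory; both the destination and the source URL can be overridden (the file name below is illustrative).

protein.fetch_pdb(save_path="my_protein.pdb")   # protein built earlier, e.g. via from_uniprot_id
print(protein.pdb_location_absolute)            # set by fetch_pdb, used by the get_* accessors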

from_uniprot_id(uniprot_id, fields=None, from_db='UniProtKB_AC-ID', to_db='UniProtKB-Swiss-Prot') classmethod

Create a new Protein object from a Uniprot ID (fetches with Uniprot API)

Parameters:

    uniprot_id (str): The Uniprot ID of the protein. Required.
    fields (list[str] | None): The fields to retrieve from Uniprot. If None, Protein.UNIPROT_API_DEFAULT_FIELDS is used. Defaults to None.
    from_db (str): The database to retrieve the ID from. Defaults to "UniProtKB_AC-ID".
    to_db (str): The database to map to. Defaults to "UniProtKB-Swiss-Prot".

Raises:

    ValueError: If we cannot retrieve the Uniprot ID.

Returns:

    Protein: A processed and standardized protein object.

Source code in procaliper/_protein.py
@classmethod
def from_uniprot_id(
    cls,
    uniprot_id: str,
    fields: list[str] | None = None,
    from_db: str = "UniProtKB_AC-ID",
    to_db: str = "UniProtKB-Swiss-Prot",
) -> Protein:
    """Create a new Protein object from a Uniprot ID (fetches with Uniprot API)

    Args:
        uniprot_id (str): The Uniprot ID of the protein.
        fields (list[str] | None, optional): The fields to retrieve from
            Uniprot. If `None`, `Protein.UNIPROT_API_DEFAULT_FIELDS` is used.
        from_db (str, optional): The database to retrieve the ID from.
            Defaults to "UniProtKB_AC-ID".
        to_db (str, optional): The database to map to.
            Defaults to "UniProtKB-Swiss-Prot".

    Raises:
        ValueError: If we cannot retrieve the Uniprot ID.

    Returns:
        Protein: A processed and standardized protein object.
    """

    if not fields:
        fields = cls.UNIPROT_API_DEFAULT_FIELDS

    mapper = ProtMapper()

    result, error = mapper.get(
        ids=[uniprot_id], fields=fields, from_db=from_db, to_db=to_db
    )
    if error:
        raise ValueError(f"Uniprot id not retrieved: {error}")
    result.rename(columns={"From": "entry"}, inplace=True)
    if "Length" in result.columns:
        result["Length"] = pd.to_numeric(result["Length"])
    return cls.from_uniprot_row(result.iloc[0].to_dict())
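
For example (a sketch; the accession is a placeholder and the call needs network access to the UniProt ID-mapping service used by ProtMapper):

protein = Protein.from_uniprot_id("P04637")
print(protein.data["entry"])            # the mapped accession
print(protein.data["sequence"][:10])    # first ten residues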

from_uniprot_row(row) classmethod

Create a new Protein object from a row from a Uniprot table

Parameters:

    row (dict[str, Any]): Contains the data from the Uniprot table. Must have "Sequence" or "sequence" as a key. Required.

Raises:

    ValueError: If "Sequence" or "sequence" is not found in the row.

Returns:

    Protein: A processed and standardized protein object.

Source code in procaliper/_protein.py
@classmethod
def from_uniprot_row(cls, row: dict[str, Any]) -> Protein:
    """Create a new Protein object from a row from a Uniprot table

    Args:
        row (dict[str, Any]): Contains the data from the Uniprot table. Must
            have "Sequence" or "sequence" as a key.

    Raises:
        ValueError: If "Sequence" or "sequence" is not found in the row.

    Returns:
        Protein: A processed and standardized protein object.
    """
    p = cls()
    if "Sequence" in row:
        p.data["sequence"] = row["Sequence"]
    elif "sequence" in row:
        p.data["sequence"] = row["sequence"]
    else:
        raise ValueError(f"Sequence not found in row: {row}")
    p.custom_site_data.add_residue_numbers(len(p.data["sequence"]))
    p.site_annotations = SiteAnnotations(p.data["sequence"])
    for key, value in row.items():
        key = p._rectify_label(key)
        if key in cls.UNIPROT_SITE_PATTERNS_RECTIFIED:
            uniprot_description_id = cls.UNIPROT_SITE_PATTERNS_RECTIFIED[key]
            p.site_annotations.extract_annotation(uniprot_description_id, value)
        elif key in cls.UNIPROT_SITE_PATTERNS:
            uniprot_description_id = cls.UNIPROT_SITE_PATTERNS[key]
            p.site_annotations.extract_annotation(uniprot_description_id, value)
        else:
        if value != value:  # NaN check: NaN != NaN, so missing (NaN) values become empty strings
                value = ""
            p.data[key] = value
    return p
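
A sketch of the offline path: rows of a UniProt export loaded with pandas can be passed directly, as long as a "Sequence" (or "sequence") column is present. The file name is hypothetical.

import pandas as pd

table = pd.read_csv("uniprot_export.tsv", sep="\t")  # hypothetical UniProt TSV export
proteins = [Protein.from_uniprot_row(row.to_dict()) for _, row in table.iterrows()]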

get_biopandas_pdb_dataframe()

Get the PDB dataframe for the protein.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If pdb_location_absolute is not set.

Returns:

    PandasPdb: A biopandas dataframe that contains the PDB file information.

Source code in procaliper/_protein.py
def get_biopandas_pdb_dataframe(self) -> PandasPdb:
    """Get the PDB dataframe for the protein.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `pdb_location_absolute` is not set.

    Returns:
        PandasPdb: A biopandas dataframe that contains the PDB file information.
    """
    if not self.pdb_location_absolute:
        raise ValueError("PDB location not set; use `fetch_pdb` first")
    ppdb = PandasPdb()
    return ppdb.read_pdb(self.pdb_location_absolute)
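
For instance, once a PDB file is registered the per-atom table can be inspected directly (a sketch using biopandas' standard df["ATOM"] accessor):

ppdb = protein.get_biopandas_pdb_dataframe()
atoms = ppdb.df["ATOM"]
print(atoms[["residue_number", "residue_name", "b_factor"]].head())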

get_biopython_residues()

Get the biopython residues for the protein.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If pdb_location_absolute is not set.

Returns:

    list[Residue]: A list of biopython residues for the protein.

Source code in procaliper/_protein.py
def get_biopython_residues(self) -> list[Residue]:
    """Get the biopython residues for the protein.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `pdb_location_absolute` is not set.

    Returns:
        list[Residue]: A list of biopython residues for the protein.
    """
    if not self.pdb_location_absolute:
        raise ValueError("PDB location not set; use `fetch_pdb` first")
    p = PDBParser(QUIET=True)
    structure = p.get_structure("", self.pdb_location_absolute)
    reslist = [res for model in structure for chain in model for res in chain]
    return reslist

get_biopython_structure()

Get the biopython structure for the protein.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If pdb_location_absolute is not set.
    ValueError: If the PDB file cannot be parsed.

Returns:

    Structure: A biopython Structure object for the protein.

Source code in procaliper/_protein.py
def get_biopython_structure(self) -> Structure:
    """Get the biopython structure for the protein.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `pdb_location_absolute` is not set.
        ValueError: If the PDB file cannot be parsed.

    Returns:
        Structure: A biopython Structure object for the protein.
    """
    if not self.pdb_location_absolute:
        raise ValueError("PDB location not set; use `fetch_pdb` first")
    p = PDBParser(QUIET=True)
    structure = p.get_structure("", self.pdb_location_absolute)
    if not isinstance(structure, Structure):
        raise ValueError("Unable to parse PDB file.")
    return structure
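
Sketch of dropping down to Biopython for anything not covered by the helper methods:

structure = protein.get_biopython_structure()
chain_ids = [chain.id for model in structure for chain in model]
print(chain_ids)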

get_charge(method='gasteiger')

Fetches precomputed charge data for the protein, or computes it.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Parameters:

    method (str): The method used for the charge calculation. Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to 'gasteiger'. For a full list, reference https://open-babel.readthedocs.io/en/latest/Charges/charges.html

Raises:

    ValueError: If charge_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.charge.ChargeData: A ChargeData object containing the charge values for residues and atoms.

Source code in procaliper/_protein.py
def get_charge(self, method: str = "gasteiger") -> structure.charge.ChargeData:
    """Fetches precomputed charge data for the protein, or computes it.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Args:
        method (str, optional): The method used for the charge calculation.
            Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to
            'gasteiger'. For a full list reference
            https://open-babel.readthedocs.io/en/latest/Charges/charges.html

    Raises:
        ValueError: If `charge_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.charge.ChargeData: A :class:`protein_structure.charge.ChargeData`
            object containing the charge values for residues and atoms.
    """
    if self.charge_data:
        if self.charge_data["charge_method"]:
            if self.charge_data["charge_method"][0] == method:
                return self.charge_data

    if self.pdb_location_absolute:
        self.charge_data = structure.charge.calculate_charge(
            self.pdb_location_absolute,
            method=method,
        )

        self.last_charge_method = method

        return self.charge_data
    else:
        raise ValueError(
            "Charge data for specified method not stored, and PDB location not set; use `fetch_pdb` first"
        )
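
Usage sketch: the cached result is keyed on the charge method, so asking for a different Open Babel method recomputes instead of returning stale values.

gasteiger = protein.get_charge()          # default method, computed then cached
eem = protein.get_charge(method="eem")    # different method triggers a recomputation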

get_confidence()

Fetches precomputed confidence data from the PDB file.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If confidence_data is not already stored and pdb_location_absolute is not set.

Returns:

    list[float]: A list of confidence values for each residue.

Source code in procaliper/_protein.py
def get_confidence(self) -> list[float]:
    """Fetches precomputed confidence data from pdb file.

    Must run `self.fetch_pdb` first or specify an abosulute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `confidence_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        list[float]: A list of confidence values for each residue.
    """
    if self.confidence_data:
        return self.confidence_data

    if self.pdb_location_absolute:
        self.confidence_data = structure.confidence.residue_pLDDT(
            self.pdb_location_absolute,
        )
        return self.confidence_data
    else:
        raise ValueError(
            "Confidence data not stored, and PDB location not set; use `fetch_pdb` first"
        )
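
For AlphaFold models the per-residue pLDDT score is stored in the PDB B-factor field, so this returns one confidence value per residue; a quick sketch of flagging low-confidence positions:

plddt = protein.get_confidence()
low_confidence = [i + 1 for i, score in enumerate(plddt) if score < 50]  # 1-indexed residue numbers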

get_cysteine_data()

Fetches precomputed cysteine site data for the protein, or computes it.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If cysteine_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.cysteine_data.CysteineData: A CysteineData object containing the size values for cysteine sites.

Source code in procaliper/_protein.py
def get_cysteine_data(self) -> structure.cysteine_data.CysteineData:
    """Fetches precomputed size data for the protein, or computes it.

    Must run `self.fetch_pdb` first or specify an abosulute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `cysteine_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.cysteine_data.CysteineData: A :class:`protein_structure.cysteine_data.CysteineData`
            object containing the size values for cysteine sites.
    """
    if self.cysteine_data:
        return self.cysteine_data

    if self.pdb_location_absolute:
        self.cysteine_data = structure.cysteine_data.calculate_cysteine_data(
            self.pdb_location_absolute,
        )
        return self.cysteine_data
    else:
        raise ValueError(
            "Size data not stored, and PDB location not set; use `fetch_pdb` first"
        )

get_sasa()

Fetches precomputed SASA data for the protein, or computes it.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If sasa_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.sasa.SASAData: A SASAData object containing the SASA values for residues and atoms.

Source code in procaliper/_protein.py
def get_sasa(self) -> structure.sasa.SASAData:
    """Fetches precomputed SASA data for the protein, or computes it.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `sasa_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.sasa.SASAData: A :class:`protein_structure.sasa.SASAData`
            object containing the SASA values for residues and atoms.
    """
    if self.sasa_data:
        return self.sasa_data

    if self.pdb_location_absolute:
        self.sasa_data = structure.sasa.calculate_sasa(
            self.pdb_location_absolute,
        )
        return self.sasa_data
    else:
        raise ValueError(
            "SASA data not stored, and PDB location not set; use `fetch_pdb` first"
        )
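
Usage sketch showing the caching behaviour shared by the get_* accessors: the first call computes from the registered PDB file, later calls return the stored SASAData.

sasa = protein.get_sasa()    # computed from the PDB on the first call
sasa = protein.get_sasa()    # returned from self.sasa_data on subsequent calls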

get_titration()

Runs the default titration calculation for the protein.

Equivalent to running self.get_titration_from_propka.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If titration_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.titration.TitrationData: A TitrationData object containing the titration values for residues.

Source code in procaliper/_protein.py
def get_titration(self) -> structure.titration.TitrationData:
    """Runs the default titration calculation for the protein.

    Equivalent to running `self.get_titration_from_propka`.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `titration_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.titration.TitrationData: A
            :class:`protein_structure.titration.TitrationData` object containing
            the titration values for residues.
    """
    return self.get_titration_from_propka()
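
A sketch of choosing a titration backend explicitly. Whichever backend runs first populates self.titration_data, and every titration accessor returns that cached result afterwards, so clear the cache before switching backends.

pka = protein.get_titration()                 # default: propka
# protein.titration_data = None               # drop the cache to switch backends
# pka_ml = protein.get_titration_from_pkai()  # deep-learning alternative (needs pkai installed)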

get_titration_from_pkai()

Fetches precomputed titration data for the protein, or computes it.

Uses protein_structure.titration.calculate_titration_pkai if self.titration_data is not already stored. Requires pkai to be installed. Note that this method is a deep-learning model, not a physics-based calculation.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If titration_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.titration.TitrationData: A TitrationData object containing the titration values for residues.

Source code in procaliper/_protein.py
def get_titration_from_pkai(self) -> structure.titration.TitrationData:
    """Fetches precomputed titration data for the protein, or computes it.

    Uses :func:`protein_structure.titration.calculate_titration_pkai` if
    `self.titration_data` is not already stored. Requires pkai to be
    installed. Note that this method is a deep-learning model, not a
    physics-based calculation.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `titration_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.titration.TitrationData: A
            :class:`protein_structure.titration.TitrationData` object containing
            the titration values for residues."""
    if self.titration_data:
        return self.titration_data

    if self.pdb_location_absolute:
        self.titration_data = structure.titration.calculate_titration_pkai(
            self.pdb_location_absolute,
        )
        return self.titration_data
    else:
        raise ValueError(
            "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
        )

get_titration_from_propka()

Fetches precomputed titration data for the protein, or computes it.

Uses protein_structure.titration.calculate_titration_propka if self.titration_data is not already stored.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If titration_data is not already stored and pdb_location_absolute is not set.

Returns:

    structure.titration.TitrationData: A TitrationData object containing the titration values for residues.

Source code in procaliper/_protein.py
def get_titration_from_propka(self) -> structure.titration.TitrationData:
    """Fetches precomputed titration data for the protein, or computes it.

    Uses :func:`protein_structure.titration.calculate_titration_propka` if
    `self.titration_data` is not already stored.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `titration_data` is not already stored and
            `pdb_location_absolute` is not set.

    Returns:
        structure.titration.TitrationData: A
            :class:`protein_structure.titration.TitrationData` object containing
            the titration values for residues."""
    if self.titration_data:
        return self.titration_data

    if self.pdb_location_absolute:
        self.titration_data = structure.titration.calculate_titration_propka(
            self.pdb_location_absolute,
        )
        return self.titration_data
    else:
        raise ValueError(
            "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
        )

get_titration_from_pypka()

Fetches precomputed titration data for the protein, or computes it.

Uses protein_structure.titration.calculate_titration_pypka if self.titration_data is not already stored. Requires pypka to be installed, which has dependencies that are not FOSS. Please be sure to verify that you are legally allowed to use pypka.

Must run self.fetch_pdb first or specify an absolute path to the PDB file in self.pdb_location_absolute.

Raises:

    ValueError: If titration_data is not already stored and pdb_location_absolute is not set.
    ImportError: If pypka is not installed.

Returns:

    structure.titration.TitrationData: A TitrationData object containing the titration values for residues.

Source code in procaliper/_protein.py
def get_titration_from_pypka(self) -> structure.titration.TitrationData:
    """Fetches precomputed titration data for the protein, or computes it.

    Uses :func:`protein_structure.titration.calculate_titration_pypka` if
    `self.titration_data` is not already stored. Requires pypka to be
    installed, which has dependencies that are not FOSS. Please be sure to
    verify that you are legally allowed to use pypka.

    Must run `self.fetch_pdb` first or specify an absolute path to the PDB
    file in `self.pdb_location_absolute`.

    Raises:
        ValueError: If `titration_data` is not already stored and
            `pdb_location_absolute` is not set.
        ImportError: If pypka is not installed.

    Returns:
        structure.titration.TitrationData: A
            :class:`protein_structure.titration.TitrationData` object containing
            the titration values for residues."""

    if self.titration_data:
        return self.titration_data

    if self.pdb_location_absolute:
        self.titration_data = structure.titration.calculate_titration_pypka(
            self.pdb_location_absolute,
        )
        return self.titration_data
    else:
        raise ValueError(
            "Titration data not stored, and PDB location not set; use `fetch_pdb` first"
        )

list_from_uniprot_ids(uniprot_ids, fields=None, from_db='UniProtKB_AC-ID', to_db='UniProtKB-Swiss-Prot') classmethod

Create a list of Protein objects from a list of Uniprot IDs (fetches with Uniprot API)

Parameters:

    uniprot_ids (list[str]): The Uniprot IDs of the proteins. Required.
    fields (list[str] | None): The fields to retrieve from Uniprot. If None, Protein.UNIPROT_API_DEFAULT_FIELDS is used. Defaults to None.
    from_db (str): The database to retrieve the IDs from. Defaults to "UniProtKB_AC-ID".
    to_db (str): The database to map to. Defaults to "UniProtKB-Swiss-Prot".

Raises:

    ValueError: If we cannot retrieve the Uniprot IDs.

Returns:

    list[Protein]: A list of processed and standardized protein objects.

Source code in procaliper/_protein.py
@classmethod
def list_from_uniprot_ids(
    cls,
    uniprot_ids: list[str],
    fields: list[str] | None = None,
    from_db: str = "UniProtKB_AC-ID",
    to_db: str = "UniProtKB-Swiss-Prot",
) -> list[Protein]:
    """Create a list of Protein objects from a list of Uniprot IDs (fetches with Uniprot API)

    Args:
        uniprot_ids (list[str]): The Uniprot IDs of the proteins.
        fields (list[str] | None, optional): The fields to retrieve from
            Uniprot. If `None`, `Protein.UNIPROT_API_DEFAULT_FIELDS` is used.
        from_db (str, optional): The database to retrieve the IDs from.
            Defaults to "UniProtKB_AC-ID".
        to_db (str, optional): The database to map to.
            Defaults to "UniProtKB-Swiss-Prot".

    Raises:
        ValueError: If we cannot retrieve the Uniprot IDs.

    Returns:
        list[Protein]: A list of processed and standardized protein objects.
    """
    if not fields:
        fields = cls.UNIPROT_API_DEFAULT_FIELDS

    mapper = ProtMapper()

    result, error = mapper.get(
        ids=uniprot_ids, fields=fields, from_db=from_db, to_db=to_db
    )
    if error:
        raise ValueError(f"Uniprot id not retrieved: {error}")
    result.rename(columns={"From": "entry"}, inplace=True)

    if "Length" in result.columns:
        result["Length"] = pd.to_numeric(result["Length"])
    return [cls.from_uniprot_row(row.to_dict()) for _, row in result.iterrows()]
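
Batch sketch (placeholder accessions; a single ID-mapping request covers the whole list):

proteins = Protein.list_from_uniprot_ids(["P04637", "P68871"])
for protein in proteins:
    print(protein.data["entry"], len(protein.data["sequence"]))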

register_local_pdb(path_to_pdb_file=None)

Sets the PDB file for the protein object using a local PDB file.

Parameters:

    path_to_pdb_file (str | None): Path to the local PDB file. Defaults to None, in which case a file named '<entry>.pdb' is assumed.
Source code in procaliper/_protein.py
def register_local_pdb(self, path_to_pdb_file: str | None = None) -> None:
    """Sets pdb file for protein object using local pdb file.

    Args:
        path_to_pdb_file (str | None, optional): Path to local PDB file.
            Defaults to `None`, in which case a file named '<entry>.pdb' is assumed."""
    if not path_to_pdb_file:
        path_to_pdb_file = f"{self.data['entry']}.pdb"
    self.pdb_location_relative = path_to_pdb_file
    self.pdb_location_absolute = os.path.abspath(path_to_pdb_file)
    self._build_structure_index()
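
Sketch for working offline with a structure file that is already on disk (the file name is hypothetical):

protein.register_local_pdb("AF-P04637-F1-model_v4.pdb")  # hypothetical local file
print(protein.structure_index[:5])                       # residue numbers found in the PDB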

unravel_sites(selected_aas=None, selected_keys=None)

Split the protein into individual sites, recording values for each.

Parameters:

    selected_aas (None | set[AminoAcidLetter]): A set of amino acid letters to include in the output. If None (default), all amino acids will be included.
    selected_keys (None | set[str]): A set of keys belonging to this Protein object's data dictionary to include in the output. If None (default), all keys are used.

Returns:

    dict[str, list[Any]]: A dictionary mapping keys to lists of values. Each list is a parallel array of the same length as the protein sequence (after filtering out non-selected amino acids).

Source code in procaliper/_protein.py
def unravel_sites(
    self,
    selected_aas: None | set[AminoAcidLetter] = None,
    selected_keys: None | set[str] = None,
) -> dict[str, list[Any]]:
    """Split the protein into individual sites, recording values for each.

    Args:
        selected_aas: A set of amino acid letters to include in the output.
            If `None` (default), all amino acids will be included.
        selected_keys: A set of keys belonging to this `Protein` object's
            `data` dictionary to include in the output. If `None` (default),
            all keys are used.

    Returns:
        dict[str, list[Any]]: A dictionary mapping keys to lists of values.
            Each list is a parallel array of the same length as the protein
            sequence (after filtering out non-selected amino acids)."""
    tbl = self.site_annotations.table() | self.custom_site_data.table()
    if selected_keys is None:
        selected_keys = (set(tbl.keys()) | set(self.data.keys())) - {"sequence"}
    tbl_keys = selected_keys & set(tbl.keys())
    data_keys = selected_keys & set(self.data.keys())
    assert tbl_keys.isdisjoint(data_keys)
    res: dict[str, list[Any]] = {k: [] for k in selected_keys}
    for index, site in enumerate(self.data["sequence"]):
        if selected_aas and site not in selected_aas:
            continue
        for k in tbl_keys:
            res[k].append(tbl[k][index])
        for k in data_keys:
            res[k].append(self.data[k])  # will be the same for all sites

    return res
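
For example, restricting to cysteines and a couple of keys yields parallel per-site lists; this sketch assumes custom columns added via add_custom_site_data_column appear in the unraveled table under the same key.

n = len(protein.data["sequence"])
protein.add_custom_site_data_column("is_flagged", [False] * n)  # hypothetical per-residue column
sites = protein.unravel_sites(selected_aas={"C"}, selected_keys={"entry", "is_flagged"})
# sites["entry"] repeats the protein-level value once per cysteine;
# sites["is_flagged"] holds the per-residue values at cysteine positions.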

network

contact_network(protein, max_dist_angstroms=10.0)

Constructs a contact network from a protein.

Parameters:

    protein (Protein): Protein object. Required.
    max_dist_angstroms (float): Maximum distance between residues to be considered a contact. Defaults to 10.0.

Returns:

    nx.Graph: Contact network.

Source code in procaliper/network.py
def contact_network(protein: Protein, max_dist_angstroms: float = 10.0) -> nx.Graph:
    """Constructs a contact network from a protein.

    Args:
        protein (Protein): Protein object.
        max_dist_angstroms (float, optional): Maximum distance between residues to be considered a contact. Defaults to 10.0.

    Returns:
        nx.Graph: Contact network.
    """
    return nx.from_numpy_array(
        psd.contact_map(protein.get_biopython_structure(), max_dist_angstroms)
    )
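
Example (a minimal sketch; it assumes `protein` is a `Protein` with a structure already registered, e.g. via `register_local_pdb`, and that the module is importable as `procaliper.network`):

import procaliper.network as network

g = network.contact_network(protein, max_dist_angstroms=8.0)
print(g.number_of_nodes(), g.number_of_edges())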

distance_network(protein, max_dist_angstroms=20)

Constructs a distance network from a protein.

Parameters:

    protein (Protein): Protein object.
    max_dist_angstroms (float, optional): Maximum distance between residues. Values
        greater than this will be set to np.inf. Defaults to 20.

Returns:

    nx.Graph: Distance network.

Source code in procaliper/network.py
def distance_network(protein: Protein, max_dist_angstroms: float = 20) -> nx.Graph:
    """Constructs a distance network from a protein.

    Args:
        protein (Protein): Protein object.
        max_dist_angstroms (float, optional): Maximum distance between residues.
            Values greater than this will be set to np.inf. Defaults to 20.

    Returns:
        nx.Graph: Distance network.
    """
    g = nx.from_numpy_array(
        psd.distance_matrix(protein.get_biopython_structure(), max_dist_angstroms)
    )
    for u, v, d in list(g.edges(data=True)):
        if d["weight"] == np.inf:
            g.remove_edge(u, v)
        else:
            d["proximity"] = 1 / (d["weight"] + 1)
            d["d2"] = d["weight"] ** 2
    return g
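
Example (a minimal sketch under the same assumptions as above: `protein` already has a structure registered):

import procaliper.network as network

g = network.distance_network(protein, max_dist_angstroms=15)
# Each surviving edge carries "weight" (Angstroms), "proximity" = 1 / (weight + 1),
# and "d2" = weight ** 2; pairs beyond the cutoff have no edge.
u, v, attrs = next(iter(g.edges(data=True)))
print(attrs["weight"], attrs["proximity"], attrs["d2"])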

euclidean_backbone(g)

Returns the Euclidean backbone of a distance network.

The Euclidean backbone of a weighted graph g is the smallest subgraph of g that contains all shortest paths where a path length is determined by the square root of the sum of the squared edge weights.

This is useful for sparsifying a distance network without disconnecting it.

Parameters:

    g (nx.Graph): Distance network. Edges must have an attribute "d2" representing
        the squared edge weight. This is computed by distance_network and
        regulatory_distance_network automatically.

Returns:

    nx.Graph: Euclidean backbone.

Source code in procaliper/network.py
def euclidean_backbone(g: nx.Graph) -> nx.Graph:
    """Returns the Euclidean backbone of a distance network.

    The Euclidean backbone of a weighted graph g is the smallest subgraph of g that contains
    all shortest paths where a path length is determined by the square root of the sum of the
    squared edge weights.

    This is useful for sparsifying a distance network without disconnecting it.

    Args:
        g (nx.Graph): Distance network. Edges must have an attribute "d2" representing the
            squared edge weight. This is computed by `distance_network` and
            `regulatory_distance_network` automatically.

    Returns:
        nx.Graph: Euclidean backbone.
    """
    return dc.backbone(g, weight="d2", kind="metric")
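
Example (a minimal sketch; `protein` is assumed to have a registered structure so that `distance_network` can supply the "d2" edge attribute):

import procaliper.network as network

g = network.distance_network(protein)
backbone = network.euclidean_backbone(g)  # sparser, but still connected
print(g.number_of_edges(), "->", backbone.number_of_edges())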

regulatory_distance_network(protein)

Constructs a regulatory region distance network from a protein.

Distances are computed between PTM sites, annotated regions, binding sites, and active sites.

Node labels will be 1-indexed and inclusive (e.g., "K5..C7" refers to residues 5, 6, and 7). The letters preceding the indices are the amino acids at the first and last positions of the region.

Parameters:

    protein (Protein): Protein object.

Returns:

    nx.Graph: Distance network.

Source code in procaliper/network.py
def regulatory_distance_network(protein: Protein) -> nx.Graph:
    """Constructs a regulatory region distance network from a protein.

    Distances are computed between PTM sites, annotated regions, binding sites, and active sites.

    Node labels will be 1-indexed and inclusive (e.g., `"K5..C7"` refers to residues 5, 6, and 7).
    The letter in front of the index refers to the first and last amino acid in the region.

    Args:
        protein (Protein): Protein object.

    Returns:
        nx.Graph: Distance network.
    """
    if protein.sequence_position_to_structure_index is None:
        raise ValueError(
            "Protein structure not loaded; use `fetch_pdb`  or `register_local_pdb` first"
        )

    ptms = {f"p_{i}": [i] for i, x in enumerate(protein.site_annotations.ptm) if x}
    binding = {
        f"b_{i}": [i] for i, x in enumerate(protein.site_annotations.binding) if x
    }
    active = {f"a_{i}": [i] for i, x in enumerate(protein.site_annotations.active) if x}
    regions = protein.site_annotations.regions
    domains = protein.site_annotations.domains

    all_regs = {**ptms, **binding, **active, **regions, **domains}

    # residues, excluding heteroatoms and water
    protein_residues = [
        res for res in protein.get_biopython_residues() if res.get_id()[0] == " "
    ]

    all_regs_residues = {}
    for k, v in all_regs.items():
        structure_matched = []
        for i in v:
            if i in protein.sequence_position_to_structure_index:
                res_ind = protein.sequence_position_to_structure_index[i]
                structure_matched.append(protein_residues[res_ind])
        if structure_matched:
            all_regs_residues[k] = structure_matched

    g = nx.Graph()
    for k, v in all_regs.items():
        g.add_node(
            k,
            label=_region_label(v, protein.data["sequence"]),
            region_type=_region_type(k),
            residues=v,
        )

    for k1, v1 in all_regs_residues.items():
        for k2, v2 in all_regs_residues.items():
            if k1 == k2:
                continue
            weight = psd.region_distance(v1, v2)
            g.add_edge(k1, k2, weight=weight, d2=weight**2, proximity=1 / (weight + 1))

    return g
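
Example (a minimal sketch; it assumes `protein` has both UniProt site annotations and a registered structure, since the function raises `ValueError` otherwise):

import procaliper.network as network

reg = network.regulatory_distance_network(protein)
for node, attrs in reg.nodes(data=True):
    print(node, attrs["label"], attrs["region_type"])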

protein_structure

calculate_charge(pdb_filename, method='gasteiger')

Computes the charge of residue sites in a PDB file.

By default, the method used is 'gasteiger', but this is configurable in hyperparameters.py.

Parameters:

    pdb_filename (str): The path to the PDB file.
    method (str, optional): The method used for the charge calculation. Examples
        include 'qtpie', 'eem', 'gasteiger'. Defaults to 'gasteiger'. For a full
        list, see https://open-babel.readthedocs.io/en/latest/Charges/charges.html

Raises:

    ValueError: If the charge method is not found.

Returns:

    ChargeData: A data class holding charge data computed from a PDB file.

Source code in procaliper/protein_structure/charge.py
def calculate_charge(pdb_filename: str, method: str = "gasteiger") -> ChargeData:
    """Computes the charge of residue sites in a PDB file.

    By default, the method used is 'gasteiger', but this is configurable in
    `hyperparameters.py`.

    Args:
        pdb_filename (str): The path to the PDB file.
        method (str, optional): The method used for the charge calculation.
            Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to
            'gasteiger'. For a full list, see
            https://open-babel.readthedocs.io/en/latest/Charges/charges.html

    Raises:
        ValueError: If the charge method is not found.

    Returns:
        ChargeData: A data class holding charge data computed from a PDB file.
    """
    pbmol = next(pybel.readfile("pdb", pdb_filename))
    mol = pbmol.OBMol

    # Applies the model and computes charges.
    ob_charge_model = ob.OBChargeModel.FindType(method)

    if not ob_charge_model:
        raise ValueError("Charge method not found. Please check hyperparameters.py")
    ob_charge_model.ComputeCharges(mol)

    charges = cast(list[float], ob_charge_model.GetPartialCharges())

    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    # Set up dict
    res = ChargeData(
        {
            "charge": [],
            "charge_method": [],
        }
    )

    for _, residue in sorted(ppdb.df["ATOM"].groupby("residue_number")):
        res["charge"].append([charges[x - 1] for x in sorted(residue["atom_number"])])
        res["charge_method"].append(method)

    return res
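
Example (a minimal sketch; the PDB path is hypothetical and the function is imported from its source module):

from procaliper.protein_structure.charge import calculate_charge

charge_data = calculate_charge("example.pdb", method="gasteiger")  # hypothetical path
print(charge_data["charge"][0])         # per-atom partial charges of residue 1
print(charge_data["charge_method"][0])  # 'gasteiger'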

calculate_cysteine_data(pdb_filename)

Calculates spatial data for a protein from a PDB file.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    CysteineData: A data class holding cysteine spatial data computed from a PDB file.

Source code in procaliper/protein_structure/cysteine_data.py
def calculate_cysteine_data(pdb_filename: str) -> CysteineData:
    """Calculates spatial data for a protein from a PDB file.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        CysteineData: A data class holding cysteine spatial data computed from a PDB file.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    res = CysteineData(
        {
            "cys_ratio": [],
            "min_dist_to_closest_sulfur": [],
            "sulfur_closeness_rating_scaled": [],
        }
    )

    total_residue = cast(int, max(ppdb.df["ATOM"]["residue_number"]))

    cys_positions: list[tuple[float, float, float]] = []
    for x in range(len(ppdb.df["ATOM"])):
        if ppdb.df["ATOM"]["residue_name"][x] == "CYS":
            if ppdb.df["ATOM"]["atom_name"][x] == "SG":
                cys_positions.append(
                    (
                        ppdb.df["ATOM"]["x_coord"][x],
                        ppdb.df["ATOM"]["y_coord"][x],
                        ppdb.df["ATOM"]["z_coord"][x],
                    )
                )
    total_cys_sites = len(cys_positions)

    cys_index = 0

    for _, grp in sorted(ppdb.df["ATOM"].groupby("residue_number")):
        if grp["residue_name"].max() == "CYS":
            sg_closeness_rating_scaled = 0
            x_p, y_p, z_p = cys_positions[cys_index]
            min_distance = 1000  # Initialize with a large number

            points_excluding_index = (
                cys_positions[:cys_index] + cys_positions[cys_index + 1 :]
            )
            for point in points_excluding_index:
                x_q, y_q, z_q = point
                distance = np.sqrt(
                    (x_p - x_q) ** 2 + (y_p - y_q) ** 2 + (z_p - z_q) ** 2
                )
                if distance < min_distance:
                    min_distance = distance
                sg_closeness_rating_scaled += 10 / ((distance + 1) ** 2)

            cys_index += 1

            res["cys_ratio"].append(float(total_cys_sites) / float(total_residue))
            res["min_dist_to_closest_sulfur"].append(min_distance)
            res["sulfur_closeness_rating_scaled"].append(sg_closeness_rating_scaled)
        else:
            res["cys_ratio"].append(None)
            res["min_dist_to_closest_sulfur"].append(None)
            res["sulfur_closeness_rating_scaled"].append(None)

    return res
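
Example (a minimal sketch; the PDB path is hypothetical):

from procaliper.protein_structure.cysteine_data import calculate_cysteine_data

cys = calculate_cysteine_data("example.pdb")  # hypothetical path
# Non-CYS residues carry None in every field, so this lists the CYS residue numbers.
cys_sites = [i + 1 for i, v in enumerate(cys["min_dist_to_closest_sulfur"]) if v is not None]
print(cys_sites)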

calculate_sasa(pdb_filename)

Compute the SASA values for all residue sites in a PDB file.

Uses the ShrakeRupley algorithm implemented in Bio.PDB.SASA.ShrakeRupley with a probe radius of 1.40 and 100 points.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    SASAData: A data class holding SASA data computed from a PDB file.

Source code in procaliper/protein_structure/sasa.py
def calculate_sasa(pdb_filename: str) -> SASAData:
    """Compute the SASA values for all CYS sites in a PDB file.

    Uses the ShrakeRupley algorithm implemented in `Bio.PDB.SASA.ShrakeRupley`
    with a probe radius of 1.40 and 100 points.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        SASAData: A data class holding SASA data computed from a PDB file."""
    p = PDBParser(QUIET=True)
    struct = p.get_structure("", pdb_filename)

    sr = ShrakeRupley(probe_radius=PROBE_RADIUS, n_points=N_POINTS, radii_dict=None)

    # Calc sasa values from Residues (from atoms)
    sr.compute(struct, level="R")

    # Set up dict
    res = SASAData(
        {
            "all_sasa_value": [],
            "atom_sasa_values": [],
        }
    )

    assert isinstance(struct, Structure)
    assert struct is not None

    # Fill dict with per-residue SASA values (heteroatoms are skipped below)
    for x in struct.child_list:
        for y in x.child_list:
            for z in y.child_list:
                if z.get_id()[0] != " ":  # skips heteroatoms
                    continue
                assert hasattr(z, "sasa")
                res["all_sasa_value"].append(z.sasa)
                res["atom_sasa_values"].append([zx.sasa for zx in z.child_list])  # type: ignore

    return res
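
Example (a minimal sketch; the PDB path is hypothetical):

from procaliper.protein_structure.sasa import calculate_sasa

sasa = calculate_sasa("example.pdb")     # hypothetical path
print(sasa["all_sasa_value"][:5])        # residue-level SASA values
print(len(sasa["atom_sasa_values"][0]))  # number of atoms in the first residue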

residue_pLDDT(pdb_filename)

Extracts the pLDDT confidence for each residue in a PDB file.

We assume that the pLDDT confidences are in the B-factor entries of the PDB file. If this information is provided at the atom level, the maximum value across the residue is used.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    list[float]: The pLDDT confidence for each residue in the PDB file.

Source code in procaliper/protein_structure/confidence.py
def residue_pLDDT(pdb_filename: str) -> list[float]:
    """Extracts the pLDDT confidence for each residue in a PDB file.

    We assume that the pLDDT confidences are in the B-factor entries of the PDB
    file. If this information is provided at the atom level, the maximum value
    across the residue is used.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        list[float]: The pLDDT confidence for each residue in the PDB file.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    vals = []
    for _, res in ppdb.df["ATOM"].groupby("residue_number"):
        vals.append(res["b_factor"].max())

    return vals
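
Example (a minimal sketch; the PDB path is hypothetical and should point to a model with pLDDT stored in the B-factor column):

from procaliper.protein_structure.confidence import residue_pLDDT

plddt = residue_pLDDT("model.pdb")  # hypothetical path
low_confidence = [i + 1 for i, v in enumerate(plddt) if v < 50]
print(low_confidence)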

charge

ChargeData

Bases: TypedDict

A data class holding charge data computed from a PDB file.

Array index corresponds to residue number in the PDB. Note that Python arrays are 0-indexed and PDB files are 1-indexed, so Python index 0 corresponds to residue 1. This assumes a complete PDB. Otherwise, an object of the procaliper.Protein class that constructs this will store a variable called structure_index that maps these indices to the sequence position.

Attributes:

    charge (list[list[float]]): The charge value for atoms in the residue, ordered
        from C-terminus to N-terminus according to standard PDB order. For example,
        in CYS, the last atom is always the SG sulfur.
    charge_method (list[str]): The method used for the charge calculation.

Source code in procaliper/protein_structure/charge.py
class ChargeData(TypedDict):
    """
    A data class holding charge data computed from a PDB file.

    Array index corresponds to residue number in the PDB. Note that Python
    arrays are 0-indexed and PDB files are 1-indexed, so Python index 0
    corresponds to residue 1. This assumes a complete PDB. Otherwise,
    an object of the `procaliper.Protein` class that constructs this will
    store a variable called `structure_index` that maps these indices to the
    sequence position.

    Attributes:
        charge (list[list[float]]): The charge value for atoms in the residue,
            ordered from C-terminus to N-terminus according to standard PDB order.
            For example, in CYS, the last atom is always the SG sulfur.
        charge_method (list[str]): The method used for the charge calculation.
    """

    charge: list[list[float]]
    charge_method: list[str]

calculate_charge(pdb_filename, method='gasteiger')

Computes the charge of residue sites in a PDB file.

By default, the method used is 'gasteiger', but this is configurable in hyperparameters.py.

Parameters:

    pdb_filename (str): The path to the PDB file.
    method (str, optional): The method used for the charge calculation. Examples
        include 'qtpie', 'eem', 'gasteiger'. Defaults to 'gasteiger'. For a full
        list, see https://open-babel.readthedocs.io/en/latest/Charges/charges.html

Raises:

    ValueError: If the charge method is not found.

Returns:

    ChargeData: A data class holding charge data computed from a PDB file.

Source code in procaliper/protein_structure/charge.py
def calculate_charge(pdb_filename: str, method: str = "gasteiger") -> ChargeData:
    """Computes the charge of residue sites in a PDB file.

    By default, the method used is 'gasteiger', but this is configurable in
    `hyperparameters.py`.

    Args:
        pdb_filename (str): The path to the PDB file.
        method (str, optional): The method used for the charge calculation.
            Examples include 'qtpie', 'eem', 'gasteiger'. Defaults to
            'gasteiger'. For a full list, see
            https://open-babel.readthedocs.io/en/latest/Charges/charges.html

    Raises:
        ValueError: If the charge method is not found.

    Returns:
        ChargeData: A data class holding charge data computed from a PDB file.
    """
    pbmol = next(pybel.readfile("pdb", pdb_filename))
    mol = pbmol.OBMol

    # Applies the model and computes charges.
    ob_charge_model = ob.OBChargeModel.FindType(method)

    if not ob_charge_model:
        raise ValueError("Charge method not found. Please check hyperparameters.py")
    ob_charge_model.ComputeCharges(mol)

    charges = cast(list[float], ob_charge_model.GetPartialCharges())

    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    # Set up dict
    res = ChargeData(
        {
            "charge": [],
            "charge_method": [],
        }
    )

    for _, residue in sorted(ppdb.df["ATOM"].groupby("residue_number")):
        res["charge"].append([charges[x - 1] for x in sorted(residue["atom_number"])])
        res["charge_method"].append(method)

    return res

confidence

residue_pLDDT(pdb_filename)

Extracts the pLDDT confidence for each residue in a PDB file.

We assume that the pLDDT confidences are in the B-factor entries of the PDB file. If this information is provided at the atom level, the maximum value across the residue is used.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    list[float]: The pLDDT confidence for each residue in the PDB file.

Source code in procaliper/protein_structure/confidence.py
def residue_pLDDT(pdb_filename: str) -> list[float]:
    """Extracts the pLDDT confidence for each residue in a PDB file.

    We assume that the pLDDT confidences are in the B-factor entries of the PDB
    file. If this information is provided at the atom level, the maximum value
    across the residue is used.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        list[float]: The pLDDT confidence for each residue in the PDB file.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    vals = []
    for _, res in ppdb.df["ATOM"].groupby("residue_number"):
        vals.append(res["b_factor"].max())

    return vals

cysteine_data

CysteineData

Bases: TypedDict

Data class holding cysteine spatial data computed from a PDB file.

Non-CYS sites are assigned None values.

Array index corresponds to residue number in the PDB. Note that Python arrays are 0-indexed and PDB files are 1-indexed, so Python index 0 corresponds to residue 1. This assumes a complete PDB. Otherwise, an object of the procaliper.Protein class that constructs this will store a variable called structure_index that maps these indices to the sequence position.

Attributes:

    cys_ratio (list[float | None]): The ratio of CYS sites to total sites.
    min_dist_to_closest_sulfur (list[float | None]): The minimum distance to the
        closest sulfur for each CYS site.
    sulfur_closeness_rating_scaled (list[float | None]): The scaled sulfur closeness
        rating for the CYS sites.

Source code in procaliper/protein_structure/cysteine_data.py
class CysteineData(TypedDict):
    """Data class for holding size data from computed from a PDB file.

    Non-CYS sites are assigned `None` values.

    Array index corresponds to residue number in the PDB. Note that Python
    arrays are 0-indexed and PDB files are 1-indexed, so Python index 0
    corresponds to residue 1. This assumes a complete PDB. Otherwise,
    an object of the `procaliper.Protein` class that constructs this will
    store a variable called `structure_index` that maps these indices to the
    sequence position.

    Attributes:
        cys_ratio (list[float | None]): The ratio of CYS sites to total sites.
        min_dist_to_closest_sulfur (list[float | None]): The minimum distance to the closest sulfur for each CYS site.
        sulfur_closeness_rating_scaled (list[float | None]): The sulfur closeness rating scaled for the CYS sites."""

    cys_ratio: list[float | None]
    min_dist_to_closest_sulfur: list[float | None]
    sulfur_closeness_rating_scaled: list[float | None]

calculate_cysteine_data(pdb_filename)

Calculates spatial data for a protein from a PDB file.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    CysteineData: A data class holding cysteine spatial data computed from a PDB file.

Source code in procaliper/protein_structure/cysteine_data.py
def calculate_cysteine_data(pdb_filename: str) -> CysteineData:
    """Calculates spatial data for a protein from a PDB file.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        CysteineData: A data class holding cysteine spatial data computed from a PDB file.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    res = CysteineData(
        {
            "cys_ratio": [],
            "min_dist_to_closest_sulfur": [],
            "sulfur_closeness_rating_scaled": [],
        }
    )

    total_residue = cast(int, max(ppdb.df["ATOM"]["residue_number"]))

    cys_positions: list[tuple[float, float, float]] = []
    for x in range(len(ppdb.df["ATOM"])):
        if ppdb.df["ATOM"]["residue_name"][x] == "CYS":
            if ppdb.df["ATOM"]["atom_name"][x] == "SG":
                cys_positions.append(
                    (
                        ppdb.df["ATOM"]["x_coord"][x],
                        ppdb.df["ATOM"]["y_coord"][x],
                        ppdb.df["ATOM"]["z_coord"][x],
                    )
                )
    total_cys_sites = len(cys_positions)

    cys_index = 0

    for _, grp in sorted(ppdb.df["ATOM"].groupby("residue_number")):
        if grp["residue_name"].max() == "CYS":
            sg_closeness_rating_scaled = 0
            x_p, y_p, z_p = cys_positions[cys_index]
            min_distance = 1000  # Initialize with a large number

            points_excluding_index = (
                cys_positions[:cys_index] + cys_positions[cys_index + 1 :]
            )
            for point in points_excluding_index:
                x_q, y_q, z_q = point
                distance = np.sqrt(
                    (x_p - x_q) ** 2 + (y_p - y_q) ** 2 + (z_p - z_q) ** 2
                )
                if distance < min_distance:
                    min_distance = distance
                sg_closeness_rating_scaled += 10 / ((distance + 1) ** 2)

            cys_index += 1

            res["cys_ratio"].append(float(total_cys_sites) / float(total_residue))
            res["min_dist_to_closest_sulfur"].append(min_distance)
            res["sulfur_closeness_rating_scaled"].append(sg_closeness_rating_scaled)
        else:
            res["cys_ratio"].append(None)
            res["min_dist_to_closest_sulfur"].append(None)
            res["sulfur_closeness_rating_scaled"].append(None)

    return res

distance

contact_map(structure, max_dist_angsrtom=10)

A contact map for a protein structure.

Parameters:

    structure (Structure): protein structure.
    max_dist_angsrtom (float, optional): Largest distance to consider a contact,
        in Angstroms. Defaults to 10.

Returns:

    npt.NDArray[np.int8]: contact map with shape nxn where n is the number of
        residues in the structure.

Source code in procaliper/protein_structure/distance.py
def contact_map(
    structure: Structure, max_dist_angsrtom: float = 10
) -> npt.NDArray[np.int8]:
    """A contact map for a protein structure.

    Args:
        structure (Structure): protein structure.
        max_dist_angsrtom (float, optional): Largest distance to consider a contact,
            in Angstroms. Defaults to 10.

    Returns:
        npt.NDArray[np.int8]: contact map with shape nxn where n is the
            number of residues in the structure.
    """
    residues = [res for model in structure for chain in model for res in chain]
    residues = list(enumerate(residues))
    adj = np.zeros((len(residues), len(residues)), dtype=np.int8)

    # a residue has zero distance to itself
    for i in range(len(residues)):
        adj[i, i] = np.int8(1)

    for (row, r1), (col, r2) in combinations(residues, 2):
        dist = residue_distance(r1, r2)
        if dist <= max_dist_angsrtom:
            adj[row, col] = np.int8(1)
            adj[col, row] = np.int8(1)
    return adj
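
Example (a minimal sketch; the PDB path is hypothetical and the structure is parsed with Biopython):

from Bio.PDB import PDBParser
from procaliper.protein_structure.distance import contact_map

structure = PDBParser(QUIET=True).get_structure("", "example.pdb")  # hypothetical path
cmap = contact_map(structure, max_dist_angsrtom=8.0)
print(cmap.shape, cmap.dtype)  # (n, n) np.int8 matrix; 1 marks CA-CA distances within 8 Angstroms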

distance_matrix(structure, thresh=np.inf)

Compute a distance matrix for a protein structure.

Parameters:

    structure (Structure): protein structure.
    thresh (float, optional): threshold for distance. Distances greater than this
        will be set to np.inf. Defaults to np.inf.

Returns:

    npt.NDArray[np.float64]: distance matrix with shape nxn where n is the number
        of residues in the structure.

Source code in procaliper/protein_structure/distance.py
def distance_matrix(
    structure: Structure, thresh: float = np.inf
) -> npt.NDArray[np.float64]:
    """Compute a distance matrix for a protein structure.

    Args:
        structure (Structure): protein structure.
        thresh (float, optional): threshold for distance. Defaults to np.inf.
            Distances greater than this will be set to np.inf.

    Returns:
        npt.NDArray[np.float64]: distance matrix with shape nxn where n is the
            number of residues in the structure.
    """
    residues = [res for model in structure for chain in model for res in chain]
    residues = list(enumerate(residues))
    adj = np.ones((len(residues), len(residues))) * np.inf

    # a residue has zero distance to itself
    for i in range(len(residues)):
        adj[i, i] = 0

    for (row, r1), (col, r2) in combinations(residues, 2):
        dist = residue_distance(r1, r2)
        if dist <= thresh:
            adj[row, col] = dist
            adj[col, row] = adj[row, col]
    return adj
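
Example (a minimal sketch; the PDB path is hypothetical):

import numpy as np
from Bio.PDB import PDBParser
from procaliper.protein_structure.distance import distance_matrix

structure = PDBParser(QUIET=True).get_structure("", "example.pdb")  # hypothetical path
dmat = distance_matrix(structure, thresh=20.0)
print(np.isfinite(dmat).mean())  # fraction of residue pairs within 20 Angstroms (diagonal included)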

proximity_matrix(structure, thresh=0)

Compute a proximity matrix for a protein structure.

Parameters:

    structure (Structure): protein structure.
    thresh (float, optional): threshold for proximity. Proximities less than this
        will be set to 0. Defaults to 0.

Returns:

    npt.NDArray[np.float64]: proximity matrix with shape nxn where n is the number
        of residues in the structure.

Source code in procaliper/protein_structure/distance.py
def proximity_matrix(
    structure: Structure, thresh: float = 0
) -> npt.NDArray[np.float64]:
    """Compute a proximity matrix for a protein structure.

    Args:
        structure (Structure): protein structure.
        thresh (float, optional): threshold for proximity. Defaults to 0. Proximity
            less than this will be set to 0.

    Returns:
        npt.NDArray[np.float64]: proximity matrix with shape nxn where n is the
            number of residues in the structure.
    """
    residues = [res for model in structure for chain in model for res in chain]
    residues = list(enumerate(residues))
    adj = np.zeros((len(residues), len(residues)))

    # a residue has proximity 1 to itself
    for i in range(len(residues)):
        adj[i, i] = 1

    for (row, r1), (col, r2) in combinations(residues, 2):
        prox = 1 / (residue_distance(r1, r2) + 1)
        if prox >= thresh:
            adj[row, col] = prox
            adj[col, row] = adj[row, col]
    return adj
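
Example (a minimal sketch; the PDB path is hypothetical):

from Bio.PDB import PDBParser
from procaliper.protein_structure.distance import proximity_matrix

structure = PDBParser(QUIET=True).get_structure("", "example.pdb")  # hypothetical path
prox = proximity_matrix(structure, thresh=0.05)  # drop proximities below 0.05 (distances above 19 Angstroms)
print(prox.diagonal()[:3])  # each residue has proximity 1.0 to itself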

region_distance(region_1, region_2)

Compute the distance between two regions of a protein, in Angstroms.

Parameters:

    region_1 (Iterable[Residue]): first region.
    region_2 (Iterable[Residue]): second region.

Returns:

    np.floating[Any]: minimum distance between the two regions.

Source code in procaliper/protein_structure/distance.py
def region_distance(
    region_1: Iterable[Residue], region_2: Iterable[Residue]
) -> np.floating[Any]:
    """Compute the distance between two regions of a protein, in Angstroms.

    Args:
        region_1 (Iterable[Residue]): first region.
        region_2 (Iterable[Residue]): second region.

    Returns:
        np.floating[Any]: minimum distance between the two regions.
    """
    return min(residue_distance(r1, r2) for r1, r2 in product(region_1, region_2))
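
Example (a minimal sketch; it assumes `protein` is a `Protein` with a structure already loaded so that `get_biopython_residues` returns Biopython residues):

from procaliper.protein_structure.distance import region_distance

residues = protein.get_biopython_residues()
d = region_distance(residues[0:3], residues[10:13])  # minimum CA-CA distance between the two stretches
print(float(d))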

region_distance_matrix(regions)

Compute a distance matrix between regions of a protein.

Parameters:

    regions (Sequence[Iterable[Residue]]): sequence of regions; each region is an
        iterable of residues.

Returns:

    npt.NDArray[np.float64]: distance matrix with shape nxn where n is the number
        of regions.

Source code in procaliper/protein_structure/distance.py
def region_distance_matrix(
    regions: Sequence[Iterable[Residue]],
) -> npt.NDArray[np.float64]:
    """Compute a distance matrix between regions of a protein.

    Args:
        regions (Sequence[Iterable[Residue]]): sequence of regions; each region is an iterable of residues.

    Returns:
        npt.NDArray[np.float64]: distance matrix with shape nxn where n is the
            number of regions.
    """
    return np.array([[region_distance(r1, r2) for r2 in regions] for r1 in regions])

region_proximity_matrix(regions)

Compute a proximity matrix between regions of a protein.

Parameters:

    regions (Sequence[Iterable[Residue]]): sequence of regions; each region is an
        iterable of residues.

Returns:

    npt.NDArray[np.float64]: proximity matrix with shape nxn where n is the number
        of regions.

Source code in procaliper/protein_structure/distance.py
def region_proximity_matrix(
    regions: Sequence[Iterable[Residue]],
) -> npt.NDArray[np.float64]:
    """Compute a proxmity matrix between regions of a protein.

    Args:
        regions (Sequence[Iterable[Residue]]): sequence of regions; each region is an iterable of residues.

    Returns:
        npt.NDArray[np.float64]: proximity matrix with shape nxn where n is the
            number of regions.
    """
    return 1 / (
        1 + np.array([[region_distance(r1, r2) for r2 in regions] for r1 in regions])
    )

residue_distance(r1, r2)

Compute the distance between two residues, in Angstroms.

Parameters:

    r1 (Residue): first residue.
    r2 (Residue): second residue.

Returns:

    np.floating[Any]: distance between the two residues.

Source code in procaliper/protein_structure/distance.py
def residue_distance(
    r1: Residue,
    r2: Residue,
) -> np.floating[Any]:
    """Compute the distance between two residues, in Angstroms.

    Args:
        r1 (Residue): first residue.
        r2 (Residue): second residue.

    Returns:
        np.floating[Any]: distance between the two residues.
    """
    dv = r1["CA"].coord - r2["CA"].coord
    return np.linalg.norm(dv)

sasa

SASAData

Bases: TypedDict

Data class holding SASA data computed from a PDB file.

Array index corresponds to residue number in the PDB. Note that Python arrays are 0-indexed and PDB files are 1-indexed, so Python index 0 corresponds to residue 1. This assumes a complete PDB. Otherwise, an object of the procaliper.Protein class that constructs this will store a variable called structure_index that maps these indices to the sequence position.

Attributes:

    all_sasa_value (list[float]): The overall SASA value for each site (computed as
        the sum of atom SASA values).
    atom_sasa_values (list[list[float]]): The SASA value for each atom in each site.
        Atoms are ordered from C-terminus to N-terminus according to standard PDB
        order. For example, in CYS, the last atom is always the SG sulfur.

Source code in procaliper/protein_structure/sasa.py
class SASAData(TypedDict):
    """Data class for holding SASA data from computed from a PDB file.

    Array index corresponds to residue number in the PDB. Note that Python
    arrays are 0-indexed and PDB files are 1-indexed, so Python index 0
    corresponds to residue 1. This assumes a complete PDB. Otherwise,
    an object of the `procaliper.Protein` class that constructs this will
    store a variable called `structure_index` that maps these indices to the
    sequence position.

    Attributes:
        all_sasa_value (list[float]): The overall SASA value for each site
            (computed as sum of atom SASA values).
        atom_sasa_values (list[list[float]]): The SASA value for each atom
            in each site. Atoms are ordered from C-terminus to N-terminus
            according to standard pdb order. For example, in CYS, the last atom
            is always the SG sulfur.
    """

    all_sasa_value: list[float]
    atom_sasa_values: list[list[float]]

calculate_sasa(pdb_filename)

Compute the SASA values for all residue sites in a PDB file.

Uses the ShrakeRupley algorithm implemented in Bio.PDB.SASA.ShrakeRupley with a probe radius of 1.40 and 100 points.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    SASAData: A data class holding SASA data computed from a PDB file.

Source code in procaliper/protein_structure/sasa.py
def calculate_sasa(pdb_filename: str) -> SASAData:
    """Compute the SASA values for all CYS sites in a PDB file.

    Uses the ShrakeRupley algorithm implemented in `Bio.PDB.SASA.ShrakeRupley`
    with a probe radius of 1.40 and 100 points.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        SASAData: A data class holding SASA data computed from a PDB file."""
    p = PDBParser(QUIET=True)
    struct = p.get_structure("", pdb_filename)

    sr = ShrakeRupley(probe_radius=PROBE_RADIUS, n_points=N_POINTS, radii_dict=None)

    # Calc sasa values from Residues (from atoms)
    sr.compute(struct, level="R")

    # Set up dict
    res = SASAData(
        {
            "all_sasa_value": [],
            "atom_sasa_values": [],
        }
    )

    assert isinstance(struct, Structure)
    assert struct is not None

    # Fill dict with per-residue SASA values (heteroatoms are skipped below)
    for x in struct.child_list:
        for y in x.child_list:
            for z in y.child_list:
                if z.get_id()[0] != " ":  # skips heteroatoms
                    continue
                assert hasattr(z, "sasa")
                res["all_sasa_value"].append(z.sasa)
                res["atom_sasa_values"].append([zx.sasa for zx in z.child_list])  # type: ignore

    return res

titration

TitrationData

Bases: TypedDict

Data class for titration data.

Array index corresponds to residue number in the PDB. Note that Python arrays are 0-indexed and PDB files are 1-indexed, so Python index 0 corresponds to residue 1. This assumes a complete PDB. Otherwise, an object of the procaliper.Protein class that constructs this will store a variable called structure_index that maps these indices to the sequence position.

Attributes:

    pKa (list[float | None]): The pKa values for the titration data. Non-titratable
        sites are assigned None values.
    protonation_state (list[tuple[str, float]]): The expected protonation states for
        the titration data. The first element of the tuple is the state of the site
        and the second element is the average protonation of the site. Non-titratable
        sites are assigned ("undefined", nan).

Source code in procaliper/protein_structure/titration.py
class TitrationData(TypedDict):
    """Data class for titration data.

    Array index corresponds to residue number in the PDB. Note that Python
    arrays are 0-indexed and PDB files are 1-indexed, so Python index 0
    corresponds to residue 1. This assumes a complete PDB. Otherwise,
    an object of the `procaliper.Protein` class that constructs this will
    store a variable called `structure_index` that maps these indices to the
    sequence position.

    Attributes:
        pKa (list[float | None]): The pKa values for the titration data.
            Non-titratable sites are assigned `None` values.
        protonation_state (list[tuple[str, float]]): The expected protonation
            states for the titration data. The first element of the tuple is the
            state of the site and the second element is the average protonation
            of the site. Non-titratable sites are assigned `("undefined", nan)`.
    """

    pKa: list[float | None]
    protonation_state: list[tuple[str, float | str]]

calculate_titration_propka(pdb_filename)

Uses propka to calculate titration data for the protein.

Parameters:

    pdb_filename (str): The path to the PDB file.

Returns:

    TitrationData: The titration data for the protein.

Source code in procaliper/protein_structure/titration.py
def calculate_titration_propka(pdb_filename: str) -> TitrationData:
    """Uses propka to calculate titration data for the protein.

    Args:
        pdb_filename (str): The path to the PDB file.

    Returns:
        TitrationData: The titration data for the protein.
    """
    mol = propka.run.single(pdb_filename, optargs=["--quiet"], write_pka=False)
    gs = mol.conformations["AVR"].groups

    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_filename)

    seq = {
        i: res["residue_name"].iloc[0]
        for i, res in ppdb.df["ATOM"].groupby("residue_number")
    }
    pks = {group.atom.res_num: group.pka_value for group in gs}
    sv = sorted(seq.items())
    return TitrationData(
        # pKa=[group.pka_value for group in gs],
        pKa=[pks[i] if i in pks else None for i, _ in sv],
        protonation_state=[_state_from_pk(pks[i] if i in pks else 0) for i, _ in sv],
    )
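
Example (a minimal sketch; the PDB path is hypothetical):

from procaliper.protein_structure.titration import calculate_titration_propka

tit = calculate_titration_propka("example.pdb")  # hypothetical path
for number, (pka, (state, avg)) in enumerate(zip(tit["pKa"], tit["protonation_state"]), start=1):
    if pka is not None:
        print(number, pka, state, avg)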

site_metadata

CustomSiteData

Class for storing custom site-level data.

Source code in procaliper/site_metadata/custom_site_data.py
class CustomSiteData:
    """Class for storing custom site-level data."""

    def __init__(self, residue_number: list[int], data: dict[str, list[Any]]) -> None:
        self.residue_number = residue_number
        for key, value in data.items():
            setattr(self, key, value)

        self.keys = {"residue_number"} | set(data.keys())

    @classmethod
    def from_dict(
        cls,
        data: dict[str, list[Any]],
        residue_index_feature_name: str = "residue_number",
    ) -> CustomSiteData:
        """Create a CustomSiteData object from a dictionary of data.

        Args:
            data (dict[str, list[Any]]): Data dictionary indexed by feature
                name. Each value must be a list of the same length as the
                residue number feature. Must include a residue number key.
            residue_index_feature_name (str, optional): The name of the feature
                that contains the residue number. Defaults to "residue_number".

        Raises:
            ValueError: If the residue number feature is not in the data.

        Returns:
            CustomSiteData: A CustomSiteData object that contains the data.
        """
        if residue_index_feature_name not in data:
            raise ValueError("CustomSiteData must have a residue_number key.")
        return cls(data[residue_index_feature_name], data)

    def table(self) -> dict[str, list[Any]]:
        """Return a dictionary of the data in the CustomSiteData object.

        Returns:
            dict[str, list[Any]]: A dictionary of the data in the CustomSiteData
                object.
        """
        return {k: getattr(self, k) for k in self.keys}

    def add_residue_numbers(self, residue_number: list[int] | int) -> None:
        """Specify the number of residues in the CustomSiteData object.

        Args:
            residue_number (list[int] | int): If an integer, the number of
                residues. If a list of integers, the list of residue numbers.
        """
        if isinstance(residue_number, int):
            self.residue_number = list(range(1, residue_number + 1))
        else:
            self.residue_number = residue_number

    def add_site_data(self, key: str, row: list[Any], overwrite: bool = False) -> None:
        """Add a site-level feature to the CustomSiteData object.

        Args:
            key (str): The name of the feature to add.
            row (list[Any]): The values for the feature.

            overwrite (bool, optional): Whether to overwrite an existing
                feature. Defaults to False.

        Raises:
            KeyError: If overwrite is False and the feature already exists.
            ValueError: If the number of values in the feature does not match
                the number of residues.
        """
        if hasattr(self, key) and not overwrite:
            raise KeyError(
                f"CustomSiteData already has a {key} key and overwrite is False."
            )

        if len(row) != len(self.residue_number):
            raise ValueError(
                f"CustomSiteData has {len(self.residue_number)} residues, but {key} has {len(row)} values."
                " Perhaps you forgot to call add_residue_numbers?"
            )

        setattr(self, key, row)
        self.keys.add(key)
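
Example (a minimal sketch; the feature names and values are hypothetical):

from procaliper.site_metadata.custom_site_data import CustomSiteData

data = CustomSiteData.from_dict(
    {
        "residue_number": [1, 2, 3],
        "my_score": [0.2, 1.3, 0.8],  # hypothetical per-residue feature
    }
)
data.add_site_data("my_flag", [True, False, True])
print(data.table())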

add_residue_numbers(residue_number)

Specify the number of residues in the CustomSiteData object.

Parameters:

    residue_number (list[int] | int): If an integer, the number of residues. If a
        list of integers, the list of residue numbers.
Source code in procaliper/site_metadata/custom_site_data.py
def add_residue_numbers(self, residue_number: list[int] | int) -> None:
    """Specify the number of residues in the CustomSiteData object.

    Args:
        residue_number (list[int] | int): If an integer, the number of
            residues. If a list of integers, the list of residue numbers.
    """
    if isinstance(residue_number, int):
        self.residue_number = list(range(1, residue_number + 1))
    else:
        self.residue_number = residue_number

add_site_data(key, row, overwrite=False)

Add a site-level feature to the CustomSiteData object.

Parameters:

    key (str): The name of the feature to add.
    row (list[Any]): The values for the feature.
    overwrite (bool, optional): Whether to overwrite an existing feature.
        Defaults to False.

Raises:

    KeyError: If overwrite is False and the feature already exists.
    ValueError: If the number of values in the feature does not match the number
        of residues.

Source code in procaliper/site_metadata/custom_site_data.py
def add_site_data(self, key: str, row: list[Any], overwrite: bool = False) -> None:
    """Add a site-level feature to the CustomSiteData object.

    Args:
        key (str): The name of the feature to add.
        row (list[Any]): The values for the feature.

        overwrite (bool, optional): Whether to overwrite an existing
            feature. Defaults to False.

    Raises:
        KeyError: If overwrite is False and the feature already exists.
        ValueError: If the number of values in the feature does not match
            the number of residues.
    """
    if hasattr(self, key) and not overwrite:
        raise KeyError(
            f"CustomSiteData already has a {key} key and overwrite is False."
        )

    if len(row) != len(self.residue_number):
        raise ValueError(
            f"CustomSiteData has {len(self.residue_number)} residues, but {key} has {len(row)} values."
            " Perhaps you forgot to call add_residue_numbers?"
        )

    setattr(self, key, row)
    self.keys.add(key)

from_dict(data, residue_index_feature_name='residue_number') classmethod

Create a CustomSiteData object from a dictionary of data.

Parameters:

    data (dict[str, list[Any]]): Data dictionary indexed by feature name. Each value
        must be a list of the same length as the residue number feature. Must include
        a residue number key.
    residue_index_feature_name (str, optional): The name of the feature that contains
        the residue number. Defaults to "residue_number".

Raises:

    ValueError: If the residue number feature is not in the data.

Returns:

    CustomSiteData: A CustomSiteData object that contains the data.

Source code in procaliper/site_metadata/custom_site_data.py
@classmethod
def from_dict(
    cls,
    data: dict[str, list[Any]],
    residue_index_feature_name: str = "residue_number",
) -> CustomSiteData:
    """Create a CustomSiteData object from a dictionary of data.

    Args:
        data (dict[str, list[Any]]): Data dictionary indexed by feature
            name. Each value must be a list of the same length as the
            residue number feature. Must include a residue number key.
        residue_index_feature_name (str, optional): The name of the feature
            that contains the residue number. Defaults to "residue_number".

    Raises:
        ValueError: If the residue number feature is not in the data.

    Returns:
        CustomSiteData: A CustomSiteData object that contains the data.
    """
    if residue_index_feature_name not in data:
        raise ValueError("CustomSiteData must have a residue_number key.")
    return cls(data[residue_index_feature_name], data)

table()

Return a dictionary of the data in the CustomSiteData object.

Returns:

    dict[str, list[Any]]: A dictionary of the data in the CustomSiteData object.

Source code in procaliper/site_metadata/custom_site_data.py
def table(self) -> dict[str, list[Any]]:
    """Return a dictionary of the data in the CustomSiteData object.

    Returns:
        dict[str, list[Any]]: A dictionary of the data in the CustomSiteData
            object.
    """
    return {k: getattr(self, k) for k in self.keys}

SiteAnnotations

Class for parsing and storing UniProt site annotations.

An example of a UniProt site annotation:

DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; DISULFID 105; /note="Interchain (with heavy chain)"

Attributes:

    residue_letter (list[str]): A list of amino acid letters.
    residue_number (list[int]): A list of residue numbers.
    binding (list[bool]): A list of booleans indicating whether a residue is a binding site.
    active (list[bool]): A list of booleans indicating whether a residue is an active site.
    ptm (list[bool]): A list of booleans indicating whether a residue is reported to be post-translationally modified.
    dna_binding (list[bool]): A list of booleans indicating whether a residue is a DNA binding site.
    disulfide_bond (list[bool]): A list of booleans indicating whether a residue is part of a disulfide bond.
    helix (list[bool]): A list of booleans indicating whether a residue is in a helix.
    turn (list[bool]): A list of booleans indicating whether a residue is in a turn.
    beta_strand (list[bool]): A list of booleans indicating whether a residue is in a beta strand.
    binding_data (list[dict[str, str]]): A list of dictionaries containing binding site metadata.
    active_data (list[dict[str, str]]): A list of dictionaries containing active site metadata.
    ptm_data (list[dict[str, str]]): A list of dictionaries containing post-translationally modified site metadata.
    regions (dict[str, list[int]]): A dictionary mapping region names to lists of (zero-indexed) residue numbers.
    region_data (dict[str, str]): A dictionary mapping region names to annotation data.
    domains (dict[str, list[int]]): A dictionary mapping domain names to lists of (zero-indexed) residue numbers.
    domain_data (dict[str, str]): A dictionary mapping domain names to annotation data.

Source code in procaliper/site_metadata/uniprot_site_parsing.py
class SiteAnnotations:
    """Class for parsing and storing UniProt site annotations.

    An example of a UniProt site annotation:

    `DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; DISULFID 105; /note="Interchain (with heavy chain)"`

    Attributes:
        residue_letter (list[str]): A list of amino acid letters.
        residue_number (list[int]): A list of residue numbers.
        binding (list[bool]): A list of booleans indicating whether a residue
            is a binding site.
        active (list[bool]): A list of booleans indicating whether a residue
            is an active site.
        ptm (list[bool]): A list of booleans indicating whether a residue
            is reported to be post-translationally modified.
        dna_binding (list[bool]): A list of booleans indicating whether a residue
            is a DNA binding site.
        disulfide_bond (list[bool]): A list of booleans indicating whether a residue
            is part of a disulfide bond.
        helix (list[bool]): A list of booleans indicating whether a residue
            is in a helix.
        turn (list[bool]): A list of booleans indicating whether a residue
            is in a turn.
        beta_strand (list[bool]): A list of booleans indicating whether a residue
            is in a beta strand.
        binding_data (list[dict[str, str]]): A list of dictionaries containing
            binding site metadata.
        active_data (list[dict[str, str]]): A list of dictionaries containing
            active site metadata.
        ptm_data (list[dict[str, str]]): A list of dictionaries containing
            post-translationally modified site metadata.
        regions (dict[str,list[int]]): A dictionary mapping region names to lists
            of (zero-indexed) residue numbers.
        region_data (dict[str,str]): A dictionary mapping region names to annotation data.
        domains (dict[str,list[int]]): A dictionary mapping domain names to lists
            of (zero-indexed) residue numbers.
        domain_data (dict[str,str]): A dictionary mapping domain names to annotation data.
    """

    fields_by_description_type: dict[str, list[str]] = {
        "BINDING": ["ligand"],
        "ACT_SITE": ["note"],
        "MOD_RES": ["note"],
        "REGION": ["note"],
        "DOMAIN": ["note"],
        "DNA_BIND": [],
        "DISULFID": [],
        "HELIX": [],
        "TURN": [],
        "STRAND": [],
    }

    def __init__(self, sequence: str) -> None:
        """Instantiates a SiteAnnotations object from a string of amino acid letters.

        It is recommended to call `SiteAnnotations.extract_annotation` after instantiating.
        Before that, the `SiteAnnotations` object contains only default values.

        Args:
            sequence (str): A string of amino acid letters. See
                `type_aliases.AminoAcidLetter` for valid letters.
        """
        self.residue_letter: list[str] = list(sequence)
        self.residue_number: list[int] = list(range(1, len(sequence) + 1))
        self.binding: list[bool] = [False] * len(sequence)
        self.active: list[bool] = [False] * len(sequence)
        self.ptm: list[bool] = [False] * len(sequence)
        self.dna_binding: list[bool] = [False] * len(sequence)
        self.disulfide_bond: list[bool] = [False] * len(sequence)
        self.helix: list[bool] = [False] * len(sequence)
        self.turn: list[bool] = [False] * len(sequence)
        self.beta_strand: list[bool] = [False] * len(sequence)

        self.binding_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]
        self.active_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]
        self.ptm_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]

        self.regions: dict[str, list[int]] = {}
        self.region_data: dict[str, dict[str, str]] = {}

        self.domains: dict[str, list[int]] = {}
        self.domain_data: dict[str, dict[str, str]] = {}

    def table(self) -> dict[str, list[Any]]:
        """Return a dictionary of the data in the SiteAnnotations object.

        Returns:
            dict[str, list[Any]]: Each key is a site annotation feature name.
                Each value is a list of the values for that feature.
        """
        tbl: dict[str, list[Any]] = {}

        tbl["residue_letter"] = self.residue_letter
        tbl["residue_number"] = self.residue_number
        tbl["binding"] = self.binding
        tbl["active"] = self.active
        tbl["ptm"] = self.ptm
        tbl["dna_binding"] = self.dna_binding
        tbl["disulfide_bond"] = self.disulfide_bond
        tbl["helix"] = self.helix
        tbl["turn"] = self.turn
        tbl["beta_strand"] = self.beta_strand
        tbl["binding_data"] = self.binding_data
        tbl["active_data"] = self.active_data
        tbl["ptm_data"] = self.ptm_data

        return tbl

    def __len__(self) -> int:
        return len(self.residue_letter)

    def _parse_description(
        self,
        description_type: str,
        description: str,
        extract_metadata: bool | None = None,
    ) -> tuple[list[bool], list[dict[str, str]] | None]:
        # example of a description:
        # DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; DISULFID 105; /note="Interchain (with heavy chain)"

        site_matches = [False] * len(self)

        site_data: list[dict[str, str]] | None = None

        if extract_metadata is None:
            extract_metadata = bool(self.fields_by_description_type[description_type])
        if extract_metadata:
            site_data = [{} for _ in range(len(self))]

        if description_type not in self.fields_by_description_type:
            raise NotImplementedError(f"Unknown description type: {description_type}")
        if (
            not description or description != description
        ):  # not-equal check is for pandas nans
            return site_matches, site_data
        if description_type not in description:
            raise ValueError(
                f"{description_type} does not appear in the description: {description}"
            )

        stretches = description.split(description_type)

        # first stretch is always empty
        for stretch in stretches[1:]:
            fields = stretch.split(";")
            # first field is always site numbers
            se = fields[0].strip().split("..")
            start, end = len(self), len(self)
            if len(se) not in (1, 2):
                raise ValueError(
                    f"Unable to parse site numbers {se} in {stretch} from {description}"
                )
            se_start = se[0].split(":")[-1]

            if len(se) == 1:
                start, end = (
                    int(se_start) - 1,
                    int(se_start) - 1,
                )  # uniprot 1-indexes sites
            else:
                start, end = int(se_start) - 1, int(se[1]) - 1

            if start >= len(self) or end >= len(self) or start > end:
                raise ValueError(
                    f"Improperly formatted descritpion; site numbers not recognized: {stretch} in {description}"
                )

            field_sites = list(range(start, end + 1))
            for s in field_sites:
                site_matches[s] = True
                if se[0] != se_start and extract_metadata:
                    # site_data is populated if extract_metadata is True
                    # mypy does not catch this
                    site_data[s]["isoform"] = se[0].split(":")[0]  # type: ignore

            if len(fields) == 1 or site_data is None:
                continue

            for field in fields[1:]:
                field = field.strip()
                for field_id in self.fields_by_description_type[description_type]:
                    if not field.startswith(f"/{field_id}="):
                        continue
                    field_data = field.removeprefix(f"/{field_id}=")
                    for s in field_sites:
                        if field_id not in site_data[s]:
                            site_data[s][field_id] = field_data
                        else:
                            site_data[s][field_id] += "," + field_data

        return site_matches, site_data

    def _region_parsing(self, description: str) -> None:
        region_annotations = description.split("REGION ")[1:]
        self.regions = {}
        self.region_data = {}
        for region_index, x in enumerate(region_annotations):
            r = f"r_{region_index}"
            fields = x.split(";")
            self.regions[r] = list(
                range(
                    int(fields[0].split("..")[0]) - 1,
                    int(fields[0].split("..")[1]),
                )
            )
            self.region_data[r] = {}
            for field in fields[1:]:
                field = field.strip()
                for field_id in self.fields_by_description_type["REGION"]:
                    if not field.startswith(f"/{field_id}="):
                        continue
                    field_data = field.removeprefix(f"/{field_id}=")
                    if field_id not in self.region_data[r]:
                        self.region_data[r][field_id] = field_data
                    else:
                        self.region_data[r][field_id] += "," + field_data

    def _domain_parsing(self, description: str) -> None:
        domain_annotations = description.split("DOMAIN ")[1:]
        self.domains = {}
        self.domain_data = {}
        for domain_index, x in enumerate(domain_annotations):
            r = f"d_{domain_index}"
            fields = x.split(";")
            self.domains[r] = list(
                range(
                    int(fields[0].split("..")[0]) - 1,
                    int(fields[0].split("..")[1]),
                )
            )
            self.domain_data[r] = {}
            for field in fields[1:]:
                field = field.strip()
                for field_id in self.fields_by_description_type["DOMAIN"]:
                    if not field.startswith(f"/{field_id}="):
                        continue
                    field_data = field.removeprefix(f"/{field_id}=")
                    if field_id not in self.domain_data[r]:
                        self.domain_data[r][field_id] = field_data
                    else:
                        self.domain_data[r][field_id] += "," + field_data

    def extract_annotation(
        self,
        description_type: str,
        description: str,
        extract_metadata: bool | None = None,
    ) -> None:
        """Extracts the site annotations from the description.

        Args:
            description_type (str): The type of site annotation to extract. Must be
                one of the keys in `self.fields_by_description_type`.
            description (str): The UniProt site description string.
            extract_metadata (bool | None, optional): Whether to extract metadata.
                By default, this is inferred from the `description_type` parameter.

        Raises:
            NotImplementedError: From `_parse_description`. If an unknown `description_type` is provided.
            ValueError: From `_parse_description`. If the `description_type` is not found in `description`.
            AssertionError: If a `description_type` is provided that is known to `_parse_description` but
                not `extract_annotation`. This indicates an internal bug and should be reported.
        """
        # regions are a special case because they can overlap
        if description_type == "REGION":
            self._region_parsing(description)
            return
        if description_type == "DOMAIN":
            self._domain_parsing(description)
            return

        matches, data = self._parse_description(
            description_type, description, extract_metadata
        )
        if description_type == "ACT_SITE":
            self.active = matches
            if data:
                self.active_data = data
        elif description_type == "BINDING":
            self.binding = matches
            if data:
                self.binding_data = data
        elif description_type == "MOD_RES":
            self.ptm = matches
            if data:
                self.ptm_data = data
        elif description_type == "DNA_BIND":
            self.dna_binding = matches
        elif description_type == "DISULFID":
            self.disulfide_bond = matches
        elif description_type == "STRAND":
            self.beta_strand = matches
        elif description_type == "HELIX":
            self.helix = matches
        elif description_type == "TURN":
            self.turn = matches
        else:
            raise AssertionError(
                f"If this is raised, the description type {description_type} is only partially handled. Please file an issue."
            )
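
A brief usage sketch of the class above, assuming it can be imported from the path given in the source note (procaliper/site_metadata/uniprot_site_parsing.py); the 120-residue dummy sequence exists only so the positions from the docstring example are in range:

from procaliper.site_metadata.uniprot_site_parsing import SiteAnnotations

# Dummy sequence long enough to cover residues 28..87 and 105 from the example.
site = SiteAnnotations("A" * 120)

description = (
    'DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; '
    'DISULFID 105; /note="Interchain (with heavy chain)"'
)
site.extract_annotation("DISULFID", description)

# UniProt positions are 1-indexed; the boolean lists are 0-indexed.
assert site.disulfide_bond[27] and site.disulfide_bond[86] and site.disulfide_bond[104]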

__init__(sequence)

Instantiates a SiteAnnotations object from a string of amino acid letters.

It is recommended to call SiteAnnotations.extract_annotation after instantiating. Before that, the SiteAnnotations object contains only default values.

Parameters:

sequence (str, required): A string of amino acid letters. See type_aliases.AminoAcidLetter for valid letters.
Source code in procaliper/site_metadata/uniprot_site_parsing.py
def __init__(self, sequence: str) -> None:
    """Instantiates a SiteAnnotations object from a string of amino acid letters.

    It is recommended to call `SiteAnnotations.extract_annotation` after instantiating.
    Before that, the `SiteAnnotations` object contains only default values.

    Args:
        sequence (str): A string of amino acid letters. See
            `type_aliases.AminoAcidLetter` for valid letters.
    """
    self.residue_letter: list[str] = list(sequence)
    self.residue_number: list[int] = list(range(1, len(sequence) + 1))
    self.binding: list[bool] = [False] * len(sequence)
    self.active: list[bool] = [False] * len(sequence)
    self.ptm: list[bool] = [False] * len(sequence)
    self.dna_binding: list[bool] = [False] * len(sequence)
    self.disulfide_bond: list[bool] = [False] * len(sequence)
    self.helix: list[bool] = [False] * len(sequence)
    self.turn: list[bool] = [False] * len(sequence)
    self.beta_strand: list[bool] = [False] * len(sequence)

    self.binding_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]
    self.active_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]
    self.ptm_data: list[dict[str, str]] = [{} for _ in range(len(sequence))]

    self.regions: dict[str, list[int]] = {}
    self.region_data: dict[str, dict[str, str]] = {}

    self.domains: dict[str, list[int]] = {}
    self.domain_data: dict[str, dict[str, str]] = {}

extract_annotation(description_type, description, extract_metadata=None)

Extracts the site annotations from the description.

Parameters:

description_type (str, required): The type of site annotation to extract. Must be one of the keys in self.fields_by_description_type.

description (str, required): The UniProt site description string.

extract_metadata (bool | None, default None): Whether to extract metadata. By default, this is inferred from the description_type parameter.

Raises:

NotImplementedError: From _parse_description. If an unknown description_type is provided.

ValueError: From _parse_description. If the description_type is not found in description.

AssertionError: If a description_type is provided that is known to _parse_description but not extract_annotation. This indicates an internal bug and should be reported.

Source code in procaliper/site_metadata/uniprot_site_parsing.py
def extract_annotation(
    self,
    description_type: str,
    description: str,
    extract_metadata: bool | None = None,
) -> None:
    """Extracts the site annotations from the description.

    Args:
        description_type (str): The type of site annotation to extract. Must be
            one of the keys in `self.fields_by_description_type`.
        description (str): The UniProt site description string.
        extract_metadata (bool | None, optional): Whether to extract metadata.
            By default, this is inferred from the `description_type` parameter.

    Raises:
        NotImplementedError: From `_parse_description`. If an unknown `description_type` is provided.
        ValueError: From `_parse_description`. If the `description_type` is not found in `description`.
        AssertionError: If a `description_type` is provided that is known to `_parse_description` but
            not `extract_annotation`. This indicates an internal bug and should be reported.
    """
    # regions are a special case because they can overlap
    if description_type == "REGION":
        self._region_parsing(description)
        return
    if description_type == "DOMAIN":
        self._domain_parsing(description)
        return

    matches, data = self._parse_description(
        description_type, description, extract_metadata
    )
    if description_type == "ACT_SITE":
        self.active = matches
        if data:
            self.active_data = data
    elif description_type == "BINDING":
        self.binding = matches
        if data:
            self.binding_data = data
    elif description_type == "MOD_RES":
        self.ptm = matches
        if data:
            self.ptm_data = data
    elif description_type == "DNA_BIND":
        self.dna_binding = matches
    elif description_type == "DISULFID":
        self.disulfide_bond = matches
    elif description_type == "STRAND":
        self.beta_strand = matches
    elif description_type == "HELIX":
        self.helix = matches
    elif description_type == "TURN":
        self.turn = matches
    else:
        raise AssertionError(
            f"If this is raised, the description type {description_type} is only partially handled. Please file an issue."
        )
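
Because fields_by_description_type maps "BINDING" to ["ligand"], metadata extraction is enabled by default for binding sites. A minimal sketch, using a hypothetical description string in the same format as the UniProt example above:

from procaliper.site_metadata.uniprot_site_parsing import SiteAnnotations

site = SiteAnnotations("M" * 50)
site.extract_annotation("BINDING", 'BINDING 12; /ligand="Zn(2+)"')

assert site.binding[11]          # residue 12, zero-indexed
print(site.binding_data[11])     # {'ligand': '"Zn(2+)"'}; the quotes are kept as-is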

table()

Return a dictionary of the data in the SiteAnnotations object.

Returns:

dict[str, list[Any]]: Each key is a site annotation feature name. Each value is a list of the values for that feature.

Source code in procaliper/site_metadata/uniprot_site_parsing.py
def table(self) -> dict[str, list[Any]]:
    """Return a dictionary of the data in the SiteAnnotations object.

    Returns:
        dict[str, list[Any]]: Each key is a site annotation feature name.
            Each value is a list of the values for that feature.
    """
    tbl: dict[str, list[Any]] = {}

    tbl["residue_letter"] = self.residue_letter
    tbl["residue_number"] = self.residue_number
    tbl["binding"] = self.binding
    tbl["active"] = self.active
    tbl["ptm"] = self.ptm
    tbl["dna_binding"] = self.dna_binding
    tbl["disulfide_bond"] = self.disulfide_bond
    tbl["helix"] = self.helix
    tbl["turn"] = self.turn
    tbl["beta_strand"] = self.beta_strand
    tbl["binding_data"] = self.binding_data
    tbl["active_data"] = self.active_data
    tbl["ptm_data"] = self.ptm_data

    return tbl
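
Since table() returns a plain dict of equal-length lists, it can be handed directly to a DataFrame constructor. A sketch, assuming pandas is installed (it is not required by table() itself) and that `site` is a SiteAnnotations instance such as the one built above:

import pandas as pd

df = pd.DataFrame(site.table())
print(df[["residue_number", "binding", "ptm"]].head())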

custom_site_data

CustomSiteData

Class for storing custom site-level data.

Source code in procaliper/site_metadata/custom_site_data.py
class CustomSiteData:
    """Class for storing custom site-level data."""

    def __init__(self, residue_number: list[int], data: dict[str, list[Any]]) -> None:
        self.residue_number = residue_number
        for key, value in data.items():
            setattr(self, key, value)

        self.keys = {"residue_number"} | set(data.keys())

    @classmethod
    def from_dict(
        cls,
        data: dict[str, list[Any]],
        residue_index_feature_name: str = "residue_number",
    ) -> CustomSiteData:
        """Create a CustomSiteData object from a dictionary of data.

        Args:
            data (dict[str, list[Any]]): Data dictionary indexed by feature
                name. Each value must be a list of the same length as the
                residue number feature. Must include a residue number key.
            residue_index_feature_name (str, optional): The name of the feature
                that contains the residue number. Defaults to "residue_number".

        Raises:
            ValueError: If the residue number feature is not in the data.

        Returns:
            CustomSiteData: A CustomSiteData object that contains the data.
        """
        if residue_index_feature_name not in data:
            raise ValueError("CustomSiteData must have a residue_number key.")
        return cls(data[residue_index_feature_name], data)

    def table(self) -> dict[str, list[Any]]:
        """Return a dictionary of the data in the CustomSiteData object.

        Returns:
            dict[str, list[Any]]: A dictionary of the data in the CustomSiteData
                object.
        """
        return {k: getattr(self, k) for k in self.keys}

    def add_residue_numbers(self, residue_number: list[int] | int) -> None:
        """Specify the number of residues in the CustomSiteData object.

        Args:
            residue_number (list[int] | int): If an integer, the number of
                residues. If a list of integers, the list of residue numbers.
        """
        if isinstance(residue_number, int):
            self.residue_number = list(range(1, residue_number + 1))
        else:
            self.residue_number = residue_number

    def add_site_data(self, key: str, row: list[Any], overwrite: bool = False) -> None:
        """Add a site-level feature to the CustomSiteData object.

        Args:
            key (str): The name of the feature to add.
            row (list[Any]): The values for the feature.

            overwrite (bool, optional): Whether to overwrite an existing
                feature. Defaults to False.

        Raises:
            KeyError: If overwrite is False and the feature already exists.
            ValueError: If the number of values in the feature does not match
                the number of residues.
        """
        if hasattr(self, key) and not overwrite:
            raise KeyError(
                f"CustomSiteData already has a {key} key and overwrite is False."
            )

        if len(row) != len(self.residue_number):
            raise ValueError(
                f"CustomSiteData has {len(self.residue_number)} residues, but {key} has {len(row)} values."
                " Perhaps you forgot to call add_residue_numbers?"
            )

        setattr(self, key, row)
        self.keys.add(key)
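
A short construction sketch, with the import path taken from the source note above; the "hydrophobicity" and "conserved" feature names are hypothetical:

from procaliper.site_metadata.custom_site_data import CustomSiteData

data = CustomSiteData.from_dict(
    {
        "residue_number": [1, 2, 3],
        "hydrophobicity": [0.1, -0.4, 0.7],   # hypothetical per-residue feature
    }
)
data.add_site_data("conserved", [True, False, True])
print(data.table())   # dict with residue_number, hydrophobicity, and conserved
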
add_residue_numbers(residue_number)

Specify the number of residues in the CustomSiteData object.

Parameters:

residue_number (list[int] | int, required): If an integer, the number of residues. If a list of integers, the list of residue numbers.
Source code in procaliper/site_metadata/custom_site_data.py
def add_residue_numbers(self, residue_number: list[int] | int) -> None:
    """Specify the number of residues in the CustomSiteData object.

    Args:
        residue_number (list[int] | int): If an integer, the number of
            residues. If a list of integers, the list of residue numbers.
    """
    if isinstance(residue_number, int):
        self.residue_number = list(range(1, residue_number + 1))
    else:
        self.residue_number = residue_number
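
As a sketch of the incremental path, an empty object can be sized first and then filled; the "score" feature name is hypothetical:

from procaliper.site_metadata.custom_site_data import CustomSiteData

custom = CustomSiteData([], {})
custom.add_residue_numbers(4)                         # residue_number becomes [1, 2, 3, 4]
custom.add_site_data("score", [0.2, 0.9, 0.1, 0.5])   # hypothetical per-residue values
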
add_site_data(key, row, overwrite=False)

Add a site-level feature to the CustomSiteData object.

Parameters:

key (str, required): The name of the feature to add.

row (list[Any], required): The values for the feature.

overwrite (bool, default False): Whether to overwrite an existing feature.

Raises:

KeyError: If overwrite is False and the feature already exists.

ValueError: If the number of values in the feature does not match the number of residues.

Source code in procaliper/site_metadata/custom_site_data.py
def add_site_data(self, key: str, row: list[Any], overwrite: bool = False) -> None:
    """Add a site-level feature to the CustomSiteData object.

    Args:
        key (str): The name of the feature to add.
        row (list[Any]): The values for the feature.

        overwrite (bool, optional): Whether to overwrite an existing
            feature. Defaults to False.

    Raises:
        KeyError: If overwrite is False and the feature already exists.
        ValueError: If the number of values in the feature does not match
            the number of residues.
    """
    if hasattr(self, key) and not overwrite:
        raise KeyError(
            f"CustomSiteData already has a {key} key and overwrite is False."
        )

    if len(row) != len(self.residue_number):
        raise ValueError(
            f"CustomSiteData has {len(self.residue_number)} residues, but {key} has {len(row)} values."
            " Perhaps you forgot to call add_residue_numbers?"
        )

    setattr(self, key, row)
    self.keys.add(key)
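
Continuing the sketch above, re-adding an existing feature raises KeyError unless overwrite=True is passed:

custom.add_site_data("score", [1.0, 1.0, 1.0, 1.0], overwrite=True)   # replaces the existing column
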
from_dict(data, residue_index_feature_name='residue_number') classmethod

Create a CustomSiteData object from a dictionary of data.

Parameters:

data (dict[str, list[Any]], required): Data dictionary indexed by feature name. Each value must be a list of the same length as the residue number feature. Must include a residue number key.

residue_index_feature_name (str, default 'residue_number'): The name of the feature that contains the residue number.

Raises:

ValueError: If the residue number feature is not in the data.

Returns:

CustomSiteData: A CustomSiteData object that contains the data.

Source code in procaliper/site_metadata/custom_site_data.py
@classmethod
def from_dict(
    cls,
    data: dict[str, list[Any]],
    residue_index_feature_name: str = "residue_number",
) -> CustomSiteData:
    """Create a CustomSiteData object from a dictionary of data.

    Args:
        data (dict[str, list[Any]]): Data dictionary indexed by feature
            name. Each value must be a list of the same length as the
            residue number feature. Must include a residue number key.
        residue_index_feature_name (str, optional): The name of the feature
            that contains the residue number. Defaults to "residue_number".

    Raises:
        ValueError: If the residue number feature is not in the data.

    Returns:
        CustomSiteData: A CustomSiteData object that contains the data.
    """
    if residue_index_feature_name not in data:
        raise ValueError("CustomSiteData must have a residue_number key.")
    return cls(data[residue_index_feature_name], data)
table()

Return a dictionary of the data in the CustomSiteData object.

Returns:

dict[str, list[Any]]: A dictionary of the data in the CustomSiteData object.

Source code in procaliper/site_metadata/custom_site_data.py
def table(self) -> dict[str, list[Any]]:
    """Return a dictionary of the data in the CustomSiteData object.

    Returns:
        dict[str, list[Any]]: A dictionary of the data in the CustomSiteData
            object.
    """
    return {k: getattr(self, k) for k in self.keys}

uniprot_site_parsing

SiteAnnotations

Class for parsing and storing UniProt site annotations.

An example of a UniProt site annotation:

DISULFID 28..87; /evidence="ECO:0000255|PROSITE-ProRule:PRU00114"; DISULFID 105; /note="Interchain (with heavy chain)"

Attributes:

residue_letter (list[str]): A list of amino acid letters.

residue_number (list[int]): A list of residue numbers.

binding (list[bool]): A list of booleans indicating whether a residue is a binding site.

active (list[bool]): A list of booleans indicating whether a residue is an active site.

ptm (list[bool]): A list of booleans indicating whether a residue is reported to be post-translationally modified.

dna_binding (list[bool]): A list of booleans indicating whether a residue is a DNA binding site.

disulfide_bond (list[bool]): A list of booleans indicating whether a residue is a disulfide bond.

helix (list[bool]): A list of booleans indicating whether a residue is in a helix.

turn (list[bool]): A list of booleans indicating whether a residue is in a turn.

beta_strand (list[bool]): A list of booleans indicating whether a residue is in a beta strand.

binding_data (list[dict[str, str]]): A list of dictionaries containing binding site metadata.

active_data (list[dict[str, str]]): A list of dictionaries containing active site metadata.

ptm_data (list[dict[str, str]]): A list of dictionaries containing post-translationally modified site metadata.

regions (dict[str, list[int]]): A dictionary mapping region names to lists of (zero-indexed) residue numbers.

region_data (dict[str, dict[str, str]]): A dictionary mapping region names to annotation data.

domains (dict[str, list[int]]): A dictionary mapping domain names to lists of (zero-indexed) residue numbers.

domain_data (dict[str, dict[str, str]]): A dictionary mapping domain names to annotation data.


view

ngl_scheme(data, float_to_hex=None, two_sided=False)

Converts a list of values to an nglview color scheme.

Parameters:

data (list[float], required): The list of values to convert.

float_to_hex (Callable[[float], str] | None, default None): Function that converts a float to a hex color in the form "#RRGGBB". If None, a default function is used that interpolates between white and green (one-sided) or red and blue (two-sided).

two_sided (bool, default False): Whether to use a two-sided color scheme. If False, data is assumed to contain only positive values.

Returns:

list[tuple[str, str]]: A list of color and residue number tuples that are compatible with nglview.

Source code in procaliper/view/nglview_utils.py
def ngl_scheme(
    data: list[float],
    float_to_hex: Callable[[float], str] | None = None,
    two_sided: bool = False,
) -> list[tuple[str, str]]:
    """Converts a list of values to an nglview color scheme.

    Args:
        data (list[float]): The list of values to convert.
        float_to_hex (Callable[[float], str] | None, optional): Function that
            converts a float to a hex color in the form `"#RRGGBB"`. If `None`,
            a default function is used that interpolates between white and green
            (one-sided) or red and blue (two-sided). Defaults to `None`.
        two_sided (bool, optional): Whether to use a two-sided color scheme. If
            `False`, we assume `data` only contains positive values. Defaults to
            `False`.

    Returns:
        list[tuple[str, str]]: A list of color and residue number tuples that
            are compatible with nglview.
    """
    if float_to_hex is None:
        if two_sided:
            float_to_hex = _default_float_to_hex_rb
        else:
            float_to_hex = _default_float_to_hex

    maxx = max(data)
    # normalize by the largest magnitude (two-sided) or by the maximum (one-sided)
    scale = max(abs(min(data)), abs(maxx)) if two_sided else maxx

    if scale == 0:
        data_scaled = [0.0] * len(data)
    else:
        data_scaled = [x / scale for x in data]

    return [(float_to_hex(x), f"{i+1}") for i, x in enumerate(data_scaled)]
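
A usage sketch, with the import path inferred from the source note above and made-up per-residue values:

from procaliper.view.nglview_utils import ngl_scheme

charges = [0.0, 0.4, -0.2, 1.0]               # hypothetical per-residue values
scheme = ngl_scheme(charges, two_sided=True)  # e.g. [("#rrggbb", "1"), ("#rrggbb", "2"), ...]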

protein_to_nglview(protein)

Generates an nglview widget from a protein that has an associated PDB file.

Must run protein.fetch_pdb first or specify an absolute path to the PDB in protein.pdb_location_absolute.

Parameters:

protein (Protein, required): The protein object to visualize.

Raises:

ValueError: If the PDB location is not set.

Returns:

nglview.NGLWidget: An nglview widget.

Source code in procaliper/view/nglview_utils.py
def protein_to_nglview(protein: Protein) -> nglview.NGLWidget:
    """Generates an nglview widget from a protein that has an associated PDB file.

    Must run `protein.fetch_pdb` first or specify an absolute path to the PDB
    in `protein.pdb_location_absolute`.

    Args:
        protein (Protein): The protein object to visualize.

    Raises:
        ValueError: If the PDB location is not set.

    Returns:
        nglview.NGLWidget: an nglview widget
    """
    if not protein.pdb_location_absolute:
        raise ValueError("PDB location not set; use `fetch_pdb` first")
    return nglview.show_file(protein.pdb_location_absolute)
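
A usage sketch for a notebook session, assuming `protein` is an existing procaliper Protein whose PDB file has already been fetched (see the docstring above):

from procaliper.view.nglview_utils import protein_to_nglview

widget = protein_to_nglview(protein)   # `protein` is assumed to have pdb_location_absolute set
widget   # rendering the widget as the last expression in a cell displays the structure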
