Skip to content

Commit

Permalink
upgraded Python version to >=3.10,<4.0 in pyproject.toml, poetry.lock…
Browse files Browse the repository at this point in the history
… and databases-ci; added biotite package to dependencies (#63)

* upgraded Python version to >=3.10,<4.0 in pyproject.toml, poetry.lock, and databases-ci; added biotite package to dependencies

* added ClustalOmegaAligner

* adding whole KLIFS pocket mapping to klifs and kinase_schema modules

* added try_except_substraction for whole KLIFS pocket matching algo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* overwriting conflict attributable to pre-commit.ci

* added openpyxl via Poetry

* added PKIS2 Km, ATP data

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added pydantic to test_env.yaml in devtools

* added git to test_env.yaml in devtools

* corrected git to GitPython in test_env.yaml in devtools

* removed unused imports

* chmod -x klifs pocket notebook

* added error handling for get_sequence_max_with_exception and kinase_info.KLIFS2UniProtSeq[region] TypeError; removed KLIFSPocket remnants

* added upsetplot and panel for plotting

* added PKIS heatmap workbook

* removed extraneous UpSet plotting

* added try_except_middle_int to extract single missense AA change from Mutations

* added plot module for SequenceAlignment

* added API client for Protvar

* updated all pre-processing for Broad ML in Drug Discovery poster in notebooks/pkis2_km_atp.ipynb

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added ESM-2 PKIS scripts to ML package; sub-package integration ongoing

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixed flake8 issues

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixed final flake8 errors in esm2 utils

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
jessicaw9910 and pre-commit-ci[bot] authored Nov 12, 2024
1 parent bdea8c6 commit 04ea845
Show file tree
Hide file tree
Showing 21 changed files with 3,358 additions and 114 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/databases-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
os: [macOS-latest, ubuntu-latest, windows-latest]
python-version: [3.9, "3.10", "3.11"]
python-version: ["3.10", "3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
Binary file added data/3. PKIS Nanosyn Assay Heatmaps.xlsx
Binary file not shown.
2 changes: 2 additions & 0 deletions devtools/conda-envs/test_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ dependencies:
- beautifulsoup4
- numpy
- biopython
- pydantic
- GitPython
Original file line number Diff line number Diff line change
@@ -1,20 +1,67 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass

from Bio import Align

class CustomAligner(ABC):
    """Abstract base class that every sequence aligner derives from."""

    substitution_matrix: str = "BLOSUM62"
    """str: Name of the substitution matrix to use. Defaults to BLOSUM62."""

    @abstractmethod
    def align(self, *args, **kwargs):
        """Align sequences; concrete aligners must override this method."""


@dataclass
class ClustalOmegaAligner(CustomAligner):
    """ClustalOmega aligner class for multiple sequence alignments (need to initialize with list of sequences)."""

    list_sequences: list[str]
    """list[str]: List of sequences to align."""
    path_bin: str = "/usr/local/bin/clustalo"
    """str: Path to clustalo binary. Default is "/usr/local/bin/clustalo"."""

    def __post_init__(self) -> None:
        """Build the substitution matrix, convert the inputs and run the alignment.

        Sets ``self.alphabet`` and ``self.matrix_substitution``, replaces
        ``self.list_sequences`` with ``ProteinSequence`` objects, and then
        calls :meth:`align`, so the alignment is executed at construction time.
        """
        # biotite is imported inside the method rather than at module level
        from biotite.sequence import ProteinSequence, align

        self.alphabet = ProteinSequence.alphabet
        # matrix is looked up by name from the inherited `substitution_matrix` attribute
        self.matrix_substitution = align.SubstitutionMatrix(
            self.alphabet, self.alphabet, self.substitution_matrix
        )
        # NOTE: after this line `list_sequences` no longer holds plain `str` values
        self.list_sequences = [ProteinSequence(seq) for seq in self.list_sequences]
        self.align()

    def align(self) -> None:
        """Run ClustalOmega on ``self.list_sequences`` and store the result.

        Nothing is returned; the outcome is stored on the instance as
        ``self.alignments`` (result of ``app.get_alignment()``) and
        ``self.list_alignments`` (result of ``get_gapped_sequences()``).
        """
        from biotite.application import clustalo

        app = clustalo.ClustalOmegaApp(
            self.list_sequences, self.path_bin, self.matrix_substitution
        )

        # start the application, then wait for it before reading the alignment
        app.start()
        app.join()
        self.alignments = app.get_alignment()
        self.list_alignments = self.alignments.get_gapped_sequences()


@dataclass
class CustomAligner:
class BioAligner(CustomAligner):
"""BioPython aligner class for aligning sequences. Initialized without sequences"""

from Bio import Align

mode: str = "local"
"""str: Alignment mode. Default is "local"."""
substitution_matrix: str = "BLOSUM62"
"""str: Substitution matrix. Default is BLOSUM62."""
gap_score: int = -5
"""int: Gap score. Default is -5."""
extend_gap_score: int = -1
"""int: Gap extension score. Default is -1."""

def __post_init__(self):
from Bio import Align

self.aligner = Align.PairwiseAligner()
self.aligner.mode = self.mode
self.aligner.substitution_matrix = Align.substitution_matrices.load(
Expand All @@ -28,7 +75,7 @@ def align(self, seq1: str, seq2: str) -> Align.MultipleSeqAlignment:


@dataclass
class BL2UniProtAligner(CustomAligner):
class BL2UniProtAligner(BioAligner):
mode: str = "global"
"""str: Alignment mode. Default is "global."""

Expand All @@ -37,7 +84,7 @@ def __post_init__(self):


@dataclass
class Kincore2UniProtAligner(CustomAligner):
class Kincore2UniProtAligner(BioAligner):
mode: str = "local"
"""str: Alignment mode. Default is "local."""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,10 @@ def get_mutations(self):


# TODO: implement clinical annotations class


def try_except_middle_int(str_in):
try:
return int(str_in[1:-1])
except ValueError:
return None
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import logging
import os
from enum import Enum, StrEnum
from itertools import chain

import pandas as pd
from pydantic import BaseModel, ValidationError, constr, model_validator
from typing_extensions import Self

from missense_kinase_toolkit.databases import klifs
from missense_kinase_toolkit.databases.aligners import ClustalOmegaAligner
from missense_kinase_toolkit.databases.kincore import (
align_kincore2uniprot,
extract_pk_fasta_info_as_dict,
Expand Down Expand Up @@ -85,10 +87,9 @@ class Family(Enum):
KinaseDomainName = StrEnum(
"KinaseDomainName", {"KD" + str(idx + 1): kd for idx, kd in enumerate(LIST_PFAM_KD)}
)

UniProtSeq = constr(pattern=r"^[ACDEFGHIKLMNPQRSTVWXY]+$")
SeqUniProt = constr(pattern=r"^[ACDEFGHIKLMNPQRSTVWXY]+$")
"""Pydantic model for UniProt sequence constraints."""
KLIFSPocket = constr(pattern=r"^[ACDEFGHIKLMNPQRSTVWY\-]{85}$")
SeqKLIFS = constr(pattern=r"^[ACDEFGHIKLMNPQRSTVWY\-]{85}$")
"""Pydantic model for KLIFS pocket sequence constraints."""
UniProtID = constr(pattern=r"^[A-Z][0-9][A-Z0-9]{3}[0-9]$")
"""Pydantic model for UniProt ID constraints."""
Expand All @@ -107,7 +108,7 @@ class KinHub(BaseModel):
class UniProt(BaseModel):
"""Pydantic model for UniProt information."""

canonical_seq: UniProtSeq
canonical_seq: SeqUniProt


class KLIFS(BaseModel):
Expand All @@ -120,7 +121,7 @@ class KLIFS(BaseModel):
family: Family
iuphar: int
kinase_id: int
pocket_seq: KLIFSPocket | None
pocket_seq: SeqKLIFS | None


class Pfam(BaseModel):
Expand All @@ -137,7 +138,7 @@ class Pfam(BaseModel):
class KinCore(BaseModel):
"""Pydantic model for KinCore information."""

seq: UniProtSeq
seq: SeqUniProt
start: int | None
end: int | None
mismatch: list[int] | None
Expand All @@ -154,7 +155,8 @@ class KinaseInfo(BaseModel):
Pfam: Pfam | None
KinCore: KinCore | None
bool_offset: bool = True
KLIFS2UniProt: dict[str, int] | None = None
KLIFS2UniProtIdx: dict[str, int | None] | None = None
KLIFS2UniProtSeq: dict[str, str | None] | None = None

# https://docs.pydantic.dev/latest/examples/custom_validators/#validating-nested-model-fields
@model_validator(mode="after")
Expand All @@ -174,17 +176,14 @@ def change_wrong_klifs_pocket_seq(self) -> Self:
# https://stackoverflow.com/questions/68082983/validating-a-nested-model-in-pydantic
# skip if other validation errors occur in nested models first
@model_validator(mode="after")
@classmethod
def validate_uniprot_length(cls, values):
def validate_uniprot_length(self) -> Self:
"""Validate canonical UniProt sequence length matches Pfam length if Pfam not None."""
pfam = values.Pfam
uniprot = values.UniProt
if pfam is not None:
if len(uniprot.canonical_seq) != pfam.protein_length:
if self.Pfam is not None:
if len(self.UniProt.canonical_seq) != self.Pfam.protein_length:
raise ValidationError(
"UniProt sequence length does not match Pfam protein length."
)
return values
return self

@model_validator(mode="after")
def generate_klifs2uniprot_dict(self) -> Self:
Expand All @@ -204,9 +203,8 @@ def generate_klifs2uniprot_dict(self) -> Self:
)

if temp_obj.list_align is not None:
self.KLIFS2UniProt = dict(
zip(klifs.LIST_KLIFS_REGION, temp_obj.list_align)
)
self.KLIFS2UniProtIdx = temp_obj.KLIFS2UniProtIdx
self.KLIFS2UniProtSeq = temp_obj.KLIFS2UniProtSeq

return self

Expand Down Expand Up @@ -528,6 +526,111 @@ def create_kinase_models_from_df(
return dict_kinase_models


def get_sequence_max_with_exception(list_in: list[int | None]) -> int:
"""Get maximum sequence length from dictionary of dictionaries.
Parameters
----------
dict_in : dict[str, dict[str, str | None]]
Dictionary of dictionaries.
Returns
-------
int
Maximum sequence length.
"""
try:
return max(list_in)
except ValueError:
return 0


def replace_none_with_max_len(dict_in):
    """Replace None sequences with gap strings, modifying ``dict_in`` in place.

    For every outer key, the longest non-None value of its inner dictionary
    sets the gap length; each None value in that inner dictionary is then
    replaced by ``"-"`` repeated that many times.

    Parameters
    ----------
    dict_in : dict
        Dictionary of dictionaries whose inner values are strings or None.

    Returns
    -------
    dict
        The same ``dict_in`` object after the in-place replacement.
    """
    # first pass: work out the gap length for every region before mutating anything
    max_len_by_region = {}
    for region, seq_by_hgnc in dict_in.items():
        lengths = [len(seq) for seq in seq_by_hgnc.values() if seq is not None]
        max_len_by_region[region] = get_sequence_max_with_exception(lengths)

    # second pass: overwrite the None entries
    for region, max_len in max_len_by_region.items():
        seq_by_hgnc = dict_in[region]
        for hgnc in [key for key, seq in seq_by_hgnc.items() if seq is None]:
            seq_by_hgnc[hgnc] = "-" * max_len

    return dict_in


def align_inter_intra_region(
    dict_in: dict[str, KinaseInfo],
) -> dict[str, dict[str, str]]:
    """Align the inter- and intra-region sequences across kinases.

    Parameters
    ----------
    dict_in : dict[str, KinaseInfo]
        Kinase information models keyed by the same identifiers used as the
        inner keys of the returned dictionary.

    Returns
    -------
    dict[str, dict[str, str]]
        For every region, a mapping from kinase key to its (aligned) sequence;
        kinases without a sequence for a region receive a gap string.
    """
    regions = klifs.LIST_INTER_REGIONS + klifs.LIST_INTRA_REGIONS

    # every region starts with a None placeholder for every kinase
    dict_align = {region: dict.fromkeys(dict_in) for region in regions}

    for region in regions:
        seq_by_hgnc = {}
        for hgnc, kinase_info in dict_in.items():
            try:
                seq = kinase_info.KLIFS2UniProtSeq[region]
            except TypeError:
                # KLIFS2UniProtSeq is not subscriptable (e.g. None) for this kinase
                continue
            if seq is not None:
                seq_by_hgnc[hgnc] = seq

        if len(seq_by_hgnc) > 2:
            aligner_temp = ClustalOmegaAligner(list(seq_by_hgnc.values()))
            dict_align[region].update(zip(seq_by_hgnc, aligner_temp.list_alignments))
        else:
            # two or fewer sequences are kept unaligned, e.g.
            # hinge:linker - {'ATR': 'N', 'CAMKK1': 'L'}
            # αE:VI - {'MKNK1': 'DKVSLCHLGWSAMAPSGLTAAPTSLGSSDPPTSASQVAGTT'}
            dict_align[region].update(seq_by_hgnc)

    replace_none_with_max_len(dict_align)

    return dict_align


def reverse_order_dict_of_dict(
dict_in: dict[str, dict[str, str | int | None]],
) -> dict[str, dict[str, str | int | None]]:
"""Reverse order of dictionary of dictionaries.
Parameters
----------
dict_in : dict[str, dict[str, str | int | None]]
Dictionary of dictionaries
Returns
-------
dict_out : dict[str, dict[str, str | int | None]]
Dictionary of dictionaries with reversed order
"""
dict_out = {
key1: {key2: dict_in[key2][key1] for key2 in dict_in.keys()}
for key1 in set(chain(*[list(j.keys()) for j in dict_in.values()]))
}
return dict_out


# # NOT IN USE - USE TO GENERATE ABOVE

# import numpy as np
Expand All @@ -554,7 +657,7 @@ def create_kinase_models_from_df(
# df_pivot = pd.DataFrame(df_kinhub[["Family", "SubFamily"]].value_counts()).reset_index().pivot(columns="Family", index="SubFamily", values="count")
# df_pivot.loc[df_pivot.index.isin([key for key, val in dict_subfamily.items() if val >= 5]),].dropna(axis=1, how="all")

# # kinase_schema.UniProtSeq
# # kinase_schema.SeqUniProt
# "".join(sorted(list(set(chain.from_iterable(df_uniprot["canonical_sequence"].apply(lambda x: list(x)).tolist())))))

# # kinase_schema.SeqKLIFS
Expand Down Expand Up @@ -594,3 +697,51 @@ def create_kinase_models_from_df(
# .apply(lambda x: "".join(x) == "domain"), "name"]
# .tolist()
# )
#
# USED FOR INTER MAPPING ASSESSMENT
# dict_kinase = create_kinase_models_from_df()

# dict_klifs = {i: j for i, j in dict_kinase.items() if \
# (j.KLIFS is not None and j.KLIFS.pocket_seq is not None)}
# df_klifs_idx = pd.DataFrame([list(j for i, j in val.KLIFS2UniProt.items()) for key, val in dict_klifs.items()],
# columns=klifs.LIST_KLIFS_REGION, index=dict_klifs.keys())

# list_region = list(klifs.DICT_POCKET_KLIFS_REGIONS.keys())

# dict_start_end = {list_region[i-1]:list_region[i] for i in range(1, len(list_region)-1)}
# dict_cols = {key: list(i for i in df_klifs_idx.columns.tolist() \
# if i.split(":")[0] == key) for key in list_region}

# list_inter = []
# for key, val in dict_start_end.items():

# list_temp = []
# for idx, row in df_klifs_idx.iterrows():

# cols_start, cols_end = dict_cols[key], dict_cols[val]

# start = row.loc[cols_start].values
# if np.all(np.isnan(start)):
# max_start = None
# else:
# max_start = np.nanmax(start) + 1

# end = row.loc[cols_end].values
# if np.all(np.isnan(end)):
# min_end = None
# else:
# min_end = np.nanmin(end)

# list_temp.append((max_start, min_end))

# list_inter.append(list_temp)

# df_inter = pd.DataFrame(list_inter,
# index=[f"{key}:{val}" for key, val in dict_start_end.items()],
# columns=df_klifs_idx.index).T
# df_length = df_inter.map(lambda x: try_except_substraction(x[0], x[1]))

# df_multi = df_length.loc[:, df_length.apply(lambda x: any(x > 0))]
# # BUB1B has 1 residue in b.l intra region that was
# # previously captured in αC:b.l since flanked by None
# list_cols = [i for i in df_multi.columns if i != "αC:b.l"]
Loading

0 comments on commit 04ea845

Please sign in to comment.