I used Claude to document the code (for reviewing purposes).
It is not tested, but I thought I'd leave it here in case it is considered useful.
import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd

from ibaqpy.ibaq.ibaqpy_commons import load_feature, load_sdrf
from ibaqpy.ibaq.utils import (
    apply_batch_correction,
    compute_pca,
    fill_samples,
    filter_missing_value_by_group,
    folder_retrieval,
    generate_meta,
    get_batch_info_from_sample_names,
    impute_missing_values,
    iterative_outlier_removal,
    plot_pca,
    remove_single_sample_batches,
    split_df_by_column,
)

# Set up logging
logging.basicConfig(
    format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG
)
logger = logging.getLogger(__name__)


class Combiner:
    """
    A class for combining and processing proteomics data from multiple sources.

    This class handles data loading, imputation, outlier removal, and batch
    correction for proteomics data, particularly focusing on iBAQ
    (intensity-based absolute quantification) results.
    """

    def __init__(self, data_folder: os.PathLike, covariate: str = None, organism: str = "HUMAN"):
        """
        Initialize the Combiner object.

        Args:
            data_folder (os.PathLike): Path to the folder containing the data files.
            covariate (str, optional): The covariate to use for analysis. Defaults to None.
            organism (str, optional): The organism to filter for. Defaults to "HUMAN".
        """
        self.data_folder = Path(data_folder)
        if not self.data_folder.exists() or not self.data_folder.is_dir():
            raise FileNotFoundError(f"Data folder {self.data_folder} does not exist!")
        self.covariate = covariate
        self.organism = organism

        # Initialize other attributes
        self.df_pca = None
        self.df_corrected = None
        self.df_filtered_outliers = None
        self.batch_index = None
        self.samples_number = None
        self.datasets = None
        self.df = None
        self.metadata = None

        # Load and process data
        self._load_data()
    def _load_data(self):
        """Load and process SDRF and iBAQ data from the specified data folder."""
        logger.info("Combining SDRFs and ibaq results ...")
        files = folder_retrieval(str(self.data_folder))

        # Load and combine metadata
        self.metadata = pd.concat([generate_meta(load_sdrf(sdrf)) for sdrf in files["sdrf"]])
        self.metadata = self.metadata.drop_duplicates()
        self.metadata.index = self.metadata["sample_id"]

        # Load and combine iBAQ data
        self.df = pd.concat([load_feature(ibaq) for ibaq in files["ibaq"]])
        self.df = self.df[self.df["ProteinName"].str.endswith(self.organism)]
        self.df.index = self.df["SampleID"]

        # Join metadata with iBAQ data
        self.df = self.df.join(self.metadata, how="left")

        # Set up additional attributes
        self.samples = self.df.columns.tolist()
        self.proteins = self.df["ProteinName"].unique().tolist()
        self.batch_index = get_batch_info_from_sample_names(self.df.columns)
    def read_data(self, meta: str, ibaq: str, organism="HUMAN", covariate=None):
        """
        Read metadata and iBAQ data from local files.

        Args:
            meta (str): Path to the metadata file.
            ibaq (str): Path to the iBAQ data file.
            organism (str, optional): Organism to filter for. Defaults to "HUMAN".
            covariate (str, optional): Covariate to use for analysis. Defaults to None.
        """
        self.covariate = covariate
        self.df = pd.read_csv(ibaq, index_col=0)
        self.metadata = pd.read_csv(meta)
        self.df = self.df[self.df["ProteinName"].str.endswith(organism)]
        self.df.index = self.df["SampleID"]
        self.metadata = self.metadata.drop_duplicates()
        # Align the metadata index with the iBAQ sample index so the
        # left join matches rows, as in _load_data
        self.metadata.index = self.metadata["sample_id"]
        self.df = self.df.join(self.metadata, how="left")
        # Set up the attributes used downstream, as in _load_data
        self.samples = self.df.columns.tolist()
        self.proteins = self.df["ProteinName"].unique().tolist()
        self.batch_index = get_batch_info_from_sample_names(self.df.columns)
    def imputer(self, covariate_to_keep: list = None):
        """
        Impute missing values in the iBAQ data.

        Args:
            covariate_to_keep (list, optional): List of covariate values to keep. Defaults to None.
        """
        logger.info("Imputing merged ibaq results ...")
        if self.covariate and len(self.metadata[self.covariate].unique()) < 2:
            raise SystemExit(f"{self.covariate} should contain at least two different covariates!")

        # Filter data based on covariate_to_keep
        if covariate_to_keep:
            self.df = self.df[self.df[self.covariate].isin(covariate_to_keep)]

        # Filter out proteins with too many missing values
        self.df = filter_missing_value_by_group(
            self.df, col="ProteinName", non_missing_percent_to_keep=0.3
        )

        # Impute missing values
        if self.covariate:
            df_list = split_df_by_column(self.df, cov_index_col=self.covariate)
            df_list = [fill_samples(df, self.proteins) for df in df_list]
            df_list = impute_missing_values(df_list)
            self.df = pd.concat(df_list, axis=1)
        else:
            self.df = fill_samples(self.df, self.proteins)
            self.df = impute_missing_values(self.df)
        self.datasets = list({sample.split("-")[0] for sample in self.samples})
    def outlier_removal(self, n_components: int = None, min_cluster_size: int = None,
                        min_samples_num: int = None, n_iter: int = None):
        """
        Remove outliers from the imputed data using iterative outlier removal.

        Args:
            n_components (int, optional): Number of PCA components to use. Defaults to None.
            min_cluster_size (int, optional): Minimum cluster size for HDBSCAN. Defaults to None.
            min_samples_num (int, optional): Minimum number of samples for HDBSCAN. Defaults to None.
            n_iter (int, optional): Number of iterations for outlier removal. Defaults to None.
        """
        logger.info("Removing outliers from imputed data ...")

        # Calculate sample numbers per dataset
        batches = [sample.split("-")[0] for sample in self.samples]
        self.samples_number = {dataset: batches.count(dataset) for dataset in self.datasets}
        min_samples = max(round(np.median(list(self.samples_number.values()))), 2)

        # Apply iterative outlier removal
        self.df_filtered_outliers = iterative_outlier_removal(
            self.df,
            self.batch_index,
            n_components=n_components or round(len(set(self.batch_index)) / 3),
            min_cluster_size=min_cluster_size or min_samples,
            min_samples=min_samples_num or min_samples,
            n_iter=n_iter or 5,
        )

        # Compute and plot PCA of the data with outliers removed
        self._compute_and_plot_pca(
            self.df_filtered_outliers,
            "PCA plot of corrected data with outliers removed",
            "pca_corrected_outliers_removed.png",
            n_components,
        )
    def batch_correction(self, n_components: int = None, tissue_parts_to_keep: list = None):
        """
        Apply batch correction to the data.

        Args:
            n_components (int, optional): Number of PCA components to use. Defaults to None.
            tissue_parts_to_keep (list, optional): Tissue parts to keep. Defaults to None.
        """
        logger.info("Applying batch effect correction ...")

        # Plot PCA of uncorrected data
        self._compute_and_plot_pca(self.df, "PCA plot of uncorrected data", "pca_uncorrected.png", n_components)

        # Filter samples based on tissue parts if specified
        if tissue_parts_to_keep:
            self._filter_samples_by_tissue(tissue_parts_to_keep)

        # Prepare data for batch correction
        self._prepare_for_batch_correction()

        # Apply batch correction
        self.df_corrected = apply_batch_correction(
            self.df,
            self.batch_index,
            covs=self.metadata[self.covariate].tolist() if self.covariate else [],
        )

        # Plot PCA of corrected data
        self._compute_and_plot_pca(self.df_corrected, "PCA plot of corrected data", "pca_corrected.png", n_components)
    def _compute_and_plot_pca(self, data, title, output_file, n_components):
        """
        Compute PCA and plot the results.

        Args:
            data (pd.DataFrame): Data to compute PCA on.
            title (str): Title for the PCA plot.
            output_file (str): Filename for saving the PCA plot.
            n_components (int): Number of PCA components to compute.
        """
        n_components = n_components or round(len(set(self.batch_index)) / 3)
        self.df_pca = compute_pca(data.T, n_components=n_components)
        self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0]
        plot_pca(self.df_pca, title=title, output_file=output_file)
    def _filter_samples_by_tissue(self, tissue_parts_to_keep):
        """
        Filter samples based on specified tissue parts.

        Args:
            tissue_parts_to_keep (list): Tissue parts to keep.
        """
        self.metadata = self.metadata[self.metadata["tissue_part"].isin(tissue_parts_to_keep)]
        samples_to_keep = self.metadata["sample_id"].tolist()
        self.df = self.df[[s for s in self.df.columns if s in samples_to_keep]]
    def _prepare_for_batch_correction(self):
        """Prepare data for batch correction by removing single-sample batches and aligning metadata."""
        self.batch_index = get_batch_info_from_sample_names(self.df.columns.tolist())
        self.df = remove_single_sample_batches(self.df, self.batch_index)
        columns = self.df.columns.tolist()
        self.metadata = self.metadata[self.metadata["sample_id"].isin(columns)]
        self.metadata = self.metadata.reset_index(drop=True)
        self.metadata = (
            self.metadata.set_index("sample_id").reindex(columns, axis=0).reset_index()
        )
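
For context, here is a minimal usage sketch. Like the class itself it is untested, and the folder path, covariate name, and call order are my assumptions, not something the code prescribes:

# Hypothetical end-to-end run (untested); "./ibaq_results" and the
# "tissue" covariate are made-up examples.
combiner = Combiner(data_folder="./ibaq_results", covariate="tissue", organism="HUMAN")
combiner.imputer()           # filter sparse proteins, impute missing values
combiner.outlier_removal()   # iterative PCA/HDBSCAN outlier removal, saves a PCA plot
combiner.batch_correction()  # batch-correct and save before/after PCA plots
print(combiner.df_corrected.head())  # batch-corrected intensity matrix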