-
Notifications
You must be signed in to change notification settings - Fork 9
/
dump_case_info.py
102 lines (90 loc) · 4.13 KB
/
dump_case_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Imports and rpy2 start-up configuration.
#
# The order of the statements below is deliberate: configuration calls are
# interleaved with the imports so that each one runs before the import it is
# meant to affect.
# NOTE(review): np, robjects, dump and load are not referenced anywhere in the
# visible script.
import os
import warnings
from argparse import ArgumentParser
from glob import glob
# Ignore FutureWarning messages attributed to rpy2.robjects.pandas2ri. The
# filter is installed here so it is already active when rpy2 is imported below.
warnings.filterwarnings('ignore', category=FutureWarning,
module='rpy2.robjects.pandas2ri')
import numpy as np
import pandas as pd
import rpy2.rinterface_lib.embedded as r_embedded
# Command-line style options handed to the embedded R (quiet start-up, no
# workspace save, larger --max-ppsize). Set ahead of the rpy2.robjects import,
# presumably because R is initialised during that import -- confirm against
# the rpy2 documentation.
r_embedded.set_initoptions(
('rpy2', '--quiet', '--no-save', '--max-ppsize=500000'))
import rpy2.robjects as robjects
from joblib import dump, load
from rpy2.robjects import numpy2ri, pandas2ri
from rpy2.robjects.packages import importr
from tabulate import tabulate
# Switch on rpy2's numpy and pandas converters globally; the rest of the
# script treats the result of Biobase's pData() as a pandas DataFrame.
numpy2ri.activate()
pandas2ri.activate()
# Command-line interface.
#
# --data-dir      directory searched for the *_eset.rds input files
# --results-dir   directory under which the resp/ and surv/ tables are written
# --resp-sort-by  column(s) used to order the printed drug response table
# --surv-sort-by  column(s) used to order the printed survival table
#
# Both tables share the first four columns; the response table additionally
# offers the per-class case counts as sort keys.
base_sort_columns = ['Cancer', 'Analysis', 'Target', 'Data Type']
resp_sort_choices = base_sort_columns + [
    'Num Cases', 'NR (-) Cases', 'R (+) Cases']
surv_sort_choices = base_sort_columns + ['Num Cases']

parser = ArgumentParser()
parser.add_argument('--data-dir', default='data', type=str, help='data dir')
parser.add_argument('--results-dir', default='results/models', type=str,
                    help='results dir')
parser.add_argument(
    '--resp-sort-by', nargs='+', type=str,
    choices=resp_sort_choices,
    # Separate copy per option so the two defaults never share one list.
    default=list(base_sort_columns),
    help='Drug response columns to sort by')
parser.add_argument(
    '--surv-sort-by', nargs='+', type=str,
    choices=surv_sort_choices,
    default=list(base_sort_columns),
    help='Survival columns to sort by')
args = parser.parse_args()

# Handles to the R packages used below: base supplies readRDS() and Biobase
# supplies pData(). Loaded only after argument parsing has succeeded.
r_base = importr('base')
r_biobase = importr('Biobase')
# ---------------------------------------------------------------------------
# Drug response ("resp") case summary.
#
# Every file matching <data-dir>/tcga_*_resp_*_eset.rds is read with R's
# readRDS(), its sample metadata is extracted with Biobase's pData(), and one
# row is recorded per file: cancer, analysis, target, data type, number of
# unique cases, and the number of unique cases in each of the two 'Class'
# groups. The table is written to <results-dir>/resp/resp_case_info.tsv and
# printed sorted by --resp-sort-by.
# ---------------------------------------------------------------------------
results = []
eset_files = sorted(glob('{}/tcga_*_resp_*_eset.rds'.format(args.data_dir)))
num_esets = len(eset_files)
for eset_idx, eset_file in enumerate(eset_files):
    file_basename = os.path.splitext(os.path.split(eset_file)[1])[0]
    # The first five underscore-separated fields of the file name carry the
    # metadata; anything after them is ignored. A name with fewer than five
    # fields raises ValueError here.
    _, cancer, analysis, target, data_type, *_ = file_basename.split('_')
    # '\r' keeps the progress counter on a single, continuously rewritten line.
    print('Loading {:d}/{:d} esets'.format(eset_idx + 1, num_esets), end='\r',
          flush=True)
    eset = r_base.readRDS(eset_file)
    sample_meta = r_biobase.pData(eset)
    num_cases = sample_meta['case_submitter_id'].nunique()
    # Unique cases per 'Class' group, in sorted group-key order.
    class_case_counts = (sample_meta.groupby('Class')['case_submitter_id']
                         .nunique())
    if len(class_case_counts) != 2:
        # Unpacking below needs exactly two groups. Fail with a message that
        # names the offending file instead of a bare "values to unpack" error.
        raise ValueError(
            '{}: expected exactly 2 Class groups, found {:d}: {}'.format(
                eset_file, len(class_case_counts),
                class_case_counts.index.tolist()))
    # Assumes the non-responder label sorts before the responder label
    # (e.g. 0 < 1) -- TODO confirm against the eset 'Class' coding.
    neg_cases, pos_cases = class_case_counts
    cancer = cancer.upper()
    target = target.title()
    # Any data type other than 'kraken' or 'htseq' is reported as 'Combo'.
    data_type = ('Microbiome' if data_type == 'kraken' else
                 'Expression' if data_type == 'htseq' else
                 'Combo')
    results.append([cancer, analysis, target, data_type, num_cases, neg_cases,
                    pos_cases])
results_df = pd.DataFrame(results, columns=[
    'Cancer', 'Analysis', 'Target', 'Data Type', 'Num Cases', 'NR (-) Cases',
    'R (+) Cases'])
out_dir = '{}/resp'.format(args.results_dir)
os.makedirs(out_dir, mode=0o755, exist_ok=True)
# The TSV keeps file (glob) order; only the printed table is re-sorted.
results_df.to_csv('{}/resp_case_info.tsv'.format(out_dir), sep='\t',
                  index=False)
print(tabulate(results_df.sort_values(by=args.resp_sort_by), headers='keys',
               showindex=False))
# Blank line between the response table and the survival table that follows.
print()
# ---------------------------------------------------------------------------
# Survival ("surv") case summary.
#
# Every file matching <data-dir>/tcga_*_surv_*_eset.rds is read with R's
# readRDS(), its sample metadata is extracted with Biobase's pData(), and one
# row is recorded per file: cancer, analysis, target, data type and number of
# unique cases. The table is written to <results-dir>/surv/surv_case_info.tsv
# and printed sorted by --surv-sort-by.
# ---------------------------------------------------------------------------
# File-name data type -> label shown in the table; anything else is 'Combo'.
data_type_labels = {'kraken': 'Microbiome', 'htseq': 'Expression'}

surv_rows = []
surv_paths = sorted(glob('{}/tcga_*_surv_*_eset.rds'.format(args.data_dir)))
path_total = len(surv_paths)
for ordinal, rds_path in enumerate(surv_paths, start=1):
    stem = os.path.splitext(os.path.basename(rds_path))[0]
    # Only the first five underscore-separated fields are used.
    _, cancer, analysis, target, data_type, *_ = stem.split('_')
    # '\r' keeps the progress counter on a single, continuously rewritten line.
    print('Loading {:d}/{:d} esets'.format(ordinal, path_total), end='\r',
          flush=True)
    pheno = r_biobase.pData(r_base.readRDS(rds_path))
    surv_rows.append([
        cancer.upper(),
        analysis,
        target.upper(),
        data_type_labels.get(data_type, 'Combo'),
        pheno['case_submitter_id'].nunique(),
    ])

surv_table = pd.DataFrame(surv_rows, columns=[
    'Cancer', 'Analysis', 'Target', 'Data Type', 'Num Cases'])
surv_dir = '{}/surv'.format(args.results_dir)
os.makedirs(surv_dir, mode=0o755, exist_ok=True)
# The TSV keeps file (glob) order; only the printed table is re-sorted.
surv_table.to_csv('{}/surv_case_info.tsv'.format(surv_dir), sep='\t',
                  index=False)
sorted_surv_table = surv_table.sort_values(by=args.surv_sort_by)
print(tabulate(sorted_surv_table, headers='keys', showindex=False))