-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate-TPM.R
35 lines (28 loc) · 1.19 KB
/
generate-TPM.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
library(biomaRt)
library(tximport)
library(GenomicFeatures)
# output from salmon
files <- list.files(path = './quants/', pattern = 'SRR')
files_import <- paste0('./quants/', files, "/quant.sf")
# mapping of transcripts to genes
txdb <- makeTxDbFromGFF("~/software/GRCh38/gencode.v40.annotation.gtf.gz")
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- ensembldb::select(txdb, k, "GENEID", "TXNAME")
# read salmon output
mat_gse <- tximport(files_import,
type = "salmon",
tx2gene = tx2gene,
ignoreAfterBar = TRUE)
TPM.mat = as.data.frame(mat_gse$abundance)
colnames(TPM.mat) = samples$Run
# get gene names
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genes = rownames(TPM.mat)
genes = stringr::str_split_fixed(genes, pattern = '\\.', n=2)[,1]
gene_IDs <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol"),
values = genes, mart= mart)
rownames(gene_IDs) = gene_IDs$ensembl_gene_id
TPM.mat$ensembl_gene_id = rownames(TPM.mat)
TPM.mat$gene = gene_IDs[genes, 'hgnc_symbol']
TPM.mat = TPM.mat[, c('ensembl_gene_id', 'gene', samples$Run)]
write.csv2(TPM.mat, file = './TPM.csv', quote = F)