From 29191363d6554b2214045b27d4e123b065aa502b Mon Sep 17 00:00:00 2001 From: Baigal Date: Tue, 24 Nov 2020 02:00:10 -0600 Subject: [PATCH 01/15] Revert "update from head" From 4088fe8cf78903483f47a291ee408fcc652c9ac9 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Tue, 24 Nov 2020 03:12:36 -0500 Subject: [PATCH 02/15] fixed merge_h5 bugs --- MAESTRO/scATAC_H5Process.py | 110 ++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/MAESTRO/scATAC_H5Process.py b/MAESTRO/scATAC_H5Process.py index 45c7f53..bdedbc6 100644 --- a/MAESTRO/scATAC_H5Process.py +++ b/MAESTRO/scATAC_H5Process.py @@ -24,31 +24,31 @@ def mtxtoh5_parser(subparsers): Add main function mtx-to-h5 argument parsers. """ - workflow = subparsers.add_parser("mtx-to-h5", + workflow = subparsers.add_parser("mtx-to-h5", help = "Convert 10X mtx format matrix to HDF5 format.") group_input = workflow.add_argument_group("Input files arguments") - group_input.add_argument("--type", dest = "datatype", default = "Peak", - choices = ["Peak", "Gene"], + group_input.add_argument("--type", dest = "datatype", default = "Peak", + choices = ["Peak", "Gene"], help = "Type of the count matrix (Peak for scATAC-seq and Gene for scRNA-seq). DEFAULT: Peak.") - group_input.add_argument("--matrix", dest = "matrix", default = "matrix.mtx", + group_input.add_argument("--matrix", dest = "matrix", default = "matrix.mtx", help = "Location of .mtx formatted matrix file. DEFAULT: matrix.mtx.") - group_input.add_argument("--feature", dest = "feature", default = "features.tsv", + group_input.add_argument("--feature", dest = "feature", default = "features.tsv", help = "Location of feature file (required for the format of 'mtx'). Features correspond to row indices of count matrix. " "If the type is Peak, please provide the peak bed file with 3 columns. " "If the type is Gene, each row should include gene name. DEFAULT: features.tsv.") group_input.add_argument("--gene-column", dest = "gene_column", default = 2, type = int, help = "If the type is 'Peak', please specify which column of the feature file to use for gene names. DEFAULT: 2.") - group_input.add_argument("--barcode", dest = "barcode", default = "barcodes.tsv", + group_input.add_argument("--barcode", dest = "barcode", default = "barcodes.tsv", help = "Location of barcode file. Cell barcodes correspond to column indices of count matrix. DEFAULT: barcodes.tsv. ") - group_input.add_argument("--species", dest = "species", default = "GRCh38", - choices = ["GRCh38", "GRCm38"], type = str, + group_input.add_argument("--species", dest = "species", default = "GRCh38", + choices = ["GRCh38", "GRCm38"], type = str, help = "Species (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") - + group_output = workflow.add_argument_group("Output arguments") - group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", + group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", help = "Path to the directory where the result file shall be stored. DEFAULT: MAESTRO.") - group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", + group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", help = "Prefix of output files. DEFAULT: MAESTRO.") def mtxtocount_parser(subparsers): @@ -56,31 +56,31 @@ def mtxtocount_parser(subparsers): Add main function mtx-to-count argument parsers. """ - workflow = subparsers.add_parser("mtx-to-count", + workflow = subparsers.add_parser("mtx-to-count", help = "Convert 10X mtx format matrix to plain text count table.") group_input = workflow.add_argument_group("Input files arguments") - group_input.add_argument("--type", dest = "datatype", default = "Peak", - choices = ["Peak", "Gene"], + group_input.add_argument("--type", dest = "datatype", default = "Peak", + choices = ["Peak", "Gene"], help = "Type of the count matrix (Peak for scATAC-seq and Gene for scRNA-seq). DEFAULT: Peak.") - group_input.add_argument("--matrix", dest = "matrix", default = "matrix.mtx", + group_input.add_argument("--matrix", dest = "matrix", default = "matrix.mtx", help = "Location of .mtx formatted matrix file. DEFAULT: matrix.mtx.") - group_input.add_argument("--feature", dest = "feature", default = "features.tsv", + group_input.add_argument("--feature", dest = "feature", default = "features.tsv", help = "Location of feature file (required for the format of 'mtx'). Features correspond to row indices of count matrix. " "If the type is Peak, please provide the peak bed file with 3 columns. " "If the type is Gene, each row should include gene name. DEFAULT: features.tsv.") group_input.add_argument("--gene-column", dest = "gene_column", default = 2, type = int, help = "If the type is 'Peak', please specify which column of the feature file to use for gene names. DEFAULT: 2.") - group_input.add_argument("--barcode", dest = "barcode", default = "barcodes.tsv", + group_input.add_argument("--barcode", dest = "barcode", default = "barcodes.tsv", help = "Location of barcode file. Cell barcodes correspond to column indices of count matrix. DEFAULT: barcodes.tsv. ") - group_input.add_argument("--species", dest = "species", default = "GRCh38", - choices = ["GRCh38", "GRCm38"], type = str, + group_input.add_argument("--species", dest = "species", default = "GRCh38", + choices = ["GRCh38", "GRCm38"], type = str, help = "Species (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") - + group_output = workflow.add_argument_group("Output arguments") - group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", + group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", help = "Path to the directory where the result file shall be stored. DEFAULT: MAESTRO.") - group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", + group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", help = "Prefix of output files. DEFAULT: MAESTRO.") @@ -89,29 +89,29 @@ def counttoh5_parser(subparsers): Add main function count-to-h5 argument parsers. """ - workflow = subparsers.add_parser("count-to-h5", + workflow = subparsers.add_parser("count-to-h5", help = "Convert plain text count table to HDF5 format.") group_input = workflow.add_argument_group("Input files arguments") - group_input.add_argument("--type", dest = "datatype", default = "Peak", - choices = ["Peak", "Gene"], + group_input.add_argument("--type", dest = "datatype", default = "Peak", + choices = ["Peak", "Gene"], help = "Type of the count matrix (Peak for scATAC-seq and Gene for scRNA-seq). DEFAULT: Peak.") - group_input.add_argument("--count", dest = "count", default = "", + group_input.add_argument("--count", dest = "count", default = "", help = "Location of plain text count table file. " "If the type is Peak, the row name should be like 'chromosome_peakstart_peakend'. " "For example, 'chr10_100020591_100020841'.") - group_input.add_argument("--separator", dest = "separator", default = "tab", + group_input.add_argument("--separator", dest = "separator", default = "tab", choices = ["tab", "space", "comma"], help = "The separating character (only for the format of 'plain'). " "Values on each line of the plain matrix file will be separated by the character. DEFAULT: tab.") - group_input.add_argument("--species", dest = "species", default = "GRCh38", - choices = ["GRCh38", "GRCm38"], type = str, + group_input.add_argument("--species", dest = "species", default = "GRCh38", + choices = ["GRCh38", "GRCm38"], type = str, help = "Species (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") - + group_output = workflow.add_argument_group("Output arguments") - group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", + group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", help = "Path to the directory where the result file shall be stored. DEFAULT: MAESTRO.") - group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", + group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", help = "Prefix of output files. DEFAULT: MAESTRO.") @@ -120,23 +120,23 @@ def h5tocount_parser(subparsers): Add main function h5-to-count argument parsers. """ - workflow = subparsers.add_parser("h5-to-count", + workflow = subparsers.add_parser("h5-to-count", help = "Convert HDF5 format to plain text count table.") group_input = workflow.add_argument_group("Input files arguments") - group_input.add_argument("--type", dest = "datatype", default = "Peak", - choices = ["Peak", "Gene"], + group_input.add_argument("--type", dest = "datatype", default = "Peak", + choices = ["Peak", "Gene"], help = "Type of the count matrix (Peak for scATAC-seq and Gene for scRNA-seq). DEFAULT: Peak.") - group_input.add_argument("--h5", dest = "h5", default = "", + group_input.add_argument("--h5", dest = "h5", default = "", help = "Location of HDF5 format file. ") - group_input.add_argument("--species", dest = "species", default = "GRCh38", - choices = ["GRCh38", "GRCm38"], type = str, + group_input.add_argument("--species", dest = "species", default = "GRCh38", + choices = ["GRCh38", "GRCm38"], type = str, help = "Species (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") - + group_output = workflow.add_argument_group("Output arguments") - group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", + group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", help = "Path to the directory where the result file shall be stored. DEFAULT: MAESTRO.") - group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", + group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", help = "Prefix of output files. DEFAULT: MAESTRO.") def mergeh5_parser(subparsers): @@ -144,28 +144,28 @@ def mergeh5_parser(subparsers): Add main function merge-h5 argument parsers. """ - workflow = subparsers.add_parser("merge-h5", + workflow = subparsers.add_parser("merge-h5", help = "Merge 10X HDF5 files.") group_input = workflow.add_argument_group("Input files arguments") - group_input.add_argument("--type", dest = "datatype", default = "Peak", - choices = ["Peak", "Gene"], + group_input.add_argument("--type", dest = "datatype", default = "Peak", + choices = ["Peak", "Gene"], help = "Type of the count matrix (Peak for scATAC-seq and Gene for scRNA-seq). DEFAULT: Peak.") group_input.add_argument("--h5", dest = "h5_list", default = [], nargs = "+", help = "Location of HDF5 count files. Multiple files should be separated by space. " "For example, --h5 A.h5 B.h5 C.h5 ") - group_input.add_argument("--species", dest = "species", default = "GRCh38", - choices = ["GRCh38", "GRCm38"], type = str, + group_input.add_argument("--species", dest = "species", default = "GRCh38", + choices = ["GRCh38", "GRCm38"], type = str, help = "Species (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") - + group_output = workflow.add_argument_group("Output arguments") group_output.add_argument("--cellprefix", dest = "cellprefix_list", default = [], nargs = "+", help = "Prefix to add to cell identities. Multiple prefixes should be separated by space " "and the number should be equal to that of HDF5 files. " "If not set, cell original cell identities will be kept.") - group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", + group_output.add_argument("-d", "--directory", dest = "directory", default = "MAESTRO", help = "Path to the directory where the result file shall be stored. DEFAULT: MAESTRO.") - group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", + group_output.add_argument("--outprefix", dest = "outprefix", default = "MAESTRO", help = "Prefix of output files. DEFAULT: MAESTRO.") @@ -245,7 +245,8 @@ def merge_10X_h5(directory, outprefix, h5list, prefixlist, genome = 'GRCh38', da if_features_same = True for i in range(0, len(features_list)-1): if if_features_same: - if_features_same = if_features_same & (features_list[i] == features_list[i+1]).all() + #if_features_same = if_features_same & (features_list[i] == features_list[i+1]).all() + if_features_same = if_features_same & numpy.array_equal(features_list[i],features_list[i+1]) else: break @@ -291,7 +292,7 @@ def read_10X_mtx(matrix_file, feature_file, barcode_file, datatype, gene_column feature_in = gzip.open(feature_file, "r") else: feature_in = open(feature_file, "r") - features = feature_in.readlines() + features = feature_in.readlines() if datatype == "Peak": features = ["_".join(feature.strip().split("\t")[0:3]) for feature in features] else: @@ -304,7 +305,7 @@ def read_10X_mtx(matrix_file, feature_file, barcode_file, datatype, gene_column barcode_in = gzip.open(barcode_file, "r") else: barcode_in = open(barcode_file, "r") - barcodes = barcode_in.readlines() + barcodes = barcode_in.readlines() if type(barcodes[0]) == str: barcodes = [barcode.strip().split("\t")[0] for barcode in barcodes] if type(barcodes[0]) == bytes: @@ -330,7 +331,7 @@ def mtx_2_h5(directory, outprefix, matrix_file, feature_file, barcode_file, gene matrix_dict = read_10X_mtx(matrix_file = matrix_file, feature_file = feature_file, barcode_file = barcode_file, datatype = datatype, gene_column = gene_column) - write_10X_h5(filename = filename, matrix = matrix_dict["matrix"], + write_10X_h5(filename = filename, matrix = matrix_dict["matrix"], features = matrix_dict["features"], barcodes = matrix_dict["barcodes"], genome = genome, datatype = datatype) @@ -435,6 +436,5 @@ def count_2_h5(directory, outprefix, count_file, separator, genome = 'GRCh38', d matrix_dict = read_count(count_file, separator) - write_10X_h5(filename = filename, matrix = matrix_dict["matrix"], + write_10X_h5(filename = filename, matrix = matrix_dict["matrix"], features = matrix_dict["features"], barcodes = matrix_dict["barcodes"], genome = genome, datatype = datatype) - From 07188705d977c1f307e10721514f3f5a3b007bd0 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Fri, 4 Dec 2020 22:56:17 -0500 Subject: [PATCH 03/15] Modified path to Lisa2 --- MAESTRO/R/scRNAseq_pipe.R | 15 +--- R/RNAAnnotateTranscriptionFactor.R | 114 +++++++++++++---------------- 2 files changed, 53 insertions(+), 76 deletions(-) diff --git a/MAESTRO/R/scRNAseq_pipe.R b/MAESTRO/R/scRNAseq_pipe.R index 017fbfd..5042ab5 100644 --- a/MAESTRO/R/scRNAseq_pipe.R +++ b/MAESTRO/R/scRNAseq_pipe.R @@ -25,15 +25,6 @@ option_list = list( make_option(c("--signature"), type = "character", default = "", action = "store", help = "The cell signature file for celltype annotation. Default is built-in CIBERSORT immune cell signature." ), - make_option(c("--lisamode"), type = "character", default = "", - action = "store", help = "Mode to run LISA (web or local)." - ), - make_option(c("--condadir"), type = "character", default = "", - action = "store", help = "Directory where miniconda or anaconda is installed (only if method is set to lisa)." - ), - make_option(c("--lisaenv"), type = "character", default = "lisa", - action = "store", help = "Name of lisa environment (only if method is set to lisa)." - ), make_option(c("--thread"), type = "integer", default = 1, action = "store", help = "Number of cores to use." ) @@ -47,9 +38,6 @@ prefix = argue$prefix thread = argue$thread method = argue$method sigfile = argue$signature -lisamode = argue$lisamode -condadir = argue$condadir -lisaenv = argue$lisaenv species = argue$species @@ -83,6 +71,5 @@ RNA.res$RNA <- RNAAnnotateCelltype(RNA = RNA.res$RNA, genes = RNA.res$genes, signatures = signatures, min.score = 0.05) saveRDS(RNA.res, paste0(prefix, "_scRNA_Object.rds")) RNA.tfs <- RNAAnnotateTranscriptionFactor(RNA = RNA.res$RNA, genes = RNA.res$genes, project = prefix, - method = method, lisa.mode = lisamode, - conda.dir = condadir, lisa.envname = lisaenv, + method = method, organism = species, top.tf = 10) diff --git a/R/RNAAnnotateTranscriptionFactor.R b/R/RNAAnnotateTranscriptionFactor.R index 9a2b6b9..c1c6e8c 100644 --- a/R/RNAAnnotateTranscriptionFactor.R +++ b/R/RNAAnnotateTranscriptionFactor.R @@ -1,7 +1,7 @@ #' Predict driver transcrition factors for scRNA-seq clusters #' -#' Predict driver transcrition factors for scRNA-seq clusters based on TF ChIP-seq peaks from CistromeDB. -#' To run this function, you must first install the rabit package (see from http://rabit.dfci.harvard.edu/). +#' Predict driver transcrition factors for scRNA-seq clusters based on TF ChIP-seq peaks from CistromeDB. +#' To run this function, you must first install the rabit package (see from http://rabit.dfci.harvard.edu/). #' #' @docType methods #' @name RNAAnnotateTranscriptionFactor @@ -9,23 +9,14 @@ #' #' @param RNA Seurat object generated by \code{\link{RNARunSeurat}} function. #' @param genes Data frame of differential expressed genes, generated by \code{\link{RNARunSeurat}}, \code{\link{FindMarkersMAESTRO}} or \code{\link{FindAllMarkersMAESTRO}} function. -#' @param cluster The cluster of given differential genes. -#' For example, if the diff-genes are genrated using cluster 1,2,3 vs cluster 4,5,6, +#' @param cluster The cluster of given differential genes. +#' For example, if the diff-genes are genrated using cluster 1,2,3 vs cluster 4,5,6, #' the cluster should be set as c(1,2,3,4,5,6). #' @param project Output project name for the transcription factor analysis result. Default is the project name of the Seurat object. #' @param method Use RABIT or LISA to identify the driver regulators. If Rabit is selected, should provide rabit annotation at the #' same time. LISA do not need annotations for this step. Default is Rabit. #' @param rabit.path Path of the rabit annotation files. The rabit annoation for CistromeDB is available at https://github.com/liulab-dfci/MAESTRO. #' Download and unzip the annotation, supply the path of the annotations to \code{rabit.path}. -#' @param lisa.mode Mode to Run LISA, "local" or "web". If the mode is set as "local", -#' please install LISA (https://github.com/qinqian/lisa) and download pre-computed datasets -#' following the instructions. The "web" mode is to run online version of LISA. -#' In consideration of the connection issue, if users have mutiple clusters of differential genes, -#' the "local" mode is recommended. -#' @param conda.dir Directory where miniconda or anaconda is installed (required if the \code{lisa.mode} is set to "local"). -#' For example, /home1/wangchenfei/miniconda3. -#' @param lisa.envname The name of the lisa conda environment (required if the \code{lisa.mode} is set to "local"). -#' Default is "lisa". #' @param organism Organism for the dataset. Only support "GRCh38" and "GRCm38". Default is "GRCh38". #' @param top.tf Output the target genes for the top TFs. Default is 10. #' @@ -46,9 +37,8 @@ #' @export -RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = RNA@project.name, - method = "RABIT", rabit.path, lisa.mode = "local", conda.dir = "", - lisa.envname = "lisa", organism = "GRCh38", top.tf = 10) +RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = RNA@project.name,lisa.mode = "local", + method = "LISA",rabit.path, organism = "GRCh38", top.tf = 10) { if(organism == "GRCh38"){ org = "hsa" @@ -58,7 +48,7 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = org = "mmu" data(mouse.tf.family) tf_family_list <- MOUSE.TFFamily} - + if("cluster" %in% colnames(genes)){ #genes$cluster <- as.factor(genes$cluster) genes <- genes[genes$avg_logFC>0,] @@ -72,7 +62,7 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = genes$cluster <- paste(cluster, collapse = ",") genes$gene <- rownames(genes) } - + method = toupper(method) if(method == "RABIT"){ scorename = "log(Rabitscore)" @@ -82,23 +72,23 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = if(method == "LISA"){ scorename = "log(Lisascore)" if(lisa.mode == "local"){ - out_fdr_max_log = RunLISALocal(genes = genes, project = project, organism = organism, conda.dir = conda.dir, lisa.envname = lisa.envname) + out_fdr_max_log = RunLISALocal(genes = genes, project = project, organism = organism) } if(lisa.mode == "web"){ out_fdr_max_log = RunLISAWeb(genes = genes, project = project, organism = organism) } } - + if(class(out_fdr_max_log) == "character"){ message(out_fdr_max_log) tfList <- NULL }else{ cluster_drivertf_list <- lapply(colnames(out_fdr_max_log), function(x){ - return(data.frame(row.names = rownames(out_fdr_max_log)[order(out_fdr_max_log[,x],decreasing=T)], factor = rownames(out_fdr_max_log)[order(out_fdr_max_log[,x],decreasing=T)], score = sort(out_fdr_max_log[,x],decreasing=T), stringsAsFactors = FALSE)) + return(data.frame(row.names = rownames(out_fdr_max_log)[order(out_fdr_max_log[,x],decreasing=T)], factor = rownames(out_fdr_max_log)[order(out_fdr_max_log[,x],decreasing=T)], score = sort(out_fdr_max_log[,x],decreasing=T, na.last = TRUE), stringsAsFactors = FALSE)) }) names(cluster_drivertf_list) <- colnames(out_fdr_max_log) - - + + if(ifAllcluster){ cluster_cell_list = split(names(Idents(RNA)), Idents(RNA)) }else{ @@ -110,11 +100,11 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = cluster_cell_list[["others"]] = names(Idents(RNA))[!(Idents(RNA) %in% cluster)] } } - + cluster_avg_expr <- sapply(names(cluster_cell_list), function(x){ return(Matrix::rowMeans(GetAssayData(object = RNA)[, cluster_cell_list[[x]]])) }) - + cluster_tf_list_filter = sapply(names(cluster_drivertf_list), function(x){ tf_family_filter = sapply(cluster_drivertf_list[[x]][,1], function(y){ if(y %in% names(tf_family_list) & length(intersect(tf_family_list[[y]], rownames(cluster_avg_expr))) > 1){ @@ -139,13 +129,13 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = tf_family_filter_score = unlist(sapply(tf_family_filter,function(xx){ return(xx[[2]]) })) - + tf_family_filter_dedup_tf = tf_family_filter_tf[!duplicated(tf_family_filter_tf)] tf_family_filter_dedup_score = tf_family_filter_score[!duplicated(tf_family_filter_tf)] listlen = sapply(tf_family_filter_dedup_tf, function(xx){ length(xx) }) - + tf_family_filter_desubset_tf = sapply(tf_family_filter_dedup_tf,function(xx){ ifsubset = sapply(tf_family_filter_dedup_tf, function(yy){ all(xx %in% yy) @@ -158,7 +148,7 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = }) return(max(tf_family_filter_dedup_score[ifsubset])) }) - + tf_family_filter_desubset_dedup_tf = tf_family_filter_desubset_tf[!duplicated(tf_family_filter_desubset_tf)] tf_family_filter_desubset_dedup_score = tf_family_filter_desubset_score[!duplicated(tf_family_filter_desubset_tf)] tf_family_filter_desubset_dedup_tf_str = lapply(tf_family_filter_desubset_dedup_tf, function(xx){ @@ -166,7 +156,7 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = }) return(list(tf = unlist(tf_family_filter_desubset_dedup_tf_str)[1:top.tf], score = tf_family_filter_desubset_dedup_score[1:top.tf])) }) - + cluster_tf_list_filter_tf = cluster_tf_list_filter["tf",] cluster_tf_list_filter_score = cluster_tf_list_filter["score",] if(length(cluster_tf_list_filter_tf) == 1){ @@ -178,13 +168,13 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = names(cluster_tf_list_filter_tf) = paste(cluster, collapse = ",") names(cluster_tf_list_filter_score) = paste(cluster, collapse = ",") } - + cluster_tf_df = reshape2::melt(cluster_tf_list_filter_tf)[,c(2,1)] colnames(cluster_tf_df) = c("Cluster","TF") cluster_score_df = reshape2::melt(cluster_tf_list_filter_score)[,c(2,1)] colnames(cluster_score_df) = c("Cluster",scorename) cluster_tf_df[,scorename] = round(cluster_score_df[, scorename], 2) - + if("assign.ident" %in% colnames(RNA@meta.data)){ if(ifAllcluster){ reg_table = data.frame(Cluster = Idents(RNA), CelltypeAnnotation = RNA@meta.data$assign.ident) @@ -209,9 +199,9 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = }else{ reg_table_unique = data.frame(Cluster = unique(cluster_tf_df$Cluster), CelltypeAnnotation = "unknown") } - reg_df <- merge(reg_table_unique, cluster_tf_df) + reg_df <- merge(reg_table_unique, cluster_tf_df) write.table(reg_df,paste0(project, ".PredictedTFTop", top.tf, ".txt"), col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE) - + tfList <- as.list(as.data.frame(cluster_tf_list_filter_tf, stringsAsFactors = FALSE)) names(tfList) = names(cluster_tf_list_filter_tf) } @@ -219,7 +209,7 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = } -RunLISALocal <- function(genes, project, organism = "GRCh38", conda.dir = "", lisa.envname = "lisa") +RunLISALocal <- function(genes, project, organism = "GRCh38") { if(organism == "GRCh38"){ species = "hg38" @@ -231,47 +221,47 @@ RunLISALocal <- function(genes, project, organism = "GRCh38", conda.dir = "", li cluster_markers_list <- split(genes, genes$cluster) outputDir <- paste0(project, ".lisa") - if (!file.exists(outputDir)) dir.create(path=outputDir) + if (!file.exists(outputDir)) dir.create(path=outputDir) for(i in names(cluster_markers_list)) { cluster_marker = as.matrix(cluster_markers_list[[i]]$gene) - lisaDir <- paste0(outputDir, "/",i) - if (!file.exists(lisaDir)) dir.create(path=lisaDir) - write.table(cluster_marker,paste0(lisaDir,'/', i, ".txt"),sep='\t',col.names = F, row.names = F,quote = FALSE) + if (nrow(cluster_marker) > 500) { + cluster_marker = as.matrix(cluster_marker[1:500,]) + } + write.table(cluster_marker,paste0(outputDir,'/', i, ".txt"),sep='\t',col.names = F, row.names = F,quote = FALSE) } message("Start to run Lisa.") - for(i in names(cluster_markers_list)) - { - cmd = paste0(". ", conda.dir, "/etc/profile.d/conda.sh && conda activate ", lisa.envname, " && lisa model --method='all' --web=False --new_rp_h5=None --new_count_h5=None --species ",species, " --epigenome '['DNase','H3K27ac']' --cluster=False --covariates=False --random=True --prefix ",i,".txt"," --background=None --stat_background_number=500 --threads 8 ",outputDir,'/', i,'/',i,".txt") - system2("/bin/bash", args = c("-c", shQuote(cmd))) - message(paste0("Lisa in cluster ", i, " is done!")) - } - system("rm *.yml *.profile *.model") + cmd = paste0("lisa multi ", species, " ", outputDir, "/*.txt -o ", outputDir, "/ -c 10 -b 500 --seed=2556") + system2("/bin/bash", args = c("-c", shQuote(cmd))) + message(paste0("Lisa in cluster ", i, " is done!")) message("Lisa is done.") tf_all = NULL for(i in names(cluster_markers_list)) { - fileName = paste0(outputDir,'/', i,'/',i, '.txt_chipseq_cauchy_combine_dedup.csv') + fileName = paste0(outputDir,'/', i, '.txt.lisa.tsv') if (file.exists(fileName)) { - tf_p = read.csv(fileName) - rownames(tf_p) = tf_p$TF - tf = as.vector(sort(tf_p$TF)) - tf_temp = data.frame(tf_p[tf,2]) - colnames(tf_temp) = paste0(i) + tf_p = read.table(fileName, header = TRUE, sep = "\t") + tf_p_dedup = tf_p[!duplicated(tf_p$factor),] + rownames(tf_p_dedup) = tf_p_dedup$factor + tf = as.vector(sort(tf_p_dedup$factor)) + tf_temp = data.frame(-log10(tf_p_dedup[tf,"summary_p_value"]), tf) + colnames(tf_temp) = c(paste0(i), "TF") tf_all = c(tf_all,list(tf_temp)) } } - tf_all <- do.call(cbind, tf_all) - tf_all_log10 = -log10(tf_all) + tf_all_df <- Reduce(function(x, y) merge(x, y, by="TF", all = TRUE), tf_all) + + tf_all_log10 = tf_all_df if(organism == "GRCm38"){ - tf = Hmisc::capitalize(tolower(tf)) + tf_all_log10$TF= Hmisc::capitalize(tolower(tf_all_log10$TF)) } - rownames(tf_all_log10)=tf + rownames(tf_all_log10)=tf_all_log10$TF + tf_all_log10 = tf_all_log10[, c(-1)] write.table(tf_all_log10,paste0(project,'_lisa.txt'),sep='\t',quote = F) return(tf_all_log10) } @@ -288,7 +278,7 @@ RunLISAWeb <- function(genes, project, organism = "GRCh38") cluster_markers_list <- split(genes, genes$cluster) outputDir <- paste0(project, ".lisa") - if (!file.exists(outputDir)) dir.create(path=outputDir) + if (!file.exists(outputDir)) dir.create(path=outputDir) res_zip_url_list = rep("zip", length(cluster_markers_list)) names(res_zip_url_list) = names(cluster_markers_list) @@ -395,7 +385,7 @@ RunRABIT <- function(genes, project, rabit.path, organism = "hsa") { if(organism == "hsa"){ genes$entrezid = MAGeCKFlute::TransGeneID(genes$gene, fromType = "Symbol", toType = "Entrez", - organism = organism, ensemblHost = "www.ensembl.org") + organism = organism, ensemblHost = "www.ensembl.org") }else{ marker_mouse = genes$gene human = biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl") @@ -403,10 +393,10 @@ RunRABIT <- function(genes, project, rabit.path, organism = "hsa") marker_human = biomaRt::getLDS(attributes = c("mgi_symbol"), filters = "mgi_symbol", values = marker_mouse ,mart = mouse, attributesL = "hgnc_symbol", martL = human, uniqueRows=T) genes = merge(genes, marker_human, by.x = "gene", by.y = "MGI.symbol") genes$entrezid = MAGeCKFlute::TransGeneID(genes$HGNC.symbol, fromType = "Symbol", toType = "Entrez", - organism = "hsa", useBiomart = FALSE, - ensemblHost = "www.ensembl.org") + organism = "hsa", useBiomart = FALSE, + ensemblHost = "www.ensembl.org") } - + genes = na.omit(genes) cluster_markers_list <- split(genes, genes$cluster) @@ -417,7 +407,7 @@ RunRABIT <- function(genes, project, rabit.path, organism = "hsa") system(paste0("rm -rf ", outputDir)) dir.create(path=outputDir) } - for(i in names(cluster_markers_list)){ + for(i in names(cluster_markers_list)){ cluster_marker_logfc <- cluster_markers_list[[i]][,c("p_val","entrezid")] cluster_marker_logfc <- cluster_marker_logfc[which(!duplicated(cluster_marker_logfc$entrezid)),] row.names(cluster_marker_logfc) <- cluster_marker_logfc[,'entrezid'] @@ -434,7 +424,7 @@ RunRABIT <- function(genes, project, rabit.path, organism = "hsa") rabit_cluster_list <- list.files(paste0(project, ".RABIT"), pattern = "output.FDR") rabit_cluster_list <- unlist(strsplit(rabit_cluster_list, split = ".output.FDR")) - + notf_clusters = c() for(m in 1:length(rabit_cluster_list)){ i <- rabit_cluster_list[m] From a27e5f74ab2ba24c40f7749ce3ef03ac91ac1d87 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Sat, 5 Dec 2020 03:22:30 -0500 Subject: [PATCH 04/15] files changed to adapt LISA2 --- MAESTRO/MAESTRO_PipeInit.py | 135 +++++++++--------- MAESTRO/R/scRNAseq_pipe.R | 10 +- MAESTRO/Snakemake/scRNA/config_template.yaml | 59 ++++---- R/RNAAnnotateTranscriptionFactor.R | 10 +- .../RNA_infrastructure_10x.md | 18 +-- 5 files changed, 105 insertions(+), 127 deletions(-) diff --git a/MAESTRO/MAESTRO_PipeInit.py b/MAESTRO/MAESTRO_PipeInit.py index 7b88b28..7d7924c 100644 --- a/MAESTRO/MAESTRO_PipeInit.py +++ b/MAESTRO/MAESTRO_PipeInit.py @@ -24,11 +24,11 @@ def scatac_parser(subparsers): # Input files arguments group_input = workflow.add_argument_group("Input files arguments") - group_input.add_argument("--platform", dest = "platform", default = "10x-genomics", - choices = ["10x-genomics", "sci-ATAC-seq", "microfluidic"], + group_input.add_argument("--platform", dest = "platform", default = "10x-genomics", + choices = ["10x-genomics", "sci-ATAC-seq", "microfluidic"], help = "Platform of single cell ATAC-seq. DEFAULT: 10x-genomics.") - group_input.add_argument("--format", dest = "format", default = "fastq", - choices = ["fastq", "bam", "fragments"], + group_input.add_argument("--format", dest = "format", default = "fastq", + choices = ["fastq", "bam", "fragments"], help = "The format of input files. Users can start with sequencing fastq files, " "bam files with CB tag or fragments.tsv.gz file generated by CellRanger ATAC. " "If the platform is 10x-genomics, users can start with sequencing fastq files, bam files with CB tag or fragments.tsv.gz file generated by CellRanger ATAC. " @@ -55,37 +55,37 @@ def scatac_parser(subparsers): help = "The fragment file generated by CellRanger ATAC (required when the format is set as 'fragments'). " "For example, fragments.tsv.gz.") group_input.add_argument("--species", dest = "species", default = "GRCh38", - choices = ["GRCh38", "GRCm38"], type = str, + choices = ["GRCh38", "GRCm38"], type = str, help = "Specify the genome assembly (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") # Output arguments group_output = workflow.add_argument_group("Output arguments") - group_output.add_argument("--cores", dest = "cores", default = 8, + group_output.add_argument("--cores", dest = "cores", default = 8, type = int, help = "Number of cores to use. DEFAULT: 8.") - group_output.add_argument("--directory", "-d", dest = "directory", type = str, default = "MAESTRO", + group_output.add_argument("--directory", "-d", dest = "directory", type = str, default = "MAESTRO", help = "Path to the directory where the workflow shall be initialized and results shall be stored. DEFAULT: MAESTRO.") - group_output.add_argument("--outprefix", dest = "outprefix", type = str, default = "MAESTRO", + group_output.add_argument("--outprefix", dest = "outprefix", type = str, default = "MAESTRO", help = "Prefix of output files. DEFAULT: MAESTRO.") # Quality control cutoff group_cutoff = workflow.add_argument_group("Quality control arguments") group_cutoff.add_argument("--peak-cutoff", dest = "peak_cutoff", default = 100, type = int, help = "Minimum number of peaks included in each cell. DEFAULT: 100.") - group_cutoff.add_argument("--count-cutoff", dest = "count_cutoff", default = 1000, type = int, + group_cutoff.add_argument("--count-cutoff", dest = "count_cutoff", default = 1000, type = int, help = "Cutoff for the number of count in each cell. DEFAULT: 1000.") - group_cutoff.add_argument("--frip-cutoff", dest = "frip_cutoff", default = 0.2, type = float, + group_cutoff.add_argument("--frip-cutoff", dest = "frip_cutoff", default = 0.2, type = float, help = "Cutoff for fraction of reads in promoter in each cell. DEFAULT: 0.2.") group_cutoff.add_argument("--cell-cutoff", dest = "cell_cutoff", default = 10, type = int, help = "Minimum number of cells covered by each peak. DEFAULT: 10.") # Reference genome arguments group_reference = workflow.add_argument_group("Reference genome arguments") - group_reference.add_argument("--giggleannotation", dest = "giggleannotation", - required = True, + group_reference.add_argument("--giggleannotation", dest = "giggleannotation", + required = True, help = "Path of the giggle annotation file required for regulator identification. " "Please download the annotation file from " "http://cistrome.org/~chenfei/MAESTRO/giggle.tar.gz and decompress it.") - group_reference.add_argument("--fasta", dest = "fasta", + group_reference.add_argument("--fasta", dest = "fasta", default = "", help = "Genome fasta file for minimap2. " "Users can just download the fasta file for huamn and mouse from " @@ -95,7 +95,7 @@ def scatac_parser(subparsers): # Barcode library arguments group_barcode = workflow.add_argument_group("Barcode library arguments, only for platform of 'sci-ATAC-seq' and '10x-genomics'") - group_barcode.add_argument("--whitelist", dest = "whitelist", type = str, + group_barcode.add_argument("--whitelist", dest = "whitelist", type = str, default = "", help = "If the platform is 'sci-ATAC-seq' or '10x-genomics', please specify the barcode library (whitelist) " "so that the pipeline can correct cell barcodes with 1 base mismatched. " @@ -106,46 +106,46 @@ def scatac_parser(subparsers): # Customized peak arguments group_peak = workflow.add_argument_group("Customized peak arguments") - group_peak.add_argument("--custompeak", dest = "custompeak", action = "store_true", + group_peak.add_argument("--custompeak", dest = "custompeak", action = "store_true", help = "Whether or not to provide custom peaks. If set, users need to provide " "the file location of peak file through '--custompeak-file' and then MAESTRO will merge " "the custom peak file and the peak file called from all fragments using MACS2. " "By default (not set), the pipeline will use the peaks called using MACS2.") - group_peak.add_argument("--custompeak-file", dest = "custompeak_file", type = str, default = "", + group_peak.add_argument("--custompeak-file", dest = "custompeak_file", type = str, default = "", help = "If '--custompeak' is set, please provide the file location of custom peak file. " "The peak file is BED formatted with tab seperated. " "The first column is chromsome, the second is chromStart, and the third is chromEnd.") - group_peak.add_argument("--shortpeak", dest = "shortpeak", action = "store_true", + group_peak.add_argument("--shortpeak", dest = "shortpeak", action = "store_true", help = "Whether or not to call peaks from short fragments (shorter than 150bp). If set, " "MAESTRO will merge the peaks called from all fragments and those called from short fragments, " "and then use the merged peak file for further analysis. " "If not (by default), the pipeline will only use peaks called from all fragments.") - group_peak.add_argument("--clusterpeak", dest = "clusterpeak", action = "store_true", + group_peak.add_argument("--clusterpeak", dest = "clusterpeak", action = "store_true", help = "Whether or not to call peaks by cluster. If set, " "MAESTRO will split the bam file according to the clustering result, " "and then call peaks for each cluster. " "By default (not set), MAESTRO will skip this step.") - + # Gene score arguments group_genescore = workflow.add_argument_group("Gene score arguments") - group_genescore.add_argument("--rpmodel", dest = "rpmodel", default = "Enhanced", - choices = ["Simple", "Enhanced"], + group_genescore.add_argument("--rpmodel", dest = "rpmodel", default = "Enhanced", + choices = ["Simple", "Enhanced"], help = "The RP model to use to calaculate gene score. " "For each gene, simple model summarizes the impact of all regulatory elements within the up/dowm-stream of TSS. " "On the basis of simple model, enhanced model includes the regulatory elements within the exon region, " "and also excludes the regulatory elements overlapped with another gene (the promoter and exon of a nearby gene). " "See the MAESTRO paper for more details. DEFAULT: Enhanced.") - group_genescore.add_argument("--genedistance", dest = "genedistance", default = 10000, type = int, + group_genescore.add_argument("--genedistance", dest = "genedistance", default = 10000, type = int, help = "Gene score decay distance, could be optional from 1kb (promoter-based regulation) " "to 10kb (enhancer-based regulation). DEFAULT: 10000.") # Signature file arguments group_annotation = workflow.add_argument_group("Cell-type annotation arguments") - group_annotation.add_argument("--annotation", dest = "annotation", action = "store_true", + group_annotation.add_argument("--annotation", dest = "annotation", action = "store_true", help = "Whether or not to perform ccell-type annotation. " "By default (not set), MAESTRO will skip the step of cell-type annotation. " "If set, please specify the method of cell-type annotation through --method. ") - group_annotation.add_argument("--method", dest = "method", type = str, default = "RP-based", + group_annotation.add_argument("--method", dest = "method", type = str, default = "RP-based", choices = ["RP-based", "peak-based", "both"], help = "Method to annotate cell types. MAESTRO provides two strategies to annotate cell types for scATAC-seq data. " "Users can choose from 'RP-based' and 'peak-based', or choose to run both of them. " @@ -155,13 +155,13 @@ def scatac_parser(subparsers): "If 'peak-based' is set, MAESTRO utilizes GIGGLE to evaluate the enrichment of bulk chromatin accessibility peaks on cluster-specific peaks from scATAC-seq data, " "and then transfers the Cistrome cluster identity from the most enriched bulk chromatin accessibility data as the cell-type annotation for the scATAC-seq cluster. " "See the MAESTRO paper for more details. DEFAULT: RP-based. ") - group_annotation.add_argument("--signature", dest = "signature", type = str, default = "human.immune.CIBERSORT", + group_annotation.add_argument("--signature", dest = "signature", type = str, default = "human.immune.CIBERSORT", help = "Cell signature file used to annotate cell types (required when method is set as 'RP-based'). MAESTRO provides several sets of built-in cell signatures. " "Users can choose from ['human.immune.CIBERSORT', 'mouse.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']. " "Custom cell signatures are also supported. In this situation, users need to provide the file location of cell signatures, " "and the signature file is tab-seperated without header. The first column is cell type, and the second column is signature gene. " "DEFAULT: human.immune.CIBERSORT. ") - + return @@ -176,37 +176,37 @@ def scrna_parser(subparsers): # Input files arguments group_input = workflow.add_argument_group("Input files arguments") - group_input.add_argument("--platform", dest = "platform", default = "10x-genomics", - choices = ["10x-genomics", "Dropseq", "Smartseq2"], + group_input.add_argument("--platform", dest = "platform", default = "10x-genomics", + choices = ["10x-genomics", "Dropseq", "Smartseq2"], help = "Platform of single cell RNA-seq. DEFAULT: 10x-genomics.") - group_input.add_argument("--fastq-dir", dest = "fastq_dir", type = str, required = True, + group_input.add_argument("--fastq-dir", dest = "fastq_dir", type = str, required = True, help = "Directory where fastq files are stored") - group_input.add_argument("--fastq-prefix", dest = "fastq_prefix", type = str, default = "", + group_input.add_argument("--fastq-prefix", dest = "fastq_prefix", type = str, default = "", help = "Sample name of fastq file, only for the platform of '10x-genomics'. " "If there is a file named pbmc_1k_v2_S1_L001_I1_001.fastq.gz, the prefix is 'pbmc_1k_v2'.") - group_input.add_argument("--fastq-barcode", dest = "fastq_barcode", type = str, default = "", + group_input.add_argument("--fastq-barcode", dest = "fastq_barcode", type = str, default = "", help = "Specify the barcode fastq file, only for the platform of 'Dropseq'. " "If there are multiple pairs of fastq, please provide a comma-separated list of barcode fastq files. " "For example, --fastq-barcode test1_1.fastq,test2_1.fastq") - group_input.add_argument("--fastq-transcript", dest = "fastq_transcript", type = str, default = "", + group_input.add_argument("--fastq-transcript", dest = "fastq_transcript", type = str, default = "", help = "Specify the transcript fastq file, only for the platform of 'Dropseq'. " "If there are multiple pairs of fastq, please provide a comma-separated list of barcode fastq files. " "For example, --fastq-barcode test1_2.fastq,test2_2.fastq") group_input.add_argument("--species", dest = "species", default = "GRCh38", - choices = ["GRCh38", "GRCm38"], type = str, + choices = ["GRCh38", "GRCm38"], type = str, help = "Specify the genome assembly (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") # Output arguments group_output = workflow.add_argument_group("Running and output arguments") - group_output.add_argument("--cores", dest = "cores", default = 8, + group_output.add_argument("--cores", dest = "cores", default = 8, type = int, help = "The number of cores to use. DEFAULT: 8.") - group_output.add_argument("--rseqc", dest = "rseqc", action = "store_true", + group_output.add_argument("--rseqc", dest = "rseqc", action = "store_true", help = "Whether or not to run RSeQC. " "If set, the pipeline will include the RSeQC part and then takes a longer time. " "By default (not set), the pipeline will skip the RSeQC part.") - group_output.add_argument("--directory", "-d", dest = "directory", type = str, default = "MAESTRO", + group_output.add_argument("--directory", "-d", dest = "directory", type = str, default = "MAESTRO", help = "Path to the directory where the workflow shall be initialized and results shall be stored. DEFAULT: MAESTRO.") - group_output.add_argument("--outprefix", dest = "outprefix", type = str, default = "MAESTRO", + group_output.add_argument("--outprefix", dest = "outprefix", type = str, default = "MAESTRO", help = "Prefix of output files. DEFAULT: MAESTRO.") # Quality control cutoff @@ -216,16 +216,16 @@ def scrna_parser(subparsers): group_cutoff.add_argument("--gene-cutoff", dest = "gene_cutoff", default = 500, type = int, help = "Cutoff for the number of genes included in each cell. DEFAULT: 500.") group_cutoff.add_argument("--cell-cutoff", dest = "cell_cutoff", default = 10, type = int, - help = "Cutoff for the number of cells covered by each gene. DEFAULT: 10.") + help = "Cutoff for the number of cells covered by each gene. DEFAULT: 10.") # Reference genome arguments group_reference = workflow.add_argument_group("Reference genome arguments") - group_reference.add_argument("--mapindex", dest = "mapindex", - required = True, + group_reference.add_argument("--mapindex", dest = "mapindex", + required = True, help = "Genome index directory for STAR. Users can just download the index file for human and mouse " - "from http://cistrome.org/~chenfei/MAESTRO/Refdata_scRNA_MAESTRO_GRCh38_1.1.0.tar.gz and " - "http://cistrome.org/~chenfei/MAESTRO/Refdata_scRNA_MAESTRO_GRCm38_1.1.0.tar.gz, respectively, and decompress them. " - "Then specify the index directory for STAR, for example, 'Refdata_scRNA_MAESTRO_GRCh38_1.1.0/GRCh38_STAR_2.7.3a'.") + "from http://cistrome.org/~galib/Refdata_scRNA_MAESTRO_GRCh38_1.2.2.tar.gz and " + "http://cistrome.org/~galib/Refdata_scRNA_MAESTRO_GRCm38_1.2.2.tar.gz, respectively, and decompress them. " + "Then specify the index directory for STAR, for example, 'Refdata_scRNA_MAESTRO_GRCh38_1.2.2/GRCh38_STAR_2.7.6a'.") group_reference.add_argument("--rsem", dest = "rsem", default = "", help = "The prefix of transcript references for RSEM used by rsem-prepare-reference (Only required when the platform is Smartseq2). " "Users can directly download the annotation file for huamn and mouse from " @@ -255,38 +255,37 @@ def scrna_parser(subparsers): # Regulator identification group_regulator = workflow.add_argument_group("Regulator identification arguments") - # group_regulator.add_argument("--method", dest = "method", type = str, + # group_regulator.add_argument("--method", dest = "method", type = str, # choices = ["RABIT", "LISA"], default = "LISA", # help = "Method to predict driver regulators.") - # group_regulator.add_argument("--method", dest = "method", type = str, + # group_regulator.add_argument("--method", dest = "method", type = str, # choices = ["LISA"], default = "LISA", # help = "Method to predict driver regulators.") # group_regulator.add_argument("--rabitlib", dest = "rabitlib", type = str, # help = "Path of the rabit annotation file required for regulator identification (only required if method is set to RABIT). " # "Please download the annotation file from " # "http://cistrome.org/~chenfei/MAESTRO/rabit.tar.gz and decompress it.") - group_regulator.add_argument("--lisamode", dest = "lisamode", type = str, default = "local", choices = ["local", "web"], - help = "Mode to Run LISA, 'local' or 'web'. If the mode is set as 'local', " - "please install LISA (https://github.com/qinqian/lisa) and download pre-computed datasets following the instructions. " - "The 'web' mode is to run online version of LISA. In consideration of the connection issue and size of datasets, " - "the 'local' mode is recommended to run the whole MAESTRO pipeline. " - "If the mode is 'local', please provide the name of LISA environment through --lisaenv " - "and specify the directory where miniconda or anaconda is installed through --condadir. DEFAULT: local.") - group_regulator.add_argument("--lisaenv", dest = "lisaenv", type = str, default = "", - help = "Name of LISA environment (required if lisamode is set to local). For example, lisa.") - group_regulator.add_argument("--condadir", dest = "condadir", type = str, default = "", - help = "Directory where miniconda or anaconda is installed " - "(required if lisamode is set to local). For example, '/home/user/miniconda3'.") + #group_regulator.add_argument("--lisamode", dest = "lisamode", type = str, default = "local", choices = ["local", "web"], + # help = "Mode to Run LISA, 'local' or 'web'. If the mode is set as 'local', " + # "please install LISA (https://github.com/qinqian/lisa) and download pre-computed datasets following the instructions. " + # "The 'web' mode is to run online version of LISA. In consideration of the connection issue and size of datasets, " + # "the 'local' mode is recommended to run the whole MAESTRO pipeline. " + # "If the mode is 'local', please provide the name of LISA environment through --lisaenv " + # "and specify the directory where miniconda or anaconda is installed through --condadir. DEFAULT: local.") + group_regulator.add_argument("--lisa", dest = "lisa", type = str, default = "", + help = "Path to lisa data files. For human and mouse, data can be downloaded http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz" + "and http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz") + # Signature file arguments group_signature = workflow.add_argument_group("Cell signature arguments") - group_signature.add_argument("--signature", dest = "signature", type = str, default = "human.immune.CIBERSORT", + group_signature.add_argument("--signature", dest = "signature", type = str, default = "human.immune.CIBERSORT", help = "Cell signature file used to annotate cell types. MAESTRO provides several sets of built-in cell signatures. " "Users can choose from ['human.immune.CIBERSORT', 'mouse.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']. " "Custom cell signatures are also supported. In this situation, users need to provide the file location of cell signatures, " "and the signature file is tab-seperated without header. The first column is cell type, and the second column is signature gene. " "DEFAULT: human.immune.CIBERSORT.") - + return @@ -308,9 +307,9 @@ def integrate_parser(subparsers): # Output arguments group_output = workflow.add_argument_group("Running and output arguments") - group_output.add_argument("--directory", "-d", dest = "directory", default = "MAESTRO", + group_output.add_argument("--directory", "-d", dest = "directory", default = "MAESTRO", help = "Path to the directory where the workflow shall be initialized and results shall be stored. DEFAULT: MAESTRO.") - group_output.add_argument("--outprefix", dest = "outprefix", type = str, default = "MAESTRO", + group_output.add_argument("--outprefix", dest = "outprefix", type = str, default = "MAESTRO", help = "Prefix of output files. DEFAULT: MAESTRO.") return @@ -347,7 +346,7 @@ def scatac_config(args): configout.write(config_template.render( platform = args.platform, format = args.format, - fastqdir = os.path.abspath(args.fastq_dir), + fastqdir = os.path.abspath(args.fastq_dir), fastqprefix = args.fastq_prefix, bam = os.path.abspath(args.bam), frag = os.path.abspath(args.frag), @@ -370,7 +369,7 @@ def scatac_config(args): genedistance = args.genedistance, giggleannotation = os.path.abspath(args.giggleannotation), fasta = os.path.abspath(args.fasta))) - + source = os.path.join(pkgpath, "scATAC", "Snakefile") target = os.path.join(args.directory, "Snakefile") shutil.copy(source, target) @@ -399,7 +398,7 @@ def scrna_config(args): with open(configfile, "w") as configout: configout.write(config_template.render( - fastqdir = os.path.abspath(args.fastq_dir), + fastqdir = os.path.abspath(args.fastq_dir), fastqprefix = args.fastq_prefix, species = args.species, platform = args.platform, @@ -411,9 +410,8 @@ def scrna_config(args): cell = args.cell_cutoff, signature = signature, # rabitlib = os.path.abspath(args.rabitlib), - lisamode = args.lisamode, - lisaenv = args.lisaenv, - condadir = os.path.abspath(args.condadir), + #lisamode = args.lisamode, + lisa = os.path.abspath(args.lisa) mapindex = os.path.abspath(args.mapindex), rsem = os.path.abspath(args.rsem), whitelist = os.path.abspath(args.whitelist), @@ -447,11 +445,10 @@ def integrate_config(args): config_template = Template(open(template_file, "r").read(), trim_blocks=True, lstrip_blocks=True) with open(configfile, "w") as configout: configout.write(config_template.render( - rnaobject = os.path.abspath(args.rna_object), + rnaobject = os.path.abspath(args.rna_object), atacobject = os.path.abspath(args.atac_object), outprefix = args.outprefix)) source = os.path.join(pkgpath, "integrate", "Snakefile") target = os.path.join(args.directory, "Snakefile") shutil.copy(source, target) - diff --git a/MAESTRO/R/scRNAseq_pipe.R b/MAESTRO/R/scRNAseq_pipe.R index 5042ab5..b16879c 100644 --- a/MAESTRO/R/scRNAseq_pipe.R +++ b/MAESTRO/R/scRNAseq_pipe.R @@ -19,9 +19,12 @@ option_list = list( make_option(c("--species"), type = "character", default = "GRCh38", action = "store", help = "The platform of scRNA-seq." ), - make_option(c("--method"), type = "character", default = "LISA", - action = "store", help = "The method to identify driver regulators. [LISA, RABIT]" - ), + #make_option(c("--lisamode"), type = "character", default = "multi", + #action = "store", help = "Mode to run LISA (multi or one-vs-rest)." + #), + #make_option(c("--method"), type = "character", default = "LISA", + #action = "store", help = "The method to identify driver regulators. [LISA, RABIT]" + #), make_option(c("--signature"), type = "character", default = "", action = "store", help = "The cell signature file for celltype annotation. Default is built-in CIBERSORT immune cell signature." ), @@ -71,5 +74,4 @@ RNA.res$RNA <- RNAAnnotateCelltype(RNA = RNA.res$RNA, genes = RNA.res$genes, signatures = signatures, min.score = 0.05) saveRDS(RNA.res, paste0(prefix, "_scRNA_Object.rds")) RNA.tfs <- RNAAnnotateTranscriptionFactor(RNA = RNA.res$RNA, genes = RNA.res$genes, project = prefix, - method = method, organism = species, top.tf = 10) diff --git a/MAESTRO/Snakemake/scRNA/config_template.yaml b/MAESTRO/Snakemake/scRNA/config_template.yaml index 4292ec9..138dbd8 100644 --- a/MAESTRO/Snakemake/scRNA/config_template.yaml +++ b/MAESTRO/Snakemake/scRNA/config_template.yaml @@ -1,7 +1,7 @@ # Directory where fastq files are stored fastqdir: {{ fastqdir }} -# Sample name of fastq file (only for platform of "10x-genomics", for example, +# Sample name of fastq file (only for platform of "10x-genomics", for example, # if there is a file named pbmc_1k_v2_S1_L001_I1_001.fastq.gz, the sample name is "pbmc_1k_v2". ) fastqprefix: {{ fastqprefix }} @@ -15,33 +15,22 @@ platform: {{ platform }} outprefix: {{ outprefix }} # Whether or not to run RSeQC. [True, False] -# If it's set to True, the pipeline will include the RSeQC part and then takes a longer time. +# If it's set to True, the pipeline will include the RSeQC part and then takes a longer time. # By default, the pipeline will skip the RSeQC part. DEFAULT: False. rseqc: {{ rseqc }} # Number of cores to use cores: {{ cores }} -# Cell signature file used to annotate cell types. MAESTRO provides several sets of built-in cell signatures. -# Users can choose from ['human.immune.CIBERSORT', 'mouse.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']. -# Custom cell signatures are also supported. In this situation, users need to provide the file location of cell signatures, +# Cell signature file used to annotate cell types. MAESTRO provides several sets of built-in cell signatures. +# Users can choose from ['human.immune.CIBERSORT', 'mouse.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']. +# Custom cell signatures are also supported. In this situation, users need to provide the file location of cell signatures, # and the signature file is tab-seperated without header. The first column is cell type, and the second column is signature gene. signature: {{ signature }} -# Mode to Run LISA, 'local' or 'web'. If the mode is set as 'local', -# please install LISA (https://github.com/qinqian/lisa) and download pre-computed datasets following the instructions. -# The 'web' mode is to run online version of LISA. In consideration of the connection issue and size of datasets, -# the 'local' mode is recommended to run the whole MAESTRO pipeline. -# If the mode is 'local', please provide the name of LISA environment through lisaenv -# and specify the directory where miniconda or anaconda is installed through condadir. DEFAULT: local. -lisamode: {{ lisamode }} - -# Name of lisa environment (required if method is set to lisa and lisamode is set to local). DEFAULT: lisa. -lisaenv: {{ lisaenv }} - -# Directory where miniconda or anaconda is installed (required if method is set to lisa and lisamode is set to local). -# For example, /home/user/miniconda3 -condadir: {{ condadir }} +# Path to the LISA data files +# please follow LISA2 (https://github.com/liulab-dfci/lisa2) and download pre-computed datasets following the instructions. +lisa: {{ lisa }} # Cutoff for quality control @@ -54,14 +43,14 @@ cutoff: cell: {{ cell }} -# Reference genome +# Reference genome genome: - # Genome index directory for STAR. Users can just download the index file - # from http://cistrome.org/~chenfei/MAESTRO/Refdata_scRNA_MAESTRO_GRCh38_1.1.0.tar.gz and decompress it. + # Genome index directory for STAR. Users can just download the index file + # from http://cistrome.org/~chenfei/MAESTRO/Refdata_scRNA_MAESTRO_GRCh38_1.1.0.tar.gz and decompress it. # Then specify the index directory for STAR, for example, 'Refdata_scRNA_MAESTRO_GRCh38_1.1.0/GRCh38_STAR_2.7.3a'. mapindex: {{ mapindex }} - # The prefix of transcript references for RSEM used by rsem-prepare-reference (Only required when the platform is Smartseq2). - # Users can directly download the annotation file from + # The prefix of transcript references for RSEM used by rsem-prepare-reference (Only required when the platform is Smartseq2). + # Users can directly download the annotation file from # http://cistrome.org/~chenfei/MAESTRO/giggle.tar.gz and decompress it. # Then specify the prefix for RSEM, for example, 'Refdata_scRNA_MAESTRO_GRCh38_1.1.0/GRCh38_RSEM_1.3.2/GRCh38'. rsem: {{ rsem }} @@ -69,31 +58,31 @@ genome: # Information about barcode (for platform of 'Dropseq' or '10x-genomics') barcode: - # If the platform is 'Dropseq' or '10x-genomics', please specify the barcode library (whitelist) - # so that STARsolo can do the error correction and demultiplexing of cell barcodes. - # The 10X Chromium whitelist file can be found inside the CellRanger distribution. - # Please make sure that the whitelist is compatible with the specific version of the 10X chemistry: V2 or V3. - # For example, in CellRanger 3.1.0, the V2 whitelist is 'cellranger-3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-august-2016.txt'. - # The V3 whitelist is 'cellranger-3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/3M-february-2018.txt'. + # If the platform is 'Dropseq' or '10x-genomics', please specify the barcode library (whitelist) + # so that STARsolo can do the error correction and demultiplexing of cell barcodes. + # The 10X Chromium whitelist file can be found inside the CellRanger distribution. + # Please make sure that the whitelist is compatible with the specific version of the 10X chemistry: V2 or V3. + # For example, in CellRanger 3.1.0, the V2 whitelist is 'cellranger-3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-august-2016.txt'. + # The V3 whitelist is 'cellranger-3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/3M-february-2018.txt'. whitelist: {{ whitelist }} # The start site of each barcode. DEFAULT: 1. barcodestart: {{ barcodestart }} - # The length of cell barcode. For 10x-genomics, the length of barcode is 16. DEFAULT: 16. + # The length of cell barcode. For 10x-genomics, the length of barcode is 16. DEFAULT: 16. barcodelength: {{ barcodelength }} # The start site of UMI. DEFAULT: 17. umistart: {{ umistart }} - # The length of UMI. For 10x-genomics, the length of V2 chemistry is 10. - # For 10X V3 chemistry, the length is 12. DEFAULT: 10. + # The length of UMI. For 10x-genomics, the length of V2 chemistry is 10. + # For 10X V3 chemistry, the length is 12. DEFAULT: 10. umilength: {{ umilength }} # Specify the barcode fastq file and transcript fastq file (only for platform of "Dropseq") fastq: - # Specify the barcode fastq file, only for the platform of 'Dropseq'. + # Specify the barcode fastq file, only for the platform of 'Dropseq'. # If there are multiple pairs of fastq, please provide a comma-separated list of barcode fastq files. # For example, 'test1_1.fastq,test2_1.fastq' barcode: {{ barcode }} - # Specify the transcript fastq file, only for the platform of 'Dropseq'. + # Specify the transcript fastq file, only for the platform of 'Dropseq'. # If there are multiple pairs of fastq, please provide a comma-separated list of barcode fastq files. # For example, test1_2.fastq,test2_2.fastq' transcript: {{ transcript }} diff --git a/R/RNAAnnotateTranscriptionFactor.R b/R/RNAAnnotateTranscriptionFactor.R index c1c6e8c..a43e8f9 100644 --- a/R/RNAAnnotateTranscriptionFactor.R +++ b/R/RNAAnnotateTranscriptionFactor.R @@ -37,7 +37,7 @@ #' @export -RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = RNA@project.name,lisa.mode = "local", +RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = RNA@project.name,lisa.mode = "multi", method = "LISA",rabit.path, organism = "GRCh38", top.tf = 10) { if(organism == "GRCh38"){ @@ -71,10 +71,10 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = } if(method == "LISA"){ scorename = "log(Lisascore)" - if(lisa.mode == "local"){ - out_fdr_max_log = RunLISALocal(genes = genes, project = project, organism = organism) + if(lisa.mode == "multi"){ + out_fdr_max_log = RunLISAMulti(genes = genes, project = project, organism = organism) } - if(lisa.mode == "web"){ + if(lisa.mode == "one-vs-rest"){ out_fdr_max_log = RunLISAWeb(genes = genes, project = project, organism = organism) } } @@ -209,7 +209,7 @@ RNAAnnotateTranscriptionFactor <- function(RNA, genes, cluster = NULL, project = } -RunLISALocal <- function(genes, project, organism = "GRCh38") +RunLISAMulti <- function(genes, project, organism = "GRCh38") { if(organism == "GRCh38"){ species = "hg38" diff --git a/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md b/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md index dff9468..91db4d2 100644 --- a/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md +++ b/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md @@ -24,7 +24,7 @@ $ MAESTRO scrna-init --platform 10x-genomics --species GRCh38 \ --cores 8 --rseqc --directory Analysis/10X_PBMC_8k_MAESTRO_V110 --outprefix 10X_PBMC_8k \ --mapindex annotations/MAESTRO/Refdata_scRNA_MAESTRO_GRCh38_1.1.0/GRCh38_STAR_2.7.3a \ --whitelist Data/barcodes/737K-august-2016.txt \ ---umi-length 10 --lisamode local --lisaenv lisa --condadir /home1/user/miniconda3 --signature human.immune.CIBERSORT +--umi-length 10 --lisa lisa/hg38_2.1.tar.gz --signature human.immune.CIBERSORT ``` To get a full description of command-line options, please use the command `MAESTRO scrna-init -h`. @@ -43,8 +43,7 @@ usage: MAESTRO scrna-init [-h] [--platform {10x-genomics,Dropseq,Smartseq2}] [--barcode-start BARCODE_START] [--barcode-length BARCODE_LENGTH] [--umi-start UMI_START] [--umi-length UMI_LENGTH] - [--lisamode {local,web}] [--lisaenv LISAENV] - [--condadir CONDADIR] [--signature SIGNATURE] + [--lisa][--signature SIGNATURE] ``` Here we list all the arguments and their description. @@ -98,10 +97,7 @@ Arguments | Description Arguments | Description --------- | ----------- -`--lisamode` | Mode to Run LISA, 'local' or 'web'. If the mode is set as 'local', please install [LISA](https://github.com/qinqian/lisa) and download pre-computed datasets following the instructions. The 'web' mode is to run online version of LISA. In consideration of the connection issue and size of datasets, the 'local' mode is recommended to run the whole MAESTRO pipeline. If the mode is 'local', please provide the name of LISA environment through `--lisaenv` and specify the directory where miniconda or anaconda is installed through `--condadir`. DEFAULT: local. (**Note:** LISA takes multiple types of gene sets which can be constituted of only official gene symbols, only RefSeq ids, only Ensembl ids, only Entrez ids, or a mixture of these identifiers.) -`--lisaenv` | Name of LISA environment (required if method is set to lisa and lisamode is set to local). DEFAULT: lisa. -`--condadir` | Directory where miniconda or anaconda is installed (required if method is set to lisa and lisamode is set to local). For example, `--condadir /home/user/miniconda3`. - +`--lisa` | Path to the LISA data files. Please download LISA's required data from cistrome.org: [human](http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz) and [mouse](http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz). The new version of [LISA](https://github.com/liulab-dfci/lisa2) has been released to largely decrease processing time on multiple gene list. **Cell signature arguments:** Arguments | Description @@ -277,16 +273,12 @@ All the reduction results are stored in `Object@reductions`. For example, users ### Step 3. Identify driver transcription regulators -To identify enriched transcription regulators is crucial to understanding gene regulation in the heterogeneous single-cell populations. MAESTRO utilizes LISA to predict the potential transcription factors based on the marker genes in each cluster, which rely on the transcriptional regulator binding profiles from CistromeDB to identify the potential regulators shaping the expression pattern of each cluster. MAESTRO provides two options for running LISA, "local" and "web". The web mode does not need to install LISA and download the annotations. If users select the local mode, which is much faster and more stable than the web version, users need to install LISA locally, build the annotation files according to the [LISA document](https://github.com/qinqian/lisa), and provide the environment name of LISA when using the `RNAAnnotateTranscriptionFactor` function. If users have multiple clusters of differential genes, the "local" mode is recommended. +To identify enriched transcription regulators is crucial to understanding gene regulation in the heterogeneous single-cell populations. MAESTRO utilizes LISA to predict the potential transcription factors based on the marker genes in each cluster, which rely on the transcriptional regulator binding profiles from CistromeDB to identify the potential regulators shaping the expression pattern of each cluster. In rare cases, users need to download LISA data manually. If the program fails while downloading data, follow these [LISA document](https://github.com/liulab-dfci/lisa2/blob/master/docs/troubleshooting.md). If users have multiple clusters of differential genes, the "multi" mode is recommended. ```R > pbmc.RNA.tfs <- RNAAnnotateTranscriptionFactor(RNA = pbmc.RNA.res$RNA, genes = pbmc.RNA.res$genes, project = pbmc.RNA.res$RNA@project.name, - method = "LISA", - lisa.mode = "local", - conda.dir = "/home1/user/miniconda3", - lisa.envname = "lisa", organism = "GRCh38", top.tf = 10) > pbmc.RNA.tfs[["0"]] @@ -309,8 +301,6 @@ Besides identifying TFs for all the clusters, we also support the differential g genes = de.geneset, cluster = c(0), project = "10X_PBMC_8k_Monocyte", - method = "LISA", - lisa.mode = "web", organism = "GRCh38", top.tf = 20) ``` From f3723a1d1e22ce45652764e0776da48046992860 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Sat, 5 Dec 2020 03:34:08 -0500 Subject: [PATCH 05/15] modified clarifications --- example/RNA_infrastructure_10x/RNA_infrastructure_10x.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md b/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md index 91db4d2..92bb6ff 100644 --- a/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md +++ b/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md @@ -97,7 +97,8 @@ Arguments | Description Arguments | Description --------- | ----------- -`--lisa` | Path to the LISA data files. Please download LISA's required data from cistrome.org: [human](http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz) and [mouse](http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz). The new version of [LISA](https://github.com/liulab-dfci/lisa2) has been released to largely decrease processing time on multiple gene list. +`--lisa` | Path to the LISA data files. Please download LISA's required data from cistrome.org: [human](http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz) and [mouse](http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz). The latest version of [LISA2](https://github.com/liulab-dfci/lisa2) was recently released to largely decrease processing time on multiple gene lists. Please check out for detailed information. + **Cell signature arguments:** Arguments | Description @@ -273,7 +274,7 @@ All the reduction results are stored in `Object@reductions`. For example, users ### Step 3. Identify driver transcription regulators -To identify enriched transcription regulators is crucial to understanding gene regulation in the heterogeneous single-cell populations. MAESTRO utilizes LISA to predict the potential transcription factors based on the marker genes in each cluster, which rely on the transcriptional regulator binding profiles from CistromeDB to identify the potential regulators shaping the expression pattern of each cluster. In rare cases, users need to download LISA data manually. If the program fails while downloading data, follow these [LISA document](https://github.com/liulab-dfci/lisa2/blob/master/docs/troubleshooting.md). If users have multiple clusters of differential genes, the "multi" mode is recommended. +To identify enriched transcription regulators is crucial to understanding gene regulation in the heterogeneous single-cell populations. MAESTRO utilizes LISA to predict the potential transcription factors based on the marker genes in each cluster, which rely on the transcriptional regulator binding profiles from CistromeDB to identify the potential regulators shaping the expression pattern of each cluster. In rare cases, users need to download LISA data manually. If the program fails while downloading data, follow this [LISA document](https://github.com/liulab-dfci/lisa2/blob/master/docs/troubleshooting.md). If users have multiple clusters of differential genes, the "multi" mode is recommended. ```R > pbmc.RNA.tfs <- RNAAnnotateTranscriptionFactor(RNA = pbmc.RNA.res$RNA, From 58a2b3ca485cc760909199cdc2682b4e6f5ab433 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Mon, 7 Dec 2020 21:20:24 -0500 Subject: [PATCH 06/15] add lisa2 in conda env meta.yaml --- MAESTRO/MAESTRO_PipeInit.py | 6 ++-- MAESTRO/R/scRNAseq_pipe.R | 6 ++-- MAESTRO/Snakemake/scRNA/Snakefile | 14 ++++++---- MAESTRO/Snakemake/scRNA/config_template.yaml | 2 +- README.md | 3 +- conda/MAESTRO/meta.yaml | 28 ++++++++++--------- .../RNA_infrastructure_10x.md | 6 ++-- man/RNAAnnotateTranscriptionFactor.Rd | 26 ++++------------- setup.py | 8 +++--- 9 files changed, 44 insertions(+), 55 deletions(-) diff --git a/MAESTRO/MAESTRO_PipeInit.py b/MAESTRO/MAESTRO_PipeInit.py index 7d7924c..79984c6 100644 --- a/MAESTRO/MAESTRO_PipeInit.py +++ b/MAESTRO/MAESTRO_PipeInit.py @@ -272,8 +272,8 @@ def scrna_parser(subparsers): # "the 'local' mode is recommended to run the whole MAESTRO pipeline. " # "If the mode is 'local', please provide the name of LISA environment through --lisaenv " # "and specify the directory where miniconda or anaconda is installed through --condadir. DEFAULT: local.") - group_regulator.add_argument("--lisa", dest = "lisa", type = str, default = "", - help = "Path to lisa data files. For human and mouse, data can be downloaded http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz" + group_regulator.add_argument("--lisadir", dest = "lisadir", type = str, default = "", + help = "Path to lisa data files. For human and mouse, data can be downloaded from http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz" "and http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz") @@ -411,7 +411,7 @@ def scrna_config(args): signature = signature, # rabitlib = os.path.abspath(args.rabitlib), #lisamode = args.lisamode, - lisa = os.path.abspath(args.lisa) + lisadir = os.path.abspath(args.lisadir) mapindex = os.path.abspath(args.mapindex), rsem = os.path.abspath(args.rsem), whitelist = os.path.abspath(args.whitelist), diff --git a/MAESTRO/R/scRNAseq_pipe.R b/MAESTRO/R/scRNAseq_pipe.R index b16879c..e5b8c82 100644 --- a/MAESTRO/R/scRNAseq_pipe.R +++ b/MAESTRO/R/scRNAseq_pipe.R @@ -39,7 +39,7 @@ setwd(argue$outdir) count_mat = argue$expression prefix = argue$prefix thread = argue$thread -method = argue$method +#method = argue$method sigfile = argue$signature species = argue$species @@ -70,8 +70,8 @@ if(sigfile %in% c("human.immune.CIBERSORT", "mouse.brain.ALLEN", "mouse.all.facs signatures = read.table(sigfile, header = FALSE, sep = "\t", stringsAsFactors = FALSE) } RNA.res <- RNARunSeurat(inputMat = exp.dat, project = prefix, min.c = 10, min.g = 100) -RNA.res$RNA <- RNAAnnotateCelltype(RNA = RNA.res$RNA, genes = RNA.res$genes, +RNA.res$RNA <- RNAAnnotateCelltype(RNA = RNA.res$RNA, genes = RNA.res$genes, signatures = signatures, min.score = 0.05) saveRDS(RNA.res, paste0(prefix, "_scRNA_Object.rds")) -RNA.tfs <- RNAAnnotateTranscriptionFactor(RNA = RNA.res$RNA, genes = RNA.res$genes, project = prefix, +RNA.tfs <- RNAAnnotateTranscriptionFactor(RNA = RNA.res$RNA, genes = RNA.res$genes, project = prefix, organism = species, top.tf = 10) diff --git a/MAESTRO/Snakemake/scRNA/Snakefile b/MAESTRO/Snakemake/scRNA/Snakefile index cd0f020..9d0c3e7 100644 --- a/MAESTRO/Snakemake/scRNA/Snakefile +++ b/MAESTRO/Snakemake/scRNA/Snakefile @@ -434,19 +434,21 @@ rule scrna_analysis: species = config["species"], outpre = config["outprefix"], outdir = "Result/Analysis", - method = "LISA", - lisamode = config["lisamode"], - lisaenv = config["lisaenv"], - condadir = config["condadir"], + lisadir = config["lisadir"], + #method = "LISA", + #lisamode = config["lisamode"], + #lisaenv = config["lisaenv"], + #condadir = config["condadir"], signature = config["signature"] benchmark: "Result/Benchmark/%s_Analysis.benchmark" %(config["outprefix"]) threads: config["cores"] shell: + "lisa unpack {params.lisadir} --remove" "Rscript " + RSCRIPT_PATH + "/scRNAseq_pipe.R --expression {params.expression} --species {params.species} " - "--prefix {params.outpre} --method {params.method} --signature {params.signature} " - "--lisamode {params.lisamode} --condadir {params.condadir} --lisaenv {params.lisaenv} --outdir {params.outdir} --thread {threads}" + "--prefix {params.outpre} --signature {params.signature} " + "--outdir {params.outdir} --thread {threads}" if config["rseqc"]: rule scrna_report: diff --git a/MAESTRO/Snakemake/scRNA/config_template.yaml b/MAESTRO/Snakemake/scRNA/config_template.yaml index 138dbd8..149c9d7 100644 --- a/MAESTRO/Snakemake/scRNA/config_template.yaml +++ b/MAESTRO/Snakemake/scRNA/config_template.yaml @@ -30,7 +30,7 @@ signature: {{ signature }} # Path to the LISA data files # please follow LISA2 (https://github.com/liulab-dfci/lisa2) and download pre-computed datasets following the instructions. -lisa: {{ lisa }} +lisadir: {{ lisadir }} # Cutoff for quality control diff --git a/README.md b/README.md index 92b3633..02ffe2c 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ ### v1.2.1.9999 * Bug fixes (placeholder for v1.2.2 formal release) ### v1.2.2 +* Update LISA to LISA2 which extends the original, running faster, reducing dependencies, and adding useful CLI functions for pipeline integration. Please download the LISA2 data from [human](http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz) and [mouse](http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz). * Update conda dependencies to only requesting lowest versions. * Fix the bugs in conda package installation channel. * Update markers in the mouse.brain.ALLEN cell signature file. @@ -93,7 +94,7 @@ The full MAESTRO workflow requires extra annotation files and tools: * MAESTRO depends on [starsolo](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md) and [minimap2](https://github.com/lh3/minimap2) for mapping scRNA-seq and scATAC-seq dataset. Users need to generate **the reference files** for the alignment software and specify the path of the annotations to MAESTRO through command line options. -* MAESTRO utilizes **LISA** to evaluate the enrichment of transcription factors based on the marker genes from scRNA-seq clusters. By default, users can choose the "web" option, which will use the API function in MAESTRO to perform LISA analysis. However, if users want to use the local version of LISA, they need to install [LISA](https://github.com/qinqian/lisa) locally, build the annotation files according to the LISA document, and provide the path of LISA to MAESTRO when using the RNAAnnotateTranscriptionFactor function. The input gene set can be constituted of only official gene symbols, only RefSeq ids, only Ensembl ids, only Entrez ids, or a mixture of these identifiers. +* MAESTRO utilizes **LISA2** to evaluate the enrichment of transcription factors based on the marker genes from scRNA-seq clusters. If users want to use LISA2, they need to download and install reference data either for [human](http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz) or for [mouse](http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz) locally and build the data according to the [LISA2 document](https://github.com/liulab-dfci/lisa2/blob/master/docs/troubleshooting.md). The input gene set can be constituted of only official gene symbols, only RefSeq ids, only Ensembl ids, only Entrez ids, or a mixture of these identifiers. * MAESTRO utilizes **giggle** to identify enrichment of transcription factor peaks in scATAC-seq cluster-specific peaks. By default [giggle](https://github.com/ryanlayer/giggle) is installed in MAESTRO environment. The giggle index for Cistrome database can be downloaded [here](http://cistrome.org/~chenfei/MAESTRO/giggle.all.tar.gz) (**Note:** Before v1.2.0, the giggle index `giggle.tar.gz` can be downloaded from http://cistrome.org/~chenfei/MAESTRO/giggle.tar.gz. Since v1.2.0, please download the latest index [giggle.all.tar.gz](http://cistrome.org/~chenfei/MAESTRO/giggle.all.tar.gz)). Users need to download the file and provide the location of the giggle annotation to MAESTRO when using the ATACAnnotateTranscriptionFactor function. diff --git a/conda/MAESTRO/meta.yaml b/conda/MAESTRO/meta.yaml index 3864879..e90274c 100644 --- a/conda/MAESTRO/meta.yaml +++ b/conda/MAESTRO/meta.yaml @@ -1,6 +1,6 @@ package: name: maestro - version: "1.2.1.9999" + version: "1.2.2" source: path: ../../ # build: @@ -18,13 +18,14 @@ requirements: - setuptools - snakemake >=5.14 - rseqc >=3.0.0 - - star >=2.7.3a + - star =2.7.6a - minimap2 >=2.17 - samtools >=1.9 - bedtools >=2.29.2 - rsem >=1.3.2 - picard >=2.22.0 - macs2 >=2.2.6 + - lisa2 - umap-learn - pysam # this is needed by sinto - r-base >=4.0.2 @@ -40,10 +41,10 @@ requirements: - r-cluster - r-Seurat =3.1.5 # Although the following will be installed through the dependencies of other tools - # They have been explicitly defined in DESCRIPTION. + # They have been explicitly defined in DESCRIPTION. - r-ggplot2 - - r-ggrepel - - r-cowplot + - r-ggrepel + - r-cowplot - r-matrix - r-dplyr - r-png @@ -84,15 +85,16 @@ requirements: - setuptools - snakemake >=5.14 - rseqc >=3.0.0 - - star >=2.7.3a + - star =2.7.6a - minimap2 >=2.17 - samtools >=1.9 - bedtools >=2.29.2 - rsem >=1.3.2 - picard >=2.22.0 - macs2 >=2.2.6 - - umap-learn - - pysam # this is needed by sinto + - lisa2 + - umap-learn + - pysam # this is needed by sinto - r-base >=4.0.2 - r-BiocManager - r-devtools @@ -106,10 +108,10 @@ requirements: - r-cluster - r-Seurat =3.1.5 # Although the following will be installed through the dependencies of other tools - # They have been explicitly defined in DESCRIPTION. - - r-ggplot2 >=3.0.0 - - r-ggrepel - - r-cowplot + # They have been explicitly defined in DESCRIPTION. + - r-ggplot2 >=3.0.0 + - r-ggrepel + - r-cowplot - r-matrix - r-dplyr - r-png @@ -144,4 +146,4 @@ requirements: about: home: https://github.com/liulab-dfci/MAESTRO license: GPL-3.0 - summary: MAESTRO(Model-based AnalysEs of Single-cell Transcriptome and RegulOme) is a comprehensive single-cell RNA-seq and ATAC-seq analysis suit built using snakemake. + summary: MAESTRO(Model-based AnalysEs of Single-cell Transcriptome and RegulOme) is a comprehensive single-cell RNA-seq and ATAC-seq analysis suit built using snakemake. diff --git a/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md b/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md index 92bb6ff..3405bc6 100644 --- a/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md +++ b/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md @@ -24,7 +24,7 @@ $ MAESTRO scrna-init --platform 10x-genomics --species GRCh38 \ --cores 8 --rseqc --directory Analysis/10X_PBMC_8k_MAESTRO_V110 --outprefix 10X_PBMC_8k \ --mapindex annotations/MAESTRO/Refdata_scRNA_MAESTRO_GRCh38_1.1.0/GRCh38_STAR_2.7.3a \ --whitelist Data/barcodes/737K-august-2016.txt \ ---umi-length 10 --lisa lisa/hg38_2.1.tar.gz --signature human.immune.CIBERSORT +--umi-length 10 --lisadir lisa/hg38_2.1.tar.gz --signature human.immune.CIBERSORT ``` To get a full description of command-line options, please use the command `MAESTRO scrna-init -h`. @@ -43,7 +43,7 @@ usage: MAESTRO scrna-init [-h] [--platform {10x-genomics,Dropseq,Smartseq2}] [--barcode-start BARCODE_START] [--barcode-length BARCODE_LENGTH] [--umi-start UMI_START] [--umi-length UMI_LENGTH] - [--lisa][--signature SIGNATURE] + [--lisadir][--signature SIGNATURE] ``` Here we list all the arguments and their description. @@ -97,7 +97,7 @@ Arguments | Description Arguments | Description --------- | ----------- -`--lisa` | Path to the LISA data files. Please download LISA's required data from cistrome.org: [human](http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz) and [mouse](http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz). The latest version of [LISA2](https://github.com/liulab-dfci/lisa2) was recently released to largely decrease processing time on multiple gene lists. Please check out for detailed information. +`--lisadir` | Path to the LISA data files. Please download LISA's required data from cistrome.org: [human](http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz) and [mouse](http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz). The latest version of [LISA2](https://github.com/liulab-dfci/lisa2) was recently released to largely decrease processing time on multiple gene lists. Please check out for detailed information. **Cell signature arguments:** diff --git a/man/RNAAnnotateTranscriptionFactor.Rd b/man/RNAAnnotateTranscriptionFactor.Rd index 739b95e..4be55a9 100644 --- a/man/RNAAnnotateTranscriptionFactor.Rd +++ b/man/RNAAnnotateTranscriptionFactor.Rd @@ -10,11 +10,8 @@ RNAAnnotateTranscriptionFactor( genes, cluster = NULL, project = RNA@project.name, - method = "RABIT", + method = "LISA", rabit.path, - lisa.mode = "local", - conda.dir = "", - lisa.envname = "lisa", organism = "GRCh38", top.tf = 10 ) @@ -30,23 +27,12 @@ the cluster should be set as c(1,2,3,4,5,6).} \item{project}{Output project name for the transcription factor analysis result. Default is the project name of the Seurat object.} -\item{method}{Use RABIT or LISA to identify the driver regulators. If Rabit is selected, should provide rabit annotation at the -same time. LISA do not need annotations for this step. Default is Rabit.} +\item{method}{Use LISA or Rabit to identify the driver regulators. If using LISA, users will need to download and install annotation files. If Rabit is selected, should provide rabit annotation at the same time. Default is LISA.} \item{rabit.path}{Path of the rabit annotation files. The rabit annoation for CistromeDB is available at https://github.com/liulab-dfci/MAESTRO. Download and unzip the annotation, supply the path of the annotations to \code{rabit.path}.} -\item{lisa.mode}{Mode to Run LISA, "local" or "web". If the mode is set as "local", -please install LISA (https://github.com/qinqian/lisa) and download pre-computed datasets -following the instructions. The "web" mode is to run online version of LISA. -In consideration of the connection issue, if users have mutiple clusters of differential genes, -the "local" mode is recommended.} - -\item{conda.dir}{Directory where miniconda or anaconda is installed (required if the \code{lisa.mode} is set to "local"). -For example, /home1/wangchenfei/miniconda3.} - -\item{lisa.envname}{The name of the lisa conda environment (required if the \code{lisa.mode} is set to "local"). -Default is "lisa".} +\item{lisa.mode}{Mode to Run LISA, "multi" or "one-vs-rest". Currently disbabled since we only support multi mode. } \item{organism}{Organism for the dataset. Only support "GRCh38" and "GRCm38". Default is "GRCh38".} @@ -57,15 +43,13 @@ A list of top enriched transcription factor family for each cluster. For each li TFs are ranked according to expression. } \description{ -Predict driver transcrition factors for scRNA-seq clusters based on TF ChIP-seq peaks from CistromeDB. -To run this function, you must first install the rabit package (see from http://rabit.dfci.harvard.edu/). -} +Predict driver transcrition factors for scRNA-seq clusters based on TF ChIP-seq peaks from CistromeDB. \examples{ data(pbmc.RNA) data(human.immune.CIBERSORT) pbmc.RNA.res <- RNARunSeurat(inputMat = pbmc.RNA, project = "PBMC.scRNA.Seurat", min.c = 10, min.g = 100, dims.use = 1:15) pbmc.RNA.res$RNA <- RNAAnnotateCelltype(pbmc.RNA.res$RNA, pbmc.RNA.res$genes, human.immune.CIBERSORT, min.score = 0.05) -pbmc.tfs <- RNAAnnotateTranscriptionFactor(pbmc.RNA.res$RNA, pbmc.RNA.res$genes, project = "PBMC.scRNA.TF", rabit.path = "/homes/cwang/annotations/rabit") +pbmc.tfs <- RNAAnnotateTranscriptionFactor(pbmc.RNA.res$RNA, pbmc.RNA.res$genes, project = "PBMC.scRNA.TF", organism = "GRCh38", top.tf =10) pbmc.tfs } diff --git a/setup.py b/setup.py index a579566..9b73652 100644 --- a/setup.py +++ b/setup.py @@ -18,21 +18,21 @@ def main(): setup( name = "MAESTRO", - version = "1.2.1.9999", + version = "1.2.2", package_dir = {'MAESTRO':'MAESTRO'}, packages = ['MAESTRO'], package_data={'MAESTRO':['Snakemake/scRNA/*', 'Snakemake/scATAC/*', 'Snakemake/integrate/*', 'R/*', 'annotations/*', 'html/*', '']}, #data_files = [(os.path.join(sys.prefix,'bin'), ['refpkg/giggle/bin/giggle'])], scripts = ['MAESTRO/MAESTRO'], include_package_data = True, - + author = "Chenfei Wang, Dongqing Sun", author_email = "", description = "MAESTRO(Model-based AnalysEs of Single-cell Transcriptome and RegulOme) is a comprehensive " "single-cell RNA-seq and ATAC-seq analysis suit built using snakemake.", license = "GPL-3.0", url = "https://github.com/chenfeiwang/MAESTRO", - + # entry_points = {"console_scripts": ["strap = strap:main"]}, classifiers = [ "Development Status :: 4 - Beta", @@ -45,7 +45,7 @@ def main(): "Topic :: Scientific/Engineering :: Bio-Informatics" ], #install_requires=["sinto>=0.7.1",], - #setup_requires=["sinto>=0.7.1",], + #setup_requires=["sinto>=0.7.1",], ) From 1db0c2c6f4782eca5edb31c73699674667e105b6 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Tue, 8 Dec 2020 01:49:35 -0500 Subject: [PATCH 07/15] modified .travis.yml --- .travis.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4a4ee16..f42d631 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,6 +55,7 @@ install: # configure the channels - conda config --add channels defaults + - conda config --add channels liulab-dfci - conda config --add channels bioconda - conda config --add channels conda-forge - conda install -q mamba -c conda-forge @@ -127,6 +128,7 @@ script: - giggle search # - Rabit -help - sinto -h + - lisa - MAESTRO -v - R -e "library(MAESTRO);library(Seurat)" - R -e "library(org.Hs.eg.db);library(org.Mm.eg.db)" @@ -137,7 +139,7 @@ script: # We also need more testing here! # the following codes will upload when all the above is successful and -# the test is triggered within "liulab-dfci/MAESTRO" repo and +# the test is triggered within "baigal628/MAESTRO" repo and # the test is triggered with a tag value, e.g. 1.1.0 or 1.2.0 and # the CONDA_UPLOAD_TOKEN must match the one retrieved under anaconda account # @@ -152,7 +154,7 @@ after_success: - mamba install anaconda-client - | # Only upload builds from tags - if [[ $TRAVIS_PULL_REQUEST == false && $TRAVIS_REPO_SLUG == "liulab-dfci/MAESTRO" + if [[ $TRAVIS_PULL_REQUEST == false && $TRAVIS_REPO_SLUG == "baigal628/MAESTRO" && $TRAVIS_BRANCH == $TRAVIS_TAG && $TRAVIS_TAG != '' ]]; then export ANACONDA_API_TOKEN=$CONDA_UPLOAD_TOKEN anaconda upload bld-dir/**/maestro-*.tar.bz2 From 0b8e4ae4d2567af0ab788de4eb53796a3397ff41 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Tue, 8 Dec 2020 02:35:56 -0500 Subject: [PATCH 08/15] modify lisa channels --- .travis.yml | 2 +- conda/MAESTRO/meta.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index f42d631..7da0ca8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,7 +55,7 @@ install: # configure the channels - conda config --add channels defaults - - conda config --add channels liulab-dfci + - conda config --add channels allenwlynch - conda config --add channels bioconda - conda config --add channels conda-forge - conda install -q mamba -c conda-forge diff --git a/conda/MAESTRO/meta.yaml b/conda/MAESTRO/meta.yaml index e90274c..7bc77ff 100644 --- a/conda/MAESTRO/meta.yaml +++ b/conda/MAESTRO/meta.yaml @@ -25,7 +25,7 @@ requirements: - rsem >=1.3.2 - picard >=2.22.0 - macs2 >=2.2.6 - - lisa2 + - lisa2 >=2.1.0 - umap-learn - pysam # this is needed by sinto - r-base >=4.0.2 @@ -92,7 +92,7 @@ requirements: - rsem >=1.3.2 - picard >=2.22.0 - macs2 >=2.2.6 - - lisa2 + - lisa2 >=2.1.0 - umap-learn - pysam # this is needed by sinto - r-base >=4.0.2 From 6e4c6db958433401b410dbe53a8664c2c2779398 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Tue, 8 Dec 2020 10:14:36 -0500 Subject: [PATCH 09/15] fixed bug in init --- MAESTRO/MAESTRO_PipeInit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAESTRO/MAESTRO_PipeInit.py b/MAESTRO/MAESTRO_PipeInit.py index 79984c6..51761e5 100644 --- a/MAESTRO/MAESTRO_PipeInit.py +++ b/MAESTRO/MAESTRO_PipeInit.py @@ -411,7 +411,7 @@ def scrna_config(args): signature = signature, # rabitlib = os.path.abspath(args.rabitlib), #lisamode = args.lisamode, - lisadir = os.path.abspath(args.lisadir) + lisadir = os.path.abspath(args.lisadir), mapindex = os.path.abspath(args.mapindex), rsem = os.path.abspath(args.rsem), whitelist = os.path.abspath(args.whitelist), From 153dceceb50e88cc636dcf54d576f589fadcf31a Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Wed, 9 Dec 2020 03:18:52 -0500 Subject: [PATCH 10/15] fixed bugs after installing new conda package --- DESCRIPTION | 8 ++++---- MAESTRO/MAESTRO_ParameterValidate.py | 10 ---------- MAESTRO/MAESTRO_PipeInit.py | 6 +++--- MAESTRO/Snakemake/scRNA/Snakefile | 2 +- MAESTRO/Snakemake/scRNA/config_template.yaml | 3 +-- R/RNAAnnotateTranscriptionFactor.R | 2 +- README.md | 1 + .../RNA_infrastructure_10x/RNA_infrastructure_10x.md | 8 ++++---- man/RNAAnnotateTranscriptionFactor.Rd | 4 ++-- 9 files changed, 17 insertions(+), 27 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 658a3ba..7e323e9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,14 +1,14 @@ Package: MAESTRO Type: Package Title: Model-based Analyses of Single-cell Transcriptome and Regulome -Version: 1.2.1.9999 +Version: 1.2.2 Date: 2020-10-28 Author: Chenfei Wang, Dongqing Sun Maintainer: Dongqing Sun -Description: MAESTRO is an intergrative analysis pipeline to support downstream analysis of single-cell RNA-seq and single-cell ATAC-seq dataset. MAESTRO provides funtion for quality control, normalization, clustering, differential gene and peak analysis, marker gene based annotation, transcription factor identification using Cistome toolkit, and intergrative analysis for scRNA-seq and scATACseq. +Description: MAESTRO is an intergrative analysis pipeline to support downstream analysis of single-cell RNA-seq and single-cell ATAC-seq dataset. MAESTRO provides funtion for quality control, normalization, clustering, differential gene and peak analysis, marker gene based annotation, transcription factor identification using Cistome toolkit, and intergrative analysis for scRNA-seq and scATACseq. URL: https://github.com/LiuLab-dfci/MAESTRO Depends: R (>= 3.6.1) -Imports: Seurat (>= 3.1.2), ggplot2 (>= 3.0.0), ggrepel, cowplot, Matrix (>= +Imports: Seurat (>= 3.1.2), ggplot2 (>= 3.0.0), ggrepel, cowplot, Matrix (>= 1.2.14), dplyr, png, RColorBrewer, scales, pheatmap, MAGeCKFlute, DESeq2, Gmisc, grid, karyoploteR, presto, AnnotationDbi, org.Hs.eg.db, org.Mm.eg.db Suggests: knitr, pagoda2, RCA, MAST, scABC, devtools, uwot, cisTopic, chromVAR, motifmatchr, @@ -24,7 +24,7 @@ License: GPL (>=3) Encoding: UTF-8 LazyData: true NeedsCompilation: no -biocViews: Software, Single-cell, RNA-seq, ATAC-seq, QualityControl, Clustering, +biocViews: Software, Single-cell, RNA-seq, ATAC-seq, QualityControl, Clustering, DifferentialExpression, DifferentialPeakCalling, TranscriptionFactorEnrichment, IntegrativeAnalyis, Visualization RoxygenNote: 7.0.2 diff --git a/MAESTRO/MAESTRO_ParameterValidate.py b/MAESTRO/MAESTRO_ParameterValidate.py index 2c7ba2c..f3358a2 100644 --- a/MAESTRO/MAESTRO_ParameterValidate.py +++ b/MAESTRO/MAESTRO_ParameterValidate.py @@ -120,19 +120,9 @@ def scrna_validator(args): logging.error("--rsem is required. Please provide the prefix of transcript references for RSEM. See --rsem help for more details.") exit(1) - if args.lisamode == "local": - if args.lisaenv == "": - logging.error("--lisaenv is required when lisamode is 'local'. Please specify the name of LISA environment!") - exit(1) - if args.condadir == "": - logging.error("--condadir is required when lisamode is 'local'. Please specify the directory where miniconda or anaconda is installed!") - exit(1) - if args.signature not in ['human.immune.CIBERSORT', 'mouse.brain.ALLEN', 'mouse.all.facs.TabulaMuris', 'mouse.all.droplet.TabulaMuris']: if os.path.exists(args.signature): pass else: logging.error("Please specify the signature built in MAESTRO or provide customized signature file. See --signature help for more details!") exit(1) - - diff --git a/MAESTRO/MAESTRO_PipeInit.py b/MAESTRO/MAESTRO_PipeInit.py index 51761e5..7dec9ae 100644 --- a/MAESTRO/MAESTRO_PipeInit.py +++ b/MAESTRO/MAESTRO_PipeInit.py @@ -198,8 +198,8 @@ def scrna_parser(subparsers): # Output arguments group_output = workflow.add_argument_group("Running and output arguments") - group_output.add_argument("--cores", dest = "cores", default = 8, - type = int, help = "The number of cores to use. DEFAULT: 8.") + group_output.add_argument("--cores", dest = "cores", default = 10, + type = int, help = "The number of cores to use. DEFAULT: 10.") group_output.add_argument("--rseqc", dest = "rseqc", action = "store_true", help = "Whether or not to run RSeQC. " "If set, the pipeline will include the RSeQC part and then takes a longer time. " @@ -411,7 +411,7 @@ def scrna_config(args): signature = signature, # rabitlib = os.path.abspath(args.rabitlib), #lisamode = args.lisamode, - lisadir = os.path.abspath(args.lisadir), + lisadir = os.path.abspath(args.lisadir) mapindex = os.path.abspath(args.mapindex), rsem = os.path.abspath(args.rsem), whitelist = os.path.abspath(args.whitelist), diff --git a/MAESTRO/Snakemake/scRNA/Snakefile b/MAESTRO/Snakemake/scRNA/Snakefile index 9d0c3e7..b3eb096 100644 --- a/MAESTRO/Snakemake/scRNA/Snakefile +++ b/MAESTRO/Snakemake/scRNA/Snakefile @@ -445,7 +445,7 @@ rule scrna_analysis: threads: config["cores"] shell: - "lisa unpack {params.lisadir} --remove" + "lisa unpack {params.lisadir}; " "Rscript " + RSCRIPT_PATH + "/scRNAseq_pipe.R --expression {params.expression} --species {params.species} " "--prefix {params.outpre} --signature {params.signature} " "--outdir {params.outdir} --thread {threads}" diff --git a/MAESTRO/Snakemake/scRNA/config_template.yaml b/MAESTRO/Snakemake/scRNA/config_template.yaml index 149c9d7..e93ee3d 100644 --- a/MAESTRO/Snakemake/scRNA/config_template.yaml +++ b/MAESTRO/Snakemake/scRNA/config_template.yaml @@ -19,7 +19,7 @@ outprefix: {{ outprefix }} # By default, the pipeline will skip the RSeQC part. DEFAULT: False. rseqc: {{ rseqc }} -# Number of cores to use +# Number of cores to use. cores: {{ cores }} # Cell signature file used to annotate cell types. MAESTRO provides several sets of built-in cell signatures. @@ -29,7 +29,6 @@ cores: {{ cores }} signature: {{ signature }} # Path to the LISA data files -# please follow LISA2 (https://github.com/liulab-dfci/lisa2) and download pre-computed datasets following the instructions. lisadir: {{ lisadir }} diff --git a/R/RNAAnnotateTranscriptionFactor.R b/R/RNAAnnotateTranscriptionFactor.R index a43e8f9..c50a174 100644 --- a/R/RNAAnnotateTranscriptionFactor.R +++ b/R/RNAAnnotateTranscriptionFactor.R @@ -261,7 +261,7 @@ RunLISAMulti <- function(genes, project, organism = "GRCh38") tf_all_log10$TF= Hmisc::capitalize(tolower(tf_all_log10$TF)) } rownames(tf_all_log10)=tf_all_log10$TF - tf_all_log10 = tf_all_log10[, c(-1)] + tf_all_log10 = subset(tf_all_log10, select = -c(TF)) write.table(tf_all_log10,paste0(project,'_lisa.txt'),sep='\t',quote = F) return(tf_all_log10) } diff --git a/README.md b/README.md index 02ffe2c..2306971 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ $ bash Miniconda3-latest-Linux-x86_64.sh And then users can create an isolated environment for MAESTRO and install through the following commands: ``` bash $ conda config --add channels defaults +$ conda config --add channels allenwlynch $ conda config --add channels bioconda $ conda config --add channels conda-forge # To make the installation faster, we recommend using mamba diff --git a/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md b/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md index 3405bc6..f463800 100644 --- a/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md +++ b/example/RNA_infrastructure_10x/RNA_infrastructure_10x.md @@ -21,8 +21,8 @@ Initialize the MAESTRO scRNA-seq workflow using `MAESTRO scrna-init` command. Th ```bash $ MAESTRO scrna-init --platform 10x-genomics --species GRCh38 \ --fastq-dir Data/10X_PBMC_8k/fastqs --fastq-prefix pbmc8k \ ---cores 8 --rseqc --directory Analysis/10X_PBMC_8k_MAESTRO_V110 --outprefix 10X_PBMC_8k \ ---mapindex annotations/MAESTRO/Refdata_scRNA_MAESTRO_GRCh38_1.1.0/GRCh38_STAR_2.7.3a \ +--cores 10 --rseqc --directory Analysis/10X_PBMC_8k_MAESTRO_V110 --outprefix 10X_PBMC_8k \ +--mapindex annotations/MAESTRO/Refdata_scRNA_MAESTRO_GRCh38_1.2.2/GRCh38_STAR_2.7.6a \ --whitelist Data/barcodes/737K-august-2016.txt \ --umi-length 10 --lisadir lisa/hg38_2.1.tar.gz --signature human.immune.CIBERSORT ``` @@ -63,7 +63,7 @@ Arguments | Description Arguments | Description --------- | ----------- -`--cores` | The number of cores to use. DEFAULT: 8. +`--cores` | The number of cores to use. DEFAULT: 10. `--rseqc` | Whether or not to run RSeQC. If set, the pipeline will include the RSeQC part and then takes a longer time. By default (not set), the pipeline will skip the RSeQC part. `--directory` | Path to the directory where the workflow shall be initialized and results shall be stored. DEFAULT: MAESTRO. `--outprefix` | Prefix of output files. DEFAULT: MAESTRO. @@ -295,7 +295,7 @@ To identify enriched transcription regulators is crucial to understanding gene r [10] "NCOR1" ``` -Besides identifying TFs for all the clusters, we also support the differential gene list from a single comparison. In this situation, users can choose the "web" mode. +Besides identifying TFs for all the clusters, we also support the differential gene list from a single comparison. ```R > de.geneset <- FindMarkersMAESTRO(pbmc.RNA.res$RNA, ident.1 = c(0)) > pbmc.RNA.monocyte.tfs <- RNAAnnotateTranscriptionFactor(RNA = pbmc.RNA.res$RNA, diff --git a/man/RNAAnnotateTranscriptionFactor.Rd b/man/RNAAnnotateTranscriptionFactor.Rd index 4be55a9..9a0ccd8 100644 --- a/man/RNAAnnotateTranscriptionFactor.Rd +++ b/man/RNAAnnotateTranscriptionFactor.Rd @@ -44,6 +44,7 @@ TFs are ranked according to expression. } \description{ Predict driver transcrition factors for scRNA-seq clusters based on TF ChIP-seq peaks from CistromeDB. +} \examples{ data(pbmc.RNA) data(human.immune.CIBERSORT) @@ -51,8 +52,7 @@ pbmc.RNA.res <- RNARunSeurat(inputMat = pbmc.RNA, project = "PBMC.scRNA.Seurat", pbmc.RNA.res$RNA <- RNAAnnotateCelltype(pbmc.RNA.res$RNA, pbmc.RNA.res$genes, human.immune.CIBERSORT, min.score = 0.05) pbmc.tfs <- RNAAnnotateTranscriptionFactor(pbmc.RNA.res$RNA, pbmc.RNA.res$genes, project = "PBMC.scRNA.TF", organism = "GRCh38", top.tf =10) pbmc.tfs - } \author{ Dongqing Sun, Chenfei Wang -} +} \ No newline at end of file From ef520fc81b2bd7cf84829fb80cf6cd6e08fcd30f Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Wed, 9 Dec 2020 03:35:04 -0500 Subject: [PATCH 11/15] add , after a lisadir arg --- MAESTRO/MAESTRO_PipeInit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAESTRO/MAESTRO_PipeInit.py b/MAESTRO/MAESTRO_PipeInit.py index 7dec9ae..3640938 100644 --- a/MAESTRO/MAESTRO_PipeInit.py +++ b/MAESTRO/MAESTRO_PipeInit.py @@ -411,7 +411,7 @@ def scrna_config(args): signature = signature, # rabitlib = os.path.abspath(args.rabitlib), #lisamode = args.lisamode, - lisadir = os.path.abspath(args.lisadir) + lisadir = os.path.abspath(args.lisadir), mapindex = os.path.abspath(args.mapindex), rsem = os.path.abspath(args.rsem), whitelist = os.path.abspath(args.whitelist), From 8b3b0b9a549b756e905fcf5342b4c8401402e2c6 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Wed, 9 Dec 2020 09:59:57 -0500 Subject: [PATCH 12/15] fix typo in documentation --- example/ATAC_infrastructure_10x/ATAC_infrastructure_10x.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/ATAC_infrastructure_10x/ATAC_infrastructure_10x.md b/example/ATAC_infrastructure_10x/ATAC_infrastructure_10x.md index f5423b2..90f0e11 100644 --- a/example/ATAC_infrastructure_10x/ATAC_infrastructure_10x.md +++ b/example/ATAC_infrastructure_10x/ATAC_infrastructure_10x.md @@ -389,7 +389,7 @@ The output TFs from MAESTRO have already been pre-filtered using TF regulatory p cluster.1 = 0, type = "ATAC", SeuratObj = pbmc.ATAC.res$ATAC, - GIGGLE.table = "10X_PBMC_10k_giggle.txt", + GIGGLE.table = "10X_PBMC_10k.PredictedTFScore.txt", visual.totalnumber = 100, name = "10X_PBMC_10k_Monocyte_filtered") ``` From 3e1bb27b6945c9ccb687b0e3dec77b9e90524c10 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Wed, 9 Dec 2020 16:47:34 -0500 Subject: [PATCH 13/15] update lisa2 channel to liulab-dfci --- .travis.yml | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7da0ca8..ae35442 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,7 +55,7 @@ install: # configure the channels - conda config --add channels defaults - - conda config --add channels allenwlynch + - conda config --add channels liulab-dfci - conda config --add channels bioconda - conda config --add channels conda-forge - conda install -q mamba -c conda-forge diff --git a/README.md b/README.md index 2306971..69687e4 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ $ bash Miniconda3-latest-Linux-x86_64.sh And then users can create an isolated environment for MAESTRO and install through the following commands: ``` bash $ conda config --add channels defaults -$ conda config --add channels allenwlynch +$ conda config --add channels liulab-dfci $ conda config --add channels bioconda $ conda config --add channels conda-forge # To make the installation faster, we recommend using mamba From 6f4897dd60fbeb8f475e7b800d8720fb0c996f2e Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Thu, 10 Dec 2020 08:17:03 -0500 Subject: [PATCH 14/15] changed name of travis CI repo back to liulab-dfci --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index ae35442..56af2a0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -139,7 +139,7 @@ script: # We also need more testing here! # the following codes will upload when all the above is successful and -# the test is triggered within "baigal628/MAESTRO" repo and +# the test is triggered within "liulab-dfci/MAESTRO" repo and # the test is triggered with a tag value, e.g. 1.1.0 or 1.2.0 and # the CONDA_UPLOAD_TOKEN must match the one retrieved under anaconda account # @@ -154,7 +154,7 @@ after_success: - mamba install anaconda-client - | # Only upload builds from tags - if [[ $TRAVIS_PULL_REQUEST == false && $TRAVIS_REPO_SLUG == "baigal628/MAESTRO" + if [[ $TRAVIS_PULL_REQUEST == false && $TRAVIS_REPO_SLUG == "liulab-dfci/MAESTRO" && $TRAVIS_BRANCH == $TRAVIS_TAG && $TRAVIS_TAG != '' ]]; then export ANACONDA_API_TOKEN=$CONDA_UPLOAD_TOKEN anaconda upload bld-dir/**/maestro-*.tar.bz2 From 8844f165fafe78269a2945655a6205facb5f0031 Mon Sep 17 00:00:00 2001 From: Gali Bai Date: Thu, 10 Dec 2020 09:24:51 -0500 Subject: [PATCH 15/15] Correct wordings in log --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 69687e4..395b392 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ ### v1.2.1.9999 * Bug fixes (placeholder for v1.2.2 formal release) ### v1.2.2 -* Update LISA to LISA2 which extends the original, running faster, reducing dependencies, and adding useful CLI functions for pipeline integration. Please download the LISA2 data from [human](http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz) and [mouse](http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz). +* Update LISA to LISA2 which extends the original, runs faster, reduces dependencies, and adds useful CLI functions for pipeline integration. Please download the LISA2 data from [human](http://cistrome.org/~alynch/data/lisa_data/hg38_2.1.tar.gz) and [mouse](http://cistrome.org/~alynch/data/lisa_data/mm10_2.1.tar.gz). * Update conda dependencies to only requesting lowest versions. * Fix the bugs in conda package installation channel. * Update markers in the mouse.brain.ALLEN cell signature file.