many updates

noriakis · Sep 26, 2024 · 1f2a7d9 · 1f2a7d9
1 parent 18f67dd
commit 1f2a7d9
Show file tree

Hide file tree

Showing 59 changed files with 202 additions and 107 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,9 +5,9 @@ Version: 0.99.0
 License: GPL-3
 Description: The package aims to analyze intra-species diversity from metagenomics. The package supports the loading functions for single nucleotide variants and copy number variant profiles, filtering, and statistical analysis.
 Authors@R: person("Noriaki", "Sato", email = "nori@hgc.jp", role = c("cre", "aut"))
-Depends: ggplot2, ggstar, ggraph, igraph
-Imports: GetoptLong, BiocFileCache, RCurl, vegan, methods, data.table, phangorn, RColorBrewer, ggtree, circlize, ComplexHeatmap, ggkegg, ape, dplyr, exactRankTests, ggblend, ggh4x, scales, tidygraph, ggplotify, ggtreeExtra, ggnewscale, scico, MKmisc, NMF, pillar, cowplot, patchwork, reshape2, Boruta, tidyr, stringr, clusterProfiler, grDevices, stats, utils
-Suggests: simplifyEnrichment, knitr, rmarkdown, NNLM, shiny, BiocStyle, matrixStats
+Depends: ggplot2, ggraph, igraph, vegan, phangorn
+Imports: GetoptLong, BiocFileCache, RCurl, methods, data.table, RColorBrewer, ggtree, circlize, ComplexHeatmap, ggkegg, ape, dplyr, exactRankTests, ggblend, ggh4x, scales, tidygraph, ggplotify, ggtreeExtra, ggnewscale, scico, MKmisc, NMF, pillar, cowplot, patchwork, reshape2, Boruta, tidyr, stringr, clusterProfiler, grDevices, stats, utils, BiocStyle, ggstar
+Suggests: simplifyEnrichment, knitr, rmarkdown, NNLM, shiny, matrixStats
 RoxygenNote: 7.3.2
 biocViews: Microbiome
 VignetteBuilder: knitr
diff --git a/R/NMF.R b/R/NMF.R
@@ -22,9 +22,10 @@
 #' @param nnlm_na_perc NA frequency when the cross-validation is performed.
 #' @param tss perform total sum scaling to the matrix
 #' @param remove_na_perc remove features with too many NA
-#' (features with NA below {remove_na_perc} * overall sample numbers will be retained)
+#' (features with NA below remove_na_perc * overall sample numbers will be retained)
 #' @importFrom NMF nmf nmfEstimateRank
 #' @export
+#' @return stana object
 NMF <- function(stana, species, rank=3, target="kos", seed=53, method="snmf/r",
     deleteZeroDepth=FALSE, beta=0.01, estimate=FALSE, estimate_range=1:6, nnlm_flag=FALSE,
     nnlm_args=list(), nnlm_na_perc=0.3, tss=FALSE, remove_na_perc=NULL) {
@@ -106,7 +107,7 @@ NMF <- function(stana, species, rank=3, target="kos", seed=53, method="snmf/r",
 
 			err <- sapply(X = estimate_range,
 			              FUN = function(k) {
-			                z <- nnmf(A2, k);
+			                z <- NNLM::nnmf(A2, k);
 			                c(mean((with(z, W %*% H)[ind] - A[ind])^2), tail(z$mse, 1));
 			              }
 			);
@@ -158,7 +159,7 @@ NMF <- function(stana, species, rank=3, target="kos", seed=53, method="snmf/r",
 	        res <- NMF::nmf(mat, rank = rank, seed = seed, method=method)
 	    }
 	    coefMat <- coef(res)
-        basisMat <- basis(res) 	
+        basisMat <- NMF::basis(res) 	
     }
 
     stana@coefMat[[species]] <- data.frame(coefMat)
@@ -207,7 +208,7 @@ plotStackedBarPlot <- function(stana, sp, by="NMF") {
 	    ## Not showing sample label, instead facet by group
 		ggplot(melted, aes(fill=variable, y=value, x=sample)) + 
 		    geom_col(position="fill")+
-		    facet_grid(. ~ group, scale="free")+
+		    facet_grid(. ~ group, scales="free")+
             scale_y_continuous(expand = expansion(mult = c(0, 0.05)))+
             cowplot::theme_cowplot()+ cowplot::panel_border()+
 		    theme(axis.text.x = element_blank())

diff --git a/R/addGeneAbundance.R b/R/addGeneAbundance.R
@@ -0,0 +1,50 @@
+#' addGeneAbundance
+#' 
+#' Add the specified gene copy number to metadata
+#' 
+#' @param stana stana object
+#' @param candSp candidate species
+#' @param IDs gene IDs to add
+#' @param target KO or genes
+#' @param how how to combine multiple IDs
+#' @param newCol new column name
+#' @param discNumeric convert discrete value to numeric
+#' @param disc discretize the abundance by the threshold. function for calculating
+#' threshold, like median
+#' @param convert conversion such as log10
+#' @export
+#' @return stana object
+addGeneAbundance <- function(stana, candSp, IDs,
+    target="KO", how=sum, newCol="gene",
+    disc=NULL, discNumeric=TRUE, convert=NULL) {
+    if (target=="KO") {
+        ints <- intersect(row.names(stana@kos[[candSp]]), IDs)
+        subMat <- stana@kos[[candSp]][ints, ]
+    } else {
+        ints <- intersect(row.names(stana@genes[[candSp]]), IDs)
+        subMat <- stana@genes[[candSp]][ints, ]
+    }
+    if (length(IDs)>1) {
+        adda <- apply(subMat, 2, how)    
+    } else {
+        adda <- subMat
+    }
+    nm <- names(adda)
+    if (!is.null(disc)) {
+        thresh <- do.call(disc, list("x"=adda))
+        if (discNumeric) {
+            adda <- as.numeric(adda > thresh)        
+        } else {
+            adda <- adda > thresh
+        }
+        names(adda) <- nm
+    }
+    meta <- stana@meta
+    if (!is.null(convert)) {
+        adda <- do.call(convert, list(x=adda))
+    }
+    meta[[newCol]] <- adda[row.names(stana@meta)]
+    stana@meta <- meta
+    return(stana)
+}
+
diff --git a/R/alleleStat.R b/R/alleleStat.R
@@ -1,4 +1,3 @@
-
 #' @noRd
 alleleStat_metaSNV <- function(stana, sp, cl, deleteZeroDepth) {
 	statList <- list()
@@ -43,7 +42,6 @@ alleleStat_metaSNV <- function(stana, sp, cl, deleteZeroDepth) {
 #' @param deleteZeroDepth delete snvs with zero depth
 #' @export
 #' @return list of statistical values
-#' 
 alleleStat <- function(stana, sp=NULL, cl=NULL, base="maj",
 	deleteZeroDepth=FALSE) {
 	if (is.null(sp)) {sp <- stana@ids[1]}

diff --git a/R/annot.R b/R/annot.R
@@ -65,7 +65,7 @@ anno_eggNOG_keywords <- function(split, genes, tib,
   })
 
   lt <- lapply(gene_list, function(x) {
-    (tib |> dplyr::filter(ID %in% x))$V2
+    (tib |> dplyr::filter(.data$ID %in% x))$V2
   })
 
   names(lt) <- names(gene_list)

diff --git a/R/calcGF.R b/R/calcGF.R
@@ -1,12 +1,14 @@
-
-
 #' calcGF
+#' 
+#' Calculate gene family profile
+#' 
 #' @param stana stana object
 #' @param candSp candidate species ID
 #' @param how how to summarize multiple gene CN assigned to the same KO
 #' @param annot eggNOG or manual
 #' @param column When eggNOG, which family to summarize, default to KEGG_ko
 #' @export
+#' @return stana object
 calcGF <- function(stana, candSp=NULL, how=sum, annot="eggNOG", column="KEGG_ko") {
 	checkID(stana, candSp)
     if (is.null(candSp)) {cat("Species not specified, the first ID will be used:", stana@ids[1]);

diff --git a/R/checkEGGNOG.R b/R/checkEGGNOG.R
@@ -11,13 +11,13 @@
 #' "KEGG_rclass"    "BRITE"          "KEGG_TC"        "CAZy"           "BiGG_Reaction" 
 #' "PFAMs"
 #' @export
+#' @return list containing ggplot2 object and graph
 #' 
 drawEGGNOG <- function(annot_file, geneIDs, candPlot) {
   retList <- list()
   egng <- checkEGGNOG(annot_file,"all", geneIDs)
   # delete map* entry (in KEGG)
   egng <- egng[!grepl("map",egng$value),]
-  print(egng)
   ap <- NULL
   for (i in geneIDs) {
     tmp <- egng[egng$ID == i,]
@@ -87,15 +87,15 @@ checkEGGNOG <- function(annot_file, ret="all", checkIDs=NULL, fill=TRUE) {
   if (ret!="all") {
     parsed <- ann[,c("#query",ret), with=FALSE] |>
       tidyr::pivot_longer(-1) |>
-      dplyr::filter(value!="-") |>
-      dplyr::mutate(value = strsplit(as.character(value), ",")) |>
-      tidyr::unnest(value)
+      dplyr::filter(.data$value!="-") |>
+      dplyr::mutate(value = strsplit(as.character(.data$value), ",")) |>
+      tidyr::unnest(.data$value)
   } else {
     parsed <- ann[, !c("evalue","score"), with=FALSE] |>
       tidyr::pivot_longer(-1) |>
-      dplyr::filter(value!="-") |>
-      dplyr::mutate(value = strsplit(as.character(value), ",")) |>
-      tidyr::unnest(value)
+      dplyr::filter(.data$value!="-") |>
+      dplyr::mutate(value = strsplit(as.character(.data$value), ",")) |>
+      tidyr::unnest(.data$value)
   }
   parsed <- parsed |> `colnames<-`(c("ID","name","value"))
   if (!is.null(checkIDs)) {

diff --git a/R/checkPATRIC.R b/R/checkPATRIC.R
@@ -14,6 +14,7 @@
 #' @import igraph ggraph BiocFileCache RCurl ggplot2
 #' @importFrom data.table fread
 #' @export
+#' @return checkPATRIC results
 drawPATRIC <- function(genes,
                         whichToCount="ec_description",
                         delSize=0,
@@ -51,6 +52,7 @@ drawPATRIC <- function(genes,
 #' @param skipGraph skip graph plotting
 #' @import igraph ggraph BiocFileCache RCurl ggplot2
 #' @export
+#' @return list of annotations
 checkPATRIC <- function(genes,
                         whichToCount="ec_description",
                         delSize=0,

diff --git a/R/checkPATRICSimple.R b/R/checkPATRICSimple.R
@@ -8,6 +8,7 @@
 #' @param whichToCount ec_description, ec_number, pathway_name, pathway_id
 #' @import BiocFileCache RCurl
 #' @export
+#' @return list of annotation
 checkPATRICSimple <- function(genes,
                         whichToCount="ec_description") {
   allGenes <- unlist(genes)

diff --git a/R/combineGenes.R b/R/combineGenes.R
@@ -28,7 +28,7 @@ combineGenes <- function(stana_list, species) {
   if (ovlgr>0) {qqcat("Duplicate label found in group\n")}
 
   new_stana@ids <- species
-  new_stana@type <- stana@type
+  new_stana@type <- "Combined"
   new_stana@cl <- cls
   new_stana@colors <- getColors(cls)
 

diff --git a/R/compareGenes.R b/R/compareGenes.R
@@ -12,7 +12,7 @@
 #' @param verbose_zero show zero abundance genes
 #' @import ggplot2
 #' @export
-#' @return ggplot
+#' @return list of statistical results from wilcox.exact
 #' 
 compareGenes <- function(stana, species=NULL, geneID=NULL, cl=NULL, argList=list(),
 	verbose_zero=FALSE) {

diff --git a/R/consensusSeq.R b/R/consensusSeq.R
@@ -10,6 +10,7 @@
 #' @param argList parameters, passed to corresponding functions
 #' @export
 #' @rdname consensusseq
+#' @return stana object
 #'
 consensusSeq <- function(stana,
 	species=NULL, argList=list()){

diff --git a/R/consensusSeqFast.R b/R/consensusSeqFast.R
@@ -3,6 +3,7 @@
 #' @param allele_columns columns specifying major and minor allele
 #' @rdname consensusseq
 #' @export
+#' @return stana object
 consensusSeqGeneral <- function(
 	stana,
 	species=NULL,
@@ -69,7 +70,7 @@ consensusSeqGeneral <- function(
 
         if (return_mat) {
         	mat <- do.call(cbind, allele_list)
-        	row.names(mat) <- names(site_filters)
+        	# row.names(mat) <- names(site_filters)
 	        return(mat) 
         }
 
@@ -133,10 +134,12 @@ consensusSeqGeneral <- function(
 #' @param output_seq whether to output actual FASTA file
 #' @param locus_type locus type to be included, default to CDS
 #' @param site_list site list to be included
+#' @param site_type site types to be included
 #' @param return_mat return matrix of characters
 #' @param output_seq whether to output actual FASTA file
 #' @rdname consensusseq
 #' @export
+#' @return stana object
 consensusSeqMIDAS2 <- function(
 	stana,
 	species=NULL,
@@ -150,6 +153,7 @@ consensusSeqMIDAS2 <- function(
 	site_prev=0.9,
 	locus_type="CDS",
 	site_list=NULL,
+	site_type=c("1D","2D","3D","4D"),
 	cl=NULL,
 	max_sites=Inf,
 	tree=FALSE,
@@ -287,7 +291,7 @@ consensusSeqMIDAS2 <- function(
         	    sum(c(
         	    	(length(keep_samples) / length(names(SAMPLES))) >= max(c(1e-6, site_prev)),
         	        pooled_maf >= site_maf,
-	                sp_site_type[i] %in% c("1D","2D","3D","4D"),
+	                sp_site_type[i] %in% site_type,
 	                sp_locus_type[i] %in% locus_type))
 	        return(keep_site_filter)
         }) |> unlist()
@@ -391,6 +395,7 @@ consensusSeqMIDAS2 <- function(
 #' @importFrom phangorn read.phyDat
 #' @rdname consensusseq
 #' @export
+#' @return stana object
 consensusSeqMIDAS1 <- function(
 	stana,
 	species=NULL,
@@ -405,6 +410,7 @@ consensusSeqMIDAS1 <- function(
 	max_sites=Inf,
 	tree=FALSE,
 	locus_type="CDS",
+	site_type=c("1D","2D","3D","4D"),
     keep_samples=NULL,
     verbose=FALSE,
     site_list=NULL,
@@ -517,7 +523,7 @@ consensusSeqMIDAS1 <- function(
         	    	(length(keep_samples) / length(names(SAMPLES))) >= max(c(1e-6, site_prev)),
         	        pooled_maf >= site_maf,
         	        sp_ref_allele[i] %in% c("A","T","G","C"),
-	                sp_site_type[i] %in% c("1D","2D","3D","4D"),
+	                sp_site_type[i] %in% site_type,
 	                sp_locus_type[i] %in% locus_type))
 	        return(keep_site_filter)
         }) |> unlist()

diff --git a/R/convertWideToMaf.R b/R/convertWideToMaf.R
@@ -34,7 +34,7 @@ convertWideToMaf <- function(df) {
     })
     sp <- unique(df$species_id)
     ret <- lapply(sp, function(s) {
-        tmp <- df %>% dplyr::filter(species_id == s)
+        tmp <- df %>% dplyr::filter(.data$species_id == s)
         tmp <- tmp[,c("snv_id", "maf")]
         row.names(tmp) <- tmp$snv_id
         tmp$snv_id <- NULL
@@ -43,7 +43,7 @@ convertWideToMaf <- function(df) {
     names(ret) <- sp
 
     summary <- lapply(sp, function(s) {
-        tmp <- df %>% dplyr::filter(species_id == s)
+        tmp <- df %>% dplyr::filter(.data$species_id == s)
         tmp <- tmp[,c("snv_id", "major", "minor")]
         row.names(tmp) <- tmp$snv_id
         tmp$snv_id <- NULL
@@ -62,6 +62,7 @@ convertWideToMaf <- function(df) {
 #' combine the maf table produced by convertWideToMaf()
 #' @param mafs named list of the results of convertWideToMaf()
 #' @export
+#' @return stana object
 combineMaf <- function(mafs) {
   if (is.null(names(mafs))) {stop("The list must be named")}
   sps <- do.call(union, lapply(names(mafs), function(s) {
@@ -77,9 +78,9 @@ combineMaf <- function(mafs) {
       return(tmp)
     }))
     tbl <- tidyr::pivot_wider(tmpdf,
-                       id_cols=snv_id,
-                       names_from=sample,
-                       values_from = maf)
+                       id_cols="snv_id",
+                       names_from="sample",
+                       values_from = "maf")
     tbl <- data.frame(tbl)
     row.names(tbl) <- tbl$snv_id
     tbl$snv_id <- NULL

diff --git a/R/doAdonis.R b/R/doAdonis.R
@@ -28,6 +28,7 @@
 #' @importFrom stats as.formula dist
 #' @importFrom utils read.table
 #' @export
+#' @return stana object
 doAdonis <- function(stana, specs, cl=NULL,
     target="snps", formula=NULL,
     distMethod="manhattan", pcoa=FALSE,
@@ -144,7 +145,6 @@ doAdonis <- function(stana, specs, cl=NULL,
         } else {
           argList[["data"]] <- met
         }
-
         adores <- do.call("adonis2", argList)
 
         if (pr) {

diff --git a/R/doBoruta.R b/R/doBoruta.R
@@ -17,6 +17,7 @@
 #' @return list of Boruta object and plot
 #' @import ggplot2
 #' @export
+#' @return list containing Boruta results
 doBoruta <- function(stana, sp, cl=NULL, doFix=TRUE,
   target="genes", mat=NULL, whichToCount="ec_description", argList=list(),
   deleteZeroDepth=FALSE) {