Skip to content

Commit

Permalink
snv filters
Browse files Browse the repository at this point in the history
  • Loading branch information
noriakis committed Mar 28, 2024
1 parent b2926d3 commit d254d84
Show file tree
Hide file tree
Showing 18 changed files with 182 additions and 18 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ export(plotGenes)
export(plotHeatmap)
export(plotKEGGPathway)
export(plotMAF)
export(plotMAFHist)
export(plotPCA)
export(plotSNVInfo)
export(plotSNVSummary)
Expand All @@ -71,6 +72,7 @@ export(setMap)
export(setMetadata)
export(setSlot)
export(setTree)
export(siteFilter)
export(snv)
export(strainClusterHeatmap)
export(summariseAbundance)
Expand All @@ -87,6 +89,7 @@ exportMethods(getSlot)
exportMethods(getTree)
exportMethods(getTreePlot)
exportMethods(setSlot)
exportMethods(siteFilter)
import(BiocFileCache)
import(GetoptLong)
import(RColorBrewer)
Expand Down
26 changes: 20 additions & 6 deletions R/NMF.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,33 @@ NMF <- function(stana, species, rank=3, target="KO", seed=53, method="snmf/r",
mat <- stana@genes[[species]]
} else if (target=="snps") {
mat <- stana@snps[[species]]
if (!is.null(stana@includeSNVID[[species]])) {
cat_subtle("# The set SNV ID information (", length(stana@includeSNVID[[species]]), ") is used.\n")
mat <- mat[stana@includeSNVID[[species]], ]
}
if (deleteZeroDepth) {
mat <- mat[rowSums(mat == -1)==0,]
cat_subtle("# After filtering `-1`, position numbers: ",dim(mat)[1],"\n")
} else {
mat[ mat == -1 ] <- NA
if (!nnlm_flag) cat_subtle("# Changing to NNLM\n")
nnlm_flag <- TRUE
nnlm_flag <- TRUE
}
}

nac <- as.numeric(table(is.na(mat))["TRUE"]) / (dim(mat)[1]*dim(mat)[2])
zeroc <- as.numeric(table(mat==0)["TRUE"]) / (dim(mat)[1]*dim(mat)[2])

cat_subtle("# Original features: ", dim(mat)[1], "\n", sep="")
cat_subtle("# Original samples: ", dim(mat)[2], "\n", sep="")
cat_subtle("# Original matrix NA: ", round(nac, 3), "\n", sep="")
cat_subtle("# Original matrix zero: ", round(zeroc, 3), "\n", sep="")

if (is.null(nnlm_args[["loss"]])){
cat_subtle("# Selecting KL loss\n")
nnlm_args[["loss"]] <- "mkl"
}

if (!is.null(remove_na_perc)){
cat_subtle("# Removing too many NA features: ", remove_na_perc * dim(mat)[2], "\n", sep="")
mat <- mat[apply(is.na(mat), 1, sum) < remove_na_perc * dim(mat)[2],]
Expand All @@ -65,15 +82,12 @@ NMF <- function(stana, species, rank=3, target="KO", seed=53, method="snmf/r",
cat_subtle("# Performing TSS\n")
mat <- apply(mat, 2, function(x) x / sum(x))
}
cat_subtle("# Original features: ", dim(mat)[1], "\n", sep="")
cat_subtle("# Original samples: ", dim(mat)[2], "\n", sep="")
if (!nnlm_flag) {
mat <- mat[apply(mat, 1, function(x) sum(x)!=0),]
mat <- mat[,apply(mat, 2, function(x) sum(x)!=0)]

cat_subtle("# Filtered features: ", dim(mat)[1], "\n", sep="")
cat_subtle("# Filtered samples: ", dim(mat)[2], "\n", sep="")
}
cat_subtle("# Filtered features: ", dim(mat)[1], "\n", sep="")
cat_subtle("# Filtered samples: ", dim(mat)[2], "\n", sep="")

## Test multiple ranks
if (estimate) {
Expand Down
23 changes: 20 additions & 3 deletions R/consensusSeqFast.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,16 @@ consensusSeqGeneral <- function(
if (length(species)==0) {stop("No species available")}
retList <- list()
for (sp in species) {
qqcat("Beginning calling for @{sp}\n")
cat_subtle("# Beginning calling for ", sp, "\n", sep="")
SPECIES <- list()
SPECIES[["freqs"]] <- stana@snps[[sp]]

if (!is.null(stana@includeSNVID[[sp]])) {
cat_subtle("# The set SNV ID information (",
length(stana@includeSNVID[[sp]]), ") is used.\n")
SPECIES[["freqs"]] <- SPECIES[["freqs"]][stana@includeSNVID[[sp]], ]
}

siteNum <- dim(SPECIES[["freqs"]])[1]
qqcat(" Site number: @{siteNum}\n")

Expand All @@ -37,6 +44,9 @@ consensusSeqGeneral <- function(
stop("Snps info data frame has to have major_allele and minor_allele column")
}
SPECIES[["info"]] <- stana@snpsInfo[[sp]]
if (!is.null(stana@includeSNVID[[sp]])) {
SPECIES[["info"]] <- SPECIES[["info"]][stana@includeSNVID[[sp]], ]
}
}

sp_minor_allele <- SPECIES[["info"]]$minor_allele
Expand Down Expand Up @@ -156,9 +166,16 @@ consensusSeqMIDAS2 <- function(
midas_merge_dir <- stana@mergeDir
retList <- list()
for (sp in species) {
qqcat("Beginning calling for @{sp}\n")
cat_subtle("# Beginning calling for ", sp, "\n", sep="")
SPECIES <- list()
## Load info and depth file per species

if (!is.null(stana@includeSNVID[[sp]])) {
cat_subtle("# The set SNV ID information (",
length(stana@includeSNVID[[sp]]), ") is used.\n")
site_list <- stana@includeSNVID[[sp]]
}

if (is.null(stana@snpsInfo[[sp]])) {
file <- "info"
if (verbose) {
Expand Down Expand Up @@ -195,7 +212,7 @@ consensusSeqMIDAS2 <- function(
}
SPECIES[["freqs"]] <- stana@snps[[sp]]
siteNum <- dim(SPECIES[["freqs"]])[1]
qqcat(" Site number: @{siteNum}\n")
cat_subtle("# Original Site number: ", siteNum, "\n", sep="")

if (stana@type=="MIDAS2") {
if (dim(stana@snpsSummary)[1]==0) {
Expand Down
6 changes: 5 additions & 1 deletion R/doAdonis.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#' @export
doAdonis <- function(stana, specs, cl=NULL,
target="snps", formula=NULL,
distMethod="binary", pcoa=FALSE,
distMethod="manhattan", pcoa=FALSE,
AAfunc=dist.ml, AAargs=list(),
maj=FALSE, deleteZeroDepth=FALSE,
argList=list(), distArg=list()) {
Expand All @@ -39,6 +39,10 @@ doAdonis <- function(stana, specs, cl=NULL,
cat_subtle("# Performing adonis in ", sp, " target is ", target, "\n", sep="")
if (target=="snps") {
snps <- stana@snps[[sp]]
if (!is.null(stana@includeSNVID[[sp]])) {
cat_subtle("# The set SNV ID information (", length(stana@includeSNVID[[sp]]), ") is used.\n")
snps <- snps[stana@includeSNVID[[sp]], ]
}
if (deleteZeroDepth) {
snps <- snps[rowSums(snps==-1)==0,]
cat_subtle("After filtering `-1`, position numbers: ", dim(snps)[1], "\n", sep="")
Expand Down
5 changes: 5 additions & 0 deletions R/inferAndPlotTree.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ inferAndPlotTree <- function(stana, species=NULL, cl=NULL,
dm <- do.call(dist, tree_args)
} else {
mat <- stana@snps[[sp]]
if (!is.null(stana@includeSNVID[[sp]])) {
cat_subtle("# The set SNV ID information (",
length(stana@includeSNVID[[sp]]), ") is used.\n")
mat <- mat[stana@includeSNVID[[sp]], ]
}
if (deleteZeroDepth) {
mat <- mat[rowSums(mat==-1)==0, ]
cat("Position number:", dim(mat)[1], "\n")
Expand Down
File renamed without changes.
29 changes: 29 additions & 0 deletions R/plotMAF.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,32 @@ plotMAF <- function(stana, species, SNV, cl=NULL, deleteZeroDepth=TRUE) {
facet_wrap(.~snvID)+
theme_minimal()
}


#'
#' plotMAFHist
#'
#' plot the distribution of MAF for specificed species
#'
#' @param stana stana object
#' @param species candidate species
#' @import ggplot2
#' @export
#' @return ggplot
#'
plotMAFHist <- function(stana, species) {

snvDf <- stana@snps[[species]]
if (!is.null(stana@includeSNVID[[species]])) {
cat_subtle("# The set SNV ID information (", length(stana@includeSNVID[[species]]), ") is used.\n")
snvDf <- snvDf[stana@includeSNVID[[species]], ]
}
snvDf[ snvDf == -1 ] <- NA
snvMat <- as.matrix(snvDf)

df <- data.frame(freq=as.vector(snvMat))
ggplot(df, aes(x=freq))+geom_histogram()+cowplot::theme_cowplot()+
scale_y_continuous(expand = expansion(mult = c(0, 0.05)))


}
4 changes: 4 additions & 0 deletions R/plotPCA.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ plotPCA <- function(stana, species, cl=NULL, target="snps",

if (target=="snps") {
df <- stana@snps[[sp]]
if (!is.null(stana@includeSNVID[[sp]])) {
cat_subtle("# The set SNV ID information (", length(stana@includeSNVID[[sp]]), ") is used.\n")
df <- df[stana@includeSNVID[[sp]], ]
}
} else if (target=="genes") {
df <- stana@genes[[sp]]
} else if (target=="kos") {
Expand Down
33 changes: 31 additions & 2 deletions R/stana.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#' @slot eggNOG list of path for eggNOG mapper v2 results for species
#' @slot map slot storing the mapping data.frame for gene ID and orthology
#' @slot coefMat slot storing the strain (or subspecies) abundances
#' @slot includeSNVID if SNV allele frequency is used in the calculation,
#' the IDs in the slot is subset by default.
setClass("stana", slots=list(
type="character",
cl="list",
Expand All @@ -24,6 +26,7 @@ setClass("stana", slots=list(
eggNOG="list",
NMF = "list",
kos="list",
includeSNVID="list",
snpsInfo="list",
snpsDepth="list",
snpsSummary="data.frame",
Expand Down Expand Up @@ -126,15 +129,41 @@ loadDic <- function() {
}



#' siteFilter
#' @param x stana object
#' @param sp species ID
#' @param exp expression for filtering the snps info
#' @export
setGeneric("siteFilter", function(x, sp, exp) standardGeneric("siteFilter"))

#' siteFilter
#' @param x stana object
#' @param sp species ID
#' @param exp expression for filtering the snps info
#' @export
setMethod("siteFilter", "stana",
function(x, sp, exp) {
info <- x@snpsInfo[[sp]]
ret <- info %>%
dplyr::filter(!!enquo(exp)) %>%
row.names(.)
cat_subtle("# total of ", length(ret), " obtained from ", dim(info)[1], "\n", sep="")
x@includeSNVID[[sp]] <- ret
x
})



#' check
#' check and output statistics based on conditional formulas
#' check and output statistics based on conditional formulas for summary
#' @param x stana boject
#' @param exp expression for filtering the snps summary
#' @param target snps or genes
#' @export
setGeneric("check", function(x, exp, target="snps") standardGeneric("check"))
#' check
#' check and output statistics based on conditional formulas
#' check and output statistics based on conditional formulas for summary
#' @param x stana boject
#' @param exp expression for filtering the snps summary
#' @param target snps or genes
Expand Down
1 change: 1 addition & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ listToNV <- function(l) {
#' @export
#' @return ggplot
plotSNVInfo <- function(stana, sp) {
if (!is.null(stana@includeSNVID)[[sp]]) {cat_subtle("# The set SNV ID information is not used in this function\n")}
d <- stana@snpsInfo[[sp]]
tbl <- data.frame(table(paste0(d$major_allele,"/",d$minor_allele)))
if ("locus_type" %in% colnames(d)) {
Expand Down
4 changes: 2 additions & 2 deletions man/check-stana-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/check.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/doAdonis.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/loadmetaSNV.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions man/plotMAFHist.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions man/siteFilter-stana-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions man/siteFilter.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions man/stana-class.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit d254d84

Please sign in to comment.