Merge pull request #1 from dalmolingroup/upset-plot

Add upset plot and network analysis
dalmolingroup · Aug 23, 2023 · c875ccc · c875ccc
2 parents 63030b3 + 375a0f4
commit c875ccc
Show file tree

Hide file tree

Showing 17 changed files with 5,423 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -41,3 +41,4 @@ vignettes/*.pdf
 # dataset
 todo.md
 functional_enrichment_files/
+data/Homo_sapiens.GRCh38.97.chr_patch_hapl_scaff.gtf.gz
diff --git a/R/stringdb.R b/R/stringdb.R
@@ -0,0 +1,59 @@
+# Get interaction network from STRINGdb
+get_string_network <-
+  function(ids,
+           species = "9606",
+           required_score = 0) {
+    ids_collapsed <- paste0(ids, collapse = "%0d")
+
+    jsonlite::fromJSON(
+      RCurl::postForm(
+        "https://string-db.org/api/json/network",
+        identifiers = ids_collapsed,
+        echo_query  = "1",
+        required_score = as.character(required_score),
+        species = species
+      ),
+    )
+  }
+
+# Get identifiers from STRINGdb
+get_string_ids <- function(ids, species = "9606") {
+  ids_collapsed <- paste0(ids, collapse = "%0d")
+
+  jsonlite::fromJSON(
+    RCurl::postForm(
+      "https://string-db.org/api/json/get_string_ids",
+      identifiers = ids_collapsed,
+      echo_query  = "1",
+      species = species
+    ),
+  )
+}
+
+# Function to combine scores according to the STRINGdb algorithm
+combinescores <- function(dat,
+                          evidences = "all",
+                          confLevel = 0.4) {
+  if (evidences[1] == "all") {
+    edat <- dat[, -c(1, 2, ncol(dat))]
+  } else {
+    if (!all(evidences %in% colnames(dat))) {
+      stop("NOTE: one or more 'evidences' not listed in 'dat' colnames!")
+    }
+    edat <- dat[, evidences]
+  }
+  if (any(edat > 1)) {
+    edat <- edat / 1000
+  }
+  edat <- 1 - edat
+  sc <- apply(
+    X = edat,
+    MARGIN = 1,
+    FUN = function(x)
+      1 - prod(x)
+  )
+  dat <- cbind(dat[, c(1, 2)], combined_score = sc)
+  idx <- dat$combined_score >= confLevel
+  dat <- dat[idx, ]
+  return(dat)
+}
diff --git a/README.md b/README.md
@@ -9,6 +9,7 @@ The results of the different DGEs were then used for a meta-analysis.
 - [RNA-Seq](./reports/RNASeq.html) - [Source](./analysis/RNASeq.Rmd)
 - [GSE107385](./reports/GSE107385.html) - [Source](./analysis/GSE107385.Rmd)
 - [Select Genes](./reports/select_genes.html) - [Source](./analysis/select_genes.Rmd)
+- [Assemble Network](./reports/assemble_network.html) - [Source](./analysis/assemble_network.Rmd)
 
 ## Repository Structure
 
@@ -28,6 +29,10 @@ The results of the different DGEs were then used for a meta-analysis.
     - DGE for the other two RNA-Seq datasets
   - select_genes.Rmd
     - Select genes associated to mitophagy, autophagy and mitochondrial biogenesis
+  - make_upset.R
+    - Script to generate UpSet plot with intersections between grouped processes.
+  - assemble_network.Rmd
+    - Assemble PPI network between metanalysis genes.
 - `data`
   - `Plataformas.csv`
     - Original metadata for the three studies

diff --git a/analysis/assemble_network.Rmd b/analysis/assemble_network.Rmd
@@ -0,0 +1,184 @@
+---
+title: "Assemble Network"
+author: "João Vitor F. Cavalcante"
+date: "`r Sys.setlocale('LC_TIME', 'C'); format(Sys.time(), '%d %B, %Y')`"
+knit: (function(inputFile, encoding) {
+  rmarkdown::render(inputFile, encoding = encoding, output_dir = "../reports/") })
+output:
+  html_document:
+    toc: true
+    toc_float: true
+    toc_collapsed: false
+    theme:
+      bslib: true
+      bootswatch: minty
+---
+
+```{r knitr, include=FALSE}
+knitr::opts_chunk$set(
+  echo = TRUE,
+  message = FALSE,
+  warning = FALSE
+)
+```
+
+## PPI Network assembly from Metanalysis genes
+
+We took all genes resulting from the metanalysis and assembled a PPI network for them by querying
+STRINGdb v12.0.
+
+Evidence channels used to gather the interactions were experimental, co-expression and databases. Said interactions were also filtered for a confidence level of 0.5.
+
+### Load libraries and data
+
+```{r setup}
+# remotes::install_github("daniloimparato/easylayout", ref = "dadamorais")
+library(easylayout)
+library(tidygraph)
+library(igraph)
+library(ggraph)
+library(dplyr)
+library(tidyr)
+library(vroom)
+library(here)
+source(here("R/stringdb.R"))
+
+meta <- vroom(here("data/meta_analysis_genes.csv")) %>%
+  select(gene_symbol, MD, pvalor, `Grouped process`)
+```
+
+### Assemble Network
+
+```{r assemble, eval=FALSE}
+string_ids <- get_string_ids(meta$gene_symbol)
+
+meta_merged <- meta %>%
+  left_join(string_ids %>% select(queryItem, stringId),
+            by = c("gene_symbol" = "queryItem")) %>%
+  mutate(stringId = stringr::str_remove(stringId, "9606.")) %>%
+  separate_rows(`Grouped process`, sep = " \\| ")
+
+encoded_source <- meta_merged %>%
+  mutate(n = 1) %>%
+  pivot_wider(
+    id_cols     = stringId,
+    names_from  = `Grouped process`,
+    values_from = n,
+    values_fn   = list(n = length),
+    values_fill = list(n = 0),
+    names_prefix = "From "
+  ) %>%
+  mutate(source_count = starts_with("From ") %>% across %>% rowSums)
+
+network <- get_string_network(string_ids$stringId)
+
+network_separated <-  network %>%
+  separate(stringId_A,
+           into = c("ncbi_taxon_id", "stringId_A"),
+           sep = "\\.") %>%
+  separate(stringId_B,
+           into = c("ncbi_taxon_id", "stringId_B"),
+           sep = "\\.")
+
+nodelist <-
+  data.frame(node = unique(c(network_separated$stringId_A, network_separated$stringId_B))) %>%
+  left_join(meta_merged, by = c("node" = "stringId")) %>%
+  left_join(encoded_source, by = c("node" = "stringId")) %>%
+  distinct(node, gene_symbol, .keep_all = TRUE)
+
+network_filtered <- network %>%
+  combinescores(.,
+                evidences = c("ascore", "escore", "dscore"),
+                confLevel = 0.5) %>%
+  separate(stringId_A,
+           into = c("ncbi_taxon_id", "stringId_A"),
+           sep = "\\.") %>%
+  separate(stringId_B,
+           into = c("ncbi_taxon_id", "stringId_B"),
+           sep = "\\.") %>%
+  dplyr::select(stringId_A, stringId_B)
+```
+
+### Compute Network Layout
+
+```{r layout, eval=FALSE}
+graph <-
+  graph_from_data_frame(network_filtered, directed = FALSE, vertices = nodelist)
+
+layout <- easylayout::vivagraph(graph)
+layout <- easylayout::vivagraph(graph, layout = layout, pin_nodes = TRUE, lcc_margin_left = 10)
+V(graph)$x <- layout[, 1]
+V(graph)$y <- layout[, 2]
+
+save(graph, nodelist, file = here("results/full_network.rda"))
+```
+
+### Network Plots
+
+#### By significance
+
+```{r}
+load(here("results/full_network.rda"))
+
+graph_colored <- graph %>%
+  as_tbl_graph() %>%
+  activate(nodes) %>%
+  mutate(signif_md = ifelse(pvalor < 0.05, MD, NA))
+
+ggraph(graph_colored,
+       "manual",
+       x = V(graph)$x,
+       y = V(graph)$y) +
+  geom_edge_link0(edge_width = 0.2, color = "#90909020") +
+  geom_node_point(aes(color = signif_md)) +
+  scale_colour_gradientn(colours = RColorBrewer::brewer.pal(9, "OrRd")) +
+  coord_fixed() +
+  theme_void() +
+  theme(
+    legend.key.size = unit(0.5, 'cm'),
+    legend.key.height = unit(0.5, 'cm'),
+    legend.key.width = unit(0.5, 'cm'),
+    legend.title = element_text(size=6),
+    legend.text = element_text(size=6),
+    plot.title = element_text(size = 4, face = "bold")
+  ) +
+  labs(
+    color = "Expression"
+  )
+
+ggsave(here("results/full_network.pdf"), width = 8, height = 8)
+ggsave(here("results/full_network.png"), width = 8, height = 8, bg = "white")
+```
+
+#### By grouped process
+
+```{r}
+ggraph(graph, "manual", x = V(graph)$x, y = V(graph)$y) +
+  geom_edge_link0(color = "#90909020") +
+  scatterpie::geom_scatterpie(
+    cols = colnames(nodelist)[startsWith(colnames(nodelist), "From ")],
+    data = igraph::as_data_frame(graph, "vertices"),
+    colour = NA,
+    pie_scale = 0.2
+  ) +
+  coord_fixed() +
+  theme_void() +
+  theme(
+    legend.key.size = unit(0.5, 'cm'),
+    legend.key.height = unit(0.5, 'cm'),
+    legend.key.width = unit(0.5, 'cm'),
+    legend.title = element_text(size=6),
+    legend.text = element_text(size=6),
+    legend.position = "bottom",
+    plot.title = element_text(size = 4, face = "bold")
+  ) +
+  labs(
+    fill = "Source:"
+  )
+
+ggsave(here("results/network_processes.pdf"), width = 8, height = 8)
+ggsave(here("results/network_processes.png"), width = 8, height = 8, bg = "white")
+```
+
+
+
diff --git a/analysis/make_upset.R b/analysis/make_upset.R
@@ -0,0 +1,73 @@
+library(here)
+library(readr)
+library(dplyr)
+library(tidyr)
+library(UpSetR)
+
+data <- read_csv(here("data/selected_genes_grouped.csv"), skip = 1) %>%
+  janitor::clean_names() %>%
+  dplyr::select(gene_symbol, grouped_process) %>%
+  distinct()
+
+data_encoded_source <- data %>%
+  filter(!is.na(grouped_process)) %>%
+  mutate(n = 1) %>%
+  pivot_wider(
+    id_cols     = gene_symbol,
+    names_from  = grouped_process,
+    values_from = n,
+    values_fn   = list(n = length),
+    values_fill = list(n = 0)
+  ) %>%
+  as.data.frame()
+
+upset_p <- upset(
+  nsets = 4,
+  data_encoded_source,
+  order.by = "freq",
+  empty.intersections = "on"
+)
+upset_p
+
+pdf(file=here("results/upset_plot.pdf"), width = 8, height = 5.5)
+upset_p
+dev.off()
+
+png(file=here("results/upset_plot.png"), width = 8, height = 5.5, units = "in", res=500)
+upset_p
+dev.off()
+
+data <- read_csv(here("data/meta_analysis_genes.csv")) %>%
+  janitor::clean_names() %>%
+  dplyr::select(gene_symbol, grouped_process) %>%
+  separate_rows(grouped_process, sep = "\\|") %>%
+  mutate(grouped_process = stringr::str_trim(grouped_process)) %>%
+  distinct()
+
+data_encoded_source <- data %>%
+  filter(!is.na(grouped_process)) %>%
+  mutate(n = 1) %>%
+  pivot_wider(
+    id_cols     = gene_symbol,
+    names_from  = grouped_process,
+    values_from = n,
+    values_fn   = list(n = length),
+    values_fill = list(n = 0)
+  ) %>%
+  as.data.frame()
+
+upset_p <- upset(
+  nsets = 4,
+  data_encoded_source,
+  order.by = "freq",
+  empty.intersections = "on"
+)
+upset_p
+
+pdf(file=here("results/upset_plot_figureb.pdf"), width = 8, height = 5.5)
+upset_p
+dev.off()
+
+png(file=here("results/upset_plot_figureb.png"), width = 8, height = 5.5, units = "in", res=500)
+upset_p
+dev.off()