analysis/GSE107385.Rmd

---
title: "GSE107385 Differential Expression"
author: "João Vitor F. Cavalcante"
date: "`r Sys.setlocale('LC_TIME', 'C'); format(Sys.time(), '%d %B, %Y')`"
knit: (function(inputFile, encoding) {
  rmarkdown::render(inputFile, encoding = encoding, output_dir = "../reports/") })
output:
  html_document:
    toc: true
    toc_float: true
    toc_collapsed: false
    theme:
      bslib: true
      bootswatch: minty
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE,
                      message = FALSE,
                      warning = FALSE,
                      eval = FALSE)
```


# Methodology

* Microarray Differential expression
 - To obtain the normalized count matrices for this study, we used the **oligo**
 R package.
 - To perform the differential expression analysis itself, we used the **limma** R package.
  - Differentially expressed genes determined for p-value < 0.01 and BH method for p-value adjustment.

# Differential Expression

## Loading libraries and functions

```{r}
library(oligo)
library(limma)
library(readr)
library(dplyr)
library(affy)
```

```{r}
# Acquire metadata for the study
get_phenodata <- function(gse_id) {
  read_csv("data/Plataformas.csv") %>%
    janitor::clean_names() %>%
    filter(gse == gse_id) %>%
    dplyr::select(gsm, grupo, grupo_no_trabalho_original)
}

# Run differential expression with limma
get_limma_dexp <-
  function(eset,
           phenodata,
           contraste,
           symbol = NULL,
           add_id = NULL,
           platform_df = NULL) {
    eset_df <- exprs(eset)

    design <- model.matrix(~ 0 + factor(phenodata$grupo))
    colnames(design) <- unique(phenodata$grupo)

    fit <- limma::lmFit(eset, design)

    if (is.null(platform_df)) {
      db_symbol <- as.data.frame(symbol[rownames(eset_df)])

      db_add_id <- as.data.frame(add_id[rownames(eset_df)])

      platform_df <-
        db_add_id %>% left_join(db_symbol, by = "probe_id")
    }

    contrasts <-
      limma::makeContrasts(contrasts = contraste, levels = design)

    fit2 <- contrasts.fit(fit, contrasts)

    ct.fit <- limma::eBayes(fit2)

    res.fit <-
      limma::decideTests(
        ct.fit,
        method = "global",
        adjust.method = "BH",
        p.value = 0.01,
      )

    is_de = as.data.frame(res.fit@.Data) %>%
      tibble::rownames_to_column("probe") %>%
      setNames(., c("probe", "is_de"))
    
    topTable(
      ct.fit,
      coef = 1,
      adjust.method = "BH",
      number = "inf",
      confint = TRUE
    ) %>%
      tibble::rownames_to_column("probe") %>%
      left_join(is_de, by = "probe") %>%
      left_join(platform_df, by = c("probe" = "probe_id"))
    
    
  }

clean_toptable <- function(toptable, unique_col) {
  ref_col = dplyr::sym(unique_col)
  toptable %>%
    group_by(!!ref_col) %>%
    slice(which.max(abs(logFC))) %>%
    ungroup()
}

# Main function
# Loads dataset with appropriate platform technology (affymetrix, affy-oligo or agilent)
# Run differential expression with limma and return output table
get_dge_table <-
  function(gse_id,
           celfile_path,
           contraste,
           refcol,
           symbol = NULL,
           add_id = NULL,
           platform_df = NULL,
           oligo_exp = FALSE,
           agilent = FALSE) {
    phenodata_file <- get_phenodata(gse_id)

    if (oligo_exp == TRUE) {
      celfiles <-
        list.files(celfile_path, pattern = "CEL.gz", full.names = TRUE)
      data <-
        oligo::read.celfiles(celfile.path = celfiles)
      eset <- oligo::rma(data)
    } else if (agilent == TRUE) {
      celfiles <- list.files(celfile_path, pattern = "txt.gz")
      raw <-
        read.maimages(celfiles,
                      path = celfile_path,
                      source = "agilent",
                      green.only = TRUE)
      norm <- backgroundCorrect(raw, method = "normexp")
      norm <- normalizeBetweenArrays(norm, method = "quantile")
      norm.ave <- avereps(norm, ID = norm$genes$ProbeName)

      eset <- Biobase::ExpressionSet(assayData = norm.ave$E)
    } else {
      data <-
        affy::ReadAffy(celfile.path = celfile_path)

      eset <- affy::rma(data)
    }


    toptable <- get_limma_dexp(
      eset,
      phenodata_file,
      contraste,
      symbol = symbol,
      add_id = add_id,
      platform_df = platform_df
    )
    
    clean_toptable(toptable, refcol)

  }
```

## Analysis

Now, we perform the differential expression analysis itself.

```{r run-analysis}

GPL16686 <- read_csv("data/GPL16686.csv", col_types = c(rep("c", 3))) %>%
  janitor::clean_names()

GSE107385 <-
  get_dge_table(
    "GSE107385",
    "data/GSE107385",
    "Tratado-Controle",
    refcol = "hgnc_symbol",
    platform_df = GPL16686,
    oligo_exp = TRUE,
  )

GSE107385 %>% 
  write_csv("results/GSE107385.csv")

```