Differential expression analysis

Using Dream | PD x CT.

Differential expression for repeated measures (dream) uses a linear model model to increase power and decrease false positives for RNA-seq datasets with multiple measurements per individual. The analysis fits seamlessly into the widely used workflow of limma/voom (Law et al. 2014).

expression_dir = "~/ad-omics_hydra/microglia_omics/expression_tables/added_pilot_314s/expr_4brain_regions/"
work_plots = "~/pd-omics/katia/Microglia/MiGA_public_release/DE_diagnosis/"
metadata_path = "~/pd-omics/katia/Microglia/mic_255s/metadata_files/"

load(paste0(expression_dir, "Expression_filt_255s.Rdata")) 
#dim(genes_counts_exp_3rd) # 19376   255

load(paste0(metadata_path, "metadata_255filt_eng_29jun2020.Rdata"))
#str(metadata3rd_pass)

Preparing the data for DEG

metadata <- metadata3rd_pass

samples_case = as.character(metadata$donor_tissue[metadata$main_diagnosis %in% "Parkinson"])
samples_control = as.character(metadata$donor_tissue[metadata$main_diagnosis %in% "Control"])

rownames(metadata) <- metadata$donor_tissue

#To pick up only the samples for DEG analysis
genes_counts4deg = genes_counts_exp_3rd[, colnames(genes_counts_exp_3rd) %in% c(samples_case,samples_control) ]
metadata4deg = metadata[rownames(metadata) %in% c(samples_case,samples_control) ,]

# all(rownames(metadata4deg) == colnames(genes_counts4deg)) # # The rownames of the metadata needs to be in the same order as colnames expression table 
# str(metadata4deg) #shows class for all columns 
# table(metadata4deg$main_diagnosis)

Samples for DEG

Number of case samples

length(samples_case)

[1] 18

Case samples

samples_case

[1] “14-078-GFM” “15-024-GFM” “15-024-THA” “15-041-THA” “15-097-GTS” [6] “15-110-GTS” “16-016-THA” “16-102-GFM” “17-012-GFM” “17-012-SVZ” [11] “17-012-THA” “MG-06-GFM” “MG-06-SVZ” “MG-08-GFM” “MG-08-SVZ” [16] “MG-13-GFM” “MG-13-SVZ” “MG-14-SVZ”

Number of control samples

length(samples_control)

[1] 96

Control samples

samples_control

[1] “14-005-GFM” “14-005-GTS” “14-015-GFM” “14-015-GTS” “14-029-GFM” [6] “14-029-GTS” “14-051-GTS” “14-069-GFM” “14-069-GTS” “14-075-GFM” [11] “14-075-GTS” “15-018-GFM” “15-018-GTS” “15-027-GFM” “15-034-GFM” [16] “15-055-GFM” “15-055-GTS” “15-074-THA” “15-087-GFM” “15-087-GTS” [21] “15-087-THA” “15-089-GFM” “15-089-GTS” “15-089-THA” “16-027-GFM” [26] “16-027-GTS” “16-038-GFM” “16-046-GFM” “16-046-GTS” “16-046-THA” [31] “16-056-GFM” “16-056-THA” “16-067-GFM” “16-067-GTS” “16-067-THA” [36] “16-078-GFM” “16-078-GTS” “16-078-THA” “16-080-GFM” “16-080-GTS” [41] “16-080-SVZ” “16-080-THA” “16-082-SVZ” “16-116-GFM” “16-116-GTS” [46] “16-137-GFM” “16-137-GTS” “16-137-SVZ” “16-137-THA” “17-003-GFM” [51] “17-003-GTS” “17-003-SVZ” “17-003-THA” “17-005-GFM” “17-005-GTS” [56] “17-005-SVZ” “17-005-THA” “17-016-GFM” “17-016-GTS” “17-016-SVZ” [61] “17-016-THA” “17-043-GFM” “17-043-GTS” “17-043-SVZ” “17-078-GFM” [66] “17-078-GTS” “17-078-SVZ” “17-078-THA” “17-097-GFM” “17-097-GTS” [71] “17-097-SVZ” “17-124-GFM” “17-124-GTS” “17-124-SVZ” “17-124-THA” [76] “18-018-GFM” “18-018-SVZ” “18-018-THA” “18-021-GFM” “18-021-GTS” [81] “18-021-SVZ” “18-021-THA” “18-064-GFM” “18-064-GTS” “18-064-SVZ” [86] “18-064-THA” “18-105-GFM” “18-105-GTS” “18-105-THA” “18-112-GFM” [91] “18-112-GTS” “18-112-THA” “MG-03-SVZ” “MG-09-GFM” “MG-11-GFM” [96] “MG-11-SVZ”

Dream analysis

params = BiocParallel::MulticoreParam(workers=4, progressbar=T)
register(params)
registerDoParallel(4)

# To include cause of death and C1-C4
metadata4deg$cause_of_death_categories[metadata4deg$cause_of_death_categories %in% NA] <- "Other"
# table(metadata4deg$cause_of_death_categories)

metadata4deg$C1 = metadata4deg$C1 %>% replace_na(median(metadata4deg$C1, na.rm = T))
metadata4deg$C2 = metadata4deg$C2 %>% replace_na(median(metadata4deg$C2, na.rm = T))
metadata4deg$C3 = metadata4deg$C3 %>% replace_na(median(metadata4deg$C3, na.rm = T))
metadata4deg$C4 = metadata4deg$C4 %>% replace_na(median(metadata4deg$C4, na.rm = T))

# Check variance partition version 
# packageVersion("variancePartition")  # Must be 1.17.7

# The variable to be tested should be a fixed effect
form <- ~ main_diagnosis + sex + (1|donor_id) + age + tissue + (1|cause_of_death_categories) + C1 + C2 + C3 + C4 + picard_pct_mrna_bases + picard_summed_median + picard_pct_ribosomal_bases

# estimate weights using linear mixed model of dream
vobjDream = suppressWarnings( voomWithDreamWeights( genes_counts4deg, form, metadata4deg ) ) # supressing messages because of Biocparallel was generating a lot of messages  
 
# Fit the dream model on each gene
# By default, uses the Satterthwaite approximation for the hypothesis test
fitmm = suppressWarnings (dream( vobjDream, form, metadata4deg )) 

# Examine design matrix
# createDT(fitmm$design, 3)

res_dream <- data.frame(topTable(fitmm, coef='main_diagnosisParkinson', 
                                 number=nrow(genes_counts4deg), sort.by = "p"), check.names = F)

The t-statistics are not directly comparable since they have different degrees of freedom. In order to be able to compare test statistics, we report z.std which is the p-value transformed into a signed z-score. This can be used for downstream analysis.

DE genes 15%

length(which(res_dream$adj.P.Val<0.15))

[1] 0

DE genes 10%

length(which(res_dream$adj.P.Val<0.10))

[1] 0

DE genes 5%

length(which(res_dream$adj.P.Val<0.05))

[1] 0

Plots from Dream

P Value Distribution

res = res_dream
p = ggplot(res, aes(P.Value))
p + geom_density(color="darkblue", fill="lightblue") +
  theme_classic() +
  ggtitle("FDR Distribution")

Fold Change Distribution

p = ggplot(res, aes(logFC))
p + geom_density(color = "darkblue", fill = "lightblue") + 
  theme_classic() +
  ggtitle("Fold Change Distribution")

MA Plot

plot.data = res
plot.data$id = rownames(plot.data)
data = data.frame(plot.data)
data$P.Value = -log10(data$P.Value)
data$fifteen = as.factor(abs(data$adj.P.Val < 0.05))

ma = ggplot(data, aes(AveExpr, logFC, color = fifteen))
ma + geom_point() +
  scale_color_manual(values = c("black", "red"), labels = c ("> 0.05", "< 0.05")) +
  labs(title = "MA plot", color = "labels") +
  theme_classic()
  #theme(plot.title = element_text(hjust = 0.5)) + ylim (-10,10) + xlim(-4,22)

Volcano Plot

vp = ggplot(data, aes(logFC, P.Value, color = fifteen))
vp + geom_point() +
  scale_color_manual(values = c("black", "red"), labels = c("> 0.05", "< 0.05")) + 
  labs(title = "Gene Level Volcano Plot", color = "FDR") +
  #theme(plot.title = element_text(hjust = 0.5)) +
  theme_classic() +
  xlim(-10,10) + ylim(0, 10) + ylab("-log10 pvalue")

DEG table for download

## Get conversion table for Gencode 30
gencode_30 = read.table("~/pd-omics/katia/ens.geneid.gencode.v30")
colnames(gencode_30) = c("ensembl","symbol")

res$ensembl = rownames(res)
res_name = merge(res, gencode_30, by="ensembl")
rownames(res_name) = res_name$ensembl
res_name = res_name[order(res_name$adj.P.Val), ]
res_name = res_name[, c("symbol", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "z.std")]
   
createDT(res_name)

write.table(res_name, file = paste0(work_plots, "PDxCT_dream_3rd.txt"), quote = F, sep = "\t")

Directionality

LogFC > 0 is up in Case from Dream.

top_6 = head(res_name)
top_6$ensembl = rownames(top_6)
rownames(metadata4deg) = metadata4deg$donor_tissue
genes_voom = genes_counts_voom_3rd[, rownames(metadata4deg)] # Voom 1st pass only for the samples used in this comparison 
gene2check = as.data.frame( genes_voom[rownames(top_6) ,])
gene2check$ensembl = rownames(gene2check)
gene2check = merge(gene2check, top_6[, c("symbol", "ensembl")], by = "ensembl")

gene2check_m = melt(gene2check, id.vars = c("ensembl", "symbol"))
gene2check_charac = merge(gene2check_m, metadata4deg, by.x = "variable", by.y = "donor_tissue")

ggplot(gene2check_charac, aes(x = main_diagnosis, y = value, fill = main_diagnosis)) +
  geom_boxplot(notch = F, na.rm = T, outlier.color = NA) +
  geom_jitter() +
  theme_classic() + 
  facet_wrap(~symbol, scales = "free_y") +
  ggpubr::stat_compare_means(label = "p.format", label.x.npc = "centre", method = "wilcox.test")

sessionInfo()

R version 3.6.2 (2019-12-12) Platform: x86_64-pc-linux-gnu (64-bit) Running under: CentOS Linux 7 (Core)

Matrix products: default BLAS/LAPACK: /hpc/packages/minerva-centos7/intel/parallel_studio_xe_2019/compilers_and_libraries_2019.0.117/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so

locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages: [1] parallel stats graphics grDevices utils datasets methods
[8] base

other attached packages: [1] tidyr_1.0.2 dplyr_0.8.5 doParallel_1.0.15
[4] iterators_1.0.12 ggpubr_0.2.5 magrittr_1.5
[7] statmod_1.4.34 BiocParallel_1.20.1 variancePartition_1.17.7 [10] Biobase_2.46.0 BiocGenerics_0.32.0 scales_1.1.0
[13] foreach_1.4.8 reshape2_1.4.3 ggplot2_3.3.0
[16] gplots_3.0.3 RColorBrewer_1.1-2 edgeR_3.28.1
[19] limma_3.42.2 rmarkdown_2.1

loaded via a namespace (and not attached): [1] jsonlite_1.6.1 splines_3.6.2 gtools_3.8.1
[4] shiny_1.4.0 assertthat_0.2.1 BiocManager_1.30.10 [7] yaml_2.2.1 progress_1.2.2 numDeriv_2016.8-1.1 [10] pillar_1.4.3 lattice_0.20-40 glue_1.3.1
[13] digest_0.6.25 promises_1.1.0 ggsignif_0.6.0
[16] minqa_1.2.4 colorspace_1.4-1 httpuv_1.5.2
[19] htmltools_0.4.0 Matrix_1.2-18 plyr_1.8.6
[22] pkgconfig_2.0.3 xtable_1.8-4 purrr_0.3.3
[25] gdata_2.18.0 later_1.0.0 lme4_1.1-21
[28] tibble_2.1.3 farver_2.0.3 ellipsis_0.3.0
[31] DT_0.12 withr_2.1.2 pbkrtest_0.4-8.6
[34] mime_0.9 crayon_1.3.4 evaluate_0.14
[37] nlme_3.1-145 MASS_7.3-51.5 tools_3.6.2
[40] prettyunits_1.1.1 hms_0.5.3 lifecycle_0.2.0
[43] stringr_1.4.0 munsell_0.5.0 locfit_1.5-9.1
[46] colorRamps_2.3 compiler_3.6.2 caTools_1.18.0
[49] rlang_0.4.5 grid_3.6.2 nloptr_1.2.2
[52] htmlwidgets_1.5.1 crosstalk_1.0.0 bitops_1.0-6
[55] labeling_0.3 boot_1.3-24 gtable_0.3.0
[58] codetools_0.2-16 lmerTest_3.1-1 R6_2.4.1
[61] knitr_1.28 fastmap_1.0.1 KernSmooth_2.23-16 [64] stringi_1.4.6 Rcpp_1.0.6 vctrs_0.2.4
[67] tidyselect_1.0.0 xfun_0.12

Differential expression analysis

PD x CT

Katia de Paiva Lopes
Raj Lab
Department of Neuroscience
Icahn School of Medicine at Mount Sinai
NYC, New York

2021-03-03

Preparing the data for DEG

Samples for DEG

Number of case samples

Case samples

Number of control samples

Control samples

Dream analysis

DE genes 15%

DE genes 10%

DE genes 5%

Plots from Dream

P Value Distribution

Fold Change Distribution

MA Plot

Volcano Plot

DEG table for download

Directionality

Differential expression analysis

PD x CT

Katia de Paiva Lopes Raj Lab Department of Neuroscience Icahn School of Medicine at Mount Sinai NYC, New York

2021-03-03

Preparing the data for DEG

Samples for DEG

Number of case samples

Case samples

Number of control samples

Control samples

Dream analysis

DE genes 15%

DE genes 10%

DE genes 5%

Plots from Dream

P Value Distribution

Fold Change Distribution

MA Plot

Volcano Plot

DEG table for download

Directionality

Katia de Paiva Lopes
Raj Lab
Department of Neuroscience
Icahn School of Medicine at Mount Sinai
NYC, New York