Post mashR pipeline

Previous pipeline: https://github.com/RajLabMSSM/mashR

GTEx paper: https://www.nature.com/articles/s41588-018-0268-8

mash_rds = readRDS(paste0(fastqtl_to_mash_output, "QTLSumStats.mash.rds"))
# names(mash_rds)

maxb <- mash_rds$strong.b
maxz <- mash_rds$strong.z

mash_posterior = readRDS(paste0(mashr_flashr_workflow_output, "QTLSumStats.mash.EZ.FL_PC3.V_simple.posterior.rds"))
# names(mash_posterior)
# head(mash_posterior$lfsr)
# dim(mash_posterior$lfsr)

pm.mash <- mash_posterior$PosteriorMean
lfsr.all <- mash_posterior$lfsr
standard.error <- maxb/maxz
pm.mash.beta <- pm.mash*standard.error

lfsr table at 10%

lfsr = local false sign rate. Method proposed by Stephens, it is analogous to FDR. Number of significative genes:

pm.mash.beta <- pm.mash.beta[rowSums(lfsr.all<0.10)>0,]
lfsr.mash <- lfsr.all[rowSums(lfsr.all<0.10)>0,] # lfsr.mash have the significant results. 
# Means they have lfsr less than 0.10 in at least one condition.
dim(lfsr.mash)[1]

[1] 5722

lfsr.mash_symbol = as.data.frame(lfsr.mash)
lfsr.mash_symbol$ensembl = gsub("(.*?)_(.*)", "\\1", rownames(lfsr.mash_symbol))

## Get conversion table for Gencode 30
gencode_30 = read.table("~/pd-omics/katia/ens.geneid.gencode.v30", header = T)
gencode_30$ensembl = gsub("(.*?)\\.(.*)", "\\1", gencode_30$gene_id)

lfsr.mash_symbol = merge(lfsr.mash_symbol, gencode_30, by = "ensembl")
rownames(lfsr.mash_symbol) <- rownames(lfsr.mash)
lfsr.mash_symbol$ensembl <- NULL 
colnames(lfsr.mash_symbol) = gsub("(.*?)_(.*)","\\1",colnames(lfsr.mash_symbol))

lfsr.mash_symbol = lfsr.mash_symbol[,c("gene", "GeneSymbol", "MFG", "STG", "THA", "SVZ")]
rownames(lfsr.mash_symbol) = gsub("_s", "_rs", rownames(lfsr.mash_symbol))

write.table(lfsr.all, file = paste0(work_dir, "mash_lfsr_all.txt"), sep = "\t", quote = F)
write.table(lfsr.mash_symbol, file = paste0(work_dir, "mash_lfsr_10per.txt"), sep = "\t", quote = F)

createDT(lfsr.mash_symbol)

Pairwise plot

Pairwise sharing by magnitude of eQtLs among tissues. For each pair of tissues, we considered the top eQTLs that were significant (lfsr < 0.10) in at least one of the two tissues, and plotted the proportion of these that are shared in magnitude—that is, have effect estimates that are the same sign and within a factor of 2 in size of one another.

thresh <- 0.10

shared.fold.size <- matrix(NA,nrow = ncol(lfsr.mash),ncol=ncol(lfsr.mash))
colnames(shared.fold.size) <- rownames(shared.fold.size) <- colnames(maxz)
for (i in 1:ncol(lfsr.mash))
  for (j in 1:ncol(lfsr.mash)) {
    sig.row=which(lfsr.mash[,i]<thresh)
    sig.col=which(lfsr.mash[,j]<thresh)
    a=(union(sig.row,sig.col))
    quotient=(pm.mash.beta[a,i]/pm.mash.beta[a,j])
    shared.fold.size[i,j] = mean(quotient > 0.5 & quotient < 2)
  }

# Plot heatmap of sharing by magnitude
# Generate the heatmap using the “levelplot” function from the lattice package.

clrs <- colorRampPalette(rev(c("#D73027","#FC8D59","#FEE090","#FFFFBF",
                               "#E0F3F8","#91BFDB","#4575B4")))(64)
lat <- shared.fold.size
lat[lower.tri(lat)] <- NA
n <- nrow(lat)

rownames(lat) <- gsub("(.*?)_(.*)", "\\1", rownames(lat))
colnames(lat) <- gsub("(.*?)_(.*)", "\\1", colnames(lat))

print(levelplot(lat[n:1,],col.regions = clrs,xlab = "",ylab = "",colorkey = TRUE))

# I am reading the input again to garantee the values! 

source(paste0(code_folder, "normfuncs.R"))
mash_rds = readRDS(paste0(fastqtl_to_mash_output, "QTLSumStats.mash.rds"))

b.stat <- mash_rds$strong.b
z.stat <- mash_rds$strong.z

mash_posterior = readRDS(paste0(mashr_flashr_workflow_output, "QTLSumStats.mash.EZ.FL_PC3.V_simple.posterior.rds"))
# names(mash_posterior)
# head(mash_posterior$lfsr)
# dim(mash_posterior$lfsr)

posterior.means <- mash_posterior$PosteriorMean
lfsr <- mash_posterior$lfsr

mar.var <- mash_posterior$PosteriorSD

colnames(lfsr)  <-
  colnames(mar.var) <- 
  colnames(posterior.means) <- 
  colnames(z.stat)
standard.error  <- b.stat/z.stat
posterior.betas <- standard.error * posterior.means
pm.beta.norm    <- het.norm(posterior.betas)

Metaplots

Each dot shows original (raw) effect estimate for a single tissue, then after mashR. Grey bars indicating ±2 s.e. In each case, mash combines information across all tissues, using the background information (patterns of sharing) learned from data on all eQTLs to produce more precise estimates.

tissue.colors = pal_futurama()(4)

create.metaplot <- function (j) {
  plot.new()
  pm.beta.norm  <- pm.beta.norm # Shuffle columns.
  z.shuffle     <- z.stat
  b.shuffle     <- b.stat
  post.var      <- mar.var
  post.bshuffle <- posterior.betas
  sem.shuffle   <- standard.error
  lfsr          <- lfsr
  plot.title    <- strsplit(rownames(z.stat)[j], "[.]")[[1]][1]
  
  x <- as.numeric(post.bshuffle[j,])
  
  layout(t(1:2))
  
  metaplot(as.numeric(b.shuffle[j,]),as.numeric(sem.shuffle[j,]),xlab = "",
           ylab = "",colors = meta.colors(box = as.character(tissue.colors)),
           labels = gsub("(.*?)_(.*)","\\1",colnames(z.stat)),
           xlim = c(-1,1))
  title(plot.title)
  
  # Transform to posterior sd of beta.
  sd <- as.numeric(sem.shuffle[j,]) * sqrt(as.numeric(post.var[j,]))
  metaplot(x,sd,xlab = "",ylab = "",
           colors = meta.colors(box = as.character(tissue.colors)),
           labels = gsub("(.*?)_(.*)","\\1",colnames(z.stat)),
           xlim = c(-1,1))
  title(plot.title)
}

Gene: CTSB

Cathepsin B. This gene encodes a member of the C1 family of peptidases. Alternative splicing of this gene results in multiple transcript variants. At least one of these variants encodes a preproprotein that is proteolytically processed to generate multiple protein products. These products include the cathepsin B light and heavy chains, which can dimerize to form the double chain form of the enzyme. This enzyme is a lysosomal cysteine protease with both endopeptidase and exopeptidase activity that may play a role in protein turnover. It is also known as amyloid precursor protein secretase and is involved in the proteolytic processing of amyloid precursor protein (APP). Incomplete proteolytic processing of APP has been suggested to be a causative factor in Alzheimer’s disease, the most common cause of dementia. Overexpression of the encoded protein has been associated with esophageal adenocarcinoma and other tumors. Multiple pseudogenes of this gene have been identified. [provided by RefSeq, Nov 2015]

create.metaplot(grep("ENSG00000164733", rownames(z.stat)))

Gene: MS4A6A

MS4A6A (Membrane Spanning 4-Domains A6A) is a Protein Coding gene. Diseases associated with MS4A6A include Polycystic Lipomembranous Osteodysplasia With Sclerosing Leukoencephalopathy and Alzheimer Disease.

create.metaplot(grep("ENSG00000110077", rownames(z.stat)))

Gene: MS4A14

MS4A14 (Membrane Spanning 4-Domains A14) is a Protein Coding gene. Diseases associated with MS4A14 include Lagophthalmos and Superficial Keratitis.

create.metaplot(grep("ENSG00000166928", rownames(z.stat)))

Gene: ALDOC

Example of a gene with max value of posterior beta for the tissue: MFG.

ALDOC (Aldolase, Fructose-Bisphosphate C) is a Protein Coding gene. Among its related pathways are Innate Immune System and Glucose metabolism. Gene Ontology (GO) annotations related to this gene include cytoskeletal protein binding and fructose-bisphosphate aldolase activity.

#posterior.betas[posterior.betas[,1]==max(posterior.betas[,1]),]
create.metaplot(posterior.betas[,1]==max(posterior.betas[,1]))

Gene: INTS4P1

Example of a gene with max value of posterior beta for the tissue: STG.

INTS4P1 (Integrator Complex Subunit 4 Pseudogene 1) is a Pseudogene.

create.metaplot(posterior.betas[,2]==max(posterior.betas[,2]))
# rownames(z.stat)[posterior.betas[,2]==max(posterior.betas[,2])]

Gene: NUDT9

Example of a gene with max value of posterior beta for the tissue: SVZ.

NUDT9 (Nudix Hydrolase 9) is a Protein Coding gene. Diseases associated with NUDT9 include Type 1 Diabetes Mellitus 10 and Arrhythmogenic Right Ventricular Dysplasia. Among its related pathways are Metabolism and Metabolism of nucleotides. Gene Ontology (GO) annotations related to this gene include hydrolase activity and ADP-sugar diphosphatase activity.

create.metaplot(posterior.betas[,3]==max(posterior.betas[,3]))
# rownames(z.stat)[posterior.betas[,3]==max(posterior.betas[,3])]

Gene: TM7SF2

Example of a gene with max value of posterior beta for the tissue: THA.

TM7SF2 (Transmembrane 7 Superfamily Member 2) is a Protein Coding gene. Among its related pathways are Regulation of cholesterol biosynthesis by SREBP and Metabolism. Gene Ontology (GO) annotations related to this gene include oxidoreductase activity, acting on the CH-CH group of donors, NAD or NADP as acceptor and delta14-sterol reductase activity.

create.metaplot(posterior.betas[,4]==max(posterior.betas[,4]))
# rownames(z.stat)[posterior.betas[,4]==max(posterior.betas[,4])]

Posterior beta

Complete table. Number of gene-SNP pairs:

dim(posterior.betas)[1]

[1] 18799

## Get conversion table for Gencode 30 from previous chunck 
posterior.betas_symbol = as.data.frame(posterior.betas)
colnames(posterior.betas_symbol) = gsub("(.*?)_(.*)","\\1",colnames(posterior.betas_symbol))
posterior.betas_symbol$ensembl = gsub("(.*?)_(.*)", "\\1", rownames(posterior.betas_symbol))
posterior.betas_symbol$snp = gsub("(.*?)_(.*)", "\\2", rownames(posterior.betas_symbol))
posterior.betas_symbol = merge(posterior.betas_symbol, gencode_30, by="ensembl")
posterior.betas_symbol$gene_id = NULL 
posterior.betas_symbol = posterior.betas_symbol[,c("ensembl", "snp", "GeneSymbol", "MFG", "STG", "THA", "SVZ")]
posterior.betas_symbol$snp = gsub("s", "rs", posterior.betas_symbol$snp)
  
write.table(posterior.betas, file = paste0(work_dir, "posterior.beta_all.txt"), sep = "\t", quote = F)
write.table(posterior.betas_symbol, file = paste0(work_dir, "posterior.beta_all_symbol.txt"), sep = "\t", quote = F)

createDT(posterior.betas_symbol)

sessionInfo()

R version 3.6.1 (2019-07-05) Platform: x86_64-apple-darwin15.6.0 (64-bit) Running under: macOS Catalina 10.15.4

Matrix products: default BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages: [1] stats graphics grDevices utils datasets methods base

other attached packages: [1] ggsci_2.9 rmeta_3.0 dplyr_0.8.5 lattice_0.20-38

loaded via a namespace (and not attached): [1] Rcpp_1.0.4 later_1.0.0 pillar_1.4.3 compiler_3.6.1
[5] tools_3.6.1 digest_0.6.25 jsonlite_1.6.1 evaluate_0.14
[9] tibble_2.1.3 lifecycle_0.2.0 gtable_0.3.0 pkgconfig_2.0.3
[13] rlang_0.4.5 shiny_1.4.0 crosstalk_1.0.0 yaml_2.2.0
[17] xfun_0.11 fastmap_1.0.1 stringr_1.4.0 knitr_1.26
[21] htmlwidgets_1.5.1 grid_3.6.1 DT_0.13 tidyselect_0.2.5 [25] glue_1.4.0 R6_2.4.1 rmarkdown_2.0 ggplot2_3.3.0
[29] purrr_0.3.3 magrittr_1.5 promises_1.1.0 scales_1.1.0
[33] htmltools_0.4.0 assertthat_0.2.1 xtable_1.8-4 mime_0.8
[37] colorspace_1.4-1 httpuv_1.5.2 stringi_1.4.6 munsell_0.5.0
[41] crayon_1.3.4

Post mashR pipeline

Brain regions

Katia de Paiva Lopes
Raj Lab
Department of Neuroscience
Icahn School of Medicine at Mount Sinai
NYC, New York

2020-05-04

lfsr table at 10%

Pairwise plot

Metaplots

Gene: CTSB

Gene: MS4A6A

Gene: MS4A14

Gene: ALDOC

Gene: INTS4P1

Gene: NUDT9

Gene: TM7SF2

Posterior beta

Post mashR pipeline

Brain regions

Katia de Paiva Lopes Raj Lab Department of Neuroscience Icahn School of Medicine at Mount Sinai NYC, New York

2020-05-04

lfsr table at 10%

Pairwise plot

Metaplots

Gene: CTSB

Gene: MS4A6A

Gene: MS4A14

Gene: ALDOC

Gene: INTS4P1

Gene: NUDT9

Gene: TM7SF2

Posterior beta

Katia de Paiva Lopes
Raj Lab
Department of Neuroscience
Icahn School of Medicine at Mount Sinai
NYC, New York