knitr::opts_chunk$set(echo = T, root.dir = here::here())
# knitr::opts_knit$set(echo = T, root.dir = here::here())
library(dplyr)
library(ggplot2)
library(googledrive)
set.seed(2019)# https://github.com/RajLabMSSM/catalogueR
library(catalogueR) # remotes::install_github("RajLabMSSM/catalogueR")
colocs <- data.table::fread(here::here("data/Schilder2020/Schilder2020_topColoc.csv"))
tidyr::separate(colocs, col = "qtl.id",sep = "[.]", into = c("Study","Sample"), remove = F) %>%
dplyr::rename(qtl_id=qtl.id) %>%
catalogueR::eQTL_Catalogue.annotate_tissues() %>%
data.table::fwrite("data/Schilder2020/Schilder2020_topColoc_annot.csv")gsa.out_files <- list.files(here::here("data/Nalls2019/celltypes/FUMA_celltype569"),"*.gsa.out", full.names = T)
DAT <- lapply(gsa.out_files, function(x){
name <-gsub("magma_celltype_|.gsa.out","",basename(x))
print(x)
dat <- read.table(x, comment.char = "#", header = T) %>%
dplyr::rename(Cell_Type=VARIABLE)
dat$Dataset <- name
dat$Source <- basename(x)
return(dat)
}) %>% data.table::rbindlist(fill = T)
sig_dat <- data.table::fread(here::here("data/Nalls2019/celltypes/step1_2_summary.txt")) %>%
dplyr::rename(Cell_Type=Cell_type) %>%
dplyr::mutate(Source="step1_2_summary.txt")
merged_DAT <- rbind(sig_dat, DAT, fill=T) %>% dplyr::relocate(Dataset, Source, .after = "FULL_NAME")
data.table::fwrite(merged_DAT, here::here("data/Nalls2019/celltypes/Nalls2019_merged_celltype_results.csv"))tissues <- lapply(list.files("data/Nalls2019/tissues", ".gcov.out", full.names = T), function(x){
dat <- read.table(x, comment.char = "#", header = T)
dat$Source <- basename(x)
dat$Dataset <- "GTEx V7"
return(dat)
}) %>% data.table::rbindlist(fill = T)
data.table::fwrite(tissues, "data/Nalls2019/tissues/Nalls2019_merged_tissues_results.csv")Infer p-values from FD.
celltypes <- readxl::read_excel("data/metaanalysis/TableS1.xlsx", sheet = "Nott2019_celltypes")
### Use the internal code of p.adjust to reconstruct p-values from FDR
n <- 9*16
p <- celltypes$q
nm <- names(p)
p <- as.numeric(p)
p0 <- setNames(p, nm)
if (all(nna <- !is.na(p))){
nna <- TRUE
} else p <- p[nna]
lp <- length(p)
i <- lp:1L
o <- order(p, decreasing = TRUE)
ro <- order(o)
q <- pmin(1, cummin(n/i * p[o]))[ro]
# p_recon <- cummin(q[o] / (n/i))
q <- celltypes$q
celltypes$P_reconstructed <- pmin(1, cummin(q[o] / (n/i)))[ro]
dplyr::mutate(celltypes, FDR_reconstructed=p.adjust(P_reconstructed, method = "fdr", n = 144)) %>% data.table::fwrite("data/Nott2019/Nott2019_celltypes.csv")celltypes <- lapply(c("Table S5","Table S7"), function(sheet){
dat <- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = sheet, skip = 1) %>%
dplyr::rename(Cell_Type=Cell)
id_vars <- grep("LDSC|MAGMA",colnames(dat), value = T, invert = T)
rbind(dat %>%
dplyr::select(all_of(id_vars), P=Pvalue_LDSC, Q=Qvalue_LDSC) %>%
dplyr::mutate(Method="LDSC"),
dat %>%
dplyr::select(all_of(id_vars), P=Pvalue_MAGMA, Q=Qvalue_MAGMA) %>%
dplyr::mutate(Method="MAGMA")
) %>%
dplyr::mutate(Sig=P, Source=paste0("Supplementary Materials (",sheet,")"))
}) %>% data.table::rbindlist() %>%
subset(Trait=="Parkinson disease")
cond_sn <- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Table S6", skip = 1) %>%
subset(!is.na(Trait)) %>% subset(Trait=="Parkinson's disease") %>%
dplyr::rename(Cell_Type=Cell1, Conditioned_Cell_Type=Cell2, P=Coefficient_P_value) %>%
dplyr::mutate(Region="SN", Method="conditional LDSC", Sig=P, Source=paste0("Supplementary Materials (","Table S6",")"))
tmp <- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Table S8", skip = 1) %>%
dplyr::rename(Trait=GWA) %>%
subset(!is.na(Trait)) %>% subset(Trait=="Parkinson's disease") %>%
dplyr::rename(Region=Atlas1, Conditioned_Region=Atlas2, Cell_Type=Cell1, Conditioned_Cell_Type=Cell2)
id_vars <- grep("LDSC|MAGMA",colnames(tmp), value = T, invert = T)
cond_ctx <- rbind(tmp %>% dplyr::rename() %>%
dplyr::select(all_of(id_vars), P="P-value (LDSC) (Original Cellular Atlas))") %>%
dplyr::mutate(Method="conditional LDSC (Original Cellular Atlas)"),
tmp %>% dplyr::rename() %>%
dplyr::select(all_of(id_vars), P="P-value (LDSC) (Matched Cellular Atlas)") %>%
dplyr::mutate(Method="conditional LDSC (Matched Cellular Atlas)")
) %>%
dplyr::mutate(Sig=P, Source=paste0("Supplementary Materials (","Table S8",")"))
## Merge
keys <- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Abbreviations")
keys_dict <- setNames(keys$`Full Name`, keys$Abbreviation)
keys_dict["Substantia nigra"] = "Substantia nigra"
keys_dict["Cortex"] = "Cortex"
celltypes_all <- data.table::rbindlist(list(celltypes, cond_sn, cond_ctx), fill = T) %>%
relocate(Sig, Source, .after = Conditioned_Region)
## Not all celltypes included in Abbreviations...
# celltypes_all$Cell_Type <- keys_dict[celltypes_all$Cell_Type]
celltypes_all$Region <- keys_dict[celltypes_all$Region]
celltypes_all$Conditioned_Region <- keys_dict[celltypes_all$Conditioned_Region]
data.table::fwrite(celltypes_all, "data/Agarwal2020/Agarwal2020_celltypes.csv")
celltypes_alldat <- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Table S9", skip = 2)
id_vars <- grep("LDSC|MAGMA",colnames(dat), value = T, invert = T)
tissues <- rbind(dat %>%
dplyr::select(all_of(id_vars), P=Pvalue_LDSC, Q=Qvalue_LDSC) %>%
dplyr::mutate(Method="LDSC"),
dat %>%
dplyr::select(all_of(id_vars), P=Pvalue_MAGMA, Q=Qvalue_MAGMA) %>%
dplyr::mutate(Method="MAGMA")
) %>%
dplyr::mutate(Sig=P, Source=paste0("Supplementary Materials (","Table S9",")")) %>%
subset(Trait=="Parkinson disease")
data.table::fwrite(tissues, "data/Agarwal2020/Agarwal2020_tissues.csv")Fig. 3: Evaluation of the shared cell-type associations between pairs of neuropsychiatric disorders. Evaluation of alike cell-type associations between any two neuropsychiatric disorders to identify any shared cell-type-specific component of risk, for the SN cell types (top) and the cortex (bottom). Each heatmap represents the results from LDSC of the associations (p value associated with an LDSC coefficient) of a specific cell-type expression profile with the genetic risk of a given neuropsychiatric disorder (disease1—X-axis) after conditioning on the genetic risk of another neuropsychiatric disorder (disease2—Y-axis). This analysis was only performed where two neuropsychiatric disorders showed a significant (or suggestive) association with the same cell type (Fig. 2). The blue heatmap colours are proportional to −log10 q value (FDR-adjusted p value) of the enrichment of genetic variants associated with a disorder adjusted for another disorder. The cell associations that were not evaluated (no overlap in Fig. 2) are coloured in dark grey.
cond <- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Table S6", skip = 1)%>%
subset(!is.na(Trait)) %>%
dplyr::mutate(Q = p.adjust(Coefficient_P_value, method = "fdr")) %>%
dplyr::mutate(neg_logQ = -log10(Q)) %>%
data.table::data.table() #%>%
# data.table::dcast(formula = Trait ~ + Cell1 + Cell2, fun.aggregate = mean, value.var = "Q")
ggplot(cond, aes(y=Trait, x=Trait, fill=neg_logQ)) +
geom_tile() +
facet_grid(facets = .~Cell1, switch = "x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))Try running my own variation of phenotype correlations.
celltypes <- lapply(c("Table S5","Table S7"), function(sheet){
dat <- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = sheet, skip = 1) %>%
dplyr::rename(Cell_Type=Cell)
id_vars <- grep("LDSC|MAGMA",colnames(dat), value = T, invert = T)
rbind(dat %>%
dplyr::select(all_of(id_vars), P=Pvalue_LDSC, Q=Qvalue_LDSC) %>%
dplyr::mutate(Method="LDSC"),
dat %>%
dplyr::select(all_of(id_vars), P=Pvalue_MAGMA, Q=Qvalue_MAGMA) %>%
dplyr::mutate(Method="MAGMA")
) %>%
dplyr::mutate(Sig=P, Source=paste0("Supplementary Materials (",sheet,")"))
}) %>% data.table::rbindlist() %>%
subset(Method=="LDSC")
correlate_phenos <- function(celltypes, type=NULL){
if(!is.null(type)) celltypes <- subset(celltypes, Cell_Type==type)
p_mat <- data.table::dcast.data.table(celltypes,
formula = Trait ~ Cell_Type + Method,
value.var = "P",
fun.aggregate = mean, na.rm=T) %>%
tibble::column_to_rownames("Trait") %>% as.matrix()
cor_pheno <- cor(t(p_mat[complete.cases(p_mat),]))
return(cor_pheno)
}
## Correlate across all celltypes
cor_all <- correlate_phenos(celltypes)
heatmap(cor_all)
corrplot::corrplot(cor_pheno, tl.col = "black", tl.cex = .8)
reshape2::melt(cor_all) %>%
`colnames<-`(c("Trait1", "Trait2","corr")) %>%
dplyr::arrange(Trait1, desc(corr)) %>%
data.table::fwrite("data/Agarwal2020/Agarwal2020_correlations.csv")key <- readxl::read_excel("data/Coetzee2016/41598_2016_BFsrep30509_MOESM3_ESM.xls",
col_names = c("id","group","abbrev","name"))
snps <- readxl::read_excel("data/Coetzee2016/41598_2016_BFsrep30509_MOESM2_ESM.xls")
colnames(snps)[1] <- "abbrev"
snps_annot <- merge(key, snps, by="abbrev") %>% data.table::data.table() %>%
data.table::melt.data.table(id.vars = c("name","id","group","abbrev"),
variable.name = "snp", value.name = "neg_nat_log") %>%
dplyr::mutate(p=exp(-neg_nat_log)) %>%
dplyr::arrange(p)celltypes <- subset(snps_annot, grepl("cell|cyte|nuclei",name, ignore.case = T))
unique(celltypes$name)
data.table::fwrite(celltypes, "data/Coetzee2016/Coetze2016_celltypes.csv")
celltypes_agg <-celltypes %>%
dplyr::group_by(group, name) %>%
dplyr::summarise_at(.vars = c("p"),
.funs = list(mean_p=mean, max_p=max, min_p=min)) %>%
dplyr::arrange(min_p)
celltypes_agg tissues <- subset(snps_annot, !name %in% c(celltypes$name))
unique(tissues$name)
data.table::fwrite(tissues, "data/Coetzee2016/Coetze2016_tissues.csv")
tissues_agg <- tissues %>%
dplyr::group_by(group, name) %>%
dplyr::summarise_at(.vars = c("p"),
.funs = list(mean_p=mean, max_p=max, min_p=min)) %>%
dplyr::arrange(min_p)
tissues_aggFrom Garfield documentation:
out.file.perm: contains enrichment analysis results for each annotation, where PThresh is the GWAS p-value threshold used for analysis, FE denotes the fold enrichment statistic (equals -1 if no sufficient data was available for the FE calculation), EmpPval shows the empirical p-value of enrichment (equals -1 if FE is calculated but significance of enrichment analysis is not run at that threshold), NAnnotThresh - the number of variants at the threshold which are annotated with the given feature, NAnnot - the total number of annotated variants, NThresh - the total number of variants at that threshold and N - the total number of pruned variants. The remaining columns show additional information on the annotations used for analysis.
dat <- lapply(list.files("data/Li2019/", "*garfield.perm.*",recursive = T, full.names = T), function(x){
d <- data.table::fread(x)
d$Trait <- gsub("garfield.perm.|.out","",basename(x))
return(d)
}) %>% data.table::rbindlist() %>%
dplyr::rename(Datatype=Tissue, Tissue_Celltype=Celltype) %>%
dplyr::mutate(Tissue_Celltype=gsub("*_.*","", Tissue_Celltype)) %>%
dplyr::mutate(Tissue_Celltype=gsub("mono","monocytes",Tissue_Celltype)) %>%
dplyr::mutate(Tissue_Celltype=gsub("tcel","T-cells",Tissue_Celltype)) %>%
dplyr::mutate(Tissue_Celltype=gsub("neut","neutrocytes",Tissue_Celltype)) %>%
dplyr::mutate(Tissue_Celltype=gsub("dlpfc","DLPFC",Tissue_Celltype)) %>%
# Mod
dplyr::mutate(Category=gsub("blueprint","BLUEPRINT",Category)) %>%
dplyr::mutate(Category=gsub("CMC","CommonMindConsortium",Category)) %>%
dplyr::mutate(Category=gsub("fairfax","Fairfax2014",Category)) %>%
dplyr::mutate(Category=gsub("immvar","ImmVar",Category)) %>%
dplyr::mutate(Category=gsub("rosmap","ROSMAP",Category)) %>%
# Mod
dplyr::mutate(Datatype=gsub("blueprint","BLUEPRINT",Datatype)) %>%
dplyr::mutate(Datatype=gsub("CMC","CommonMindConsortium",Datatype)) %>%
dplyr::mutate(Datatype=gsub("fairfax","Fairfax2014",Datatype)) %>%
dplyr::mutate(Datatype=gsub("immvar","ImmVar",Datatype)) %>%
dplyr::mutate(Datatype=gsub("rosmap","ROSMAP",Datatype)) %>%
dplyr::mutate(Dataset=paste(Category,Datatype,sep="_"),
Sig = ifelse(EmpPval==-1, NA, EmpPval))
unique(dat$Tissue_Celltype)
unique(dat$Dataset)
unique(dat$Datatype)
summary(dat$Sig)
hist(dat$Sig, 50)celltypes <- subset(dat, grepl("monocytes|neutrocytes|T-cells",Tissue_Celltype)) %>%
dplyr::relocate(Tissue_Celltype, 1) %>%
dplyr::rename(Cell_Type=Tissue_Celltype)
celltypes
data.table::fwrite(celltypes,"data/Li2019/Li2019_celltypes.csv")tissues <- subset(dat, !grepl("monocytes|neutrocytes|T-cells",Tissue_Celltype)) %>%
dplyr::relocate(Tissue_Celltype, 1) %>%
dplyr::rename(Tissue=Tissue_Celltype)
tissues
data.table::fwrite(tissues,"data/Li2019/Li2019_tissues.csv")Data shared from Corces2020 via Supplementary Data 7 of the PD omics review was simply copy and pasted into Google Sheets with minor editing (thanks to their excellent data organization!).
Still, I’d like to provide a compressed version of Corces2020’s Supplementary Data 7 since it’s 100Mb as an excel sheet, which is above GitHub’s file size limit. Reprocessing here to allow this.
We also used data from Supplementary Data 8, which is small enough that we can just provide it in its original excel format.
Note: There’s a typo in the header of Supplementary Data 7 excel sheet which mislabels it as Supplementary Table 8.
Code used to produce these datasets in the original publication can be found here.
data7_path <- here::here("data/Corces2020/41588_2020_721_MOESM10_ESM.xlsx")
data8_path <- here::here("data/Corces2020/41588_2020_721_MOESM11_ESM.xlsx")
if(!file.exists(here::here(data7_path))){
download.file("https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-020-00721-x/MediaObjects/41588_2020_721_MOESM10_ESM.xlsx",data7_path)
}
if(!file.exists(here::here(data8_path))){
download.file("https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-020-00721-x/MediaObjects/41588_2020_721_MOESM11_ESM.xlsx",data8_path)
}
sheets <- readxl::excel_sheets(data7_path)
skip14 <- c("NeuronClusterDefinitions","markerGenes_log2(FoldChange)","markerGenes_FDR")
split_files <- lapply(sheets, function(x){
print(x)
skip <- if(x %in% skip14) 14 else 12
dat <- readxl::read_excel(data7_path, sheet = x, skip = skip)
colnames(dat) <- gsub(" |[.]","_",colnames(dat))
colnames(dat) <- gsub("[(]|[)]","",colnames(dat))
dat$Sheet <- x
sheet_name <- gsub(" |-|[.]","_",gsub("[(]|[)]","",x))
new_file <- file.path(dirname(data7_path),paste0("Corces2020.SuppData7.",sheet_name,".csv.gz"))
data.table::fwrite(dat,new_file)
return(new_file)
}) %>% `names<-`(sheets)utils::sessionInfo()## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] googledrive_2.0.0 ggplot2_3.3.5 dplyr_1.0.7
##
## loaded via a namespace (and not attached):
## [1] pillar_1.6.2 bslib_0.2.5.1 compiler_4.1.0 jquerylib_0.1.4
## [5] tools_4.1.0 digest_0.6.27 gargle_1.2.0 jsonlite_1.7.2
## [9] evaluate_0.14 lifecycle_1.0.0 tibble_3.1.3 gtable_0.3.0
## [13] pkgconfig_2.0.3 rlang_0.4.11 DBI_1.1.1 yaml_2.2.1
## [17] xfun_0.25 withr_2.4.2 stringr_1.4.0 knitr_1.33
## [21] fs_1.5.0 generics_0.1.0 vctrs_0.3.8 sass_0.4.0
## [25] rprojroot_2.0.2 grid_4.1.0 tidyselect_1.1.1 glue_1.4.2
## [29] here_1.0.1 R6_2.5.0 fansi_0.5.0 rmarkdown_2.10
## [33] purrr_0.3.4 magrittr_2.0.1 scales_1.1.1 ellipsis_0.3.2
## [37] htmltools_0.5.1.1 assertthat_0.2.1 colorspace_2.0-2 utf8_1.2.2
## [41] stringi_1.7.3 munsell_0.5.0 crayon_1.4.1