::opts_chunk$set(echo = T, root.dir = here::here())
knitr# knitr::opts_knit$set(echo = T, root.dir = here::here())
library(dplyr)
library(ggplot2)
library(googledrive)
set.seed(2019)
# https://github.com/RajLabMSSM/catalogueR
library(catalogueR) # remotes::install_github("RajLabMSSM/catalogueR")
<- data.table::fread(here::here("data/Schilder2020/Schilder2020_topColoc.csv"))
colocs
::separate(colocs, col = "qtl.id",sep = "[.]", into = c("Study","Sample"), remove = F) %>%
tidyr::rename(qtl_id=qtl.id) %>%
dplyr::eQTL_Catalogue.annotate_tissues() %>%
catalogueR::fwrite("data/Schilder2020/Schilder2020_topColoc_annot.csv") data.table
<- list.files(here::here("data/Nalls2019/celltypes/FUMA_celltype569"),"*.gsa.out", full.names = T)
gsa.out_files
<- lapply(gsa.out_files, function(x){
DAT <-gsub("magma_celltype_|.gsa.out","",basename(x))
name print(x)
<- read.table(x, comment.char = "#", header = T) %>%
dat ::rename(Cell_Type=VARIABLE)
dplyr$Dataset <- name
dat$Source <- basename(x)
datreturn(dat)
%>% data.table::rbindlist(fill = T)
})
<- data.table::fread(here::here("data/Nalls2019/celltypes/step1_2_summary.txt")) %>%
sig_dat ::rename(Cell_Type=Cell_type) %>%
dplyr::mutate(Source="step1_2_summary.txt")
dplyr
<- rbind(sig_dat, DAT, fill=T) %>% dplyr::relocate(Dataset, Source, .after = "FULL_NAME")
merged_DAT ::fwrite(merged_DAT, here::here("data/Nalls2019/celltypes/Nalls2019_merged_celltype_results.csv")) data.table
<- lapply(list.files("data/Nalls2019/tissues", ".gcov.out", full.names = T), function(x){
tissues <- read.table(x, comment.char = "#", header = T)
dat $Source <- basename(x)
dat$Dataset <- "GTEx V7"
datreturn(dat)
%>% data.table::rbindlist(fill = T)
})
::fwrite(tissues, "data/Nalls2019/tissues/Nalls2019_merged_tissues_results.csv") data.table
Infer p-values from FD.
<- readxl::read_excel("data/metaanalysis/TableS1.xlsx", sheet = "Nott2019_celltypes")
celltypes
### Use the internal code of p.adjust to reconstruct p-values from FDR
<- 9*16
n <- celltypes$q
p <- names(p)
nm <- as.numeric(p)
p <- setNames(p, nm)
p0 if (all(nna <- !is.na(p))){
<- TRUE
nna else p <- p[nna]
} <- length(p)
lp <- lp:1L
i <- order(p, decreasing = TRUE)
o <- order(o)
ro <- pmin(1, cummin(n/i * p[o]))[ro]
q
# p_recon <- cummin(q[o] / (n/i))
<- celltypes$q
q $P_reconstructed <- pmin(1, cummin(q[o] / (n/i)))[ro]
celltypes
::mutate(celltypes, FDR_reconstructed=p.adjust(P_reconstructed, method = "fdr", n = 144)) %>% data.table::fwrite("data/Nott2019/Nott2019_celltypes.csv") dplyr
<- lapply(c("Table S5","Table S7"), function(sheet){
celltypes <- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = sheet, skip = 1) %>%
dat ::rename(Cell_Type=Cell)
dplyr<- grep("LDSC|MAGMA",colnames(dat), value = T, invert = T)
id_vars rbind(dat %>%
::select(all_of(id_vars), P=Pvalue_LDSC, Q=Qvalue_LDSC) %>%
dplyr::mutate(Method="LDSC"),
dplyr%>%
dat ::select(all_of(id_vars), P=Pvalue_MAGMA, Q=Qvalue_MAGMA) %>%
dplyr::mutate(Method="MAGMA")
dplyr%>%
) ::mutate(Sig=P, Source=paste0("Supplementary Materials (",sheet,")"))
dplyr%>% data.table::rbindlist() %>%
}) subset(Trait=="Parkinson disease")
<- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Table S6", skip = 1) %>%
cond_sn subset(!is.na(Trait)) %>% subset(Trait=="Parkinson's disease") %>%
::rename(Cell_Type=Cell1, Conditioned_Cell_Type=Cell2, P=Coefficient_P_value) %>%
dplyr::mutate(Region="SN", Method="conditional LDSC", Sig=P, Source=paste0("Supplementary Materials (","Table S6",")"))
dplyr
<- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Table S8", skip = 1) %>%
tmp ::rename(Trait=GWA) %>%
dplyrsubset(!is.na(Trait)) %>% subset(Trait=="Parkinson's disease") %>%
::rename(Region=Atlas1, Conditioned_Region=Atlas2, Cell_Type=Cell1, Conditioned_Cell_Type=Cell2)
dplyr<- grep("LDSC|MAGMA",colnames(tmp), value = T, invert = T)
id_vars <- rbind(tmp %>% dplyr::rename() %>%
cond_ctx ::select(all_of(id_vars), P="P-value (LDSC) (Original Cellular Atlas))") %>%
dplyr::mutate(Method="conditional LDSC (Original Cellular Atlas)"),
dplyr%>% dplyr::rename() %>%
tmp ::select(all_of(id_vars), P="P-value (LDSC) (Matched Cellular Atlas)") %>%
dplyr::mutate(Method="conditional LDSC (Matched Cellular Atlas)")
dplyr%>%
) ::mutate(Sig=P, Source=paste0("Supplementary Materials (","Table S8",")"))
dplyr
## Merge
<- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Abbreviations")
keys <- setNames(keys$`Full Name`, keys$Abbreviation)
keys_dict "Substantia nigra"] = "Substantia nigra"
keys_dict["Cortex"] = "Cortex"
keys_dict[
<- data.table::rbindlist(list(celltypes, cond_sn, cond_ctx), fill = T) %>%
celltypes_all relocate(Sig, Source, .after = Conditioned_Region)
## Not all celltypes included in Abbreviations...
# celltypes_all$Cell_Type <- keys_dict[celltypes_all$Cell_Type]
$Region <- keys_dict[celltypes_all$Region]
celltypes_all$Conditioned_Region <- keys_dict[celltypes_all$Conditioned_Region]
celltypes_all
::fwrite(celltypes_all, "data/Agarwal2020/Agarwal2020_celltypes.csv")
data.table celltypes_all
<- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Table S9", skip = 2)
dat <- grep("LDSC|MAGMA",colnames(dat), value = T, invert = T)
id_vars <- rbind(dat %>%
tissues ::select(all_of(id_vars), P=Pvalue_LDSC, Q=Qvalue_LDSC) %>%
dplyr::mutate(Method="LDSC"),
dplyr%>%
dat ::select(all_of(id_vars), P=Pvalue_MAGMA, Q=Qvalue_MAGMA) %>%
dplyr::mutate(Method="MAGMA")
dplyr%>%
) ::mutate(Sig=P, Source=paste0("Supplementary Materials (","Table S9",")")) %>%
dplyrsubset(Trait=="Parkinson disease")
::fwrite(tissues, "data/Agarwal2020/Agarwal2020_tissues.csv") data.table
Fig. 3: Evaluation of the shared cell-type associations between pairs of neuropsychiatric disorders. Evaluation of alike cell-type associations between any two neuropsychiatric disorders to identify any shared cell-type-specific component of risk, for the SN cell types (top) and the cortex (bottom). Each heatmap represents the results from LDSC of the associations (p value associated with an LDSC coefficient) of a specific cell-type expression profile with the genetic risk of a given neuropsychiatric disorder (disease1—X-axis) after conditioning on the genetic risk of another neuropsychiatric disorder (disease2—Y-axis). This analysis was only performed where two neuropsychiatric disorders showed a significant (or suggestive) association with the same cell type (Fig. 2). The blue heatmap colours are proportional to −log10 q value (FDR-adjusted p value) of the enrichment of genetic variants associated with a disorder adjusted for another disorder. The cell associations that were not evaluated (no overlap in Fig. 2) are coloured in dark grey.
<- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = "Table S6", skip = 1)%>%
cond subset(!is.na(Trait)) %>%
::mutate(Q = p.adjust(Coefficient_P_value, method = "fdr")) %>%
dplyr::mutate(neg_logQ = -log10(Q)) %>%
dplyr::data.table() #%>%
data.table# data.table::dcast(formula = Trait ~ + Cell1 + Cell2, fun.aggregate = mean, value.var = "Q")
ggplot(cond, aes(y=Trait, x=Trait, fill=neg_logQ)) +
geom_tile() +
facet_grid(facets = .~Cell1, switch = "x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))
Try running my own variation of phenotype correlations.
<- lapply(c("Table S5","Table S7"), function(sheet){
celltypes <- readxl::read_excel("data/Agarwal2020/SN_Paper_STables.xlsx", sheet = sheet, skip = 1) %>%
dat ::rename(Cell_Type=Cell)
dplyr<- grep("LDSC|MAGMA",colnames(dat), value = T, invert = T)
id_vars rbind(dat %>%
::select(all_of(id_vars), P=Pvalue_LDSC, Q=Qvalue_LDSC) %>%
dplyr::mutate(Method="LDSC"),
dplyr%>%
dat ::select(all_of(id_vars), P=Pvalue_MAGMA, Q=Qvalue_MAGMA) %>%
dplyr::mutate(Method="MAGMA")
dplyr%>%
) ::mutate(Sig=P, Source=paste0("Supplementary Materials (",sheet,")"))
dplyr%>% data.table::rbindlist() %>%
}) subset(Method=="LDSC")
<- function(celltypes, type=NULL){
correlate_phenos if(!is.null(type)) celltypes <- subset(celltypes, Cell_Type==type)
<- data.table::dcast.data.table(celltypes,
p_mat formula = Trait ~ Cell_Type + Method,
value.var = "P",
fun.aggregate = mean, na.rm=T) %>%
::column_to_rownames("Trait") %>% as.matrix()
tibble<- cor(t(p_mat[complete.cases(p_mat),]))
cor_pheno return(cor_pheno)
}
## Correlate across all celltypes
<- correlate_phenos(celltypes)
cor_all heatmap(cor_all)
::corrplot(cor_pheno, tl.col = "black", tl.cex = .8)
corrplot
::melt(cor_all) %>%
reshape2`colnames<-`(c("Trait1", "Trait2","corr")) %>%
::arrange(Trait1, desc(corr)) %>%
dplyr::fwrite("data/Agarwal2020/Agarwal2020_correlations.csv") data.table
<- readxl::read_excel("data/Coetzee2016/41598_2016_BFsrep30509_MOESM3_ESM.xls",
key col_names = c("id","group","abbrev","name"))
<- readxl::read_excel("data/Coetzee2016/41598_2016_BFsrep30509_MOESM2_ESM.xls")
snps colnames(snps)[1] <- "abbrev"
<- merge(key, snps, by="abbrev") %>% data.table::data.table() %>%
snps_annot ::melt.data.table(id.vars = c("name","id","group","abbrev"),
data.tablevariable.name = "snp", value.name = "neg_nat_log") %>%
::mutate(p=exp(-neg_nat_log)) %>%
dplyr::arrange(p) dplyr
<- subset(snps_annot, grepl("cell|cyte|nuclei",name, ignore.case = T))
celltypes unique(celltypes$name)
::fwrite(celltypes, "data/Coetzee2016/Coetze2016_celltypes.csv")
data.table
<-celltypes %>%
celltypes_agg ::group_by(group, name) %>%
dplyr::summarise_at(.vars = c("p"),
dplyr.funs = list(mean_p=mean, max_p=max, min_p=min)) %>%
::arrange(min_p)
dplyr celltypes_agg
<- subset(snps_annot, !name %in% c(celltypes$name))
tissues unique(tissues$name)
::fwrite(tissues, "data/Coetzee2016/Coetze2016_tissues.csv")
data.table
<- tissues %>%
tissues_agg ::group_by(group, name) %>%
dplyr::summarise_at(.vars = c("p"),
dplyr.funs = list(mean_p=mean, max_p=max, min_p=min)) %>%
::arrange(min_p)
dplyr tissues_agg
From Garfield documentation:
out.file.perm: contains enrichment analysis results for each annotation, where PThresh is the GWAS p-value threshold used for analysis, FE denotes the fold enrichment statistic (equals -1 if no sufficient data was available for the FE calculation), EmpPval shows the empirical p-value of enrichment (equals -1 if FE is calculated but significance of enrichment analysis is not run at that threshold), NAnnotThresh - the number of variants at the threshold which are annotated with the given feature, NAnnot - the total number of annotated variants, NThresh - the total number of variants at that threshold and N - the total number of pruned variants. The remaining columns show additional information on the annotations used for analysis.
<- lapply(list.files("data/Li2019/", "*garfield.perm.*",recursive = T, full.names = T), function(x){
dat <- data.table::fread(x)
d $Trait <- gsub("garfield.perm.|.out","",basename(x))
dreturn(d)
%>% data.table::rbindlist() %>%
}) ::rename(Datatype=Tissue, Tissue_Celltype=Celltype) %>%
dplyr::mutate(Tissue_Celltype=gsub("*_.*","", Tissue_Celltype)) %>%
dplyr::mutate(Tissue_Celltype=gsub("mono","monocytes",Tissue_Celltype)) %>%
dplyr::mutate(Tissue_Celltype=gsub("tcel","T-cells",Tissue_Celltype)) %>%
dplyr::mutate(Tissue_Celltype=gsub("neut","neutrocytes",Tissue_Celltype)) %>%
dplyr::mutate(Tissue_Celltype=gsub("dlpfc","DLPFC",Tissue_Celltype)) %>%
dplyr# Mod
::mutate(Category=gsub("blueprint","BLUEPRINT",Category)) %>%
dplyr::mutate(Category=gsub("CMC","CommonMindConsortium",Category)) %>%
dplyr::mutate(Category=gsub("fairfax","Fairfax2014",Category)) %>%
dplyr::mutate(Category=gsub("immvar","ImmVar",Category)) %>%
dplyr::mutate(Category=gsub("rosmap","ROSMAP",Category)) %>%
dplyr# Mod
::mutate(Datatype=gsub("blueprint","BLUEPRINT",Datatype)) %>%
dplyr::mutate(Datatype=gsub("CMC","CommonMindConsortium",Datatype)) %>%
dplyr::mutate(Datatype=gsub("fairfax","Fairfax2014",Datatype)) %>%
dplyr::mutate(Datatype=gsub("immvar","ImmVar",Datatype)) %>%
dplyr::mutate(Datatype=gsub("rosmap","ROSMAP",Datatype)) %>%
dplyr::mutate(Dataset=paste(Category,Datatype,sep="_"),
dplyrSig = ifelse(EmpPval==-1, NA, EmpPval))
unique(dat$Tissue_Celltype)
unique(dat$Dataset)
unique(dat$Datatype)
summary(dat$Sig)
hist(dat$Sig, 50)
<- subset(dat, grepl("monocytes|neutrocytes|T-cells",Tissue_Celltype)) %>%
celltypes ::relocate(Tissue_Celltype, 1) %>%
dplyr::rename(Cell_Type=Tissue_Celltype)
dplyr
celltypes::fwrite(celltypes,"data/Li2019/Li2019_celltypes.csv") data.table
<- subset(dat, !grepl("monocytes|neutrocytes|T-cells",Tissue_Celltype)) %>%
tissues ::relocate(Tissue_Celltype, 1) %>%
dplyr::rename(Tissue=Tissue_Celltype)
dplyr
tissues::fwrite(tissues,"data/Li2019/Li2019_tissues.csv") data.table
Data shared from Corces2020 via Supplementary Data 7 of the PD omics review was simply copy and pasted into Google Sheets with minor editing (thanks to their excellent data organization!).
Still, I’d like to provide a compressed version of Corces2020’s Supplementary Data 7 since it’s 100Mb as an excel sheet, which is above GitHub’s file size limit. Reprocessing here to allow this.
We also used data from Supplementary Data 8, which is small enough that we can just provide it in its original excel format.
Note: There’s a typo in the header of Supplementary Data 7 excel sheet which mislabels it as Supplementary Table 8.
Code used to produce these datasets in the original publication can be found here.
<- here::here("data/Corces2020/41588_2020_721_MOESM10_ESM.xlsx")
data7_path <- here::here("data/Corces2020/41588_2020_721_MOESM11_ESM.xlsx")
data8_path
if(!file.exists(here::here(data7_path))){
download.file("https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-020-00721-x/MediaObjects/41588_2020_721_MOESM10_ESM.xlsx",data7_path)
}if(!file.exists(here::here(data8_path))){
download.file("https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-020-00721-x/MediaObjects/41588_2020_721_MOESM11_ESM.xlsx",data8_path)
}
<- readxl::excel_sheets(data7_path)
sheets <- c("NeuronClusterDefinitions","markerGenes_log2(FoldChange)","markerGenes_FDR")
skip14
<- lapply(sheets, function(x){
split_files print(x)
<- if(x %in% skip14) 14 else 12
skip <- readxl::read_excel(data7_path, sheet = x, skip = skip)
dat colnames(dat) <- gsub(" |[.]","_",colnames(dat))
colnames(dat) <- gsub("[(]|[)]","",colnames(dat))
$Sheet <- x
dat<- gsub(" |-|[.]","_",gsub("[(]|[)]","",x))
sheet_name <- file.path(dirname(data7_path),paste0("Corces2020.SuppData7.",sheet_name,".csv.gz"))
new_file ::fwrite(dat,new_file)
data.tablereturn(new_file)
%>% `names<-`(sheets) })
::sessionInfo() utils
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] googledrive_2.0.0 ggplot2_3.3.5 dplyr_1.0.7
##
## loaded via a namespace (and not attached):
## [1] pillar_1.6.2 bslib_0.2.5.1 compiler_4.1.0 jquerylib_0.1.4
## [5] tools_4.1.0 digest_0.6.27 gargle_1.2.0 jsonlite_1.7.2
## [9] evaluate_0.14 lifecycle_1.0.0 tibble_3.1.3 gtable_0.3.0
## [13] pkgconfig_2.0.3 rlang_0.4.11 DBI_1.1.1 yaml_2.2.1
## [17] xfun_0.25 withr_2.4.2 stringr_1.4.0 knitr_1.33
## [21] fs_1.5.0 generics_0.1.0 vctrs_0.3.8 sass_0.4.0
## [25] rprojroot_2.0.2 grid_4.1.0 tidyselect_1.1.1 glue_1.4.2
## [29] here_1.0.1 R6_2.5.0 fansi_0.5.0 rmarkdown_2.10
## [33] purrr_0.3.4 magrittr_2.0.1 scales_1.1.1 ellipsis_0.3.2
## [37] htmltools_0.5.1.1 assertthat_0.2.1 colorspace_2.0-2 utf8_1.2.2
## [41] stringi_1.7.3 munsell_0.5.0 crayon_1.4.1