## Setup, Load Libraries & Report Versions
knitr::opts_chunk$set(echo = TRUE)
library(pacman)
p_load(tidyverse,DT,ggsci,ggeasy,UpSetR,gridExtra,scales,ggrepel,scales,factoextra)
createDT <- function(DF, caption="", scrollY=500){
data <- DT::datatable(DF, caption=caption,
extensions = 'Buttons',
class = "display",
callback = JS("return table;"),
filter = c("none", "bottom", "top"),
escape = TRUE,
style = "default", width = NULL, height = NULL, elementId = NULL,
fillContainer = getOption("DT.fillContainer", NULL),
autoHideNavigation = getOption("DT.autoHideNavigation", NULL),
selection = c("multiple", "single", "none"),
plugins = NULL, editable = FALSE,
options = list( dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = T,
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)
return(data)
}
work_dir = "~/ad-omics_hydra/microglia_omics/expression_tables/added_pilot_314s/"
genotype_dir = "~/pd-omics_hydra/glia_omics/genotypes/"
metadata_path = "~/pd-omics/katia/Microglia/mic_314s/metadata_files/"
anc_dir = "~/ad-omics/ricardo/gsa/"
write_dir = anc_dir
work_dir = "D:/OneDrive/Área de Trabalho/Work/gsa/"
genotype_dir = paste0(work_dir,"genotypes/")
metadata_path = paste0(work_dir,"metadata_files/")
mbv_dir = paste0(work_dir,"mbv/")
anc_dir = work_dir
write_dir = anc_dir
expression_dir = paste0(work_dir,"expression_tables_added_pilot_314s/")
This file contains all samples processed by Michael Chao, not only the ones from this project.
First columns is the GSA id. Second column is our ID.
QCed column show samples kept in the file gsa123456_geno95_hwe6_mind95_het3sd
### READ METADATA
# All RNA-seq samples (314)
metadata = read.table(paste0(metadata_path,"metadata_314s_28feb2020.txt"), header = T, check.names = F, sep="\t", stringsAsFactors = F)
# Only RNA-seq samples that pass QC (285)
metadata.filt = read.table(paste0(metadata_path,"metadata_285filt_eng_3mar2020.txt"), header = T, check.names = F, sep="\t", stringsAsFactors = F)
# Just checking if donor IDs can be inferred from sample IDs
#length(unique(gsub("(.*)-(.*)","\\1",metadata$Donor_tissue))) # 115 donors from RNAseq (preQC)
#all(gsub("(.*)-(.*)","\\1",metadata$Donor_tissue) == metadata$Donor_id) # All donor IDs can be inferred from Sample IDs
#length(unique(gsub("(.*)-(.*)","\\1",metadata.filt$donor_tissue))) # 111 donor from RNAseq (afterQC)
#all(gsub("(.*)-(.*)","\\1",metadata.filt$donor_tissue) == metadata.filt$donor_id) # Again, all donor IDs can be inferred from Sample IDs
failed_samples_df = data.frame(donor_tissue = metadata$Donor_tissue, status = "Ok", cause = "", stringsAsFactors = F)
register_failed_samples <- function(failed_samples_df, donor_tissue, cause){
failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "status"] <- "Failed"
failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "cause"] <- ifelse(failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "cause"]=="",
cause, paste0(failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "cause"],";",cause))
return(failed_samples_df)
}
failed_samples_df = register_failed_samples(failed_samples_df, metadata[!metadata$Donor_tissue %in% metadata.filt$donor_tissue,"Donor_tissue"],"RNAseq_QC")
### READ GENOTYPE FAM FILES
genotype.fam = read.table(paste0(genotype_dir,"gsa123456.fam"), header = F, stringsAsFactors = F)
genotype.fam$Donor_id = gsub("_","-",gsub("(.*)_GFM","\\1",gsub("(.*)_CER","\\1",genotype.fam$V2))) # Remove chars after underline to match samples names in the RNAseq tables
genotype.fam.QCed = read.table(paste0(genotype_dir,"gsa123456_geno95_hwe6_mind95_het3sd.fam"), header = F, stringsAsFactors = F)
genotype.fam.QCed$Donor_id = gsub("_","-",gsub("(.*)_GFM","\\1",gsub("(.*)_CER","\\1",genotype.fam.QCed$V2))) # Remove chars after underline to match samples names in the RNAseq tables
### READ HET QC
genotype.hetQC = read.table(paste0(work_dir,"fail-het-qc.txt"), header = T, stringsAsFactors = F, check.names = F)
genotype.missingQC = read.table(paste0(work_dir,"plink.imiss"), header = T, stringsAsFactors = F, check.names = F)
# length(unique(metadata$Donor_id)) # 115 donors
# sum(unique(metadata$Donor_id) %in% genotype.fam$Donor_id) # 109 of them have genotypes (pre GSA QC)
# sum(unique(metadata$Donor_id) %in% genotype.fam.QCed$Donor_id) # 105 of them have genotypes (after GSA QC)
# length(unique(metadata.filt$donor_id)) # 111 donors after RNAseq QC
# sum(unique(metadata.filt$donor_id) %in% genotype.fam$Donor_id) # 105 of them have genotypes (pre GSA QC)
# sum(unique(metadata.filt$donor_id) %in% genotype.fam.QCed$Donor_id) # 101 of them have genotypes (after GSA QC)
genotype.fam.merged = merge(genotype.fam,genotype.fam.QCed, by = c("V1","V2"), all.x = T) %>% dplyr::rename(V3 = V3.x, V4 = V4.x, V5 = V5.x, V6 = V6.x, Donor_id = Donor_id.x) %>% mutate(QCed = !is.na(V3.y)) %>% dplyr::select(V1,V2,V3,V4,V5,V6,QCed,Donor_id)
createDT(genotype.fam.merged)
Comparing donor and samples overlapping the GSA files. Showing before and after RNAseq QC
data.frame(Status=c("preQC","afterQC"),
`Donors (n)`=c(length(unique(metadata$Donor_id)), length(unique(metadata.filt$donor_id))),
`Samples (n)`=c(length(unique(metadata$Donor_tissue)), length(unique(metadata.filt$donor_tissue))),
check.names = F)
Same thing, but now showing before and after GSA QC
data.frame(Status=c("preQC","afterQC"),
`Total donors (n)`=c(nrow(genotype.fam),nrow(genotype.fam.QCed)),
`Unique donors (n)`=c(length(unique(genotype.fam$V2)), length(unique(genotype.fam.QCed$V2))),
`Matching preQC RNAseq samples (n)`= c(sum(unique(metadata$Donor_id) %in% unique(genotype.fam$Donor_id)), sum(unique(metadata$Donor_id) %in% unique(genotype.fam.QCed$Donor_id))),
`Matching afterQC RNAseq samples (n)`= c(sum(unique(metadata.filt$donor_id) %in% unique(genotype.fam$Donor_id)), sum(unique(metadata.filt$donor_id) %in% unique(genotype.fam.QCed$Donor_id))),
check.names = F)
Duplicated IDs? Showing only for IDs matching RNAseq samples
#genotype.fam.merged$Donor_id = gsub("(.*)_(.*)","\\1",genotype.fam.merged$V2)
isdup <- function (x) duplicated (x) | duplicated (x, fromLast = TRUE)
genotype.fam.merged %>% filter(genotype.fam.merged$Donor_id %in% unique(metadata$Donor_id)) %>% filter(isdup(Donor_id))
listInput = list(RNAseq_preQC = unique(metadata$Donor_id),
RNAseq_postQC = unique(metadata.filt$donor_id),
GSA_preQC = unique(genotype.fam$Donor_id),
GSA_postQC = unique(genotype.fam.QCed$Donor_id))
upset(fromList(listInput), order.by = "freq")
Samples with RNAseq that were not genotyped
metadata.missing = metadata %>% filter(!Donor_id %in% unique(genotype.fam$Donor_id)) %>% dplyr::select(Donor_tissue,Donor_id) %>% distinct()
metadata.missing$PassQC = metadata.missing$Donor_tissue %in% metadata.filt$donor_tissue
failed_samples_df = register_failed_samples(failed_samples_df, metadata.missing$Donor_tissue, "Not_genotyped")
createDT(metadata.missing)
MG samples that were genotyped but didn’t pass the GSA QC steps were because of individual missing rate (>0.15)
donors_preQC_GSA = unique(metadata$Donor_id)[unique(metadata$Donor_id) %in% genotype.fam$Donor_id]
donors_afterQC_GSA = unique(metadata$Donor_id)[unique(metadata$Donor_id) %in% genotype.fam.QCed$Donor_id]
donors_removed_in_GSA_QC = donors_preQC_GSA[!donors_preQC_GSA %in% donors_afterQC_GSA]
failed_samples_df = register_failed_samples(failed_samples_df, metadata[metadata$Donor_id%in%donors_removed_in_GSA_QC,"Donor_tissue"], "Removed_in_GSA_QC_highMissingness")
genotype.indmiss = genotype.missingQC[genotype.missingQC$FID %in% genotype.fam[genotype.fam$Donor_id%in%metadata$Donor_id,"V1"], ]
genotype.indmiss$Miss_thresh = genotype.indmiss$F_MISS>0.15
ggplot(genotype.indmiss, aes(x=F_MISS)) +
geom_histogram(aes(fill=Miss_thresh), bins = 100, colour="black") +
scale_fill_manual(values = c("grey","#CB454A")) +
geom_vline(xintercept=0.15, color="grey") +
labs(y = "Number of individuals", x = "Variant missingness per individual") +
theme_classic() +
easy_remove_legend()
List of samples removed due to missingness in the genotyping step
After GSA QC AND RNAseq QC (101 donors and 262 samples)
Ancestry defined with differenct thresholds per population (Michael Chao analysis)
UNKNOWN are samples not present in any of the ancestry files (probably didnt pass SD thresdhold)
aj_samples = read.table(paste0(anc_dir,"gsa123456_aj_samples_10MDS_mean3sd.txt"), header = T, stringsAsFactors = F)
afr_samples = read.table(paste0(anc_dir,"gsa_afr_samples_10MDS_4SD.txt"), header = F, stringsAsFactors = F)
amr_samples = read.table(paste0(anc_dir,"gsa_amr_samples_10MDS_6SD.txt"), header = F, stringsAsFactors = F)
eas_samples = read.table(paste0(anc_dir,"gsa_eas_samples_10MDS_3SD.txt"), header = F, stringsAsFactors = F)
eur_samples = read.table(paste0(anc_dir,"gsa_eur_samples_10MDS_6SD.txt"), header = F, stringsAsFactors = F)
sas_samples = read.table(paste0(anc_dir,"gsa_sas_samples_10MDS_6SD.txt"), header = F, stringsAsFactors = F)
anc_samples = rbind(as.data.frame(cbind(aj_samples$IID,"AJ")),
cbind(afr_samples,V2 ="AFR"),
cbind(amr_samples,V2 ="AMR"),
cbind(eas_samples,V2 ="EAS"),
cbind(eur_samples,V2 ="EUR"),
cbind(sas_samples,V2 ="SAS"))
genotype.fam.merged$merged_id = paste(genotype.fam.merged$V1,genotype.fam.merged$V2, sep = "_")
genotype.fam.merged.usable = genotype.fam.merged[genotype.fam.merged$Donor_id %in% unique(metadata_usable$donor_id),]
genotype.fam.merged.usable.afr = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% afr_samples$V1,]
genotype.fam.merged.usable.amr = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% amr_samples$V1,]
genotype.fam.merged.usable.eas = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% eas_samples$V1,]
genotype.fam.merged.usable.eur = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% eur_samples$V1,]
genotype.fam.merged.usable.sas = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% sas_samples$V1,]
genotype.fam.merged.usable.aj = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% aj_samples$IID,]
#genotype.fam.merged.usable.eur[!genotype.fam.merged.usable.eur$merged_id %in% genotype.fam.merged.usable.aj$merged_id,]
genotype.fam.merged.usable.unknown = genotype.fam.merged.usable[!genotype.fam.merged.usable$merged_id %in% anc_samples$V1,]
data.frame(`Ancestry` = c("AFR","AMR","EAS","EUR","SAS","AJ (already included in EUR)","UNKNOWN"),
`Samples` = c(length(unique(genotype.fam.merged.usable.afr$Donor_id)),
length(unique(genotype.fam.merged.usable.amr$Donor_id)),
length(unique(genotype.fam.merged.usable.eas$Donor_id)),
length(unique(genotype.fam.merged.usable.eur$Donor_id)),
length(unique(genotype.fam.merged.usable.sas$Donor_id)),
length(unique(genotype.fam.merged.usable.aj$Donor_id)),
length(unique(genotype.fam.merged.usable.unknown)$Donor_id)),
`Duplicated` = c(sum(duplicated(genotype.fam.merged.usable.afr$Donor_id)),
sum(duplicated(genotype.fam.merged.usable.amr$Donor_id)),
sum(duplicated(genotype.fam.merged.usable.eas$Donor_id)),
sum(duplicated(genotype.fam.merged.usable.eur$Donor_id)),
sum(duplicated(genotype.fam.merged.usable.sas$Donor_id)),
sum(duplicated(genotype.fam.merged.usable.aj$Donor_id)),
sum(duplicated(genotype.fam.merged.usable.unknown$Donor_id))))
# Load MDS files
mds_all = read.delim2(paste0(work_dir,"gsa123456_1000g_10mds.txt"), stringsAsFactors = F)
mds.usable = mds_all[mds_all$IID %in% genotype.fam.merged.usable$merged_id,]
mds.usable
mds_all$MajorPop = gsub("(.*)-(.*)","\\1",mds_all$Population)
ancestry_pcs_cohort = mds_all[mds_all$MajorPop %in% c("AFR","AMR","EAS","EUR","SAS") | mds_all$IID %in% mds.usable$IID,]
ancestry_pcs_cohort$MajorPop[ancestry_pcs_cohort$MajorPop=="GSA"] = "MG cohort"
ancestry_pcs_cohort$cohort = ancestry_pcs_cohort$MajorPop=="MG cohort"
ancestry_pcs_cohort$MajorPop = factor(ancestry_pcs_cohort$MajorPop,
levels = c("AFR","AMR","EAS","EUR","SAS","MG cohort"))
ancestry_pcs_colors = c("#941594","#E18727FF","#108C43","#69A5CC","#ABB9B9","#CB454A")
ancestry_pcs_cohort$C1 = as.numeric(as.character(ancestry_pcs_cohort$C1))
ancestry_pcs_cohort$C2 = as.numeric(as.character(ancestry_pcs_cohort$C2))
ggplot(ancestry_pcs_cohort,
aes(x=C1, y=C2, fill=MajorPop)) +
geom_point(aes(shape=cohort,alpha=cohort,color=cohort), size=2.5) +
scale_color_manual(values = c("white","black"), guide = FALSE) +
scale_shape_manual(values = c(22,22), guide = FALSE) +
scale_alpha_manual(values = c(0.2,0.8), guide = FALSE) +
scale_fill_manual(values = ancestry_pcs_colors,
guide = guide_legend(fill = ancestry_pcs_colors, colour = "black")) +
scale_x_continuous(breaks = pretty_breaks(n = 10)) +
scale_y_continuous(breaks = pretty_breaks(n = 10)) +
coord_fixed() +
labs(x = "Ancestry PC1", y = "Ancestry PC2", fill = "Population") +
guides(fill = guide_legend(nrow = 1, override.aes = list(shape = 22, colour = "black"))) +
theme_classic() +
theme(legend.position="top", legend.box = "horizontal")
Comparing reported sex information with inferred sex from genotypes
sex_check = read.table(paste0(anc_dir,"plink.sexcheck"), header = T, stringsAsFactors = F)
sex_check.usable = sex_check[sex_check$FID %in% unique(genotype.fam.merged.usable$V1),]
sex_check.usable$donor_id = gsub("_","-",gsub("(.*)_GFM","\\1",gsub("(.*)_CER","\\1",sex_check.usable$IID)))
sex_check.usable = sex_check.usable[sex_check.usable$donor_id %in% metadata.filt$donor_id,]
# 1 is M
# 2 if F
# Compare with our metadata
sex_check.usable = sex_check.usable %>% left_join(unique(metadata[,c("Donor_id","sex")]), by = c("donor_id" = "Donor_id"))
sex_check.usable$sex2 = sex_check.usable$sex
sex_check.usable$sex2[sex_check.usable$sex2=="m"] = 1
sex_check.usable$sex2[sex_check.usable$sex2=="f"] = 2
sex_check.usable$STATUS2 = sex_check.usable$SNPSEX == sex_check.usable$sex2
sex_check.usable$STATUS2[sex_check.usable$STATUS2==T] = "OK"
sex_check.usable$STATUS2[sex_check.usable$STATUS2==F] = "PROBLEM"
#table(metadata_usable %>% distinct(donor_id,sex) %>% dplyr::select(sex),useNA="ifany")
#table(sex_check.usable$SNPSEX2,useNA="ifany")
a = ggplot(sex_check.usable, aes(x = STATUS2, fill=STATUS2)) +
geom_bar(color="black") +
geom_text(stat='count', aes(label=..count..),vjust=-0.35) +
theme_classic() +
labs(y = "Count", x = "Sex Check Status") +
scale_fill_aaas() +
easy_remove_legend()
sex_check.usable$SNPSEX2 = sex_check.usable$SNPSEX
sex_check.usable$SNPSEX2[sex_check.usable$SNPSEX2 == 1] = "Male"
sex_check.usable$SNPSEX2[sex_check.usable$SNPSEX2 == 2] = "Female"
sex_check.usable$SNPSEX2[sex_check.usable$SNPSEX2 == 0] = "Other"
b = ggplot(sex_check.usable, aes(x = SNPSEX2, fill=STATUS2)) +
geom_bar(position="dodge", color="black") +
geom_text(aes(label=..count..), stat='count', position=position_dodge(0.9), vjust=-0.35) +
labs(y = "Count", x = "SNP Sex Inferred", fill = "Sex Check Status") +
scale_fill_aaas() +
theme_classic()
grid.arrange(a, b, nrow = 1)
Showing the samples with sex missmatches. 8 donors
sex_check.usable.problem = sex_check.usable[sex_check.usable$STATUS2=="PROBLEM",]
#sex_check.usable.problem$donor_id = gsub("(.*)_(.*)","\\1",sex_check.usable.problem$IID)
sex_check.usable.problem2 = sex_check.usable.problem %>%
dplyr::select(gsa_id = FID,gsa_id2 = IID,donor_id,reported_sex = sex,inferred_sex = SNPSEX2,status = STATUS2,`F`)
failed_samples_df = register_failed_samples(failed_samples_df, metadata[metadata$Donor_id %in% sex_check.usable.problem2$donor_id, "Donor_tissue"], "SexMismatch")
createDT(sex_check.usable.problem2)
Checking in the RNA data
load(paste0(expression_dir, "Expression_filt_285s.Rdata"))
# TPM
#dim(genes_tpm_exp) # 19376 genes and 285 samples
xist = genes_tpm_exp[which(grepl("ENSG00000229807",rownames(genes_tpm_exp))),]
uty = genes_tpm_exp[which(grepl("ENSG00000183878",rownames(genes_tpm_exp))),]
rna_sex_genes_tpm = data.frame(XIST = t(xist), UTY = t(uty))
colnames(rna_sex_genes_tpm) = c("XIST","UTY")
# Merge with metadata
#all(metadata.filt$donor_tissue==rownames(rna_sex_genes_tpm)) # TRUE
rna_sex_genes_tpm$`Reported Sex` = metadata.filt$sex
#ggplot(rna_sex_genes_tpm, aes(x=XIST, y=UTY, fill=`Reported Sex`)) +
# geom_point(shape=21)
# VOOM
genes_counts_voom = as.data.frame(genes_counts_voom, stringsAsFactors = F)
xist = genes_counts_voom[which(grepl("ENSG00000229807",rownames(genes_counts_voom))),]
uty = genes_counts_voom[which(grepl("ENSG00000183878",rownames(genes_counts_voom))),]
rna_sex_genes_voom = data.frame(XIST = t(xist), UTY = t(uty))
colnames(rna_sex_genes_voom) = c("XIST","UTY")
# Merge with metadata
#all(metadata.filt$donor_tissue==rownames(rna_sex_genes_voom)) # TRUE
rna_sex_genes_voom$`Reported Sex` = factor(toupper(metadata.filt$sex), levels = c("M","F"))
rna_sex_genes_voom$donor_id = metadata.filt$donor_id
rna_sex_genes_voom$donor_tissue = metadata.filt$donor_tissue
rna_sex_genes_voom = rna_sex_genes_voom %>% plyr::join(sex_check.usable[,c("donor_id","STATUS")], by = "donor_id")
avg_xist = mean(rna_sex_genes_voom$XIST[rna_sex_genes_voom$`Reported Sex`=="F"])
avg_uty = mean(rna_sex_genes_voom$UTY[rna_sex_genes_voom$`Reported Sex`=="M"])
rna_sex_genes_voom$toCheck = F
rna_sex_genes_voom$toCheck = (rna_sex_genes_voom$UTY>0 & rna_sex_genes_voom$`Reported Sex`=="F") |
(rna_sex_genes_voom$XIST<2 & rna_sex_genes_voom$`Reported Sex`=="F") |
(rna_sex_genes_voom$XIST>4.5 & rna_sex_genes_voom$`Reported Sex`=="M")
ggplot(rna_sex_genes_voom, aes(x=XIST, y=UTY, fill=`Reported Sex`)) +
geom_point(shape=21, size=2, alpha=.8) +
geom_label_repel(data = subset(rna_sex_genes_voom, toCheck == T),
aes(label = donor_tissue),
fill = "grey90",
size=3,
segment.alpha = 0.5,
label.size = NA,
alpha = 0.8,
label.padding=.1,
box.padding = .1,
na.rm=TRUE,
seed = 1234) +
scale_fill_aaas() +
labs(x = "XIST (Voom)", y = "UTY (Voom)", fill = "Reported Sex") +
theme_classic()
Numbers:
Donors | Samples | |
---|---|---|
All | 100 | 261 |
EUR-only | 92 | 239 |
Showing below metadata for all ancestries
# Remove related individual
metadata_final = metadata_usable[!metadata_usable$donor_id=="15-041",]
# Add sexCheck information
metadata_final$sex_check_FAIL = metadata_final$donor_id %in% sex_check.usable.problem$donor_id
# Select european ancestry individuals
metadata_final.eur = metadata_final[metadata_final$donor_id %in% genotype.fam.merged.usable.eur$Donor_id,]
# Filter duplicated and related donor from GSA fam table
genotype.fam.merged.usable.final = genotype.fam.merged.usable[!genotype.fam.merged.usable$V1 %in% c("GSA5_109","GSA5_121","GSA5_124","GSA5_171","GSA6_179"),]
rownames(genotype.fam.merged.usable.final) = genotype.fam.merged.usable.final$Donor_id
# Select european ancestry individuals
genotype.fam.merged.usable.final.eur = genotype.fam.merged.usable.final[genotype.fam.merged.usable.final$Donor_id %in% genotype.fam.merged.usable.eur$Donor_id,]
metadata_final$gsa_id = genotype.fam.merged.usable.final[metadata_final$donor_id,"V1"]
metadata_final$merged_gsa_id = genotype.fam.merged.usable.final[metadata_final$donor_id,"merged_id"]
# merged_gsa_id2 have the original GSA id (some samples have donor_ids with _ instead of -)
metadata_final$merged_gsa_id2 = paste0(genotype.fam.merged.usable.final[metadata_final$donor_id,"V1"],"_",genotype.fam.merged.usable.final[metadata_final$donor_id,"V2"])
metadata_final.eur$gsa_id = genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"V1"]
metadata_final.eur$merged_gsa_id = genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"merged_id"]
metadata_final.eur$merged_gsa_id2 = paste0(genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"V1"],"_",genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"V2"])
#write.table(metadata_final, file = paste0(write_dir,"metadata4eQTL_final.txt"), quote = F, row.names = F)
#write.table(metadata_final.eur, file = paste0(write_dir,"metadata4eQTL_final.eur.txt"), quote = F, row.names = F)
#write.table(metadata_final.eur$gsa_id, file = paste0(write_dir,"gsa_ids_4eQTL_final.eur.txt"), quote = F, row.names = F)
#write.table(genotype.fam.merged.usable[,c("V1","V2")], file = paste0(write_dir,"gsa_ids_4eQTL_prefilt.txt"), quote = F, row.names = F)
createDT(metadata_final)
We will check for concordance between RNA (using .bam files) and DNA data (using GSA genotypes from .vcf file).
First, we will show if there is matches for 13 samples that were not genotyped.
#mbv_test <- read.table(file = paste0(mbv_dir,"/results/glia_omics_mbv/output/16-116-GTS.bamstat.txt"), header = T, check.names = F)
#ggplot(unique(mbv_test), aes(x = perc_het_consistent, y = perc_hom_consistent)) +
# geom_point( shape=21, alpha=0.4, size=2, fill="grey" ) +
# labs(y = "Concordance at HOM (%)", x = "Concordance at HET (%)") +
# theme_classic()
mbv <- read.table(file = paste0(mbv_dir,"/results/glia_omics_mbv/glia_omics_mbv_mbv_summary.txt"),
header = F,
check.names = F,
col.names = c("bam","SampleID","n_geno_missing","n_het_total","n_hom_total","n_het_covered","n_hom_covered","n_het_consistent","n_hom_consistent","perc_het_consistent","perc_hom_consistent","n_het_in_ase"))
mbv$bam_donor_id = gsub("(.*)\\/(.*)-(.*)","\\2",mbv$bam)
mbv$bam_sample_id = gsub("(.*)\\/(.*)\\.bamstat\\.txt","\\2",mbv$bam)
mbv$sample_donor_id = gsub("_","-",gsub("_CER|_GFM","",gsub("(.*?)_(.*?)_(.*)","\\3",mbv$SampleID)))
mbv$sample_id = gsub("(.*?)_(.*?)_(.*)","\\3",mbv$SampleID)
mbv$same_donor = mbv$bam_donor_id==mbv$sample_donor_id
failed_samples_df = register_failed_samples(failed_samples_df, mbv[mbv$same_donor==F,"bam_sample_id"], "MBV")
mbv.missing = mbv[mbv$bam_sample_id %in% metadata.missing$Donor_tissue,]
ggplot(mbv.missing, aes(x = perc_het_consistent, y = perc_hom_consistent, fill = same_donor)) +
geom_vline(xintercept=0.5, color="grey", linetype = "dashed") +
geom_point(shape=21, alpha=0.4, size=2) +
geom_point(data = subset(mbv.missing, same_donor == F), alpha = 0.8, shape=21, size=2 ) +
geom_label_repel(data = subset(mbv.missing, same_donor == F),
aes(label = bam_sample_id),
fill = "#CB454A",
size=3,
segment.alpha = 0.3,
label.size = NA,
alpha = 0.8,
label.padding=.1,
box.padding = .1,
na.rm=TRUE,
seed = 1234) +
geom_label_repel(data = subset(mbv.missing, same_donor == T & perc_het_consistent<0.5),
aes(label = bam_sample_id),
fill = "grey",
size=3,
segment.alpha = 0.3,
label.size = NA,
alpha = 0.8,
label.padding=.1,
box.padding = .1,
na.rm=TRUE,
seed = 1234) +
expand_limits(y = 1, x = 1) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
scale_fill_manual(values = c("#CB454A","grey")) +
labs(y = "Concordance at HOM", x = "Concordance at HET", fill = "Matching donor", alpha = "Status") +
theme_classic()
Showing results for all 301 RNA-seq samples that were genotyped (each dot is one sample). Labels in red are missmatching samples.
# Removing RNAseq missing GSA
mbv = mbv[!mbv$bam_donor_id %in% metadata.missing$Donor_id,] # 301 samples
# sum(mbv$bam_donor_id==mbv$sample_donor_id) # 287 samples matching RNA and DNA
mbv$same_donor = mbv$bam_donor_id==mbv$sample_donor_id
mbv$status = NA
mbv$status[mbv$same_donor==T] = "Match"
mbv$status[mbv$same_donor==F] = "Not Match"
mbv$status[mbv$bam_donor_id %in% metadata.missing$Donor_id] = "Not Genotyped"
ggplot(mbv, aes(x = perc_het_consistent, y = perc_hom_consistent, fill = same_donor)) +
geom_vline(xintercept=0.5, color="grey", linetype = "dashed") +
geom_point(shape=21, alpha=0.4, size=2) +
geom_point(data = subset(mbv, same_donor == F), alpha = 0.8, shape=21, size=2 ) +
geom_label_repel(data = subset(mbv, same_donor == F),
aes(label = bam_donor_id),
fill = "#CB454A",
size=3,
segment.alpha = 0.3,
label.size = NA,
alpha = 0.8,
label.padding=.1,
box.padding = .1,
na.rm=TRUE,
seed = 1234) +
expand_limits(y = 1, x = 1) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
scale_fill_manual(values = c("#CB454A","grey")) +
labs(y = "Concordance at HOM", x = "Concordance at HET", fill = "Matching donor", alpha = "Status") +
theme_classic()
Now showing only the samples where donors passed all previous QC metrics (261 samples/100 donors)
mbv.filtered = mbv[mbv$bam_sample_id %in% metadata_final$donor_tissue,]
ggplot(mbv.filtered, aes(x = perc_het_consistent, y = perc_hom_consistent, fill = same_donor)) +
geom_vline(xintercept=0.5, color="grey", linetype = "dashed") +
geom_point(shape=21, alpha=0.4, size=2) +
geom_point(data = subset(mbv.filtered, same_donor == F), alpha = 0.8, shape=21, size=2 ) +
geom_label_repel(data = subset(mbv.filtered, same_donor == F),
aes(label = bam_sample_id),
fill = "#CB454A",
size=3,
segment.alpha = 0.3,
label.size = NA,
alpha = 0.8,
label.padding=.1,
box.padding = .1,
na.rm=TRUE,
seed = 1234) +
geom_label_repel(data = subset(mbv.filtered, same_donor == T & perc_het_consistent<0.5),
aes(label = bam_sample_id),
fill = "grey",
size=3,
segment.alpha = 0.3,
label.size = NA,
alpha = 0.8,
label.padding=.1,
box.padding = .1,
na.rm=TRUE,
seed = 1234) +
expand_limits(y = 1, x = 1) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
scale_fill_manual(values = c("#CB454A","grey")) +
labs(y = "Concordance at HOM", x = "Concordance at HET", fill = "Matching donor", alpha = "Status") +
theme_classic()
From 261 (after-QC) RNAseq samples, 258 (98 donors) matched donors with DNA.
Here is the list of missmatching samples.
#length(unique(mbv.filtered$bam)) # 261 RNAseq bam files
#length(unique(metadata$Donor_tissue)) # 314
#length(unique(mbv$bam_sample_id)) # 291
#length(unique(mbv$bam_donor_id)) # 105 donors (from RNAseq bam ids)
#length(unique(mbv$SampleID)) # 108 donors (from GSA) - best matches for each RNAseq sample
#sum(unique(mbv$bam_sample_id) %in% unique(metadata$Donor_tissue)) # 291 samples (bam ids) are in the metadata
#sum(unique(mbv$bam_donor_id) %in% unique(metadata$Donor_id)) # 105 donors (bam ids) are in the metadata
#sum(mbv$bam_donor_id==mbv$sample_donor_id) # 287 RNAseq samples are matching donors with DNA
#sum(mbv.filtered$bam_donor_id==mbv.filtered$sample_donor_id) # 258 RNAseq samples are matching donors with DNA
mbv_match = mbv.filtered[which(mbv.filtered$bam_donor_id==mbv.filtered$sample_donor_id),]
#length(unique(mbv_match$bam_donor_id)) # 98 donors
mbv_miss = mbv.filtered[which(!mbv.filtered$bam_donor_id==mbv.filtered$sample_donor_id),]
mbv_miss$PassMBV = mbv_miss$perc_het_consistent>0.5
createDT(mbv_miss)
After removing non-concordant samples.
Final numbers:
Donors | Samples | |
---|---|---|
All | 98 | 258 |
EUR-only | 90 | 236 |
Showing below metadata for all ancestries
metadata_final2 = metadata_final[metadata_final$donor_tissue %in% mbv_match$bam_sample_id,]
metadata_final2 = dplyr::left_join(metadata_final2, mds.usable, by = c("merged_gsa_id" = "IID"))
#length(unique(metadata_final2$donor_id)) # 98 donors
# Select european ancestry individuals
metadata_final2.eur = metadata_final2[metadata_final2$donor_id %in% genotype.fam.merged.usable.eur$Donor_id,]
#length(unique(metadata_final2.eur$donor_id)) # 90 donors
write.table(metadata_final2, file = paste0(write_dir,"metadata4eQTL.txt"), quote = F, row.names = F, sep = "\t")
write.table(metadata_final2.eur, file = paste0(write_dir,"metadata4eQTL_eur.txt"), quote = F, row.names = F, sep = "\t")
# Select ids for filtering samples in the plink file
write.table(metadata_final2[,c("merged_gsa_id2")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.txt"), quote = F, row.names = F, col.names = F, sep=" ")
# Mapping GSA names to donor_id
write.table(metadata_final2[,c("merged_gsa_id2","donor_id")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.namesMap.txt"), quote = F, row.names = F, col.names = F, sep=" ")
# Select ids for filtering samples in the plink file
write.table(metadata_final2.eur[,c("merged_gsa_id2")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.eur.txt"), quote = F, row.names = F)
# Mapping GSA names to donor_id
write.table(metadata_final2.eur[,c("merged_gsa_id2","donor_id")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.eur.namesMap.txt"), quote = F, row.names = F, col.names = F, sep=" ")
createDT(metadata_final2)
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] factoextra_1.0.6 ggrepel_0.8.2 scales_1.1.0 gridExtra_2.3
## [5] UpSetR_1.4.0 ggeasy_0.1.2 ggsci_2.9 DT_0.13
## [9] forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.3
## [13] readr_1.3.1 tidyr_1.0.2 tibble_3.0.0 ggplot2_3.3.0
## [17] tidyverse_1.3.0 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.4 lubridate_1.7.4 lattice_0.20-40 assertthat_0.2.1
## [5] digest_0.6.25 R6_2.4.1 cellranger_1.1.0 plyr_1.8.6
## [9] backports_1.1.5 reprex_0.3.0 evaluate_0.14 httr_1.4.1
## [13] pillar_1.4.3 rlang_0.4.5 readxl_1.3.1 rstudioapi_0.11
## [17] rmarkdown_2.1 labeling_0.3 htmlwidgets_1.5.1 munsell_0.5.0
## [21] broom_0.5.5 compiler_3.6.3 modelr_0.1.6 xfun_0.12
## [25] pkgconfig_2.0.3 htmltools_0.4.0 tidyselect_1.0.0 fansi_0.4.1
## [29] crayon_1.3.4 dbplyr_1.4.2 withr_2.1.2 grid_3.6.3
## [33] nlme_3.1-145 jsonlite_1.6.1 gtable_0.3.0 lifecycle_0.2.0
## [37] DBI_1.1.0 magrittr_1.5 cli_2.0.2 stringi_1.4.6
## [41] farver_2.0.3 fs_1.3.2 xml2_1.2.5 ellipsis_0.3.0
## [45] generics_0.0.2 vctrs_0.2.4 tools_3.6.3 glue_1.3.2
## [49] crosstalk_1.1.0.1 hms_0.5.3 yaml_2.2.1 colorspace_1.4-1
## [53] rvest_0.3.5 knitr_1.28 haven_2.2.0