## Setup, Load Libraries & Report Versions
knitr::opts_chunk$set(echo = TRUE)

library(pacman)
p_load(tidyverse,DT,ggsci,ggeasy,UpSetR,gridExtra,scales,ggrepel,scales,factoextra)

createDT <- function(DF, caption="", scrollY=500){
  data <- DT::datatable(DF, caption=caption,
                        extensions = 'Buttons',
                        class = "display",
                        callback = JS("return table;"),
                        filter = c("none", "bottom", "top"), 
                        escape = TRUE,
                        style = "default", width = NULL, height = NULL, elementId = NULL,
                        fillContainer = getOption("DT.fillContainer", NULL),
                        autoHideNavigation = getOption("DT.autoHideNavigation", NULL),
                        selection = c("multiple", "single", "none"),
                        plugins = NULL, editable = FALSE,
                        options = list( dom = 'Bfrtip', 
                                        buttons = c('copy', 'csv', 'excel', 'pdf', 'print'), 
                                        scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = T,  
                                        columnDefs = list(list(className = 'dt-center', targets = "_all"))
                        )
  ) 
  return(data)
}

work_dir = "~/ad-omics_hydra/microglia_omics/expression_tables/added_pilot_314s/"
genotype_dir = "~/pd-omics_hydra/glia_omics/genotypes/" 
metadata_path = "~/pd-omics/katia/Microglia/mic_314s/metadata_files/"
anc_dir = "~/ad-omics/ricardo/gsa/"
write_dir = anc_dir

work_dir = "D:/OneDrive/Área de Trabalho/Work/gsa/"
genotype_dir = paste0(work_dir,"genotypes/")
metadata_path = paste0(work_dir,"metadata_files/")
mbv_dir = paste0(work_dir,"mbv/")
anc_dir = work_dir
write_dir = anc_dir
expression_dir = paste0(work_dir,"expression_tables_added_pilot_314s/")

GSA fam file

This file contains all samples processed by Michael Chao, not only the ones from this project.

First columns is the GSA id. Second column is our ID.

QCed column show samples kept in the file gsa123456_geno95_hwe6_mind95_het3sd

### READ METADATA
# All RNA-seq samples (314)
metadata = read.table(paste0(metadata_path,"metadata_314s_28feb2020.txt"), header = T, check.names = F, sep="\t", stringsAsFactors = F)
# Only RNA-seq samples that pass QC (285)
metadata.filt = read.table(paste0(metadata_path,"metadata_285filt_eng_3mar2020.txt"), header = T, check.names = F, sep="\t", stringsAsFactors = F)
# Just checking if donor IDs can be inferred from sample IDs
#length(unique(gsub("(.*)-(.*)","\\1",metadata$Donor_tissue))) # 115 donors from RNAseq (preQC)
#all(gsub("(.*)-(.*)","\\1",metadata$Donor_tissue) == metadata$Donor_id) # All donor IDs can be inferred from Sample IDs
#length(unique(gsub("(.*)-(.*)","\\1",metadata.filt$donor_tissue))) # 111 donor from RNAseq (afterQC)
#all(gsub("(.*)-(.*)","\\1",metadata.filt$donor_tissue) == metadata.filt$donor_id) # Again, all donor IDs can be inferred from Sample IDs

failed_samples_df = data.frame(donor_tissue = metadata$Donor_tissue, status = "Ok", cause = "", stringsAsFactors = F)
register_failed_samples <- function(failed_samples_df, donor_tissue, cause){
  failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "status"] <- "Failed"
  failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "cause"] <- ifelse(failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "cause"]=="", 
                                                                                         cause, paste0(failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "cause"],";",cause))
  return(failed_samples_df)
}
failed_samples_df = register_failed_samples(failed_samples_df, metadata[!metadata$Donor_tissue %in% metadata.filt$donor_tissue,"Donor_tissue"],"RNAseq_QC")

### READ GENOTYPE FAM FILES
genotype.fam = read.table(paste0(genotype_dir,"gsa123456.fam"), header = F, stringsAsFactors = F)
genotype.fam$Donor_id = gsub("_","-",gsub("(.*)_GFM","\\1",gsub("(.*)_CER","\\1",genotype.fam$V2))) # Remove chars after underline to match samples names in the RNAseq tables
genotype.fam.QCed = read.table(paste0(genotype_dir,"gsa123456_geno95_hwe6_mind95_het3sd.fam"), header = F, stringsAsFactors = F)
genotype.fam.QCed$Donor_id = gsub("_","-",gsub("(.*)_GFM","\\1",gsub("(.*)_CER","\\1",genotype.fam.QCed$V2))) # Remove chars after underline to match samples names in the RNAseq tables
### READ HET QC 
genotype.hetQC = read.table(paste0(work_dir,"fail-het-qc.txt"), header = T, stringsAsFactors = F, check.names = F)
genotype.missingQC = read.table(paste0(work_dir,"plink.imiss"), header = T, stringsAsFactors = F, check.names = F)

# length(unique(metadata$Donor_id)) # 115 donors
# sum(unique(metadata$Donor_id) %in% genotype.fam$Donor_id) # 109 of them have genotypes (pre GSA QC)
# sum(unique(metadata$Donor_id) %in% genotype.fam.QCed$Donor_id) # 105 of them have genotypes (after GSA QC)

# length(unique(metadata.filt$donor_id)) # 111 donors after RNAseq QC
# sum(unique(metadata.filt$donor_id) %in% genotype.fam$Donor_id) # 105 of them have genotypes (pre GSA QC)
# sum(unique(metadata.filt$donor_id) %in% genotype.fam.QCed$Donor_id) # 101 of them have genotypes (after GSA QC)

genotype.fam.merged = merge(genotype.fam,genotype.fam.QCed, by = c("V1","V2"), all.x = T) %>% dplyr::rename(V3 = V3.x, V4 = V4.x, V5 = V5.x, V6 = V6.x, Donor_id = Donor_id.x) %>% mutate(QCed = !is.na(V3.y)) %>% dplyr::select(V1,V2,V3,V4,V5,V6,QCed,Donor_id)
createDT(genotype.fam.merged)

From RNA-seq

Comparing donor and samples overlapping the GSA files. Showing before and after RNAseq QC

data.frame(Status=c("preQC","afterQC"),
           `Donors (n)`=c(length(unique(metadata$Donor_id)), length(unique(metadata.filt$donor_id))),
           `Samples (n)`=c(length(unique(metadata$Donor_tissue)), length(unique(metadata.filt$donor_tissue))),
           check.names = F)

From GSA

Same thing, but now showing before and after GSA QC

data.frame(Status=c("preQC","afterQC"),
           `Total donors (n)`=c(nrow(genotype.fam),nrow(genotype.fam.QCed)),
           `Unique donors (n)`=c(length(unique(genotype.fam$V2)), length(unique(genotype.fam.QCed$V2))),
           `Matching preQC RNAseq samples (n)`= c(sum(unique(metadata$Donor_id) %in% unique(genotype.fam$Donor_id)), sum(unique(metadata$Donor_id) %in% unique(genotype.fam.QCed$Donor_id))),
           `Matching afterQC RNAseq samples (n)`= c(sum(unique(metadata.filt$donor_id) %in% unique(genotype.fam$Donor_id)), sum(unique(metadata.filt$donor_id) %in% unique(genotype.fam.QCed$Donor_id))),
           check.names = F)

Duplicated IDs? Showing only for IDs matching RNAseq samples

#genotype.fam.merged$Donor_id = gsub("(.*)_(.*)","\\1",genotype.fam.merged$V2)
isdup <- function (x) duplicated (x) | duplicated (x, fromLast = TRUE)
genotype.fam.merged %>% filter(genotype.fam.merged$Donor_id %in% unique(metadata$Donor_id)) %>% filter(isdup(Donor_id))

Plot showing overlaps between GSA and RNAseq

listInput = list(RNAseq_preQC = unique(metadata$Donor_id), 
                 RNAseq_postQC = unique(metadata.filt$donor_id),
                 GSA_preQC = unique(genotype.fam$Donor_id),
                 GSA_postQC = unique(genotype.fam.QCed$Donor_id))
upset(fromList(listInput), order.by = "freq")

Missing samples

Samples with RNAseq that were not genotyped

metadata.missing = metadata %>% filter(!Donor_id %in% unique(genotype.fam$Donor_id)) %>% dplyr::select(Donor_tissue,Donor_id) %>% distinct()
metadata.missing$PassQC = metadata.missing$Donor_tissue %in% metadata.filt$donor_tissue

failed_samples_df = register_failed_samples(failed_samples_df, metadata.missing$Donor_tissue, "Not_genotyped")

createDT(metadata.missing)

Samples removed in GSA QC

MG samples that were genotyped but didn’t pass the GSA QC steps were because of individual missing rate (>0.15)

donors_preQC_GSA = unique(metadata$Donor_id)[unique(metadata$Donor_id) %in% genotype.fam$Donor_id] 
donors_afterQC_GSA = unique(metadata$Donor_id)[unique(metadata$Donor_id) %in% genotype.fam.QCed$Donor_id]
donors_removed_in_GSA_QC = donors_preQC_GSA[!donors_preQC_GSA %in% donors_afterQC_GSA]

failed_samples_df = register_failed_samples(failed_samples_df, metadata[metadata$Donor_id%in%donors_removed_in_GSA_QC,"Donor_tissue"], "Removed_in_GSA_QC_highMissingness")

genotype.indmiss = genotype.missingQC[genotype.missingQC$FID %in% genotype.fam[genotype.fam$Donor_id%in%metadata$Donor_id,"V1"], ]

genotype.indmiss$Miss_thresh = genotype.indmiss$F_MISS>0.15
ggplot(genotype.indmiss, aes(x=F_MISS)) +
  geom_histogram(aes(fill=Miss_thresh), bins = 100, colour="black") +
  scale_fill_manual(values = c("grey","#CB454A")) +
  geom_vline(xintercept=0.15, color="grey") + 
  labs(y = "Number of individuals", x = "Variant missingness per individual") +
  theme_classic() +
  easy_remove_legend()

List of samples removed due to missingness in the genotyping step

createDT(metadata[metadata$Donor_id %in% donors_removed_in_GSA_QC,c("Donor_tissue","Donor_id")])

Usable samples

After GSA QC AND RNAseq QC (101 donors and 262 samples)

metadata_usable = metadata.filt[metadata.filt$donor_id %in% unique(genotype.fam.QCed$Donor_id),]
createDT(metadata_usable)

Checking ancestry numbers

Ancestry defined with differenct thresholds per population (Michael Chao analysis)

UNKNOWN are samples not present in any of the ancestry files (probably didnt pass SD thresdhold)

aj_samples = read.table(paste0(anc_dir,"gsa123456_aj_samples_10MDS_mean3sd.txt"), header = T, stringsAsFactors = F)
afr_samples = read.table(paste0(anc_dir,"gsa_afr_samples_10MDS_4SD.txt"), header = F, stringsAsFactors = F)
amr_samples = read.table(paste0(anc_dir,"gsa_amr_samples_10MDS_6SD.txt"), header = F, stringsAsFactors = F)
eas_samples = read.table(paste0(anc_dir,"gsa_eas_samples_10MDS_3SD.txt"), header = F, stringsAsFactors = F)
eur_samples = read.table(paste0(anc_dir,"gsa_eur_samples_10MDS_6SD.txt"), header = F, stringsAsFactors = F)
sas_samples = read.table(paste0(anc_dir,"gsa_sas_samples_10MDS_6SD.txt"), header = F, stringsAsFactors = F)

anc_samples = rbind(as.data.frame(cbind(aj_samples$IID,"AJ")), 
                    cbind(afr_samples,V2 ="AFR"),
                    cbind(amr_samples,V2 ="AMR"),
                    cbind(eas_samples,V2 ="EAS"),
                    cbind(eur_samples,V2 ="EUR"),
                    cbind(sas_samples,V2 ="SAS"))

genotype.fam.merged$merged_id = paste(genotype.fam.merged$V1,genotype.fam.merged$V2, sep = "_")

genotype.fam.merged.usable = genotype.fam.merged[genotype.fam.merged$Donor_id %in% unique(metadata_usable$donor_id),]
genotype.fam.merged.usable.afr = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% afr_samples$V1,]
genotype.fam.merged.usable.amr = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% amr_samples$V1,]
genotype.fam.merged.usable.eas = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% eas_samples$V1,]
genotype.fam.merged.usable.eur = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% eur_samples$V1,]
genotype.fam.merged.usable.sas = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% sas_samples$V1,]
genotype.fam.merged.usable.aj = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% aj_samples$IID,]
#genotype.fam.merged.usable.eur[!genotype.fam.merged.usable.eur$merged_id %in% genotype.fam.merged.usable.aj$merged_id,]
genotype.fam.merged.usable.unknown = genotype.fam.merged.usable[!genotype.fam.merged.usable$merged_id %in% anc_samples$V1,]

data.frame(`Ancestry` = c("AFR","AMR","EAS","EUR","SAS","AJ (already included in EUR)","UNKNOWN"), 
           `Samples` = c(length(unique(genotype.fam.merged.usable.afr$Donor_id)),
                         length(unique(genotype.fam.merged.usable.amr$Donor_id)),
                         length(unique(genotype.fam.merged.usable.eas$Donor_id)),
                         length(unique(genotype.fam.merged.usable.eur$Donor_id)),
                         length(unique(genotype.fam.merged.usable.sas$Donor_id)),
                         length(unique(genotype.fam.merged.usable.aj$Donor_id)),
                         length(unique(genotype.fam.merged.usable.unknown)$Donor_id)), 
           `Duplicated` = c(sum(duplicated(genotype.fam.merged.usable.afr$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.amr$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.eas$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.eur$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.sas$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.aj$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.unknown$Donor_id))))

# Load MDS files
mds_all = read.delim2(paste0(work_dir,"gsa123456_1000g_10mds.txt"), stringsAsFactors = F)
mds.usable = mds_all[mds_all$IID %in% genotype.fam.merged.usable$merged_id,]
mds.usable

mds_all$MajorPop = gsub("(.*)-(.*)","\\1",mds_all$Population)
ancestry_pcs_cohort = mds_all[mds_all$MajorPop %in% c("AFR","AMR","EAS","EUR","SAS") | mds_all$IID %in% mds.usable$IID,]
ancestry_pcs_cohort$MajorPop[ancestry_pcs_cohort$MajorPop=="GSA"] = "MG cohort"
ancestry_pcs_cohort$cohort = ancestry_pcs_cohort$MajorPop=="MG cohort"
ancestry_pcs_cohort$MajorPop = factor(ancestry_pcs_cohort$MajorPop, 
                                      levels = c("AFR","AMR","EAS","EUR","SAS","MG cohort"))
  ancestry_pcs_colors = c("#941594","#E18727FF","#108C43","#69A5CC","#ABB9B9","#CB454A")

ancestry_pcs_cohort$C1 = as.numeric(as.character(ancestry_pcs_cohort$C1))
ancestry_pcs_cohort$C2 = as.numeric(as.character(ancestry_pcs_cohort$C2))
ggplot(ancestry_pcs_cohort, 
         aes(x=C1, y=C2, fill=MajorPop)) +
    geom_point(aes(shape=cohort,alpha=cohort,color=cohort), size=2.5) +
    scale_color_manual(values = c("white","black"), guide = FALSE) +
    scale_shape_manual(values = c(22,22), guide = FALSE) +
    scale_alpha_manual(values = c(0.2,0.8), guide = FALSE) +
    scale_fill_manual(values = ancestry_pcs_colors, 
                      guide = guide_legend(fill = ancestry_pcs_colors, colour = "black")) +
    scale_x_continuous(breaks = pretty_breaks(n = 10)) +
    scale_y_continuous(breaks = pretty_breaks(n = 10)) +
    coord_fixed() +
    labs(x = "Ancestry PC1", y = "Ancestry PC2", fill = "Population") +
    guides(fill = guide_legend(nrow = 1, override.aes = list(shape = 22, colour = "black"))) +
    theme_classic() +
    theme(legend.position="top", legend.box = "horizontal")

Sex check

Comparing reported sex information with inferred sex from genotypes

sex_check = read.table(paste0(anc_dir,"plink.sexcheck"), header = T, stringsAsFactors = F)
sex_check.usable = sex_check[sex_check$FID %in% unique(genotype.fam.merged.usable$V1),]
sex_check.usable$donor_id  = gsub("_","-",gsub("(.*)_GFM","\\1",gsub("(.*)_CER","\\1",sex_check.usable$IID)))
sex_check.usable = sex_check.usable[sex_check.usable$donor_id %in% metadata.filt$donor_id,]

# 1 is M
# 2 if F
# Compare with our metadata
sex_check.usable = sex_check.usable %>% left_join(unique(metadata[,c("Donor_id","sex")]), by = c("donor_id" = "Donor_id"))
sex_check.usable$sex2 = sex_check.usable$sex 
sex_check.usable$sex2[sex_check.usable$sex2=="m"] = 1
sex_check.usable$sex2[sex_check.usable$sex2=="f"] = 2
sex_check.usable$STATUS2 = sex_check.usable$SNPSEX == sex_check.usable$sex2
sex_check.usable$STATUS2[sex_check.usable$STATUS2==T] = "OK"
sex_check.usable$STATUS2[sex_check.usable$STATUS2==F] = "PROBLEM"

#table(metadata_usable %>% distinct(donor_id,sex) %>% dplyr::select(sex),useNA="ifany")
#table(sex_check.usable$SNPSEX2,useNA="ifany")

a = ggplot(sex_check.usable, aes(x = STATUS2, fill=STATUS2)) + 
  geom_bar(color="black") + 
  geom_text(stat='count', aes(label=..count..),vjust=-0.35) + 
  theme_classic() +
  labs(y = "Count", x = "Sex Check Status") +
  scale_fill_aaas() + 
  easy_remove_legend()

sex_check.usable$SNPSEX2 = sex_check.usable$SNPSEX
sex_check.usable$SNPSEX2[sex_check.usable$SNPSEX2 == 1] = "Male" 
sex_check.usable$SNPSEX2[sex_check.usable$SNPSEX2 == 2] = "Female" 
sex_check.usable$SNPSEX2[sex_check.usable$SNPSEX2 == 0] = "Other" 
b = ggplot(sex_check.usable, aes(x = SNPSEX2, fill=STATUS2)) +
  geom_bar(position="dodge", color="black") +
  geom_text(aes(label=..count..), stat='count', position=position_dodge(0.9), vjust=-0.35) +
  labs(y = "Count", x = "SNP Sex Inferred", fill = "Sex Check Status") +
  scale_fill_aaas() + 
  theme_classic()
grid.arrange(a, b, nrow = 1)

Showing the samples with sex missmatches. 8 donors

sex_check.usable.problem = sex_check.usable[sex_check.usable$STATUS2=="PROBLEM",]

#sex_check.usable.problem$donor_id = gsub("(.*)_(.*)","\\1",sex_check.usable.problem$IID)
sex_check.usable.problem2 = sex_check.usable.problem %>% 
  dplyr::select(gsa_id = FID,gsa_id2 = IID,donor_id,reported_sex = sex,inferred_sex = SNPSEX2,status = STATUS2,`F`)

failed_samples_df = register_failed_samples(failed_samples_df, metadata[metadata$Donor_id %in% sex_check.usable.problem2$donor_id, "Donor_tissue"], "SexMismatch")

createDT(sex_check.usable.problem2)

Checking in the RNA data

load(paste0(expression_dir, "Expression_filt_285s.Rdata"))

# TPM
#dim(genes_tpm_exp) # 19376 genes and 285 samples
xist = genes_tpm_exp[which(grepl("ENSG00000229807",rownames(genes_tpm_exp))),]
uty = genes_tpm_exp[which(grepl("ENSG00000183878",rownames(genes_tpm_exp))),]
rna_sex_genes_tpm = data.frame(XIST = t(xist), UTY = t(uty))
colnames(rna_sex_genes_tpm) = c("XIST","UTY")

# Merge with metadata 
#all(metadata.filt$donor_tissue==rownames(rna_sex_genes_tpm)) # TRUE
rna_sex_genes_tpm$`Reported Sex` = metadata.filt$sex

#ggplot(rna_sex_genes_tpm, aes(x=XIST, y=UTY, fill=`Reported Sex`)) +
#  geom_point(shape=21)

# VOOM
genes_counts_voom = as.data.frame(genes_counts_voom, stringsAsFactors = F)
xist = genes_counts_voom[which(grepl("ENSG00000229807",rownames(genes_counts_voom))),]
uty = genes_counts_voom[which(grepl("ENSG00000183878",rownames(genes_counts_voom))),]
rna_sex_genes_voom = data.frame(XIST = t(xist), UTY = t(uty))
colnames(rna_sex_genes_voom) = c("XIST","UTY")

# Merge with metadata 
#all(metadata.filt$donor_tissue==rownames(rna_sex_genes_voom)) # TRUE
rna_sex_genes_voom$`Reported Sex` = factor(toupper(metadata.filt$sex), levels = c("M","F"))
rna_sex_genes_voom$donor_id = metadata.filt$donor_id
rna_sex_genes_voom$donor_tissue = metadata.filt$donor_tissue
rna_sex_genes_voom = rna_sex_genes_voom %>% plyr::join(sex_check.usable[,c("donor_id","STATUS")], by = "donor_id")
avg_xist = mean(rna_sex_genes_voom$XIST[rna_sex_genes_voom$`Reported Sex`=="F"])
avg_uty = mean(rna_sex_genes_voom$UTY[rna_sex_genes_voom$`Reported Sex`=="M"])
rna_sex_genes_voom$toCheck = F

rna_sex_genes_voom$toCheck =  (rna_sex_genes_voom$UTY>0 & rna_sex_genes_voom$`Reported Sex`=="F") | 
  (rna_sex_genes_voom$XIST<2 & rna_sex_genes_voom$`Reported Sex`=="F") |
  (rna_sex_genes_voom$XIST>4.5 & rna_sex_genes_voom$`Reported Sex`=="M")

ggplot(rna_sex_genes_voom, aes(x=XIST, y=UTY, fill=`Reported Sex`)) +
  geom_point(shape=21, size=2, alpha=.8) +
  geom_label_repel(data = subset(rna_sex_genes_voom, toCheck == T),
                   aes(label = donor_tissue),
                   fill = "grey90",
                   size=3,
                   segment.alpha = 0.5,
                   label.size = NA, 
                   alpha = 0.8, 
                   label.padding=.1, 
                   box.padding = .1,
                   na.rm=TRUE,
                   seed = 1234) +
  scale_fill_aaas() +
  labs(x = "XIST (Voom)", y = "UTY (Voom)", fill = "Reported Sex") +
  theme_classic()

Relatedness

Besides the two duplicated samples in the GSA run, there are two more samples with 2nd-degree relatedness

pi_hat = read.table(paste0(anc_dir,"pihat_min0.2.genome"), header = T, stringsAsFactors = F) # Related samples to remove
# ggplot(pi_hat, aes(x=PI_HAT)) +
#   geom_histogram(bins = 20, colour="black", fill="white") +
#   theme_classic()
pi_hat.to_remove = pi_hat[pi_hat$FID1 %in% genotype.fam.merged.usable$V1 & pi_hat$FID2 %in% genotype.fam.merged.usable$V1,]

king.to_remove = read.table(paste0(anc_dir,"kingunrelated_toberemoved.txt"), header = F, stringsAsFactors = F)
king.to_remove.filt = king.to_remove[king.to_remove$V1 %in% genotype.fam.merged.usable$V1, ]

#king -b ~/pd-omics_hydra/glia_omics/genotypes/gsa123456_geno95_hwe6_mind95_het3sd.bed --kinship --unrelated --related --duplicate
# an estimated kinship coefficient range >0.354, [0.177, 0.354], [0.0884, 0.177] and [0.0442, 0.0884] corresponds to duplicate/MZ twin, 1st-degree, 2nd-degree, and 3rd-degree relationships respectively
king = read.table(gzfile(paste0(anc_dir,"king.kin0.gz")), header = T, stringsAsFactors = F)
king.usable = king[king$FID1 %in% genotype.fam.merged.usable$V1 & king$FID2 %in% genotype.fam.merged.usable$V1,]
king.usable.toRemove = king.usable[king.usable$Kinship>0.0884,]
d = data.frame(label = c("duplicate/MZ twin","1st-degree","2nd-degree","3rd-degree"),
               value = c(0.354,0.177,0.0884,0.0442))
king.usable$Class = factor(ifelse(king.usable$Kinship>0.0884,"toRemove","toKeep"), levels = c("toRemove","toKeep"))
ggplot(king.usable, aes(x=Kinship)) +
  geom_histogram(aes(fill=Class), bins = 100, colour="black") +
  scale_fill_manual(values = c("#CB454A","grey")) +
  geom_vline(data=d, mapping=aes(xintercept=value), color="grey") + 
  geom_text(data=d, mapping=aes(x=value, y=1800, label=label), size=3, angle=-90, vjust=-0.4, hjust=0) +
  scale_y_sqrt(breaks = c(0,10,100,250,500,1000,1500)) +
  labs(y = "Counts (square root scale)") +
  #annotation_logticks(sides = "l",) +
  theme_classic() +
  easy_remove_legend()

Showing the duplicated and related samples identified by Kinship analysis

king.usable.toRemove

Merging data

Prioritizing EUR ancestry donor
Donors 18-112 and 15-041 are 2nd degree related. Keeping the donor 18-112 (3 brain regions) and removing donor 15-041 (1 brain region)
For the duplicated GSA runs, we will keep those from GSA6

Numbers:

	Donors	Samples
All	100	261
EUR-only	92	239

Showing below metadata for all ancestries

# Remove related individual
metadata_final = metadata_usable[!metadata_usable$donor_id=="15-041",]
# Add sexCheck information
metadata_final$sex_check_FAIL = metadata_final$donor_id %in% sex_check.usable.problem$donor_id
# Select european ancestry individuals
metadata_final.eur = metadata_final[metadata_final$donor_id %in% genotype.fam.merged.usable.eur$Donor_id,]
# Filter duplicated and related donor from GSA fam table
genotype.fam.merged.usable.final = genotype.fam.merged.usable[!genotype.fam.merged.usable$V1 %in% c("GSA5_109","GSA5_121","GSA5_124","GSA5_171","GSA6_179"),]
rownames(genotype.fam.merged.usable.final) = genotype.fam.merged.usable.final$Donor_id
# Select european ancestry individuals
genotype.fam.merged.usable.final.eur = genotype.fam.merged.usable.final[genotype.fam.merged.usable.final$Donor_id %in% genotype.fam.merged.usable.eur$Donor_id,]

metadata_final$gsa_id = genotype.fam.merged.usable.final[metadata_final$donor_id,"V1"]
metadata_final$merged_gsa_id = genotype.fam.merged.usable.final[metadata_final$donor_id,"merged_id"]
# merged_gsa_id2 have the original GSA id (some samples have donor_ids with _ instead of -)
metadata_final$merged_gsa_id2 = paste0(genotype.fam.merged.usable.final[metadata_final$donor_id,"V1"],"_",genotype.fam.merged.usable.final[metadata_final$donor_id,"V2"])

metadata_final.eur$gsa_id = genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"V1"]
metadata_final.eur$merged_gsa_id = genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"merged_id"]
metadata_final.eur$merged_gsa_id2 = paste0(genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"V1"],"_",genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"V2"])

#write.table(metadata_final, file = paste0(write_dir,"metadata4eQTL_final.txt"), quote = F, row.names = F)
#write.table(metadata_final.eur, file = paste0(write_dir,"metadata4eQTL_final.eur.txt"), quote = F, row.names = F)
#write.table(metadata_final.eur$gsa_id, file = paste0(write_dir,"gsa_ids_4eQTL_final.eur.txt"), quote = F, row.names = F)

#write.table(genotype.fam.merged.usable[,c("V1","V2")], file = paste0(write_dir,"gsa_ids_4eQTL_prefilt.txt"), quote = F, row.names = F)

createDT(metadata_final)

Checking DNA-RNA concordance

We will check for concordance between RNA (using .bam files) and DNA data (using GSA genotypes from .vcf file).

First, we will show if there is matches for 13 samples that were not genotyped.

#mbv_test <- read.table(file = paste0(mbv_dir,"/results/glia_omics_mbv/output/16-116-GTS.bamstat.txt"), header = T, check.names = F)
#ggplot(unique(mbv_test), aes(x = perc_het_consistent, y = perc_hom_consistent)) +
#  geom_point( shape=21, alpha=0.4, size=2, fill="grey" ) +
#  labs(y = "Concordance at HOM (%)", x = "Concordance at HET (%)") +
#  theme_classic()
mbv <- read.table(file = paste0(mbv_dir,"/results/glia_omics_mbv/glia_omics_mbv_mbv_summary.txt"), 
                  header = F, 
                  check.names = F,
                  col.names = c("bam","SampleID","n_geno_missing","n_het_total","n_hom_total","n_het_covered","n_hom_covered","n_het_consistent","n_hom_consistent","perc_het_consistent","perc_hom_consistent","n_het_in_ase"))
mbv$bam_donor_id = gsub("(.*)\\/(.*)-(.*)","\\2",mbv$bam)
mbv$bam_sample_id = gsub("(.*)\\/(.*)\\.bamstat\\.txt","\\2",mbv$bam)
mbv$sample_donor_id = gsub("_","-",gsub("_CER|_GFM","",gsub("(.*?)_(.*?)_(.*)","\\3",mbv$SampleID)))
mbv$sample_id = gsub("(.*?)_(.*?)_(.*)","\\3",mbv$SampleID)
mbv$same_donor = mbv$bam_donor_id==mbv$sample_donor_id

failed_samples_df = register_failed_samples(failed_samples_df, mbv[mbv$same_donor==F,"bam_sample_id"], "MBV")

mbv.missing = mbv[mbv$bam_sample_id %in% metadata.missing$Donor_tissue,]
ggplot(mbv.missing, aes(x = perc_het_consistent, y = perc_hom_consistent, fill = same_donor)) +
  geom_vline(xintercept=0.5, color="grey", linetype = "dashed") + 
  geom_point(shape=21, alpha=0.4, size=2) +
  geom_point(data = subset(mbv.missing, same_donor == F), alpha = 0.8, shape=21, size=2 ) +
  geom_label_repel(data = subset(mbv.missing, same_donor == F), 
                   aes(label = bam_sample_id),
                   fill = "#CB454A",
                   size=3,
                   segment.alpha = 0.3,
                   label.size = NA, 
                   alpha = 0.8, 
                   label.padding=.1, 
                   box.padding = .1,
                   na.rm=TRUE,
                   seed = 1234) +
  geom_label_repel(data = subset(mbv.missing, same_donor == T & perc_het_consistent<0.5), 
                 aes(label = bam_sample_id),
                 fill = "grey",
                 size=3,
                 segment.alpha = 0.3,
                 label.size = NA, 
                 alpha = 0.8, 
                 label.padding=.1, 
                 box.padding = .1,
                 na.rm=TRUE,
                 seed = 1234) +
  expand_limits(y = 1, x = 1) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(values = c("#CB454A","grey")) +
  labs(y = "Concordance at HOM", x = "Concordance at HET", fill = "Matching donor", alpha = "Status") +
  theme_classic()

Showing results for all 301 RNA-seq samples that were genotyped (each dot is one sample). Labels in red are missmatching samples.

# Removing RNAseq missing GSA
mbv = mbv[!mbv$bam_donor_id %in% metadata.missing$Donor_id,] # 301 samples

# sum(mbv$bam_donor_id==mbv$sample_donor_id) # 287 samples matching RNA and DNA
mbv$same_donor = mbv$bam_donor_id==mbv$sample_donor_id 

mbv$status = NA
mbv$status[mbv$same_donor==T] = "Match"
mbv$status[mbv$same_donor==F] = "Not Match"
mbv$status[mbv$bam_donor_id %in% metadata.missing$Donor_id] = "Not Genotyped"

ggplot(mbv, aes(x = perc_het_consistent, y = perc_hom_consistent, fill = same_donor)) +
  geom_vline(xintercept=0.5, color="grey", linetype = "dashed") + 
  geom_point(shape=21, alpha=0.4, size=2) +
  geom_point(data = subset(mbv, same_donor == F), alpha = 0.8, shape=21, size=2 ) +
  geom_label_repel(data = subset(mbv, same_donor == F), 
                   aes(label = bam_donor_id),
                   fill = "#CB454A",
                   size=3,
                   segment.alpha = 0.3,
                   label.size = NA, 
                   alpha = 0.8, 
                   label.padding=.1, 
                   box.padding = .1,
                   na.rm=TRUE,
                   seed = 1234) +
  expand_limits(y = 1, x = 1) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(values = c("#CB454A","grey")) +
  labs(y = "Concordance at HOM", x = "Concordance at HET", fill = "Matching donor", alpha = "Status") +
  theme_classic()

Now showing only the samples where donors passed all previous QC metrics (261 samples/100 donors)

mbv.filtered = mbv[mbv$bam_sample_id %in% metadata_final$donor_tissue,]
ggplot(mbv.filtered, aes(x = perc_het_consistent, y = perc_hom_consistent, fill = same_donor)) +
  geom_vline(xintercept=0.5, color="grey", linetype = "dashed") + 
  geom_point(shape=21, alpha=0.4, size=2) +
  geom_point(data = subset(mbv.filtered, same_donor == F), alpha = 0.8, shape=21, size=2 ) +
  geom_label_repel(data = subset(mbv.filtered, same_donor == F), 
                   aes(label = bam_sample_id),
                   fill = "#CB454A",
                   size=3,
                   segment.alpha = 0.3,
                   label.size = NA, 
                   alpha = 0.8, 
                   label.padding=.1, 
                   box.padding = .1,
                   na.rm=TRUE,
                   seed = 1234) +
  geom_label_repel(data = subset(mbv.filtered, same_donor == T & perc_het_consistent<0.5), 
                 aes(label = bam_sample_id),
                 fill = "grey",
                 size=3,
                 segment.alpha = 0.3,
                 label.size = NA, 
                 alpha = 0.8, 
                 label.padding=.1, 
                 box.padding = .1,
                 na.rm=TRUE,
                 seed = 1234) +
  expand_limits(y = 1, x = 1) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(values = c("#CB454A","grey")) +
  labs(y = "Concordance at HOM", x = "Concordance at HET", fill = "Matching donor", alpha = "Status") +
  theme_classic()

From 261 (after-QC) RNAseq samples, 258 (98 donors) matched donors with DNA.

Here is the list of missmatching samples.

#length(unique(mbv.filtered$bam)) # 261 RNAseq bam files
#length(unique(metadata$Donor_tissue)) # 314
#length(unique(mbv$bam_sample_id)) # 291
#length(unique(mbv$bam_donor_id)) # 105 donors (from RNAseq bam ids)
#length(unique(mbv$SampleID)) # 108 donors (from GSA) - best matches for each RNAseq sample
#sum(unique(mbv$bam_sample_id) %in% unique(metadata$Donor_tissue)) # 291 samples (bam ids) are in the metadata
#sum(unique(mbv$bam_donor_id) %in% unique(metadata$Donor_id)) # 105 donors (bam ids) are in the metadata

#sum(mbv$bam_donor_id==mbv$sample_donor_id) # 287 RNAseq samples are matching donors with DNA
#sum(mbv.filtered$bam_donor_id==mbv.filtered$sample_donor_id) # 258 RNAseq samples are matching donors with DNA

mbv_match = mbv.filtered[which(mbv.filtered$bam_donor_id==mbv.filtered$sample_donor_id),]
#length(unique(mbv_match$bam_donor_id)) # 98 donors
mbv_miss = mbv.filtered[which(!mbv.filtered$bam_donor_id==mbv.filtered$sample_donor_id),]

mbv_miss$PassMBV = mbv_miss$perc_het_consistent>0.5
createDT(mbv_miss)

Final results

After removing non-concordant samples.

Final numbers:

	Donors	Samples
All	98	258
EUR-only	90	236

Showing below metadata for all ancestries

metadata_final2 = metadata_final[metadata_final$donor_tissue %in% mbv_match$bam_sample_id,]
metadata_final2 = dplyr::left_join(metadata_final2, mds.usable, by = c("merged_gsa_id" = "IID"))
#length(unique(metadata_final2$donor_id)) # 98 donors
# Select european ancestry individuals
metadata_final2.eur = metadata_final2[metadata_final2$donor_id %in% genotype.fam.merged.usable.eur$Donor_id,]
#length(unique(metadata_final2.eur$donor_id)) # 90 donors

write.table(metadata_final2, file = paste0(write_dir,"metadata4eQTL.txt"), quote = F, row.names = F, sep = "\t")
write.table(metadata_final2.eur, file = paste0(write_dir,"metadata4eQTL_eur.txt"), quote = F, row.names = F, sep = "\t")

# Select ids for filtering samples in the plink file
write.table(metadata_final2[,c("merged_gsa_id2")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.txt"), quote = F, row.names = F, col.names = F, sep=" ")
# Mapping GSA names to donor_id
write.table(metadata_final2[,c("merged_gsa_id2","donor_id")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.namesMap.txt"), quote = F, row.names = F, col.names = F, sep=" ")

# Select ids for filtering samples in the plink file
write.table(metadata_final2.eur[,c("merged_gsa_id2")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.eur.txt"), quote = F, row.names = F)
# Mapping GSA names to donor_id
write.table(metadata_final2.eur[,c("merged_gsa_id2","donor_id")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.eur.namesMap.txt"), quote = F, row.names = F, col.names = F, sep=" ")

createDT(metadata_final2)

Final QC status

# table(failed_samples_df$status) # 79 failed, 235 Ok
createDT(failed_samples_df)

Session info

sessionInfo()

## R version 3.6.3 (2020-02-29)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] factoextra_1.0.6 ggrepel_0.8.2    scales_1.1.0     gridExtra_2.3   
##  [5] UpSetR_1.4.0     ggeasy_0.1.2     ggsci_2.9        DT_0.13         
##  [9] forcats_0.5.0    stringr_1.4.0    dplyr_0.8.5      purrr_0.3.3     
## [13] readr_1.3.1      tidyr_1.0.2      tibble_3.0.0     ggplot2_3.3.0   
## [17] tidyverse_1.3.0  pacman_0.5.1    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.4        lubridate_1.7.4   lattice_0.20-40   assertthat_0.2.1 
##  [5] digest_0.6.25     R6_2.4.1          cellranger_1.1.0  plyr_1.8.6       
##  [9] backports_1.1.5   reprex_0.3.0      evaluate_0.14     httr_1.4.1       
## [13] pillar_1.4.3      rlang_0.4.5       readxl_1.3.1      rstudioapi_0.11  
## [17] rmarkdown_2.1     labeling_0.3      htmlwidgets_1.5.1 munsell_0.5.0    
## [21] broom_0.5.5       compiler_3.6.3    modelr_0.1.6      xfun_0.12        
## [25] pkgconfig_2.0.3   htmltools_0.4.0   tidyselect_1.0.0  fansi_0.4.1      
## [29] crayon_1.3.4      dbplyr_1.4.2      withr_2.1.2       grid_3.6.3       
## [33] nlme_3.1-145      jsonlite_1.6.1    gtable_0.3.0      lifecycle_0.2.0  
## [37] DBI_1.1.0         magrittr_1.5      cli_2.0.2         stringi_1.4.6    
## [41] farver_2.0.3      fs_1.3.2          xml2_1.2.5        ellipsis_0.3.0   
## [45] generics_0.0.2    vctrs_0.2.4       tools_3.6.3       glue_1.3.2       
## [49] crosstalk_1.1.0.1 hms_0.5.3         yaml_2.2.1        colorspace_1.4-1 
## [53] rvest_0.3.5       knitr_1.28        haven_2.2.0

Checking GSA results

Genotypes

Katia de Paiva Lopes
Ricardo A. Vialle
Raj Lab
Department of Neuroscience
Icahn School of Medicine at Mount Sinai
NYC, New York

2020-04-21

GSA fam file

From RNA-seq

From GSA

Plot showing overlaps between GSA and RNAseq

Missing samples

Samples removed in GSA QC

Usable samples

Checking ancestry numbers

Sex check

Relatedness

Merging data

Checking DNA-RNA concordance

Final results

Final QC status

Session info

Checking GSA results

Genotypes

Katia de Paiva Lopes Ricardo A. Vialle Raj Lab Department of Neuroscience Icahn School of Medicine at Mount Sinai NYC, New York

2020-04-21

GSA fam file

From RNA-seq

From GSA

Plot showing overlaps between GSA and RNAseq

Missing samples

Samples removed in GSA QC

Usable samples

Checking ancestry numbers

Sex check

Relatedness

Merging data

Checking DNA-RNA concordance

Final results

Final QC status

Session info

Katia de Paiva Lopes
Ricardo A. Vialle
Raj Lab
Department of Neuroscience
Icahn School of Medicine at Mount Sinai
NYC, New York