GSA fam file

This file contains all samples processed by Michael Chao, not only the ones from this project.

First columns is the GSA id. Second column is our ID.

QCed column show samples kept in the file gsa123456_geno95_hwe6_mind95_het3sd

### READ METADATA
# All RNA-seq samples (314)
metadata = read.table(paste0(metadata_path,"metadata_314s_28feb2020.txt"), header = T, check.names = F, sep="\t", stringsAsFactors = F)
# Only RNA-seq samples that pass QC (285)
metadata.filt = read.table(paste0(metadata_path,"metadata_285filt_eng_3mar2020.txt"), header = T, check.names = F, sep="\t", stringsAsFactors = F)
# Just checking if donor IDs can be inferred from sample IDs
#length(unique(gsub("(.*)-(.*)","\\1",metadata$Donor_tissue))) # 115 donors from RNAseq (preQC)
#all(gsub("(.*)-(.*)","\\1",metadata$Donor_tissue) == metadata$Donor_id) # All donor IDs can be inferred from Sample IDs
#length(unique(gsub("(.*)-(.*)","\\1",metadata.filt$donor_tissue))) # 111 donor from RNAseq (afterQC)
#all(gsub("(.*)-(.*)","\\1",metadata.filt$donor_tissue) == metadata.filt$donor_id) # Again, all donor IDs can be inferred from Sample IDs

failed_samples_df = data.frame(donor_tissue = metadata$Donor_tissue, status = "Ok", cause = "", stringsAsFactors = F)
register_failed_samples <- function(failed_samples_df, donor_tissue, cause){
  failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "status"] <- "Failed"
  failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "cause"] <- ifelse(failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "cause"]=="", 
                                                                                         cause, paste0(failed_samples_df[failed_samples_df$donor_tissue %in% donor_tissue, "cause"],";",cause))
  return(failed_samples_df)
}
failed_samples_df = register_failed_samples(failed_samples_df, metadata[!metadata$Donor_tissue %in% metadata.filt$donor_tissue,"Donor_tissue"],"RNAseq_QC")

### READ GENOTYPE FAM FILES
genotype.fam = read.table(paste0(genotype_dir,"gsa123456.fam"), header = F, stringsAsFactors = F)
genotype.fam$Donor_id = gsub("_","-",gsub("(.*)_GFM","\\1",gsub("(.*)_CER","\\1",genotype.fam$V2))) # Remove chars after underline to match samples names in the RNAseq tables
genotype.fam.QCed = read.table(paste0(genotype_dir,"gsa123456_geno95_hwe6_mind95_het3sd.fam"), header = F, stringsAsFactors = F)
genotype.fam.QCed$Donor_id = gsub("_","-",gsub("(.*)_GFM","\\1",gsub("(.*)_CER","\\1",genotype.fam.QCed$V2))) # Remove chars after underline to match samples names in the RNAseq tables
### READ HET QC 
genotype.hetQC = read.table(paste0(work_dir,"fail-het-qc.txt"), header = T, stringsAsFactors = F, check.names = F)
genotype.missingQC = read.table(paste0(work_dir,"plink.imiss"), header = T, stringsAsFactors = F, check.names = F)

# length(unique(metadata$Donor_id)) # 115 donors
# sum(unique(metadata$Donor_id) %in% genotype.fam$Donor_id) # 109 of them have genotypes (pre GSA QC)
# sum(unique(metadata$Donor_id) %in% genotype.fam.QCed$Donor_id) # 105 of them have genotypes (after GSA QC)

# length(unique(metadata.filt$donor_id)) # 111 donors after RNAseq QC
# sum(unique(metadata.filt$donor_id) %in% genotype.fam$Donor_id) # 105 of them have genotypes (pre GSA QC)
# sum(unique(metadata.filt$donor_id) %in% genotype.fam.QCed$Donor_id) # 101 of them have genotypes (after GSA QC)

genotype.fam.merged = merge(genotype.fam,genotype.fam.QCed, by = c("V1","V2"), all.x = T) %>% dplyr::rename(V3 = V3.x, V4 = V4.x, V5 = V5.x, V6 = V6.x, Donor_id = Donor_id.x) %>% mutate(QCed = !is.na(V3.y)) %>% dplyr::select(V1,V2,V3,V4,V5,V6,QCed,Donor_id)
createDT(genotype.fam.merged)

From RNA-seq

Comparing donor and samples overlapping the GSA files. Showing before and after RNAseq QC

From GSA

Same thing, but now showing before and after GSA QC

Duplicated IDs? Showing only for IDs matching RNAseq samples

Missing samples

Samples with RNAseq that were not genotyped

Samples removed in GSA QC

MG samples that were genotyped but didn’t pass the GSA QC steps were because of individual missing rate (>0.15)

List of samples removed due to missingness in the genotyping step

Usable samples

After GSA QC AND RNAseq QC (101 donors and 262 samples)

Checking ancestry numbers

Ancestry defined with differenct thresholds per population (Michael Chao analysis)

UNKNOWN are samples not present in any of the ancestry files (probably didnt pass SD thresdhold)

aj_samples = read.table(paste0(anc_dir,"gsa123456_aj_samples_10MDS_mean3sd.txt"), header = T, stringsAsFactors = F)
afr_samples = read.table(paste0(anc_dir,"gsa_afr_samples_10MDS_4SD.txt"), header = F, stringsAsFactors = F)
amr_samples = read.table(paste0(anc_dir,"gsa_amr_samples_10MDS_6SD.txt"), header = F, stringsAsFactors = F)
eas_samples = read.table(paste0(anc_dir,"gsa_eas_samples_10MDS_3SD.txt"), header = F, stringsAsFactors = F)
eur_samples = read.table(paste0(anc_dir,"gsa_eur_samples_10MDS_6SD.txt"), header = F, stringsAsFactors = F)
sas_samples = read.table(paste0(anc_dir,"gsa_sas_samples_10MDS_6SD.txt"), header = F, stringsAsFactors = F)

anc_samples = rbind(as.data.frame(cbind(aj_samples$IID,"AJ")), 
                    cbind(afr_samples,V2 ="AFR"),
                    cbind(amr_samples,V2 ="AMR"),
                    cbind(eas_samples,V2 ="EAS"),
                    cbind(eur_samples,V2 ="EUR"),
                    cbind(sas_samples,V2 ="SAS"))

genotype.fam.merged$merged_id = paste(genotype.fam.merged$V1,genotype.fam.merged$V2, sep = "_")

genotype.fam.merged.usable = genotype.fam.merged[genotype.fam.merged$Donor_id %in% unique(metadata_usable$donor_id),]
genotype.fam.merged.usable.afr = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% afr_samples$V1,]
genotype.fam.merged.usable.amr = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% amr_samples$V1,]
genotype.fam.merged.usable.eas = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% eas_samples$V1,]
genotype.fam.merged.usable.eur = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% eur_samples$V1,]
genotype.fam.merged.usable.sas = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% sas_samples$V1,]
genotype.fam.merged.usable.aj = genotype.fam.merged.usable[genotype.fam.merged.usable$merged_id %in% aj_samples$IID,]
#genotype.fam.merged.usable.eur[!genotype.fam.merged.usable.eur$merged_id %in% genotype.fam.merged.usable.aj$merged_id,]
genotype.fam.merged.usable.unknown = genotype.fam.merged.usable[!genotype.fam.merged.usable$merged_id %in% anc_samples$V1,]

data.frame(`Ancestry` = c("AFR","AMR","EAS","EUR","SAS","AJ (already included in EUR)","UNKNOWN"), 
           `Samples` = c(length(unique(genotype.fam.merged.usable.afr$Donor_id)),
                         length(unique(genotype.fam.merged.usable.amr$Donor_id)),
                         length(unique(genotype.fam.merged.usable.eas$Donor_id)),
                         length(unique(genotype.fam.merged.usable.eur$Donor_id)),
                         length(unique(genotype.fam.merged.usable.sas$Donor_id)),
                         length(unique(genotype.fam.merged.usable.aj$Donor_id)),
                         length(unique(genotype.fam.merged.usable.unknown)$Donor_id)), 
           `Duplicated` = c(sum(duplicated(genotype.fam.merged.usable.afr$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.amr$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.eas$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.eur$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.sas$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.aj$Donor_id)),
                            sum(duplicated(genotype.fam.merged.usable.unknown$Donor_id))))

Sex check

Comparing reported sex information with inferred sex from genotypes

sex_check = read.table(paste0(anc_dir,"plink.sexcheck"), header = T, stringsAsFactors = F)
sex_check.usable = sex_check[sex_check$FID %in% unique(genotype.fam.merged.usable$V1),]
sex_check.usable$donor_id  = gsub("_","-",gsub("(.*)_GFM","\\1",gsub("(.*)_CER","\\1",sex_check.usable$IID)))
sex_check.usable = sex_check.usable[sex_check.usable$donor_id %in% metadata.filt$donor_id,]

# 1 is M
# 2 if F
# Compare with our metadata
sex_check.usable = sex_check.usable %>% left_join(unique(metadata[,c("Donor_id","sex")]), by = c("donor_id" = "Donor_id"))
sex_check.usable$sex2 = sex_check.usable$sex 
sex_check.usable$sex2[sex_check.usable$sex2=="m"] = 1
sex_check.usable$sex2[sex_check.usable$sex2=="f"] = 2
sex_check.usable$STATUS2 = sex_check.usable$SNPSEX == sex_check.usable$sex2
sex_check.usable$STATUS2[sex_check.usable$STATUS2==T] = "OK"
sex_check.usable$STATUS2[sex_check.usable$STATUS2==F] = "PROBLEM"

#table(metadata_usable %>% distinct(donor_id,sex) %>% dplyr::select(sex),useNA="ifany")
#table(sex_check.usable$SNPSEX2,useNA="ifany")

a = ggplot(sex_check.usable, aes(x = STATUS2, fill=STATUS2)) + 
  geom_bar(color="black") + 
  geom_text(stat='count', aes(label=..count..),vjust=-0.35) + 
  theme_classic() +
  labs(y = "Count", x = "Sex Check Status") +
  scale_fill_aaas() + 
  easy_remove_legend()

sex_check.usable$SNPSEX2 = sex_check.usable$SNPSEX
sex_check.usable$SNPSEX2[sex_check.usable$SNPSEX2 == 1] = "Male" 
sex_check.usable$SNPSEX2[sex_check.usable$SNPSEX2 == 2] = "Female" 
sex_check.usable$SNPSEX2[sex_check.usable$SNPSEX2 == 0] = "Other" 
b = ggplot(sex_check.usable, aes(x = SNPSEX2, fill=STATUS2)) +
  geom_bar(position="dodge", color="black") +
  geom_text(aes(label=..count..), stat='count', position=position_dodge(0.9), vjust=-0.35) +
  labs(y = "Count", x = "SNP Sex Inferred", fill = "Sex Check Status") +
  scale_fill_aaas() + 
  theme_classic()
grid.arrange(a, b, nrow = 1)

Showing the samples with sex missmatches. 8 donors

Checking in the RNA data

load(paste0(expression_dir, "Expression_filt_285s.Rdata"))

# TPM
#dim(genes_tpm_exp) # 19376 genes and 285 samples
xist = genes_tpm_exp[which(grepl("ENSG00000229807",rownames(genes_tpm_exp))),]
uty = genes_tpm_exp[which(grepl("ENSG00000183878",rownames(genes_tpm_exp))),]
rna_sex_genes_tpm = data.frame(XIST = t(xist), UTY = t(uty))
colnames(rna_sex_genes_tpm) = c("XIST","UTY")

# Merge with metadata 
#all(metadata.filt$donor_tissue==rownames(rna_sex_genes_tpm)) # TRUE
rna_sex_genes_tpm$`Reported Sex` = metadata.filt$sex

#ggplot(rna_sex_genes_tpm, aes(x=XIST, y=UTY, fill=`Reported Sex`)) +
#  geom_point(shape=21)

# VOOM
genes_counts_voom = as.data.frame(genes_counts_voom, stringsAsFactors = F)
xist = genes_counts_voom[which(grepl("ENSG00000229807",rownames(genes_counts_voom))),]
uty = genes_counts_voom[which(grepl("ENSG00000183878",rownames(genes_counts_voom))),]
rna_sex_genes_voom = data.frame(XIST = t(xist), UTY = t(uty))
colnames(rna_sex_genes_voom) = c("XIST","UTY")

# Merge with metadata 
#all(metadata.filt$donor_tissue==rownames(rna_sex_genes_voom)) # TRUE
rna_sex_genes_voom$`Reported Sex` = factor(toupper(metadata.filt$sex), levels = c("M","F"))
rna_sex_genes_voom$donor_id = metadata.filt$donor_id
rna_sex_genes_voom$donor_tissue = metadata.filt$donor_tissue
rna_sex_genes_voom = rna_sex_genes_voom %>% plyr::join(sex_check.usable[,c("donor_id","STATUS")], by = "donor_id")
avg_xist = mean(rna_sex_genes_voom$XIST[rna_sex_genes_voom$`Reported Sex`=="F"])
avg_uty = mean(rna_sex_genes_voom$UTY[rna_sex_genes_voom$`Reported Sex`=="M"])
rna_sex_genes_voom$toCheck = F

rna_sex_genes_voom$toCheck =  (rna_sex_genes_voom$UTY>0 & rna_sex_genes_voom$`Reported Sex`=="F") | 
  (rna_sex_genes_voom$XIST<2 & rna_sex_genes_voom$`Reported Sex`=="F") |
  (rna_sex_genes_voom$XIST>4.5 & rna_sex_genes_voom$`Reported Sex`=="M")

ggplot(rna_sex_genes_voom, aes(x=XIST, y=UTY, fill=`Reported Sex`)) +
  geom_point(shape=21, size=2, alpha=.8) +
  geom_label_repel(data = subset(rna_sex_genes_voom, toCheck == T),
                   aes(label = donor_tissue),
                   fill = "grey90",
                   size=3,
                   segment.alpha = 0.5,
                   label.size = NA, 
                   alpha = 0.8, 
                   label.padding=.1, 
                   box.padding = .1,
                   na.rm=TRUE,
                   seed = 1234) +
  scale_fill_aaas() +
  labs(x = "XIST (Voom)", y = "UTY (Voom)", fill = "Reported Sex") +
  theme_classic()

Relatedness

Besides the two duplicated samples in the GSA run, there are two more samples with 2nd-degree relatedness

pi_hat = read.table(paste0(anc_dir,"pihat_min0.2.genome"), header = T, stringsAsFactors = F) # Related samples to remove
# ggplot(pi_hat, aes(x=PI_HAT)) +
#   geom_histogram(bins = 20, colour="black", fill="white") +
#   theme_classic()
pi_hat.to_remove = pi_hat[pi_hat$FID1 %in% genotype.fam.merged.usable$V1 & pi_hat$FID2 %in% genotype.fam.merged.usable$V1,]

king.to_remove = read.table(paste0(anc_dir,"kingunrelated_toberemoved.txt"), header = F, stringsAsFactors = F)
king.to_remove.filt = king.to_remove[king.to_remove$V1 %in% genotype.fam.merged.usable$V1, ]

#king -b ~/pd-omics_hydra/glia_omics/genotypes/gsa123456_geno95_hwe6_mind95_het3sd.bed --kinship --unrelated --related --duplicate
# an estimated kinship coefficient range >0.354, [0.177, 0.354], [0.0884, 0.177] and [0.0442, 0.0884] corresponds to duplicate/MZ twin, 1st-degree, 2nd-degree, and 3rd-degree relationships respectively
king = read.table(gzfile(paste0(anc_dir,"king.kin0.gz")), header = T, stringsAsFactors = F)
king.usable = king[king$FID1 %in% genotype.fam.merged.usable$V1 & king$FID2 %in% genotype.fam.merged.usable$V1,]
king.usable.toRemove = king.usable[king.usable$Kinship>0.0884,]
d = data.frame(label = c("duplicate/MZ twin","1st-degree","2nd-degree","3rd-degree"),
               value = c(0.354,0.177,0.0884,0.0442))
king.usable$Class = factor(ifelse(king.usable$Kinship>0.0884,"toRemove","toKeep"), levels = c("toRemove","toKeep"))
ggplot(king.usable, aes(x=Kinship)) +
  geom_histogram(aes(fill=Class), bins = 100, colour="black") +
  scale_fill_manual(values = c("#CB454A","grey")) +
  geom_vline(data=d, mapping=aes(xintercept=value), color="grey") + 
  geom_text(data=d, mapping=aes(x=value, y=1800, label=label), size=3, angle=-90, vjust=-0.4, hjust=0) +
  scale_y_sqrt(breaks = c(0,10,100,250,500,1000,1500)) +
  labs(y = "Counts (square root scale)") +
  #annotation_logticks(sides = "l",) +
  theme_classic() +
  easy_remove_legend()

Showing the duplicated and related samples identified by Kinship analysis

Merging data

  • Prioritizing EUR ancestry donor
  • Donors 18-112 and 15-041 are 2nd degree related. Keeping the donor 18-112 (3 brain regions) and removing donor 15-041 (1 brain region)
  • For the duplicated GSA runs, we will keep those from GSA6

Numbers:

Donors Samples
All 100 261
EUR-only 92 239

Showing below metadata for all ancestries

# Remove related individual
metadata_final = metadata_usable[!metadata_usable$donor_id=="15-041",]
# Add sexCheck information
metadata_final$sex_check_FAIL = metadata_final$donor_id %in% sex_check.usable.problem$donor_id
# Select european ancestry individuals
metadata_final.eur = metadata_final[metadata_final$donor_id %in% genotype.fam.merged.usable.eur$Donor_id,]
# Filter duplicated and related donor from GSA fam table
genotype.fam.merged.usable.final = genotype.fam.merged.usable[!genotype.fam.merged.usable$V1 %in% c("GSA5_109","GSA5_121","GSA5_124","GSA5_171","GSA6_179"),]
rownames(genotype.fam.merged.usable.final) = genotype.fam.merged.usable.final$Donor_id
# Select european ancestry individuals
genotype.fam.merged.usable.final.eur = genotype.fam.merged.usable.final[genotype.fam.merged.usable.final$Donor_id %in% genotype.fam.merged.usable.eur$Donor_id,]

metadata_final$gsa_id = genotype.fam.merged.usable.final[metadata_final$donor_id,"V1"]
metadata_final$merged_gsa_id = genotype.fam.merged.usable.final[metadata_final$donor_id,"merged_id"]
# merged_gsa_id2 have the original GSA id (some samples have donor_ids with _ instead of -)
metadata_final$merged_gsa_id2 = paste0(genotype.fam.merged.usable.final[metadata_final$donor_id,"V1"],"_",genotype.fam.merged.usable.final[metadata_final$donor_id,"V2"])

metadata_final.eur$gsa_id = genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"V1"]
metadata_final.eur$merged_gsa_id = genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"merged_id"]
metadata_final.eur$merged_gsa_id2 = paste0(genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"V1"],"_",genotype.fam.merged.usable.final.eur[metadata_final.eur$donor_id,"V2"])

#write.table(metadata_final, file = paste0(write_dir,"metadata4eQTL_final.txt"), quote = F, row.names = F)
#write.table(metadata_final.eur, file = paste0(write_dir,"metadata4eQTL_final.eur.txt"), quote = F, row.names = F)
#write.table(metadata_final.eur$gsa_id, file = paste0(write_dir,"gsa_ids_4eQTL_final.eur.txt"), quote = F, row.names = F)

#write.table(genotype.fam.merged.usable[,c("V1","V2")], file = paste0(write_dir,"gsa_ids_4eQTL_prefilt.txt"), quote = F, row.names = F)

createDT(metadata_final)

Checking DNA-RNA concordance

We will check for concordance between RNA (using .bam files) and DNA data (using GSA genotypes from .vcf file).

First, we will show if there is matches for 13 samples that were not genotyped.

#mbv_test <- read.table(file = paste0(mbv_dir,"/results/glia_omics_mbv/output/16-116-GTS.bamstat.txt"), header = T, check.names = F)
#ggplot(unique(mbv_test), aes(x = perc_het_consistent, y = perc_hom_consistent)) +
#  geom_point( shape=21, alpha=0.4, size=2, fill="grey" ) +
#  labs(y = "Concordance at HOM (%)", x = "Concordance at HET (%)") +
#  theme_classic()
mbv <- read.table(file = paste0(mbv_dir,"/results/glia_omics_mbv/glia_omics_mbv_mbv_summary.txt"), 
                  header = F, 
                  check.names = F,
                  col.names = c("bam","SampleID","n_geno_missing","n_het_total","n_hom_total","n_het_covered","n_hom_covered","n_het_consistent","n_hom_consistent","perc_het_consistent","perc_hom_consistent","n_het_in_ase"))
mbv$bam_donor_id = gsub("(.*)\\/(.*)-(.*)","\\2",mbv$bam)
mbv$bam_sample_id = gsub("(.*)\\/(.*)\\.bamstat\\.txt","\\2",mbv$bam)
mbv$sample_donor_id = gsub("_","-",gsub("_CER|_GFM","",gsub("(.*?)_(.*?)_(.*)","\\3",mbv$SampleID)))
mbv$sample_id = gsub("(.*?)_(.*?)_(.*)","\\3",mbv$SampleID)
mbv$same_donor = mbv$bam_donor_id==mbv$sample_donor_id

failed_samples_df = register_failed_samples(failed_samples_df, mbv[mbv$same_donor==F,"bam_sample_id"], "MBV")

mbv.missing = mbv[mbv$bam_sample_id %in% metadata.missing$Donor_tissue,]
ggplot(mbv.missing, aes(x = perc_het_consistent, y = perc_hom_consistent, fill = same_donor)) +
  geom_vline(xintercept=0.5, color="grey", linetype = "dashed") + 
  geom_point(shape=21, alpha=0.4, size=2) +
  geom_point(data = subset(mbv.missing, same_donor == F), alpha = 0.8, shape=21, size=2 ) +
  geom_label_repel(data = subset(mbv.missing, same_donor == F), 
                   aes(label = bam_sample_id),
                   fill = "#CB454A",
                   size=3,
                   segment.alpha = 0.3,
                   label.size = NA, 
                   alpha = 0.8, 
                   label.padding=.1, 
                   box.padding = .1,
                   na.rm=TRUE,
                   seed = 1234) +
  geom_label_repel(data = subset(mbv.missing, same_donor == T & perc_het_consistent<0.5), 
                 aes(label = bam_sample_id),
                 fill = "grey",
                 size=3,
                 segment.alpha = 0.3,
                 label.size = NA, 
                 alpha = 0.8, 
                 label.padding=.1, 
                 box.padding = .1,
                 na.rm=TRUE,
                 seed = 1234) +
  expand_limits(y = 1, x = 1) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(values = c("#CB454A","grey")) +
  labs(y = "Concordance at HOM", x = "Concordance at HET", fill = "Matching donor", alpha = "Status") +
  theme_classic()

Showing results for all 301 RNA-seq samples that were genotyped (each dot is one sample). Labels in red are missmatching samples.

Now showing only the samples where donors passed all previous QC metrics (261 samples/100 donors)

From 261 (after-QC) RNAseq samples, 258 (98 donors) matched donors with DNA.

Here is the list of missmatching samples.

Final results

After removing non-concordant samples.

Final numbers:

Donors Samples
All 98 258
EUR-only 90 236

Showing below metadata for all ancestries

metadata_final2 = metadata_final[metadata_final$donor_tissue %in% mbv_match$bam_sample_id,]
metadata_final2 = dplyr::left_join(metadata_final2, mds.usable, by = c("merged_gsa_id" = "IID"))
#length(unique(metadata_final2$donor_id)) # 98 donors
# Select european ancestry individuals
metadata_final2.eur = metadata_final2[metadata_final2$donor_id %in% genotype.fam.merged.usable.eur$Donor_id,]
#length(unique(metadata_final2.eur$donor_id)) # 90 donors

write.table(metadata_final2, file = paste0(write_dir,"metadata4eQTL.txt"), quote = F, row.names = F, sep = "\t")
write.table(metadata_final2.eur, file = paste0(write_dir,"metadata4eQTL_eur.txt"), quote = F, row.names = F, sep = "\t")

# Select ids for filtering samples in the plink file
write.table(metadata_final2[,c("merged_gsa_id2")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.txt"), quote = F, row.names = F, col.names = F, sep=" ")
# Mapping GSA names to donor_id
write.table(metadata_final2[,c("merged_gsa_id2","donor_id")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.namesMap.txt"), quote = F, row.names = F, col.names = F, sep=" ")

# Select ids for filtering samples in the plink file
write.table(metadata_final2.eur[,c("merged_gsa_id2")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.eur.txt"), quote = F, row.names = F)
# Mapping GSA names to donor_id
write.table(metadata_final2.eur[,c("merged_gsa_id2","donor_id")], file = paste0(write_dir,"gsa_ids_4eQTL_final2.eur.namesMap.txt"), quote = F, row.names = F, col.names = F, sep=" ")

createDT(metadata_final2)

Final QC status

Session info

## R version 3.6.3 (2020-02-29)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] factoextra_1.0.6 ggrepel_0.8.2    scales_1.1.0     gridExtra_2.3   
##  [5] UpSetR_1.4.0     ggeasy_0.1.2     ggsci_2.9        DT_0.13         
##  [9] forcats_0.5.0    stringr_1.4.0    dplyr_0.8.5      purrr_0.3.3     
## [13] readr_1.3.1      tidyr_1.0.2      tibble_3.0.0     ggplot2_3.3.0   
## [17] tidyverse_1.3.0  pacman_0.5.1    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.4        lubridate_1.7.4   lattice_0.20-40   assertthat_0.2.1 
##  [5] digest_0.6.25     R6_2.4.1          cellranger_1.1.0  plyr_1.8.6       
##  [9] backports_1.1.5   reprex_0.3.0      evaluate_0.14     httr_1.4.1       
## [13] pillar_1.4.3      rlang_0.4.5       readxl_1.3.1      rstudioapi_0.11  
## [17] rmarkdown_2.1     labeling_0.3      htmlwidgets_1.5.1 munsell_0.5.0    
## [21] broom_0.5.5       compiler_3.6.3    modelr_0.1.6      xfun_0.12        
## [25] pkgconfig_2.0.3   htmltools_0.4.0   tidyselect_1.0.0  fansi_0.4.1      
## [29] crayon_1.3.4      dbplyr_1.4.2      withr_2.1.2       grid_3.6.3       
## [33] nlme_3.1-145      jsonlite_1.6.1    gtable_0.3.0      lifecycle_0.2.0  
## [37] DBI_1.1.0         magrittr_1.5      cli_2.0.2         stringi_1.4.6    
## [41] farver_2.0.3      fs_1.3.2          xml2_1.2.5        ellipsis_0.3.0   
## [45] generics_0.0.2    vctrs_0.2.4       tools_3.6.3       glue_1.3.2       
## [49] crosstalk_1.1.0.1 hms_0.5.3         yaml_2.2.1        colorspace_1.4-1 
## [53] rvest_0.3.5       knitr_1.28        haven_2.2.0