library(DEP) library(dplyr) library(openxlsx) library(ReactomePA) library(org.Hs.eg.db) library(ggfortify) library(stringr) library(reshape2) library(RColorBrewer) library(clusterProfiler) wdir <- "vavassori_lnp2022_proteomics" out_dir <- paste(wdir, "paper_plots", sep = "/") dir.create(out_dir, showWarnings = F) data_dir <- paste(wdir, "paper_data", sep = "/") design <- read.xlsx(xlsxFile = paste(data_dir, "ProteomicsDesign.xlsx", sep = "/")) str(design) # Day 4 d4_design <- design[4:18,] d4_design$label dd <- read.xlsx(xlsxFile = paste(data_dir, "proteinGroups_SR-TIGET_200112.xlsx", sep = "/")) colnames(dd) for (col in d4_design$label) { dd[[col]] <- as.numeric(dd[[col]]) dd[[col]] <- 2^dd[[col]] dd[[col]][is.na(dd[[col]])] <- 0 } head(dd) dd_unique <- make_unique(dd, "Gene.names", "Protein.IDs", delim = ";") colnames(dd_unique) str(dd_unique) samp_id <- match(d4_design$label, colnames(dd_unique)) print(samp_id) data_dd <- make_se(proteins_unique = dd_unique, columns = samp_id, expdesign = d4_design) # Filter for proteins that are identified in all replicates of at least one condition data_dd_filt <- filter_missval(data_dd, thr = 0) # Normalize the data data_dd_norm <- normalize_vsn(se = data_dd_filt) # Impute missing data using random draws from a Gaussian distribution centered around a minimal value (for MNAR) data_dd_imp <- impute(data_dd_norm, fun = "MinProb", q = 0.01) n_top_genes <- 100 mat <- as.data.frame(data_dd_imp@assays@data@listData) mat_var <- sort(apply(mat, 1, var), decreasing = T) mat_filt <- mat[names(mat_var[1:n_top_genes]),] ss <- limma::removeBatchEffect(x = as.data.frame(data_dd_imp@assays@data@listData), batch = data_dd_imp$replicate) ss_var <- sort(apply(ss, 1, var), decreasing = T) ss_filt <- ss[names(ss_var[1:n_top_genes]),] sinfo <- data.frame(Condition = data_dd_imp$condition, Replicate = data_dd_imp$replicate) rownames(sinfo) <- data_dd_imp$ID sinfo <- mutate(sinfo, Cond = case_when(sinfo$Condition == "UT_D4" ~ "UT", sinfo$Condition == "UT_electro_D4" ~ "Mock Electro", sinfo$Condition == "RNP_D4" ~ "RNP", sinfo$Condition == "RNP_AAV6_D4" ~ "RNP+AAV6", sinfo$Condition == "AAV6_D4" ~ "AAV6 only")) sinfo$Condition <- sinfo$Cond pca2 <- prcomp(x = t(ss_filt), center = TRUE,scale. = T) pdf(file = paste(out_dir, "DEP-PCA-Day4-top100-batchcorr.pdf", sep = "/"), width = 9, height = 7) autoplot(object = pca2, data = sinfo, label = FALSE, colour = 'Condition', size = 6) + ggtitle(label = "Principal Component Analysis (PCA)", subtitle = paste("Top", n_top_genes, "proteins (with batch correction)", sep = " ")) + theme_bw(base_size = 20) dev.off() con = c("UT_electro_D4_vs_UT_D4", "RNP_D4_vs_UT_electro_D4", "RNP_D4_vs_UT_D4", "AAV6_D4_vs_RNP_D4", "AAV6_D4_vs_UT_electro_D4", "AAV6_D4_vs_UT_D4", "RNP_AAV6_D4_vs_AAV6_D4", "RNP_AAV6_D4_vs_RNP_D4", "RNP_AAV6_D4_vs_UT_electro_D4", "RNP_AAV6_D4_vs_UT_D4") for (gsea_val in c("_GSEApval")) { for (gsea_db in c("gsea_HALLMARK")) { hallmark_ds_order <- c("UT_electro_D4_vs_UT_D4", # UT electro vs UT "RNP_D4_vs_UT_D4", # RNP vs UT "RNP_AAV6_D4_vs_UT_D4", # RNP+AAV6 vs UT "UT_electro_D4_vs_AAV6_D4", # old "AAV6_D4-UT_electro_D4", # Utelectro vs AAV6 !!! "RNP_D4_vs_AAV6_D4", # old "AAV6_D4-RNP_D4", # RNP vs AAV6 !!! "RNP_AAV6_D4_vs_AAV6_D4", # RNP+AAV6 vs AAV6 "RNP_D4_vs_UT_electro_D4", # RNP vs UT electro "RNP_AAV6_D4_vs_UT_electro_D4", # RNP+AAV6 vs Utelectro "RNP_AAV6_D4_vs_RNP_D4", # RNP+AAV6 vs RNP "AAV6_D4_vs_UT_D4" # AAV6 vs UT ) hallmark.full <- data.frame() for (contr in con) { print(contr) ds.t <- read.xlsx(xlsxFile = paste(wdir, "results_DEP", paste0(contr, paste0(gsea_val, ".xlsx")), sep = "/"), sheet = gsea_db) if (nrow(ds.t) == 0) { next } ds.t.filt <- ds.t[,c("Description", "setSize", "enrichmentScore", "NES", "pvalue", "p.adjust", "qvalues"), drop = FALSE] ds.t.filt$Dataset <- contr if (nrow(hallmark.full) == 0) { hallmark.full <- ds.t.filt } else { hallmark.full <- rbind(hallmark.full, ds.t.filt) } } hallmark.full.filt <- hallmark.full[, c("Description", "NES", "Dataset")] hallmark.full.filt$Description <- gsub(x = hallmark.full.filt$Description, "HALLMARK_", "") for (dd in hallmark_ds_order) { if(!dd %in% unique(hallmark.full.filt$Dataset)) { newdd <- paste(as.character(str_split_fixed(dd, "_vs_", 2)[,2]), as.character(str_split_fixed(dd, "_vs_", 2)[,1]), sep = "_vs_") if (newdd %in% unique(hallmark.full.filt$Dataset)) { newvals <- -1 * hallmark.full.filt[which(hallmark.full.filt$Dataset == newdd), "NES"] hallmark.full.filt[which(hallmark.full.filt$Dataset == newdd), "NES"] <- newvals hallmark.full.filt[which(hallmark.full.filt$Dataset == newdd), "Dataset"] <- dd } } } head(hallmark.full.filt) hallmark.order <- hallmark.full.filt %>% group_by(Description) %>% summarise(Pos = sum(NES)) hallmark.order.terms <- hallmark.order[order(hallmark.order$Pos, decreasing = FALSE), "Description", drop = FALSE] hallmark.full.filt.tt <- melt(reshape2::dcast(hallmark.full.filt, Description ~ Dataset, value.var="NES"), id.vars = c("Description")) colnames(hallmark.full.filt.tt) <- c("Description", "Dataset", "NES") hallmark.full.filt.tt$Description <- factor(hallmark.full.filt.tt$Description, levels = hallmark.order.terms$Description) hallmark.full.filt.tt$Dataset <- gsub("UT_D4", "UT", hallmark.full.filt.tt$Dataset) hallmark.full.filt.tt$Dataset <- gsub("UT_electro_D4", "Mock Electro", hallmark.full.filt.tt$Dataset) hallmark.full.filt.tt$Dataset <- gsub("RNP_D4", "RNP", hallmark.full.filt.tt$Dataset) hallmark.full.filt.tt$Dataset <- gsub("RNP_AAV6_D4", "RNP+AAV6", hallmark.full.filt.tt$Dataset) hallmark.full.filt.tt$Dataset <- gsub("AAV6_D4", "AAV6 only", hallmark.full.filt.tt$Dataset) hallmark.full.filt.tt$Dataset <- gsub("_vs_", " - ", hallmark.full.filt.tt$Dataset) hallmark_ds_order <- gsub("UT_D4", "UT", hallmark_ds_order) hallmark_ds_order <- gsub("UT_electro_D4", "Mock Electro", hallmark_ds_order) hallmark_ds_order <- gsub("RNP_D4", "RNP", hallmark_ds_order) hallmark_ds_order <- gsub("RNP_AAV6_D4", "RNP+AAV6", hallmark_ds_order) hallmark_ds_order <- gsub("AAV6_D4", "AAV6 only", hallmark_ds_order) hallmark_ds_order <- gsub("_vs_", " - ", hallmark_ds_order) hallmark.full.filt.tt$Dataset <- factor(hallmark.full.filt.tt$Dataset, levels = hallmark_ds_order) p.hallmark <- ggplot(hallmark.full.filt.tt, aes(x = Dataset, y = Description)) + theme_bw(base_size = 20) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + geom_tile(aes(fill = NES), colour = "white") + scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11,"RdBu")))(100), na.value = "grey") + ylab("") + xlab("") plot_height <- ifelse(length(levels(hallmark.full.filt.tt$Description)) < 50, 7, 15) ggsave(filename = paste(out_dir, paste0("HM_D4_", gsea_db, gsea_val, ".pdf"), sep = "/"), plot = p.hallmark, width = 10, height = plot_height, dpi = 150) } } # Day 12 d12_design <- design[19:33,] d12_design$label dd <- read.xlsx(xlsxFile = paste(wdir, "proteinGroups_SR-TIGET_200112.xlsx", sep = "/")) colnames(dd) for (col in d12_design$label) { dd[[col]] <- as.numeric(dd[[col]]) dd[[col]] <- 2^dd[[col]] dd[[col]][is.na(dd[[col]])] <- 0 } head(dd) dd_unique <- make_unique(dd, "Gene.names", "Protein.IDs", delim = ";") colnames(dd_unique) str(dd_unique) samp_id <- match(d12_design$label, colnames(dd_unique)) print(samp_id) data_dd <- make_se(proteins_unique = dd_unique, columns = samp_id, expdesign = d12_design) # Filter for proteins that are identified in all replicates of at least one condition data_dd_filt <- filter_missval(data_dd, thr = 0) # Normalize the data data_dd_norm <- normalize_vsn(se = data_dd_filt) # Impute missing data using random draws from a Gaussian distribution centered around a minimal value (for MNAR) data_dd_imp <- impute(data_dd_norm, fun = "MinProb", q = 0.01) mat <- as.data.frame(data_dd_imp@assays@data@listData) mat_var <- sort(apply(mat, 1, var), decreasing = T) mat_filt <- mat[names(mat_var[1:n_top_genes]),] ss <- limma::removeBatchEffect(x = as.data.frame(data_dd_imp@assays@data@listData), batch = data_dd_imp$replicate) ss_var <- sort(apply(ss, 1, var), decreasing = T) ss_filt <- ss[names(ss_var[1:n_top_genes]),] sinfo <- data.frame(Condition = data_dd_imp$condition, Replicate = data_dd_imp$replicate) rownames(sinfo) <- data_dd_imp$ID sinfo <- mutate(sinfo, Cond = case_when(sinfo$Condition == "UT_D12" ~ "UT", sinfo$Condition == "UT_electro_D12" ~ "Mock Electro", sinfo$Condition == "RNP_D12" ~ "RNP", sinfo$Condition == "RNP_AAV6_D12" ~ "RNP+AAV6", sinfo$Condition == "AAV6_D12" ~ "AAV6 only")) sinfo$Condition <- sinfo$Cond pca2 <- prcomp(x = t(ss_filt), center = TRUE,scale. = T) pdf(file = paste(out_dir, paste0("DEP-PCA-Day12-top", n_top_genes, "-batchcorr.pdf"), sep = "/"), width = 9, height = 7) autoplot(object = pca2, data = sinfo, label = FALSE, colour = 'Condition', size = 6) + ggtitle(label = "Principal Component Analysis (PCA)", subtitle = paste("Top", n_top_genes, "proteins (with batch correction)", sep = " ")) + theme_bw(base_size = 20) dev.off()