Added paper plot script.

1f7cc998 · Stefano Beretta · c0ca1c22 · 1f7cc998
Commit 1f7cc998 authored Oct 21, 2022 by Stefano Beretta
--- a/paper_plots.R
+++ b/paper_plots.R
+library(DEP)
+library(dplyr)
+library(openxlsx)
+library(ReactomePA)
+library(org.Hs.eg.db)
+library(ggfortify)
+library(stringr)
+library(reshape2)
+library(RColorBrewer)
+library(clusterProfiler)
+
+wdir <- "vavassori_lnp2022_proteomics"
+out_dir <- paste(wdir, "paper_plots", sep = "/")
+dir.create(out_dir, showWarnings = F)
+data_dir <- paste(wdir, "paper_data", sep = "/")
+design <- read.xlsx(xlsxFile = paste(data_dir, "ProteomicsDesign.xlsx", sep = "/"))
+str(design)
+
+# Day 4
+d4_design <- design[4:18,]
+d4_design$label
+dd <- read.xlsx(xlsxFile = paste(data_dir, "proteinGroups_SR-TIGET_200112.xlsx", sep = "/"))
+colnames(dd)
+for (col in d4_design$label) {
+  dd[[col]] <- as.numeric(dd[[col]])
+  dd[[col]] <- 2^dd[[col]]
+  dd[[col]][is.na(dd[[col]])] <- 0
+}
+head(dd)
+
+dd_unique <- make_unique(dd, "Gene.names", "Protein.IDs", delim = ";")
+colnames(dd_unique)
+str(dd_unique)
+samp_id <- match(d4_design$label, colnames(dd_unique))
+print(samp_id)
+data_dd <- make_se(proteins_unique = dd_unique, columns = samp_id, expdesign = d4_design)
+
+# Filter for proteins that are identified in all replicates of at least one condition
+data_dd_filt <- filter_missval(data_dd, thr = 0)
+# Normalize the data
+data_dd_norm <- normalize_vsn(se = data_dd_filt)
+
+# Impute missing data using random draws from a Gaussian distribution centered around a minimal value (for MNAR)
+data_dd_imp <- impute(data_dd_norm, fun = "MinProb", q = 0.01)
+
+n_top_genes <- 100
+mat <- as.data.frame(data_dd_imp@assays@data@listData)
+mat_var <- sort(apply(mat, 1, var), decreasing = T)
+mat_filt <- mat[names(mat_var[1:n_top_genes]),]
+ss <- limma::removeBatchEffect(x = as.data.frame(data_dd_imp@assays@data@listData), batch = data_dd_imp$replicate)
+ss_var <- sort(apply(ss, 1, var), decreasing = T)
+ss_filt <- ss[names(ss_var[1:n_top_genes]),]
+sinfo <- data.frame(Condition = data_dd_imp$condition, Replicate = data_dd_imp$replicate)
+rownames(sinfo) <- data_dd_imp$ID
+sinfo <- mutate(sinfo,
+                Cond = case_when(sinfo$Condition == "UT_D4" ~ "UT",
+                                 sinfo$Condition == "UT_electro_D4" ~ "Mock Electro",
+                                 sinfo$Condition == "RNP_D4" ~ "RNP",
+                                 sinfo$Condition == "RNP_AAV6_D4" ~ "RNP+AAV6",
+                                 sinfo$Condition == "AAV6_D4" ~ "AAV6 only"))
+sinfo$Condition <- sinfo$Cond
+pca2 <- prcomp(x = t(ss_filt), center = TRUE,scale. = T)
+
+pdf(file = paste(out_dir, "DEP-PCA-Day4-top100-batchcorr.pdf", sep = "/"), width = 9, height = 7)
+autoplot(object = pca2, data = sinfo, label = FALSE, colour = 'Condition', size = 6) +
+  ggtitle(label = "Principal Component Analysis (PCA)",
+          subtitle = paste("Top", n_top_genes, "proteins (with batch correction)", sep = " ")) +
+  theme_bw(base_size = 20)
+dev.off()
+
+con = c("UT_electro_D4_vs_UT_D4",
+        "RNP_D4_vs_UT_electro_D4",
+        "RNP_D4_vs_UT_D4",
+        "AAV6_D4_vs_RNP_D4",
+        "AAV6_D4_vs_UT_electro_D4",
+        "AAV6_D4_vs_UT_D4",
+        "RNP_AAV6_D4_vs_AAV6_D4",
+        "RNP_AAV6_D4_vs_RNP_D4",
+        "RNP_AAV6_D4_vs_UT_electro_D4",
+        "RNP_AAV6_D4_vs_UT_D4")
+
+
+
+for (gsea_val in c("_GSEApval")) {
+  for (gsea_db in c("gsea_HALLMARK")) {
+    hallmark_ds_order <- c("UT_electro_D4_vs_UT_D4", # UT electro vs UT
+                           "RNP_D4_vs_UT_D4", # RNP vs UT
+                           "RNP_AAV6_D4_vs_UT_D4", # RNP+AAV6 vs UT
+                           "UT_electro_D4_vs_AAV6_D4", # old "AAV6_D4-UT_electro_D4", # Utelectro vs AAV6 !!!
+                           "RNP_D4_vs_AAV6_D4", # old "AAV6_D4-RNP_D4", # RNP vs AAV6 !!!
+                           "RNP_AAV6_D4_vs_AAV6_D4", # RNP+AAV6 vs AAV6
+                           "RNP_D4_vs_UT_electro_D4", # RNP vs UT electro
+                           "RNP_AAV6_D4_vs_UT_electro_D4", # RNP+AAV6 vs Utelectro
+                           "RNP_AAV6_D4_vs_RNP_D4", # RNP+AAV6 vs RNP
+                           "AAV6_D4_vs_UT_D4" # AAV6 vs UT
+    )
+    hallmark.full <- data.frame()
+    for (contr in con) {
+      print(contr)
+      ds.t <- read.xlsx(xlsxFile = paste(wdir, "results_DEP", paste0(contr, paste0(gsea_val, ".xlsx")), sep = "/"),
+                        sheet = gsea_db)
+      if (nrow(ds.t) == 0) { next }
+      ds.t.filt <- ds.t[,c("Description", "setSize", "enrichmentScore", "NES", "pvalue", "p.adjust", "qvalues"), drop = FALSE]
+      ds.t.filt$Dataset <- contr
+      if (nrow(hallmark.full) == 0) {
+        hallmark.full <- ds.t.filt
+      } else {
+        hallmark.full <- rbind(hallmark.full, ds.t.filt)
+      }
+    }
+    hallmark.full.filt <- hallmark.full[, c("Description", "NES", "Dataset")]
+    hallmark.full.filt$Description <- gsub(x = hallmark.full.filt$Description, "HALLMARK_", "")
+    for (dd in hallmark_ds_order) {
+      if(!dd %in% unique(hallmark.full.filt$Dataset)) {
+        newdd <- paste(as.character(str_split_fixed(dd, "_vs_", 2)[,2]), as.character(str_split_fixed(dd, "_vs_", 2)[,1]), sep = "_vs_")
+        if (newdd %in% unique(hallmark.full.filt$Dataset)) {
+          newvals <- -1 * hallmark.full.filt[which(hallmark.full.filt$Dataset == newdd), "NES"]
+          hallmark.full.filt[which(hallmark.full.filt$Dataset == newdd), "NES"] <- newvals
+          hallmark.full.filt[which(hallmark.full.filt$Dataset == newdd), "Dataset"] <- dd
+        }
+      }
+    }
+    head(hallmark.full.filt)
+    hallmark.order <- hallmark.full.filt %>% group_by(Description) %>% summarise(Pos = sum(NES))
+    hallmark.order.terms <- hallmark.order[order(hallmark.order$Pos, decreasing = FALSE), "Description", drop = FALSE]
+    hallmark.full.filt.tt <- melt(reshape2::dcast(hallmark.full.filt, Description ~ Dataset, value.var="NES"), id.vars = c("Description"))
+    colnames(hallmark.full.filt.tt) <- c("Description", "Dataset", "NES")
+    hallmark.full.filt.tt$Description <- factor(hallmark.full.filt.tt$Description, levels = hallmark.order.terms$Description)
+    hallmark.full.filt.tt$Dataset <- gsub("UT_D4", "UT", hallmark.full.filt.tt$Dataset)
+    hallmark.full.filt.tt$Dataset <- gsub("UT_electro_D4", "Mock Electro", hallmark.full.filt.tt$Dataset)
+    hallmark.full.filt.tt$Dataset <- gsub("RNP_D4", "RNP", hallmark.full.filt.tt$Dataset)
+    hallmark.full.filt.tt$Dataset <- gsub("RNP_AAV6_D4", "RNP+AAV6", hallmark.full.filt.tt$Dataset)
+    hallmark.full.filt.tt$Dataset <- gsub("AAV6_D4", "AAV6 only", hallmark.full.filt.tt$Dataset)
+    hallmark.full.filt.tt$Dataset <- gsub("_vs_", " - ", hallmark.full.filt.tt$Dataset)
+    
+    hallmark_ds_order <- gsub("UT_D4", "UT", hallmark_ds_order)
+    hallmark_ds_order <- gsub("UT_electro_D4", "Mock Electro", hallmark_ds_order)
+    hallmark_ds_order <- gsub("RNP_D4", "RNP", hallmark_ds_order)
+    hallmark_ds_order <- gsub("RNP_AAV6_D4", "RNP+AAV6", hallmark_ds_order)
+    hallmark_ds_order <- gsub("AAV6_D4", "AAV6 only", hallmark_ds_order)
+    hallmark_ds_order <- gsub("_vs_", " - ", hallmark_ds_order)
+    hallmark.full.filt.tt$Dataset <- factor(hallmark.full.filt.tt$Dataset, levels = hallmark_ds_order)
+    
+    p.hallmark <- ggplot(hallmark.full.filt.tt, aes(x = Dataset, y = Description)) +
+      theme_bw(base_size = 20) +
+      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+      geom_tile(aes(fill = NES), colour = "white") +
+      scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11,"RdBu")))(100), na.value = "grey") +
+      ylab("") +
+      xlab("")
+    plot_height <- ifelse(length(levels(hallmark.full.filt.tt$Description)) < 50, 7, 15)
+    ggsave(filename = paste(out_dir, paste0("HM_D4_", gsea_db, gsea_val, ".pdf"), sep = "/"), plot = p.hallmark,
+           width = 10, height = plot_height, dpi = 150)
+  }
+}
+
+
+
+# Day 12
+d12_design <- design[19:33,]
+d12_design$label
+dd <- read.xlsx(xlsxFile = paste(wdir, "proteinGroups_SR-TIGET_200112.xlsx", sep = "/"))
+colnames(dd)
+for (col in d12_design$label) {
+  dd[[col]] <- as.numeric(dd[[col]])
+  dd[[col]] <- 2^dd[[col]]
+  dd[[col]][is.na(dd[[col]])] <- 0
+}
+head(dd)
+
+dd_unique <- make_unique(dd, "Gene.names", "Protein.IDs", delim = ";")
+colnames(dd_unique)
+str(dd_unique)
+samp_id <- match(d12_design$label, colnames(dd_unique))
+print(samp_id)
+data_dd <- make_se(proteins_unique = dd_unique, columns = samp_id, expdesign = d12_design)
+
+# Filter for proteins that are identified in all replicates of at least one condition
+data_dd_filt <- filter_missval(data_dd, thr = 0)
+# Normalize the data
+data_dd_norm <- normalize_vsn(se = data_dd_filt)
+# Impute missing data using random draws from a Gaussian distribution centered around a minimal value (for MNAR)
+data_dd_imp <- impute(data_dd_norm, fun = "MinProb", q = 0.01)
+
+mat <- as.data.frame(data_dd_imp@assays@data@listData)
+mat_var <- sort(apply(mat, 1, var), decreasing = T)
+mat_filt <- mat[names(mat_var[1:n_top_genes]),]
+ss <- limma::removeBatchEffect(x = as.data.frame(data_dd_imp@assays@data@listData), batch = data_dd_imp$replicate)
+ss_var <- sort(apply(ss, 1, var), decreasing = T)
+ss_filt <- ss[names(ss_var[1:n_top_genes]),]
+sinfo <- data.frame(Condition = data_dd_imp$condition, Replicate = data_dd_imp$replicate)
+rownames(sinfo) <- data_dd_imp$ID
+sinfo <- mutate(sinfo,
+                Cond = case_when(sinfo$Condition == "UT_D12" ~ "UT",
+                                 sinfo$Condition == "UT_electro_D12" ~ "Mock Electro",
+                                 sinfo$Condition == "RNP_D12" ~ "RNP",
+                                 sinfo$Condition == "RNP_AAV6_D12" ~ "RNP+AAV6",
+                                 sinfo$Condition == "AAV6_D12" ~ "AAV6 only"))
+sinfo$Condition <- sinfo$Cond
+pca2 <- prcomp(x = t(ss_filt), center = TRUE,scale. = T)
+pdf(file = paste(out_dir, paste0("DEP-PCA-Day12-top", n_top_genes, "-batchcorr.pdf"), sep = "/"), width = 9, height = 7)
+autoplot(object = pca2, data = sinfo, label = FALSE, colour = 'Condition', size = 6) +
+  ggtitle(label = "Principal Component Analysis (PCA)",
+          subtitle = paste("Top", n_top_genes, "proteins (with batch correction)", sep = " ")) +
+  theme_bw(base_size = 20)
+dev.off()