library(DEP)
library(dplyr)
library(openxlsx)
library(ReactomePA)
library(org.Hs.eg.db)
library(ggfortify)
library(stringr)
library(reshape2)
library(RColorBrewer)
library(clusterProfiler)

wdir <- "vavassori_lnp2022_proteomics"
out_dir <- paste(wdir, "paper_plots", sep = "/")
dir.create(out_dir, showWarnings = F)
data_dir <- paste(wdir, "paper_data", sep = "/")
design <- read.xlsx(xlsxFile = paste(data_dir, "ProteomicsDesign.xlsx", sep = "/"))
str(design)

# Day 4
d4_design <- design[4:18,]
d4_design$label
dd <- read.xlsx(xlsxFile = paste(data_dir, "proteinGroups_SR-TIGET_200112.xlsx", sep = "/"))
colnames(dd)
for (col in d4_design$label) {
  dd[[col]] <- as.numeric(dd[[col]])
  dd[[col]] <- 2^dd[[col]]
  dd[[col]][is.na(dd[[col]])] <- 0
}
head(dd)

dd_unique <- make_unique(dd, "Gene.names", "Protein.IDs", delim = ";")
colnames(dd_unique)
str(dd_unique)
samp_id <- match(d4_design$label, colnames(dd_unique))
print(samp_id)
data_dd <- make_se(proteins_unique = dd_unique, columns = samp_id, expdesign = d4_design)

# Filter for proteins that are identified in all replicates of at least one condition
data_dd_filt <- filter_missval(data_dd, thr = 0)
# Normalize the data
data_dd_norm <- normalize_vsn(se = data_dd_filt)

# Impute missing data using random draws from a Gaussian distribution centered around a minimal value (for MNAR)
data_dd_imp <- impute(data_dd_norm, fun = "MinProb", q = 0.01)

n_top_genes <- 100
mat <- as.data.frame(data_dd_imp@assays@data@listData)
mat_var <- sort(apply(mat, 1, var), decreasing = T)
mat_filt <- mat[names(mat_var[1:n_top_genes]),]
ss <- limma::removeBatchEffect(x = as.data.frame(data_dd_imp@assays@data@listData), batch = data_dd_imp$replicate)
ss_var <- sort(apply(ss, 1, var), decreasing = T)
ss_filt <- ss[names(ss_var[1:n_top_genes]),]
sinfo <- data.frame(Condition = data_dd_imp$condition, Replicate = data_dd_imp$replicate)
rownames(sinfo) <- data_dd_imp$ID
sinfo <- mutate(sinfo,
                Cond = case_when(sinfo$Condition == "UT_D4" ~ "UT",
                                 sinfo$Condition == "UT_electro_D4" ~ "Mock Electro",
                                 sinfo$Condition == "RNP_D4" ~ "RNP",
                                 sinfo$Condition == "RNP_AAV6_D4" ~ "RNP+AAV6",
                                 sinfo$Condition == "AAV6_D4" ~ "AAV6 only"))
sinfo$Condition <- sinfo$Cond
pca2 <- prcomp(x = t(ss_filt), center = TRUE,scale. = T)

pdf(file = paste(out_dir, "DEP-PCA-Day4-top100-batchcorr.pdf", sep = "/"), width = 9, height = 7)
autoplot(object = pca2, data = sinfo, label = FALSE, colour = 'Condition', size = 6) +
  ggtitle(label = "Principal Component Analysis (PCA)",
          subtitle = paste("Top", n_top_genes, "proteins (with batch correction)", sep = " ")) +
  theme_bw(base_size = 20)
dev.off()

con = c("UT_electro_D4_vs_UT_D4",
        "RNP_D4_vs_UT_electro_D4",
        "RNP_D4_vs_UT_D4",
        "AAV6_D4_vs_RNP_D4",
        "AAV6_D4_vs_UT_electro_D4",
        "AAV6_D4_vs_UT_D4",
        "RNP_AAV6_D4_vs_AAV6_D4",
        "RNP_AAV6_D4_vs_RNP_D4",
        "RNP_AAV6_D4_vs_UT_electro_D4",
        "RNP_AAV6_D4_vs_UT_D4")


for (gsea_val in c("_GSEApval")) {
  for (gsea_db in c("gsea_HALLMARK")) {
    hallmark_ds_order <- c("UT_electro_D4_vs_UT_D4", # UT electro vs UT
                           "RNP_D4_vs_UT_D4", # RNP vs UT
                           "RNP_AAV6_D4_vs_UT_D4", # RNP+AAV6 vs UT
                           "UT_electro_D4_vs_AAV6_D4", # old "AAV6_D4-UT_electro_D4", # Utelectro vs AAV6 !!!
                           "RNP_D4_vs_AAV6_D4", # old "AAV6_D4-RNP_D4", # RNP vs AAV6 !!!
                           "RNP_AAV6_D4_vs_AAV6_D4", # RNP+AAV6 vs AAV6
                           "RNP_D4_vs_UT_electro_D4", # RNP vs UT electro
                           "RNP_AAV6_D4_vs_UT_electro_D4", # RNP+AAV6 vs Utelectro
                           "RNP_AAV6_D4_vs_RNP_D4", # RNP+AAV6 vs RNP
                           "AAV6_D4_vs_UT_D4" # AAV6 vs UT
    )
    hallmark.full <- data.frame()
    for (contr in con) {
      print(contr)
      ds.t <- read.xlsx(xlsxFile = paste(wdir, "results_DEP", paste0(contr, paste0(gsea_val, ".xlsx")), sep = "/"),
                        sheet = gsea_db)
      if (nrow(ds.t) == 0) { next }
      ds.t.filt <- ds.t[,c("Description", "setSize", "enrichmentScore", "NES", "pvalue", "p.adjust", "qvalues"), drop = FALSE]
      ds.t.filt$Dataset <- contr
      if (nrow(hallmark.full) == 0) {
        hallmark.full <- ds.t.filt
      } else {
        hallmark.full <- rbind(hallmark.full, ds.t.filt)
      }
    }
    hallmark.full.filt <- hallmark.full[, c("Description", "NES", "Dataset")]
    hallmark.full.filt$Description <- gsub(x = hallmark.full.filt$Description, "HALLMARK_", "")
    for (dd in hallmark_ds_order) {
      if(!dd %in% unique(hallmark.full.filt$Dataset)) {
        newdd <- paste(as.character(str_split_fixed(dd, "_vs_", 2)[,2]), as.character(str_split_fixed(dd, "_vs_", 2)[,1]), sep = "_vs_")
        if (newdd %in% unique(hallmark.full.filt$Dataset)) {
          newvals <- -1 * hallmark.full.filt[which(hallmark.full.filt$Dataset == newdd), "NES"]
          hallmark.full.filt[which(hallmark.full.filt$Dataset == newdd), "NES"] <- newvals
          hallmark.full.filt[which(hallmark.full.filt$Dataset == newdd), "Dataset"] <- dd
        }
      }
    }
    head(hallmark.full.filt)
    hallmark.order <- hallmark.full.filt %>% group_by(Description) %>% summarise(Pos = sum(NES))
    hallmark.order.terms <- hallmark.order[order(hallmark.order$Pos, decreasing = FALSE), "Description", drop = FALSE]
    hallmark.full.filt.tt <- melt(reshape2::dcast(hallmark.full.filt, Description ~ Dataset, value.var="NES"), id.vars = c("Description"))
    colnames(hallmark.full.filt.tt) <- c("Description", "Dataset", "NES")
    hallmark.full.filt.tt$Description <- factor(hallmark.full.filt.tt$Description, levels = hallmark.order.terms$Description)
    hallmark.full.filt.tt$Dataset <- gsub("UT_D4", "UT", hallmark.full.filt.tt$Dataset)
    hallmark.full.filt.tt$Dataset <- gsub("UT_electro_D4", "Mock Electro", hallmark.full.filt.tt$Dataset)
    hallmark.full.filt.tt$Dataset <- gsub("RNP_D4", "RNP", hallmark.full.filt.tt$Dataset)
    hallmark.full.filt.tt$Dataset <- gsub("RNP_AAV6_D4", "RNP+AAV6", hallmark.full.filt.tt$Dataset)
    hallmark.full.filt.tt$Dataset <- gsub("AAV6_D4", "AAV6 only", hallmark.full.filt.tt$Dataset)
    hallmark.full.filt.tt$Dataset <- gsub("_vs_", " - ", hallmark.full.filt.tt$Dataset)
    
    hallmark_ds_order <- gsub("UT_D4", "UT", hallmark_ds_order)
    hallmark_ds_order <- gsub("UT_electro_D4", "Mock Electro", hallmark_ds_order)
    hallmark_ds_order <- gsub("RNP_D4", "RNP", hallmark_ds_order)
    hallmark_ds_order <- gsub("RNP_AAV6_D4", "RNP+AAV6", hallmark_ds_order)
    hallmark_ds_order <- gsub("AAV6_D4", "AAV6 only", hallmark_ds_order)
    hallmark_ds_order <- gsub("_vs_", " - ", hallmark_ds_order)
    hallmark.full.filt.tt$Dataset <- factor(hallmark.full.filt.tt$Dataset, levels = hallmark_ds_order)
    
    p.hallmark <- ggplot(hallmark.full.filt.tt, aes(x = Dataset, y = Description)) +
      theme_bw(base_size = 20) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      geom_tile(aes(fill = NES), colour = "white") +
      scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11,"RdBu")))(100), na.value = "grey") +
      ylab("") +
      xlab("")
    plot_height <- ifelse(length(levels(hallmark.full.filt.tt$Description)) < 50, 7, 15)
    ggsave(filename = paste(out_dir, paste0("HM_D4_", gsea_db, gsea_val, ".pdf"), sep = "/"), plot = p.hallmark,
           width = 10, height = plot_height, dpi = 150)
  }
}


# Day 12
d12_design <- design[19:33,]
d12_design$label
dd <- read.xlsx(xlsxFile = paste(wdir, "proteinGroups_SR-TIGET_200112.xlsx", sep = "/"))
colnames(dd)
for (col in d12_design$label) {
  dd[[col]] <- as.numeric(dd[[col]])
  dd[[col]] <- 2^dd[[col]]
  dd[[col]][is.na(dd[[col]])] <- 0
}
head(dd)

dd_unique <- make_unique(dd, "Gene.names", "Protein.IDs", delim = ";")
colnames(dd_unique)
str(dd_unique)
samp_id <- match(d12_design$label, colnames(dd_unique))
print(samp_id)
data_dd <- make_se(proteins_unique = dd_unique, columns = samp_id, expdesign = d12_design)

# Filter for proteins that are identified in all replicates of at least one condition
data_dd_filt <- filter_missval(data_dd, thr = 0)
# Normalize the data
data_dd_norm <- normalize_vsn(se = data_dd_filt)
# Impute missing data using random draws from a Gaussian distribution centered around a minimal value (for MNAR)
data_dd_imp <- impute(data_dd_norm, fun = "MinProb", q = 0.01)

mat <- as.data.frame(data_dd_imp@assays@data@listData)
mat_var <- sort(apply(mat, 1, var), decreasing = T)
mat_filt <- mat[names(mat_var[1:n_top_genes]),]
ss <- limma::removeBatchEffect(x = as.data.frame(data_dd_imp@assays@data@listData), batch = data_dd_imp$replicate)
ss_var <- sort(apply(ss, 1, var), decreasing = T)
ss_filt <- ss[names(ss_var[1:n_top_genes]),]
sinfo <- data.frame(Condition = data_dd_imp$condition, Replicate = data_dd_imp$replicate)
rownames(sinfo) <- data_dd_imp$ID
sinfo <- mutate(sinfo,
                Cond = case_when(sinfo$Condition == "UT_D12" ~ "UT",
                                 sinfo$Condition == "UT_electro_D12" ~ "Mock Electro",
                                 sinfo$Condition == "RNP_D12" ~ "RNP",
                                 sinfo$Condition == "RNP_AAV6_D12" ~ "RNP+AAV6",
                                 sinfo$Condition == "AAV6_D12" ~ "AAV6 only"))
sinfo$Condition <- sinfo$Cond
pca2 <- prcomp(x = t(ss_filt), center = TRUE,scale. = T)
pdf(file = paste(out_dir, paste0("DEP-PCA-Day12-top", n_top_genes, "-batchcorr.pdf"), sep = "/"), width = 9, height = 7)
autoplot(object = pca2, data = sinfo, label = FALSE, colour = 'Condition', size = 6) +
  ggtitle(label = "Principal Component Analysis (PCA)",
          subtitle = paste("Top", n_top_genes, "proteins (with batch correction)", sep = " ")) +
  theme_bw(base_size = 20)
dev.off()