Compare_variants_AF_v3.R

Compare_variants_AF_v3 <- function(dblist, sampleid_1, sampleid_2, 
                                my.width, my.height, my.res, 
                                tag = "demotag",
                                outfolder = "Outfold/"){
  
  library(ggplot2)
  library(plotly)
  library(ggrepel)
  library(reshape2)
  library(cowplot)
  library(openxlsx)
  
  dir.create(path = outfolder, recursive = T)
  
  db <- dblist
  
  Common_vars <- intersect(x = db[[sampleid_1]]$variantkey, y = db[[sampleid_2]]$variantkey)
  
  fulldf <- rbind.data.frame(x = db[[sampleid_1]], y = db[[sampleid_2]])
  
  fulldf_annot <- subset.data.frame(x = fulldf, select = c("variantkey","ID","HGVS_P","Effect","Impact", 
                                                           "dbNSFP_CADD_phred", "dbNSFP_PROVEAN_pred",
                                                           "dbNSFP_MutationTaster_pred", "dbNSFP_SIFT_pred",
                                                           "dbNSFP_Polyphen2_HVAR_pred"))
  fulldf_annot <- fulldf_annot %>% distinct(variantkey,.keep_all = TRUE)
  
  sampleid_1_sub <- subset.data.frame(x = db[[sampleid_1]], subset = variantkey %in% Common_vars, 
                                      select = c("variantkey","AF","DP","FILTER","Gene"))
  colnames(sampleid_1_sub) <- c("variantkey","AF_GFP_H","DP_GFP_H","FILTER_GFP_H","Gene")
  
  sampleid_2_sub <- subset.data.frame(x = db[[sampleid_2]], subset = variantkey %in% Common_vars, 
                                      select = c("variantkey","AF","DP","FILTER","Gene"))
  colnames(sampleid_2_sub) <- c("variantkey","AF_GFP_L","DP_GFP_L","FILTER_GFP_L","Gene")
  
  # merge datasets
  
  common_df <- merge.data.frame(x = sampleid_1_sub, y = sampleid_2_sub, by = c('variantkey','Gene'))
  common_df_melted <- melt(common_df, id.vars = c("variantkey","Gene", "DP_GFP_H", "DP_GFP_L","FILTER_GFP_H","FILTER_GFP_L"))
  
  # plotting common variants AF
  
  common_df_tmp <- common_df
  colnames(common_df_tmp) = c("variantkey","Gene","AF_GFP_H","DP_GFP_H","FILTER_GFP_H","AF_GFP_L","DP_GFP_L","FILTER_GFP_L")
  
  common_df_melted_tmp <- melt(common_df_tmp, id.vars = c("variantkey","Gene", "DP_GFP_H", "DP_GFP_L","FILTER_GFP_H","FILTER_GFP_L"))
  
  g_point <- ggplot(data = common_df_tmp, mapping = aes(x = AF_GFP_H, y = AF_GFP_L)) + geom_point() + xlab(label = "AF_GFP_H") + ylab(label = "AF_GFP_L")
  g_density <- ggplot(data = common_df_melted_tmp, mapping = aes(value, fill = variable)) + geom_density() + 
    facet_wrap(facets = ~ variable, nrow = 2) + theme(legend.position = "none", axis.title = element_blank())
  
  title <- ggdraw() +
    draw_label(
      paste0("\tAllele Fraction\n", sampleid_1, " vs ", sampleid_2),
      fontface = 'bold',
      x = 0,
      hjust = -0.5
    ) +
    theme(
      # add margin on the left of the drawing canvas,
      # so title is aligned with left edge of first plot
      plot.margin = margin(1, 1, 1, 1)
    )

  aa <- plot_grid(plotlist = list(g_density, g_point), ncol = 2)
  
  png(filename = paste0(outfolder,"AF_Comparison_",sampleid_1,"_",sampleid_2,"_common_",tag,".png"), 
      width = my.width, height = my.height, units = "in", res = my.res)
  print(plot_grid(
    title, 
    aa,
    ncol = 1,
    # rel_heights values control vertical title margins
    rel_heights = c(0.1, 1)
  ))
  dev.off()
  
  # Merging
  
   union_vars <- union(x = db[[sampleid_1]]$variantkey, y = db[[sampleid_2]]$variantkey)
   
   sampleid_1_full <- subset.data.frame(x = db[[sampleid_1]], select = c("variantkey","AF","DP","FILTER","Gene"))
   colnames(sampleid_1_full) <- c("variantkey","AF_GFP_H","DP_GFP_H","FILTER_GFP_H","Gene")
  # 
   sampleid_2_full <- subset.data.frame(x = db[[sampleid_2]], select = c("variantkey","AF","DP","FILTER","Gene"))
   colnames(sampleid_2_full) <- c("variantkey","AF_GFP_L","DP_GFP_L","FILTER_GFP_L","Gene")
  # 
  union_df <- merge.data.frame(x = sampleid_1_full, y=sampleid_2_full, by = c("variantkey","Gene"), all = T)
 
  union_df$AF_GFP_H[is.na(union_df$AF_GFP_H)] <- 0
  union_df$AF_GFP_L[is.na(union_df$AF_GFP_L)] <- 0

  union_df$DP_GFP_H[is.na(union_df$DP_GFP_H)] <- 0
  union_df$DP_GFP_L[is.na(union_df$DP_GFP_L)] <- 0
  
  colnames(union_df) <- c("variantkey","Gene","AF_GFP_H","DP_GFP_H","FILTER_GFP_H","AF_GFP_L","DP_GFP_L","FILTER_GFP_L")
  
  # Plotting Both

  png(filename = paste0(outfolder,"AF_Comparison_",sampleid_1,"_",sampleid_2,"_union_",tag,".png"), width = 12, height = 9, units = "in", res = 200)
  print(ggplot(data = union_df, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, label = Gene)) + 
          geom_point(size = 0.5) + #geom_text(size=2) + 
          ggtitle(label = paste0("AF distribution ", tag), subtitle = paste0(sampleid_1, " vs ", sampleid_2)) +
          theme(plot.subtitle = element_text(hjust = 0.5))) 
  dev.off()

  # retrieve info from db for complete annotation 
  
  common_annot <- merge.data.frame(x = common_df, y = fulldf_annot, by = 'variantkey', all.x = T, sort = F)
  union_annot <- merge.data.frame(x = union_df, y = fulldf_annot, by = 'variantkey', all.x = T, sort = F)
    
  resultdata <- list(common = common_df, commonannot = common_annot, 
                     union = union_df, unionannot = union_annot,
                     comvars = fulldf)
  
  write.xlsx(x = resultdata, file = paste0(outfolder,"Comparison_",sampleid_1,"_",sampleid_2,"_", tag, ".xlsx"), asTable = T)
  
  saveRDS(object = resultdata, file = paste0(outfolder,"Comparison_",sampleid_1,"_",sampleid_2,"_", tag, ".rds"))
  
  return(resultdata)
}