# BAR-seq Analysis library(dplyr) library(stringr) library(ggplot2) library(reshape2) library(ComplexHeatmap) library(scales) library(circlize) library(RColorBrewer) library(edgeR) library(openxlsx) library(DESeq2) library(gridExtra) ####################### ## Utility Functions ## ####################### deriv <- function(x, y) diff(y) / diff(x) middle_pts <- function(x) x[-1] - diff(x) / 2 elbow_point = function(x, y) { # start and end point positions wmin_x = which.min(x) wmax_x = which.max(x) # start point x_start = x[wmin_x] y_start = y[wmin_x] # end point x_end = x[wmax_x] y_end = y[wmax_x] # distance between x, y start and end dx = x_end - x_start dy = y_end - y_start d_start_end = sqrt(dx^2 + dy^2) # distance between each point in (x, y) and the line passing through the # points (x_start, y_start) and (x_end, y_end) d = abs(dx * (y_start - y) - (x_start - x) * dy) / d_start_end wmax_d = which.max(d) return(list(idx = wmax_d, x = x[wmax_d], y = y[wmax_d])) } # Cut selection select_cut <- function(t) { d1 <- deriv(t$Rank, t$Count) d1log <- deriv(log10(t$Rank), log10(t$Count)) sd1_log <- which.min(diff(d1log)) + 1 return(sd1_log) } ############# ## General ## ############# wdir <- "Fiumara_BasePrimeEd2022_BARseq" fname_suf <- "_edit_2.barcode.mc2.tsv" data_dir <- paste(wdir, "02-barseq", sep = "/") out_dir <- paste(wdir, "03-plots", sep = "/") dir.create(path = out_dir, showWarnings = F) sinfo <- data.frame("Sample" = c()) for (sample in list.dirs(path = data_dir, full.names = F)) { if (sample == "") { next } if (length(grep("clone", sample, value = T)) >= 1) { next } print(sample) sinfo <- rbind(sinfo, data.frame("Sample" = sample)) } sinfo$CT <- str_split_fixed(sinfo$Sample, "_", 4)[,3] sinfo <- mutate(sinfo, CellType = case_when(CT == "15w" ~ "PB 15Weeks", CT == "BM" ~ "BM", CT == "SPL" ~ "SPL", CT == "9W" ~ "PB 9Weeks", CT == "B" ~ "B cells", CT == "M" ~ "Myeloid cells", CT == "T" ~ "T cells", CT == "CD34" ~ "CD34")) sinfo$Mouse <- paste0("Mouse_", str_split_fixed(sinfo$Sample, "_", 4)[,4]) sinfo$Mouse <- gsub("Mouse_A", "BE4_", sinfo$Mouse) sinfo$Mouse <- gsub("Mouse_B", "ABE8_", sinfo$Mouse) sinfo$Mouse <-gsub("Mouse_C", "Cas9_", sinfo$Mouse) sinfo$Mouse <- gsub("Mouse_D", "UTelectro_", sinfo$Mouse) sinfo$Treatment <- str_split_fixed(sinfo$Mouse, "_", 2)[,1] sinfo$Treatment <- factor(sinfo$Treatment, levels = c("UTelectro","Cas9","BE4","ABE8")) sinfo$Mouse <- factor(sinfo$Mouse, levels = c("UTelectro_0","UTelectro_1","UTelectro_2","UTelectro_3","UTelectro_4", "Cas9_0","Cas9_1","Cas9_4", "BE4_0","BE4_1","BE4_2","BE4_3", "ABE8_0","ABE8_1","ABE8_2","ABE8_3","ABE8_4" )) sinfo$CellType <- factor(sinfo$CellType, levels = c("PB 9Weeks","PB 15Weeks","BM","SPL","CD34","B cells","Myeloid cells","T cells")) sinfo <- sinfo[order(sinfo$Treatment, sinfo$Mouse, sinfo$CellType),] ################### ### Export XLSX ### ################### df_count <- data.frame() df_logcpm <- data.frame() df_count_selected <- data.frame() df_logcpm_selected <- data.frame() df_count_dominant <- data.frame() df_logcpm_dominant <- data.frame() for (sample in sinfo$Sample) { print(sample) t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t") colnames(t) <- c("Barcode", "Count", "SaturationPerc") t$logCPM <- log1p(edgeR::cpm(t$Count)) t_count <- t[, c("Barcode", "Count")] colnames(t_count) <- c("Barcode", sample) t_logcpm <- t[, c("Barcode", "logCPM")] colnames(t_logcpm) <- c("Barcode", sample) t$Rank <- seq(1, nrow(t)) # Selection sel_cut <- select_cut(t) t_sel <- subset(t, Rank <= t$Rank[sel_cut]) t_count_sel <- t_sel[, c("Barcode", "Count")] colnames(t_count_sel) <- c("Barcode", paste0(sample, ":rank", t$Rank[sel_cut])) t_logcpm_sel <- t_sel[, c("Barcode", "logCPM")] colnames(t_logcpm_sel) <- c("Barcode", paste0(sample, ":rank", t$Rank[sel_cut])) # Dominant ebow <- elbow_point(x = t$Rank, t$Count) dominant_cut <- ebow$idx t_dom <- subset(t, Rank <= t$Rank[dominant_cut]) t_count_dom <- t_dom[, c("Barcode", "Count")] colnames(t_count_dom) <- c("Barcode", paste0(sample, ":rank", t$Rank[dominant_cut])) t_logcpm_dom <- t_dom[, c("Barcode", "logCPM")] colnames(t_logcpm_dom) <- c("Barcode", paste0(sample, ":rank", t$Rank[dominant_cut])) if(nrow(df_count) == 0) { df_count <- t_count df_logcpm <- t_logcpm df_count_selected <- t_count_sel df_logcpm_selected <- t_logcpm_sel df_count_dominant <- t_count_dom df_logcpm_dominant <- t_logcpm_dom } else { df_count <- merge(df_count, t_count, by = "Barcode", all = T) df_logcpm <- merge(df_logcpm, t_logcpm, by = "Barcode", all = T) df_count_selected <- merge(df_count_selected, t_count_sel, by = "Barcode", all = T) df_logcpm_selected <- merge(df_logcpm_selected, t_logcpm_sel, by = "Barcode", all = T) df_count_dominant <- merge(df_count_dominant, t_count_dom, by = "Barcode", all = T) df_logcpm_dominant <- merge(df_logcpm_dominant, t_logcpm_dom, by = "Barcode", all = T) } } write.xlsx(x = list("Count" = df_count, "logCPM" = df_logcpm, "Count_Selected" = df_count_selected, "logCPM_Selected" = df_logcpm_selected, "Count_Dominant" = df_count_dominant, "logCPM_Dominant" = df_logcpm_dominant), file = paste(out_dir, "Full_barcodes.xlsx", sep = "/"), overwrite = T, asTable = T, colWidths = "auto", firstRow = T) # Stats full.df <- data.frame() for (sample in sinfo$Sample) { print(sample) t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t") colnames(t) <- c("Barcode", "Count", "SaturationPerc") tot_bars <- nrow(t) t$logCPM <- log1p(edgeR::cpm(t$Count)) t$Rank <- seq(1, nrow(t)) # Cut selection sel_cut <- select_cut(t) tot_sum <- sum(t$Count) t_sel <- subset(t, Rank <= t$Rank[sel_cut]) sel_bars <- nrow(t_sel) sel_sum <- sum(t_sel$Count) # Cut dominant ebow <- elbow_point(x = t$Rank, t$Count) dominant_cut <- ebow$idx t_dom <- subset(t, Rank <= t$Rank[dominant_cut]) dom_bars <- nrow(t_dom) dom_sum <- sum(t_dom$Count) mm <- as.character(sinfo[sinfo$Sample == sample, "Mouse"]) ct <- as.character(sinfo[sinfo$Sample == sample, "CellType"]) df <- data.frame("Sample" = c(sample), "Treatment" = c(TT), "Mouse" = c(mm), "CellType" = c(ct), "TotBarcodes" = c(tot_bars), "SumTotBarcodes" = c(tot_sum), "SelBarcodes" = c(sel_bars), "SelBarcodesPerc" = c(round((sel_bars/tot_bars)*100, 2)), "SumSelBarcodes" = c(sel_sum), "SumSelBarcodesPerc" = c(round((sel_sum/tot_sum)*100, 2)), "DomBarcodes" = c(dom_bars), "DomBarcodesPerc" = c(round((dom_bars/tot_bars)*100, 2)), "SumDomBarcodes" = c(dom_sum), "SumDomBarcodesPerc" = c(round((dom_sum/tot_sum)*100, 2)) ) full.df <- rbind(full.df, df) } write.xlsx(x = list("SampleStats" = full.df), file = paste(out_dir, "SampleStats.xlsx", sep = "/"), overwrite = T) ######################### ### Heatmap Selection ### ######################### full.t <- data.frame() full.t.hm <- data.frame() full.t.annot <- data.frame() for (sample in sinfo$Sample) { print(sample) t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t") colnames(t) <- c("Barcode", "Count", "SaturationPerc") t$Sample <- sample t$logCPM <- log1p(edgeR::cpm(t$Count)) t$Rank <- seq(1, nrow(t)) t$NormRank <- t$Rank / nrow(t) # Cut selection sel_cut <- select_cut(t) t$SelRank <- t$Rank[sel_cut] t$SelNormRank <- t$NormRank[sel_cut] full.t <- rbind(full.t, t) t.hm <- subset(t, Rank <= t$Rank[sel_cut]) %>% select(Barcode, Count, Sample, SaturationPerc) full.t.hm <- rbind(full.t.hm, t.hm) for (r in 1:nrow(t.hm)) { if (nrow(full.t.annot) == 0) { full.t.annot <- t.hm[r, c("Barcode", "Sample"), drop = FALSE] } else { bcode <- t.hm[r, "Barcode"] if (!(bcode %in% full.t.annot$Barcode)) { full.t.annot <- rbind(full.t.annot, t.hm[r, c("Barcode", "Sample"), drop = FALSE]) } } } } full.t <- merge(full.t, sinfo, by = "Sample") full.t.hm$SaturationPerc <- NULL full.tt.hm <- dcast(full.t.hm, Barcode ~ Sample, value.var = "Count") rownames(full.tt.hm) <- full.tt.hm$Barcode full.tt.hm$Barcode <- NULL full.tt.hm[is.na(full.tt.hm)] <- 0 # logCPM full.tt.hm.cpm <- edgeR::cpm(full.tt.hm) full.tt.hm.cpm <- log1p(full.tt.hm.cpm) full.tt.hm.cpm <- full.tt.hm.cpm[full.t.annot$Barcode, sinfo$Sample] # log Abundance full.tt.hm <- log1p(full.tt.hm) full.tt.hm <- full.tt.hm[full.t.annot$Barcode, sinfo$Sample] annot <- sinfo rownames(annot) <- annot$Sample mm_num <- paste0("Mouse", seq(1,length(levels(annot$Mouse)))) names(mm_num) <- levels(annot$Mouse) annot$MouseName <- factor(mm_num[annot$Mouse], levels = paste0("Mouse", seq(1,length(levels(annot$Mouse))))) ct_colors <- brewer.pal(length(unique(annot$CellType)), "Set2") names(ct_colors) <- unique(annot$CellType) ha_col <- columnAnnotation(df = annot[, "CellType", drop = F], col = list(CellType = ct_colors), show_annotation_name = F) hm = Heatmap(matrix = as.matrix(full.tt.hm), name = "log(Count)", col = colorRampPalette(brewer.pal(n = 9, name ="OrRd"))(100), cluster_columns = FALSE, cluster_rows = FALSE, rect_gp = gpar(col = "grey", lwd = 0), border = T, show_row_names = FALSE, show_column_names = TRUE, column_names_gp = gpar(fontsize = 9), column_split = annot$MouseName, column_title_side = "top", column_title_gp = gpar(fontsize = 9), column_gap = unit(c(rep(.1, 4), 2, rep(.1, 3), 2, rep(.1, 2), 2, rep(.1, 4)), "mm"), top_annotation = ha_col, use_raster = TRUE ) pdf(paste(out_dir, "Full_HM_selected.pdf", sep = "/"), width = 16, height = 10) draw(hm, merge_legends = TRUE) dev.off() hm = Heatmap(matrix = as.matrix(full.tt.hm.cpm), name = "logCPM", col = colorRampPalette(brewer.pal(n = 9, name ="OrRd"))(100), cluster_columns = FALSE, cluster_rows = FALSE, rect_gp = gpar(col = "grey", lwd = 0), border = T, show_row_names = FALSE, show_column_names = TRUE, column_names_gp = gpar(fontsize = 9), column_split = annot$MouseName, column_title_side = "top", column_title_gp = gpar(fontsize = 9), column_gap = unit(c(rep(.1, 4), 2, rep(.1, 3), 2, rep(.1, 2), 2, rep(.1, 4)), "mm"), top_annotation = ha_col, use_raster = TRUE ) pdf(paste(out_dir, "Full_HM_logCPM_selected.pdf", sep = "/"), width = 16, height = 10) draw(hm, merge_legends = TRUE) dev.off() ####################### ### Barcode Sharing ### ####################### ### Selected ### df_count <- data.frame() for (sample in sinfo$Sample) { print(sample) t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t") colnames(t) <- c("Barcode", "Count", "SaturationPerc") t$logCPM <- log1p(edgeR::cpm(t$Count)) t$Rank <- seq(1, nrow(t)) t$NormRank <- t$Rank / nrow(t) # Cut selection sel_cut <- select_cut(t) t <- subset(t, Rank <= t$Rank[sel_cut]) t_count <- t[, c("Barcode", "Count")] colnames(t_count) <- c("Barcode", sample) if(nrow(df_count) == 0) { df_count <- t_count } else { df_count <- merge(df_count, t_count, by = "Barcode", all = T) } } tt <- df_count tt[is.na(tt)] <- 0 rownames(tt) <- tt$Barcode tt$Barcode <- NULL present = !is.na(tt) & tt > 0 intersect = crossprod(present) union = nrow(tt) - crossprod(!present) jacc_mat = intersect / union jacc_mat[lower.tri(jacc_mat)] <- NA jacc_mat_melt <- melt(jacc_mat) jacc_mat_melt <- merge(jacc_mat_melt, sinfo, by.y = "Sample", by.x = "Var1") p <- ggplot(jacc_mat_melt, aes(x = Var1, y = Var2, fill = value)) + theme_bw(base_size = 16) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 5)) + theme(axis.text.y = element_text(size = 5)) + theme(panel.spacing.x = unit(0, "mm")) + ylab("") + xlab("") + ggtitle("Full Selected - Jaccard Index") + geom_tile(width = .95, height = .95) + scale_fill_gradientn(limits = c(0.05,1),na.value = "grey90", name = "Jaccard", colours=c("navyblue", "darkmagenta", "darkorange1"), breaks = seq(0, 1, 0.2), labels = format(seq(0, 1, 0.2))) + facet_grid(. ~ Treatment, scales = "free", space = "free") zoom1_ds <- c(grep("_B0", jacc_mat_melt$Var1, value = T), grep("_B1", jacc_mat_melt$Var1, value = T)) zoom1 <- subset(jacc_mat_melt, Var1 %in% zoom1_ds & Var2 %in% zoom1_ds) zoom1$Treatment <- droplevels(zoom1$Treatment) p_zoom1 <- ggplot(zoom1, aes(x = Var1, y = Var2, fill = value)) + theme_bw() + theme(axis.text.x = element_blank()) + theme(axis.text.y = element_blank()) + theme(legend.position = "none") + theme( plot.background = element_rect(fill = "transparent", colour = NA), panel.border = element_rect(fill = "transparent", colour = "forestgreen", size = 2), plot.margin = unit(c(0, 0, 0, 0), "null"), axis.ticks = element_blank() ) + ylab("") + xlab("") + geom_tile(width = .95, height = .95) + scale_fill_gradientn(limits = c(0.05,1),na.value = "grey90", name = "Jaccard", colours=c("navyblue", "darkmagenta", "darkorange1"), breaks = seq(0, 1, 0.2), labels = format(seq(0, 1, 0.2))) zoom2_ds <- c(grep("_C1", jacc_mat_melt$Var1, value = T), grep("_C4", jacc_mat_melt$Var1, value = T)) zoom2 <- subset(jacc_mat_melt, Var1 %in% zoom2_ds & Var2 %in% zoom2_ds) p_zoom2 <- ggplot(zoom2, aes(x = Var1, y = Var2, fill = value)) + theme_bw() + theme(axis.text.x = element_blank()) + theme(axis.text.y = element_blank()) + theme(legend.position = "none") + theme( plot.background = element_rect(fill = "transparent", colour = NA), panel.border = element_rect(fill = "transparent", colour = "red", size = 2), plot.margin = unit(c(0, 0, 0, 0), "null"), axis.ticks = element_blank() ) + ylab("") + xlab("") + geom_tile(width = .95, height = .95) + scale_fill_gradientn(limits = c(0.05,1),na.value = "grey90", name = "Jaccard", colours=c("navyblue", "darkmagenta", "darkorange1"), breaks = seq(0, 1, 0.2), labels = format(seq(0, 1, 0.2))) ss <- 0.27 pdf(file = paste(out_dir, "Full_sharing_HM_selected.pdf", sep = "/"), width = 12, height = 10) grid.newpage() grid.draw(arrangeGrob(p)) grid.rect(x = unit(0.7, "npc"), y = unit(0.728, "npc"), width = unit(0.1, "npc"), height = unit(0.1, "npc"), gp=gpar(col = "forestgreen", lwd = 3)) grid.draw(grobTree(ggplotGrob(p_zoom1), vp=viewport(width = unit(ss, "npc"), height = unit(ss, "npc"), x = unit(0.74,"npc"),y = unit(0.48,"npc")))) grid.lines(x = unit(c(0.6245, 0.65), "npc"), y = unit(c(0.613, 0.677), "npc"), gp=gpar(col = "forestgreen", lwd = 3)) grid.lines(x = unit(c(0.874, 0.75), "npc"), y = unit(c(0.613, 0.677), "npc"), gp=gpar(col = "forestgreen", lwd = 3)) grid.rect(x = unit(0.418, "npc"), y = unit(0.44, "npc"), width = unit(0.1, "npc"), height = unit(0.1, "npc"), gp=gpar(col = "red", lwd = 3)) grid.draw(grobTree(ggplotGrob(p_zoom2), vp=viewport(width = unit(ss, "npc"), height = unit(ss, "npc"), x = unit(0.48,"npc"),y = unit(0.22,"npc")))) grid.lines(x = unit(c(0.47, 0.613), "npc"), y = unit(c(0.39, 0.355), "npc"), gp=gpar(col = "red", lwd = 3)) grid.lines(x = unit(c(0.368, 0.3645), "npc"), y = unit(c(0.39, 0.355), "npc"), gp=gpar(col = "red", lwd = 3)) dev.off()