Commit cb11c372 authored by Stefano Beretta's avatar Stefano Beretta
Browse files

Upload New File

parent 1583cb06
# BAR-seq Analysis
library(dplyr)
library(stringr)
library(ggplot2)
library(reshape2)
library(ComplexHeatmap)
library(scales)
library(circlize)
library(RColorBrewer)
library(edgeR)
library(openxlsx)
library(DESeq2)
library(gridExtra)
#######################
## Utility Functions ##
#######################
deriv <- function(x, y) diff(y) / diff(x)
middle_pts <- function(x) x[-1] - diff(x) / 2
elbow_point = function(x, y) {
# start and end point positions
wmin_x = which.min(x)
wmax_x = which.max(x)
# start point
x_start = x[wmin_x]
y_start = y[wmin_x]
# end point
x_end = x[wmax_x]
y_end = y[wmax_x]
# distance between x, y start and end
dx = x_end - x_start
dy = y_end - y_start
d_start_end = sqrt(dx^2 + dy^2)
# distance between each point in (x, y) and the line passing through the
# points (x_start, y_start) and (x_end, y_end)
d = abs(dx * (y_start - y) - (x_start - x) * dy) / d_start_end
wmax_d = which.max(d)
return(list(idx = wmax_d, x = x[wmax_d], y = y[wmax_d]))
}
# Cut selection
select_cut <- function(t) {
d1 <- deriv(t$Rank, t$Count)
d1log <- deriv(log10(t$Rank), log10(t$Count))
sd1_log <- which.min(diff(d1log)) + 1
return(sd1_log)
}
#############
## General ##
#############
wdir <- "Fiumara_BasePrimeEd2022_BARseq"
fname_suf <- "_edit_2.barcode.mc2.tsv"
data_dir <- paste(wdir, "02-barseq", sep = "/")
out_dir <- paste(wdir, "03-plots", sep = "/")
dir.create(path = out_dir, showWarnings = F)
sinfo <- data.frame("Sample" = c())
for (sample in list.dirs(path = data_dir, full.names = F)) {
if (sample == "") { next }
if (length(grep("clone", sample, value = T)) >= 1) { next }
print(sample)
sinfo <- rbind(sinfo, data.frame("Sample" = sample))
}
sinfo$CT <- str_split_fixed(sinfo$Sample, "_", 4)[,3]
sinfo <- mutate(sinfo, CellType = case_when(CT == "15w" ~ "PB 15Weeks",
CT == "BM" ~ "BM",
CT == "SPL" ~ "SPL",
CT == "9W" ~ "PB 9Weeks",
CT == "B" ~ "B cells",
CT == "M" ~ "Myeloid cells",
CT == "T" ~ "T cells",
CT == "CD34" ~ "CD34"))
sinfo$Mouse <- paste0("Mouse_", str_split_fixed(sinfo$Sample, "_", 4)[,4])
sinfo$Mouse <- gsub("Mouse_A", "BE4_", sinfo$Mouse)
sinfo$Mouse <- gsub("Mouse_B", "ABE8_", sinfo$Mouse)
sinfo$Mouse <-gsub("Mouse_C", "Cas9_", sinfo$Mouse)
sinfo$Mouse <- gsub("Mouse_D", "UTelectro_", sinfo$Mouse)
sinfo$Treatment <- str_split_fixed(sinfo$Mouse, "_", 2)[,1]
sinfo$Treatment <- factor(sinfo$Treatment, levels = c("UTelectro","Cas9","BE4","ABE8"))
sinfo$Mouse <- factor(sinfo$Mouse, levels = c("UTelectro_0","UTelectro_1","UTelectro_2","UTelectro_3","UTelectro_4",
"Cas9_0","Cas9_1","Cas9_4",
"BE4_0","BE4_1","BE4_2","BE4_3",
"ABE8_0","ABE8_1","ABE8_2","ABE8_3","ABE8_4"
))
sinfo$CellType <- factor(sinfo$CellType, levels = c("PB 9Weeks","PB 15Weeks","BM","SPL","CD34","B cells","Myeloid cells","T cells"))
sinfo <- sinfo[order(sinfo$Treatment, sinfo$Mouse, sinfo$CellType),]
###################
### Export XLSX ###
###################
df_count <- data.frame()
df_logcpm <- data.frame()
df_count_selected <- data.frame()
df_logcpm_selected <- data.frame()
df_count_dominant <- data.frame()
df_logcpm_dominant <- data.frame()
for (sample in sinfo$Sample) {
print(sample)
t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t")
colnames(t) <- c("Barcode", "Count", "SaturationPerc")
t$logCPM <- log1p(edgeR::cpm(t$Count))
t_count <- t[, c("Barcode", "Count")]
colnames(t_count) <- c("Barcode", sample)
t_logcpm <- t[, c("Barcode", "logCPM")]
colnames(t_logcpm) <- c("Barcode", sample)
t$Rank <- seq(1, nrow(t))
# Selection
sel_cut <- select_cut(t)
t_sel <- subset(t, Rank <= t$Rank[sel_cut])
t_count_sel <- t_sel[, c("Barcode", "Count")]
colnames(t_count_sel) <- c("Barcode", paste0(sample, ":rank", t$Rank[sel_cut]))
t_logcpm_sel <- t_sel[, c("Barcode", "logCPM")]
colnames(t_logcpm_sel) <- c("Barcode", paste0(sample, ":rank", t$Rank[sel_cut]))
# Dominant
ebow <- elbow_point(x = t$Rank, t$Count)
dominant_cut <- ebow$idx
t_dom <- subset(t, Rank <= t$Rank[dominant_cut])
t_count_dom <- t_dom[, c("Barcode", "Count")]
colnames(t_count_dom) <- c("Barcode", paste0(sample, ":rank", t$Rank[dominant_cut]))
t_logcpm_dom <- t_dom[, c("Barcode", "logCPM")]
colnames(t_logcpm_dom) <- c("Barcode", paste0(sample, ":rank", t$Rank[dominant_cut]))
if(nrow(df_count) == 0) {
df_count <- t_count
df_logcpm <- t_logcpm
df_count_selected <- t_count_sel
df_logcpm_selected <- t_logcpm_sel
df_count_dominant <- t_count_dom
df_logcpm_dominant <- t_logcpm_dom
} else {
df_count <- merge(df_count, t_count, by = "Barcode", all = T)
df_logcpm <- merge(df_logcpm, t_logcpm, by = "Barcode", all = T)
df_count_selected <- merge(df_count_selected, t_count_sel, by = "Barcode", all = T)
df_logcpm_selected <- merge(df_logcpm_selected, t_logcpm_sel, by = "Barcode", all = T)
df_count_dominant <- merge(df_count_dominant, t_count_dom, by = "Barcode", all = T)
df_logcpm_dominant <- merge(df_logcpm_dominant, t_logcpm_dom, by = "Barcode", all = T)
}
}
write.xlsx(x = list("Count" = df_count, "logCPM" = df_logcpm,
"Count_Selected" = df_count_selected, "logCPM_Selected" = df_logcpm_selected,
"Count_Dominant" = df_count_dominant, "logCPM_Dominant" = df_logcpm_dominant),
file = paste(out_dir, "Full_barcodes.xlsx", sep = "/"), overwrite = T,
asTable = T, colWidths = "auto", firstRow = T)
# Stats
full.df <- data.frame()
for (sample in sinfo$Sample) {
print(sample)
t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t")
colnames(t) <- c("Barcode", "Count", "SaturationPerc")
tot_bars <- nrow(t)
t$logCPM <- log1p(edgeR::cpm(t$Count))
t$Rank <- seq(1, nrow(t))
# Cut selection
sel_cut <- select_cut(t)
tot_sum <- sum(t$Count)
t_sel <- subset(t, Rank <= t$Rank[sel_cut])
sel_bars <- nrow(t_sel)
sel_sum <- sum(t_sel$Count)
# Cut dominant
ebow <- elbow_point(x = t$Rank, t$Count)
dominant_cut <- ebow$idx
t_dom <- subset(t, Rank <= t$Rank[dominant_cut])
dom_bars <- nrow(t_dom)
dom_sum <- sum(t_dom$Count)
mm <- as.character(sinfo[sinfo$Sample == sample, "Mouse"])
ct <- as.character(sinfo[sinfo$Sample == sample, "CellType"])
df <- data.frame("Sample" = c(sample), "Treatment" = c(TT),
"Mouse" = c(mm), "CellType" = c(ct),
"TotBarcodes" = c(tot_bars), "SumTotBarcodes" = c(tot_sum),
"SelBarcodes" = c(sel_bars), "SelBarcodesPerc" = c(round((sel_bars/tot_bars)*100, 2)),
"SumSelBarcodes" = c(sel_sum), "SumSelBarcodesPerc" = c(round((sel_sum/tot_sum)*100, 2)),
"DomBarcodes" = c(dom_bars), "DomBarcodesPerc" = c(round((dom_bars/tot_bars)*100, 2)),
"SumDomBarcodes" = c(dom_sum), "SumDomBarcodesPerc" = c(round((dom_sum/tot_sum)*100, 2))
)
full.df <- rbind(full.df, df)
}
write.xlsx(x = list("SampleStats" = full.df), file = paste(out_dir, "SampleStats.xlsx", sep = "/"), overwrite = T)
#########################
### Heatmap Selection ###
#########################
full.t <- data.frame()
full.t.hm <- data.frame()
full.t.annot <- data.frame()
for (sample in sinfo$Sample) {
print(sample)
t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t")
colnames(t) <- c("Barcode", "Count", "SaturationPerc")
t$Sample <- sample
t$logCPM <- log1p(edgeR::cpm(t$Count))
t$Rank <- seq(1, nrow(t))
t$NormRank <- t$Rank / nrow(t)
# Cut selection
sel_cut <- select_cut(t)
t$SelRank <- t$Rank[sel_cut]
t$SelNormRank <- t$NormRank[sel_cut]
full.t <- rbind(full.t, t)
t.hm <- subset(t, Rank <= t$Rank[sel_cut]) %>% select(Barcode, Count, Sample, SaturationPerc)
full.t.hm <- rbind(full.t.hm, t.hm)
for (r in 1:nrow(t.hm)) {
if (nrow(full.t.annot) == 0) {
full.t.annot <- t.hm[r, c("Barcode", "Sample"), drop = FALSE]
} else {
bcode <- t.hm[r, "Barcode"]
if (!(bcode %in% full.t.annot$Barcode)) {
full.t.annot <- rbind(full.t.annot, t.hm[r, c("Barcode", "Sample"), drop = FALSE])
}
}
}
}
full.t <- merge(full.t, sinfo, by = "Sample")
full.t.hm$SaturationPerc <- NULL
full.tt.hm <- dcast(full.t.hm, Barcode ~ Sample, value.var = "Count")
rownames(full.tt.hm) <- full.tt.hm$Barcode
full.tt.hm$Barcode <- NULL
full.tt.hm[is.na(full.tt.hm)] <- 0
# logCPM
full.tt.hm.cpm <- edgeR::cpm(full.tt.hm)
full.tt.hm.cpm <- log1p(full.tt.hm.cpm)
full.tt.hm.cpm <- full.tt.hm.cpm[full.t.annot$Barcode, sinfo$Sample]
# log Abundance
full.tt.hm <- log1p(full.tt.hm)
full.tt.hm <- full.tt.hm[full.t.annot$Barcode, sinfo$Sample]
annot <- sinfo
rownames(annot) <- annot$Sample
mm_num <- paste0("Mouse", seq(1,length(levels(annot$Mouse))))
names(mm_num) <- levels(annot$Mouse)
annot$MouseName <- factor(mm_num[annot$Mouse], levels = paste0("Mouse", seq(1,length(levels(annot$Mouse)))))
ct_colors <- brewer.pal(length(unique(annot$CellType)), "Set2")
names(ct_colors) <- unique(annot$CellType)
ha_col <- columnAnnotation(df = annot[, "CellType", drop = F],
col = list(CellType = ct_colors),
show_annotation_name = F)
hm = Heatmap(matrix = as.matrix(full.tt.hm),
name = "log(Count)",
col = colorRampPalette(brewer.pal(n = 9, name ="OrRd"))(100),
cluster_columns = FALSE,
cluster_rows = FALSE,
rect_gp = gpar(col = "grey", lwd = 0),
border = T,
show_row_names = FALSE,
show_column_names = TRUE,
column_names_gp = gpar(fontsize = 9),
column_split = annot$MouseName,
column_title_side = "top",
column_title_gp = gpar(fontsize = 9),
column_gap = unit(c(rep(.1, 4), 2, rep(.1, 3), 2, rep(.1, 2), 2, rep(.1, 4)), "mm"),
top_annotation = ha_col,
use_raster = TRUE
)
pdf(paste(out_dir, "Full_HM_selected.pdf", sep = "/"), width = 16, height = 10)
draw(hm, merge_legends = TRUE)
dev.off()
hm = Heatmap(matrix = as.matrix(full.tt.hm.cpm),
name = "logCPM",
col = colorRampPalette(brewer.pal(n = 9, name ="OrRd"))(100),
cluster_columns = FALSE,
cluster_rows = FALSE,
rect_gp = gpar(col = "grey", lwd = 0),
border = T,
show_row_names = FALSE,
show_column_names = TRUE,
column_names_gp = gpar(fontsize = 9),
column_split = annot$MouseName,
column_title_side = "top",
column_title_gp = gpar(fontsize = 9),
column_gap = unit(c(rep(.1, 4), 2, rep(.1, 3), 2, rep(.1, 2), 2, rep(.1, 4)), "mm"),
top_annotation = ha_col,
use_raster = TRUE
)
pdf(paste(out_dir, "Full_HM_logCPM_selected.pdf", sep = "/"), width = 16, height = 10)
draw(hm, merge_legends = TRUE)
dev.off()
#######################
### Barcode Sharing ###
#######################
### Selected ###
df_count <- data.frame()
for (sample in sinfo$Sample) {
print(sample)
t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t")
colnames(t) <- c("Barcode", "Count", "SaturationPerc")
t$logCPM <- log1p(edgeR::cpm(t$Count))
t$Rank <- seq(1, nrow(t))
t$NormRank <- t$Rank / nrow(t)
# Cut selection
sel_cut <- select_cut(t)
t <- subset(t, Rank <= t$Rank[sel_cut])
t_count <- t[, c("Barcode", "Count")]
colnames(t_count) <- c("Barcode", sample)
if(nrow(df_count) == 0) {
df_count <- t_count
} else {
df_count <- merge(df_count, t_count, by = "Barcode", all = T)
}
}
tt <- df_count
tt[is.na(tt)] <- 0
rownames(tt) <- tt$Barcode
tt$Barcode <- NULL
present = !is.na(tt) & tt > 0
intersect = crossprod(present)
union = nrow(tt) - crossprod(!present)
jacc_mat = intersect / union
jacc_mat[lower.tri(jacc_mat)] <- NA
jacc_mat_melt <- melt(jacc_mat)
jacc_mat_melt <- merge(jacc_mat_melt, sinfo, by.y = "Sample", by.x = "Var1")
p <- ggplot(jacc_mat_melt, aes(x = Var1, y = Var2, fill = value)) +
theme_bw(base_size = 16) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 5)) +
theme(axis.text.y = element_text(size = 5)) +
theme(panel.spacing.x = unit(0, "mm")) +
ylab("") +
xlab("") +
ggtitle("Full Selected - Jaccard Index") +
geom_tile(width = .95, height = .95) +
scale_fill_gradientn(limits = c(0.05,1),na.value = "grey90", name = "Jaccard",
colours=c("navyblue", "darkmagenta", "darkorange1"),
breaks = seq(0, 1, 0.2), labels = format(seq(0, 1, 0.2))) +
facet_grid(. ~ Treatment, scales = "free", space = "free")
zoom1_ds <- c(grep("_B0", jacc_mat_melt$Var1, value = T), grep("_B1", jacc_mat_melt$Var1, value = T))
zoom1 <- subset(jacc_mat_melt, Var1 %in% zoom1_ds & Var2 %in% zoom1_ds)
zoom1$Treatment <- droplevels(zoom1$Treatment)
p_zoom1 <- ggplot(zoom1, aes(x = Var1, y = Var2, fill = value)) +
theme_bw() +
theme(axis.text.x = element_blank()) +
theme(axis.text.y = element_blank()) +
theme(legend.position = "none") +
theme(
plot.background = element_rect(fill = "transparent", colour = NA),
panel.border = element_rect(fill = "transparent", colour = "forestgreen", size = 2),
plot.margin = unit(c(0, 0, 0, 0), "null"),
axis.ticks = element_blank()
) +
ylab("") +
xlab("") +
geom_tile(width = .95, height = .95) +
scale_fill_gradientn(limits = c(0.05,1),na.value = "grey90", name = "Jaccard",
colours=c("navyblue", "darkmagenta", "darkorange1"),
breaks = seq(0, 1, 0.2), labels = format(seq(0, 1, 0.2)))
zoom2_ds <- c(grep("_C1", jacc_mat_melt$Var1, value = T), grep("_C4", jacc_mat_melt$Var1, value = T))
zoom2 <- subset(jacc_mat_melt, Var1 %in% zoom2_ds & Var2 %in% zoom2_ds)
p_zoom2 <- ggplot(zoom2, aes(x = Var1, y = Var2, fill = value)) +
theme_bw() +
theme(axis.text.x = element_blank()) +
theme(axis.text.y = element_blank()) +
theme(legend.position = "none") +
theme(
plot.background = element_rect(fill = "transparent", colour = NA),
panel.border = element_rect(fill = "transparent", colour = "red", size = 2),
plot.margin = unit(c(0, 0, 0, 0), "null"),
axis.ticks = element_blank()
) +
ylab("") +
xlab("") +
geom_tile(width = .95, height = .95) +
scale_fill_gradientn(limits = c(0.05,1),na.value = "grey90", name = "Jaccard",
colours=c("navyblue", "darkmagenta", "darkorange1"),
breaks = seq(0, 1, 0.2), labels = format(seq(0, 1, 0.2)))
ss <- 0.27
pdf(file = paste(out_dir, "Full_sharing_HM_selected.pdf", sep = "/"), width = 12, height = 10)
grid.newpage()
grid.draw(arrangeGrob(p))
grid.rect(x = unit(0.7, "npc"), y = unit(0.728, "npc"),
width = unit(0.1, "npc"), height = unit(0.1, "npc"),
gp=gpar(col = "forestgreen", lwd = 3))
grid.draw(grobTree(ggplotGrob(p_zoom1),
vp=viewport(width = unit(ss, "npc"), height = unit(ss, "npc"),
x = unit(0.74,"npc"),y = unit(0.48,"npc"))))
grid.lines(x = unit(c(0.6245, 0.65), "npc"),
y = unit(c(0.613, 0.677), "npc"),
gp=gpar(col = "forestgreen", lwd = 3))
grid.lines(x = unit(c(0.874, 0.75), "npc"),
y = unit(c(0.613, 0.677), "npc"),
gp=gpar(col = "forestgreen", lwd = 3))
grid.rect(x = unit(0.418, "npc"), y = unit(0.44, "npc"),
width = unit(0.1, "npc"), height = unit(0.1, "npc"),
gp=gpar(col = "red", lwd = 3))
grid.draw(grobTree(ggplotGrob(p_zoom2),
vp=viewport(width = unit(ss, "npc"), height = unit(ss, "npc"),
x = unit(0.48,"npc"),y = unit(0.22,"npc"))))
grid.lines(x = unit(c(0.47, 0.613), "npc"),
y = unit(c(0.39, 0.355), "npc"),
gp=gpar(col = "red", lwd = 3))
grid.lines(x = unit(c(0.368, 0.3645), "npc"),
y = unit(c(0.39, 0.355), "npc"),
gp=gpar(col = "red", lwd = 3))
dev.off()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment