HTO_demux.R

library(scales)
library(viridis)
library(Seurat)
library(stringr)
library(tibble)
library(dplyr)
library(patchwork)
library(ggplot2)
library(tidyr)
library(RColorBrewer)
library(openxlsx)
library(cowplot)

args <- commandArgs(trailingOnly = TRUE)


#########################################################################################
# KEEP ONLY BARCODE USED IN EXPERIMENT FROM FILE, NORMALIZE HTO AND DEMUX
#########################################################################################
filterbarcodes <- function(object_seurat,i,marg.norm) {
  cat('## FILTER BACRODES')
  counts <- GetAssayData(object_seurat, slot="counts", assay="HTO")   
  cat('number of cells for each HTO ')
  cat(rowSums(data.frame(counts)))
  
  hto.filter <-  str_split(hto_conditions[hto_conditions$sample == sampleid, 'HTO'], ',')[[1]] #hto_conditions[[sampleid]] 
  counts.sub <- counts[hto.filter,]
  
  ##filter and update number of cells per sample
  hto.assay <- CreateSeuratObject(counts=counts.sub)
  object_seurat@assays$HTO <- NULL  ### to use if you want to subset HTO
  object_seurat@assays$HTO <- CreateAssayObject(counts =  counts.sub)  ### ### to use if you want to subset HTO
  object_seurat@assays$HTO@key <- 'hto_'
  cat('Normalize HTO data')
  scrna.hashtag <- NormalizeData(object_seurat, assay = "HTO",normalization.method = "CLR", margin = marg.norm) 
  scrna.hashtag <- HTODemux(scrna.hashtag, assay = "HTO", positive.quantile = i,  seed = 42)
  
  ## rerturn list of barcodes and obj demultiplexed
  return(list(sampleid,
              obj_demux=scrna.hashtag,
              barcodes=rownames(scrna.hashtag@assays$HTO)))
  
}


#########################################################################################
# CHOOSE POSITIVE QUANTILE PARAMETER
#########################################################################################
eval_pq <- function(out_dir, obj.combined, sampleid, library_id, min_feature, max_feature, mito.prefix, max_pc_mito, marg.norm) {
eval_pq_out_list <- list()

  #cat(sampleid)
  eval_pq_out <- tibble()
  #obj.combined = readRDS(paste(folder,sampleid,paste0(sampleid,'_minimal.rds'), sep='/'))
  
 DefaultAssay(obj.combined) <- 'RNA'
 print(DefaultAssay(obj.combined))
 
 cat('filter cells', mito.prefix)
 # Compute mito % 
 obj.combined[["percent.mt"]] <- PercentageFeatureSet(obj.combined, assay = 'RNA', pattern = mito.prefix)
 obj  <- obj.combined %>%
  subset(
    nFeature_RNA > min_feature &    ### Remove cells with < 250 detected genes
      nFeature_RNA < max_feature &   ### Remove cells with > 2500 detected genes (could be doublets)
      percent.mt < max_pc_mito  ### Remove cells with > 0.15 mito/total reads
  )

# filtering object based on hto count to zero
DefaultAssay(obj) <- 'HTO'
print(DefaultAssay(obj))
obj <- subset(obj,subset = nCount_HTO != 0 )

cat('find pq')
for (i in seq(from = 0.9, to = 0.999999999, by = 0.005)) {
  cat(' demultiplexing cells based on HTO')
  
  x <- filterbarcodes(obj, i, marg.norm)
  
  
  object <- x$obj_demux
  cat('save pq')
  HTO_classification <- as.data.frame.matrix(table(
    object@meta.data$HTO_maxID,
    object@meta.data$HTO_classification.global
  )) %>%
    summarise(max.singlet = sum(Singlet)) %>%
    mutate(threshold = i)
  
  eval_pq_out <- bind_rows(eval_pq_out, HTO_classification) %>%
    mutate(library = sampleid)
}
eval_pq_out_list <- rbind( eval_pq_out_list, eval_pq_out)

result_list <- list(
"eval_pq_out" =  eval_pq_out_list
)

return(result_list)
}

obj_minimal <- args[1] # Sample minimal object
obj_combined <- readRDS(obj_minimal)
out_dir <- args[2]      # Path to output result folder (results)
sampleid <- args[3]     #Sample id
#min_cells <- as.numeric(args[4])
min_feature <- as.numeric(args[4]) 
max_feature <- as.numeric(args[5]) 
mito.prefix = args[6]
max_pc_mito <-  as.numeric(args[7])
marg.norm <- as.numeric(args[8])
hto <- args[9]
cat('########## mito prefix  #######')
cat(mito.prefix)
hto_conditions <- read.table(hto, header = TRUE, stringsAsFactors = FALSE) # Table with HTO list per sample

## Evaluate positive quantile parameter
eval_pq_lib1 <- eval_pq(out_dir, obj_combined, sampleid, hto_conditions, min_feature, max_feature, mito.prefix, max_pc_mito, marg.norm)

## Plot number of Singlet detected 
pq <- eval_pq_lib1$eval_pq_out %>%
  ggplot(aes(x = threshold, y = max.singlet, fill = library, color = library)) +
  geom_line() +
  geom_point() +
  scale_color_brewer(palette = "Set1", direction = -1) +
  labs(x = "Postive Quantile", y = "# Singlet cells", fill = "", color = "") +
  theme_bw()


out_dir_sample <- paste(out_dir, sampleid, 'plots',sep = "/")
dir.create(out_dir_sample, showWarnings = FALSE)

png(filename = paste(out_dir_sample, paste0(sampleid,"_max_threshod_pq_margin_",as.character(marg.norm),'.png'), sep = '/'),
    width = 12, height = 9, units = "in", res = 96)
print(pq)
dev.off()

## save threshold
dx <- data.frame(eval_pq_lib1$eval_pq_out)
dx$library <- as.factor(dx$library)
max_threshold <- merge(aggregate(max.singlet ~ library, data=dx, max), dx, all.x=T)
write.table(max_threshold,paste(out_dir_sample, paste0(sampleid,"_max_threshod_pq_margin_",as.character(marg.norm),".txt"), sep = '/'))

# #####################################################################################
# # After computing best pq with higher number of singlets perform demux with max value
# #####################################################################################
cat(sampleid)
maxi <- max_threshold[max_threshold$library == sampleid, 'threshold']

# sample minimal object
obj <- readRDS(obj_minimal)

### Add mitochondrial percentage to meta.data table
DefaultAssay(obj) <- 'RNA'
print(DefaultAssay(obj))

# Compute mito %
obj[["percent.mt"]] <- PercentageFeatureSet(obj, assay = 'RNA', pattern = mito.prefix)
obj  <- obj %>%
  subset(
    nFeature_RNA > min_feature &    ### Remove cells with < 250 detected genes
      nFeature_RNA < max_feature &   ### Remove cells with > 2500 detected genes (could be doublets)
      percent.mt < max_pc_mito  ### Remove cells with > 0.15 mito/total reads
  )

# filtering object based on hto count to zero
DefaultAssay(obj) <- 'HTO'

print(DefaultAssay(obj))
obj <- subset(obj,subset = nCount_HTO != 0 )

table(obj$orig.ident)

cat('Demultiplexing cells based on HTO')

x <- filterbarcodes(obj, maxi, marg.norm)

scrna.hashtag <- x$obj_demux
rowSums(data.frame(scrna.hashtag@assays$HTO@counts))

# Save demux Seurat object
#oiut_dir_sample <- paste(out_dir, sampleid, sep = "/")
#dir.create(out_dir_sample, showWarnings = FALSE)
DefaultAssay(scrna.hashtag) <- "RNA"
out_dir <- paste(out_dir, sampleid, sep = '/')
saveRDS(object = scrna.hashtag, file = paste(out_dir, paste(sampleid, "demux_minimal.rds", sep = "_"), sep = "/"))

cat(x$barcodes)

cat('visualize demultiplexing - Global classification results', sampleid)
class_hto <- list()
class_hto[[sampleid]] <- table(scrna.hashtag$HTO_classification)
class_hto[[sampleid]]

class_htoglobal <- list()
class_htoglobal[[sampleid]] <- table(scrna.hashtag$HTO_classification.global)
class_htoglobal[[sampleid]]

cat('Group cells based on the max HTO signal')
Idents(scrna.hashtag) <- "HTO_maxID"
DefaultAssay(scrna.hashtag) <- "HTO"
rdgplot <- scrna.hashtag %>%
    RidgePlot(
      assay    = "HTO",
      features = rownames(scrna.hashtag),
      ncol     = 2)

png(filename = paste(out_dir_sample, paste0(sampleid,"_ridgeplot.png"), sep = '/'),
    width = 12, height = 9, units = "in", res = 96)
print(rdgplot)
dev.off()

### Calculate doublet rate
scrna.hashtag@meta.data %>%
    group_by(hash.ID) %>%
    summarize(fract = n() / nrow(.))
### Create bar graphs comparing cell count for each sample
HTO_bars_1 <- scrna.hashtag@meta.data %>%
    rownames_to_column("cell_id") %>%
    ggplot(aes(hash.ID, fill = hash.ID)) +
    geom_bar() +
    labs(y = "Cell Count") +
    cowplot::theme_cowplot() +
    theme(
      legend.position = "none",
      axis.title.x    = element_blank(),
      axis.text.x     = element_text(hjust = 1, angle = 45)
    )
### Create stacked bar graph showing fraction of cells
HTO_bars_2 <- scrna.hashtag@meta.data %>%
    rownames_to_column("cell_id") %>%
    group_by(hash.ID) %>%
    summarize(hash_frac = n() / nrow(.)) %>%
    ggplot(aes("scRNA-seq", hash_frac, fill = hash.ID)) +
    geom_bar(stat = "identity", color = "white", size = 0.5) +
    labs(y = "Fraction of Cells", x= sampleid) +
    cowplot::theme_cowplot() +
    theme(
      legend.title = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x  = element_blank(),
      axis.ticks.x = element_blank()
    )
  ### Combine plots
  ### now add the title
  # title <- ggdraw() +
  #   draw_label(
  #     sampleid,
  #     fontface = 'bold',
  #     x = 0,
  #     hjust = 0
  #   ) +
  #   theme(
  #     ### add margin on the left of the drawing canvas,
  #     ### so title is aligned with left edge of first plot
  #     plot.margin = margin(0, 0, 0, 7)
  #   )
x <- plot_grid(#title,
    HTO_bars_1, HTO_bars_2,
    nrow        = 1,
    rel_widths  = c(0.5, 0.5),
    rel_heights = c(1, 0.8)
  )

png(filename = paste(out_dir_sample, paste0(sampleid,"_HTO_bars.png"), sep = '/'),
    width = 12, height = 9, units = "in", res = 96)
print(x)
dev.off()

# cell count by hto classification
scrna.hashtag@meta.data %>%
    group_by(HTO_classification) %>%
    summarize(fract = n() / nrow(.))

### Create bar graphs comparing cell count by HTO classification
HTO_bars_3 <- scrna.hashtag@meta.data %>%
    rownames_to_column("cell_id") %>%
    ggplot(aes(HTO_classification, fill = HTO_classification)) +
    geom_bar() +
    labs(y = "Cell Count") +
    cowplot::theme_cowplot() +
    theme(
      legend.position = "none",
      axis.title.x    = element_blank(),
      axis.text.x     = element_text(hjust = 1, angle = 45)
    )
### Create stacked bar graph showing fraction of cells
HTO_bars_4 <- scrna.hashtag@meta.data %>%
    rownames_to_column("cell_id") %>%
    group_by(HTO_classification) %>%
    summarize(hash_frac = n() / nrow(.)) %>%
    ggplot(aes("scRNA-seq", hash_frac, fill = HTO_classification)) +
    geom_bar(stat = "identity", color = "white", size = 0.5) +
    labs(y = "Fraction of Cells", x= sampleid) +
    cowplot::theme_cowplot() +
    theme(
      legend.title = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x  = element_blank(),
      axis.ticks.x = element_blank()
    )
### Combine plots
x <- plot_grid(#title,
    HTO_bars_3, HTO_bars_4,
    nrow        = 1,
    rel_widths  = c(0.5, 0.5),
    rel_heights = c(1, 0.8)
  )

png(filename = paste(out_dir_sample, paste0(sampleid,"_HTO_bars_classification.png"), sep = '/'),
      width = 12, height = 9, units = "in", res = 96)
print(x)
dev.off()

fs <- FeatureScatter(scrna.hashtag, feature1 = "hto_H1", feature2 = "hto_H2")

png(filename = paste(out_dir_sample, paste0(sampleid,"_HTO_1_2_featurescatter.png"), sep = '/'),
    width = 12, height = 9, units = "in", res = 96)
print(fs)
dev.off()


Idents(scrna.hashtag) <- "HTO_classification.global"
vnlp <- VlnPlot(scrna.hashtag, features = "nCount_RNA", pt.size = 0.1, log = TRUE)

png(filename = paste(out_dir_sample, paste0(sampleid,"_ncount_RNA.png"), sep = '/'),
      width = 12, height = 9, units = "in", res = 96)
print(vnlp)
dev.off()

cat('remove negative cells from the object')
scrna.hashtag.subset <- subset(scrna.hashtag, idents = "Negative", invert = TRUE)

cat('Calculate a tSNE embedding of the HTO data')
DefaultAssay(scrna.hashtag.subset) <- "HTO"
scrna.hashtag.subset <- ScaleData(scrna.hashtag.subset, features = rownames(scrna.hashtag.subset),
                                    verbose = FALSE)
scrna.hashtag.subset <- RunPCA(scrna.hashtag.subset, features = rownames(scrna.hashtag.subset), approx = FALSE)

# Calculate a distance matrix using HTO
hto.dist.mtx <- as.matrix(dist(t(GetAssayData(object = scrna.hashtag.subset, assay = "HTO"))))

scrna.hashtag.subset <- RunTSNE(scrna.hashtag.subset, dims = 1:length( hto_conditions[[sampleid]]), perplexity = 100,distance.matrix = hto.dist.mtx,
                                check_duplicates = FALSE)


Idents(scrna.hashtag.subset) <- 'HTO_classification'
pdim_hto <- DimPlot(scrna.hashtag.subset,label=TRUE)

png(filename = paste(out_dir_sample, paste0(sampleid,"_HTO_classification_dimplot.png"), sep = '/'),
      width = 12, height = 9, units = "in", res = 96)
print(pdim_hto)
dev.off()

Idents(scrna.hashtag.subset) <- 'HTO_classification.global'
pdim_hto_cl <- DimPlot(scrna.hashtag.subset,label=TRUE)
png(filename = paste(out_dir_sample, paste0(sampleid,"_HTO_classification_global_dimplot.png"), sep = '/'),
      width = 12, height = 9, units = "in", res = 96)
print(pdim_hto_cl)
dev.off()

htohm <- HTOHeatmap(scrna.hashtag.subset, assay = "HTO", ncells = 5000)
png(filename = paste(out_dir_sample, paste0(sampleid,"_HTO_heatmap.png"), sep = '/'),
      width = 12, height = 9, units = "in", res = 96)
print(htohm)
dev.off()


## count all barcodes
cat('count HTO reads')
count_HTO_reads <- list()
count_HTO_reads[[sampleid]] <- rowSums(data.frame(obj@assays$HTO@counts))
count_HTO_reads[[sampleid]] 

###cat('Extract the singlets')
scrna.hashtag.singlet <- subset(scrna.hashtag.subset, idents = "Singlet")
count_HTO_reads_singlet <- list()
count_HTO_reads_singlet[[sampleid]] <- rowSums(data.frame(scrna.hashtag.singlet@assays$HTO@counts))
cat('count HTO singlet reads\n')
count_HTO_reads_singlet[[sampleid]]


Idents(scrna.hashtag.singlet) <- "HTO_classification"
count_HTO_reads_singlet_HTO <- list()
count_HTO_reads_singlet_HTO_mean <- list()
for (n in rownames(scrna.hashtag.singlet@assays$HTO@counts)){
    x <- subset(scrna.hashtag.singlet, idents = n  )
    cat('hto', n)
    print(rowSums(data.frame(x@assays$HTO@counts)))
    ## count of reads HTO in cell classified as specific HTO
    count_HTO_reads_singlet_HTO[[sampleid]][n] <- rowSums(data.frame(x@assays$HTO@counts))[n]
    ## MEAN read count per hashatg over cell classified as specific-HTO
    count_HTO_reads_singlet_HTO_mean[[sampleid]][n] <- unname(rowSums(data.frame(x@assays$HTO@counts))[n])/unname(class_hto[[sampleid]][n])
  }

cat('count HTO reads, only in cell classified as singlet for specific HTO\n')
count_HTO_reads_singlet_HTO

cat('mean HTO reads, only in cell classified as singlet for specific HTO\n')
count_HTO_reads_singlet_HTO_mean

Idents(scrna.hashtag.subset) <- 'HTO_classification.global'
### Projecting singlet identities on TSNE visualization

dimpl_htocl <- DimPlot(scrna.hashtag.singlet, group.by = "HTO_classification",label=TRUE)
png(filename = paste(out_dir_sample, paste0(sampleid,"_HTO_classification_singlets.png"), sep = '/'),
      width = 12, height = 9, units = "in", res = 96)
dimpl_htocl
dev.off()

# dimpl_htoglobal <- DimPlot(scrna.hashtag.singlet, group.by = 'HTO_classification.global', label=TRUE)
# png(filename = paste(out_dir_sample, paste0(sampleid,"_HTO_classification_global_singlets.png"), sep = '/'),
#       width = 12, height = 9, units = "in", res = 96)
# print(dimpl_htoglobal)
# dev.off()

vnl_hto <- scrna.hashtag.singlet %>%
    VlnPlot(
      features = c( "nCount_HTO"),
      ncol    = 1,
      pt.size = 0.0005,
      log     = T
    )
png(filename = paste(out_dir_sample, paste0(sampleid,"_violin_hto_singlets.png"), sep = '/'),
      width = 12, height = 9, units = "in", res = 96)
print(vnl_hto)
dev.off()


## Save Singlets cell and HTO classification in meta data
DefaultAssay(scrna.hashtag) <- "RNA"
scrna.hashtag[['HTO']] <- NULL # eliminate HTO assay
saveRDS(object =  scrna.hashtag.singlet, file = paste(out_dir, paste(sampleid, "singlet_minimal.rds", sep = "_"), sep = "/"))