suppressPackageStartupMessages(library(Seurat))
options(Seurat.object.assay.version = "v5")
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(plyr))

wd <- "/beegfs/scratch/ric.squadrito/ric.squadrito/202312011215_VMSC10702-NotaroTAD_A019/Analysis_MM/python_scripts/circles"
setwd(wd)

args <- commandArgs(trailingOnly = TRUE)

sample <- args[1] # "M18", "M21", "M40", "M41"
#sample <- "M18"

out_dir <- paste0(wd, "/distance_table")
dir.create(out_dir, showWarnings = F, recursive = T)

#####################

# Define a function to calculate Euclidean distance
euclidean_dist <- function(x1, y1, x2, y2) {
  return(sqrt((x1 - x2)^2 + (y1 - y2)^2))
}

#####################

detected_transcripts <- read.csv(file=paste0(wd, "/detected_transcripts_", sample, ".csv"), header = T, numerals = c("no.loss"))

# "global_x" and "global_y" give the position in um, x and y in pixels: https://vizgen.github.io/vizgen-postprocessing/output_data_formats/detected_transcripts_format.html
df_molecules <- detected_transcripts[c("gene", "global_x", "global_y", "cell_id")]
colnames(df_molecules) <- c("gene", "x", "y", "cell_id")
rm(detected_transcripts)

# Remove transcripts without cell_id
nrow(df_molecules)
df_molecules <- df_molecules %>% filter(! df_molecules$gene %in% grep(x = df_molecules$gene, pattern = "Blank", value = T))
nrow(df_molecules)
df_molecules <- df_molecules[df_molecules$cell_id != "-1",] # It might be possible to include transcripts outside the cells but the results appear similar
nrow(df_molecules)

# M18 transcripts: 4263642 => 4242609 => 3254954
# M21 transcripts: 3508195 => 3490821 => 2430395
# M40 transcripts: 2017821 => 2001106 => 1449746
# M41 transcripts: 4510227 => 4494444 => 3030891

# Number of genes
n_genes <- 500

# Define the distance threshold. Default d=30um
d <- 30

# SelectedGenes contains the list of genes manually selected to identify specific cell types.
# It is possible to run the analysis with all the 500 genes but it takes much more time (5-10 days).
SelectedGenes <- unique(c("S100a6", "Klf4", "Anxa2", "Cdkn2a", "Vim",
                          # Antigen presentation
                          "Ciita", "H2-DMb1", "H2-Aa", "H2-Ab1", 
                          # Macrophages
                          "Clec4f", "C1qa", "Marco",
                          # T cell activation
                          "Trac", "Cd3e", "Klf2", "Sell", "Il2rb", "Stat1","Ccr5", "Il12rb1", "Cd40",
                          # IFN
                          "Irf1", "Isg15", "Ifit2", "Cxcl9","Il18r1",
                          # Protumoral
                          "Serpine1","Cebpb","Pdgfb","Lrg1","Id2", "Vegfa", "Il1r1", "Cd274", "Tgfb1")) # Cd81

df_molecules <- df_molecules[df_molecules$gene %in% SelectedGenes,]

# subset to have a smaller dataframe. This is used mostly for benchmarking and testing
#df_molecules <- df_molecules[df_molecules$gene %in% head(genes_ID, n_genes),]

# Get the unique gene ID
genes_ID <- unique(df_molecules$gene)

# Sort genes_ID
genes_ID <- sort(genes_ID, decreasing=F)

n_genes <- length(genes_ID) # 500

# Initialize an empty distance matrix
distance_matrix <- matrix(nrow = length(genes_ID), ncol = length(genes_ID), 0)
distance_matrix_norm <- matrix(nrow = length(genes_ID), ncol = length(genes_ID), 0)
colnames(distance_matrix) <- genes_ID
rownames(distance_matrix) <- genes_ID
colnames(distance_matrix_norm) <- genes_ID
rownames(distance_matrix_norm) <- genes_ID

# Loop through each gene type combination
for (i in 1:length(genes_ID)) { # 1:length(genes_ID) or 1:n_genes
  print(i)
  gene_A <- genes_ID[i]
  
  # Filter transcripts for gene A
  transcripts_A <- df_molecules[df_molecules$gene == gene_A,]
  for (j in 1:length(genes_ID)) { # (i + 1):length(genes_ID)) or 1:n_genes
    gene_B <- genes_ID[j]
    
    # Filter transcripts for gene B
    transcripts_B <- df_molecules[df_molecules$gene == gene_B,]
    
    # Initialize counters for close and not close transcripts
    close_count <- 0
    
    # Loop through transcripts in transcripts_A
    for (z in 1:nrow(transcripts_A)) {
      a_x <- transcripts_A$x[z]
      a_y <- transcripts_A$y[z]
      
      # filter to retain only transcripts_B that are not in the same cell as the transcripts_A analysed. This speed up the script.
      transcripts_B_temp <- transcripts_B[transcripts_B$cell_id != transcripts_A$cell_id[z],]
      # If you include also transcripts outside the cells add the following lines
      #if (transcripts_A$cell_id[z] != "-1") {
      #  transcripts_B_temp <- transcripts_B[transcripts_B$cell_id != transcripts_A$cell_id[z],]}
      #else{transcripts_B_temp <- transcripts_B}
      
      # filter to retain only transcripts_B that are close to the transcripts_A analysed. This speed up the script. It looks only in a square of l=1.1*1.1*2d.
      transcripts_B_temp <- transcripts_B_temp[transcripts_B_temp$x > a_x - 1.1*d & transcripts_B_temp$x < a_x + 1.1*d,]
      transcripts_B_temp <- transcripts_B_temp[transcripts_B_temp$y > a_y - 1.1*d & transcripts_B_temp$y < a_y + 1.1*d,]
      
      is_close <- FALSE  # Flag to track if a close transcript is found
      
      if (nrow(transcripts_B_temp)==0){next} # This transcript of A is not close to any transcript of B
      
      # Loop through transcripts in transcripts_B
      for (w in 1:nrow(transcripts_B_temp)) {
        b_x <- transcripts_B_temp$x[w]
        b_y <- transcripts_B_temp$y[w]
        
        # Calculate distance
        distance <- euclidean_dist(a_x, a_y, b_x, b_y)
        
        # Check if distance is less than threshold
        if (distance < d) {
          is_close <- TRUE
          break  # Exit inner loop if close transcript found
        }
      }

      # Update counters based on the flag
      if (is_close) {
        close_count <- close_count + 1
      }
    
    }
  # Fill the distance matrix. Note: The matrix is not perfectly symmetrical because more A molecules can be close to the same B molecule and viceversa
  distance_matrix[i, j] <- close_count
  # Normalize for the sum of transcripts A and B.
  # Note: I multiplied close_count * 2 because each transcript-transcript interaction involves two transcripts.
  distance_matrix_norm[i, j] <- round(close_count*2/((nrow(transcripts_A)+nrow(transcripts_B))),5)
  }
}

# Print the distance matrix
print(distance_matrix)
print(distance_matrix_norm)

# Save the matrix
write.csv(distance_matrix, paste0(out_dir, "/", sample, "_transcripts_distance_matrix_inCell_t", d, "_g", n_genes, "_selected.csv"), row.names=T)
write.csv(distance_matrix_norm, paste0(out_dir, "/", sample, "_transcripts_distance_matrix_norm_inCell_t", d, "_g", n_genes, "_selected.csv"), row.names=T)