suppressPackageStartupMessages(library(Seurat)) options(Seurat.object.assay.version = "v5") suppressPackageStartupMessages(library(ggplot2)) suppressPackageStartupMessages(library(openxlsx)) suppressPackageStartupMessages(library(dplyr)) suppressPackageStartupMessages(library(plyr)) wd <- "/beegfs/scratch/ric.squadrito/ric.squadrito/202312011215_VMSC10702-NotaroTAD_A019/Analysis_MM/python_scripts/circles" setwd(wd) args <- commandArgs(trailingOnly = TRUE) sample <- args[1] # "M18", "M21", "M40", "M41" #sample <- "M18" out_dir <- paste0(wd, "/distance_table") dir.create(out_dir, showWarnings = F, recursive = T) ##################### # Define a function to calculate Euclidean distance euclidean_dist <- function(x1, y1, x2, y2) { return(sqrt((x1 - x2)^2 + (y1 - y2)^2)) } ##################### detected_transcripts <- read.csv(file=paste0(wd, "/detected_transcripts_", sample, ".csv"), header = T, numerals = c("no.loss")) # "global_x" and "global_y" give the position in um, x and y in pixels: https://vizgen.github.io/vizgen-postprocessing/output_data_formats/detected_transcripts_format.html df_molecules <- detected_transcripts[c("gene", "global_x", "global_y", "cell_id")] colnames(df_molecules) <- c("gene", "x", "y", "cell_id") rm(detected_transcripts) # Remove transcripts without cell_id nrow(df_molecules) df_molecules <- df_molecules %>% filter(! df_molecules$gene %in% grep(x = df_molecules$gene, pattern = "Blank", value = T)) nrow(df_molecules) df_molecules <- df_molecules[df_molecules$cell_id != "-1",] # It might be possible to include transcripts outside the cells but the results appear similar nrow(df_molecules) # M18 transcripts: 4263642 => 4242609 => 3254954 # M21 transcripts: 3508195 => 3490821 => 2430395 # M40 transcripts: 2017821 => 2001106 => 1449746 # M41 transcripts: 4510227 => 4494444 => 3030891 # Number of genes n_genes <- 500 # Define the distance threshold. Default d=30um d <- 30 # SelectedGenes contains the list of genes manually selected to identify specific cell types. # It is possible to run the analysis with all the 500 genes but it takes much more time (5-10 days). SelectedGenes <- unique(c("S100a6", "Klf4", "Anxa2", "Cdkn2a", "Vim", # Antigen presentation "Ciita", "H2-DMb1", "H2-Aa", "H2-Ab1", # Macrophages "Clec4f", "C1qa", "Marco", # T cell activation "Trac", "Cd3e", "Klf2", "Sell", "Il2rb", "Stat1","Ccr5", "Il12rb1", "Cd40", # IFN "Irf1", "Isg15", "Ifit2", "Cxcl9","Il18r1", # Protumoral "Serpine1","Cebpb","Pdgfb","Lrg1","Id2", "Vegfa", "Il1r1", "Cd274", "Tgfb1")) # Cd81 df_molecules <- df_molecules[df_molecules$gene %in% SelectedGenes,] # subset to have a smaller dataframe. This is used mostly for benchmarking and testing #df_molecules <- df_molecules[df_molecules$gene %in% head(genes_ID, n_genes),] # Get the unique gene ID genes_ID <- unique(df_molecules$gene) # Sort genes_ID genes_ID <- sort(genes_ID, decreasing=F) n_genes <- length(genes_ID) # 500 # Initialize an empty distance matrix distance_matrix <- matrix(nrow = length(genes_ID), ncol = length(genes_ID), 0) distance_matrix_norm <- matrix(nrow = length(genes_ID), ncol = length(genes_ID), 0) colnames(distance_matrix) <- genes_ID rownames(distance_matrix) <- genes_ID colnames(distance_matrix_norm) <- genes_ID rownames(distance_matrix_norm) <- genes_ID # Loop through each gene type combination for (i in 1:length(genes_ID)) { # 1:length(genes_ID) or 1:n_genes print(i) gene_A <- genes_ID[i] # Filter transcripts for gene A transcripts_A <- df_molecules[df_molecules$gene == gene_A,] for (j in 1:length(genes_ID)) { # (i + 1):length(genes_ID)) or 1:n_genes gene_B <- genes_ID[j] # Filter transcripts for gene B transcripts_B <- df_molecules[df_molecules$gene == gene_B,] # Initialize counters for close and not close transcripts close_count <- 0 # Loop through transcripts in transcripts_A for (z in 1:nrow(transcripts_A)) { a_x <- transcripts_A$x[z] a_y <- transcripts_A$y[z] # filter to retain only transcripts_B that are not in the same cell as the transcripts_A analysed. This speed up the script. transcripts_B_temp <- transcripts_B[transcripts_B$cell_id != transcripts_A$cell_id[z],] # If you include also transcripts outside the cells add the following lines #if (transcripts_A$cell_id[z] != "-1") { # transcripts_B_temp <- transcripts_B[transcripts_B$cell_id != transcripts_A$cell_id[z],]} #else{transcripts_B_temp <- transcripts_B} # filter to retain only transcripts_B that are close to the transcripts_A analysed. This speed up the script. It looks only in a square of l=1.1*1.1*2d. transcripts_B_temp <- transcripts_B_temp[transcripts_B_temp$x > a_x - 1.1*d & transcripts_B_temp$x < a_x + 1.1*d,] transcripts_B_temp <- transcripts_B_temp[transcripts_B_temp$y > a_y - 1.1*d & transcripts_B_temp$y < a_y + 1.1*d,] is_close <- FALSE # Flag to track if a close transcript is found if (nrow(transcripts_B_temp)==0){next} # This transcript of A is not close to any transcript of B # Loop through transcripts in transcripts_B for (w in 1:nrow(transcripts_B_temp)) { b_x <- transcripts_B_temp$x[w] b_y <- transcripts_B_temp$y[w] # Calculate distance distance <- euclidean_dist(a_x, a_y, b_x, b_y) # Check if distance is less than threshold if (distance < d) { is_close <- TRUE break # Exit inner loop if close transcript found } } # Update counters based on the flag if (is_close) { close_count <- close_count + 1 } } # Fill the distance matrix. Note: The matrix is not perfectly symmetrical because more A molecules can be close to the same B molecule and viceversa distance_matrix[i, j] <- close_count # Normalize for the sum of transcripts A and B. # Note: I multiplied close_count * 2 because each transcript-transcript interaction involves two transcripts. distance_matrix_norm[i, j] <- round(close_count*2/((nrow(transcripts_A)+nrow(transcripts_B))),5) } } # Print the distance matrix print(distance_matrix) print(distance_matrix_norm) # Save the matrix write.csv(distance_matrix, paste0(out_dir, "/", sample, "_transcripts_distance_matrix_inCell_t", d, "_g", n_genes, "_selected.csv"), row.names=T) write.csv(distance_matrix_norm, paste0(out_dir, "/", sample, "_transcripts_distance_matrix_norm_inCell_t", d, "_g", n_genes, "_selected.csv"), row.names=T)