scRNAseq code - 1st commit

5657b15d · Matteo Barcella · 013b769a · 5657b15d · 5657b15d · 5657b15d
Commit 5657b15d authored Feb 17, 2025 by Matteo Barcella
--- a/scRNAseq/00_upstream_wrapper_functions.bash
+++ b/scRNAseq/00_upstream_wrapper_functions.bash
+# run each individual library on cellranger
+module load cellranger/6.0.1
+ref=refdata-cellranger-hg19-3.0.0
+cellwrangler () {
+  echo "Started processing library ${library[$1]} on `date`"
+  cellranger count --id=${library[$1]} \
+                   --sample=${library[$1]} \
+                   --transcriptome=${ref} \
+                   --fastqs=${fastqdir}
+  echo "Finishing processing library ${library[$1]} on `date`"
+}
+# Demux donors in donor-multiplexed samples (libraries: SITTB3, SITTC3, SITTG3, SITTE10, SITTF10)
+# adjusts the barcode suffix "-1" to "-1-0" and "-1-1" respectively to match suffixes added by scanpy
+adjustbam () {
+  echo "Starting processing INPUT sample ${libs[$1]} on `date`"
+  let suffix=$1-1
+  singularity exec ${tooldir}souporcell.sif samtools view -h ${basepath}${libs[$1]}/outs/possorted_genome_bam.bam \
+                                                    | sed -r "s/(CB:Z:[ACTG]+\w+-1)/\1-${suffix}/" | \
+                                                    singularity exec ${tooldir}souporcell.sif \
+                                                    samtools view -@ ${SLURM_NTASKS} -bSh - > ${basepath}temp/${libs[$1]}_adjusted.bam
+  echo "Finished processing INPUT sample ${libs[$1]} on `date`"  
+}
+# How many donors/genotypes are present in the sample to demux
+ndonors=2
+# demultiplex donors using souporcell
+demux () {
+  echo "Started processing library $lib on `date`"
+  singularity exec ${tooldir}souporcell.sif souporcell_pipeline.py \
+    -i ${workdir}processed/temp/${libs[$1]}_adjusted.bam \
+    -b ${workdir}notebooks/output/${libs[$1]}_filtered_barcodes.txt \
+    -f ${basedir}references/10x/cellranger3x/refdata-cellranger-hg19-3.0.0/fasta/genome.fa \
+    -o ${workdir}processed/souporcell/${libs[$1]} \
+    -t ${SLURM_NTASKS} \
+    -k ${ndonors}
+   echo "Finished processing library $lib on `date`"	
+}
--- a/scRNAseq/06b_seurat_v4_BM_scr_plus_healthy_samples_SCTransformed_CCA_aligned.R
+++ b/scRNAseq/06b_seurat_v4_BM_scr_plus_healthy_samples_SCTransformed_CCA_aligned.R
+prefix <- format(as.Date(Sys.time()), "%Y%m%d")
+basename <- '_bthalcombo_v6_preGT_plus_6x_healthy_'
+basedir <- '.'
+# -----------------------------------------------------------------------------------
+library(Seurat)
+library(future)
+library(ggplot2)
+library(cowplot)
+suppressMessages(library(dplyr))
+plan(strategy = "multicore", workers = 10)
+options(future.globals.maxSize = +Inf) # (1024 * 32) * 1024^2 )
+# -------------------------------------------------------------------------------------                         
+data <- readRDS( paste0(basedir, '20230704_bthalcombo6_preGT_plus_6x_healthy_controls_counts_Seurat_obj.rds') )
+s.genes <- cc.genes$s.genes
+g2m.genes <- cc.genes$g2m.genes
+Sys.time()
+print("Preprocessing")
+data.list <- SplitObject(data, split.by = "donor")
+data.list <- lapply(X = data.list, FUN = SCTransform, vars.to.regress = c("nCount_RNA", "mitoc_fraction"), vst.flavor = "v2")
+data.list <- lapply(X = data.list, FUN = CellCycleScoring, s.features = s.genes, 
+                                                           g2m.features = g2m.genes, 
+                                                           assay = 'SCT' )
+data.list <- lapply(X = data.list, FUN = SCTransform, 
+                                    vars.to.regress = c("nCount_RNA", "mitoc_fraction", 'S.Score', 'G2M.Score'), vst.flavor = "v2")
+Sys.time()
+print("Selecting features")
+features <- SelectIntegrationFeatures(object.list = data.list, nfeatures = 3000)
+data.list <- PrepSCTIntegration(object.list = data.list, anchor.features = features)
+Sys.time()
+print("Selecting anchors")
+data.anchors <- FindIntegrationAnchors(object.list = data.list, 
+                                       normalization.method = "SCT",
+                                       anchor.features = features)
+saveRDS(file = paste0(basedir, prefix, basename, 'SCT_anchors.rds'), data.anchors )
+Sys.time()
+print("Integrating datasets...")
+data.combined.sct <- IntegrateData(anchorset = data.anchors, normalization.method = "SCT")
+Sys.time()
+saveRDS(file = paste0(basedir, prefix, basename, 'SCT_integrated.rds'), data.combined.sct )
+Sys.time()
--- a/scRNAseq/07h_R_annotate_bthal_v6_via_Azimuth.ipynb
+++ b/scRNAseq/07h_R_annotate_bthal_v6_via_Azimuth.ipynb
--- a/scRNAseq/07h_R_annotate_bthal_v6_via_Symphony.ipynb
+++ b/scRNAseq/07h_R_annotate_bthal_v6_via_Symphony.ipynb
--- a/scRNAseq/07i_PY_import_seurat_integration_azimuth_annotation_process_downstream.ipynb
+++ b/scRNAseq/07i_PY_import_seurat_integration_azimuth_annotation_process_downstream.ipynb
--- a/scRNAseq/07k4_DESeq2_pseudobulk_diffexpr_tests.ipynb
+++ b/scRNAseq/07k4_DESeq2_pseudobulk_diffexpr_tests.ipynb
--- a/scRNAseq/07m1c_PY_defining_trajectories_with_Cellrank.ipynb
+++ b/scRNAseq/07m1c_PY_defining_trajectories_with_Cellrank.ipynb
--- a/scRNAseq/07m2a_tradeSeq_ery_branch_plus_GAM.R
+++ b/scRNAseq/07m2a_tradeSeq_ery_branch_plus_GAM.R
+prefix <- format(as.Date(Sys.time()), "%Y%m%d")
+prefix
+basedir <- "."
+library(tradeSeq)
+library(RColorBrewer)
+library(SingleCellExperiment)
+ann <- anndata::read_h5ad('../h5ad/20231122_bthalcombo_v6c_cr2_no_B_palantir_pseudotime_kernel.h5ad')
+eryidx <- read.table(file='../notebooks/output/20231128_bthalcombo_v6c_eryplus_traj_cell_idx.txt', header = FALSE)
+eryidx <- as.vector(eryidx$V1)
+pseudotime <- ann[eryidx]$obs$palantir_pseudotime
+conv <- t( data.frame("bthal005" = "bthal", "bthal006" = "bthal", "healthy_CTRL" = "healthy", "bthal010" = "bthal",
+          "bthal009" = "bthal", "bthal007" = "bthal", "bthal008" = "bthal", "P185" = "healthy",
+          "P181" = "healthy", "P257" = "healthy", "paedBM1" = "healthy", "paedBM2" = "healthy") )
+condition <- conv[ match(ann[eryidx]$obs$donor, rownames(conv)) ]
+sce <- readRDS( paste0(basedir, 'h5ad/', '20230815_bthalcombo_v6b_counts_sf_SCE_obj.rds') )
+sce <- sce[,eryidx]
+cw <- rep(1,ncol(sce)) # cell weights
+rm(ann)
+set.seed(42)
+Sys.time()
+sceGAM <- fitGAM(counts = as.matrix( counts(sce) ),
+                 conditions = factor(condition),
+                 pseudotime=pseudotime,
+                 cellWeights=cw,
+                 nknots=5,
+                 verbose=T
+                 #parallel=TRUE, 
+                 #BPPARAM = BPPARAM
+                )
+Sys.time()
+saveRDS(file="../output/20231129_sceGAM_erytroid_plus_traj_conditions.rds", sceGAM)
--- a/scRNAseq/07m2b_tradeSeq_mono_branch_plus_GAM.R
+++ b/scRNAseq/07m2b_tradeSeq_mono_branch_plus_GAM.R
+prefix <- format(as.Date(Sys.time()), "%Y%m%d")
+prefix
+basedir <- '.'
+library(tradeSeq)
+library(RColorBrewer)
+library(SingleCellExperiment)
+ann <- anndata::read_h5ad('../h5ad/20231122_bthalcombo_v6c_cr2_no_B_palantir_pseudotime_kernel.h5ad')
+monoidx <- read.table(file='../notebooks/output/20231207_bthalcombo_v6c_monoplus_traj_cell_idx_CLEAN.txt', header = FALSE)
+monoidx <- as.vector(monoidx$V1)
+pseudotime <- ann[monoidx]$obs$palantir_pseudotime
+conv <- t( data.frame("bthal005" = "bthal", "bthal006" = "bthal", "healthy_CTRL" = "healthy", "bthal010" = "bthal",
+          "bthal009" = "bthal", "bthal007" = "bthal", "bthal008" = "bthal", "P185" = "healthy",
+          "P181" = "healthy", "P257" = "healthy", "paedBM1" = "healthy", "paedBM2" = "healthy") )
+condition <- conv[ match(ann[monoidx]$obs$donor, rownames(conv)) ]
+sce <- readRDS( paste0(basedir, 'h5ad/', '20230815_bthalcombo_v6b_counts_sf_SCE_obj.rds') )
+sce <- sce[,monoidx]
+cw <- rep(1,ncol(sce)) # cell weights
+rm(ann)
+set.seed(42)
+Sys.time()
+sceGAM <- fitGAM(counts = as.matrix( counts(sce) ),
+                 conditions = factor(condition),
+                 pseudotime=pseudotime,
+                 cellWeights=cw,
+                 nknots=6,
+                 verbose=T
+                 #parallel=TRUE, 
+                 #BPPARAM = BPPARAM
+                )
+Sys.time()
+saveRDS(file="../output/20231207_sceGAM_mono_plus_traj_conditions.rds", sceGAM)
--- a/scRNAseq/07m4_R_tradeSeq_export_smoothers_plots.ipynb
+++ b/scRNAseq/07m4_R_tradeSeq_export_smoothers_plots.ipynb
--- a/scRNAseq/07n_PY_defining_Subset1.ipynb
+++ b/scRNAseq/07n_PY_defining_Subset1.ipynb
--- a/scRNAseq/07o1_PY_figure_plots.ipynb
+++ b/scRNAseq/07o1_PY_figure_plots.ipynb
--- a/scRNAseq/07o2_assemble_TableS6_excel_file.ipynb
+++ b/scRNAseq/07o2_assemble_TableS6_excel_file.ipynb
--- a/scRNAseq/README_scRNAseq_10x.md
+++ b/scRNAseq/README_scRNAseq_10x.md
+The scRNAseq upstream analysis main steps comprised of cellranger count processing and souporcell (for demultiplexing the samples sequenced together).
+Standard QC and filtering steps followed, leading to a midstream aligment/integration of all samples using Seurat (v4). For that each library was SCtransformed, with count number, mitochondrial fraction, S score and G2M score regressed before integrating all samples via CCA. Downstream analyses included annotation transfer using both Azimuth and Symphony, and trajectory analysis using Cellrank (v2) and tradeSeq.
+- 00_* : helper wrappers for demultiplexing, alignment and quantification of samples
+- 06b_* : script to align all samples using Seurat (v4)
+- 07[h|i]* : notebooks for annotation of the integrated/aligned dataset
+- 07k4* : notebook for pseudobulk differential expression testing with DESeq2
+- 07m* : notebooks and scripts for defining and analysing trajectiories using Cellrank (v2) and tradeSeq
+- 07n* : notebook defining Subset1-like cells
+- 07o* : notebooks to produce specific figures and table for the manuscript