scRNAseq code - 1st commit

5657b15d · Matteo Barcella · 013b769a · 5657b15d · 5657b15d · 5657b15d
Commit 5657b15d authored Feb 17, 2025 by Matteo Barcella
--- a/scRNAseq/00_upstream_wrapper_functions.bash
+++ b/scRNAseq/00_upstream_wrapper_functions.bash
+# run each individual library on cellranger
+module load cellranger/6.0.1
+ref=refdata-cellranger-hg19-3.0.0
+cellwrangler () {
+  echo "Started processing library ${library[$1]} on `date`"
+  cellranger count --id=${library[$1]} \
+                   --sample=${library[$1]} \
+                   --transcriptome=${ref} \
+                   --fastqs=${fastqdir}
+  echo "Finishing processing library ${library[$1]} on `date`"
+}
+# Demux donors in donor-multiplexed samples (libraries: SITTB3, SITTC3, SITTG3, SITTE10, SITTF10).
+#
+# adjustbam IDX
+#   Rewrites the cell-barcode (CB:Z:) tags of library `libs[IDX]`: every barcode
+#   ending in "-1" gets "-<IDX-1>" appended (IDX=1 -> "-1-0", IDX=2 -> "-1-1")
+#   to match the suffixes added by scanpy. The adjusted BAM is written to
+#   ${basepath}temp/<library>_adjusted.bam.
+#   Expects `libs`, `tooldir`, `basepath` and SLURM_NTASKS to be set by the caller.
+adjustbam () {
+  echo "Starting processing INPUT sample ${libs[$1]} on $(date)"
+  # Arithmetic expansion instead of `let`: `let` returns a non-zero status
+  # whenever the result is 0 (i.e. for IDX=1), which aborts under `set -e`.
+  suffix=$(( $1 - 1 ))
+  # BAM -> SAM text (with header) -> sed rewrites the barcode tag -> back to BAM.
+  singularity exec "${tooldir}souporcell.sif" samtools view -h "${basepath}${libs[$1]}/outs/possorted_genome_bam.bam" \
+    | sed -r "s/(CB:Z:[ACTG]+\w+-1)/\1-${suffix}/" \
+    | singularity exec "${tooldir}souporcell.sif" \
+        samtools view -@ "${SLURM_NTASKS}" -bSh - > "${basepath}temp/${libs[$1]}_adjusted.bam"
+  echo "Finished processing INPUT sample ${libs[$1]} on $(date)"
+}
+# How many donors/genotypes are present in the sample to demux
+ndonors=2
+# demultiplex donors using souporcell
+demux () {
+  echo "Started processing library $lib on `date`"
+  singularity exec ${tooldir}souporcell.sif souporcell_pipeline.py \
+    -i ${workdir}processed/temp/${libs[$1]}_adjusted.bam \
+    -b ${workdir}notebooks/output/${libs[$1]}_filtered_barcodes.txt \
+    -f ${basedir}references/10x/cellranger3x/refdata-cellranger-hg19-3.0.0/fasta/genome.fa \
+    -o ${workdir}processed/souporcell/${libs[$1]} \
+    -t ${SLURM_NTASKS} \
+    -k ${ndonors}
+   echo "Finished processing library $lib on `date`"	
+}
--- a/scRNAseq/06b_seurat_v4_BM_scr_plus_healthy_samples_SCTransformed_CCA_aligned.R
+++ b/scRNAseq/06b_seurat_v4_BM_scr_plus_healthy_samples_SCTransformed_CCA_aligned.R
+# Integrate all donors with Seurat: per-donor SCTransform (cell-cycle scores
+# regressed in a second pass), anchor finding and data integration.
+# The anchor set and the integrated object are saved as date-stamped RDS files.
+
+# Date stamp (YYYYMMDD) used as output-file prefix. Formatting Sys.time()
+# directly uses the local time zone; as.Date() on a POSIXct converts in UTC by
+# default and can therefore report the wrong day around midnight.
+prefix <- format(Sys.time(), "%Y%m%d")
+basename <- "_bthalcombo_v6_preGT_plus_6x_healthy_"
+basedir <- "."
+# -----------------------------------------------------------------------------------
+library(Seurat)
+library(future)
+library(ggplot2)
+library(cowplot)
+suppressMessages(library(dplyr))
+plan(strategy = "multicore", workers = 10)
+# No size limit on globals exported to workers
+# (a finite alternative: (1024 * 32) * 1024^2 bytes, i.e. 32 GiB).
+options(future.globals.maxSize = Inf)
+# -------------------------------------------------------------------------------------
+# file.path() inserts the separator; paste0(basedir, file) with basedir "."
+# would produce the dot-prefixed name ".2023...rds" instead of "./2023...rds".
+data <- readRDS(
+  file.path(basedir, "20230704_bthalcombo6_preGT_plus_6x_healthy_controls_counts_Seurat_obj.rds")
+)
+s.genes <- cc.genes$s.genes
+g2m.genes <- cc.genes$g2m.genes
+Sys.time()
+print("Preprocessing")
+data.list <- SplitObject(data, split.by = "donor")
+# First pass: normalise so that cell-cycle scores can be computed on the SCT assay.
+data.list <- lapply(
+  X = data.list, FUN = SCTransform,
+  vars.to.regress = c("nCount_RNA", "mitoc_fraction"), vst.flavor = "v2"
+)
+data.list <- lapply(
+  X = data.list, FUN = CellCycleScoring,
+  s.features = s.genes, g2m.features = g2m.genes, assay = "SCT"
+)
+# Second pass: additionally regress the S and G2M scores just computed.
+data.list <- lapply(
+  X = data.list, FUN = SCTransform,
+  vars.to.regress = c("nCount_RNA", "mitoc_fraction", "S.Score", "G2M.Score"),
+  vst.flavor = "v2"
+)
+Sys.time()
+print("Selecting features")
+features <- SelectIntegrationFeatures(object.list = data.list, nfeatures = 3000)
+data.list <- PrepSCTIntegration(object.list = data.list, anchor.features = features)
+Sys.time()
+print("Selecting anchors")
+data.anchors <- FindIntegrationAnchors(
+  object.list = data.list,
+  normalization.method = "SCT",
+  anchor.features = features
+)
+saveRDS(
+  data.anchors,
+  file = file.path(basedir, paste0(prefix, basename, "SCT_anchors.rds"))
+)
+Sys.time()
+print("Integrating datasets...")
+data.combined.sct <- IntegrateData(anchorset = data.anchors, normalization.method = "SCT")
+Sys.time()
+saveRDS(
+  data.combined.sct,
+  file = file.path(basedir, paste0(prefix, basename, "SCT_integrated.rds"))
+)
+Sys.time()
--- a/scRNAseq/07h_R_annotate_bthal_v6_via_Azimuth.ipynb
+++ b/scRNAseq/07h_R_annotate_bthal_v6_via_Azimuth.ipynb
--- a/scRNAseq/07h_R_annotate_bthal_v6_via_Symphony.ipynb
+++ b/scRNAseq/07h_R_annotate_bthal_v6_via_Symphony.ipynb
--- a/scRNAseq/07i_PY_import_seurat_integration_azimuth_annotation_process_downstream.ipynb
+++ b/scRNAseq/07i_PY_import_seurat_integration_azimuth_annotation_process_downstream.ipynb
--- a/scRNAseq/07k4_DESeq2_pseudobulk_diffexpr_tests.ipynb
+++ b/scRNAseq/07k4_DESeq2_pseudobulk_diffexpr_tests.ipynb
--- a/scRNAseq/07m1c_PY_defining_trajectories_with_Cellrank.ipynb
+++ b/scRNAseq/07m1c_PY_defining_trajectories_with_Cellrank.ipynb
--- a/scRNAseq/07m2a_tradeSeq_ery_branch_plus_GAM.R
+++ b/scRNAseq/07m2a_tradeSeq_ery_branch_plus_GAM.R
+# Fit tradeSeq GAMs along the erythroid trajectory, with bthal vs healthy as
+# conditions, and save the fitted object as RDS.
+
+# Date stamp (YYYYMMDD), printed for the log. Formatting Sys.time() directly
+# uses the local time zone; as.Date() on a POSIXct converts in UTC by default.
+prefix <- format(Sys.time(), "%Y%m%d")
+prefix
+basedir <- "."
+library(tradeSeq)
+library(RColorBrewer)
+library(SingleCellExperiment)
+ann <- anndata::read_h5ad("../h5ad/20231122_bthalcombo_v6c_cr2_no_B_palantir_pseudotime_kernel.h5ad")
+# Cells belonging to the trajectory (one entry per line, no header).
+eryidx <- read.table(
+  file = "../notebooks/output/20231128_bthalcombo_v6c_eryplus_traj_cell_idx.txt",
+  header = FALSE
+)
+eryidx <- as.vector(eryidx$V1)
+# Per-cell metadata of the selected cells, extracted once.
+obs <- ann[eryidx]$obs
+pseudotime <- obs$palantir_pseudotime
+# Donor -> condition lookup.
+conv <- c(
+  bthal005 = "bthal", bthal006 = "bthal", healthy_CTRL = "healthy", bthal010 = "bthal",
+  bthal009 = "bthal", bthal007 = "bthal", bthal008 = "bthal", P185 = "healthy",
+  P181 = "healthy", P257 = "healthy", paedBM1 = "healthy", paedBM2 = "healthy"
+)
+condition <- unname(conv[match(obs$donor, names(conv))])
+# A donor missing from the lookup would otherwise become a silent NA condition.
+if (anyNA(condition)) {
+  stop("Some cells belong to donors without a condition in `conv`.", call. = FALSE)
+}
+# file.path() inserts the separator; paste0(basedir, 'h5ad/', ...) with
+# basedir "." would produce ".h5ad/..." instead of "./h5ad/...".
+# NOTE(review): the other inputs are read from "../h5ad/" - confirm basedir.
+sce <- readRDS(file.path(basedir, "h5ad", "20230815_bthalcombo_v6b_counts_sf_SCE_obj.rds"))
+sce <- sce[, eryidx]
+cw <- rep(1, ncol(sce)) # cell weights: every cell is given weight 1
+rm(ann, obs)
+set.seed(42)
+Sys.time()
+sceGAM <- fitGAM(
+  counts = as.matrix(counts(sce)),
+  conditions = factor(condition),
+  pseudotime = pseudotime,
+  cellWeights = cw,
+  nknots = 5,
+  verbose = TRUE
+  # parallel = TRUE,
+  # BPPARAM = BPPARAM
+)
+Sys.time()
+saveRDS(sceGAM, file = "../output/20231129_sceGAM_erytroid_plus_traj_conditions.rds")
--- a/scRNAseq/07m2b_tradeSeq_mono_branch_plus_GAM.R
+++ b/scRNAseq/07m2b_tradeSeq_mono_branch_plus_GAM.R
+# Fit tradeSeq GAMs along the monocyte trajectory, with bthal vs healthy as
+# conditions, and save the fitted object as RDS.
+
+# Date stamp (YYYYMMDD), printed for the log. Formatting Sys.time() directly
+# uses the local time zone; as.Date() on a POSIXct converts in UTC by default.
+prefix <- format(Sys.time(), "%Y%m%d")
+prefix
+basedir <- "."
+library(tradeSeq)
+library(RColorBrewer)
+library(SingleCellExperiment)
+ann <- anndata::read_h5ad("../h5ad/20231122_bthalcombo_v6c_cr2_no_B_palantir_pseudotime_kernel.h5ad")
+# Cells belonging to the trajectory (one entry per line, no header).
+monoidx <- read.table(
+  file = "../notebooks/output/20231207_bthalcombo_v6c_monoplus_traj_cell_idx_CLEAN.txt",
+  header = FALSE
+)
+monoidx <- as.vector(monoidx$V1)
+# Per-cell metadata of the selected cells, extracted once.
+obs <- ann[monoidx]$obs
+pseudotime <- obs$palantir_pseudotime
+# Donor -> condition lookup.
+conv <- c(
+  bthal005 = "bthal", bthal006 = "bthal", healthy_CTRL = "healthy", bthal010 = "bthal",
+  bthal009 = "bthal", bthal007 = "bthal", bthal008 = "bthal", P185 = "healthy",
+  P181 = "healthy", P257 = "healthy", paedBM1 = "healthy", paedBM2 = "healthy"
+)
+condition <- unname(conv[match(obs$donor, names(conv))])
+# A donor missing from the lookup would otherwise become a silent NA condition.
+if (anyNA(condition)) {
+  stop("Some cells belong to donors without a condition in `conv`.", call. = FALSE)
+}
+# file.path() inserts the separator; paste0(basedir, 'h5ad/', ...) with
+# basedir "." would produce ".h5ad/..." instead of "./h5ad/...".
+# NOTE(review): the other inputs are read from "../h5ad/" - confirm basedir.
+sce <- readRDS(file.path(basedir, "h5ad", "20230815_bthalcombo_v6b_counts_sf_SCE_obj.rds"))
+sce <- sce[, monoidx]
+cw <- rep(1, ncol(sce)) # cell weights: every cell is given weight 1
+rm(ann, obs)
+set.seed(42)
+Sys.time()
+sceGAM <- fitGAM(
+  counts = as.matrix(counts(sce)),
+  conditions = factor(condition),
+  pseudotime = pseudotime,
+  cellWeights = cw,
+  nknots = 6,
+  verbose = TRUE
+  # parallel = TRUE,
+  # BPPARAM = BPPARAM
+)
+Sys.time()
+saveRDS(sceGAM, file = "../output/20231207_sceGAM_mono_plus_traj_conditions.rds")
--- a/scRNAseq/07m4_R_tradeSeq_export_smoothers_plots.ipynb
+++ b/scRNAseq/07m4_R_tradeSeq_export_smoothers_plots.ipynb
--- a/scRNAseq/07n_PY_defining_Subset1.ipynb
+++ b/scRNAseq/07n_PY_defining_Subset1.ipynb
--- a/scRNAseq/07o1_PY_figure_plots.ipynb
+++ b/scRNAseq/07o1_PY_figure_plots.ipynb
--- a/scRNAseq/07o2_assemble_TableS6_excel_file.ipynb
+++ b/scRNAseq/07o2_assemble_TableS6_excel_file.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a223e20f-5403-4aa0-935e-214df220ad93",
+   "metadata": {},
+   "source": [
+    "# bthal dataset v6b - collate and export diff expr results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "02b5201c-bc31-4361-a57f-f58d95adae62",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ipywidgets: 8.1.2\n",
+      "matplotlib: 3.7.4\n",
+      "scanpy    : 1.9.8\n",
+      "seaborn   : 0.13.0\n",
+      "sys       : 3.8.10 (default, Nov 22 2023, 10:22:35) \n",
+      "[GCC 9.4.0]\n",
+      "pandas    : 2.0.3\n",
+      "numpy     : 1.24.4\n",
+      "json      : 2.0.9\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%matplotlib widget\n",
+    "%load_ext watermark\n",
+    "\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "import os, sys, json, operator, getpass\n",
+    "from pathlib import Path\n",
+    "from datetime import datetime\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import scanpy as sc\n",
+    "\n",
+    "import matplotlib\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "import ipywidgets as widgets\n",
+    "\n",
+    "\n",
+    "sc.settings.verbosity = 3             # show some output\n",
+    "sc.settings.file_format_figs = 'svg'  # set this to 'svg' (notebook) or 'pdf' (files) if you want vector graphics\n",
+    "sc.settings.savefigs = False\n",
+    "\n",
+    "# plt.rcParams['font.family'] = 'sans-serif'\n",
+    "# plt.rcParams['font.sans-serif'] = 'Arial'\n",
+    "# plt.rc('font', size=14)\n",
+    "\n",
+    "home = str(Path.home())\n",
+    "user = getpass.getuser()\n",
+    "\n",
+    "%watermark --iversions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2389390c-ab9a-44da-84a4-df73cce20e25",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "4c834193-f1a4-483c-a1bc-7bdfc92dd0b6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'proteus.45g'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "with open('/.singularity.d/labels.json') as fh:\n",
+    "    singularity = json.load(fh)\n",
+    "    \n",
+    "singularity['Version']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "7696e6b1-47d9-4d33-95c5-f142508734bb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "20240412\n"
+     ]
+    }
+   ],
+   "source": [
+    "now = datetime.now()\n",
+    "prefix = now.strftime('%Y%m%d')\n",
+    "print(prefix)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "78143475-18b4-41df-a0ed-8c466f8c5573",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8efd3dd9-2512-447c-aad5-527eec19241a",
+   "metadata": {},
+   "source": [
+    "### collect dex file list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "c9a30e75-e0db-4579-84f9-a8fe2fe55f86",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = sorted(os.listdir(os.path.join(basedir, 'dex')))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a37bcfa0-cdca-4a2b-8bee-26b7ea98b48a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "4ec3ee28-32ce-482f-bd65-e15740470274",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "a6e7db93-7da6-49b1-a3c5-e4957251572a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "patt = re.compile('^\\d{8}\\_bthalcombo\\_v6b\\_(.+)\\_bthal\\_vs\\_healthy\\_sig\\_genes\\_DESeq2\\_pb.tsv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fed11c9d-3973-4170-87c1-2450a6b54fc9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "3a0ba3b0-6e7c-4e7d-a13e-9d6881e82fe7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "repl = {'C10': 'Mende', 'C45': 'Zeng', 'strict': 'pheno'}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e2b998b-3c42-4643-957e-5eb1c64b0b5c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6defa3b-0571-477a-8581-fae088b721ee",
+   "metadata": {},
+   "source": [
+    "### parse dex files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "88bda9da-e678-467c-89a4-b06a92464f9c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mende_cluster_02-MEP2\n",
+      "Mende_cluster_04-HSC-MPP1\n",
+      "Mende_cluster_05-HSC-MPP3\n",
+      "Mende_cluster_08-EryP\n",
+      "Mende_cluster_10-MPP-to-MEP\n",
+      "Mende_cluster_12-MEP1\n",
+      "Mende_lineage_HSC-MPP\n",
+      "Mende_lineage_MDP\n",
+      "Mende_lineage_early-MEMBP\n",
+      "Mende_lineage_late-MEMBP\n",
+      "Mende_lineage_late-MyP\n",
+      "Mende_lineage_primed-MPP\n",
+      "Zeng_cluster_BFU-E\n",
+      "Zeng_cluster_CFU-E\n",
+      "Zeng_cluster_GMP-Mono\n",
+      "Zeng_lineage_Erytroid\n",
+      "Zeng_lineage_HSC\n",
+      "Zeng_lineage_MDP\n",
+      "Zeng_lineage_MEP\n",
+      "Zeng_lineage_MPP-MkEry\n",
+      "Zeng_lineage_MPP-MyLy\n",
+      "pheno_HSCMPP_population\n",
+      "22\n",
+      "22\n"
+     ]
+    }
+   ],
+   "source": [
+    "dex = {}\n",
+    "counter = 0\n",
+    "\n",
+    "for f in files:\n",
+    "    m = patt.search(f)\n",
+    "    if m:\n",
+    "        temp = m.group(1)\n",
+    "        temp = temp.split('_')\n",
+    "        temp = [ repl[temp[0]] ] + temp[1:]\n",
+    "        comp = '_'.join(temp)\n",
+    "        \n",
+    "        dex[comp] = pd.read_csv('../dex/'+f, sep='\\t')\n",
+    "        counter += 1\n",
+    "        print(comp)\n",
+    "        \n",
+    "print(len(files))\n",
+    "print(counter)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52bfc29d-b5e6-4304-ab41-bd4344d82339",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bdf67b7f-f74e-4d13-8ad7-63d482aaa2f7",
+   "metadata": {},
+   "source": [
+    "### make summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "a165f92a-7635-4b6d-8fbe-8ead957f8128",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summary = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "58a10438-2e17-47d9-8767-7fdaad012c51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for k in dex.keys():\n",
+    "    summary.append([k, dex[k].shape[0], str(sum( dex[k].log2FoldChange > 0 )), str(sum( dex[k].log2FoldChange < 0 ))])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7c8e5aa-6b54-461e-8238-b27ad8eaa4ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "418fca10-d67b-43a2-ade9-d479ed88cc4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summary = pd.DataFrame(summary, columns=['Subset', 'Total', 'Upregulated', 'Downregulated'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "8bc43c6b-5021-4db7-bf47-aa21c58d4e69",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Subset</th>\n",
+       "      <th>Total</th>\n",
+       "      <th>Upregulated</th>\n",
+       "      <th>Downregulated</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Mende_cluster_02-MEP2</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Mende_cluster_04-HSC-MPP1</td>\n",
+       "      <td>103</td>\n",
+       "      <td>25</td>\n",
+       "      <td>78</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Mende_cluster_05-HSC-MPP3</td>\n",
+       "      <td>295</td>\n",
+       "      <td>122</td>\n",
+       "      <td>173</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Mende_cluster_08-EryP</td>\n",
+       "      <td>14</td>\n",
+       "      <td>3</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Mende_cluster_10-MPP-to-MEP</td>\n",
+       "      <td>52</td>\n",
+       "      <td>7</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Mende_cluster_12-MEP1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Mende_lineage_HSC-MPP</td>\n",
+       "      <td>356</td>\n",
+       "      <td>153</td>\n",
+       "      <td>203</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Mende_lineage_MDP</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Mende_lineage_early-MEMBP</td>\n",
+       "      <td>17</td>\n",
+       "      <td>2</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Mende_lineage_late-MEMBP</td>\n",
+       "      <td>22</td>\n",
+       "      <td>3</td>\n",
+       "      <td>19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Mende_lineage_late-MyP</td>\n",
+       "      <td>76</td>\n",
+       "      <td>10</td>\n",
+       "      <td>66</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Mende_lineage_primed-MPP</td>\n",
+       "      <td>52</td>\n",
+       "      <td>7</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Zeng_cluster_BFU-E</td>\n",
+       "      <td>9</td>\n",
+       "      <td>1</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Zeng_cluster_CFU-E</td>\n",
+       "      <td>6</td>\n",
+       "      <td>2</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Zeng_cluster_GMP-Mono</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>Zeng_lineage_Erytroid</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1</td>\n",
+       "      <td>20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>Zeng_lineage_HSC</td>\n",
+       "      <td>272</td>\n",
+       "      <td>115</td>\n",
+       "      <td>157</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>Zeng_lineage_MDP</td>\n",
+       "      <td>6</td>\n",
+       "      <td>0</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>Zeng_lineage_MEP</td>\n",
+       "      <td>33</td>\n",
+       "      <td>5</td>\n",
+       "      <td>28</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>Zeng_lineage_MPP-MkEry</td>\n",
+       "      <td>104</td>\n",
+       "      <td>26</td>\n",
+       "      <td>78</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>Zeng_lineage_MPP-MyLy</td>\n",
+       "      <td>136</td>\n",
+       "      <td>35</td>\n",
+       "      <td>101</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>pheno_HSCMPP_population</td>\n",
+       "      <td>3037</td>\n",
+       "      <td>1465</td>\n",
+       "      <td>1572</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         Subset  Total Upregulated Downregulated\n",
+       "0         Mende_cluster_02-MEP2     15           1            14\n",
+       "1     Mende_cluster_04-HSC-MPP1    103          25            78\n",
+       "2     Mende_cluster_05-HSC-MPP3    295         122           173\n",
+       "3         Mende_cluster_08-EryP     14           3            11\n",
+       "4   Mende_cluster_10-MPP-to-MEP     52           7            45\n",
+       "5         Mende_cluster_12-MEP1      5           0             5\n",
+       "6         Mende_lineage_HSC-MPP    356         153           203\n",
+       "7             Mende_lineage_MDP      0           0             0\n",
+       "8     Mende_lineage_early-MEMBP     17           2            15\n",
+       "9      Mende_lineage_late-MEMBP     22           3            19\n",
+       "10       Mende_lineage_late-MyP     76          10            66\n",
+       "11     Mende_lineage_primed-MPP     52           7            45\n",
+       "12           Zeng_cluster_BFU-E      9           1             8\n",
+       "13           Zeng_cluster_CFU-E      6           2             4\n",
+       "14        Zeng_cluster_GMP-Mono      4           0             4\n",
+       "15        Zeng_lineage_Erytroid     21           1            20\n",
+       "16             Zeng_lineage_HSC    272         115           157\n",
+       "17             Zeng_lineage_MDP      6           0             6\n",
+       "18             Zeng_lineage_MEP     33           5            28\n",
+       "19       Zeng_lineage_MPP-MkEry    104          26            78\n",
+       "20        Zeng_lineage_MPP-MyLy    136          35           101\n",
+       "21      pheno_HSCMPP_population   3037        1465          1572"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f964c8e5-1b21-438e-816a-a131b920cf98",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "3f5b8eb3-1354-415c-9158-39d89da675da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# summary.to_excel('../output/20240412_DESEq2_pseudobulk_bthal_vs_healthy_comparisons.xlsx', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c07f0b6-5c92-46dc-8da2-6f912f78ef47",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "99902890-81d5-4539-baeb-1a4729bd9652",
+   "metadata": {},
+   "source": [
+    "### write out dex results (1 subset per tab)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "603f4143-dab0-4eb9-b226-b1ee6abe2925",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['Mende_cluster_02-MEP2', 'Mende_cluster_04-HSC-MPP1', 'Mende_cluster_05-HSC-MPP3', 'Mende_cluster_08-EryP', 'Mende_cluster_10-MPP-to-MEP', 'Mende_cluster_12-MEP1', 'Mende_lineage_HSC-MPP', 'Mende_lineage_MDP', 'Mende_lineage_early-MEMBP', 'Mende_lineage_late-MEMBP', 'Mende_lineage_late-MyP', 'Mende_lineage_primed-MPP', 'Zeng_cluster_BFU-E', 'Zeng_cluster_CFU-E', 'Zeng_cluster_GMP-Mono', 'Zeng_lineage_Erytroid', 'Zeng_lineage_HSC', 'Zeng_lineage_MDP', 'Zeng_lineage_MEP', 'Zeng_lineage_MPP-MkEry', 'Zeng_lineage_MPP-MyLy', 'pheno_HSCMPP_population'])"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dex.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3a53b3d-21e0-4d34-9b8c-1224fad79e4e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e8074ee-679d-447a-bc5b-030a2a43a7da",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "151f34e9-7fd6-4185-a933-b48db352093b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# https://stackoverflow.com/questions/21981820/creating-multiple-excel-worksheets-using-data-from-a-pandas-dataframe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "fc99c873-f8b1-469d-a712-ed6a168d0bd7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with pd.ExcelWriter('../output/20240412_DESEq2_pseudobulk_bthal_vs_healthy_comparisons.xlsx', engine='openpyxl') as writer:\n",
+    "    \n",
+    "    summary.to_excel(writer, sheet_name='Summary', index=False)\n",
+    "    \n",
+    "    for k in dex.keys():\n",
+    "        dex[k].to_excel(writer, sheet_name=k, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "918ab08c-d25e-4d2d-950a-aa23523200be",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "c6b16cee-95b9-45e3-bfd6-8c816ac08401",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>gene</th>\n",
+       "      <th>baseMean</th>\n",
+       "      <th>log2FoldChange</th>\n",
+       "      <th>lfcSE</th>\n",
+       "      <th>pvalue</th>\n",
+       "      <th>padj</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>IFI6</td>\n",
+       "      <td>2701.297655</td>\n",
+       "      <td>-2.393732</td>\n",
+       "      <td>0.038346</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GADD45A</td>\n",
+       "      <td>1606.359692</td>\n",
+       "      <td>-2.596506</td>\n",
+       "      <td>0.051663</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>IFI44L</td>\n",
+       "      <td>2107.963029</td>\n",
+       "      <td>-2.474058</td>\n",
+       "      <td>0.043739</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>MCL1</td>\n",
+       "      <td>4403.662831</td>\n",
+       "      <td>-1.206698</td>\n",
+       "      <td>0.030463</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ID2</td>\n",
+       "      <td>2049.391917</td>\n",
+       "      <td>-3.285105</td>\n",
+       "      <td>0.049060</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3032</th>\n",
+       "      <td>AKT3</td>\n",
+       "      <td>235.660229</td>\n",
+       "      <td>-0.228459</td>\n",
+       "      <td>0.117286</td>\n",
+       "      <td>0.009927</td>\n",
+       "      <td>0.049581</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3033</th>\n",
+       "      <td>LGALS3</td>\n",
+       "      <td>11.347210</td>\n",
+       "      <td>0.485461</td>\n",
+       "      <td>0.581906</td>\n",
+       "      <td>0.009935</td>\n",
+       "      <td>0.049607</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3034</th>\n",
+       "      <td>PTGR1</td>\n",
+       "      <td>74.282079</td>\n",
+       "      <td>0.326648</td>\n",
+       "      <td>0.222420</td>\n",
+       "      <td>0.009967</td>\n",
+       "      <td>0.049750</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3035</th>\n",
+       "      <td>CENPH</td>\n",
+       "      <td>431.522818</td>\n",
+       "      <td>-0.182787</td>\n",
+       "      <td>0.086405</td>\n",
+       "      <td>0.009979</td>\n",
+       "      <td>0.049791</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3036</th>\n",
+       "      <td>NXPE3</td>\n",
+       "      <td>151.781287</td>\n",
+       "      <td>-0.259021</td>\n",
+       "      <td>0.143365</td>\n",
+       "      <td>0.010015</td>\n",
+       "      <td>0.049958</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3037 rows × 6 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         gene     baseMean  log2FoldChange     lfcSE    pvalue      padj\n",
+       "0        IFI6  2701.297655       -2.393732  0.038346  0.000000  0.000000\n",
+       "1     GADD45A  1606.359692       -2.596506  0.051663  0.000000  0.000000\n",
+       "2      IFI44L  2107.963029       -2.474058  0.043739  0.000000  0.000000\n",
+       "3        MCL1  4403.662831       -1.206698  0.030463  0.000000  0.000000\n",
+       "4         ID2  2049.391917       -3.285105  0.049060  0.000000  0.000000\n",
+       "...       ...          ...             ...       ...       ...       ...\n",
+       "3032     AKT3   235.660229       -0.228459  0.117286  0.009927  0.049581\n",
+       "3033   LGALS3    11.347210        0.485461  0.581906  0.009935  0.049607\n",
+       "3034    PTGR1    74.282079        0.326648  0.222420  0.009967  0.049750\n",
+       "3035    CENPH   431.522818       -0.182787  0.086405  0.009979  0.049791\n",
+       "3036    NXPE3   151.781287       -0.259021  0.143365  0.010015  0.049958\n",
+       "\n",
+       "[3037 rows x 6 columns]"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dex[k]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15c1507e-70bc-4a8f-8666-2aa08ee1b046",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "420817ea-4e13-42c6-a71b-2357dbe914a8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "601e1302-91ec-491e-ae58-b5863013de2a",
+   "metadata": {},
+   "source": [
+    "# END"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/scRNAseq/README_scRNAseq_10x.md
+++ b/scRNAseq/README_scRNAseq_10x.md
+The main steps of the scRNAseq upstream analysis comprised cellranger count processing and souporcell (for demultiplexing the samples sequenced together).
+Standard QC and filtering steps followed, leading to a midstream alignment/integration of all samples using Seurat (v4). For this, each library was SCTransformed, with count number, mitochondrial fraction, S score and G2M score regressed out, before integrating all samples via CCA. Downstream analyses included annotation transfer using both Azimuth and Symphony, and trajectory analysis using Cellrank (v2) and tradeSeq.
+- 00_* : helper wrappers for demultiplexing, alignment and quantification of samples
+- 06b_* : script to align all samples using Seurat (v4)
+- 07[h|i]* : notebooks for annotation of the integrated/aligned dataset
+- 07k4* : notebook for pseudobulk differential expression testing with DESeq2
+- 07m* : notebooks and scripts for defining and analysing trajectories using Cellrank (v2) and tradeSeq
+- 07n* : notebook defining Subset1-like cells
+- 07o* : notebooks to produce specific figures and table for the manuscript