{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import celloracle as co\n",
    "from celloracle.applications import Pseudotime_calculator\n",
    "from celloracle.applications import Gradient_calculator\n",
    "from celloracle.applications import Oracle_development_module\n",
    "import anndata2ri\n",
    "import os\n",
    "import sys\n",
    "import time\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scanpy as sc\n",
    "import seaborn as sns\n",
    "import copy\n",
    "import glob\n",
    "import time\n",
    "import shutil\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plotting parameters settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# visualization settings\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "%matplotlib inline\n",
    "plt.rcParams['figure.figsize'] = [6, 4.5]\n",
    "plt.rcParams[\"savefig.dpi\"] = 300"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_folder = \"Scarfo_HEC2023/scRNAseq\"\n",
    "os.makedirs(save_folder, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Seurat object in Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Activate the anndata2ri conversion between SingleCellExperiment and AnnData\n",
    "anndata2ri.activate()\n",
    "#Loading the rpy2 extension\n",
    "%load_ext rpy2.ipython\n",
    "sc.settings.verbosity = 3\n",
    "sc.logging.print_versions()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R\n",
    "suppressPackageStartupMessages(library(Seurat))\n",
    "runx1_obj <- readRDS(\"Scarfo_HEC2023/scRNAseq/RUNX1_clusters.rds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -o runx1_obj_sce\n",
    "#convert the Seurat object to a SingleCellExperiment object\n",
    "runx1_obj_sce <- as.SingleCellExperiment(runx1_obj)\n",
    "runx1_obj_sce"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata = runx1_obj_sce\n",
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Cell number is :{adata.shape[0]}\")\n",
    "print(f\"Gene number is :{adata.shape[1]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pl.umap(adata, color='RNA_snn_new_res.0.6')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Constructing GRN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select top 3000 highly-variable genes\n",
    "filter_result = sc.pp.filter_genes_dispersion(adata.X,flavor='cell_ranger',n_top_genes=3000,log=False)\n",
    "# Subset the genes\n",
    "adata = adata[:, filter_result.gene_subset]\n",
    "print(f\"Cell number is :{adata.shape[0]}\")\n",
    "print(f\"Gene number is :{adata.shape[1]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load data from CellTalkDB\n",
    "ligand_receptor = pd.read_csv(\"Scarfo_HEC2023/scRNAseq/CellTalkDB_modified.txt\", sep='\\t')\n",
    "ligand_receptor.columns = ['Ligand','Receptor']\n",
    "ligand_receptor\n",
    "print('FCGR2B' in ligand_receptor['Ligand'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make dictionary: dictionary key is Ligand and dictionary value is list of target genes.\n",
    "LR_to_TG_dictionary = {}\n",
    "for LR, TGs in zip(ligand_receptor.Ligand, ligand_receptor.Receptor):\n",
    "    TG_list = TGs.replace(\" \", \"\").split(\",\")\n",
    "    LR_to_TG_dictionary[LR] = TG_list\n",
    "TG_to_LR_dictionary = co.utility.inverse_dictionary(LR_to_TG_dictionary)\n",
    "TG_to_LR_dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate Oracle object\n",
    "oracle = co.Oracle()\n",
    "oracle.import_anndata_as_raw_count(adata=adata,cluster_column_name=\"RNA_snn_new_res.0.6\",embedding_name=\"X_umap\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add LR information \n",
    "oracle.addTFinfo_dictionary(TG_to_LR_dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Perform PCA\n",
    "oracle.perform_PCA()\n",
    "# Select important PCs\n",
    "plt.plot(np.cumsum(oracle.pca.explained_variance_ratio_)[:100])\n",
    "n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0]\n",
    "plt.axvline(n_comps, c=\"k\")\n",
    "plt.show()\n",
    "print(n_comps)\n",
    "n_comps = min(n_comps, 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Estimate the optimal number of nearest neighbors for KNN imputation\n",
    "n_cell = oracle.adata.shape[0]\n",
    "print(f\"cell number is :{n_cell}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k = int(0.025*n_cell)\n",
    "print(f\"Auto-selected k is :{k}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# perform KNN imputation\n",
    "oracle.knn_imputation(n_pca_dims=n_comps,k=k, balanced=True,b_sight=k*8,b_maxl=k*4,n_jobs=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GRN calculation\n",
    "%time\n",
    "links = oracle.get_links(cluster_name_for_GRN_unit=\"RNA_snn_new_res.0.6\", alpha=10,verbose_level=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "links.links_dict.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# filter weak edges\n",
    "links.filter_links(p=0.001, weight=\"coef_abs\", threshold_number=2000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# network degree distribution\n",
    "links.plot_degree_distributions(plot_model=True,\n",
    "                                save=f\"{save_folder}/degree_distribution/\",)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate network scores.\n",
    "links.get_network_score()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Adding pseudotime to oracle object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate pseudotime object using oracle object.\n",
    "pt = Pseudotime_calculator(oracle_object=oracle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clusters_in_RUNX1_lineage = ['0','1','2','11','16','17']\n",
    "# Make a dictionary\n",
    "lineage_dictionary = {\"Lineage_RUNX1\": clusters_in_RUNX1_lineage}\n",
    "# Input lineage information into pseudotime object\n",
    "pt.set_lineage(lineage_dictionary=lineage_dictionary)\n",
    "# Visualize lineage information\n",
    "pt.plot_lineages()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add root cell information\n",
    "# plotly is required\n",
    "try:\n",
    "    import plotly.express as px\n",
    "    def plot(adata, embedding_key, cluster_column_name):\n",
    "        embedding = adata.obsm[embedding_key]\n",
    "        df = pd.DataFrame(embedding, columns=[\"x\", \"y\"])\n",
    "        df[\"cluster\"] = adata.obs[cluster_column_name].values\n",
    "        df[\"label\"] = adata.obs.index.values\n",
    "        fig = px.scatter(df, x=\"x\", y=\"y\", hover_name=df[\"label\"], color=\"cluster\")\n",
    "        fig.show()\n",
    "    plot(adata=pt.adata,\n",
    "         embedding_key=pt.obsm_key,\n",
    "         cluster_column_name=pt.cluster_column_name)\n",
    "except:\n",
    "    print(\"Plotly not found in your environment. Did you install plotly?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Estimated root cell name for each lineage\n",
    "root_cells = {\"Lineage_RUNX1\": \"Sample1_GGAGAACTCGCGGTAC-1\"}\n",
    "pt.set_root_cells(root_cells=root_cells)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check diffusion map data.\n",
    "\"X_diffmap\" in pt.adata.obsm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate diffusion map\n",
    "sc.pp.neighbors(pt.adata, n_neighbors=30)\n",
    "sc.tl.diffmap(pt.adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate pseudotime\n",
    "pt.get_pseudotime_per_each_lineage()\n",
    "# Check results\n",
    "pt.plot_pseudotime(cmap=\"rainbow\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add calculated pseudotime data to the oracle object\n",
    "oracle.adata.obs = pt.adata.obs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## In silico perturbation analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make pridictive models for simulation and fit the ridge regression models again\n",
    "links.filter_links()\n",
    "oracle.get_cluster_specific_TFdict_from_Links(links_object=links)\n",
    "oracle.fit_GRN_for_simulation(alpha=10, use_cluster_specific_TFdict=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "goi = \"FCGR2B\"\n",
    "# Enter perturbation conditions to simulate signal propagation after the perturbation.\n",
    "oracle.simulate_shift(perturb_condition={goi: 0.0}, n_propagation=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get transition probability\n",
    "oracle.estimate_transition_prob(n_neighbors=200,knn_random=True,sampled_fraction=1)\n",
    "# Calculate embedding\n",
    "oracle.calculate_embedding_shift(sigma_corr=0.05)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quiver plots\n",
    "fig, ax = plt.subplots(1, 2,  figsize=[13, 6])\n",
    "scale = 25\n",
    "# Show quiver plot\n",
    "oracle.plot_quiver(scale=scale, ax=ax[0])\n",
    "ax[0].set_title(f\"Simulated cell identity shift vector: {goi} KO\")\n",
    "# Show quiver plot that was calculated with randomized graph.\n",
    "oracle.plot_quiver_random(scale=scale, ax=ax[1])\n",
    "ax[1].set_title(f\"Randomized simulation vector\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# establishing digitalized grids\n",
    "n_grid = 40\n",
    "oracle.calculate_p_mass(smooth=0.8, n_grid=n_grid, n_neighbors=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search for best min_mass.\n",
    "oracle.suggest_mass_thresholds(n_suggestion=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "min_mass = 19\n",
    "oracle.calculate_mass_filter(min_mass=min_mass, plot=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# shift vector plots\n",
    "fig, ax = plt.subplots(1, 2,  figsize=[13, 5])\n",
    "scale_simulation = 2\n",
    "# Show quiver plot\n",
    "oracle.plot_simulation_flow_on_grid(scale=scale_simulation, ax=ax[0])\n",
    "ax[0].set_title(f\"Simulated cell identity shift vector: {goi} KO\")\n",
    "# Show quiver plot that was calculated with randomized graph.\n",
    "oracle.plot_simulation_flow_random_on_grid(scale=scale_simulation, ax=ax[1])\n",
    "ax[1].set_title(f\"Randomized simulation vector\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot vector field with cell cluster\n",
    "fig, ax = plt.subplots(figsize=[8, 8])\n",
    "oracle.plot_cluster_whole(ax=ax, s=10)\n",
    "oracle.plot_simulation_flow_on_grid(scale=scale_simulation, ax=ax, show_background=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize pseudotime\n",
    "fig, ax = plt.subplots(figsize=[6,6])\n",
    "sc.pl.embedding(adata=oracle.adata, basis=oracle.embedding_name, ax=ax, cmap=\"viridis\",\n",
    "                color=[\"Pseudotime\"], save=\"pseudotime.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate Gradient calculator object\n",
    "gradient = Gradient_calculator(oracle_object=oracle, pseudotime_key=\"Pseudotime\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gradient.calculate_p_mass(smooth=0.8, n_grid=n_grid, n_neighbors=200)\n",
    "gradient.calculate_mass_filter(min_mass=min_mass, plot=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert pseudotime intop grid points\n",
    "gradient.transfer_data_into_grid(args={\"method\": \"polynomial\", \"n_poly\":5}, plot=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate graddient\n",
    "gradient.calculate_gradient()\n",
    "# Show results\n",
    "scale_dev = 20\n",
    "gradient.visualize_results(scale=scale_dev, s=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize results\n",
    "fig, ax = plt.subplots(figsize=[6, 6])\n",
    "gradient.plot_dev_flow_on_grid(scale=scale_dev, ax=ax)\n",
    "ax.set_title(f\"Normal Developmental flow\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quantitatively compare the directionality and size of vectors between perturbation simulation and natural differentiation using inner product\n",
    "# Make Oracle_development_module to compare two vector field\n",
    "dev = Oracle_development_module()\n",
    "# Load development flow\n",
    "dev.load_differentiation_reference_data(gradient_object=gradient)\n",
    "# Load simulation result\n",
    "dev.load_perturb_simulation_data(oracle_object=oracle)\n",
    "# Calculate inner produc scores\n",
    "dev.calculate_inner_product()\n",
    "dev.calculate_digitized_ip(n_bins=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show perturbation scores\n",
    "vm = 0.2\n",
    "fig, ax = plt.subplots(1, 2, figsize=[12, 5])\n",
    "dev.plot_inner_product_on_grid(vm=0.02, s=50, ax=ax[0])\n",
    "ax[0].set_title(f\"PS calculated with FCGR2B KO\")\n",
    "dev.plot_inner_product_random_on_grid(vm=vm, s=50, ax=ax[1])\n",
    "ax[1].set_title(f\"PS calculated with Randomized simulation vector\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show perturbation scores with perturbation simulation vector field\n",
    "fig, ax = plt.subplots(figsize=[6, 6])\n",
    "dev.plot_inner_product_on_grid(vm=0.05, s=50, ax=ax)\n",
    "dev.plot_simulation_flow_on_grid(scale=scale_simulation, show_background=False, ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# summary plots\n",
    "# visualize the results\n",
    "dev.visualize_development_module_layout_0(s=5, scale_for_simulation=scale_simulation, s_grid=50, scale_for_pseudotime=scale_dev, vm=0.05)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:postsc]",
   "language": "python",
   "name": "conda-env-postsc-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}