{ "cells": [ { "cell_type": "markdown", "id": "a223e20f-5403-4aa0-935e-214df220ad93", "metadata": {}, "source": [ "# bthal dataset v6b - collate and export diff expr results" ] }, { "cell_type": "code", "execution_count": 1, "id": "02b5201c-bc31-4361-a57f-f58d95adae62", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ipywidgets: 8.1.2\n", "matplotlib: 3.7.4\n", "scanpy : 1.9.8\n", "seaborn : 0.13.0\n", "sys : 3.8.10 (default, Nov 22 2023, 10:22:35) \n", "[GCC 9.4.0]\n", "pandas : 2.0.3\n", "numpy : 1.24.4\n", "json : 2.0.9\n", "\n" ] } ], "source": [ "%matplotlib widget\n", "%load_ext watermark\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "import os, sys, json, operator, getpass\n", "from pathlib import Path\n", "from datetime import datetime\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import scanpy as sc\n", "\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "import ipywidgets as widgets\n", "\n", "\n", "sc.settings.verbosity = 3 # show some output\n", "sc.settings.file_format_figs = 'svg' # set this to 'svg' (notebook) or 'pdf' (files) if you want vector graphics\n", "sc.settings.savefigs = False\n", "\n", "# plt.rcParams['font.family'] = 'sans-serif'\n", "# plt.rcParams['font.sans-serif'] = 'Arial'\n", "# plt.rc('font', size=14)\n", "\n", "home = str(Path.home())\n", "user = getpass.getuser()\n", "\n", "%watermark --iversions" ] }, { "cell_type": "code", "execution_count": null, "id": "2389390c-ab9a-44da-84a4-df73cce20e25", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 3, "id": "4c834193-f1a4-483c-a1bc-7bdfc92dd0b6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'proteus.45g'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# show the 'Version' label recorded in the container's labels file\n", "with open('/.singularity.d/labels.json') as fh:\n", "    singularity = json.load(fh)\n", "\n", 
"singularity['Version']" ] }, { "cell_type": "code", "execution_count": 4, "id": "7696e6b1-47d9-4d33-95c5-f142508734bb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20240412\n" ] } ], "source": [ "now = datetime.now()\n", "prefix = now.strftime('%Y%m%d')\n", "print(prefix)" ] }, { "cell_type": "code", "execution_count": null, "id": "78143475-18b4-41df-a0ed-8c466f8c5573", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "8efd3dd9-2512-447c-aad5-527eec19241a", "metadata": {}, "source": [ "### collect dex file list" ] }, { "cell_type": "code", "execution_count": 6, "id": "c9a30e75-e0db-4579-84f9-a8fe2fe55f86", "metadata": {}, "outputs": [], "source": [ "# NOTE(review): `basedir` is not assigned in any cell shown above; confirm it is\n", "# defined before this cell runs on a fresh kernel.\n", "dex_dir = os.path.join(basedir, 'dex')\n", "files = sorted(os.listdir(dex_dir))" ] }, { "cell_type": "code", "execution_count": null, "id": "a37bcfa0-cdca-4a2b-8bee-26b7ea98b48a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 8, "id": "4ec3ee28-32ce-482f-bd65-e15740470274", "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "code", "execution_count": 9, "id": "a6e7db93-7da6-49b1-a3c5-e4957251572a", "metadata": {}, "outputs": [], "source": [ "# Raw string so the regex escapes are not interpreted as string escapes.\n", "# The dot before 'tsv' is escaped and '$' anchors the end, so only exact file names match;\n", "# group 1 captures the part between '..._v6b_' and '_bthal_vs_healthy_...'.\n", "patt = re.compile(r'^\\d{8}_bthalcombo_v6b_(.+)_bthal_vs_healthy_sig_genes_DESeq2_pb\\.tsv$')" ] }, { "cell_type": "code", "execution_count": null, "id": "fed11c9d-3973-4170-87c1-2450a6b54fc9", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "id": "3a0ba3b0-6e7c-4e7d-a13e-9d6881e82fe7", "metadata": {}, "outputs": [], "source": [ "# labels substituted for the first '_'-separated token of each captured comparison name\n", "repl = {'C10': 'Mende', 'C45': 'Zeng', 'strict': 'pheno'}" ] }, { "cell_type": "code", "execution_count": null, "id": "3e2b998b-3c42-4643-957e-5eb1c64b0b5c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "d6defa3b-0571-477a-8581-fae088b721ee", "metadata": {}, "source": [ "### parse dex files" ] }, { "cell_type": "code", "execution_count": 23, "id": 
"88bda9da-e678-467c-89a4-b06a92464f9c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mende_cluster_02-MEP2\n", "Mende_cluster_04-HSC-MPP1\n", "Mende_cluster_05-HSC-MPP3\n", "Mende_cluster_08-EryP\n", "Mende_cluster_10-MPP-to-MEP\n", "Mende_cluster_12-MEP1\n", "Mende_lineage_HSC-MPP\n", "Mende_lineage_MDP\n", "Mende_lineage_early-MEMBP\n", "Mende_lineage_late-MEMBP\n", "Mende_lineage_late-MyP\n", "Mende_lineage_primed-MPP\n", "Zeng_cluster_BFU-E\n", "Zeng_cluster_CFU-E\n", "Zeng_cluster_GMP-Mono\n", "Zeng_lineage_Erytroid\n", "Zeng_lineage_HSC\n", "Zeng_lineage_MDP\n", "Zeng_lineage_MEP\n", "Zeng_lineage_MPP-MkEry\n", "Zeng_lineage_MPP-MyLy\n", "pheno_HSCMPP_population\n", "22\n", "22\n" ] } ], "source": [ "dex = {}\n", "counter = 0\n", "\n", "for f in files:\n", "    m = patt.search(f)\n", "    if m:\n", "        # swap the first '_'-separated token of the captured name for its label in `repl`\n", "        tokens = m.group(1).split('_')\n", "        comp = '_'.join([repl[tokens[0]]] + tokens[1:])\n", "\n", "        # read from the same directory that was listed to build `files`\n", "        dex[comp] = pd.read_csv(os.path.join(dex_dir, f), sep='\\t')\n", "        counter += 1\n", "        print(comp)\n", "\n", "# number of directory entries vs. number of files that matched and were loaded\n", "print(len(files))\n", "print(counter)" ] }, { "cell_type": "code", "execution_count": null, "id": "52bfc29d-b5e6-4304-ab41-bd4344d82339", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "bdf67b7f-f74e-4d13-8ad7-63d482aaa2f7", "metadata": {}, "source": [ "### make summary" ] }, { "cell_type": "code", "execution_count": 25, "id": "a165f92a-7635-4b6d-8fbe-8ead957f8128", "metadata": {}, "outputs": [], "source": [ "# rows are collected under their own name so `summary` is only ever the DataFrame\n", "summary_rows = []" ] }, { "cell_type": "code", "execution_count": 26, "id": "58a10438-2e17-47d9-8767-7fdaad012c51", "metadata": {}, "outputs": [], "source": [ "# one row per comparison: name, number of rows, and the counts of positive and\n", "# negative log2FoldChange values (the two counts are stored as strings)\n", "for k, res in dex.items():\n", "    n_up = (res.log2FoldChange > 0).sum()\n", "    n_down = (res.log2FoldChange < 0).sum()\n", "    summary_rows.append([k, res.shape[0], str(n_up), str(n_down)])" ] }, { "cell_type": "code", "execution_count": null, "id": "b7c8e5aa-6b54-461e-8238-b27ad8eaa4ac", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", 
"execution_count": 27, "id": "418fca10-d67b-43a2-ade9-d479ed88cc4c", "metadata": {}, "outputs": [], "source": [ "summary = pd.DataFrame(summary_rows, columns=['Subset', 'Total', 'Upregulated', 'Downregulated'])" ] }, { "cell_type": "code", "execution_count": 35, "id": "8bc43c6b-5021-4db7-bf47-aa21c58d4e69", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Subset | \n", "Total | \n", "Upregulated | \n", "Downregulated | \n", "
---|---|---|---|---|
0 | \n", "Mende_cluster_02-MEP2 | \n", "15 | \n", "1 | \n", "14 | \n", "
1 | \n", "Mende_cluster_04-HSC-MPP1 | \n", "103 | \n", "25 | \n", "78 | \n", "
2 | \n", "Mende_cluster_05-HSC-MPP3 | \n", "295 | \n", "122 | \n", "173 | \n", "
3 | \n", "Mende_cluster_08-EryP | \n", "14 | \n", "3 | \n", "11 | \n", "
4 | \n", "Mende_cluster_10-MPP-to-MEP | \n", "52 | \n", "7 | \n", "45 | \n", "
5 | \n", "Mende_cluster_12-MEP1 | \n", "5 | \n", "0 | \n", "5 | \n", "
6 | \n", "Mende_lineage_HSC-MPP | \n", "356 | \n", "153 | \n", "203 | \n", "
7 | \n", "Mende_lineage_MDP | \n", "0 | \n", "0 | \n", "0 | \n", "
8 | \n", "Mende_lineage_early-MEMBP | \n", "17 | \n", "2 | \n", "15 | \n", "
9 | \n", "Mende_lineage_late-MEMBP | \n", "22 | \n", "3 | \n", "19 | \n", "
10 | \n", "Mende_lineage_late-MyP | \n", "76 | \n", "10 | \n", "66 | \n", "
11 | \n", "Mende_lineage_primed-MPP | \n", "52 | \n", "7 | \n", "45 | \n", "
12 | \n", "Zeng_cluster_BFU-E | \n", "9 | \n", "1 | \n", "8 | \n", "
13 | \n", "Zeng_cluster_CFU-E | \n", "6 | \n", "2 | \n", "4 | \n", "
14 | \n", "Zeng_cluster_GMP-Mono | \n", "4 | \n", "0 | \n", "4 | \n", "
15 | \n", "Zeng_lineage_Erytroid | \n", "21 | \n", "1 | \n", "20 | \n", "
16 | \n", "Zeng_lineage_HSC | \n", "272 | \n", "115 | \n", "157 | \n", "
17 | \n", "Zeng_lineage_MDP | \n", "6 | \n", "0 | \n", "6 | \n", "
18 | \n", "Zeng_lineage_MEP | \n", "33 | \n", "5 | \n", "28 | \n", "
19 | \n", "Zeng_lineage_MPP-MkEry | \n", "104 | \n", "26 | \n", "78 | \n", "
20 | \n", "Zeng_lineage_MPP-MyLy | \n", "136 | \n", "35 | \n", "101 | \n", "
21 | \n", "pheno_HSCMPP_population | \n", "3037 | \n", "1465 | \n", "1572 | \n", "
\n", " | gene | \n", "baseMean | \n", "log2FoldChange | \n", "lfcSE | \n", "pvalue | \n", "padj | \n", "
---|---|---|---|---|---|---|
0 | \n", "IFI6 | \n", "2701.297655 | \n", "-2.393732 | \n", "0.038346 | \n", "0.000000 | \n", "0.000000 | \n", "
1 | \n", "GADD45A | \n", "1606.359692 | \n", "-2.596506 | \n", "0.051663 | \n", "0.000000 | \n", "0.000000 | \n", "
2 | \n", "IFI44L | \n", "2107.963029 | \n", "-2.474058 | \n", "0.043739 | \n", "0.000000 | \n", "0.000000 | \n", "
3 | \n", "MCL1 | \n", "4403.662831 | \n", "-1.206698 | \n", "0.030463 | \n", "0.000000 | \n", "0.000000 | \n", "
4 | \n", "ID2 | \n", "2049.391917 | \n", "-3.285105 | \n", "0.049060 | \n", "0.000000 | \n", "0.000000 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
3032 | \n", "AKT3 | \n", "235.660229 | \n", "-0.228459 | \n", "0.117286 | \n", "0.009927 | \n", "0.049581 | \n", "
3033 | \n", "LGALS3 | \n", "11.347210 | \n", "0.485461 | \n", "0.581906 | \n", "0.009935 | \n", "0.049607 | \n", "
3034 | \n", "PTGR1 | \n", "74.282079 | \n", "0.326648 | \n", "0.222420 | \n", "0.009967 | \n", "0.049750 | \n", "
3035 | \n", "CENPH | \n", "431.522818 | \n", "-0.182787 | \n", "0.086405 | \n", "0.009979 | \n", "0.049791 | \n", "
3036 | \n", "NXPE3 | \n", "151.781287 | \n", "-0.259021 | \n", "0.143365 | \n", "0.010015 | \n", "0.049958 | \n", "
3037 rows × 6 columns
\n", "