{ "cells": [ { "cell_type": "markdown", "id": "a223e20f-5403-4aa0-935e-214df220ad93", "metadata": {}, "source": [ "# bthal dataset v6b - collate and export diff expr results" ] }, { "cell_type": "code", "execution_count": 1, "id": "02b5201c-bc31-4361-a57f-f58d95adae62", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ipywidgets: 8.1.2\n", "matplotlib: 3.7.4\n", "scanpy : 1.9.8\n", "seaborn : 0.13.0\n", "sys : 3.8.10 (default, Nov 22 2023, 10:22:35) \n", "[GCC 9.4.0]\n", "pandas : 2.0.3\n", "numpy : 1.24.4\n", "json : 2.0.9\n", "\n" ] } ], "source": [ "%matplotlib widget\n", "%load_ext watermark\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "import os, sys, json, operator, getpass\n", "from pathlib import Path\n", "from datetime import datetime\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import scanpy as sc\n", "\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "import ipywidgets as widgets\n", "\n", "\n", "sc.settings.verbosity = 3 # show some output\n", "sc.settings.file_format_figs = 'svg' # set this to 'svg' (notebook) or 'pdf' (files) if you want vector graphics\n", "sc.settings.savefigs = False\n", "\n", "# plt.rcParams['font.family'] = 'sans-serif'\n", "# plt.rcParams['font.sans-serif'] = 'Arial'\n", "# plt.rc('font', size=14)\n", "\n", "home = str(Path.home())\n", "user = getpass.getuser()\n", "\n", "%watermark --iversions" ] }, { "cell_type": "code", "execution_count": null, "id": "2389390c-ab9a-44da-84a4-df73cce20e25", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 3, "id": "4c834193-f1a4-483c-a1bc-7bdfc92dd0b6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'proteus.45g'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with open('/.singularity.d/labels.json') as fh:\n", " singularity = json.load(fh)\n", " \n", "singularity['Version']" ] }, { "cell_type": "code", "execution_count": 4, "id": "7696e6b1-47d9-4d33-95c5-f142508734bb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20240412\n" ] } ], "source": [ "now = datetime.now()\n", "prefix = now.strftime('%Y%m%d')\n", "print(prefix)" ] }, { "cell_type": "code", "execution_count": null, "id": "78143475-18b4-41df-a0ed-8c466f8c5573", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "8efd3dd9-2512-447c-aad5-527eec19241a", "metadata": {}, "source": [ "### collect dex file list" ] }, { "cell_type": "code", "execution_count": 6, "id": "c9a30e75-e0db-4579-84f9-a8fe2fe55f86", "metadata": {}, "outputs": [], "source": [ "files = sorted(os.listdir(os.path.join(basedir, 'dex')))" ] }, { "cell_type": "code", "execution_count": null, "id": "a37bcfa0-cdca-4a2b-8bee-26b7ea98b48a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 8, "id": "4ec3ee28-32ce-482f-bd65-e15740470274", "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "code", "execution_count": 9, "id": "a6e7db93-7da6-49b1-a3c5-e4957251572a", "metadata": {}, "outputs": [], "source": [ "patt = re.compile('^\\d{8}\\_bthalcombo\\_v6b\\_(.+)\\_bthal\\_vs\\_healthy\\_sig\\_genes\\_DESeq2\\_pb.tsv')" ] }, { "cell_type": "code", "execution_count": null, "id": "fed11c9d-3973-4170-87c1-2450a6b54fc9", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "id": "3a0ba3b0-6e7c-4e7d-a13e-9d6881e82fe7", "metadata": {}, "outputs": [], "source": [ "repl = {'C10': 'Mende', 'C45': 'Zeng', 'strict': 'pheno'}" ] }, { "cell_type": "code", "execution_count": null, "id": "3e2b998b-3c42-4643-957e-5eb1c64b0b5c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "d6defa3b-0571-477a-8581-fae088b721ee", "metadata": {}, "source": [ "### parse dex files" ] }, { "cell_type": "code", "execution_count": 23, "id": "88bda9da-e678-467c-89a4-b06a92464f9c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mende_cluster_02-MEP2\n", "Mende_cluster_04-HSC-MPP1\n", "Mende_cluster_05-HSC-MPP3\n", "Mende_cluster_08-EryP\n", "Mende_cluster_10-MPP-to-MEP\n", "Mende_cluster_12-MEP1\n", "Mende_lineage_HSC-MPP\n", "Mende_lineage_MDP\n", "Mende_lineage_early-MEMBP\n", "Mende_lineage_late-MEMBP\n", "Mende_lineage_late-MyP\n", "Mende_lineage_primed-MPP\n", "Zeng_cluster_BFU-E\n", "Zeng_cluster_CFU-E\n", "Zeng_cluster_GMP-Mono\n", "Zeng_lineage_Erytroid\n", "Zeng_lineage_HSC\n", "Zeng_lineage_MDP\n", "Zeng_lineage_MEP\n", "Zeng_lineage_MPP-MkEry\n", "Zeng_lineage_MPP-MyLy\n", "pheno_HSCMPP_population\n", "22\n", "22\n" ] } ], "source": [ "dex = {}\n", "counter = 0\n", "\n", "for f in files:\n", " m = patt.search(f)\n", " if m:\n", " temp = m.group(1)\n", " temp = temp.split('_')\n", " temp = [ repl[temp[0]] ] + temp[1:]\n", " comp = '_'.join(temp)\n", " \n", " dex[comp] = pd.read_csv('../dex/'+f, sep='\\t')\n", " counter += 1\n", " print(comp)\n", " \n", "print(len(files))\n", "print(counter)" ] }, { "cell_type": "code", "execution_count": null, "id": "52bfc29d-b5e6-4304-ab41-bd4344d82339", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "bdf67b7f-f74e-4d13-8ad7-63d482aaa2f7", "metadata": {}, "source": [ "### make summary" ] }, { "cell_type": "code", "execution_count": 25, "id": "a165f92a-7635-4b6d-8fbe-8ead957f8128", "metadata": {}, "outputs": [], "source": [ "summary = []" ] }, { "cell_type": "code", "execution_count": 26, "id": "58a10438-2e17-47d9-8767-7fdaad012c51", "metadata": {}, "outputs": [], "source": [ "for k in dex.keys():\n", " summary.append([k, dex[k].shape[0], str(sum( dex[k].log2FoldChange > 0 )), str(sum( dex[k].log2FoldChange < 0 ))])" ] }, { "cell_type": "code", "execution_count": null, "id": "b7c8e5aa-6b54-461e-8238-b27ad8eaa4ac", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 27, "id": "418fca10-d67b-43a2-ade9-d479ed88cc4c", "metadata": {}, "outputs": [], "source": [ "summary = pd.DataFrame(summary, columns=['Subset', 'Total', 'Upregulated', 'Downregulated'])" ] }, { "cell_type": "code", "execution_count": 35, "id": "8bc43c6b-5021-4db7-bf47-aa21c58d4e69", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SubsetTotalUpregulatedDownregulated
0Mende_cluster_02-MEP215114
1Mende_cluster_04-HSC-MPP11032578
2Mende_cluster_05-HSC-MPP3295122173
3Mende_cluster_08-EryP14311
4Mende_cluster_10-MPP-to-MEP52745
5Mende_cluster_12-MEP1505
6Mende_lineage_HSC-MPP356153203
7Mende_lineage_MDP000
8Mende_lineage_early-MEMBP17215
9Mende_lineage_late-MEMBP22319
10Mende_lineage_late-MyP761066
11Mende_lineage_primed-MPP52745
12Zeng_cluster_BFU-E918
13Zeng_cluster_CFU-E624
14Zeng_cluster_GMP-Mono404
15Zeng_lineage_Erytroid21120
16Zeng_lineage_HSC272115157
17Zeng_lineage_MDP606
18Zeng_lineage_MEP33528
19Zeng_lineage_MPP-MkEry1042678
20Zeng_lineage_MPP-MyLy13635101
21pheno_HSCMPP_population303714651572
\n", "
" ], "text/plain": [ " Subset Total Upregulated Downregulated\n", "0 Mende_cluster_02-MEP2 15 1 14\n", "1 Mende_cluster_04-HSC-MPP1 103 25 78\n", "2 Mende_cluster_05-HSC-MPP3 295 122 173\n", "3 Mende_cluster_08-EryP 14 3 11\n", "4 Mende_cluster_10-MPP-to-MEP 52 7 45\n", "5 Mende_cluster_12-MEP1 5 0 5\n", "6 Mende_lineage_HSC-MPP 356 153 203\n", "7 Mende_lineage_MDP 0 0 0\n", "8 Mende_lineage_early-MEMBP 17 2 15\n", "9 Mende_lineage_late-MEMBP 22 3 19\n", "10 Mende_lineage_late-MyP 76 10 66\n", "11 Mende_lineage_primed-MPP 52 7 45\n", "12 Zeng_cluster_BFU-E 9 1 8\n", "13 Zeng_cluster_CFU-E 6 2 4\n", "14 Zeng_cluster_GMP-Mono 4 0 4\n", "15 Zeng_lineage_Erytroid 21 1 20\n", "16 Zeng_lineage_HSC 272 115 157\n", "17 Zeng_lineage_MDP 6 0 6\n", "18 Zeng_lineage_MEP 33 5 28\n", "19 Zeng_lineage_MPP-MkEry 104 26 78\n", "20 Zeng_lineage_MPP-MyLy 136 35 101\n", "21 pheno_HSCMPP_population 3037 1465 1572" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary" ] }, { "cell_type": "code", "execution_count": null, "id": "f964c8e5-1b21-438e-816a-a131b920cf98", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 40, "id": "3f5b8eb3-1354-415c-9158-39d89da675da", "metadata": {}, "outputs": [], "source": [ "# summary.to_excel('../output/20240412_DESEq2_pseudobulk_bthal_vs_healthy_comparisons.xlsx', index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "7c07f0b6-5c92-46dc-8da2-6f912f78ef47", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "99902890-81d5-4539-baeb-1a4729bd9652", "metadata": {}, "source": [ "### write out dex results (1 subset per tab)" ] }, { "cell_type": "code", "execution_count": 41, "id": "603f4143-dab0-4eb9-b226-b1ee6abe2925", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['Mende_cluster_02-MEP2', 'Mende_cluster_04-HSC-MPP1', 'Mende_cluster_05-HSC-MPP3', 'Mende_cluster_08-EryP', 'Mende_cluster_10-MPP-to-MEP', 'Mende_cluster_12-MEP1', 'Mende_lineage_HSC-MPP', 'Mende_lineage_MDP', 'Mende_lineage_early-MEMBP', 'Mende_lineage_late-MEMBP', 'Mende_lineage_late-MyP', 'Mende_lineage_primed-MPP', 'Zeng_cluster_BFU-E', 'Zeng_cluster_CFU-E', 'Zeng_cluster_GMP-Mono', 'Zeng_lineage_Erytroid', 'Zeng_lineage_HSC', 'Zeng_lineage_MDP', 'Zeng_lineage_MEP', 'Zeng_lineage_MPP-MkEry', 'Zeng_lineage_MPP-MyLy', 'pheno_HSCMPP_population'])" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dex.keys()" ] }, { "cell_type": "code", "execution_count": null, "id": "f3a53b3d-21e0-4d34-9b8c-1224fad79e4e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "3e8074ee-679d-447a-bc5b-030a2a43a7da", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 24, "id": "151f34e9-7fd6-4185-a933-b48db352093b", "metadata": {}, "outputs": [], "source": [ "# https://stackoverflow.com/questions/21981820/creating-multiple-excel-worksheets-using-data-from-a-pandas-dataframe" ] }, { "cell_type": "code", "execution_count": 43, "id": "fc99c873-f8b1-469d-a712-ed6a168d0bd7", "metadata": {}, "outputs": [], "source": [ "with pd.ExcelWriter('../output/20240412_DESEq2_pseudobulk_bthal_vs_healthy_comparisons.xlsx', engine='openpyxl') as writer:\n", " \n", " summary.to_excel(writer, sheet_name='Summary', index=False)\n", " \n", " for k in dex.keys():\n", " dex[k].to_excel(writer, sheet_name=k, index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "918ab08c-d25e-4d2d-950a-aa23523200be", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 39, "id": "c6b16cee-95b9-45e3-bfd6-8c816ac08401", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genebaseMeanlog2FoldChangelfcSEpvaluepadj
0IFI62701.297655-2.3937320.0383460.0000000.000000
1GADD45A1606.359692-2.5965060.0516630.0000000.000000
2IFI44L2107.963029-2.4740580.0437390.0000000.000000
3MCL14403.662831-1.2066980.0304630.0000000.000000
4ID22049.391917-3.2851050.0490600.0000000.000000
.....................
3032AKT3235.660229-0.2284590.1172860.0099270.049581
3033LGALS311.3472100.4854610.5819060.0099350.049607
3034PTGR174.2820790.3266480.2224200.0099670.049750
3035CENPH431.522818-0.1827870.0864050.0099790.049791
3036NXPE3151.781287-0.2590210.1433650.0100150.049958
\n", "

3037 rows × 6 columns

\n", "
" ], "text/plain": [ " gene baseMean log2FoldChange lfcSE pvalue padj\n", "0 IFI6 2701.297655 -2.393732 0.038346 0.000000 0.000000\n", "1 GADD45A 1606.359692 -2.596506 0.051663 0.000000 0.000000\n", "2 IFI44L 2107.963029 -2.474058 0.043739 0.000000 0.000000\n", "3 MCL1 4403.662831 -1.206698 0.030463 0.000000 0.000000\n", "4 ID2 2049.391917 -3.285105 0.049060 0.000000 0.000000\n", "... ... ... ... ... ... ...\n", "3032 AKT3 235.660229 -0.228459 0.117286 0.009927 0.049581\n", "3033 LGALS3 11.347210 0.485461 0.581906 0.009935 0.049607\n", "3034 PTGR1 74.282079 0.326648 0.222420 0.009967 0.049750\n", "3035 CENPH 431.522818 -0.182787 0.086405 0.009979 0.049791\n", "3036 NXPE3 151.781287 -0.259021 0.143365 0.010015 0.049958\n", "\n", "[3037 rows x 6 columns]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dex[k]" ] }, { "cell_type": "code", "execution_count": null, "id": "15c1507e-70bc-4a8f-8666-2aa08ee1b046", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "420817ea-4e13-42c6-a71b-2357dbe914a8", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "601e1302-91ec-491e-ae58-b5863013de2a", "metadata": {}, "source": [ "# END" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }