# bthal dataset v6b - collate and export differential expression (dex) results

In [1]:
%matplotlib widget
%load_ext watermark

import warnings
warnings.filterwarnings('ignore')

import os, sys, json, operator, getpass
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import scanpy as sc

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import ipywidgets as widgets


# scanpy runtime settings for this notebook
sc.settings.verbosity = 3             # show some output
sc.settings.file_format_figs = 'svg'  # set this to 'svg' (notebook) or 'pdf' (files) if you want vector graphics
# The scanpy 1.x setting that controls automatic figure saving is `autosave`;
# assigning to `savefigs` only creates an unused attribute on the settings object.
sc.settings.autosave = False

# plt.rcParams['font.family'] = 'sans-serif'
# plt.rcParams['font.sans-serif'] = 'Arial'
# plt.rc('font', size=14)

# Login name and home directory (as a plain string) of the account running the kernel.
user = getpass.getuser()
home = str(Path.home())

%watermark --iversions

ipywidgets: 8.1.2
matplotlib: 3.7.4
scanpy    : 1.9.8
seaborn   : 0.13.0
sys       : 3.8.10 (default, Nov 22 2023, 10:22:35) 
[GCC 9.4.0]
pandas    : 2.0.3
numpy     : 1.24.4
json      : 2.0.9



In [3]:
# Load the label metadata of the container this kernel runs in and show its version.
singularity = json.loads(Path('/.singularity.d/labels.json').read_text())

singularity['Version']

'proteus.45g'

In [4]:
# Date stamp (YYYYMMDD) taken at execution time.
now = datetime.now()
prefix = format(now, '%Y%m%d')
print(prefix)

20240412


### collect dex file list

In [6]:
# `basedir` was not defined in any cell of this notebook, so a fresh
# "Restart & Run All" failed here with a NameError. The other cells read from
# '../dex/' and write to '../output/', so the project root is the parent directory.
# NOTE(review): confirm '..' matches the value used in the original session.
basedir = '..'

# Sorted names of everything in the dex result directory (filtered by pattern later).
files = sorted(os.listdir(os.path.join(basedir, 'dex')))

In [8]:
import re

In [9]:
# File-name pattern of the DESeq2 pseudobulk result tables:
#   <8-digit date>_bthalcombo_v6b_<comparison>_bthal_vs_healthy_sig_genes_DESeq2_pb.tsv
# Group 1 captures the <comparison> part.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
# The '.' before 'tsv' is escaped so it only matches a literal dot.
patt = re.compile(r'^\d{8}_bthalcombo_v6b_(.+)_bthal_vs_healthy_sig_genes_DESeq2_pb\.tsv')

In [10]:
# Maps the first '_'-separated token of a comparison name (as found in the
# file names) to the label used as prefix of the keys in the result dict.
repl = dict(C10='Mende', C45='Zeng', strict='pheno')

### parse dex files

In [23]:
# Load every result table whose file name matches `patt` into `dex`,
# keyed by a readable comparison name (first token translated through `repl`).
dex = {}
counter = 0

# Read from the same directory that `files` was listed from, rather than a
# separately hard-coded relative path.
dex_dir = os.path.join(basedir, 'dex')

for f in files:
    m = patt.search(f)
    if m is None:
        continue  # not a result table covered by the pattern

    # Split the captured comparison name into its leading token and the rest.
    source, *rest = m.group(1).split('_')
    # An unknown leading token raises KeyError here, so an unexpected file
    # naming scheme is noticed instead of being loaded under a wrong key.
    comp = '_'.join([repl[source]] + rest)

    dex[comp] = pd.read_csv(os.path.join(dex_dir, f), sep='\t')
    counter += 1
    print(comp)

# Number of directory entries vs. number of tables actually loaded.
print(len(files))
print(counter)

Mende_cluster_02-MEP2
Mende_cluster_04-HSC-MPP1
Mende_cluster_05-HSC-MPP3
Mende_cluster_08-EryP
Mende_cluster_10-MPP-to-MEP
Mende_cluster_12-MEP1
Mende_lineage_HSC-MPP
Mende_lineage_MDP
Mende_lineage_early-MEMBP
Mende_lineage_late-MEMBP
Mende_lineage_late-MyP
Mende_lineage_primed-MPP
Zeng_cluster_BFU-E
Zeng_cluster_CFU-E
Zeng_cluster_GMP-Mono
Zeng_lineage_Erytroid
Zeng_lineage_HSC
Zeng_lineage_MDP
Zeng_lineage_MEP
Zeng_lineage_MPP-MkEry
Zeng_lineage_MPP-MyLy
pheno_HSCMPP_population
22
22


### make summary

In [25]:
# One row per comparison: total number of rows in its result table and how many
# of them have a positive / negative log2FoldChange.
# Built in a single statement so the cell can be re-run without appending
# duplicate rows, and with integer counts so Excel stores them as numbers
# rather than text.
summary = pd.DataFrame(
    [
        [
            name,
            table.shape[0],
            int((table['log2FoldChange'] > 0).sum()),
            int((table['log2FoldChange'] < 0).sum()),
        ]
        for name, table in dex.items()
    ],
    columns=['Subset', 'Total', 'Upregulated', 'Downregulated'],
)

In [35]:
summary

Unnamed: 0,Subset,Total,Upregulated,Downregulated
0,Mende_cluster_02-MEP2,15,1,14
1,Mende_cluster_04-HSC-MPP1,103,25,78
2,Mende_cluster_05-HSC-MPP3,295,122,173
3,Mende_cluster_08-EryP,14,3,11
4,Mende_cluster_10-MPP-to-MEP,52,7,45
5,Mende_cluster_12-MEP1,5,0,5
6,Mende_lineage_HSC-MPP,356,153,203
7,Mende_lineage_MDP,0,0,0
8,Mende_lineage_early-MEMBP,17,2,15
9,Mende_lineage_late-MEMBP,22,3,19


In [40]:
# summary.to_excel('../output/20240412_DESEq2_pseudobulk_bthal_vs_healthy_comparisons.xlsx', index=False)

### write out dex results (1 subset per tab)

In [41]:
dex.keys()

dict_keys(['Mende_cluster_02-MEP2', 'Mende_cluster_04-HSC-MPP1', 'Mende_cluster_05-HSC-MPP3', 'Mende_cluster_08-EryP', 'Mende_cluster_10-MPP-to-MEP', 'Mende_cluster_12-MEP1', 'Mende_lineage_HSC-MPP', 'Mende_lineage_MDP', 'Mende_lineage_early-MEMBP', 'Mende_lineage_late-MEMBP', 'Mende_lineage_late-MyP', 'Mende_lineage_primed-MPP', 'Zeng_cluster_BFU-E', 'Zeng_cluster_CFU-E', 'Zeng_cluster_GMP-Mono', 'Zeng_lineage_Erytroid', 'Zeng_lineage_HSC', 'Zeng_lineage_MDP', 'Zeng_lineage_MEP', 'Zeng_lineage_MPP-MkEry', 'Zeng_lineage_MPP-MyLy', 'pheno_HSCMPP_population'])

In [24]:
# https://stackoverflow.com/questions/21981820/creating-multiple-excel-worksheets-using-data-from-a-pandas-dataframe

In [43]:
# Write a single workbook: a 'Summary' sheet followed by one sheet per comparison.
# The file name uses the run-date `prefix` computed near the top of the notebook
# instead of a hard-coded date, so a re-run on another day is labelled correctly.
out_xlsx = f'../output/{prefix}_DESEq2_pseudobulk_bthal_vs_healthy_comparisons.xlsx'

with pd.ExcelWriter(out_xlsx, engine='openpyxl') as writer:

    summary.to_excel(writer, sheet_name='Summary', index=False)

    # NOTE: Excel limits sheet names to 31 characters; the keys of `dex` are
    # used as sheet names unchanged.
    # The loop variable keeps the name `k` because it is a notebook-global that
    # other cells may read.
    for k, table in dex.items():
        table.to_excel(writer, sheet_name=k, index=False)

In [39]:
# Show one result table by its explicit key. This is the last key of `dex`,
# i.e. the value a leftover loop variable would hold after iterating over it;
# naming it makes the cell independent of execution order.
dex['pheno_HSCMPP_population']

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,pvalue,padj
0,IFI6,2701.297655,-2.393732,0.038346,0.000000,0.000000
1,GADD45A,1606.359692,-2.596506,0.051663,0.000000,0.000000
2,IFI44L,2107.963029,-2.474058,0.043739,0.000000,0.000000
3,MCL1,4403.662831,-1.206698,0.030463,0.000000,0.000000
4,ID2,2049.391917,-3.285105,0.049060,0.000000,0.000000
...,...,...,...,...,...,...
3032,AKT3,235.660229,-0.228459,0.117286,0.009927,0.049581
3033,LGALS3,11.347210,0.485461,0.581906,0.009935,0.049607
3034,PTGR1,74.282079,0.326648,0.222420,0.009967,0.049750
3035,CENPH,431.522818,-0.182787,0.086405,0.009979,0.049791


# END