Commit c4cc0487 authored by Ivan Merelli

uploaded software

parent 7bcc2d92
Showing with 381 additions and 1 deletion
MIT License
Copyright (c) 2022 scVAR
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# scVAR

This is scVAR, a tool for integrating scRNA-seq transcriptomic data with genomic variant data.

# EXAMPLE
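A minimal end-to-end sketch of the workflow. The import path and the input file names below are illustrative assumptions, not fixed by the package; adapt them to your own data layout.

```python
# Hypothetical usage sketch: assumes the functions are importable from the
# scVAR package and that the 10x matrix and the cell-by-variant matrix are on disk.
from scVAR import (transcriptomicAnalysis, variantAnalysis,
                   calcOmicsClusters, omicsIntegration, distributionClusters)

adata = transcriptomicAnalysis(path_10x='filtered_feature_bc_matrix/',   # assumed path
                               bcode_variants='barcodes_with_variants.tsv')
adata = variantAnalysis(adata,
                        matrix_path='variants.mtx.gz',    # assumed file names
                        bcode_path='barcodes.tsv.gz',
                        variants_path='variants.tsv.gz')
adata = calcOmicsClusters(adata, omic_key='trans', res=0.5)    # adds obs['trans_clust_0.5']
adata = calcOmicsClusters(adata, omic_key='variant', res=0.5)  # adds obs['variant_clust_0.5']
adata = omicsIntegration(adata)                                # adds obsm['int_umap']
distributionClusters(adata, control_cl='trans_clust_0.5', group_cl='variant_clust_0.5')
```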
# scVAR
# import packages
import numpy as np
import pandas as pd
import scanpy as sc
import gc
import torch
import torch.nn.functional as F
import umap
import anndata
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.feature_extraction.text import TfidfTransformer
from scipy import io
from scipy import sparse
import matplotlib.pyplot as plt
### FUNCTIONS ###
##################
def transcriptomicAnalysis(path_10x, bcode_variants, high_var_genes_min_mean=0.015, high_var_genes_max_mean=3, high_var_genes_min_disp=0.75,
                           min_genes=200, min_cells=3, max_pct_mt=100, n_neighbors=20, n_pcs=30, transcript_key='trans', manual_matrix_reading=False):
    # load sparse matrix
    if not manual_matrix_reading:
        adata = sc.read_10x_mtx(path=path_10x, make_unique=True)
    else:
        ######## alternative loader for the PTXX and GLIO datasets
        # load sparse matrix
        rna = path_10x + 'matrix.mtx.gz'
        barcode = path_10x + 'barcodes.tsv.gz'
        feat = path_10x + 'features.tsv.gz'
        rna = io.mmread(rna)
        B = rna.todense()
        B = B.T
        barcode = pd.read_csv(barcode, sep='\t', header=None, names=['bcode'])
        barcode.index = barcode.iloc[:, 0]
        barcode = barcode.drop('bcode', axis=1)
        feat = pd.read_csv(feat, sep='\t', header=None, names=['gene_ids', 'gene_symbol', 'feature_types'])
        feat.index = feat['gene_symbol']
        feat = feat.drop('gene_symbol', axis=1)
        adata = anndata.AnnData(X=B, obs=barcode, var=feat)
        adata.X = sparse.csr_matrix(adata.X)
        ################
    # keep only the barcodes that also have variant calls
    bcode_var = pd.read_csv(bcode_variants, sep='\t', header=None)[0]
    adata = adata[adata.obs.index.isin(bcode_var)]
    # use Scanpy to preprocess the data
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    adata = adata[adata.obs.pct_counts_mt < max_pct_mt, :]
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, min_mean=high_var_genes_min_mean, max_mean=high_var_genes_max_mean, min_disp=high_var_genes_min_disp)
    sc.pp.pca(adata, n_comps=n_pcs, use_highly_variable=True)
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, random_state=42)
    sc.tl.umap(adata, random_state=42)
    adata.obsm[transcript_key + '_umap'] = adata.obsm['X_umap']
    # scale data
    scaler = StandardScaler()
    adata.uns[transcript_key + '_X'] = scaler.fit_transform(adata.X.toarray())
    return adata
def variantAnalysis(adata, matrix_path, bcode_path, variants_path, var_filter=0.2, n_neighbors=20, n_pcs=30, variant_key='variant'):
    # load sparse matrix
    var = io.mmread(matrix_path)
    var = var.todense()
    barcode = pd.read_csv(bcode_path, sep='\t', header=None)[0]
    snv = pd.read_csv(variants_path, sep='\t', header=None)[0]
    var = pd.DataFrame(var, index=snv, columns=barcode)
    var = var.T
    # first filter for the cells kept in the transcriptomic analysis, then for the transcribed variants
    var = var.loc[list(adata.obs.index), :]
    col_sum = var.sum()
    nrow = len(var)
    selected_var = var.columns[col_sum / nrow > var_filter]
    selected_var = selected_var.tolist()
    var = var.loc[:, selected_var]
    # scaling
    scaler = StandardScaler()
    var_X = scaler.fit_transform(var)
    # compute the variant-based UMAP
    adata.uns[variant_key + '_orig_mat'] = var
    adata.uns[variant_key + '_X'] = var_X
    adata.obsm[variant_key + '_pca'] = sc.pp.pca(adata.uns[variant_key + '_X'])
    sc.pp.neighbors(adata, use_rep=variant_key + '_pca', n_neighbors=n_neighbors, n_pcs=n_pcs, key_added=variant_key + '_neighbors', random_state=42)
    umap_data = sc.tl.umap(adata, neighbors_key=variant_key + '_neighbors', copy=True, random_state=42)
    adata.obsm[variant_key + '_umap'] = umap_data.obsm['X_umap']
    return adata
def calcOmicsClusters(adata, omic_key, res=0.5, n_neighbors=None, n_pcs=None):
    if n_neighbors is None:
        n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
    if n_pcs is None:
        n_pcs = adata.varm['PCs'].shape[1]
    neig_name = omic_key + '_' + str(n_neighbors) + '_neighbors_' + str(n_pcs) + '_PCs'
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, use_rep=omic_key + '_umap', key_added=neig_name)
    sc.tl.leiden(adata, resolution=res, key_added=omic_key + '_clust_' + str(res), neighbors_key=neig_name)
    return adata
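# Illustrative call: calcOmicsClusters(adata, omic_key='trans', res=0.5)
# runs Leiden clustering on the 'trans_umap' representation and stores the
# labels in adata.obs['trans_clust_0.5'] (key pattern: <omic_key>_clust_<res>).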
def weightsInit(m):
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.zero_()
def omicsIntegration(adata, transcript_key='trans', variant_key='variant', clf_out=25, integration_key='int'):  # clf_out should perhaps be set to the number of cell types known to be present
    # run the integration
    rna_X = adata.uns[transcript_key + '_X']
    var_X = adata.uns[variant_key + '_X']
    net = pairedIntegration(input_dim_a=rna_X.shape[1], input_dim_b=var_X.shape[1], clf_out=clf_out)
    net.apply(weightsInit)
    # training process
    pairedIntegrationTrainer(X_a=rna_X, X_b=var_X, model=net, num_epoch=50)
    # integrated visualization
    net.to(torch.device("cpu"))
    X_all_tensor_a = torch.tensor(rna_X).float()
    X_all_tensor_b = torch.tensor(var_X).float()
    y_pred_a = net.encoder_a(X_all_tensor_a)
    y_pred_a = F.normalize(y_pred_a, dim=1, p=2)
    y_pred_a = torch.Tensor.cpu(y_pred_a).detach().numpy()
    y_pred_b = net.encoder_b(X_all_tensor_b)
    y_pred_b = F.normalize(y_pred_b, dim=1, p=2)
    y_pred_b = torch.Tensor.cpu(y_pred_b).detach().numpy()
    # embed both omics in a shared UMAP, then average the per-cell coordinates
    y_pred = np.concatenate((y_pred_a, y_pred_b), axis=0)
    umaps = umap.UMAP(n_neighbors=5, min_dist=0.0, n_components=2, metric="euclidean", random_state=42).fit(y_pred)
    embedding = umaps.transform(y_pred)
    embedding = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'])
    emb = embedding.iloc[:, :2].values
    rna_coord = emb[:emb.shape[0] // 2]
    variants_coord = emb[emb.shape[0] // 2:]
    # per-cell mean of the RNA and variant coordinates
    new_umap = (rna_coord + variants_coord) / 2
    adata.obsm[integration_key + '_umap'] = new_umap
    return adata
class pairedIntegration(torch.nn.Module):
    def __init__(self, input_dim_a=2000, input_dim_b=2000, clf_out=10):
        super(pairedIntegration, self).__init__()
        self.input_dim_a = input_dim_a
        self.input_dim_b = input_dim_b
        self.clf_out = clf_out
        self.encoder_a = torch.nn.Sequential(
            torch.nn.Linear(self.input_dim_a, 1000),
            torch.nn.BatchNorm1d(1000),
            torch.nn.ReLU(),
            torch.nn.Linear(1000, 512),
            torch.nn.BatchNorm1d(512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, 128),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU())
        self.encoder_b = torch.nn.Sequential(
            torch.nn.Linear(self.input_dim_b, 1000),
            torch.nn.BatchNorm1d(1000),
            torch.nn.ReLU(),
            torch.nn.Linear(1000, 512),
            torch.nn.BatchNorm1d(512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, 128),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU())
        self.clf = torch.nn.Sequential(
            torch.nn.Linear(128, self.clf_out),
            torch.nn.Softmax(dim=1))
        self.feature = torch.nn.Sequential(
            torch.nn.Linear(128, 32))

    def forward(self, x_a, x_b):
        out_a = self.encoder_a(x_a)
        f_a = self.feature(out_a)
        y_a = self.clf(out_a)
        out_b = self.encoder_b(x_b)
        f_b = self.feature(out_b)
        y_b = self.clf(out_b)
        return f_a, y_a, f_b, y_b
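# Shape sketch (illustrative): for a batch of 8 cells,
#   net = pairedIntegration(input_dim_a=2000, input_dim_b=500, clf_out=25)
#   f_a, y_a, f_b, y_b = net(torch.randn(8, 2000), torch.randn(8, 500))
# yields features f_a, f_b of shape (8, 32) and soft cluster
# assignments y_a, y_b of shape (8, 25).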
def pairedIntegrationTrainer(X_a, X_b, model, batch_size=512, num_epoch=5,
                             f_temp=0.1, p_temp=1.0):
    device = torch.device("cpu")
    f_con = contrastiveLoss(batch_size=batch_size, temperature=f_temp)
    p_con = contrastiveLoss(batch_size=model.clf_out, temperature=p_temp)
    opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    for k in range(num_epoch):
        model.to(device)
        n = X_a.shape[0]
        r = np.random.permutation(n)
        X_train_a = X_a[r, :]
        X_tensor_A = torch.tensor(X_train_a).float()
        X_train_b = X_b[r, :]
        X_tensor_B = torch.tensor(X_train_b).float()
        losses = 0
        for j in range(n // batch_size):
            # two independently noise-augmented views of the RNA batch, one of the variant batch
            inputs_a = X_tensor_A[j * batch_size:(j + 1) * batch_size, :].to(device)
            inputs_a2 = inputs_a + torch.normal(0, 1, inputs_a.shape).to(device)
            inputs_a = inputs_a + torch.normal(0, 1, inputs_a.shape).to(device)
            inputs_b = X_tensor_B[j * batch_size:(j + 1) * batch_size, :].to(device)
            inputs_b = inputs_b + torch.normal(0, 1, inputs_b.shape).to(device)
            feas, o, nfeas, no = model(inputs_a, inputs_b)
            feas2, o2, _, _ = model(inputs_a2, inputs_b)
            # feature-level and cluster-level contrastive terms
            fea_mi = f_con(feas, nfeas) + f_con(feas, feas2)
            p_mi = p_con(o.T, no.T) + p_con(o.T, o2.T)
            loss = fea_mi + p_mi
            opt.zero_grad()
            loss.backward()
            opt.step()
            losses += loss.item()
        print("Total loss: " + str(round(losses, 4)))
        gc.collect()
class contrastiveLoss(torch.nn.Module):
    def __init__(self, batch_size, temperature=0.5):
        super().__init__()
        self.batch_size = batch_size
        self.register_buffer("temperature", torch.tensor(temperature))
        self.register_buffer("negatives_mask", (~torch.eye(batch_size * 2, batch_size * 2,
                                                           dtype=bool)).float())

    def forward(self, emb_i, emb_j):
        """
        emb_i and emb_j are batches of embeddings, where corresponding indices are pairs
        z_i, z_j as per the SimCLR paper
        """
        z_i = F.normalize(emb_i, dim=1, p=2)
        z_j = F.normalize(emb_j, dim=1, p=2)
        representations = torch.cat([z_i, z_j], dim=0)
        similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)
        sim_ij = torch.diag(similarity_matrix, self.batch_size)
        sim_ji = torch.diag(similarity_matrix, -self.batch_size)
        positives = torch.cat([sim_ij, sim_ji], dim=0)
        nominator = torch.exp(positives / self.temperature)
        denominator = self.negatives_mask * torch.exp(similarity_matrix / self.temperature)
        loss_partial = -torch.log(nominator / torch.sum(denominator, dim=1))
        loss = torch.sum(loss_partial) / (2 * self.batch_size)
        return loss
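# Illustrative usage: rows of emb_i and emb_j with the same index are the
# positive pairs (a SimCLR-style NT-Xent loss), e.g.
#   loss_fn = contrastiveLoss(batch_size=4, temperature=0.5)
#   loss = loss_fn(torch.randn(4, 32), torch.randn(4, 32))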
def distributionClusters(adata, control_cl, group_cl, perc_cell_to_show=0.1, figsize=(12, 8), dpi=100, save_path=None):
    df = adata.obs.groupby([group_cl, control_cl]).size().unstack()
    # drop rows whose group cluster accounts for a smaller fraction of cells than perc_cell_to_show
    df = df.loc[df.sum(axis=1) / df.values.sum() >= perc_cell_to_show, :]
    df_rel = df
    df = df.div(df.sum(axis=1), axis=0)
    df[group_cl] = df.index
    plt.rcParams["figure.figsize"] = figsize
    plt.rcParams['figure.dpi'] = dpi
    plt.rcParams['figure.facecolor'] = '#FFFFFF'
    df.plot(
        x=group_cl,
        kind='barh',
        stacked=True,
        mark_right=True,
    )
    leg = plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
    leg.get_frame().set_edgecolor('black')
    plt.xlabel('perc_' + control_cl, loc='center')
    for n in df_rel:
        for i, (pos_x, ab, value) in enumerate(zip(df.iloc[:, :-1].cumsum(1)[n], df[n], df_rel[n])):
            if (value == 0) | (ab <= 0.05):
                value = ''
            plt.text(pos_x - ab / 2, i, str(value), va='center', ha='center')
    plt.grid(False)
    if save_path is not None:
        plt.savefig(save_path, bbox_inches='tight')
    return plt.show()
import setuptools

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="scVAR",
    version="0.0.1",
    author="Samuele Manessi",
    author_email="samuele.manessi@itb.cnr.it",
    description="A tool to integrate genomics and transcriptomics in scRNA-seq data.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    #url="https://github.com/ILTUOACCOUNT/ILTUOPACCHETTO",
    packages=setuptools.find_packages(include=['scVAR', 'scVAR.*']),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    install_requires=[
        'numpy',
        'pandas',
        'scanpy',
        # gc is part of the Python standard library, so it is not listed here
        'torch',
        'umap-learn',    # provides the `umap` module imported by scVAR
        'anndata',
        'scikit-learn',  # provides the `sklearn` modules imported by scVAR
        'scipy',
        'matplotlib',
    ],
    python_requires='>=3.7',
)