% matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import scvi
import scanpy as sc
import scanpy.external as sce
import scFates as scf
import matplotlib.pyplot as plt
import warnings
from numba.core.errors import NumbaDeprecationWarning
warnings.simplefilter('ignore' , category= NumbaDeprecationWarning)
warnings.simplefilter('ignore' , category= FutureWarning )
warnings.simplefilter('ignore' , category= UserWarning )
/home/fdb589/projects/data/Brickman/conda/envs/scvi-1.0.0/lib/python3.10/site-packages/scvi/_settings.py:63: UserWarning: Since v1.0.0, scvi-tools no longer uses a random seed by default. Run `scvi.settings.seed = 0` to reproduce results from previous versions.
self.seed = seed
/home/fdb589/projects/data/Brickman/conda/envs/scvi-1.0.0/lib/python3.10/site-packages/scvi/_settings.py:70: UserWarning: Setting `dl_pin_memory_gpu_training` is deprecated in v1.0 and will be removed in v1.1. Please pass in `pin_memory` to the data loaders instead.
self.dl_pin_memory_gpu_training = (
sc.set_figure_params(figsize= (10 , 6 ))
% config InlineBackend.print_figure_kwargs= {'facecolor' : "w" }
% config InlineBackend.figure_format= 'retina'
Austin Smith dataset
AS_MATRIX_URL = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE166nnn/GSE166422/matrix/GSE166422_series_matrix.txt.gz"
metadata = pd.read_table(AS_MATRIX_URL, skiprows= 27 , index_col = 0 ).T
metadata['ID_REF' ].to_csv('../pipeline/fetchngs/human_GSE166422.tsv' , index= None , header= None , sep= ' \t ' )
fetch-ngs
nf-core_tower.sh Guo_2021 \
nextflow run nf-core/fetchngs \
-r 1.11.0 \
--input /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/fetchngs/human_GSE166422.tsv
Dataset preprocessing
adata_rivron = sc.read('../data/external/aligned/human/rivron_2022_reprocessed.h5ad' )
gtf = pd.read_table("../data/external/human/Homo_sapiens.GRCh38.110.gene_length.tsv" , index_col= 0 )
gene_lengths = gtf[['median' ]].copy()
gene_lengths.columns = ['length' ]
def normalize_smartseq(adata: sc.AnnData, gene_len: pd.DataFrame) -> sc.AnnData:
print ("SMART-SEQ: Normalization" )
common_genes = adata.var_names.intersection(gene_len.index)
print (f"SMART-SEQ: Common genes { common_genes. shape[0 ]} " )
lengths = gene_len.loc[common_genes, "length" ].values
normalized = sc.AnnData(adata[:, common_genes].X, obs= adata.obs, dtype= np.float32)
normalized.var_names = common_genes
normalized.X = normalized.X / lengths * np.median(lengths)
normalized.X = np.rint(normalized.X)
return normalized
normalize_smartseq(adata_rivron, gene_lengths)
metadata_rivron = adata_rivron.obs
metadata_rivron_clean = metadata_rivron.loc[:,['sample' ]]
metadata_rivron_clean['sample_title' ] = metadata_rivron.sample_title.str .extract(r'^(.*)-' , expand = False )
metadata_rivron_clean.sample_title.unique()
metadata_rivron_clean = metadata_rivron_clean.loc[metadata_rivron_clean.sample_title.isin(['blastoid 96h TROP2 pl' , 'naive H9' , 'blastoid 24h' , 'blastoid 60h TROP2 pl' , 'blastoid 60h TROP2 min' , 'blastoid 96h DN' , 'blastoid 96h PDGFRa pl' , 'blastoid 60h PDGFRa pl' ])].copy()
metadata_rivron_clean['batch' ] = 'Rivron'
metadata_rivron_clean['time' ] = metadata_rivron_clean['sample_title' ]
metadata_rivron_clean['flow' ] = metadata_rivron_clean['sample_title' ]
time_replace_dict = {
'naive H9' : '0h' ,
'blastoid 24h' : '24h' ,
'blastoid 60h TROP2 pl' : '60h' ,
'blastoid 60h TROP2 min' : '60h' ,
'blastoid 60h PDGFRa pl' : '60h' ,
'blastoid 96h DN' : '96h' ,
'blastoid 96h PDGFRa pl' : '96h' ,
'blastoid 96h TROP2 pl' : '96h'
}
flow_replace_dict = {
'naive H9' : 'naive' ,
'blastoid 24h' : 'na' ,
'blastoid 60h TROP2 pl' : 'TROP2+' ,
'blastoid 60h TROP2 min' : 'TROP2-' ,
'blastoid 60h PDGFRa pl' : 'PDGFRA+' ,
'blastoid 96h DN' : 'Double-neg' ,
'blastoid 96h PDGFRa pl' : 'PDGFRA+' ,
'blastoid 96h TROP2 pl' : 'TROP2+'
}
metadata_rivron_clean = metadata_rivron_clean.replace({'time' : time_replace_dict, 'flow' : flow_replace_dict})
adata_rivron = adata_rivron[metadata_rivron_clean.index].copy()
adata_rivron.obs = metadata_rivron_clean
adata_rivron.var['mt' ] = adata_rivron.var.gene_symbol.str .startswith('MT-' )
sc.pp.calculate_qc_metrics(adata_rivron, qc_vars= ['mt' ], percent_top= None , log1p= False , inplace= True )
sns.violinplot(y= adata_rivron.obs['pct_counts_mt' ], orient= 'v' )
sns.scatterplot(x= 'total_counts' , y= 'n_genes_by_counts' , data= adata_rivron.obs, hue= 'batch' )
adata_rivron = adata_rivron[adata_rivron.obs.pct_counts_mt < 12.5 ].copy()
sc.pp.filter_cells(adata_rivron, min_counts= 2.5e5 )
sc.pp.filter_cells(adata_rivron, max_counts= 2.5e6 )
sc.pp.filter_cells(adata_rivron, min_genes= 2_000 )
adata_rivron.layers["counts" ] = adata_rivron.X.copy()
sc.pp.normalize_total(adata_rivron, target_sum= 10_000 )
sc.pp.log1p(adata_rivron)
adata_rivron.raw = adata_rivron
# remove mitochondrial genes
adata_rivron = adata_rivron[:, adata_rivron.var[~ adata_rivron.var.gene_symbol.str .startswith('MT-' )].index].copy()
# remove ribosomal genes
adata_rivron = adata_rivron[:, adata_rivron.var[~ adata_rivron.var.gene_symbol.str .startswith(('RPS' , 'RPL' ))].index].copy()
adata_rivron.write_h5ad('../results/06_human_Rivron.h5ad' )
query = sc.read_h5ad('../results/06_human_Rivron.h5ad' )
query.obs['experiment' ] = 'Rivron'
query
lvae = scvi.model.SCANVI.load("../results/02_human_integration/05_scanvi_ns15/" )
#lvae = scvi.model.SCANVI.load("../results/deprecated/human_integration/version_1/scanvi/")
scvi.model.SCVI.prepare_query_anndata(query, lvae)
lvae_q = scvi.model.SCANVI.load_query_data(query, lvae)
lvae_q.train(
max_epochs= 100 ,
plan_kwargs= dict (weight_decay= 0.0 ),
check_val_every_n_epoch= 10 ,
early_stopping= True
)
query.obsm["X_scANVI" ] = lvae_q.get_latent_representation()
query.obs["predictions" ] = lvae_q.predict()
query.obs['entropy' ] = 1 - lvae_q.predict(soft= True ).max (axis= 1 )
pd.crosstab(query.obs.predictions, query.obs.flow)
pd.crosstab(query.obs.predictions, query.obs.time)
sc.pp.highly_variable_genes(
query,
flavor= "seurat_v3" ,
n_top_genes= 5_000 ,
layer= "counts" ,
batch_key= "batch" ,
subset= True ,
)
sc.pp.neighbors(query)
sc.tl.umap(query)
sc.pl.umap(query, color= ['predictions' ,'time' , 'flow' , 'entropy' ], ncols= 1 )