Austin Smith paper

Author

Martin Proks

Published

May 1, 2026

%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns

import scvi
import scanpy as sc
import scanpy.external as sce
import scFates as scf
import matplotlib.pyplot as plt

import warnings
from numba.core.errors import NumbaDeprecationWarning
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=UserWarning)
/home/fdb589/projects/data/Brickman/conda/envs/scvi-1.0.0/lib/python3.10/site-packages/scvi/_settings.py:63: UserWarning: Since v1.0.0, scvi-tools no longer uses a random seed by default. Run `scvi.settings.seed = 0` to reproduce results from previous versions.
  self.seed = seed
/home/fdb589/projects/data/Brickman/conda/envs/scvi-1.0.0/lib/python3.10/site-packages/scvi/_settings.py:70: UserWarning: Setting `dl_pin_memory_gpu_training` is deprecated in v1.0 and will be removed in v1.1. Please pass in `pin_memory` to the data loaders instead.
  self.dl_pin_memory_gpu_training = (
sc.set_figure_params(figsize=(10, 6))

%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

1 Austin Smith dataset

AS_MATRIX_URL = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE166nnn/GSE166422/matrix/GSE166422_series_matrix.txt.gz"
metadata = pd.read_table(AS_MATRIX_URL, skiprows=27, index_col = 0).T
metadata['ID_REF'].to_csv('../pipeline/fetchngs/human_GSE166422.tsv', index=None, header=None, sep='\t')

2 fetch-ngs

nf-core_tower.sh Guo_2021 \
    nextflow run nf-core/fetchngs \
    -r 1.11.0 \
    --input /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/fetchngs/human_GSE166422.tsv

3 Dataset preprocessing

adata_rivron = sc.read('../data/external/aligned/human/rivron_2022_reprocessed.h5ad')
adata_rivron
gtf = pd.read_table("../data/external/human/Homo_sapiens.GRCh38.110.gene_length.tsv", index_col=0)
gene_lengths = gtf[['median']].copy()
gene_lengths.columns = ['length']
def normalize_smartseq(adata: sc.AnnData, gene_len: pd.DataFrame) -> sc.AnnData:
    print("SMART-SEQ: Normalization")

    common_genes = adata.var_names.intersection(gene_len.index)
    print(f"SMART-SEQ: Common genes {common_genes.shape[0]}")

    lengths = gene_len.loc[common_genes, "length"].values
    normalized = sc.AnnData(adata[:, common_genes].X, obs=adata.obs, dtype=np.float32)
    normalized.var_names = common_genes
    normalized.X = normalized.X / lengths * np.median(lengths)
    normalized.X = np.rint(normalized.X)

    return normalized
normalize_smartseq(adata_rivron, gene_lengths)
metadata_rivron = adata_rivron.obs
metadata_rivron_clean = metadata_rivron.loc[:,['sample']]
metadata_rivron_clean['sample_title'] = metadata_rivron.sample_title.str.extract(r'^(.*)-', expand = False)
metadata_rivron_clean.sample_title.unique()
metadata_rivron_clean = metadata_rivron_clean.loc[metadata_rivron_clean.sample_title.isin(['blastoid 96h TROP2 pl', 'naive H9', 'blastoid 24h', 'blastoid 60h TROP2 pl', 'blastoid 60h TROP2 min', 'blastoid 96h DN', 'blastoid 96h PDGFRa pl', 'blastoid 60h PDGFRa pl'])].copy()
metadata_rivron_clean['batch'] = 'Rivron'
metadata_rivron_clean['time'] = metadata_rivron_clean['sample_title']
metadata_rivron_clean['flow'] = metadata_rivron_clean['sample_title']
time_replace_dict = {
    'naive H9': '0h',
    'blastoid 24h': '24h',
    'blastoid 60h TROP2 pl': '60h',
    'blastoid 60h TROP2 min': '60h',
    'blastoid 60h PDGFRa pl': '60h',
    'blastoid 96h DN': '96h',
    'blastoid 96h PDGFRa pl': '96h',
    'blastoid 96h TROP2 pl': '96h'
}

flow_replace_dict = {
    'naive H9': 'naive',
    'blastoid 24h': 'na',
    'blastoid 60h TROP2 pl': 'TROP2+',
    'blastoid 60h TROP2 min': 'TROP2-',
    'blastoid 60h PDGFRa pl': 'PDGFRA+',
    'blastoid 96h DN': 'Double-neg',
    'blastoid 96h PDGFRa pl': 'PDGFRA+',
    'blastoid 96h TROP2 pl': 'TROP2+'
}

metadata_rivron_clean = metadata_rivron_clean.replace({'time': time_replace_dict, 'flow': flow_replace_dict})
adata_rivron = adata_rivron[metadata_rivron_clean.index].copy()
adata_rivron.obs = metadata_rivron_clean
adata_rivron
adata_rivron.var['mt'] = adata_rivron.var.gene_symbol.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata_rivron, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sns.violinplot(y=adata_rivron.obs['pct_counts_mt'], orient='v')
adata_rivron.obs
sns.scatterplot(x='total_counts', y='n_genes_by_counts', data=adata_rivron.obs, hue='batch')
adata_rivron = adata_rivron[adata_rivron.obs.pct_counts_mt < 12.5].copy()
sc.pp.filter_cells(adata_rivron, min_counts=2.5e5)
sc.pp.filter_cells(adata_rivron, max_counts=2.5e6)
sc.pp.filter_cells(adata_rivron, min_genes=2_000)
adata_rivron.layers["counts"] = adata_rivron.X.copy()
sc.pp.normalize_total(adata_rivron, target_sum=10_000)
sc.pp.log1p(adata_rivron)
adata_rivron.raw = adata_rivron
# remove mitochondrial genes
adata_rivron = adata_rivron[:, adata_rivron.var[~adata_rivron.var.gene_symbol.str.startswith('MT-')].index].copy()

# remove ribosomal genes
adata_rivron = adata_rivron[:, adata_rivron.var[~adata_rivron.var.gene_symbol.str.startswith(('RPS', 'RPL'))].index].copy()
adata_rivron.write_h5ad('../results/06_human_Rivron.h5ad')
query = sc.read_h5ad('../results/06_human_Rivron.h5ad')
query.obs['experiment'] = 'Rivron'
query
lvae = scvi.model.SCANVI.load("../results/02_human_integration/05_scanvi_ns15/")
#lvae = scvi.model.SCANVI.load("../results/deprecated/human_integration/version_1/scanvi/")
scvi.model.SCVI.prepare_query_anndata(query, lvae)
lvae_q = scvi.model.SCANVI.load_query_data(query, lvae)
lvae_q.train(
    max_epochs=100,
    plan_kwargs=dict(weight_decay=0.0),
    check_val_every_n_epoch=10,
    early_stopping=True
)
query.obsm["X_scANVI"] = lvae_q.get_latent_representation()
query.obs["predictions"] = lvae_q.predict()
query.obs['entropy'] = 1 - lvae_q.predict(soft=True).max(axis=1)
pd.crosstab(query.obs.predictions, query.obs.flow)
pd.crosstab(query.obs.predictions, query.obs.time)
sc.pp.highly_variable_genes(
    query,
    flavor="seurat_v3",
    n_top_genes=5_000,
    layer="counts",
    batch_key="batch",
    subset=True,
)
sc.pp.neighbors(query)
sc.tl.umap(query)
sc.pl.umap(query, color=['predictions','time', 'flow', 'entropy'], ncols=1)