01 - NGS collection [human]

Author

Martin Proks

Published

April 8, 2024

import pandas as pd

0.1 Datasets

Dataset	Stages	Technology
Meistermann2021	E0,E2,E2.5,E3.25,E3.5,E3.75,E4.5	SMART-seq2
Petropoulos2016	E3,E4,E5,E6,E7	SMART-Seq2
Yan2013	MII-Oocyte, Zygote, 2C,4C,8C,morula,blastocyst	SMART-seq
Yanagida	E6,E7	SMART-seq
Nakamura	Cynomolgus Monkey E6 onwards	SC3-seq
Blakeley	E6/7	SMARTer-seq
Xiang	E6,7,8,9,10,12,14	SMART-seq2
Xue		Tang et al. method

Root Dataset	Dataset	Technology	Download	Notes
Radley et al., 2022
X	Meistermann et al, 2021	SMART-SEQ2	PRJEB30442
X	Petropoulos et al, 2016	SMART-SEQ2	E-MTAB-3929	SCPORTAL
X	Yan et al, 2013	SMART-SEQ	GSE36552	SCPORTAL
X	Yanagida et al, 2021	SMART-SEQ2	GSE171820
X	Nakamura et al, 2017	SMART-SEQ2
Blakeley et al, 2015		SMARTer Ultra Low RNA Kit	GSE66507
Tyser et al, 2021	SMARTSEQ2	Portal	GASTRULATION (CS7)
Xiang et al, 2019	SMART-SEQ2	GSE136447
Xue	Tang et al. method	GSE44183

0.2 Meistermann et al., 2021 PRJEB30442

# ENA "filereport" endpoint for study PRJEB30442, returned as TSV. Of the requested
# fields, run_accession and sample_alias are the ones used below.
MEISTERMANN_ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB30442&result=read_run&fields=study_accession,sample_accession,experiment_accession,run_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp,sample_alias&format=tsv&limit=0"
meistermann_metadata = pd.read_table(MEISTERMANN_ENA_URL)

# Sample annotation table stored with the Meistermann et al. data; the first
# column becomes the index.
meistermann_sample_annotation = pd.read_csv("../data/external/human/Meistermann_et_al_2021/sampleAnnot.tsv", index_col=0, sep="\t")

# Keep only rows whose Dataset column is 'ThisPaper'.
meistermann_sample_annotation = meistermann_sample_annotation[meistermann_sample_annotation['Dataset'] == 'ThisPaper']

# Attach the ENA run information by matching annotation 'Name' to ENA 'sample_alias'.
# merge() defaults to an inner join, so rows without a partner are dropped.
meistermann_sample_annotation = meistermann_sample_annotation.merge(meistermann_metadata, left_on = 'Name', right_on = 'sample_alias')

# Write the run accessions one per line (no header, no index); this file is the
# --input of the nf-core/fetchngs run below.
meistermann_sample_annotation.run_accession.to_csv("../pipeline/fetchngs/human_PRJEB30442.txt", index=None, header=None)

nf-core_tower.sh \
    Meistermann_2021 \
    nextflow run nf-core/fetchngs \
    -r 1.10.0 \
    --input /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/fetchngs/human_PRJEB30442.txt

nf-core_tower.sh Meistermann_2021 nextflow run brickmanlab/scrnaseq \
    -r feature/smartseq \
    -c /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/smartseq.human.config \
    --input /scratch/Brickman/pipelines/Meistermann_2021/results/samplesheet/samplesheet.csv

0.3 Petropoulos et al., 2016 E-MTAB-3929

# ENA "filereport" endpoint for study PRJEB11202, returned as TSV.
PETROPOULOS_URL = "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB11202&result=read_run&fields=study_accession,sample_accession,experiment_accession,run_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp,sample_alias&format=tsv&limit=0"
petropoulos_metadata = pd.read_table(PETROPOULOS_URL)

# Reduce each alias to the part starting at the first "E<digit>" and running to
# the end of the string; aliases without such a part become NaN.
petropoulos_metadata.sample_alias = petropoulos_metadata.sample_alias.str.extract("(E[0-9].*$)")

# Only the run accession and the trimmed alias are needed for the join below.
petropoulos_metadata_short = petropoulos_metadata[['run_accession', 'sample_alias']]

# The Petropoulos annotations are taken from the Meistermann et al. annotation
# table, which labels rows by source dataset.
petropoulos_sample_annotation = pd.read_csv("../data/external/human/Meistermann_et_al_2021/sampleAnnot.tsv", index_col=0, sep="\t")

# Keep only rows whose Dataset column is 'Petropoulos2016'.
petropoulos_sample_annotation = petropoulos_sample_annotation.loc[petropoulos_sample_annotation.Dataset == 'Petropoulos2016']

# Inner join (merge default) of annotation 'Name' against the trimmed ENA alias.
petropoulos_sample_annotation = petropoulos_sample_annotation.merge(petropoulos_metadata_short, left_on = 'Name', right_on = 'sample_alias')

# Write the run accessions one per line (no header, no index) for nf-core/fetchngs.
petropoulos_sample_annotation.run_accession.to_csv("../pipeline/fetchngs/human_E-MTAB-3929.txt", index=None, header=None)

nf-core_tower.sh \
    Petropoulos_2016 \
    nextflow run nf-core/fetchngs \
    -r 1.10.0 \
    --input /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/fetchngs/human_E-MTAB-3929.txt

nf-core_tower.sh Petropoulos_2016 nextflow run brickmanlab/scrnaseq \
    -r feature/smartseq \
    -c /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/smartseq.human.config \
    --input /scratch/Brickman/pipelines/Petropoulos_2016/results/samplesheet/samplesheet.csv

0.4 Xiang et al., 2020 [GSE136447]

# GSE136447 is split across two GEO platform records; each has its own series matrix.
_XIANG_MATRIX_URLS = (
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE136nnn/GSE136447/matrix/GSE136447-GPL20795_series_matrix.txt.gz",
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE136nnn/GSE136447/matrix/GSE136447-GPL23227_series_matrix.txt.gz",
)

# Both files are read the same way: skip 29 lines, read a single data row and
# use the first column as row labels.
_xiang_read_options = dict(skiprows=29, nrows=1, index_col=0)

# Transposed so that each sample becomes a row.
xiang_metadata_1 = pd.read_table(_XIANG_MATRIX_URLS[0], **_xiang_read_options).T
xiang_metadata_2 = pd.read_table(_XIANG_MATRIX_URLS[1], **_xiang_read_options).T

# Stack both platforms into one table.
xiang_metadata = pd.concat([xiang_metadata_1, xiang_metadata_2])

# Copy the row labels into a column, then keep only what follows the first
# underscore (e.g. "Embryo_D6A1S1" -> "D6A1S1").
xiang_metadata['Sample_name'] = xiang_metadata.index.to_list()
xiang_metadata['Sample_name'] = xiang_metadata['Sample_name'].str.extract("_(.*$)")

xiang_metadata

!Sample_title	!Sample_geo_accession	Sample_name
Embryo_D6A1S1	GSM4050122	D6A1S1
Embryo_D6A1S2	GSM4050123	D6A1S2
Embryo_D6A1S3	GSM4050124	D6A1S3
Embryo_D6A1S4	GSM4050125	D6A1S4
Embryo_D6A1B1	GSM4050126	D6A1B1
...	...	...
Embryo_D14A1S5	GSM4050628	D14A1S5
Embryo_D14A1S6	GSM4050634	D14A1S6
Embryo_D14A1S7	GSM4050635	D14A1S7
Embryo_D14A1S8	GSM4050636	D14A1S8
Embryo_D14A1S9	GSM4050637	D14A1S9

555 rows × 2 columns

# Supplementary annotation spreadsheet; the first two rows are skipped and the
# first column becomes the index.
xiang_sample_annotation = pd.read_excel("../data/external/human/Xiang_et_al_2019/41586_2019_1875_MOESM10_ESM.xlsx", skiprows=2, index_col=0)

# Inner join (merge default) of the spreadsheet's 'Sample ID' against the name
# derived from the GEO sample title.
xiang_sample_annotation = xiang_sample_annotation.merge(xiang_metadata, left_on='Sample ID', right_on = 'Sample_name')

# NOTE(review): positional rename - this relies on the merged frame having exactly
# these five columns in this order; confirm if the spreadsheet layout changes.
xiang_sample_annotation.columns = ['Day', 'Embryo ID', 'Group', 'GEO_accession', 'Sample_name']

# Write the GEO sample accessions one per line (no header, no index) for nf-core/fetchngs.
xiang_sample_annotation.GEO_accession.to_csv("../pipeline/fetchngs/human_GSE136447.txt", index=None, header=None)

nf-core_tower.sh \
    Xiang_2020 \
    nextflow run nf-core/fetchngs \
    -r 1.10.0 \
    --input /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/fetchngs/human_GSE136447.txt

nf-core_tower.sh Xiang_2020_human nextflow run brickmanlab/scrnaseq \
    -r feature/smartseq \
    -c /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/smartseq.human.config \
    --input /scratch/Brickman/pipelines/Xiang_2020_human/results/samplesheet/samplesheet.csv

0.5 Yan et al., 2013

# GEO series matrix for GSE36552.
YAN_MATRIX_URL = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE36nnn/GSE36552/matrix/GSE36552_series_matrix.txt.gz"

# Skip the first 52 lines and transpose so that each sample becomes a row.
yam_metadata = pd.read_table(YAN_MATRIX_URL, skiprows=52, index_col=0).T

# Annotations come from the Meistermann et al. table, restricted to the Yan2013 rows.
yam_annotations = metadata = pd.read_csv("../data/external/human/Meistermann_et_al_2021/sampleAnnot.tsv", index_col=0, sep="\t")
yam_annotations = yam_annotations[yam_annotations.Dataset == 'Yan2013'].copy()

# Drop every sample whose label mentions hESC.
_yam_is_hesc = yam_metadata.index.str.contains('hESC')
yam_metadata = yam_metadata[~_yam_is_hesc].copy()

yam_metadata['SampleNames'] = yam_metadata.index.values

# Rewrite the GEO sample labels so they can be joined against the annotation index.
# The substitutions are applied in the listed order.
_YAM_NAME_SUBSTITUTIONS = [
    ("#", "."),
    (" -Cell", ""),
    ("Late blastocyst ", "lateBlasto"),
    ("Morulae ", "Morula"),
    ("Oocyte ", "Oocyte"),
    ("Zygote ", "Zygote"),
    ("2-cell embryo", "e2C"),
    ("4-cell embryo", "e4C"),
    ("8-cell embryo", "e8C"),
]
for _yam_old, _yam_new in _YAM_NAME_SUBSTITUTIONS:
    yam_metadata['SampleNames'] = yam_metadata['SampleNames'].str.replace(_yam_old, _yam_new)

# Only the accession and the rewritten name are needed from here on.
yam_metadata = yam_metadata.loc[:, ['!Sample_geo_accession', 'SampleNames']].copy()
yam_metadata.columns = ['Geo_accession', 'SampleNames']

# Right join: every GEO sample is kept, whether or not it has an annotation row.
yam_annotations = yam_annotations.merge(
    yam_metadata,
    left_index=True,
    right_on='SampleNames',
    how='right',
)

# Write the GEO sample accessions one per line (no header, no index) for nf-core/fetchngs.
yam_annotations.Geo_accession.to_csv("../pipeline/fetchngs/human_GSE36552.txt", index=None, header=None)

nf-core_tower.sh \
    Yan_2013_human \
    nextflow run nf-core/fetchngs \
    -r 1.10.0 \
    --input /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/fetchngs/human_GSE36552.txt

nf-core_tower.sh Yan_2013_human nextflow run brickmanlab/scrnaseq \
    -r feature/smartseq \
    -c /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/smartseq.human.config \
    --input /scratch/Brickman/pipelines/Yan_2013_human/results/samplesheet/samplesheet.csv

0.6 Yanagida et al., 2021 [GSE171820]

# GEO series matrix for GSE171820.
YANAGIDA_URL = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE171nnn/GSE171820/matrix/GSE171820_series_matrix.txt.gz'

# Skip the first 30 lines and transpose so that each sample becomes a row.
yanagida_metadata = pd.read_table(YANAGIDA_URL, skiprows=30, index_col=0).T

# Discard the samples whose source name is 'Blastoid'.
_yanagida_keep = yanagida_metadata['!Sample_source_name_ch1'] != 'Blastoid'
yanagida_metadata = yanagida_metadata[_yanagida_keep].copy()

# Everything selected under '!Sample_characteristics_ch1' is joined into one
# space-separated string per sample (presumably several columns share that name
# after the transpose - verify against the matrix file). The three fields below
# are all parsed from that same string, so it is built once.
_yanagida_characteristics = yanagida_metadata[['!Sample_characteristics_ch1']].agg(' '.join, axis=1)
_YANAGIDA_FIELD_PATTERNS = {
    'lineage': "lineage: (.*) polar_mural",
    'day': "time point: Embryonic day ([0-9]{1})",
    'side': "polar_mural: ([a-z]*)",
}
for _yanagida_field, _yanagida_pattern in _YANAGIDA_FIELD_PATTERNS.items():
    yanagida_metadata[_yanagida_field] = _yanagida_characteristics.str.extract(_yanagida_pattern)

# Keep the parsed fields only and expose the row label as an explicit column.
yanagida_metadata = yanagida_metadata[['lineage','day','side']].copy()
yanagida_metadata['Geo_accession'] = yanagida_metadata.index.values

# Write the GEO sample accessions one per line (no header, no index) for nf-core/fetchngs.
yanagida_metadata.Geo_accession.to_csv("../pipeline/fetchngs/human_GSE171820.txt", index=None, header=None)

nf-core_tower.sh \
    Yanagida_2021_human \
    nextflow run nf-core/fetchngs \
    -r 1.10.0 \
    --input /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/fetchngs/human_GSE171820.txt

nf-core_tower.sh Yanagida_2021_human nextflow run brickmanlab/scrnaseq \
    -r feature/smartseq \
    -c /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/smartseq.human.config \
    --input /scratch/Brickman/pipelines/Yanagida_2021_human/results/samplesheet/samplesheet.csv

0.7 Xue et al., 2013 [GSE44183]

XUE_URL = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44183/matrix/GSE44183-GPL11154_series_matrix.txt.gz'

xue_metadata = pd.read_table(XUE_URL, skiprows=36, index_col = 0).T

# GEO source names of the stages to keep, mapped to the stage labels used in this
# notebook. 'morula' was previously kept by the filter but had no entry here, so it
# stayed lower-case while every other stage was relabelled. The dictionary now
# drives both the filter and the relabelling so the two cannot drift apart.
reannotate_dict = {
    'oocyte': 'Oocyte',
    'pronucleus': 'Pronucleus',
    'zygote': 'Zygote',
    '2-cell blastomere': '2C',
    '4-cell blastomere': '4C',
    '8-cell blastomere': '8C',
    'morula': 'Morula',
}

# Keep only samples whose source name is one of the stages above, then reduce the
# table to the accession and the source name.
xue_metadata = xue_metadata[xue_metadata['!Sample_source_name_ch1'].isin(list(reannotate_dict))].copy()
xue_metadata = xue_metadata[['!Sample_geo_accession','!Sample_source_name_ch1']].copy()

# Relabel in place; values without an entry in the mapping are left as they are.
xue_metadata.replace(reannotate_dict, inplace=True)

xue_metadata['!Sample_geo_accession'].to_csv("../pipeline/fetchngs/human_GSE44183.txt", index=None, header=None)

nf-core_tower.sh \
    Xue_2013_human \
    nextflow run nf-core/fetchngs \
    -r 1.10.0 \
    --input /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/fetchngs/human_GSE44183.txt

# Quantify the Xue et al. reads. --input is the samplesheet written by the
# Xue_2013_human fetchngs run above (it previously pointed at the Meistermann_2021
# samplesheet, which would have re-processed the wrong dataset).
nf-core_tower.sh Xue_2013_human nextflow run brickmanlab/scrnaseq \
    -r feature/smartseq \
    -c /projects/dan1/data/Brickman/projects/proks-salehin-et-al-2023/pipeline/smartseq.human.config \
    --input /scratch/Brickman/pipelines/Xue_2013_human/results/samplesheet/samplesheet.csv

1 Import data after nf-core single cell RNA-seq pipeline

import scanpy as sc
import anndata as ad
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

1.0.1 Annotations

Annotations (adata.obs)

EmbryonicDay
Lineage
Dataset
Technology

1.1 Datasets

Dataset	Stages	Technology
Meistermann2021	E0,E2,E2.5,E3.25,E3.5,E3.75,E4.5	SMART-seq2
Petropoulos2016	E3,E4,E5,E6,E7	SMART-Seq2
Yan2013	MII-Oocyte, Zygote, 2C,4C,8C,morula,blastocyst	SMART-seq
Yanagida	E6,E7	SMART-seq
Nakamura	Cynomolgus Monkey E6 onwards	SC3-seq
Blakeley	E6/7	SMARTer-seq
Xiang	E6,7,8,9,10,12,14	SMART-seq2
Xue		Tang et al. method

Root Dataset	Dataset	Technology	Download	Notes
Radley et al., 2022
X	Meistermann et al, 2021	SMART-SEQ2	PRJEB30442
X	Petropoulos et al, 2016	SMART-SEQ2	E-MTAB-3929	SCPORTAL
X	Yan et al, 2013	SMART-SEQ	GSE36552	SCPORTAL
X	Yanagida et al, 2021	SMART-SEQ2	GSE171820
X	Nakamura et al, 2017	SMART-SEQ2
Blakeley et al, 2015		SMARTer Ultra Low RNA Kit	GSE66507
Tyser et al, 2021	SMARTSEQ2	Portal	GASTRULATION (CS7)
Xiang et al, 2019	SMART-SEQ2	GSE136447
Xue	Tang et al. method	GSE44183

1.2 Initial setup

For the pipeline, we need to normalise using TPM, which requires the average length of each gene. The original iteration of the notebooks linked GENE SYMBOL to MEAN GENE LENGTH. This time, I will instead link ENSEMBL GENE CODE to MEAN GENE LENGTH.

python3.10 ../data/external/human/gtftools.py -l ../data/external/human/Homo_sapiens.GRCh38.110.gene_length.tsv /scratch/Brickman/references/homo_sapiens/ensembl/GRCh38_110/Homo_sapiens.GRCh38.110.gtf

# Gene length table written by the gtftools command above; the first column
# becomes the index.
gtf = pd.read_table(
    "../data/external/human/Homo_sapiens.GRCh38.110.gene_length.tsv",
    index_col=0,
)

# Single-column frame holding the 'mean' length per gene, exposed as 'length',
# which is the column name normalize_smartseq looks up.
gene_lengths = gtf.loc[:, ['mean']].rename(columns={'mean': 'length'})

def normalize_smartseq(adata: sc.AnnData, gene_len: pd.DataFrame) -> sc.AnnData:
    """Return a gene-length-normalised copy of *adata*.

    Only genes present in both ``adata.var_names`` and ``gene_len.index`` are
    kept. Each gene's values are divided by its length, multiplied by the median
    length of the shared genes, and rounded to the nearest integer.

    Args:
        adata: Expression matrix to normalise.
        gene_len: Table indexed by gene identifier with a ``"length"`` column.

    Returns:
        A new AnnData built from the normalised matrix and ``adata.obs``. No
        other annotation of *adata* is passed to it, and *adata* is not
        reassigned here, so callers must use the return value.
    """
    print("SMART-SEQ: Normalization")

    # Genes for which both expression values and a length are available.
    shared_genes = adata.var_names.intersection(gene_len.index)
    n_shared = shared_genes.shape[0]
    print(f"SMART-SEQ: Common genes {n_shared}")

    gene_sizes = gene_len.loc[shared_genes, "length"].values
    median_size = np.median(gene_sizes)

    # Build the result from the gene-subset matrix, carrying over the cell annotations.
    subset = adata[:, shared_genes]
    result = sc.AnnData(subset.X, obs=adata.obs, dtype=np.float32)
    result.var_names = shared_genes

    # Length-normalise, rescale by the median gene length, then round.
    result.X = result.X / gene_sizes * median_size
    result.X = np.rint(result.X)
    return result

1.3 Meistermann et al., 2021

For annotation, I will wipe the published annotations of the Meistermann dataset, setting everything to ‘Unknown’. The annotations do not contain an ICM.

# Expression data for the Meistermann samples (presumably the output of the
# scrnaseq pipeline run - confirm).
meistermann_h5ad = sc.read_h5ad("../data/external/human/meistermann_2021_reprocessed.h5ad")

# Rebuild the annotation table: ENA run report + the 'ThisPaper' rows of the sample
# annotation, inner-joined on annotation 'Name' == ENA 'sample_alias'.
MEISTERMANN_ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB30442&result=read_run&fields=study_accession,sample_accession,experiment_accession,run_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp,sample_alias&format=tsv&limit=0"
meistermann_metadata = pd.read_table(MEISTERMANN_ENA_URL)
meistermann_sample_annotation = pd.read_csv("../data/external/human/Meistermann_et_al_2021/sampleAnnot.tsv", index_col=0, sep="\t")
meistermann_sample_annotation = meistermann_sample_annotation[meistermann_sample_annotation['Dataset'] == 'ThisPaper']
meistermann_sample_annotation = meistermann_sample_annotation.merge(meistermann_metadata, left_on = 'Name', right_on = 'sample_alias')

meistermann_sample_annotation.columns

Index(['Embryo', 'Branches', 'EmbryoDay', 'Stage', 'BlastoDissectionSide',
       'Dataset', 'Treatment', 'Stirparo.lineage', 'Author.lineage',
       'Pseudotime', 'totalCounts', 'totalGenesExpr', 'clusterUmap',
       'study_accession', 'sample_accession', 'experiment_accession',
       'run_accession', 'tax_id', 'scientific_name', 'fastq_ftp',
       'submitted_ftp', 'sra_ftp', 'sample_alias'],
      dtype='object')

meistermann_sample_annotation.clusterUmap.unique()

array(['B1.EPI', 'B1_B2', 'early_TE', 'EPI', 'medium_TE', 'late_TE',
       'EPI.PrE', 'Morula', 'EightCells', 'EPI.PrE.TE'], dtype=object)

meistermann_sample_annotation.Branches.unique()

array(['6.Epiblast', '3.Early blastocyst', '5.Early trophectoderm',
       '4.Inner cell mass', '8.TE.NR2F2-', '9.TE.NR2F2+', '2.Morula',
       '1.Pre-morula', '7.Primitive endoderm'], dtype=object)

meistermann_h5ad.obs.loc[:,['sample','run_accession']].merge(meistermann_sample_annotation, left_on='run_accession', right_on='run_accession').loc[:,['Stage','Dataset', 'clusterUmap','EmbryoDay']]

	Stage	Dataset	clusterUmap	EmbryoDay
0	B2+	ThisPaper	B1.EPI	5.0
1	B2+	ThisPaper	B1_B2	5.0
2	B2+	ThisPaper	B1.EPI	5.0
3	B2+	ThisPaper	B1.EPI	5.0
4	B2+	ThisPaper	B1_B2	5.0
...	...	...	...	...
145	B5	ThisPaper	medium_TE	6.0
146	B5	ThisPaper	medium_TE	6.0
147	B5	ThisPaper	medium_TE	6.0
148	B5	ThisPaper	medium_TE	6.0
149	B4	ThisPaper	early_TE	6.0

150 rows × 4 columns

# Work on a copy so the object loaded from disk stays untouched.
meistermann = meistermann_h5ad.copy()

# Join the sample annotation onto the cells via run_accession. reset_index() /
# set_index('index') carries the original obs index through the merge, which would
# otherwise replace it with a fresh integer index.
# NOTE(review): the merge is an inner join, so this assumes every cell finds
# exactly one annotation row - confirm.
meistermann.obs = meistermann_h5ad.obs.loc[:,['sample','run_accession']].reset_index().merge(meistermann_sample_annotation, left_on='run_accession', right_on='run_accession').set_index('index')

# Only the embryonic day and the UMAP cluster label are carried forward.
meistermann_reannotation = meistermann.obs[['EmbryoDay','clusterUmap']]

meistermann_reannotation.head()

	EmbryoDay	clusterUmap
index
ERX3015937_ERX3015937	5.0	B1.EPI
ERX3015939_ERX3015939	5.0	B1_B2
ERX3015940_ERX3015940	5.0	B1.EPI
ERX3015941_ERX3015941	5.0	B1.EPI
ERX3015936_ERX3015936	5.0	B1_B2

# Mapping from the published clusterUmap labels to lineage names; labels that
# combine several lineages are mapped to 'Unknown'.
lineage_renaming = {
    'early_TE': 'Trophectoderm',
    'late_TE': 'Trophectoderm',
    'medium_TE':'Trophectoderm',
    'EPI':'Epiblast',
    'PrE':'Primitive Endoderm',
    'PrE.TE':'Unknown',
    'B1.EPI':'Unknown',
    'EPI.PrE': 'Unknown',
    'EPI.PrE.TE':'Unknown',
    'EPI.early_TE':'Unknown',
    'B1_B2':'Blastocyst',
    'EightCells': '8C',
    'Morula': 'Morula',
}

# Apply the mapping to the clusterUmap column only.
meistermann_reannotation = meistermann_reannotation.replace({
    'clusterUmap':lineage_renaming
})

# Rename to the shared obs schema: day, ct, experiment, technology.
meistermann_reannotation.columns = ['day', 'ct']
# The mapped labels are overwritten here: every cell's 'ct' is set to 'Unknown',
# so the renaming above has no effect on the final table.
meistermann_reannotation['ct'] = 'Unknown'
meistermann_reannotation['experiment'] = 'Meistermann_2021'
meistermann_reannotation['technology'] = 'SMARTSeq2'
meistermann_reannotation.head()

	day	ct	experiment	technology
index
ERX3015937_ERX3015937	5.0	Unknown	Meistermann_2021	SMARTSeq2
ERX3015939_ERX3015939	5.0	Unknown	Meistermann_2021	SMARTSeq2
ERX3015940_ERX3015940	5.0	Unknown	Meistermann_2021	SMARTSeq2
ERX3015941_ERX3015941	5.0	Unknown	Meistermann_2021	SMARTSeq2
ERX3015936_ERX3015936	5.0	Unknown	Meistermann_2021	SMARTSeq2

meistermann.obs = meistermann_reannotation

# normalize_smartseq builds and returns a new AnnData instead of modifying its
# argument. The result was previously discarded, so the steps that follow ran on
# data that had not been gene-length normalised; bind it back to the name.
# The returned object is built from the matrix and obs only and is limited to
# genes that have a length.
meistermann = normalize_smartseq(meistermann, gene_lengths)
meistermann

SMART-SEQ: Normalization
SMART-SEQ: Common genes 62663

AnnData object with n_obs × n_vars = 150 × 62663
    obs: 'day', 'ct', 'experiment', 'technology'

# Drop cells with fewer than 10 total counts, then cells with fewer than 10
# detected genes.
sc.pp.filter_cells(meistermann, min_counts=10)
sc.pp.filter_cells(meistermann, min_genes=10)
# Keep a copy of the matrix as it is at this point, before X is rescaled and
# log-transformed below.
meistermann.layers["counts"] = meistermann.X.copy()
# Scale every cell to a total of 10,000, then apply log1p.
sc.pp.normalize_total(meistermann, target_sum=10_000)
sc.pp.log1p(meistermann)
# Store the normalised, log-transformed state in .raw.
meistermann.raw = meistermann

1.4 Petropoulos et al., 2016

# NOTE(review): this file name ends in "_reprocesses" while the other datasets use
# "_reprocessed" - confirm it matches the file on disk.
petropoulos_h5ad = sc.read_h5ad("../data/external/human/petropoulos_2016_reprocesses.h5ad")

petropoulos_h5ad

AnnData object with n_obs × n_vars = 1496 × 62754
    obs: 'sample', 'fastq_1', 'run_accession', 'experiment_accession', 'sample_accession', 'secondary_sample_accession', 'study_accession', 'secondary_study_accession', 'submission_accession', 'run_alias', 'experiment_alias', 'sample_alias', 'study_alias', 'library_layout', 'library_selection', 'library_source', 'library_strategy', 'library_name', 'instrument_model', 'instrument_platform', 'scientific_name', 'sample_title', 'experiment_title', 'study_title', 'sample_description', 'fastq_md5', 'fastq_ftp', 'fastq_galaxy', 'fastq_aspera'
    var: 'gene_symbol'

# Rebuild the annotation table: ENA run report with the alias trimmed to start at
# the first "E<digit>", inner-joined to the 'Petropoulos2016' rows of the
# Meistermann et al. sample annotation.
PETROPOULOS_URL = "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB11202&result=read_run&fields=study_accession,sample_accession,experiment_accession,run_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp,sample_alias&format=tsv&limit=0"
petropoulos_metadata = pd.read_table(PETROPOULOS_URL)
petropoulos_metadata.sample_alias = petropoulos_metadata.sample_alias.str.extract("(E[0-9].*$)")
petropoulos_metadata_short = petropoulos_metadata[['run_accession', 'sample_alias']]
petropoulos_sample_annotation = pd.read_csv("../data/external/human/Meistermann_et_al_2021/sampleAnnot.tsv", index_col=0, sep="\t")
petropoulos_sample_annotation = petropoulos_sample_annotation.loc[petropoulos_sample_annotation.Dataset == 'Petropoulos2016']
petropoulos_sample_annotation = petropoulos_sample_annotation.merge(petropoulos_metadata_short, left_on = 'Name', right_on = 'sample_alias')

# Work on a copy and join the annotation onto the cells via run_accession;
# reset_index() / set_index('index') carries the original obs index through the merge.
petropoulos = petropoulos_h5ad.copy()
petropoulos.obs = petropoulos_h5ad.obs.loc[:,['sample','run_accession']].reset_index().merge(petropoulos_sample_annotation, left_on='run_accession', right_on='run_accession').set_index('index')

petropoulos

AnnData object with n_obs × n_vars = 1496 × 62754
    obs: 'sample', 'run_accession', 'Embryo', 'Branches', 'EmbryoDay', 'Stage', 'BlastoDissectionSide', 'Dataset', 'Treatment', 'Stirparo.lineage', 'Author.lineage', 'Pseudotime', 'totalCounts', 'totalGenesExpr', 'clusterUmap', 'sample_alias'
    var: 'gene_symbol'

petropoulos_reannotation = petropoulos.obs

pd.crosstab(petropoulos_reannotation['Stirparo.lineage'], petropoulos_reannotation.Stage)

Stage	8C	B	B2+	M	MC
Stirparo.lineage
EPI	0	44	0	0	0
ICM	0	65	0	0	0
TE	0	927	0	0	0
intermediate	0	66	0	0	0
prE	0	28	0	0	0
undefined	78	43	24	120	47

pd.crosstab(petropoulos_reannotation.Branches, petropoulos_reannotation.EmbryoDay)

EmbryoDay	3.0	4.0	5.0	6.0	7.0
Branches
1.Pre-morula	80	8	0	0	0
2.Morula	0	150	16	0	0
3.Early blastocyst	0	29	116	1	0
4.Inner cell mass	0	0	13	6	0
5.Early trophectoderm	0	0	97	27	2
6.Epiblast	0	0	114	33	16
7.Primitive endoderm	0	0	7	45	42
8.TE.NR2F2-	0	0	3	226	141
9.TE.NR2F2+	0	0	0	71	253

pd.crosstab(petropoulos_reannotation.clusterUmap, petropoulos_reannotation.Branches)

Branches	1.Pre-morula	2.Morula	3.Early blastocyst	4.Inner cell mass	5.Early trophectoderm	6.Epiblast	7.Primitive endoderm	8.TE.NR2F2-	9.TE.NR2F2+
clusterUmap
B1.EPI	0	0	0	1	0	3	0	0	0
B1_B2	0	25	136	2	7	1	0	0	0
EPI	0	0	0	0	0	104	29	0	0
EPI.PrE	0	0	0	0	0	19	9	0	0
EPI.PrE.TE	0	0	0	1	2	8	19	11	3
EPI.early_TE	0	0	0	0	1	14	0	0	0
EightCells	83	0	0	0	0	0	0	0	0
Morula	5	141	7	0	0	0	0	0	0
PrE	0	0	0	0	0	9	32	4	3
PrE.TE	0	0	0	0	0	0	1	0	15
early_TE	0	0	3	15	98	5	0	0	0
late_TE	0	0	0	0	0	0	3	39	290
medium_TE	0	0	0	0	18	0	1	316	13

# Mapping from clusterUmap labels to lineage names; labels that combine several
# lineages are mapped to 'Unknown'.
# The clusterUmap column is not among the columns kept at the end of this block,
# so this renaming does not reach the final table.
clusterUmap_renaming = {
    'early_TE': 'Trophectoderm',
    'late_TE': 'Trophectoderm',
    'medium_TE':'Trophectoderm',
    'EPI':'Epiblast',
    'PrE':'Primitive Endoderm',
    'PrE.TE':'Unknown',
    'B1.EPI':'Unknown',
    'EPI.PrE': 'Unknown',
    'EPI.PrE.TE':'Unknown',
    'EPI.early_TE':'Unknown',
    'B1_B2':'Blastocyst',
    'EightCells': '8C',
    'Morula': 'Morula',
}
petropoulos_reannotation = petropoulos_reannotation.replace({
    'clusterUmap':clusterUmap_renaming
})

# Mapping from the 'Stirparo.lineage' labels to lineage names; this column
# becomes 'ct' below.
stirparoLineage_renaming = {
    'EPI':'Epiblast',
    'prE':'Primitive Endoderm',
    'ICM':'Inner Cell Mass',
    'TE': 'Trophectoderm',
    'intermediate': 'Unknown',
    'undefined': 'Unknown'
}
petropoulos_reannotation = petropoulos_reannotation.replace({
    'Stirparo.lineage':stirparoLineage_renaming
})

# Number of cells without a Stirparo lineage (displayed for inspection only).
np.sum(petropoulos_reannotation['Stirparo.lineage'].isna())

# Cells without a lineage are labelled 'Unknown' as well.
petropoulos_reannotation.loc[petropoulos_reannotation['Stirparo.lineage'].isna(),['Stirparo.lineage']] = 'Unknown'

# Reduce to the shared obs schema. 'Dataset' is kept only as a placeholder for
# 'experiment', whose values are overwritten two lines below.
petropoulos_reannotation = petropoulos_reannotation[['EmbryoDay','Dataset','Stirparo.lineage']].copy()
petropoulos_reannotation.columns = ['day','experiment','ct']
petropoulos_reannotation['experiment'] = 'Petropoulos_2016'
petropoulos_reannotation['technology'] = 'SMARTSeq2'
petropoulos_reannotation.head()

	day	experiment	ct	technology
index
ERX1120888_ERX1120888	3.0	Petropoulos_2016	Unknown	SMARTSeq2
ERX1120887_ERX1120887	3.0	Petropoulos_2016	Unknown	SMARTSeq2
ERX1120886_ERX1120886	3.0	Petropoulos_2016	Unknown	SMARTSeq2
ERX1120885_ERX1120885	3.0	Petropoulos_2016	Unknown	SMARTSeq2
ERX1120890_ERX1120890	3.0	Petropoulos_2016	Unknown	SMARTSeq2

petropoulos.obs = petropoulos_reannotation

# normalize_smartseq builds and returns a new AnnData instead of modifying its
# argument. The result was previously discarded, so the steps that follow ran on
# data that had not been gene-length normalised; bind it back to the name.
# The returned object is built from the matrix and obs only and is limited to
# genes that have a length.
petropoulos = normalize_smartseq(petropoulos, gene_lengths)
petropoulos

SMART-SEQ: Normalization
SMART-SEQ: Common genes 62663

AnnData object with n_obs × n_vars = 1496 × 62663
    obs: 'day', 'experiment', 'ct', 'technology'

# Drop cells with fewer than 10 total counts, then cells with fewer than 10
# detected genes.
sc.pp.filter_cells(petropoulos, min_counts=10)
sc.pp.filter_cells(petropoulos, min_genes=10)
# Keep a copy of the matrix as it is at this point, before X is rescaled and
# log-transformed below.
petropoulos.layers["counts"] = petropoulos.X.copy()
# Scale every cell to a total of 10,000, then apply log1p.
sc.pp.normalize_total(petropoulos, target_sum=10_000)
sc.pp.log1p(petropoulos)
# Store the normalised, log-transformed state in .raw.
petropoulos.raw = petropoulos

1.5 Xiang 2020

xiang_h5ad = sc.read_h5ad("../data/external/human/xiang_2020_reprocessed.h5ad")

# GSE136447 is split across two GEO platform records; each has its own series matrix.
_XIANG_MATRIX_URLS = (
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE136nnn/GSE136447/matrix/GSE136447-GPL20795_series_matrix.txt.gz",
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE136nnn/GSE136447/matrix/GSE136447-GPL23227_series_matrix.txt.gz",
)

# Both files are read the same way: skip 29 lines, read a single data row and
# use the first column as row labels. Transposed so each sample becomes a row.
_xiang_read_options = dict(skiprows=29, nrows=1, index_col=0)
xiang_metadata_1 = pd.read_table(_XIANG_MATRIX_URLS[0], **_xiang_read_options).T
xiang_metadata_2 = pd.read_table(_XIANG_MATRIX_URLS[1], **_xiang_read_options).T
xiang_metadata = pd.concat([xiang_metadata_1, xiang_metadata_2])

# Copy the row labels into a column, then keep only what follows the first underscore.
xiang_metadata['Sample_name'] = xiang_metadata.index.to_list()
xiang_metadata['Sample_name'] = xiang_metadata['Sample_name'].str.extract("_(.*$)")

# Supplementary annotation spreadsheet, inner-joined to the GEO metadata on the
# sample name, then given this notebook's column names (positional rename).
xiang_sample_annotation = pd.read_excel(
    "../data/external/human/Xiang_et_al_2019/41586_2019_1875_MOESM10_ESM.xlsx",
    skiprows=2,
    index_col=0,
).merge(xiang_metadata, left_on='Sample ID', right_on='Sample_name')
xiang_sample_annotation.columns = ['Day', 'Embryo ID', 'Group', 'GEO_accession', 'Sample_name']

xiang_sample_annotation

	Day	Embryo ID	Group	GEO_accession	Sample_name
0	D6	D6A1	ICM	GSM4050122	D6A1S1
1	D6	D6A1	EPI	GSM4050123	D6A1S2
2	D6	D6A1	ICM	GSM4050124	D6A1S3
3	D6	D6A1	ICM	GSM4050125	D6A1S4
4	D6	D6A1	ICM	GSM4050126	D6A1B1
...	...	...	...	...	...
550	D14	D14A3	EPI	GSM4050672	D14A3S29
551	D14	D14A3	EVT	GSM4050673	D14A3S30
552	D14	D14A3	CTB	GSM4050674	D14A3S5
553	D14	D14A3	EVT	GSM4050675	D14A3S7
554	D14	D14A3	EVT	GSM4050676	D14A3S8

555 rows × 5 columns

# Work on a copy so the object loaded from disk stays untouched.
xiang = xiang_h5ad.copy()

# Join the annotation onto the cells by matching obs 'sample_alias' to the GEO
# sample accession; reset_index() / set_index('index') carries the original obs
# index through the merge.
xiang.obs = xiang_h5ad.obs.loc[:,['sample','sample_alias']].reset_index().merge(xiang_sample_annotation, left_on='sample_alias', right_on='GEO_accession').set_index('index')

xiang

AnnData object with n_obs × n_vars = 555 × 62754
    obs: 'sample', 'sample_alias', 'Day', 'Embryo ID', 'Group', 'GEO_accession', 'Sample_name'
    var: 'gene_symbol'

xiang_reannotation = xiang.obs

# 'D<n>' day labels -> integer day.
day_renaming = {
    'D10':10,
    'D12':12,
    'D14':14,
    'D6':6,
    'D7':7,
    'D8':8,
    'D9':9,
}

# Published group labels -> lineage names; CTB, EVT and STB are all mapped to
# 'Trophectoderm'.
group_renaming = {
    'CTB':'Trophectoderm',
    'EPI':'Epiblast',
    'EVT':'Trophectoderm',
    'ICM':'Inner Cell Mass',
    'PSA-EPI':'PostImplantation-Epiblast',
    'PrE':'Primitive Endoderm',
    'STB':'Trophectoderm'
}

# Apply each mapping to its own column only.
xiang_reannotation = xiang_reannotation.replace({'Day':day_renaming, 'Group': group_renaming})

# Reduce to the shared obs schema: day, ct, experiment, technology.
xiang_reannotation = xiang_reannotation[['Day', 'Group']].copy()
xiang_reannotation.columns = ['day', 'ct']
xiang_reannotation['experiment'] = 'Xiang_2020'
xiang_reannotation['technology'] = 'SMARTSeq2'
xiang_reannotation.head()

	day	ct	experiment	technology
index
SRX6774526_SRX6774526	12	Trophectoderm	Xiang_2020	SMARTSeq2
SRX6774449_SRX6774449	6	Epiblast	Xiang_2020	SMARTSeq2
SRX6774468_SRX6774468	6	Epiblast	Xiang_2020	SMARTSeq2
SRX6774508_SRX6774508	12	Trophectoderm	Xiang_2020	SMARTSeq2
SRX6774478_SRX6774478	10	Epiblast	Xiang_2020	SMARTSeq2

xiang.obs = xiang_reannotation

# normalize_smartseq builds and returns a new AnnData instead of modifying its
# argument. The result was previously discarded, so the steps that follow ran on
# data that had not been gene-length normalised; bind it back to the name.
# The returned object is built from the matrix and obs only and is limited to
# genes that have a length.
xiang = normalize_smartseq(xiang, gene_lengths)
xiang

SMART-SEQ: Normalization
SMART-SEQ: Common genes 62663

AnnData object with n_obs × n_vars = 555 × 62663
    obs: 'day', 'ct', 'experiment', 'technology'

# Drop cells with fewer than 10 total counts, then cells with fewer than 10
# detected genes.
sc.pp.filter_cells(xiang, min_counts=10)
sc.pp.filter_cells(xiang, min_genes=10)
# Keep a copy of the matrix as it is at this point, before X is rescaled and
# log-transformed below.
xiang.layers["counts"] = xiang.X.copy()
# Scale every cell to a total of 10,000, then apply log1p.
sc.pp.normalize_total(xiang, target_sum=10_000)
sc.pp.log1p(xiang)
# Store the normalised, log-transformed state in .raw.
xiang.raw = xiang

1.6 Yan 2013

yan_h5ad = sc.read_h5ad("../data/external/human/yan_2013_reprocessed.h5ad")

# GEO series matrix for GSE36552; skip the first 52 lines and transpose so that
# each sample becomes a row.
YAN_MATRIX_URL = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE36nnn/GSE36552/matrix/GSE36552_series_matrix.txt.gz"
yan_metadata = pd.read_table(YAN_MATRIX_URL, skiprows=52, index_col=0).T

# Annotations come from the Meistermann et al. table, restricted to the Yan2013 rows.
yan_annotations = metadata = pd.read_csv("../data/external/human/Meistermann_et_al_2021/sampleAnnot.tsv", index_col=0, sep="\t")
yan_annotations = yan_annotations[yan_annotations.Dataset == 'Yan2013'].copy()

# Drop every sample whose label mentions hESC.
_yan_is_hesc = yan_metadata.index.str.contains('hESC')
yan_metadata = yan_metadata[~_yan_is_hesc].copy()
yan_metadata['SampleNames'] = yan_metadata.index.values

# Rewrite the GEO sample labels so they can be joined against the annotation index.
# The substitutions are applied in the listed order.
_YAN_NAME_SUBSTITUTIONS = [
    ("#", "."),
    (" -Cell", ""),
    ("Late blastocyst ", "lateBlasto"),
    ("Morulae ", "Morula"),
    ("Oocyte ", "Oocyte"),
    ("Zygote ", "Zygote"),
    ("2-cell embryo", "e2C"),
    ("4-cell embryo", "e4C"),
    ("8-cell embryo", "e8C"),
]
for _yan_old, _yan_new in _YAN_NAME_SUBSTITUTIONS:
    yan_metadata['SampleNames'] = yan_metadata['SampleNames'].str.replace(_yan_old, _yan_new)

# Only the accession and the rewritten name are needed from here on.
yan_metadata = yan_metadata.loc[:, ['!Sample_geo_accession', 'SampleNames']].copy()
yan_metadata.columns = ['Geo_accession', 'SampleNames']

# Right join: every GEO sample is kept, whether or not it has an annotation row.
yan_annotations = yan_annotations.merge(
    yan_metadata,
    left_index=True,
    right_on='SampleNames',
    how='right',
)

yan = yan_h5ad.copy()

# Join the annotation onto the cells by matching obs 'sample_alias' to the GEO
# accession, carrying the original obs index through the merge.
_yan_cells = yan_h5ad.obs.loc[:, ['sample', 'sample_alias']].reset_index()
yan.obs = _yan_cells.merge(yan_annotations, left_on='sample_alias', right_on='Geo_accession').set_index('index')

The Yan 2013 data is encoded by stage (Oocyte, Zygote, etc). To convert to Embryonic day, the samples were encoded as follows:

Zygote –> E0.75; Collected 17h post-IVF
e2C –> E1.25; Collected 27h post-IVF
e4C –> E2.0; Collected 48h post-IVF
e8C –> E3.0

# Same object as yan.obs, so the .loc assignments below also modify yan.obs.
yan_reannotation = yan.obs

# Assign the embryonic day and stage label from the sample name for the
# pre-blastocyst stages. Morula samples only get the stage label; their EmbryoDay
# is left as it came from the annotation table.
yan_reannotation.loc[yan_reannotation['SampleNames'].str.contains('Oocyte'),['EmbryoDay','clusterUmap', 'Stirparo.lineage']] = [0,'Oocyte','Oocyte']
yan_reannotation.loc[yan_reannotation['SampleNames'].str.contains('Zygote'),['EmbryoDay','clusterUmap', 'Stirparo.lineage']] = [0.75, 'Zygote', 'Zygote']
yan_reannotation.loc[yan_reannotation['SampleNames'].str.contains('e2C'),['EmbryoDay','clusterUmap', 'Stirparo.lineage']] = [1.25, '2C', '2C']
yan_reannotation.loc[yan_reannotation['SampleNames'].str.contains('e4C'),['EmbryoDay','clusterUmap', 'Stirparo.lineage']] = [2.0, '4C', '4C']
yan_reannotation.loc[yan_reannotation['SampleNames'].str.contains('e8C'),['EmbryoDay','clusterUmap', 'Stirparo.lineage']] = [3.0, '8C', '8C']
yan_reannotation.loc[yan_reannotation['SampleNames'].str.contains('Morula'),['clusterUmap', 'Stirparo.lineage']] = ['Morula', 'Morula']

# Mapping from clusterUmap labels to lineage names; labels that combine several
# lineages are mapped to 'Unknown'.
clusterUmap_renaming = {
    'early_TE': 'Trophectoderm',
    'late_TE': 'Trophectoderm',
    'medium_TE':'Trophectoderm',
    'EPI':'Epiblast',
    'PrE':'Primitive Endoderm',
    'PrE.TE':'Unknown',
    'B1.EPI':'Unknown',
    'EPI.PrE': 'Unknown',
    'EPI.PrE.TE':'Unknown',
    'EPI.early_TE':'Unknown',
    'B1_B2':'Blastocyst',
    'EightCells': '8C',
    'Morula': 'Morula',
}
yan_reannotation = yan_reannotation.replace({
    'clusterUmap':clusterUmap_renaming
})

# Mapping from the 'Stirparo.lineage' labels to lineage names.
stirparoLineage_renaming = {
    'EPI':'Epiblast',
    'prE':'Primitive Endoderm',
    'ICM':'Inner Cell Mass',
    'TE': 'Trophectoderm',
    'intermediate': 'Unknown',
    'undefined': 'Unknown'
}
yan_reannotation = yan_reannotation.replace({
    'Stirparo.lineage':stirparoLineage_renaming
})

# Cells that still have no lineage are labelled 'Unknown'.
yan_reannotation.loc[yan_reannotation['Stirparo.lineage'].isna(),['Stirparo.lineage']] = 'Unknown'

yan_reannotation.head()

	sample	sample_alias	Embryo	Branches	EmbryoDay	Stage	BlastoDissectionSide	Dataset	Treatment	Stirparo.lineage	Author.lineage	Pseudotime	totalCounts	totalGenesExpr	clusterUmap	Geo_accession	SampleNames
index
SRX144398_SRX144398	SRX144398	GSM922204	lateBlasto.1	4.Inner cell mass	6.0	B	NaN	Yan2013	NO	Trophectoderm	TE	32.314831	11403434.0	12724.0	Trophectoderm	GSM922204	lateBlasto.1.11
SRX144343_SRX144343	SRX144343	GSM922149	NaN	NaN	2.0	NaN	NaN	NaN	NaN	4C	NaN	NaN	NaN	NaN	4C	GSM922149	e4C.1.4
SRX144359_SRX144359	SRX144359	GSM922165	8C.2	1.Pre-morula	3.0	8C	NaN	Yan2013	NO	8C	NaN	2.626271	18039226.0	16018.0	8C	GSM922165	e8C.2.4
SRX144408_SRX144408	SRX144408	GSM922214	lateBlasto.2	4.Inner cell mass	6.0	B	NaN	Yan2013	NO	Trophectoderm	TE	35.919631	21209246.0	12577.0	Unknown	GSM922214	lateBlasto.2.9
SRX144361_SRX144361	SRX144361	GSM922167	8C.2	1.Pre-morula	3.0	8C	NaN	Yan2013	NO	8C	NaN	0.329333	17166536.0	17739.0	8C	GSM922167	e8C.2.6

# Reduce to the shared obs schema: day, ct, experiment, technology. The lineage
# label is taken from 'Stirparo.lineage'; clusterUmap is not carried forward.
yan_reannotation = yan_reannotation[['EmbryoDay', 'Stirparo.lineage']].copy()
yan_reannotation.columns = ['day','ct']
yan_reannotation['experiment'] = 'Yan_2013'
yan_reannotation['technology'] = 'SMARTSeq'
yan_reannotation.head()

	day	ct	experiment	technology
index
SRX144398_SRX144398	6.0	Trophectoderm	Yan_2013	SMARTSeq
SRX144343_SRX144343	2.0	4C	Yan_2013	SMARTSeq
SRX144359_SRX144359	3.0	8C	Yan_2013	SMARTSeq
SRX144408_SRX144408	6.0	Trophectoderm	Yan_2013	SMARTSeq
SRX144361_SRX144361	3.0	8C	Yan_2013	SMARTSeq

# Use the harmonised table (day, ct, experiment, technology) as the per-cell metadata.
yan.obs = yan_reannotation

# normalize_smartseq and gene_lengths are defined outside this section.
# NOTE(review): presumably length-normalises on the genes shared with gene_lengths — confirm.
normalize_smartseq(yan, gene_lengths)

SMART-SEQ: Normalization
SMART-SEQ: Common genes 62663

AnnData object with n_obs × n_vars = 90 × 62663
    obs: 'day', 'ct', 'experiment', 'technology'

# Drop cells with fewer than 10 total counts or fewer than 10 detected genes.
sc.pp.filter_cells(yan, min_counts=10)
sc.pp.filter_cells(yan, min_genes=10)
# Keep a copy of X as it is before library-size scaling and log-transform.
# NOTE(review): X has already passed through normalize_smartseq, so "counts" may not hold raw counts — confirm.
yan.layers["counts"] = yan.X.copy()
# Scale every cell to a total of 10,000, then apply log(1 + x).
sc.pp.normalize_total(yan, target_sum=10_000)
sc.pp.log1p(yan)
# Store the log-normalised object in .raw.
yan.raw = yan

1.7 Yanagida 2021

# Reprocessed expression data for the Yanagida 2021 dataset.
yanagida_h5ad = sc.read_h5ad("../data/external/human/yanagida_2021_reprocessed.h5ad")

# GEO series matrix for GSE171820, transposed so that each row is one sample.
YANAGIDA_URL = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE171nnn/GSE171820/matrix/GSE171820_series_matrix.txt.gz'
yanagida_metadata = pd.read_table(YANAGIDA_URL, skiprows=30, index_col=0).T
is_not_blastoid = yanagida_metadata['!Sample_source_name_ch1'] != 'Blastoid'
yanagida_metadata = yanagida_metadata[is_not_blastoid].copy()

# All '!Sample_characteristics_ch1' columns of a sample are joined into one
# string, from which the individual "key: value" fields are extracted.
yanagida_characteristics = yanagida_metadata[['!Sample_characteristics_ch1']].agg(' '.join, axis=1)
yanagida_metadata['lineage'] = yanagida_characteristics.str.extract("lineage: (.*) polar_mural")
yanagida_metadata['day'] = yanagida_characteristics.str.extract("time point: Embryonic day ([0-9]{1})")
yanagida_metadata['side'] = yanagida_characteristics.str.extract("polar_mural: ([a-z]*)")
yanagida_metadata = yanagida_metadata[['lineage', 'day', 'side']].copy()
# The GSM accession lives in the index; expose it as a column for merging.
yanagida_metadata['Geo_accession'] = yanagida_metadata.index.values

yanagida = yanagida_h5ad.copy()

yanagida_metadata

!Sample_geo_accession	lineage	day	side	Geo_accession
GSM5234744	Trophectoderm	7	polar	GSM5234744
GSM5234745	Trophectoderm	7	polar	GSM5234745
GSM5234746	Trophectoderm	7	polar	GSM5234746
GSM5234747	Epiblast	6	polar	GSM5234747
GSM5234748	Epiblast	6	polar	GSM5234748
...	...	...	...	...
GSM5235116	Trophectoderm	6	polar	GSM5235116
GSM5235117	Trophectoderm	6	polar	GSM5235117
GSM5235118	Trophectoderm	6	polar	GSM5235118
GSM5235119	Trophectoderm	6	mural	GSM5235119
GSM5235128	Trophectoderm	6	mural	GSM5235128

228 rows × 4 columns

# Attach the GEO annotation to each cell by matching its sample alias to the
# GSM accession; reset_index/set_index carry the obs index through the merge.
yanagida.obs = (
    yanagida_h5ad.obs[['sample', 'sample_alias']]
    .reset_index()
    .merge(yanagida_metadata, left_on='sample_alias', right_on='Geo_accession')
    .set_index('index')
)

yanagida_reannotation = yanagida.obs.loc[:, ['lineage', 'day']]

yanagida_reannotation

	lineage	day
index
SRX10567995_SRX10567995	Trophectoderm	6
SRX10567984_SRX10567984	Trophectoderm	6
SRX10568025_SRX10568025	Trophectoderm	6
SRX10567983_SRX10567983	Trophectoderm	6
SRX10567987_SRX10567987	Trophectoderm	7
...	...	...
SRX10568348_SRX10568348	Trophectoderm	6
SRX10568337_SRX10568337	Trophectoderm	6
SRX10568339_SRX10568339	Trophectoderm	6
SRX10568338_SRX10568338	Trophectoderm	6
SRX10568336_SRX10568336	Trophectoderm	6

228 rows × 2 columns

# List the distinct lineage labels before harmonising them.
yanagida_reannotation['lineage'].unique()

array(['Trophectoderm', 'Epiblast', 'Unknown', 'Early Trophectoderm',
       'Inner Cell Mass', 'Inner Cell Mass-Trophectoderm Transition',
       'Primitive Endoderm'], dtype=object)

# Map the two dataset-specific lineage labels onto the shared vocabulary;
# every other label is already in its final form.
lineage_renaming = {
    'Early Trophectoderm': 'Trophectoderm',
    'Inner Cell Mass-Trophectoderm Transition': 'Unknown',
}
yanagida_reannotation = yanagida_reannotation.assign(
    lineage=yanagida_reannotation['lineage'].replace(lineage_renaming)
)

# Reorder to (day, lineage) and switch to the column names shared by all
# datasets. The explicit .copy() makes this an independent frame, so adding
# the constant columns below does not raise pandas' SettingWithCopyWarning.
yanagida_reannotation = yanagida_reannotation[['day', 'lineage']].copy()
yanagida_reannotation.columns = ['day', 'ct']
yanagida_reannotation['experiment'] = 'Yanagida_2021'
yanagida_reannotation['technology'] = 'SMARTSeq2'
yanagida_reannotation.head()

	day	ct	experiment	technology
index
SRX10567995_SRX10567995	6	Trophectoderm	Yanagida_2021	SMARTSeq2
SRX10567984_SRX10567984	6	Trophectoderm	Yanagida_2021	SMARTSeq2
SRX10568025_SRX10568025	6	Trophectoderm	Yanagida_2021	SMARTSeq2
SRX10567983_SRX10567983	6	Trophectoderm	Yanagida_2021	SMARTSeq2
SRX10567987_SRX10567987	7	Trophectoderm	Yanagida_2021	SMARTSeq2

# Use the harmonised table (day, ct, experiment, technology) as the per-cell metadata.
yanagida.obs = yanagida_reannotation

# normalize_smartseq and gene_lengths are defined outside this section.
# NOTE(review): presumably length-normalises on the genes shared with gene_lengths — confirm.
normalize_smartseq(yanagida, gene_lengths)

SMART-SEQ: Normalization
SMART-SEQ: Common genes 62663

AnnData object with n_obs × n_vars = 228 × 62663
    obs: 'day', 'ct', 'experiment', 'technology'

# Drop cells with fewer than 10 total counts or fewer than 10 detected genes.
sc.pp.filter_cells(yanagida, min_counts=10)
sc.pp.filter_cells(yanagida, min_genes=10)
# Keep a copy of X as it is before library-size scaling and log-transform.
# NOTE(review): X has already passed through normalize_smartseq, so "counts" may not hold raw counts — confirm.
yanagida.layers["counts"] = yanagida.X.copy()
# Scale every cell to a total of 10,000, then apply log(1 + x).
sc.pp.normalize_total(yanagida, target_sum=10_000)
sc.pp.log1p(yanagida)
# Store the log-normalised object in .raw.
yanagida.raw = yanagida

1.8 Xue 2013

# Reprocessed expression data for the Xue 2013 dataset.
xue_h5ad = sc.read_h5ad("../data/external/human/xue_2013_reprocessed.h5ad")

# GEO series matrix for GSE44183 (platform GPL11154), transposed so that each
# row is one sample.
XUE_URL = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44183/matrix/GSE44183-GPL11154_series_matrix.txt.gz'
xue_metadata = pd.read_table(XUE_URL, skiprows=36, index_col=0).T

# Keep only samples whose source name is one of these stages, and only the
# accession and source-name columns.
xue_source_names = [
    'oocyte',
    'pronucleus',
    'zygote',
    '2-cell blastomere',
    '4-cell blastomere',
    '8-cell blastomere',
    'morula',
]
has_wanted_source = xue_metadata['!Sample_source_name_ch1'].isin(xue_source_names)
xue_metadata = xue_metadata.loc[has_wanted_source, ['!Sample_geo_accession', '!Sample_source_name_ch1']].copy()
# GEO source names -> stage labels used throughout this notebook.
reannotate_dict = {
    'oocyte': 'Oocyte',
    'pronucleus': 'Pronucleus',
    'zygote': 'Zygote',
    '2-cell blastomere': '2C',
    '4-cell blastomere': '4C',
    '8-cell blastomere': '8C',
    'morula': 'Morula',
}
# Applied to the whole frame: any cell value equal to a key is swapped for its label.
xue_metadata = xue_metadata.replace(to_replace=reannotate_dict)

xue = xue_h5ad.copy()

# Join each cell to its GEO record via sample_alias; reset_index/set_index
# carry the obs index through the merge as the 'index' column.
xue.obs = (
    xue.obs[['sample', 'sample_alias']]
    .reset_index()
    .merge(xue_metadata, left_on='sample_alias', right_on='!Sample_geo_accession')
    .set_index('index')
)

This dataset contains an additional pronuclear stage. According to Capmany et al. (1996), the average time of pronuclei formation is 8 h post-IVF (8/24 ≈ 0.33 days). We therefore annotate these cells as {'EmbryonicDay': 0.33, 'Lineage': 'Pronucleus'}.

# Show the merged per-cell metadata, including the stage label taken from GEO.
xue.obs

	sample	sample_alias	!Sample_geo_accession	!Sample_source_name_ch1
index
SRX300891_SRX300891	SRX300891	GSM1160130	GSM1160130	8C
SRX300889_SRX300889	SRX300889	GSM1160128	GSM1160128	8C
SRX300873_SRX300873	SRX300873	GSM1160112	GSM1160112	Oocyte
SRX300899_SRX300899	SRX300899	GSM1160138	GSM1160138	Morula
SRX300883_SRX300883	SRX300883	GSM1160122	GSM1160122	2C
SRX300895_SRX300895	SRX300895	GSM1160134	GSM1160134	8C
SRX300892_SRX300892	SRX300892	GSM1160131	GSM1160131	8C
SRX300901_SRX300901	SRX300901	GSM1160140	GSM1160140	Morula
SRX300900_SRX300900	SRX300900	GSM1160139	GSM1160139	Morula
SRX300885_SRX300885	SRX300885	GSM1160124	GSM1160124	4C
SRX300875_SRX300875	SRX300875	GSM1160114	GSM1160114	Oocyte
SRX300897_SRX300897	SRX300897	GSM1160136	GSM1160136	8C
SRX300879_SRX300879	SRX300879	GSM1160118	GSM1160118	Zygote
SRX300896_SRX300896	SRX300896	GSM1160135	GSM1160135	8C
SRX300890_SRX300890	SRX300890	GSM1160129	GSM1160129	8C
SRX300894_SRX300894	SRX300894	GSM1160133	GSM1160133	8C
SRX300881_SRX300881	SRX300881	GSM1160120	GSM1160120	2C
SRX300874_SRX300874	SRX300874	GSM1160113	GSM1160113	Oocyte
SRX300887_SRX300887	SRX300887	GSM1160126	GSM1160126	4C
SRX300880_SRX300880	SRX300880	GSM1160119	GSM1160119	Zygote
SRX300893_SRX300893	SRX300893	GSM1160132	GSM1160132	8C
SRX300878_SRX300878	SRX300878	GSM1160117	GSM1160117	Pronucleus
SRX300888_SRX300888	SRX300888	GSM1160127	GSM1160127	8C
SRX300884_SRX300884	SRX300884	GSM1160123	GSM1160123	4C
SRX300876_SRX300876	SRX300876	GSM1160115	GSM1160115	Pronucleus
SRX300882_SRX300882	SRX300882	GSM1160121	GSM1160121	2C
SRX300886_SRX300886	SRX300886	GSM1160125	GSM1160125	4C
SRX300877_SRX300877	SRX300877	GSM1160116	GSM1160116	Pronucleus

# Working copy holding only the stage label and the GEO alias of each cell.
xue_reannotation = xue.obs[['!Sample_source_name_ch1', 'sample_alias']].rename(
    columns={'!Sample_source_name_ch1': 'Lineage', 'sample_alias': 'alias'}
)

xue_reannotation

	Lineage	alias
index
SRX300891_SRX300891	8C	GSM1160130
SRX300889_SRX300889	8C	GSM1160128
SRX300873_SRX300873	Oocyte	GSM1160112
SRX300899_SRX300899	Morula	GSM1160138
SRX300883_SRX300883	2C	GSM1160122
SRX300895_SRX300895	8C	GSM1160134
SRX300892_SRX300892	8C	GSM1160131
SRX300901_SRX300901	Morula	GSM1160140
SRX300900_SRX300900	Morula	GSM1160139
SRX300885_SRX300885	4C	GSM1160124
SRX300875_SRX300875	Oocyte	GSM1160114
SRX300897_SRX300897	8C	GSM1160136
SRX300879_SRX300879	Zygote	GSM1160118
SRX300896_SRX300896	8C	GSM1160135
SRX300890_SRX300890	8C	GSM1160129
SRX300894_SRX300894	8C	GSM1160133
SRX300881_SRX300881	2C	GSM1160120
SRX300874_SRX300874	Oocyte	GSM1160113
SRX300887_SRX300887	4C	GSM1160126
SRX300880_SRX300880	Zygote	GSM1160119
SRX300893_SRX300893	8C	GSM1160132
SRX300878_SRX300878	Pronucleus	GSM1160117
SRX300888_SRX300888	8C	GSM1160127
SRX300884_SRX300884	4C	GSM1160123
SRX300876_SRX300876	Pronucleus	GSM1160115
SRX300882_SRX300882	2C	GSM1160121
SRX300886_SRX300886	4C	GSM1160125
SRX300877_SRX300877	Pronucleus	GSM1160116

# Embryonic day assigned to each stage label, listed in developmental order.
embryonictime_annotation = dict(
    (
        ('Oocyte', 0),
        ('Pronucleus', 0.33),  # 8 h post-IVF expressed in days
        ('Zygote', 0.75),
        ('2C', 1.25),
        ('4C', 2),
        ('8C', 3),
        ('Morula', 4),
    )
)

# Look up the embryonic day for every cell's stage label.
xue_stage_labels = xue_reannotation['Lineage']
xue_reannotation['EmbryonicDay'] = xue_stage_labels.map(embryonictime_annotation)

# Reduce to (day, stage) and switch to the column names shared by all
# datasets. The explicit .copy() makes this an independent frame, so adding
# the constant columns below does not raise pandas' SettingWithCopyWarning.
xue_reannotation = xue_reannotation[['EmbryonicDay', 'Lineage']].copy()
xue_reannotation.columns = ['day', 'ct']
xue_reannotation['experiment'] = 'Xue_2013'
xue_reannotation['technology'] = 'Tang2009'
xue_reannotation.head()

	day	ct	experiment	technology
index
SRX300891_SRX300891	3.00	8C	Xue_2013	Tang2009
SRX300889_SRX300889	3.00	8C	Xue_2013	Tang2009
SRX300873_SRX300873	0.00	Oocyte	Xue_2013	Tang2009
SRX300899_SRX300899	4.00	Morula	Xue_2013	Tang2009
SRX300883_SRX300883	1.25	2C	Xue_2013	Tang2009

# Use the harmonised table (day, ct, experiment, technology) as the per-cell metadata.
xue.obs = xue_reannotation

# normalize_smartseq and gene_lengths are defined outside this section.
# NOTE(review): presumably length-normalises on the genes shared with gene_lengths — confirm.
normalize_smartseq(xue, gene_lengths)

SMART-SEQ: Normalization
SMART-SEQ: Common genes 62663

AnnData object with n_obs × n_vars = 28 × 62663
    obs: 'day', 'ct', 'experiment', 'technology'

# Drop cells with fewer than 10 total counts or fewer than 10 detected genes.
sc.pp.filter_cells(xue, min_counts=10)
sc.pp.filter_cells(xue, min_genes=10)
# Keep a copy of X as it is before library-size scaling and log-transform.
# NOTE(review): X has already passed through normalize_smartseq, so "counts" may not hold raw counts — confirm.
xue.layers["counts"] = xue.X.copy()
# Scale every cell to a total of 10,000, then apply log(1 + x).
sc.pp.normalize_total(xue, target_sum=10_000)
sc.pp.log1p(xue)
# Store the log-normalised object in .raw.
xue.raw = xue

2 Merge Datasets

# Every per-study AnnData prepared above, in concatenation order.
list_of_datasets = [meistermann, petropoulos, xiang, yan, yanagida, xue]

human_adata = ad.concat(list_of_datasets)

# Some studies carry 'day' as text, so coerce the combined column to numbers.
human_adata.obs['day'] = pd.to_numeric(human_adata.obs['day'])

human_adata

AnnData object with n_obs × n_vars = 2547 × 62754
    obs: 'day', 'ct', 'experiment', 'technology', 'n_counts', 'n_genes'
    layers: 'counts'

2.1 Reannotation

2.1.0.1 Concatenated cell type and embryonic day

# Fine-grained label: "<cell type>_<embryonic day>".
human_adata.obs['ct_fine'] = (
    human_adata.obs['ct'].astype(str) + '_' + human_adata.obs['day'].astype(str)
)

# Unknown cells keep a single label instead of being split by day.
is_unknown_ct = human_adata.obs['ct'] == 'Unknown'
human_adata.obs.loc[is_unknown_ct, 'ct_fine'] = 'Unknown'

human_adata.obs

	day	ct	experiment	technology	n_counts	n_genes	ct_fine
index
ERX3015937_ERX3015937	5.00	Unknown	Meistermann_2021	SMARTSeq2	708313.0	5761	Unknown
ERX3015939_ERX3015939	5.00	Unknown	Meistermann_2021	SMARTSeq2	402557.0	5689	Unknown
ERX3015940_ERX3015940	5.00	Unknown	Meistermann_2021	SMARTSeq2	511338.0	6039	Unknown
ERX3015941_ERX3015941	5.00	Unknown	Meistermann_2021	SMARTSeq2	994383.0	8383	Unknown
ERX3015936_ERX3015936	5.00	Unknown	Meistermann_2021	SMARTSeq2	1389486.0	7762	Unknown
...	...	...	...	...	...	...	...
SRX300884_SRX300884	2.00	4C	Xue_2013	Tang2009	13308292.0	14096	4C_2.0
SRX300876_SRX300876	0.33	Pronucleus	Xue_2013	Tang2009	16438437.0	16542	Pronucleus_0.33
SRX300882_SRX300882	1.25	2C	Xue_2013	Tang2009	11549318.0	12071	2C_1.25
SRX300886_SRX300886	2.00	4C	Xue_2013	Tang2009	10497600.0	7149	4C_2.0
SRX300877_SRX300877	0.33	Pronucleus	Xue_2013	Tang2009	13025184.0	17779	Pronucleus_0.33

2547 rows × 7 columns

2.1.0.2 Remove Day 12 and Day 14 cells

# Keep only cells from before embryonic day 12.
is_before_day_12 = human_adata.obs['day'] < 12
human_adata = human_adata[is_before_day_12].copy()

2.1.0.3 Set 4C and earlier stages as ‘Prelineage’

# Every cell up to and including day 2 is pooled as 'Prelineage',
# overriding whatever ct_fine label it had.
is_prelineage = human_adata.obs['day'] <= 2
human_adata.obs.loc[is_prelineage, 'ct_fine'] = 'Prelineage'

human_adata.obs['ct_fine'].value_counts()

Unknown                    609
Trophectoderm_7.0          462
Trophectoderm_6.0          403
Trophectoderm_5.0          246
Inner Cell Mass_5.0         87
Epiblast_6.0                76
Trophectoderm_10.0          60
Trophectoderm_9.0           53
Epiblast_7.0                46
Trophectoderm_8.0           46
Prelineage                  39
Inner Cell Mass_6.0         33
Primitive Endoderm_7.0      32
8C_3.0                      30
Morula_4.0                  19
Primitive Endoderm_6.0      19
Inner Cell Mass_7.0         18
Epiblast_10.0               14
Epiblast_8.0                11
Epiblast_9.0                10
Primitive Endoderm_10.0      3
Primitive Endoderm_9.0       3
Primitive Endoderm_8.0       2
Inner Cell Mass_9.0          2
Name: ct_fine, dtype: int64

2.1.0.4 Combine Epiblast E8, E9 and E10 into Late Epiblast

# Epiblast cells from day 8 onwards share one label.
from_day_8 = human_adata.obs['day'] >= 8
is_epiblast = human_adata.obs['ct'] == 'Epiblast'
human_adata.obs.loc[from_day_8 & is_epiblast, 'ct_fine'] = 'Late epiblast'

2.1.0.5 Combine PrE from all days into PrE

# Primitive endoderm is not split by day.
is_primitive_endoderm = human_adata.obs['ct'] == 'Primitive Endoderm'
human_adata.obs.loc[is_primitive_endoderm, 'ct_fine'] = 'Primitive Endoderm'

2.1.0.6 Combine all ICM into one category

# Inner cell mass is not split by day.
is_inner_cell_mass = human_adata.obs['ct'] == 'Inner Cell Mass'
human_adata.obs.loc[is_inner_cell_mass, 'ct_fine'] = 'Inner Cell Mass'

2.2 Write out human data

# Final tally of cells per fine-grained label.
human_adata.obs['ct_fine'].value_counts()

Unknown               609
Trophectoderm_7.0     462
Trophectoderm_6.0     403
Trophectoderm_5.0     246
Inner Cell Mass       140
Epiblast_6.0           76
Trophectoderm_10.0     60
Primitive Endoderm     59
Trophectoderm_9.0      53
Epiblast_7.0           46
Trophectoderm_8.0      46
Prelineage             39
Late epiblast          35
8C_3.0                 30
Morula_4.0             19
Name: ct_fine, dtype: int64

# Save the merged, re-annotated human dataset to disk.
human_adata.write_h5ad('../data/processed/32_human_adata.h5ad')