Tutorial 1: prepare the preprocessed scRNA-seq data
[1]:
# Work from the CANAL project root. The absolute path is specific to the
# machine this notebook was run on and must be adapted elsewhere.
import os
os.chdir('/data/wanh/CANAL/')
# Make the project directory importable; presumably this is where the
# `preprocess` module lives — TODO confirm.
import sys
sys.path.append('/data/wanh/CANAL/')
# Star import: `gene_align` and `normalize`, which are called in later cells,
# are not defined in this notebook and presumably come from here — verify
# against preprocess.py.
from preprocess import *
import scanpy as sc, numpy as np, pandas as pd, anndata as ad
from scipy import sparse
import csv
conduct gene alignment with the Panglao dataset
[2]:
# Read the human pancreas .h5ad file and label every cell as human.
data_path_human = "/data/wanh/CANAL/data/Pancreas/ALIGNED_Homo_sapiens_Pancreas.h5ad"
adata_human = sc.read(data_path_human)
adata_human.obs["organism"] = "human"
data = adata_human  # second name for the same object, not a copy

# Inspect the loaded object: once next to its gene names, once next to its
# per-gene annotation table.
print(adata_human, adata_human.var_names)
print(data, data.var)

# Run the gene alignment, passing the dataset's own gene names as the second
# argument. In the recorded output this step reports the number of
# overlapping genes and returns an object with fewer variables.
obj = data.var_names.tolist()
new_data = gene_align(data, obj)
print(new_data)
AnnData object with n_obs × n_vars = 14043 × 28421
obs: 'donor', 'cell_type1', 'library', 'organism', 'dataset_name', 'platform', 'organ', 'data_type', 'cell_ontology_class', 'cell_ontology_id', 'n_genes', 'n_counts', '__libsize__', 'baron_human_donor', 'age', 'gender', 'enge_donor', 'batch', 'muraro_donor'
var: 'variable_genes-0', 'variable_genes-1', 'variable_genes-2', 'variable_genes-3', 'variable_genes'
uns: 'baron_human_donor_colors', 'cell_ontology_class_colors', 'cell_type1_colors', 'dataset_name_colors', 'enge_donor_colors', 'muraro_donor_colors', 'neighbors', 'umap'
obsm: 'X_umap', 'latent'
obsp: 'connectivities', 'distances' Index(['1/2-SBSRNA4', 'A1BG', 'A1BG-AS1', 'A1CF', 'A2LD1', 'A2M', 'A2M-AS1',
'A2ML1', 'A2MP1', 'A3GALT2',
...
'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX', 'ZZEF1', 'ZZZ3',
'pk', 'tAKR'],
dtype='object', length=28421)
AnnData object with n_obs × n_vars = 14043 × 28421
obs: 'donor', 'cell_type1', 'library', 'organism', 'dataset_name', 'platform', 'organ', 'data_type', 'cell_ontology_class', 'cell_ontology_id', 'n_genes', 'n_counts', '__libsize__', 'baron_human_donor', 'age', 'gender', 'enge_donor', 'batch', 'muraro_donor'
var: 'variable_genes-0', 'variable_genes-1', 'variable_genes-2', 'variable_genes-3', 'variable_genes'
uns: 'baron_human_donor_colors', 'cell_ontology_class_colors', 'cell_type1_colors', 'dataset_name_colors', 'enge_donor_colors', 'muraro_donor_colors', 'neighbors', 'umap'
obsm: 'X_umap', 'latent'
obsp: 'connectivities', 'distances' variable_genes-0 variable_genes-1 variable_genes-2 \
1/2-SBSRNA4 False False False
A1BG False False False
A1BG-AS1 False False False
A1CF False False False
A2LD1 False False False
... ... ... ...
ZYX False True False
ZZEF1 False False False
ZZZ3 False False False
pk False False False
tAKR False False False
variable_genes-3 variable_genes
1/2-SBSRNA4 False False
A1BG False False
A1BG-AS1 False False
A1CF False False
A2LD1 False False
... ... ...
ZYX False True
ZZEF1 False False
ZZZ3 False False
pk False False
tAKR False False
[28421 rows x 5 columns]
/data/wanh/ENTER/envs/pytorch/lib/python3.7/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
utils.warn_names_duplicates("obs")
number of overlapping genes: 16895
AnnData object with n_obs × n_vars = 14043 × 16906
obs: 'donor', 'cell_type1', 'library', 'organism', 'dataset_name', 'platform', 'organ', 'data_type', 'cell_ontology_class', 'cell_ontology_id', 'n_genes', 'n_counts', '__libsize__', 'baron_human_donor', 'age', 'gender', 'enge_donor', 'batch', 'muraro_donor', 'celltype'
uns: 'log1p'
normalize the aligned scRNA-seq dataset
[3]:
# Normalize the gene-aligned data. In the recorded output of the following
# cell, the result carries 1000 variables plus highly-variable-gene
# annotations. NOTE(review): the meaning of the "Pancreas" argument and of
# `dir` is defined in preprocess.normalize — confirm there.
new = normalize(
    new_data,
    "Pancreas",
    dir="/data/wanh/CANAL/data/Pancreas_test",
)
WARNING: adata.X seems to be already log-transformed.
save the preprocessed datasets (one .h5ad file per source dataset)
[4]:
# Split the normalized data by source dataset and save each subset as its own
# .h5ad file under /data/wanh/CANAL/data/<experiments>/.
experiments = "Pancreas_test"
dataset_names = ["Muraro", "Enge", "Baron_human", "Segerstolpe"]

# Per-cell dataset label, computed once and reused for every boolean mask.
dataset_labels = np.array(new.obs["dataset_name"])

data_subsets = []
for position, dataset in enumerate(dataset_names, start=1):
    # Slicing an AnnData returns a view of `new`. Writing a view makes anndata
    # modify `.obs` and emit an ImplicitModificationWarning, so materialize the
    # slice with .copy() first. The printed repr therefore reads
    # "AnnData object" rather than "View of AnnData object".
    data_subset = new[dataset_labels == dataset].copy()
    print("dataset {}:".format(position), dataset)
    print(data_subset)
    # Cell-type composition of this dataset: (labels, counts).
    print(np.unique(data_subset.obs['cell_type1'], return_counts=True))
    data_subset.write("/data/wanh/CANAL/data/{}/{}.h5ad".format(experiments, dataset))
    data_subsets.append(data_subset)

# Keep the original per-dataset variable names available for any later cells.
dataset1, dataset2, dataset3, dataset4 = dataset_names
data_subset1, data_subset2, data_subset3, data_subset4 = data_subsets
dataset 1: Muraro
View of AnnData object with n_obs × n_vars = 2122 × 1000
obs: 'donor', 'cell_type1', 'library', 'organism', 'dataset_name', 'platform', 'organ', 'data_type', 'cell_ontology_class', 'cell_ontology_id', 'n_genes', 'n_counts', '__libsize__', 'baron_human_donor', 'age', 'gender', 'enge_donor', 'batch', 'muraro_donor', 'celltype'
var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'log1p', 'hvg'
(array(['acinar', 'alpha', 'beta', 'delta', 'ductal', 'endothelial',
'epsilon', 'gamma', 'mesenchymal'], dtype=object), array([219, 812, 448, 193, 245, 21, 3, 101, 80]))
dataset 2: Enge
View of AnnData object with n_obs × n_vars = 2282 × 1000
obs: 'donor', 'cell_type1', 'library', 'organism', 'dataset_name', 'platform', 'organ', 'data_type', 'cell_ontology_class', 'cell_ontology_id', 'n_genes', 'n_counts', '__libsize__', 'baron_human_donor', 'age', 'gender', 'enge_donor', 'batch', 'muraro_donor', 'celltype'
var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'log1p', 'hvg'
(array(['acinar', 'alpha', 'beta', 'delta', 'ductal', 'mesenchymal'],
dtype=object), array([411, 998, 348, 83, 389, 53]))
dataset 3: Baron_human
View of AnnData object with n_obs × n_vars = 8569 × 1000
obs: 'donor', 'cell_type1', 'library', 'organism', 'dataset_name', 'platform', 'organ', 'data_type', 'cell_ontology_class', 'cell_ontology_id', 'n_genes', 'n_counts', '__libsize__', 'baron_human_donor', 'age', 'gender', 'enge_donor', 'batch', 'muraro_donor', 'celltype'
var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'log1p', 'hvg'
(array(['acinar', 'activated_stellate', 'alpha', 'beta', 'delta', 'ductal',
'endothelial', 'epsilon', 'gamma', 'macrophage', 'mast',
'quiescent_stellate', 'schwann', 't_cell'], dtype=object), array([ 958, 284, 2326, 2525, 601, 1077, 252, 18, 255, 55, 25,
173, 13, 7]))
/data/wanh/ENTER/envs/pytorch/lib/python3.7/site-packages/anndata/_core/anndata.py:1235: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
df[key] = c
dataset 4: Segerstolpe
View of AnnData object with n_obs × n_vars = 1070 × 1000
obs: 'donor', 'cell_type1', 'library', 'organism', 'dataset_name', 'platform', 'organ', 'data_type', 'cell_ontology_class', 'cell_ontology_id', 'n_genes', 'n_counts', '__libsize__', 'baron_human_donor', 'age', 'gender', 'enge_donor', 'batch', 'muraro_donor', 'celltype'
var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'log1p', 'hvg'
(array(['MHC class II', 'PSC', 'acinar', 'alpha', 'beta', 'delta',
'ductal', 'endothelial', 'epsilon', 'gamma', 'mast',
'unclassified endocrine'], dtype=object), array([ 1, 23, 112, 443, 171, 59, 135, 13, 5, 75, 4, 29]))
[ ]: