Quickstart

This page presents a minimal workflow to get you started in using SCCE. The goal is not to produce a ready-to-use predict model but to familiarize yourself with the main SCCE concepts.

Step 0: Install SCCE

conda create -n scce python=3.8 libffi=3.3
conda activate scce
pip install scce

Step 1: Prepare the data

The entire workflow requires the following types of data (Following the quotation marks are the recommended data for this workflow):

  1. scHi-C metadata: Lee2019 metadata(partial)

  2. scHi-C dataset: Lee2019

  3. scHi-C PCA data: BandNorm’s results(partial)

  4. annotation file:

wget -c https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz
gzip -d gencode.v19.annotation.gtf.gz
  1. scRNA metadata: Bakken et al. Human M1 – 10x Genomics (partial)

  2. scRNA expression matrix: Bakken et al. Human M1 – 10x Genomics (partial)

Step 2: Construct map

preprocess

from scce import preprocess

preprocess.hic_process(
    metadata_path,
    hic_folder_path,
    hic_output_path,
    column_names=dict(id="sample_name", cell_type="cell_type"),
    cell_types=["Astro", "Endo", "ODC", "OPC"],
    resolution=10000,
    n_jobs=8,
)

preprocess.rna_process(
    metadata_path,
    matrix_path,
    rna_output_path,
    column_names=dict(id="sample_name", cell_type="subclass_label"),
    cell_types=["Astro", "Endo", "Oligo", "OPC"],
)

integrate

from scce import integrate, plot

data_tool = integrate.DataTool(hic_output_path, rna_output_path, cell_types=["Astro", "Endo", "Oligo", "ODC", "OPC"])
data_tool.add_hic_pca(hic_pca_path)
data_tool.add_gene_annotation(annotation_path, gtf_by="gene_name")

data_tool.hic_pca()
data_tool.rna_pca()
data_tool.rna_highly_variable_genes()
hic, rna = data_tool.get_data()

hic.var["chromStart"], hic.var["chromEnd"] = hic.var["start"], hic.var["end"]
graph = integrate.generate_graph(hic, rna, resolution=10000)
integrate.glue_embedding(hic, rna, graph)
map = integrate.mapping(hic, rna)

hic.write(mapped_hic_path, compression="gzip")
rna.write(mapped_rna_path, compression="gzip")
map.to_csv(mapping_path)

plot.umap(anndata.concat([rna, hic]), dict(color=["cell_type", "domain"]))
_images/integrate_result.png

Integrate Result

Step 3: Train the model

generate dataset

import anndata
import numpy as np
import pandas as pd

from scce.model import DataSetGenerator

map = pd.read_csv(mapping_path, index_col=0)
rna = anndata.read_h5ad(mapped_rna_path)
target_label = "PDGFRA"

generator = DataSetGenerator(map, rna, hic_folder_path, resolution=10000)
generator.generate_by_gene_name([target_label])
dataset = generator.get_dataset()
np.save(dataset_path, dataset)

train

import numpy as np

from scce.model import build, predict

dataset = np.load(dataset_path, allow_pickle=True).item()
target_label = "PDGFRA"

train_dataset, eval_dataset = dataset["train"], dataset["eval"]
build(dict(train=train_dataset, eval=eval_dataset), target_label=target_label)

Function build will create a .scce folder on the path where the command is executed to save the model checkpoint

Step 4: Predict

import numpy as np

from scce.model import predict

eval_dataset = np.load(dataset_path, allow_pickle=True).item()["eval"]
target_label = "PDGFRA"

RNA_values = [data["scRNA"] for data in eval_dataset]
evaluate_predict = predict(target_label, RNA_values)

Step 5: Analyse

calculate chromatin remodeling factors score

import numpy as np

from scce.analyse import ig_attribute

eval_dataset = np.load(dataset_path, allow_pickle=True).item()["eval"]
target_label = "PDGFRA"

RNA_values = [data["scRNA"] for data in eval_dataset if data["cell_type"] == "OPC"]
gene_names = eval_dataset[0]["scRNA_head"]
ChRFs_score = ig_attribute(target_label, RNA_values, gene_names)
ChRFs_score = ChRFs_score.sort_values(by="score", ascending=False)

calculate chromatin interaction diffusion under chromatin remodeling

from scce.analyse import interaction_diffusion
from scce.plot.base import heatmap

# evaluate_predict: calculated by scce.model.predict (shape: (N, M))
# pseudotimes: pseudo-time for each sample (shape: (N,))
x, y, value = interaction_diffusion(
    hics=evaluate_predict,
    pseudotimes=pseudotimes,
    cell_length=0.5,
)
levels = np.linspace(min(value[~np.isnan(value)]), max(value[~np.isnan(value)]), 15)

heatmap(
    x,
    y,
    value,
    levels,
    cbar_label="pseudo-time",
)
_images/chromatin_remodeling_result.png

Interaction Diffusion