Analysis flow#

Here, we’ll track typical data transformations that occur during analysis, such as subsetting.

For a more general introduction, read this first: Project flow.

Setup#

# a lamindb instance containing Bionty schema
!lamin init --storage ./analysis-usecase --schema bionty

import lamindb as ln
import bionty as bt
from lamin_utils import logger

# NOTE(review): presumably stops bionty from also saving the ontology parents
# of every saved record — confirm against the bionty settings documentation
bt.settings.auto_save_parents = False

💡 connected lamindb: testuser1/analysis-usecase

Register an initial dataset#

Here we register an initial artifact with a pipeline script.

# register_example_file.py


def register_example_file():
    """Register the example AnnData as an artifact annotated with features and labels.

    The run is tracked under its own "register example artifact" pipeline
    transform. Genes, obs features and the cell type, tissue and disease
    labels are saved first; the artifact is then saved and linked to them.
    """
    # track this registration under a dedicated pipeline transform
    pipeline = ln.Transform(
        name="register example artifact",
        type="pipeline",
        version="0.0.1",
    )
    ln.track(transform=pipeline)

    # example dataset carrying a few cell type, tissue and disease annotations
    example = ln.core.datasets.anndata_with_obs()
    obs = example.obs

    # validate and register features: genes from var, columns from obs
    gene_records = bt.Gene.from_values(
        example.var_names,
        bt.Gene.ensembl_gene_id,
        organism="human",
    )
    ln.save(gene_records)
    ln.save(ln.Feature.from_df(obs))

    # validate and register labels
    cell_type_records = bt.CellType.from_values(obs["cell_type"])
    ln.save(cell_type_records)
    tissue_records = bt.Tissue.from_values(obs["tissue"])
    ln.save(tissue_records)
    disease_records = bt.Disease.from_values(obs["disease"])
    ln.save(disease_records)

    # register the artifact, then annotate it with features & labels
    registered = ln.Artifact.from_anndata(example, description="anndata with obs")
    registered.save()
    registered.features.add_from_anndata(
        var_field=bt.Gene.ensembl_gene_id,
        organism="human",
    )
    feature_lookup = ln.Feature.lookup()
    labels_by_feature = (
        (cell_type_records, feature_lookup.cell_type),
        (tissue_records, feature_lookup.tissue),
        (disease_records, feature_lookup.disease),
    )
    for records, feature in labels_by_feature:
        registered.labels.add(records, feature)


# run the registration defined above so the artifact exists before it is queried
register_example_file()

Pull the registered dataset, apply a transformation, and register the result#

Set the current notebook as the new transform:

# identify this notebook as a transform by stem uid and version,
# then start tracking a run of it
ln.transform.stem_uid = "eNef4Arw8nNM"
ln.transform.version = "0"
ln.track()

💡 notebook imports: bionty==0.42.4 lamin_utils==0.13.1 lamindb==0.69.2

💡 saved: Transform(uid='eNef4Arw8nNM6K79', name='Analysis flow', key='analysis-flow', version='0', type=notebook, updated_at=2024-03-28 12:09:52 UTC, created_by_id=1)

💡 saved: Run(uid='YSt3dQkx9YY7pXNtbg3S', transform_id=2, created_by_id=1)

# fetch the artifact registered by the pipeline via its description
# NOTE(review): .one() presumably requires exactly one match — confirm
artifact = ln.Artifact.filter(description="anndata with obs").one()

# print the artifact's provenance, features and labels
artifact.describe()

Artifact(uid='BOEVc19j09xX56uYFexI', suffix='.h5ad', accessor='AnnData', description='anndata with obs', size=46992, hash='IJORtcQUSS11QBqD-nTD0A', hash_type='md5', visibility=1, key_is_virtual=True, updated_at=2024-03-28 12:09:52 UTC)

Provenance:
  🗃️ storage: Storage(uid='rWYQz155', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/analysis-usecase', type='local', updated_at=2024-03-28 12:09:45 UTC, created_by_id=1)
  🧩 transform: Transform(uid='Ht1XNJvI07qtbNGF', name='register example artifact', version='0.0.1', type='pipeline', updated_at=2024-03-28 12:09:47 UTC, created_by_id=1)
  👣 run: Run(uid='Mzx7c7qikuAWBpY6zCIZ', started_at=2024-03-28 12:09:47 UTC, is_consecutive=True, transform_id=1, created_by_id=1)
  👤 created_by: User(uid='DzTjkKse', handle='testuser1', name='Test User1', updated_at=2024-03-28 12:09:45 UTC)
Features:
  var: FeatureSet(uid='pmzGjyRBFmHUGNAhMb9c', n=99, type='number', registry='bionty.Gene', hash='-frOq7J0bik-J7Ad9DX7', updated_at=2024-03-28 12:09:52 UTC, created_by_id=1)
    'TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112', 'FGR', 'CFH', 'FUCA2', 'GCLC', 'NFYA', 'STPG1', 'NIPAL3', 'LAS1L', 'ENPP4', 'SEMA3F', 'CFTR', 'ANKIB1', 'CYP51A1', 'KRIT1', 'RAD52', ...
  obs: FeatureSet(uid='dyxlMhc79LibVV60qrZz', n=4, registry='core.Feature', hash='X3cCYYbm61DHhARy4do8', updated_at=2024-03-28 12:09:52 UTC, created_by_id=1)
    🔗 cell_type (3, bionty.CellType): 'T cell', 'hematopoietic stem cell', 'hepatocyte'
    cell_type_id (category)
    🔗 tissue (4, bionty.Tissue): 'kidney', 'liver', 'heart', 'brain'
    🔗 disease (4, bionty.Disease): 'chronic kidney disease', 'liver lymphoma', 'cardiac ventricle disorder', 'Alzheimer disease'
Labels:
  🏷️ tissues (4, bionty.Tissue): 'kidney', 'liver', 'heart', 'brain'
  🏷️ cell_types (3, bionty.CellType): 'T cell', 'hematopoietic stem cell', 'hepatocyte'
  🏷️ diseases (4, bionty.Disease): 'chronic kidney disease', 'liver lymphoma', 'cardiac ventricle disorder', 'Alzheimer disease'

Get a backed AnnData object#

# open the stored .h5ad through an accessor object
# NOTE(review): presumably avoids loading the full dataset into memory — confirm
adata = artifact.backed()
adata

AnnDataAccessor object with n_obs × n_vars = 40 × 100
  constructed for the AnnData object BOEVc19j09xX56uYFexI.h5ad
    obs: ['_index', 'cell_type', 'cell_type_id', 'disease', 'tissue']
    var: ['_index']

Subset dataset to specific cell types and diseases#

# lookup objects over the cell types and diseases linked to this artifact
# NOTE(review): return_field="name" presumably makes each attribute resolve to
# the label's name, so it can be compared against obs values — confirm
cell_types = artifact.cell_types.all().lookup(return_field="name")
diseases = artifact.diseases.all().lookup(return_field="name")

Create the subset:

# name the cell types and diseases to keep, then require both conditions per observation
selected_cell_types = [cell_types.t_cell, cell_types.hematopoietic_stem_cell]
selected_diseases = [diseases.liver_lymphoma, diseases.chronic_kidney_disease]
has_cell_type = adata.obs.cell_type.isin(selected_cell_types)
has_disease = adata.obs.disease.isin(selected_diseases)
subset_obs = has_cell_type & has_disease

# apply the boolean mask to the backed object
adata_subset = adata[subset_obs]
adata_subset

AnnDataAccessorSubset object with n_obs × n_vars = 20 × 100
  obs: ['_index', 'cell_type', 'cell_type_id', 'disease', 'tissue']
  var: ['_index']

# count observations per (cell_type, disease) combination to check the subset
adata_subset.obs[["cell_type", "disease"]].value_counts()

cell_type                disease               
T cell                   chronic kidney disease    10
hematopoietic stem cell  liver lymphoma            10
dtype: int64

# materialize the subset in memory, then create a new artifact from it
subset_in_memory = adata_subset.to_memory()
file_subset = ln.Artifact.from_anndata(
    subset_in_memory, description="anndata with obs subset"
)

/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/anndata/_core/anndata.py:1908: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")

# save the subset artifact, then annotate it
file_subset.save()

# link gene features, validated against Ensembl gene ids
file_subset.features.add_from_anndata(
    organism="human",  # optionally, globally set organism via bt.settings.organism = "human"
    var_field=bt.Gene.ensembl_gene_id,
)

features = ln.Feature.lookup()

# add the labels found in the subset, one obs column per feature of the same name
for obs_column in ("cell_type", "disease", "tissue"):
    file_subset.labels.add(adata_subset.obs[obs_column], getattr(features, obs_column))

Examine data flow#

Query a subsetted .h5ad artifact containing “hematopoietic stem cell” and “T cell”:

cell_types = bt.CellType.lookup()

# .h5ad artifacts whose description ends with "subset" and that are
# linked to either of the two cell types of interest
wanted_cell_types = [
    cell_types.hematopoietic_stem_cell,
    cell_types.t_cell,
]
subset_query = ln.Artifact.filter(
    suffix=".h5ad",
    description__endswith="subset",
    cell_types__in=wanted_cell_types,
)
my_subset = subset_query.first()

my_subset

Artifact(uid='Nr64h4wqQPxMwqUFMA1O', suffix='.h5ad', accessor='AnnData', description='anndata with obs subset', size=38992, hash='RgGUx7ndRplZZSmalTAWiw', hash_type='md5', visibility=1, key_is_virtual=True, updated_at=2024-03-28 12:09:53 UTC, storage_id=1, transform_id=2, run_id=2, created_by_id=1)

Common questions that might arise are:

What is the history of this artifact?
Which features and labels are associated with it?
Which notebook analyzed and registered this artifact?
By whom?
And which artifact is its parent?

Let’s answer this using LaminDB:

# Answer each provenance question for the subset artifact in turn.

# lineage graph of the artifact
print("--> What is the history of this artifact?\n")
file_subset.view_lineage()

# feature sets and labels linked to the artifact
print("\n\n--> Which features and labels are associated with it?\n")
logger.print(file_subset.features)
logger.print(file_subset.labels)

# the transform that created the artifact
print("\n\n--> Which notebook analyzed and registered this artifact\n")
logger.print(file_subset.transform)

# the user who created the artifact
print("\n\n--> By whom\n")
logger.print(file_subset.created_by)

# inputs of the run that created the artifact, shown as a DataFrame
# NOTE(review): display is not imported here — presumably the IPython/notebook built-in
print("\n\n--> And which artifact is its parent\n")
display(file_subset.run.input_artifacts.df())

--> What is the history of this artifact?

_images/49f13bfe94dc12cdce45cd5acd5f28cc15f857fb6c4c867d36d48886744368ee.svg

--> Which features and labels are associated with it?

Features:
  var: FeatureSet(uid='pmzGjyRBFmHUGNAhMb9c', n=99, type='number', registry='bionty.Gene', hash='-frOq7J0bik-J7Ad9DX7', updated_at=2024-03-28 12:09:52 UTC, created_by_id=1)
    'TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112', 'FGR', 'CFH', 'FUCA2', 'GCLC', 'NFYA', 'STPG1', 'NIPAL3', 'LAS1L', 'ENPP4', 'SEMA3F', 'CFTR', 'ANKIB1', 'CYP51A1', 'KRIT1', 'RAD52', ...
  obs: FeatureSet(uid='dyxlMhc79LibVV60qrZz', n=4, registry='core.Feature', hash='X3cCYYbm61DHhARy4do8', updated_at=2024-03-28 12:09:52 UTC, created_by_id=1)
    🔗 cell_type (2, bionty.CellType): 'T cell', 'hematopoietic stem cell'
    cell_type_id (category)
    🔗 tissue (2, bionty.Tissue): 'kidney', 'liver'
    🔗 disease (2, bionty.Disease): 'chronic kidney disease', 'liver lymphoma'

Labels:
  🏷️ tissues (2, bionty.Tissue): 'kidney', 'liver'
  🏷️ cell_types (2, bionty.CellType): 'T cell', 'hematopoietic stem cell'
  🏷️ diseases (2, bionty.Disease): 'chronic kidney disease', 'liver lymphoma'

--> Which notebook analyzed and registered this artifact

Transform(uid='eNef4Arw8nNM6K79', name='Analysis flow', key='analysis-flow', version='0', type=notebook, updated_at=2024-03-28 12:09:52 UTC, created_by_id=1)

--> By whom

User(uid='DzTjkKse', handle='testuser1', name='Test User1', updated_at=2024-03-28 12:09:45 UTC)

--> And which artifact is its parent

	uid	storage_id	key	suffix	accessor	description	version	size	hash	hash_type	n_objects	n_observations	transform_id	run_id	visibility	key_is_virtual	created_at	updated_at	created_by_id
id
1	BOEVc19j09xX56uYFexI	1	None	.h5ad	AnnData	anndata with obs	None	46992	IJORtcQUSS11QBqD-nTD0A	md5	None	None	1	1	1	True	2024-03-28 12:09:52.293039+00:00	2024-03-28 12:09:52.379584+00:00	1