Register cellxgene-census metadata#

In this notebook, we show how to register all census datasets and metadata using LaminDB in under 10 minutes.

Registered metadata can be readily used for querying, validating, annotating and integrating data, see cellxgene-census.

Ref: cellxgene-census tutorials.

Setup#

!lamin init --storage ./test-census --schema bionty
Hide code cell output
βœ… saved: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-09-25 12:12:09)
βœ… saved: Storage(id='0Arizg6V', root='/home/runner/work/cellxgene-census-lamin/cellxgene-census-lamin/docs/test-census', type='local', updated_at=2023-09-25 12:12:09, created_by_id='DzTjkKse')
💡 loaded instance: testuser1/test-census
💡 did not register local instance on hub (if you want, call `lamin register`)

import lamindb as ln
import lnschema_bionty as lb
import cellxgene_census
💡 loaded instance: testuser1/test-census (lamindb 0.54.1)
lb.settings.species = "human"  # "mouse" for registering metadata of mouse datasets
human = lb.settings.species.scientific_name
ln.track()
πŸ’‘ notebook imports: cellxgene-census==1.6.0 lamin_utils==0.11.2 lamindb==0.54.1 lnschema_bionty==0.31.2
πŸ’‘ Transform(id='gtphmzTiG8wMz8', name='Register cellxgene-census metadata', short_name='01-census-registries', version='0', type=notebook, updated_at=2023-09-25 12:12:15, created_by_id='DzTjkKse')
πŸ’‘ Run(id='VcAuiFFp9YjqaOA3kQA7', run_at=2023-09-25 12:12:15, transform_id='gtphmzTiG8wMz8', created_by_id='DzTjkKse')

Register datasets#

# Pin the census release instead of following the moving "stable" tag: the h5ad
# files registered further down come from a path that hard-codes this same
# release, so the dataset table and the files must refer to one release.
census = cellxgene_census.open_soma(census_version="2023-07-25")
The "stable" release is currently 2023-07-25. Specify 'census_version="2023-07-25"' in future calls to open_soma() to ensure data consistency.
2023-09-25 12:12:17,357:INFO - The "stable" release is currently 2023-07-25. Specify 'census_version="2023-07-25"' in future calls to open_soma() to ensure data consistency.
census
<Collection 's3://cellxgene-data-public/cell-census/2023-07-25/soma/' (open for 'r') (2 items)
    'census_data': 's3://cellxgene-data-public/cell-census/2023-07-25/soma/census_data' (unopened)
    'census_info': 's3://cellxgene-data-public/cell-census/2023-07-25/soma/census_info' (unopened)>
census["census_data"]
<Collection 's3://cellxgene-data-public/cell-census/2023-07-25/soma/census_data' (open for 'r') (2 items)
    'mus_musculus': 's3://cellxgene-data-public/cell-census/2023-07-25/soma/census_data/mus_musculus' (unopened)
    'homo_sapiens': 's3://cellxgene-data-public/cell-census/2023-07-25/soma/census_data/homo_sapiens' (unopened)>
census["census_info"]
<Collection 's3://cellxgene-data-public/cell-census/2023-07-25/soma/census_info' (open for 'r') (3 items)
    'summary': 's3://cellxgene-data-public/cell-census/2023-07-25/soma/census_info/summary' (unopened)
    'summary_cell_counts': 's3://cellxgene-data-public/cell-census/2023-07-25/soma/census_info/summary_cell_counts' (unopened)
    'datasets': 's3://cellxgene-data-public/cell-census/2023-07-25/soma/census_info/datasets' (unopened)>
datasets = census["census_info"]["datasets"].read().concat().to_pandas()
datasets.shape
(593, 8)
datasets.head()
soma_joinid collection_id collection_name collection_doi dataset_id dataset_title dataset_h5ad_path dataset_total_cell_count
0 0 e2c257e7-6f79-487c-b81c-39451cd4ab3c Spatial multiomics map of trophoblast developm... 10.1038/s41586-023-05869-0 f171db61-e57e-4535-a06a-35d8b6ef8f2b donor_p13_trophoblasts f171db61-e57e-4535-a06a-35d8b6ef8f2b.h5ad 31497
1 1 e2c257e7-6f79-487c-b81c-39451cd4ab3c Spatial multiomics map of trophoblast developm... 10.1038/s41586-023-05869-0 ecf2e08e-2032-4a9e-b466-b65b395f4a02 All donors trophoblasts ecf2e08e-2032-4a9e-b466-b65b395f4a02.h5ad 67070
2 2 e2c257e7-6f79-487c-b81c-39451cd4ab3c Spatial multiomics map of trophoblast developm... 10.1038/s41586-023-05869-0 74cff64f-9da9-4b2a-9b3b-8a04a1598040 All donors all cell states (in vivo) 74cff64f-9da9-4b2a-9b3b-8a04a1598040.h5ad 286326
3 3 f7cecffa-00b4-4560-a29a-8ad626b8ee08 Mapping single-cell transcriptomes in the intr... 10.1016/j.ccell.2022.11.001 5af90777-6760-4003-9dba-8f945fec6fdf Single-cell transcriptomic datasets of Renal c... 5af90777-6760-4003-9dba-8f945fec6fdf.h5ad 270855
4 4 3f50314f-bdc9-40c6-8e4a-b0901ebfbe4c Single-cell sequencing links multiregional imm... 10.1016/j.ccell.2021.03.007 bd65a70f-b274-4133-b9dd-0d1431b6af34 Single-cell sequencing links multiregional imm... bd65a70f-b274-4133-b9dd-0d1431b6af34.h5ad 167283
# Build File records from the census h5ad location and save them.
# NOTE(review): the release date (2023-07-25) is hard-coded in this path; it
# presumably has to match the census release opened above — confirm when updating.
files = ln.File.from_dir("s3://cellxgene-data-public/cell-census/2023-07-25/h5ads")
ln.save(files)
Hide code cell output
2023-09-25 12:12:20,852:INFO - Found credentials in environment variables.
# One row per collection, indexed by its collection_id.
collections_df = datasets[
    ["collection_id", "collection_name", "collection_doi"]
].drop_duplicates()
collections_df = collections_df.set_index("collection_id")

# Every collection becomes a ULabel: the DOI goes into the description and the
# collection_id into the reference.
collections = [
    ln.ULabel(
        name=row.collection_name,
        description=row.collection_doi,
        reference=collection_id,
        reference_type="collection_id",
    )
    for collection_id, row in collections_df.iterrows()
]
ln.save(collections)

# Group all collection labels under a single parent label.
is_collection = ln.ULabel(name="is_collection")
is_collection.save()
is_collection.children.set(collections)
# From here on `collections` refers to the parent's children, not to the list.
collections = is_collection.children
# Register one Dataset per row of the census dataset table and label it with
# the collection it belongs to.
files = ln.File.filter()
# Feature under which the collection label is attached to each dataset.
feature = ln.Feature(name="collection", type="category")
feature.save()
# NOTE(review): presumably disables the similar-name search that runs when a
# record is created, to keep the loop below fast — confirm in the lamindb docs.
ln.settings.upon_create_search_names = False

for _, row in datasets.iterrows():
    # Find the registered file whose key ends with "<dataset_id>.h5ad".
    # NOTE(review): .one() presumably fails unless exactly one file matches.
    file = files.filter(key__endswith=f"{row.dataset_id}.h5ad").one()
    dataset = ln.Dataset(
        file,
        name=row.dataset_title,
        reference=row.dataset_id,
        reference_type="cellxgene-census dataset_id",
    )
    dataset.save()
    # Look the collection label up by the collection_id stored as its reference.
    dataset.labels.add(collections.get(reference=row.collection_id), feature)

Register modalities#

Register the “RNA” measurement as a modality:

modality = ln.Modality(name="RNA", description="RNA measurements")
modality.save()
rna = modality.name

Validate and register genes#

census_data = census["census_data"][human]

Gene metadata:

census_data.ms[rna].var.keys()
('soma_joinid', 'feature_id', 'feature_name', 'feature_length')
gene_metadata = census_data.ms[rna].var.read().concat().to_pandas()
gene_metadata.shape
(60664, 4)
gene_metadata.head()
soma_joinid feature_id feature_name feature_length
0 0 ENSG00000121410 A1BG 3999
1 1 ENSG00000268895 A1BG-AS1 3374
2 2 ENSG00000148584 A1CF 9603
3 3 ENSG00000175899 A2M 6318
4 4 ENSG00000245105 A2M-AS1 2948
lb.Gene.inspect(gene_metadata["feature_id"], field=lb.Gene.ensembl_gene_id);
❗ 60664 terms (100.00%) are not validated for ensembl_gene_id: ENSG00000121410, ENSG00000268895, ENSG00000148584, ENSG00000175899, ENSG00000245105, ENSG00000166535, ENSG00000256661, ENSG00000184389, ENSG00000128274, ENSG00000118017, ENSG00000094914, ENSG00000081760, ENSG00000114771, ENSG00000197953, ENSG00000242908, ENSG00000188984, ENSG00000204518, ENSG00000109576, ENSG00000158122, ENSG00000103591, ...
   detected 60517 Gene terms in Bionty for ensembl_gene_id: 'ENSG00000227906', 'ENSG00000222404', 'ENSG00000259216', 'ENSG00000203786', 'ENSG00000189299', 'ENSG00000141933', 'ENSG00000237129', 'ENSG00000207453', 'ENSG00000104907', 'ENSG00000152193', 'ENSG00000224679', 'ENSG00000252498', 'ENSG00000236627', 'ENSG00000257402', 'ENSG00000237200', 'ENSG00000253074', 'ENSG00000271205', 'ENSG00000186684', 'ENSG00000205116', 'ENSG00000166479', ...
→  add records from Bionty to your Gene registry via .from_values()
   couldn't validate 147 terms: 'ENSG00000231575', 'ENSG00000259855', 'ENSG00000277050', 'ENSG00000276814', 'ENSG00000237133', 'ENSG00000278955', 'ENSG00000277666', 'ENSG00000272904', 'ENSG00000285162', 'ENSG00000261490', 'ENSG00000236886', 'ENSG00000254561', 'ENSG00000237838', 'ENSG00000256222', 'ENSG00000256863', 'ENSG00000205485', 'ENSG00000272567', 'ENSG00000268955', 'ENSG00000288630', 'ENSG00000279765', ...
β†’  if you are sure, create new records via ln.Gene() and save to your registry
ensembl_ids = gene_metadata["feature_id"]

# register genes from bionty
gene_records = lb.Gene.from_values(ensembl_ids, field=lb.Gene.ensembl_gene_id)
ln.save(gene_records)

# register legacy genes manually: every id that still fails validation gets a
# record that only carries its Ensembl gene id
validated = lb.Gene.validate(ensembl_ids, field=lb.Gene.ensembl_gene_id)
records = [lb.Gene(ensembl_gene_id=gene_id) for gene_id in ensembl_ids[~validated]]
ln.save(records)
Hide code cell output
❗ did not create Gene records for 147 non-validated ensembl_gene_ids: 'ENSG00000112096', 'ENSG00000137808', 'ENSG00000161149', 'ENSG00000182230', 'ENSG00000203812', 'ENSG00000204092', 'ENSG00000205485', 'ENSG00000212951', 'ENSG00000215271', 'ENSG00000221995', 'ENSG00000224739', 'ENSG00000224745', 'ENSG00000225178', 'ENSG00000225932', 'ENSG00000226377', 'ENSG00000226380', 'ENSG00000226403', 'ENSG00000227021', 'ENSG00000227220', 'ENSG00000227902', ...
❗ 147 terms (0.20%) are not validated for ensembl_gene_id: ENSG00000285162, ENSG00000276814, ENSG00000282080, ENSG00000237513, ENSG00000239467, ENSG00000236886, ENSG00000273576, ENSG00000256427, ENSG00000272040, ENSG00000278198, ENSG00000273496, ENSG00000279765, ENSG00000224739, ENSG00000226380, ENSG00000285106, ENSG00000272551, ENSG00000237133, ENSG00000272267, ENSG00000271870, ENSG00000227902, ...

Observational metadata#

All available metadata columns:

census_data.obs.keys()
('soma_joinid',
 'dataset_id',
 'assay',
 'assay_ontology_term_id',
 'cell_type',
 'cell_type_ontology_term_id',
 'development_stage',
 'development_stage_ontology_term_id',
 'disease',
 'disease_ontology_term_id',
 'donor_id',
 'is_primary_data',
 'self_reported_ethnicity',
 'self_reported_ethnicity_ontology_term_id',
 'sex',
 'sex_ontology_term_id',
 'suspension_type',
 'tissue',
 'tissue_ontology_term_id',
 'tissue_general',
 'tissue_general_ontology_term_id')

Register features#

Register obs column names as features:

# Feature type per obs column: everything is categorical except the two columns
# listed here. Looking the type up in a mapping also avoids assigning to a
# variable called `type`, which would rebind the builtin for all later cells.
non_categorical_types = {"soma_joinid": "int", "is_primary_data": "bool"}

features = [
    ln.Feature(name=col, type=non_categorical_types.get(col, "category"))
    for col in census_data.obs.keys()
]
ln.save(features)

# Swap the list for a lookup object whose attributes return the feature names.
features = ln.Feature.lookup(return_field=ln.Feature.name)

Validate and register ontologies#

Fetch all terms used in census for each ontology:

# (label column, ontology term id column) pairs to read from obs.
ontology_columns = [
    (features.assay, features.assay_ontology_term_id),
    (features.cell_type, features.cell_type_ontology_term_id),
    (features.development_stage, features.development_stage_ontology_term_id),
    (features.disease, features.disease_ontology_term_id),
    (
        features.self_reported_ethnicity,
        features.self_reported_ethnicity_ontology_term_id,
    ),
    (features.sex, features.sex_ontology_term_id),
    (features.tissue, features.tissue_ontology_term_id),
    (features.tissue_general, features.tissue_general_ontology_term_id),
]

# For every pair, keep the distinct (label, term id) combinations found in obs,
# stored under the label column.
dfs = {}
for label_col, term_id_col in ontology_columns:
    obs_table = census_data.obs.read(column_names=[label_col, term_id_col]).concat()
    dfs[label_col] = obs_table.to_pandas().drop_duplicates()
dfs[features.assay].head()
assay assay_ontology_term_id
0 10x 3' v3 EFO:0009922
31497 10x 3' v2 EFO:0009899
384893 10x 5' transcription profiling EFO:0030004
655748 10x 5' v2 EFO:0009900
829647 Smart-seq2 EFO:0008931
def register_ontology(orm, name: str, parents: bool = False, **kwargs):
    """Register the ontology terms that census uses for one obs column.

    Takes the distinct (label, ontology term id) pairs stored in ``dfs[name]``,
    creates ``orm`` records from the term ids, stores the census label as the
    record's abbreviation wherever it differs from the record's name, points
    the features ``name`` and ``f"{name}_ontology_term_id"`` at the registry of
    ``orm``, and finally saves the records.

    Args:
        orm: Registry class to create records in (e.g. ``lb.CellType``). It
            must provide ``from_values``, ``ontology_id`` and
            ``__get_name_with_schema__``.
        name: Name of the obs label column; also the key into ``dfs``.
        parents: Passed through to ``ln.save``.
        **kwargs: Passed through to ``orm.from_values``.
    """
    from lamin_utils import logger

    term_id_col = f"{name}_ontology_term_id"
    df = dfs[name]
    records = orm.from_values(df[term_id_col], field=orm.ontology_id, **kwargs)

    # Build the term id -> census label mapping once rather than filtering the
    # whole DataFrame for every record. Keeping the first row per term id picks
    # the same label a per-record filter followed by `[0]` would pick.
    census_names = (
        df.drop_duplicates(subset=term_id_col).set_index(term_id_col)[name].to_dict()
    )
    for record in records:
        census_name = census_names[record.ontology_id]
        if census_name != record.name:
            logger.warning(
                f"census name '{census_name}' doesn't match ontology name"
                f" '{record.name}', adding census name as abbr\n"
            )
            record.set_abbr(census_name)

    # Both the label feature and its term id feature are typed with this registry.
    name_with_schema = orm.__get_name_with_schema__()
    for feature_name in (name, term_id_col):
        feature = ln.Feature.filter(name=feature_name).one()
        feature.registries = name_with_schema
        feature.save()

    ln.save(records, parents=parents)
register_ontology(lb.ExperimentalFactor, features.assay)
register_ontology(lb.CellType, features.cell_type, parents=True)
Hide code cell output
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
register_ontology(lb.DevelopmentalStage, features.development_stage)
Hide code cell output
❗ did not create DevelopmentalStage record for 1 non-validated ontology_id: 'unknown'
❗ census name '9th week post-fertilization human stage' doesn't match ontology name '9th week post-fertilization stage', adding census name as abbr
❗ census name '11th week post-fertilization human stage' doesn't match ontology name '11th week post-fertilization stage', adding census name as abbr
❗ census name '10th week post-fertilization human stage' doesn't match ontology name '10th week post-fertilization stage', adding census name as abbr
❗ census name '13th week post-fertilization human stage' doesn't match ontology name '13th week post-fertilization stage', adding census name as abbr
❗ census name '12th week post-fertilization human stage' doesn't match ontology name '12th week post-fertilization stage', adding census name as abbr
❗ census name 'eighth decade human stage' doesn't match ontology name 'eighth decade stage', adding census name as abbr
❗ census name 'fifth decade human stage' doesn't match ontology name 'fifth decade stage', adding census name as abbr
❗ census name 'sixth decade human stage' doesn't match ontology name 'sixth decade stage', adding census name as abbr
❗ census name 'seventh decade human stage' doesn't match ontology name 'seventh decade stage', adding census name as abbr
❗ census name '69-year-old human stage' doesn't match ontology name '69-year-old stage', adding census name as abbr
❗ census name '57-year-old human stage' doesn't match ontology name '57-year-old stage', adding census name as abbr
❗ census name '54-year-old human stage' doesn't match ontology name '54-year-old stage', adding census name as abbr
❗ census name '50-year-old human stage' doesn't match ontology name '50-year-old stage', adding census name as abbr
❗ census name '64-year-old human stage' doesn't match ontology name '64-year-old stage', adding census name as abbr
❗ census name '63-year-old human stage' doesn't match ontology name '63-year-old stage', adding census name as abbr
❗ census name '37-year-old human stage' doesn't match ontology name '37-year-old stage', adding census name as abbr
❗ census name '59-year-old human stage' doesn't match ontology name '59-year-old stage', adding census name as abbr
❗ census name '74-year-old human stage' doesn't match ontology name '74-year-old stage', adding census name as abbr
❗ census name '61-year-old human stage' doesn't match ontology name '61-year-old stage', adding census name as abbr
❗ census name '67-year-old human stage' doesn't match ontology name '67-year-old stage', adding census name as abbr
❗ census name '56-year-old human stage' doesn't match ontology name '56-year-old stage', adding census name as abbr
❗ census name '40-year-old human stage' doesn't match ontology name '40-year-old stage', adding census name as abbr
❗ census name '46-year-old human stage' doesn't match ontology name '46-year-old stage', adding census name as abbr
❗ census name '33-year-old human stage' doesn't match ontology name '33-year-old stage', adding census name as abbr
❗ census name '38-year-old human stage' doesn't match ontology name '38-year-old stage', adding census name as abbr
❗ census name '42-year-old human stage' doesn't match ontology name '42-year-old stage', adding census name as abbr
❗ census name '22-year-old human stage' doesn't match ontology name '22-year-old stage', adding census name as abbr
❗ census name '44-year-old human stage' doesn't match ontology name '44-year-old stage', adding census name as abbr
❗ census name '71-year-old human stage' doesn't match ontology name '71-year-old stage', adding census name as abbr
❗ census name '45-year-old human stage' doesn't match ontology name '45-year-old stage', adding census name as abbr
❗ census name '34-year-old human stage' doesn't match ontology name '34-year-old stage', adding census name as abbr
❗ census name '72-year-old human stage' doesn't match ontology name '72-year-old stage', adding census name as abbr
❗ census name '39-year-old human stage' doesn't match ontology name '39-year-old stage', adding census name as abbr
❗ census name '73-year-old human stage' doesn't match ontology name '73-year-old stage', adding census name as abbr
❗ census name '53-year-old human stage' doesn't match ontology name '53-year-old stage', adding census name as abbr
❗ census name '55-year-old human stage' doesn't match ontology name '55-year-old stage', adding census name as abbr
❗ census name '70-year-old human stage' doesn't match ontology name '70-year-old stage', adding census name as abbr
❗ census name '82-year-old human stage' doesn't match ontology name '82-year-old stage', adding census name as abbr
❗ census name '51-year-old human stage' doesn't match ontology name '51-year-old stage', adding census name as abbr
❗ census name '60-year-old human stage' doesn't match ontology name '60-year-old stage', adding census name as abbr
❗ census name '29-year-old human stage' doesn't match ontology name '29-year-old stage', adding census name as abbr
❗ census name '28-year-old human stage' doesn't match ontology name '28-year-old stage', adding census name as abbr
❗ census name '35-year-old human stage' doesn't match ontology name '35-year-old stage', adding census name as abbr
❗ census name '18-year-old human stage' doesn't match ontology name '18-year-old stage', adding census name as abbr
❗ census name '47-year-old human stage' doesn't match ontology name '47-year-old stage', adding census name as abbr
❗ census name '21-year-old human stage' doesn't match ontology name '21-year-old stage', adding census name as abbr
❗ census name '26-year-old human stage' doesn't match ontology name '26-year-old stage', adding census name as abbr
❗ census name '41-year-old human stage' doesn't match ontology name '41-year-old stage', adding census name as abbr
❗ census name '27-year-old human stage' doesn't match ontology name '27-year-old stage', adding census name as abbr
❗ census name '66-year-old human stage' doesn't match ontology name '66-year-old stage', adding census name as abbr
❗ census name '62-year-old human stage' doesn't match ontology name '62-year-old stage', adding census name as abbr
❗ census name 'human adult stage' doesn't match ontology name 'adult stage', adding census name as abbr
❗ census name '75-year-old human stage' doesn't match ontology name '75-year-old stage', adding census name as abbr
❗ census name '77-year-old human stage' doesn't match ontology name '77-year-old stage', adding census name as abbr
❗ census name '49-year-old human stage' doesn't match ontology name '49-year-old stage', adding census name as abbr
❗ census name '58-year-old human stage' doesn't match ontology name '58-year-old stage', adding census name as abbr
❗ census name '68-year-old human stage' doesn't match ontology name '68-year-old stage', adding census name as abbr
❗ census name '65-year-old human stage' doesn't match ontology name '65-year-old stage', adding census name as abbr
❗ census name 'newborn human stage' doesn't match ontology name 'newborn stage', adding census name as abbr
❗ census name '2-year-old human stage' doesn't match ontology name '2-year-old stage', adding census name as abbr
❗ census name '3-year-old human stage' doesn't match ontology name '3-year-old stage', adding census name as abbr
❗ census name '10-year-old human stage' doesn't match ontology name '10-year-old stage', adding census name as abbr
❗ census name '12-month-old human stage' doesn't match ontology name '12-month-old stage', adding census name as abbr
❗ census name '25-year-old human stage' doesn't match ontology name '25-year-old stage', adding census name as abbr
❗ census name '7-year-old human stage' doesn't match ontology name '7-year-old stage', adding census name as abbr
❗ census name '31-year-old human stage' doesn't match ontology name '31-year-old stage', adding census name as abbr
❗ census name '23-year-old human stage' doesn't match ontology name '23-year-old stage', adding census name as abbr
❗ census name '30-year-old human stage' doesn't match ontology name '30-year-old stage', adding census name as abbr
❗ census name '16-year-old human stage' doesn't match ontology name '16-year-old stage', adding census name as abbr
❗ census name '15-year-old human stage' doesn't match ontology name '15-year-old stage', adding census name as abbr
❗ census name 'fourth decade human stage' doesn't match ontology name 'fourth decade stage', adding census name as abbr
❗ census name '24-year-old human stage' doesn't match ontology name '24-year-old stage', adding census name as abbr
❗ census name '20-year-old human stage' doesn't match ontology name '20-year-old stage', adding census name as abbr
❗ census name '6-month-old human stage' doesn't match ontology name '6-month-old stage', adding census name as abbr
❗ census name '52-year-old human stage' doesn't match ontology name '52-year-old stage', adding census name as abbr
❗ census name '43-year-old human stage' doesn't match ontology name '43-year-old stage', adding census name as abbr
❗ census name '1-month-old human stage' doesn't match ontology name '1-month-old stage', adding census name as abbr
❗ census name '81-year-old human stage' doesn't match ontology name '81-year-old stage', adding census name as abbr
❗ census name '76-year-old human stage' doesn't match ontology name '76-year-old stage', adding census name as abbr
❗ census name '48-year-old human stage' doesn't match ontology name '48-year-old stage', adding census name as abbr
❗ census name '36-year-old human stage' doesn't match ontology name '36-year-old stage', adding census name as abbr
❗ census name '5-year-old human stage' doesn't match ontology name '5-year-old stage', adding census name as abbr
❗ census name '79-year-old human stage' doesn't match ontology name '79-year-old stage', adding census name as abbr
❗ census name '3-month-old human stage' doesn't match ontology name '3-month-old stage', adding census name as abbr
❗ census name '12-year-old human stage' doesn't match ontology name '12-year-old stage', adding census name as abbr
❗ census name '11-month-old human stage' doesn't match ontology name '11-month-old stage', adding census name as abbr
❗ census name '14-year-old human stage' doesn't match ontology name '14-year-old stage', adding census name as abbr
❗ census name '4-year-old human stage' doesn't match ontology name '4-year-old stage', adding census name as abbr
❗ census name '9-year-old human stage' doesn't match ontology name '9-year-old stage', adding census name as abbr
❗ census name '32-year-old human stage' doesn't match ontology name '32-year-old stage', adding census name as abbr
❗ census name '19-year-old human stage' doesn't match ontology name '19-year-old stage', adding census name as abbr
❗ census name '85-year-old human stage' doesn't match ontology name '85-year-old stage', adding census name as abbr
❗ census name '80 year-old and over human stage' doesn't match ontology name '80 year-old and over stage', adding census name as abbr
❗ census name '87-year-old human stage' doesn't match ontology name '87-year-old stage', adding census name as abbr
❗ census name '83-year-old human stage' doesn't match ontology name '83-year-old stage', adding census name as abbr
❗ census name '80-year-old human stage' doesn't match ontology name '80-year-old stage', adding census name as abbr
❗ census name '88-year-old human stage' doesn't match ontology name '88-year-old stage', adding census name as abbr
❗ census name '89-year-old human stage' doesn't match ontology name '89-year-old stage', adding census name as abbr
❗ census name '86-year-old human stage' doesn't match ontology name '86-year-old stage', adding census name as abbr
❗ census name '84-year-old human stage' doesn't match ontology name '84-year-old stage', adding census name as abbr
❗ census name '78-year-old human stage' doesn't match ontology name '78-year-old stage', adding census name as abbr
❗ census name '20th week post-fertilization human stage' doesn't match ontology name '20th week post-fertilization stage', adding census name as abbr
❗ census name '22nd week post-fertilization human stage' doesn't match ontology name '22nd week post-fertilization stage', adding census name as abbr
❗ census name '17th week post-fertilization human stage' doesn't match ontology name '17th week post-fertilization stage', adding census name as abbr
❗ census name '14th week post-fertilization human stage' doesn't match ontology name '14th week post-fertilization stage', adding census name as abbr
❗ census name '19th week post-fertilization human stage' doesn't match ontology name '19th week post-fertilization stage', adding census name as abbr
❗ census name '15th week post-fertilization human stage' doesn't match ontology name '15th week post-fertilization stage', adding census name as abbr
❗ census name '16th week post-fertilization human stage' doesn't match ontology name '16th week post-fertilization stage', adding census name as abbr
❗ census name '21st week post-fertilization human stage' doesn't match ontology name '21st week post-fertilization stage', adding census name as abbr
❗ census name '18th week post-fertilization human stage' doesn't match ontology name '18th week post-fertilization stage', adding census name as abbr
❗ census name 'embryonic human stage' doesn't match ontology name 'embryonic stage', adding census name as abbr
❗ census name 'third decade human stage' doesn't match ontology name 'third decade stage', adding census name as abbr
❗ census name '6-year-old human stage' doesn't match ontology name '6-year-old stage', adding census name as abbr
❗ census name 'fourth LMP month human stage' doesn't match ontology name 'fourth LMP month stage', adding census name as abbr
❗ census name 'fifth LMP month human stage' doesn't match ontology name 'fifth LMP month stage', adding census name as abbr
❗ census name '11-year-old human stage' doesn't match ontology name '11-year-old stage', adding census name as abbr
❗ census name '13-year-old human stage' doesn't match ontology name '13-year-old stage', adding census name as abbr
❗ census name '25-44 year-old human stage' doesn't match ontology name '25-44 year-old stage', adding census name as abbr
❗ census name '90-year-old human stage' doesn't match ontology name '90-year-old stage', adding census name as abbr
❗ census name '10-month-old human stage' doesn't match ontology name '10-month-old stage', adding census name as abbr
❗ census name '5-month-old human stage' doesn't match ontology name '5-month-old stage', adding census name as abbr
❗ census name '14-month-old human stage' doesn't match ontology name '14-month-old stage', adding census name as abbr
❗ census name 'human late adulthood stage' doesn't match ontology name 'late adulthood stage', adding census name as abbr
❗ census name '65-79 year-old human stage' doesn't match ontology name '65-79 year-old stage', adding census name as abbr
❗ census name '19-month-old human stage' doesn't match ontology name '19-month-old stage', adding census name as abbr
❗ census name '7-month-old human stage' doesn't match ontology name '7-month-old stage', adding census name as abbr
❗ census name 'human early adulthood stage' doesn't match ontology name 'early adulthood stage', adding census name as abbr
❗ census name '91-year-old human stage' doesn't match ontology name '91-year-old stage', adding census name as abbr
❗ census name '95-year-old human stage' doesn't match ontology name '95-year-old stage', adding census name as abbr
❗ census name '97-year-old human stage' doesn't match ontology name '97-year-old stage', adding census name as abbr
❗ census name '94-year-old human stage' doesn't match ontology name '94-year-old stage', adding census name as abbr
❗ census name '96-year-old human stage' doesn't match ontology name '96-year-old stage', adding census name as abbr
❗ census name '93-year-old human stage' doesn't match ontology name '93-year-old stage', adding census name as abbr
❗ census name 'human middle aged stage' doesn't match ontology name 'middle aged stage', adding census name as abbr
❗ census name 'human aged stage' doesn't match ontology name 'aged stage', adding census name as abbr
❗ census name '92-year-old human stage' doesn't match ontology name '92-year-old stage', adding census name as abbr
❗ census name '8-year-old human stage' doesn't match ontology name '8-year-old stage', adding census name as abbr
❗ census name '1-year-old human stage' doesn't match ontology name '1-year-old stage', adding census name as abbr
❗ census name 'ninth decade human stage' doesn't match ontology name 'ninth decade stage', adding census name as abbr
❗ census name '25th week post-fertilization human stage' doesn't match ontology name '25th week post-fertilization stage', adding census name as abbr
❗ census name '31st week post-fertilization human stage' doesn't match ontology name '31st week post-fertilization stage', adding census name as abbr
❗ census name '23rd week post-fertilization human stage' doesn't match ontology name '23rd week post-fertilization stage', adding census name as abbr
❗ census name '17-year-old human stage' doesn't match ontology name '17-year-old stage', adding census name as abbr
❗ census name '26th week post-fertilization human stage' doesn't match ontology name '26th week post-fertilization stage', adding census name as abbr
register_ontology(lb.Disease, features.disease)
❗ did not create Disease record for 1 non-validated ontology_id: 'PATO:0000461'

‘PATO:0000461’ is the term for “normal”, which can be typed with Phenotype:

pato = lb.BiontySource.filter(source="pato").one()
lb.Phenotype.from_bionty(ontology_id="PATO:0000461", bionty_source=pato).save()
❗ did not create Phenotype record for 1 non-validated ontology_id: 'PATO:0000068'
register_ontology(lb.Ethnicity, features.self_reported_ethnicity)
❗ did not create Ethnicity records for 2 non-validated ontology_ids: 'multiethnic', 'unknown'
❗ census name 'Greater Middle Eastern  (Middle Eastern, North African or Persian)' doesn't match ontology name 'Greater Middle Eastern  (Middle Eastern or North African or Persian)', adding census name as abbr

Let’s manually add two terms to the Ethnicity registry:

lb.Ethnicity(name="multiethnic").save()
lb.Ethnicity(name="unknown").save()
register_ontology(lb.Phenotype, features.sex, bionty_source=pato)
❗ did not create Phenotype record for 1 non-validated ontology_id: 'unknown'
lb.Phenotype(name="unknown").save()
register_ontology(lb.Tissue, features.tissue)
register_ontology(lb.Tissue, features.tissue_general)

Validate and register non-ontological metadata#

“donor_id” and “suspension_type” are two fields without public ontologies, so let’s register them using “ULabel”:

# Type both features with the ULabel registry and persist the change.
features_records = ln.Feature.lookup()
ulabel_registry = ln.ULabel.__get_name_with_schema__()
for feature_record in (features_records.donor_id, features_records.suspension_type):
    feature_record.registries = ulabel_registry
    feature_record.save()
# Distinct donor ids found in obs.
donor_table = census_data.obs.read(column_names=[features.donor_id]).concat()
donor_ids = donor_table.to_pandas().drop_duplicates()

ln.settings.upon_create_search_names = False

# One ULabel per donor id; the description repeats the feature name and the id.
records = [
    ln.ULabel(name=donor_id, description=f"{features.donor_id}: {donor_id}")
    for donor_id in donor_ids[features.donor_id].unique()
]
ln.save(records)

Construct a parent “is_donor” to group these ulabels:

is_donor = ln.ULabel(name="is_donor", description="parent of donor ids")
is_donor.save()
is_donor.children.set(records)

We do the same for “suspension_type”:

# Distinct suspension types found in obs.
suspension_table = census_data.obs.read(
    column_names=[features.suspension_type]
).concat()
suspension_types = suspension_table.to_pandas().drop_duplicates()

# One ULabel per suspension type; the description repeats the feature name and
# the value.
records = [
    ln.ULabel(
        name=suspension_type,
        description=f"{features.suspension_type}: {suspension_type}",
    )
    for suspension_type in suspension_types[features.suspension_type].unique()
]
ln.save(records)

is_suspension_type = ln.ULabel(
    name="is_suspension_type", description="parent of suspension types"
)
is_suspension_type.save()
is_suspension_type.children.set(records)
is_suspension_type.view_parents(with_children=True)
_images/3f2f8f7fde3c280559f51b6bab42cfe93746b7417eab440a22df5120b68a94e8.svg
census.close()

Now we have validated all ontological terms in Census metadata!

Let’s see how they can be useful when working with cellxgene-census.