import lamindb as ln
import pandas as pd
import random
import os
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
→ connected lamindb: laminlabs/lamindata
n_total_files = 10000
max_workers = min(32, (os.cpu_count() or 1) * 4)  # os.cpu_count() can return None
ln.track("fT6b1C7oP9p7", params={"n_total_files": n_total_files}, project="Bulk file annotations & queries")
→ loaded Transform('fT6b1C7oP9p70006'), re-started Run('Cp7E0QMb...') at 2025-05-06 14:58:52 UTC → params: n_total_files=10000 → notebook imports: lamindb==1.5rc1 pandas==2.2.3 tqdm==4.67.1
def natural_sort_key(s):
    """Sort strings so that file2 comes before file10."""
    return [
        int(text) if text.isdigit() else text.lower()
        for text in re.split(r'(\d+)', str(s))
    ]
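As a quick sanity check (a minimal sketch with hypothetical file names), natural sorting places file2.txt before file10.txt, whereas plain lexicographic sorting does not:

names = ["file10.txt", "file2.txt", "file1.txt"]
sorted(names)                        # lexicographic: ['file1.txt', 'file10.txt', 'file2.txt']
sorted(names, key=natural_sort_key)  # natural: ['file1.txt', 'file2.txt', 'file10.txt']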
all_paths = list(ln.UPath("s3://lamindata/mini_text_files").glob("*.txt"))
all_paths.sort(key=natural_sort_key)
all_paths_bounded_total_files = all_paths[:n_total_files]
# find which files are already registered so that re-running this notebook is idempotent
registered_artifacts = ln.Artifact.filter(key__startswith="mini_text_files/", storage__root="s3://lamindata")
registered_keys = set(registered_artifacts.values_list('key', flat=True))
paths_to_register = [
    path for path in all_paths_bounded_total_files
    if str(path).replace("s3://lamindata/", "") not in registered_keys
]
print(f"Found {len(paths_to_register)} files that need to be registered")
paths_to_register[:5]
Found 8935 files that need to be registered
[S3QueryPath('s3://lamindata/mini_text_files/file1066.txt'), S3QueryPath('s3://lamindata/mini_text_files/file1067.txt'), S3QueryPath('s3://lamindata/mini_text_files/file1068.txt'), S3QueryPath('s3://lamindata/mini_text_files/file1069.txt'), S3QueryPath('s3://lamindata/mini_text_files/file1070.txt')]
def register_file(path):
    """Create and save a single ln.Artifact from a path."""
    return ln.Artifact(path).save()
if paths_to_register:
    print(f"Registering {len(paths_to_register)} artifacts using {max_workers} threads")
    new_artifacts = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for artifact in tqdm(
            executor.map(register_file, paths_to_register),
            total=len(paths_to_register),
            desc="Registering artifacts",
        ):
            new_artifacts.append(artifact)
    print(f"Successfully registered {len(new_artifacts)} new artifacts")
Registering 8935 artifacts using 32 threads
Registering artifacts: 100%|██████████| 8935/8935 [04:38<00:00, 32.11it/s]
Successfully registered 8935 new artifacts
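Because registration skips keys that already exist, re-running the cell above is idempotent. A minimal sanity check, re-using the query from earlier, should now find nothing left to register:

registered_keys = set(
    ln.Artifact.filter(key__startswith="mini_text_files/", storage__root="s3://lamindata")
    .values_list("key", flat=True)
)
remaining = [p for p in all_paths_bounded_total_files if str(p).replace("s3://lamindata/", "") not in registered_keys]
assert len(remaining) == 0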
all_artifacts_df = ln.Artifact.filter(key__startswith="mini_text_files/file", storage__root="s3://lamindata").df()
print(all_artifacts_df.shape)
all_artifacts_df.head()
(10000, 23)
| id | uid | key | description | suffix | kind | otype | size | hash | n_files | n_observations | _hash_type | _key_is_virtual | _overwrite_versions | space_id | storage_id | schema_id | version | is_latest | run_id | created_at | created_by_id | _aux | _branch_code |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1699 | 2qBNr2ICBnMS8JSC0000 | mini_text_files/file32.txt | None | .txt | None | None | 2 | Y2TT8PSVtqudz407XG4LAQ | None | None | md5 | False | False | 1 | 2 | None | None | True | 669 | 2025-05-05 14:15:55.974243+00:00 | 9 | None | 1 |
| 1742 | FoaS7BF8AZpt0Va80000 | mini_text_files/file64.txt | None | .txt | None | None | 2 | 6l0vHEYIIy4H06o9mY5RNQ | None | None | md5 | False | False | 1 | 2 | None | None | True | 669 | 2025-05-05 14:16:01.479023+00:00 | 9 | None | 1 |
| 1777 | wyVcH4vuQUDbKhel0000 | mini_text_files/file124.txt | None | .txt | None | None | 3 | yP_ppYexJvFS7T2JoUa0RQ | None | None | md5 | False | False | 1 | 2 | None | None | True | 671 | 2025-05-05 14:30:08.090943+00:00 | 9 | None | 1 |
| 1797 | orUHL6T6iCdGi6Ie0000 | mini_text_files/file122.txt | None | .txt | None | None | 3 | oKCA9C5vE7Oi3xM_BzCV3Q | None | None | md5 | False | False | 1 | 2 | None | None | True | 671 | 2025-05-05 14:30:08.134994+00:00 | 9 | None | 1 |
| 1811 | eV7EetEXP0Z2X4Bk0000 | mini_text_files/file139.txt | None | .txt | None | None | 3 | 4A2gO2haDdGPtqCK8JI94A | None | None | md5 | False | False | 1 | 2 | None | None | True | 671 | 2025-05-05 14:30:13.400788+00:00 | 9 | None | 1 |
partition_type = ln.ULabel(name="MLSplit", is_type=True).save()
ln.ULabel(name="train", type=partition_type).save()
ln.ULabel(name="test", type=partition_type).save()
ln.Feature(name="ml_split", dtype=partition_type).save()
ln.Feature(name="gsm_identifier", dtype=str).save()
ln.Feature(name="dataset_title", dtype=str).save()
→ returning existing ULabel record with same name: 'MLSplit'
→ returning existing ULabel record with same name: 'train'
→ returning existing ULabel record with same name: 'test'
→ returning existing Feature record with same name: 'ml_split'
→ returning existing Feature record with same name: 'gsm_identifier'
→ returning existing Feature record with same name: 'dataset_title'
Feature(uid='nOtEtJK1YiU1', name='dataset_title', dtype='str', array_rank=0, array_size=0, space_id=1, created_by_id=9, run_id=665, created_at=2025-05-05 06:46:34 UTC)
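To double-check the registry state, one can list the labels under the MLSplit type and the features just defined (a quick sketch over the same registries):

ln.ULabel.filter(type=partition_type).df()  # expect rows for 'train' and 'test'
ln.Feature.filter(name__in=["ml_split", "gsm_identifier", "dataset_title"]).df()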
random.seed(0)  # set a seed to reproduce the same annotations

def make_file_key(i: int) -> str:
    return f"mini_text_files/file{i}.txt"

metadata = []
keys = []
for i in range(1, n_total_files + 1):
    ml_split = "train" if random.random() < 0.7 else "test"
    gsm_identifier = f"GSM{100000 + i}"
    dataset_title = f"Sample file {i}"
    keys.append(make_file_key(i))
    metadata.append({
        "ml_split": ml_split,
        "gsm_identifier": gsm_identifier,
        "dataset_title": dataset_title,
    })
metadata_df = pd.DataFrame(metadata, index=keys)
print(metadata_df.shape)
metadata_df.head()
(10000, 3)
|  | ml_split | gsm_identifier | dataset_title |
|---|---|---|---|
| mini_text_files/file1.txt | test | GSM100001 | Sample file 1 |
| mini_text_files/file2.txt | test | GSM100002 | Sample file 2 |
| mini_text_files/file3.txt | train | GSM100003 | Sample file 3 |
| mini_text_files/file4.txt | train | GSM100004 | Sample file 4 |
| mini_text_files/file5.txt | train | GSM100005 | Sample file 5 |
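Before annotating, it is worth verifying that every metadata row has a matching registered artifact; this is plain set logic over the keys (a minimal sketch):

registered = set(
    ln.Artifact.filter(key__startswith="mini_text_files/file", storage__root="s3://lamindata")
    .values_list("key", flat=True)
)
missing = set(metadata_df.index) - registered
assert not missing, f"{len(missing)} metadata rows lack a registered artifact"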
def annotate_artifact(artifact):
    """Attach the metadata row matching the artifact's key as feature values."""
    metadata = metadata_df.loc[artifact.key].to_dict()
    artifact.features.add_values(metadata)
    return True
artifacts_to_annotate = ln.Artifact.filter(ml_split__isnull=True).filter(
    key__startswith="mini_text_files/file", storage__root="s3://lamindata"
)
print(f"Found {len(artifacts_to_annotate)} artifacts that need to be annotated")
artifacts_to_annotate.df().head()
Found 9000 artifacts that need to be annotated
| id | uid | key | description | suffix | kind | otype | size | hash | n_files | n_observations | _hash_type | _key_is_virtual | _overwrite_versions | space_id | storage_id | schema_id | version | is_latest | run_id | created_at | created_by_id | _aux | _branch_code |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2966 | Qpdu2dgW0hi21KyX0000 | mini_text_files/file1292.txt | None | .txt | None | None | 4 | RZpN3LWG8k79k5WqdmK8fA | None | None | md5 | False | False | 1 | 2 | None | None | True | 673 | 2025-05-06 14:59:25.887850+00:00 | 9 | None | 1 |
| 2975 | qRH12KhBL5oHJpDJ0000 | mini_text_files/file1302.txt | None | .txt | None | None | 4 | mWAJ8jdABmBvTAsP2oeK8Q | None | None | md5 | False | False | 1 | 2 | None | None | True | 673 | 2025-05-06 14:59:25.961112+00:00 | 9 | None | 1 |
| 2984 | RnfefezTDtmN24Jr0000 | mini_text_files/file1308.txt | None | .txt | None | None | 4 | oIcsxbXKTMJQdvPYaOG9-A | None | None | md5 | False | False | 1 | 2 | None | None | True | 673 | 2025-05-06 14:59:26.020073+00:00 | 9 | None | 1 |
| 2992 | UwES07lwWq2kwIRB0000 | mini_text_files/file1320.txt | None | .txt | None | None | 4 | KlDpwta4m5W8tBbWhX-LRQ | None | None | md5 | False | False | 1 | 2 | None | None | True | 673 | 2025-05-06 14:59:26.041509+00:00 | 9 | None | 1 |
| 2999 | lgDdIgot6mPPa51m0000 | mini_text_files/file1324.txt | None | .txt | None | None | 4 | u3lG59hcganmn-4c6koIfA | None | None | md5 | False | False | 1 | 2 | None | None | True | 673 | 2025-05-06 14:59:26.827474+00:00 | 9 | None | 1 |
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(tqdm(
        executor.map(annotate_artifact, artifacts_to_annotate),
        total=len(artifacts_to_annotate),
        desc="Annotating artifacts",
    ))
Annotating artifacts: 100%|██████████| 9000/9000 [10:35<00:00, 14.16it/s]
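To spot-check the result, fetch a single artifact and read back its feature values (the same accessor is shown further below):

artifact = ln.Artifact.get(key="mini_text_files/file1.txt")
artifact.features.get_values()  # expect ml_split, gsm_identifier, dataset_title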
features_list = ["ml_split", "dataset_title", "gsm_identifier"]
Query by feature gsm_identifier.
ln.Artifact.filter(gsm_identifier="GSM100003").df(features=features_list)
| id | uid | key | ml_split | dataset_title | gsm_identifier |
|---|---|---|---|---|---|
| 1667 | RZk5OYUpFTB5BNWp0000 | mini_text_files/file3.txt | train | Sample DataFrame 3 | GSM100003 |
ln.Artifact.filter(gsm_identifier__icontains="GSM10000").df(features=features_list)
| id | uid | key | ml_split | dataset_title | gsm_identifier |
|---|---|---|---|---|---|
| 1673 | alyKMkHYCeucH8mq0000 | mini_text_files/file9.txt | train | Sample DataFrame 9 | GSM100009 |
| 1667 | RZk5OYUpFTB5BNWp0000 | mini_text_files/file3.txt | train | Sample DataFrame 3 | GSM100003 |
| 1670 | PDQvhHmtjIeSNbTW0000 | mini_text_files/file6.txt | train | Sample DataFrame 6 | GSM100006 |
| 1672 | GdvUQv6YNViDThte0000 | mini_text_files/file7.txt | test | Sample DataFrame 7 | GSM100007 |
| 1674 | BOz0P7EpbR2a2sRM0000 | mini_text_files/file2.txt | test | Sample DataFrame 2 | GSM100002 |
| 1669 | fHVjhPkOmU09dE920000 | mini_text_files/file8.txt | train | Sample DataFrame 8 | GSM100008 |
| 1666 | ydBJrZSq9LpCNykC0000 | mini_text_files/file1.txt | test | Sample DataFrame 1 | GSM100001 |
| 1668 | urXtsQcArMXfpUbl0000 | mini_text_files/file4.txt | train | Sample DataFrame 4 | GSM100004 |
| 1671 | 43AMhTN70OAKBq6c0000 | mini_text_files/file5.txt | train | Sample DataFrame 5 | GSM100005 |
Query by feature ml_split.
ln.Artifact.filter(ml_split="train").df(features=features_list)
| id | uid | key | ml_split | dataset_title | gsm_identifier |
|---|---|---|---|---|---|
| 2151 | 8eic3PVpsojyV0v80000 | mini_text_files/file483.txt | train | Sample file 483 | GSM100483 |
| 2149 | SDec5egwYaGinDMY0000 | mini_text_files/file480.txt | train | Sample file 480 | GSM100480 |
| 2152 | UBTURUbnAw0q3JD50000 | mini_text_files/file479.txt | train | Sample file 479 | GSM100479 |
| 2467 | OV7mzFhzL5DDFi6q0000 | mini_text_files/file797.txt | train | Sample file 797 | GSM100797 |
| 2458 | b76FlSrzCicKlztK0000 | mini_text_files/file785.txt | train | Sample file 785 | GSM100785 |
| ... | ... | ... | ... | ... | ... |
| 10674 | CL55naGixFziqEJK0000 | mini_text_files/file9000.txt | train | Sample file 9000 | GSM109000 |
| 11652 | tt6tEOibnFbDzQcb0000 | mini_text_files/file9978.txt | train | Sample file 9978 | GSM109978 |
| 11480 | m8IksNxubZlc5ZW30000 | mini_text_files/file9806.txt | train | Sample file 9806 | GSM109806 |
| 10870 | wMJtyegLZheGFaL40000 | mini_text_files/file9196.txt | train | Sample file 9196 | GSM109196 |
| 10939 | zSQDb9TXdJ8pTLJ30000 | mini_text_files/file9264.txt | train | Sample file 9264 | GSM109264 |

6998 rows × 5 columns
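Since the query set supports standard aggregations (a sketch; .count() comes from the underlying Django QuerySet), one can cross-check that the two splits sum to n_total_files:

n_train = ln.Artifact.filter(ml_split="train").count()
n_test = ln.Artifact.filter(ml_split="test").count()
print(n_train, n_test, n_train + n_test)  # expect the splits to sum to 10000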
Query by multiple features.
ln.Artifact.filter(ml_split="train", gsm_identifier__icontains="GSM10000").df(features=features_list)
| id | uid | key | ml_split | dataset_title | gsm_identifier |
|---|---|---|---|---|---|
| 1673 | alyKMkHYCeucH8mq0000 | mini_text_files/file9.txt | train | Sample DataFrame 9 | GSM100009 |
| 1667 | RZk5OYUpFTB5BNWp0000 | mini_text_files/file3.txt | train | Sample DataFrame 3 | GSM100003 |
| 1670 | PDQvhHmtjIeSNbTW0000 | mini_text_files/file6.txt | train | Sample DataFrame 6 | GSM100006 |
| 1669 | fHVjhPkOmU09dE920000 | mini_text_files/file8.txt | train | Sample DataFrame 8 | GSM100008 |
| 1668 | urXtsQcArMXfpUbl0000 | mini_text_files/file4.txt | train | Sample DataFrame 4 | GSM100004 |
| 1671 | 43AMhTN70OAKBq6c0000 | mini_text_files/file5.txt | train | Sample DataFrame 5 | GSM100005 |
Query with registry fields.
ln.Artifact.filter(key__startswith="mini_text_files/file", storage__root="s3://lamindata").df(features=features_list)
| id | uid | key | ml_split | dataset_title | gsm_identifier |
|---|---|---|---|---|---|
| 2151 | 8eic3PVpsojyV0v80000 | mini_text_files/file483.txt | train | Sample file 483 | GSM100483 |
| 2149 | SDec5egwYaGinDMY0000 | mini_text_files/file480.txt | train | Sample file 480 | GSM100480 |
| 2152 | UBTURUbnAw0q3JD50000 | mini_text_files/file479.txt | train | Sample file 479 | GSM100479 |
| 2121 | GNWk27g3gb8TReaf0000 | mini_text_files/file453.txt | test | Sample file 453 | GSM100453 |
| 2122 | F1hspQds1qoGu0BE0000 | mini_text_files/file452.txt | test | Sample file 452 | GSM100452 |
| ... | ... | ... | ... | ... | ... |
| 11480 | m8IksNxubZlc5ZW30000 | mini_text_files/file9806.txt | train | Sample file 9806 | GSM109806 |
| 11256 | i0sZHtLsSnzWH2oo0000 | mini_text_files/file9583.txt | test | Sample file 9583 | GSM109583 |
| 10870 | wMJtyegLZheGFaL40000 | mini_text_files/file9196.txt | train | Sample file 9196 | GSM109196 |
| 10424 | 4u3a58rIXMKGk40K0000 | mini_text_files/file8751.txt | test | Sample file 8751 | GSM108751 |
| 10939 | zSQDb9TXdJ8pTLJ30000 | mini_text_files/file9264.txt | train | Sample file 9264 | GSM109264 |

10000 rows × 5 columns
Look at an example artifact.
example_artifact = ln.Artifact.get(key="mini_text_files/file5.txt")
Describe.
example_artifact.describe()
Artifact .txt
├── General
│   ├── .uid = '43AMhTN70OAKBq6c0000'
│   ├── .key = 'mini_text_files/file5.txt'
│   ├── .size = 1
│   ├── .hash = '5No7f7vOI0XXdysGdKMY1Q'
│   ├── .path = s3://lamindata/mini_text_files/file5.txt
│   ├── .created_by = falexwolf (Alex Wolf)
│   ├── .created_at = 2025-05-05 14:48:06
│   └── .transform = 'Demonstrate bulk file annotation with flexible metadata'
├── Linked features
│   └── ml_split          cat[ULabel[MLSplit]]    train
│       dataset_title     str                     Sample DataFrame 5
│       gsm_identifier    str                     GSM100005
└── Labels
    └── .projects         Project                 Bulk file annotations & queries
        .ulabels          ULabel                  train
Get values as a dictionary.
example_artifact.features.get_values()
{'ml_split': 'train', 'dataset_title': 'Sample DataFrame 5', 'gsm_identifier': 'GSM100005'}
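These annotations can now drive downstream workflows, for example caching the training split locally (a sketch: Artifact.cache() downloads a file to the local cache and returns its path; the [:3] slice is only for the demo):

train_artifacts = ln.Artifact.filter(ml_split="train", key__startswith="mini_text_files/file")
local_paths = [artifact.cache() for artifact in train_artifacts[:3]]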
ln.finish()