import lamindb as ln
import pandas as pd
import random
import os
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
→ connected lamindb: laminlabs/lamindata
n_total_files = 10000
max_workers = min(32, (os.cpu_count() or 1) * 4)  # cap thread count; os.cpu_count() can return None

ln.track("fT6b1C7oP9p7", params={"n_total_files": n_total_files}, project="Bulk file annotations & queries")
→ loaded Transform('fT6b1C7oP9p70006'), re-started Run('Cp7E0QMb...') at 2025-05-06 14:58:52 UTC
→ params: n_total_files=10000
→ notebook imports: lamindb==1.5rc1 pandas==2.2.3 tqdm==4.67.1

Register existing files in S3 as artifacts

def natural_sort_key(s):
    """Sort key that orders embedded integers numerically (file2 before file10)."""
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(r'(\d+)', str(s))]
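For illustration, natural sorting orders file2 before file10, which plain lexicographic sorting would not:

sorted(["file10.txt", "file2.txt", "file1.txt"], key=natural_sort_key)
# ['file1.txt', 'file2.txt', 'file10.txt']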

all_paths = list(ln.UPath("s3://lamindata/mini_text_files").glob("*.txt"))
all_paths.sort(key=natural_sort_key)
selected_paths = all_paths[:n_total_files]
registered_artifacts = ln.Artifact.filter(key__startswith="mini_text_files/", storage__root="s3://lamindata")
registered_keys = set(registered_artifacts.values_list("key", flat=True))
paths_to_register = [
    path for path in selected_paths
    if str(path).removeprefix("s3://lamindata/") not in registered_keys
]
print(f"Found {len(paths_to_register)} files that need to be registered")
paths_to_register[:5]
Found 8935 files that need to be registered
[S3QueryPath('s3://lamindata/mini_text_files/file1066.txt'),
 S3QueryPath('s3://lamindata/mini_text_files/file1067.txt'),
 S3QueryPath('s3://lamindata/mini_text_files/file1068.txt'),
 S3QueryPath('s3://lamindata/mini_text_files/file1069.txt'),
 S3QueryPath('s3://lamindata/mini_text_files/file1070.txt')]
def register_file(path):
    """Create and save a single ln.Artifact for a path."""
    return ln.Artifact(path).save()


if paths_to_register:
    print(f"Registering {len(paths_to_register)} artifacts using {max_workers} threads")
    
    new_artifacts = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for artifact in tqdm(
            executor.map(register_file, paths_to_register),
            total=len(paths_to_register),
            desc="Registering artifacts"
        ):
            new_artifacts.append(artifact)
    
    print(f"Successfully registered {len(new_artifacts)} new artifacts")
Registering 8935 artifacts using 32 threads
Registering artifacts: 100%|██████████| 8935/8935 [04:38<00:00, 32.11it/s]
Successfully registered 8935 new artifacts
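If individual registrations can fail transiently (network or S3 hiccups), a submit/as_completed variant collects failures without aborting the whole batch; a minimal sketch reusing register_file, with the retry policy left to you:

from concurrent.futures import as_completed

def register_all(paths):
    """Register paths in parallel; return saved artifacts plus any failures."""
    artifacts, failures = [], []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(register_file, p): p for p in paths}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Registering artifacts"):
            try:
                artifacts.append(future.result())
            except Exception as exc:  # e.g., a transient botocore error
                failures.append((futures[future], exc))
    return artifacts, failures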

all_artifacts_df = ln.Artifact.filter(
    key__startswith="mini_text_files/file", storage__root="s3://lamindata"
).df()
print(all_artifacts_df.shape)
all_artifacts_df.head()
(10000, 23)
uid key description suffix kind otype size hash n_files n_observations _hash_type _key_is_virtual _overwrite_versions space_id storage_id schema_id version is_latest run_id created_at created_by_id _aux _branch_code
id
1699 2qBNr2ICBnMS8JSC0000 mini_text_files/file32.txt None .txt None None 2 Y2TT8PSVtqudz407XG4LAQ None None md5 False False 1 2 None None True 669 2025-05-05 14:15:55.974243+00:00 9 None 1
1742 FoaS7BF8AZpt0Va80000 mini_text_files/file64.txt None .txt None None 2 6l0vHEYIIy4H06o9mY5RNQ None None md5 False False 1 2 None None True 669 2025-05-05 14:16:01.479023+00:00 9 None 1
1777 wyVcH4vuQUDbKhel0000 mini_text_files/file124.txt None .txt None None 3 yP_ppYexJvFS7T2JoUa0RQ None None md5 False False 1 2 None None True 671 2025-05-05 14:30:08.090943+00:00 9 None 1
1797 orUHL6T6iCdGi6Ie0000 mini_text_files/file122.txt None .txt None None 3 oKCA9C5vE7Oi3xM_BzCV3Q None None md5 False False 1 2 None None True 671 2025-05-05 14:30:08.134994+00:00 9 None 1
1811 eV7EetEXP0Z2X4Bk0000 mini_text_files/file139.txt None .txt None None 3 4A2gO2haDdGPtqCK8JI94A None None md5 False False 1 2 None None True 671 2025-05-05 14:30:13.400788+00:00 9 None 1
partition_type = ln.ULabel(name="MLSplit", is_type=True).save()
ln.ULabel(name="train", type=partition_type).save()
ln.ULabel(name="test", type=partition_type).save()
ln.Feature(name="ml_split", dtype=partition_type).save()
ln.Feature(name="gsm_identifier", dtype=str).save()
ln.Feature(name="dataset_title", dtype=str).save()
→ returning existing ULabel record with same name: 'MLSplit'
→ returning existing ULabel record with same name: 'train'
→ returning existing ULabel record with same name: 'test'
→ returning existing Feature record with same name: 'ml_split'
→ returning existing Feature record with same name: 'gsm_identifier'
→ returning existing Feature record with same name: 'dataset_title'
Feature(uid='nOtEtJK1YiU1', name='dataset_title', dtype='str', array_rank=0, array_size=0, space_id=1, created_by_id=9, run_id=665, created_at=2025-05-05 06:46:34 UTC)
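To double-check what now exists in the registries, you can query them directly; a quick inspection sketch using only the fields defined above:

ln.ULabel.filter(type=partition_type).df()  # expect the two labels: train, test
ln.Feature.filter(name__in=["ml_split", "gsm_identifier", "dataset_title"]).df()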

Annotate

random.seed(0)  # seed the RNG so the generated annotations are reproducible across runs

def make_file_key(i: int) -> str:
    return f"mini_text_files/file{i}.txt"

metadata = []
keys = []
for i in range(1, n_total_files+1):
    ml_split = "train" if random.random() < 0.7 else "test"
    gsm_identifier = f"GSM{100000 + i}"
    dataset_title = f"Sample file {i}"
    keys.append(make_file_key(i))
    metadata.append({
        "ml_split": ml_split,
        "gsm_identifier": gsm_identifier,
        "dataset_title": dataset_title,
    })

metadata_df = pd.DataFrame(metadata, index=keys)
print(metadata_df.shape)
metadata_df.head()
(10000, 3)
ml_split gsm_identifier dataset_title
mini_text_files/file1.txt test GSM100001 Sample file 1
mini_text_files/file2.txt test GSM100002 Sample file 2
mini_text_files/file3.txt train GSM100003 Sample file 3
mini_text_files/file4.txt train GSM100004 Sample file 4
mini_text_files/file5.txt train GSM100005 Sample file 5
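As a sanity check on the seeded 70/30 split, count the values; with seed 0 this should match the 6998 train / 3002 test split that the ml_split query below returns:

metadata_df["ml_split"].value_counts()
# train    6998
# test     3002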
def annotate_artifact(artifact):
    """Look up this artifact's metadata row by key and attach it as feature values."""
    metadata = metadata_df.loc[artifact.key].to_dict()
    artifact.features.add_values(metadata)
    return True

artifacts_to_annotate = ln.Artifact.filter(ml_split__isnull=True).filter(
    key__startswith="mini_text_files/file", storage__root="s3://lamindata"
)
print(f"Found {len(artifacts_to_annotate)} artifacts that need to be annotated")
artifacts_to_annotate.df().head()
Found 9000 artifacts that need to be annotated
uid key description suffix kind otype size hash n_files n_observations _hash_type _key_is_virtual _overwrite_versions space_id storage_id schema_id version is_latest run_id created_at created_by_id _aux _branch_code
id
2966 Qpdu2dgW0hi21KyX0000 mini_text_files/file1292.txt None .txt None None 4 RZpN3LWG8k79k5WqdmK8fA None None md5 False False 1 2 None None True 673 2025-05-06 14:59:25.887850+00:00 9 None 1
2975 qRH12KhBL5oHJpDJ0000 mini_text_files/file1302.txt None .txt None None 4 mWAJ8jdABmBvTAsP2oeK8Q None None md5 False False 1 2 None None True 673 2025-05-06 14:59:25.961112+00:00 9 None 1
2984 RnfefezTDtmN24Jr0000 mini_text_files/file1308.txt None .txt None None 4 oIcsxbXKTMJQdvPYaOG9-A None None md5 False False 1 2 None None True 673 2025-05-06 14:59:26.020073+00:00 9 None 1
2992 UwES07lwWq2kwIRB0000 mini_text_files/file1320.txt None .txt None None 4 KlDpwta4m5W8tBbWhX-LRQ None None md5 False False 1 2 None None True 673 2025-05-06 14:59:26.041509+00:00 9 None 1
2999 lgDdIgot6mPPa51m0000 mini_text_files/file1324.txt None .txt None None 4 u3lG59hcganmn-4c6koIfA None None md5 False False 1 2 None None True 673 2025-05-06 14:59:26.827474+00:00 9 None 1
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(tqdm(
        executor.map(annotate_artifact, artifacts_to_annotate),
        total=len(artifacts_to_annotate),
        desc="Annotating artifacts"
    ))
Annotating artifacts: 100%|██████████| 9000/9000 [10:35<00:00, 14.16it/s]
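To verify nothing was missed, re-run the same null filter; it should now come back empty:

remaining = ln.Artifact.filter(ml_split__isnull=True).filter(
    key__startswith="mini_text_files/file", storage__root="s3://lamindata"
)
assert len(remaining) == 0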

Queries

features_list = ["ml_split", "dataset_title", "gsm_identifier"]

Query by feature gsm_identifier.

ln.Artifact.filter(gsm_identifier="GSM100003").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
1667 RZk5OYUpFTB5BNWp0000 mini_text_files/file3.txt train Sample DataFrame 3 GSM100003
ln.Artifact.filter(gsm_identifier__icontains="GSM10000").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
1673 alyKMkHYCeucH8mq0000 mini_text_files/file9.txt train Sample DataFrame 9 GSM100009
1667 RZk5OYUpFTB5BNWp0000 mini_text_files/file3.txt train Sample DataFrame 3 GSM100003
1670 PDQvhHmtjIeSNbTW0000 mini_text_files/file6.txt train Sample DataFrame 6 GSM100006
1672 GdvUQv6YNViDThte0000 mini_text_files/file7.txt test Sample DataFrame 7 GSM100007
1674 BOz0P7EpbR2a2sRM0000 mini_text_files/file2.txt test Sample DataFrame 2 GSM100002
1669 fHVjhPkOmU09dE920000 mini_text_files/file8.txt train Sample DataFrame 8 GSM100008
1666 ydBJrZSq9LpCNykC0000 mini_text_files/file1.txt test Sample DataFrame 1 GSM100001
1668 urXtsQcArMXfpUbl0000 mini_text_files/file4.txt train Sample DataFrame 4 GSM100004
1671 43AMhTN70OAKBq6c0000 mini_text_files/file5.txt train Sample DataFrame 5 GSM100005

Query by feature ml_split.

ln.Artifact.filter(ml_split="train").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
2151 8eic3PVpsojyV0v80000 mini_text_files/file483.txt train Sample file 483 GSM100483
2149 SDec5egwYaGinDMY0000 mini_text_files/file480.txt train Sample file 480 GSM100480
2152 UBTURUbnAw0q3JD50000 mini_text_files/file479.txt train Sample file 479 GSM100479
2467 OV7mzFhzL5DDFi6q0000 mini_text_files/file797.txt train Sample file 797 GSM100797
2458 b76FlSrzCicKlztK0000 mini_text_files/file785.txt train Sample file 785 GSM100785
... ... ... ... ... ...
10674 CL55naGixFziqEJK0000 mini_text_files/file9000.txt train Sample file 9000 GSM109000
11652 tt6tEOibnFbDzQcb0000 mini_text_files/file9978.txt train Sample file 9978 GSM109978
11480 m8IksNxubZlc5ZW30000 mini_text_files/file9806.txt train Sample file 9806 GSM109806
10870 wMJtyegLZheGFaL40000 mini_text_files/file9196.txt train Sample file 9196 GSM109196
10939 zSQDb9TXdJ8pTLJ30000 mini_text_files/file9264.txt train Sample file 9264 GSM109264

6998 rows × 5 columns

Query by multiple features.

ln.Artifact.filter(ml_split="train", gsm_identifier__icontains="GSM10000").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
1673 alyKMkHYCeucH8mq0000 mini_text_files/file9.txt train Sample DataFrame 9 GSM100009
1667 RZk5OYUpFTB5BNWp0000 mini_text_files/file3.txt train Sample DataFrame 3 GSM100003
1670 PDQvhHmtjIeSNbTW0000 mini_text_files/file6.txt train Sample DataFrame 6 GSM100006
1669 fHVjhPkOmU09dE920000 mini_text_files/file8.txt train Sample DataFrame 8 GSM100008
1668 urXtsQcArMXfpUbl0000 mini_text_files/file4.txt train Sample DataFrame 4 GSM100004
1671 43AMhTN70OAKBq6c0000 mini_text_files/file5.txt train Sample DataFrame 5 GSM100005
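filter() combines keyword conditions with AND. For OR conditions on registry fields, Django-style Q expressions can be combined with |; a sketch, assuming ln.Q is exposed in this lamindb version:

ln.Artifact.filter(
    ln.Q(key="mini_text_files/file3.txt") | ln.Q(key="mini_text_files/file5.txt")
).df(features=features_list)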

Query with registry fields.

ln.Artifact.filter(key__startswith="mini_text_files/file", storage__root="s3://lamindata").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
2151 8eic3PVpsojyV0v80000 mini_text_files/file483.txt train Sample file 483 GSM100483
2149 SDec5egwYaGinDMY0000 mini_text_files/file480.txt train Sample file 480 GSM100480
2152 UBTURUbnAw0q3JD50000 mini_text_files/file479.txt train Sample file 479 GSM100479
2121 GNWk27g3gb8TReaf0000 mini_text_files/file453.txt test Sample file 453 GSM100453
2122 F1hspQds1qoGu0BE0000 mini_text_files/file452.txt test Sample file 452 GSM100452
... ... ... ... ... ...
11480 m8IksNxubZlc5ZW30000 mini_text_files/file9806.txt train Sample file 9806 GSM109806
11256 i0sZHtLsSnzWH2oo0000 mini_text_files/file9583.txt test Sample file 9583 GSM109583
10870 wMJtyegLZheGFaL40000 mini_text_files/file9196.txt train Sample file 9196 GSM109196
10424 4u3a58rIXMKGk40K0000 mini_text_files/file8751.txt test Sample file 8751 GSM108751
10939 zSQDb9TXdJ8pTLJ30000 mini_text_files/file9264.txt train Sample file 9264 GSM109264

10000 rows × 5 columns
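Query sets are lazy and chainable, so you can order and slice before materializing a DataFrame; a sketch, assuming the standard Django-style order_by and slicing carry through to .df():

(
    ln.Artifact.filter(key__startswith="mini_text_files/file", storage__root="s3://lamindata")
    .order_by("key")[:5]
    .df(features=features_list)
)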

Look at an example artifact.

example_artifact = ln.Artifact.get(key="mini_text_files/file5.txt")

Describe.

example_artifact.describe()
Artifact .txt
├── General
│   ├── .uid = '43AMhTN70OAKBq6c0000'
│   ├── .key = 'mini_text_files/file5.txt'
│   ├── .size = 1
│   ├── .hash = '5No7f7vOI0XXdysGdKMY1Q'
│   ├── .path = s3://lamindata/mini_text_files/file5.txt
│   ├── .created_by = falexwolf (Alex Wolf)
│   ├── .created_at = 2025-05-05 14:48:06
│   └── .transform = 'Demonstrate bulk file annotation with flexible metadata'
├── Linked features
│   └── ml_split                    cat[ULabel[MLSplit]]       train                                    
│       dataset_title               str                        Sample DataFrame 5                       
│       gsm_identifier              str                        GSM100005                                
└── Labels
    └── .projects                   Project                    Bulk file annotations & queries          
        .ulabels                    ULabel                     train                                    

Get values as a dictionary.

example_artifact.features.get_values()
{'ml_split': 'train',
 'dataset_title': 'Sample DataFrame 5',
 'gsm_identifier': 'GSM100005'}
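Because get_values() returns a plain dict, it composes with ordinary Python; for example, a small sketch collecting the annotations of a few test-split artifacts into a DataFrame:

test_artifacts = ln.Artifact.filter(ml_split="test")[:3]
pd.DataFrame([{"key": a.key, **a.features.get_values()} for a in test_artifacts])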
ln.finish()