import lamindb as ln
import pandas as pd
import random
import os
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
→ connected lamindb: laminlabs/lamindata
n_total_files = 10000
max_workers = min(32, (os.cpu_count() or 1) * 4)  # cap thread count; os.cpu_count() can return None

ln.track("fT6b1C7oP9p7", params={"n_total_files": n_total_files}, project="Bulk file annotations & queries")
→ loaded Transform('fT6b1C7oP9p70006'), re-started Run('Cp7E0QMb...') at 2025-05-06 14:58:52 UTC
→ params: n_total_files=10000
→ notebook imports: lamindb==1.5rc1 pandas==2.2.3 tqdm==4.67.1

Register existing files in S3 as artifacts

def natural_sort_key(s):
    """Sort key that orders embedded integers numerically (file2 before file10)."""
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(r'(\d+)', str(s))]
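For illustration, natural sorting orders file2 before file10, which plain lexicographic sorting would not:

sorted(["file10.txt", "file2.txt", "file1.txt"], key=natural_sort_key)
# ['file1.txt', 'file2.txt', 'file10.txt']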

all_paths = list(ln.UPath("s3://lamindata/mini_text_files").glob("*.txt"))
all_paths.sort(key=natural_sort_key)
selected_paths = all_paths[:n_total_files]
registered_artifacts = ln.Artifact.filter(key__startswith="mini_text_files/", storage__root="s3://lamindata")
registered_keys = set(registered_artifacts.values_list("key", flat=True))
paths_to_register = [
    path for path in selected_paths
    if str(path).removeprefix("s3://lamindata/") not in registered_keys
]
print(f"Found {len(paths_to_register)} files that need to be registered")
paths_to_register[:5]
Found 8935 files that need to be registered
[S3QueryPath('s3://lamindata/mini_text_files/file1066.txt'),
 S3QueryPath('s3://lamindata/mini_text_files/file1067.txt'),
 S3QueryPath('s3://lamindata/mini_text_files/file1068.txt'),
 S3QueryPath('s3://lamindata/mini_text_files/file1069.txt'),
 S3QueryPath('s3://lamindata/mini_text_files/file1070.txt')]
def register_file(path):
    """Create and save a single ln.Artifact for a path."""
    return ln.Artifact(path).save()


if paths_to_register:
    print(f"Registering {len(paths_to_register)} artifacts using {max_workers} threads")
    
    new_artifacts = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for artifact in tqdm(
            executor.map(register_file, paths_to_register),
            total=len(paths_to_register),
            desc="Registering artifacts"
        ):
            new_artifacts.append(artifact)
    
    print(f"Successfully registered {len(new_artifacts)} new artifacts")
Registering 8935 artifacts using 32 threads
Registering artifacts: 100%|██████████| 8935/8935 [04:38<00:00, 32.11it/s]
Successfully registered 8935 new artifacts
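If individual registrations can fail transiently (network or S3 hiccups), a submit/as_completed variant collects failures without aborting the whole batch; a minimal sketch reusing register_file, with the retry policy left to you:

from concurrent.futures import as_completed

def register_all(paths):
    """Register paths in parallel; return saved artifacts plus any failures."""
    artifacts, failures = [], []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(register_file, p): p for p in paths}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Registering artifacts"):
            try:
                artifacts.append(future.result())
            except Exception as exc:  # e.g., a transient botocore error
                failures.append((futures[future], exc))
    return artifacts, failures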

all_artifacts_df = ln.Artifact.filter(
    key__startswith="mini_text_files/file", storage__root="s3://lamindata"
).df()
print(all_artifacts_df.shape)
all_artifacts_df.head()
(10000, 23)
uid key description suffix kind otype size hash n_files n_observations _hash_type _key_is_virtual _overwrite_versions space_id storage_id schema_id version is_latest run_id created_at created_by_id _aux _branch_code
id
1699 2qBNr2ICBnMS8JSC0000 mini_text_files/file32.txt None .txt None None 2 Y2TT8PSVtqudz407XG4LAQ None None md5 False False 1 2 None None True 669 2025-05-05 14:15:55.974243+00:00 9 None 1
1742 FoaS7BF8AZpt0Va80000 mini_text_files/file64.txt None .txt None None 2 6l0vHEYIIy4H06o9mY5RNQ None None md5 False False 1 2 None None True 669 2025-05-05 14:16:01.479023+00:00 9 None 1
1777 wyVcH4vuQUDbKhel0000 mini_text_files/file124.txt None .txt None None 3 yP_ppYexJvFS7T2JoUa0RQ None None md5 False False 1 2 None None True 671 2025-05-05 14:30:08.090943+00:00 9 None 1
1797 orUHL6T6iCdGi6Ie0000 mini_text_files/file122.txt None .txt None None 3 oKCA9C5vE7Oi3xM_BzCV3Q None None md5 False False 1 2 None None True 671 2025-05-05 14:30:08.134994+00:00 9 None 1
1811 eV7EetEXP0Z2X4Bk0000 mini_text_files/file139.txt None .txt None None 3 4A2gO2haDdGPtqCK8JI94A None None md5 False False 1 2 None None True 671 2025-05-05 14:30:13.400788+00:00 9 None 1
partition_type = ln.ULabel(name="MLSplit", is_type=True).save()
ln.ULabel(name="train", type=partition_type).save()
ln.ULabel(name="test", type=partition_type).save()
ln.Feature(name="ml_split", dtype=partition_type).save()
ln.Feature(name="gsm_identifier", dtype=str).save()
ln.Feature(name="dataset_title", dtype=str).save()
→ returning existing ULabel record with same name: 'MLSplit'
→ returning existing ULabel record with same name: 'train'
→ returning existing ULabel record with same name: 'test'
→ returning existing Feature record with same name: 'ml_split'
→ returning existing Feature record with same name: 'gsm_identifier'
→ returning existing Feature record with same name: 'dataset_title'
Feature(uid='nOtEtJK1YiU1', name='dataset_title', dtype='str', array_rank=0, array_size=0, space_id=1, created_by_id=9, run_id=665, created_at=2025-05-05 06:46:34 UTC)
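To double-check what now exists in the registries, you can query them directly; a quick inspection sketch using only the fields defined above:

ln.ULabel.filter(type=partition_type).df()  # expect the two labels: train, test
ln.Feature.filter(name__in=["ml_split", "gsm_identifier", "dataset_title"]).df()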

Annotate

random.seed(0)  # seed the RNG so the generated annotations are reproducible across runs

def make_file_key(i: int) -> str:
    return f"mini_text_files/file{i}.txt"

metadata = []
keys = []
for i in range(1, n_total_files+1):
    ml_split = "train" if random.random() < 0.7 else "test"
    gsm_identifier = f"GSM{100000 + i}"
    dataset_title = f"Sample file {i}"
    keys.append(make_file_key(i))
    metadata.append({
        "ml_split": ml_split,
        "gsm_identifier": gsm_identifier,
        "dataset_title": dataset_title,
    })

metadata_df = pd.DataFrame(metadata, index=keys)
print(metadata_df.shape)
metadata_df.head()
(10000, 3)
ml_split gsm_identifier dataset_title
mini_text_files/file1.txt test GSM100001 Sample file 1
mini_text_files/file2.txt test GSM100002 Sample file 2
mini_text_files/file3.txt train GSM100003 Sample file 3
mini_text_files/file4.txt train GSM100004 Sample file 4
mini_text_files/file5.txt train GSM100005 Sample file 5
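As a sanity check on the seeded 70/30 split, count the values; with seed 0 this should match the 6998 train / 3002 test split that the ml_split query below returns:

metadata_df["ml_split"].value_counts()
# train    6998
# test     3002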
def annotate_artifact(artifact):
    """Look up this artifact's metadata row by key and attach it as feature values."""
    metadata = metadata_df.loc[artifact.key].to_dict()
    artifact.features.add_values(metadata)
    return True

artifacts_to_annotate = ln.Artifact.filter(ml_split__isnull=True).filter(
    key__startswith="mini_text_files/file", storage__root="s3://lamindata"
)
print(f"Found {len(artifacts_to_annotate)} artifacts that need to be annotated")
artifacts_to_annotate.df().head()
Found 9000 artifacts that need to be annotated
uid key description suffix kind otype size hash n_files n_observations _hash_type _key_is_virtual _overwrite_versions space_id storage_id schema_id version is_latest run_id created_at created_by_id _aux _branch_code
id
2966 Qpdu2dgW0hi21KyX0000 mini_text_files/file1292.txt None .txt None None 4 RZpN3LWG8k79k5WqdmK8fA None None md5 False False 1 2 None None True 673 2025-05-06 14:59:25.887850+00:00 9 None 1
2975 qRH12KhBL5oHJpDJ0000 mini_text_files/file1302.txt None .txt None None 4 mWAJ8jdABmBvTAsP2oeK8Q None None md5 False False 1 2 None None True 673 2025-05-06 14:59:25.961112+00:00 9 None 1
2984 RnfefezTDtmN24Jr0000 mini_text_files/file1308.txt None .txt None None 4 oIcsxbXKTMJQdvPYaOG9-A None None md5 False False 1 2 None None True 673 2025-05-06 14:59:26.020073+00:00 9 None 1
2992 UwES07lwWq2kwIRB0000 mini_text_files/file1320.txt None .txt None None 4 KlDpwta4m5W8tBbWhX-LRQ None None md5 False False 1 2 None None True 673 2025-05-06 14:59:26.041509+00:00 9 None 1
2999 lgDdIgot6mPPa51m0000 mini_text_files/file1324.txt None .txt None None 4 u3lG59hcganmn-4c6koIfA None None md5 False False 1 2 None None True 673 2025-05-06 14:59:26.827474+00:00 9 None 1
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(tqdm(
        executor.map(annotate_artifact, artifacts_to_annotate),
        total=len(artifacts_to_annotate),
        desc="Annotating artifacts"
    ))
Annotating artifacts: 100%|██████████| 9000/9000 [10:35<00:00, 14.16it/s]
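To verify nothing was missed, re-run the same null filter; it should now come back empty:

remaining = ln.Artifact.filter(ml_split__isnull=True).filter(
    key__startswith="mini_text_files/file", storage__root="s3://lamindata"
)
assert len(remaining) == 0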

Queries

features_list = ["ml_split", "dataset_title", "gsm_identifier"]

Query by feature gsm_identifier.

ln.Artifact.filter(gsm_identifier="GSM100003").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
1667 RZk5OYUpFTB5BNWp0000 mini_text_files/file3.txt train Sample DataFrame 3 GSM100003
ln.Artifact.filter(gsm_identifier__icontains="GSM10000").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
1673 alyKMkHYCeucH8mq0000 mini_text_files/file9.txt train Sample DataFrame 9 GSM100009
1667 RZk5OYUpFTB5BNWp0000 mini_text_files/file3.txt train Sample DataFrame 3 GSM100003
1670 PDQvhHmtjIeSNbTW0000 mini_text_files/file6.txt train Sample DataFrame 6 GSM100006
1672 GdvUQv6YNViDThte0000 mini_text_files/file7.txt test Sample DataFrame 7 GSM100007
1674 BOz0P7EpbR2a2sRM0000 mini_text_files/file2.txt test Sample DataFrame 2 GSM100002
1669 fHVjhPkOmU09dE920000 mini_text_files/file8.txt train Sample DataFrame 8 GSM100008
1666 ydBJrZSq9LpCNykC0000 mini_text_files/file1.txt test Sample DataFrame 1 GSM100001
1668 urXtsQcArMXfpUbl0000 mini_text_files/file4.txt train Sample DataFrame 4 GSM100004
1671 43AMhTN70OAKBq6c0000 mini_text_files/file5.txt train Sample DataFrame 5 GSM100005

Query by feature ml_split.

ln.Artifact.filter(ml_split="train").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
2151 8eic3PVpsojyV0v80000 mini_text_files/file483.txt train Sample file 483 GSM100483
2149 SDec5egwYaGinDMY0000 mini_text_files/file480.txt train Sample file 480 GSM100480
2152 UBTURUbnAw0q3JD50000 mini_text_files/file479.txt train Sample file 479 GSM100479
2467 OV7mzFhzL5DDFi6q0000 mini_text_files/file797.txt train Sample file 797 GSM100797
2458 b76FlSrzCicKlztK0000 mini_text_files/file785.txt train Sample file 785 GSM100785
... ... ... ... ... ...
10674 CL55naGixFziqEJK0000 mini_text_files/file9000.txt train Sample file 9000 GSM109000
11652 tt6tEOibnFbDzQcb0000 mini_text_files/file9978.txt train Sample file 9978 GSM109978
11480 m8IksNxubZlc5ZW30000 mini_text_files/file9806.txt train Sample file 9806 GSM109806
10870 wMJtyegLZheGFaL40000 mini_text_files/file9196.txt train Sample file 9196 GSM109196
10939 zSQDb9TXdJ8pTLJ30000 mini_text_files/file9264.txt train Sample file 9264 GSM109264

6998 rows × 5 columns

Query by multiple features.

ln.Artifact.filter(ml_split="train", gsm_identifier__icontains="GSM10000").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
1673 alyKMkHYCeucH8mq0000 mini_text_files/file9.txt train Sample DataFrame 9 GSM100009
1667 RZk5OYUpFTB5BNWp0000 mini_text_files/file3.txt train Sample DataFrame 3 GSM100003
1670 PDQvhHmtjIeSNbTW0000 mini_text_files/file6.txt train Sample DataFrame 6 GSM100006
1669 fHVjhPkOmU09dE920000 mini_text_files/file8.txt train Sample DataFrame 8 GSM100008
1668 urXtsQcArMXfpUbl0000 mini_text_files/file4.txt train Sample DataFrame 4 GSM100004
1671 43AMhTN70OAKBq6c0000 mini_text_files/file5.txt train Sample DataFrame 5 GSM100005
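filter() combines keyword conditions with AND. For OR conditions on registry fields, Django-style Q expressions can be combined with |; a sketch, assuming ln.Q is exposed in this lamindb version:

ln.Artifact.filter(
    ln.Q(key="mini_text_files/file3.txt") | ln.Q(key="mini_text_files/file5.txt")
).df(features=features_list)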

Query with registry fields.

ln.Artifact.filter(key__startswith="mini_text_files/file", storage__root="s3://lamindata").df(features=features_list)
uid key ml_split dataset_title gsm_identifier
id
2151 8eic3PVpsojyV0v80000 mini_text_files/file483.txt train Sample file 483 GSM100483
2149 SDec5egwYaGinDMY0000 mini_text_files/file480.txt train Sample file 480 GSM100480
2152 UBTURUbnAw0q3JD50000 mini_text_files/file479.txt train Sample file 479 GSM100479
2121 GNWk27g3gb8TReaf0000 mini_text_files/file453.txt test Sample file 453 GSM100453
2122 F1hspQds1qoGu0BE0000 mini_text_files/file452.txt test Sample file 452 GSM100452
... ... ... ... ... ...
11480 m8IksNxubZlc5ZW30000 mini_text_files/file9806.txt train Sample file 9806 GSM109806
11256 i0sZHtLsSnzWH2oo0000 mini_text_files/file9583.txt test Sample file 9583 GSM109583
10870 wMJtyegLZheGFaL40000 mini_text_files/file9196.txt train Sample file 9196 GSM109196
10424 4u3a58rIXMKGk40K0000 mini_text_files/file8751.txt test Sample file 8751 GSM108751
10939 zSQDb9TXdJ8pTLJ30000 mini_text_files/file9264.txt train Sample file 9264 GSM109264

10000 rows × 5 columns
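Query sets are lazy and chainable, so you can order and slice before materializing a DataFrame; a sketch, assuming the standard Django-style order_by and slicing carry through to .df():

(
    ln.Artifact.filter(key__startswith="mini_text_files/file", storage__root="s3://lamindata")
    .order_by("key")[:5]
    .df(features=features_list)
)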

Look at an example artifact.

example_artifact = ln.Artifact.get(key="mini_text_files/file5.txt")

Describe.

example_artifact.describe()
Artifact .txt
├── General
│   ├── .uid = '43AMhTN70OAKBq6c0000'
│   ├── .key = 'mini_text_files/file5.txt'
│   ├── .size = 1
│   ├── .hash = '5No7f7vOI0XXdysGdKMY1Q'
│   ├── .path = s3://lamindata/mini_text_files/file5.txt
│   ├── .created_by = falexwolf (Alex Wolf)
│   ├── .created_at = 2025-05-05 14:48:06
│   └── .transform = 'Demonstrate bulk file annotation with flexible metadata'
├── Linked features
│   └── ml_split                    cat[ULabel[MLSplit]]       train                                    
│       dataset_title               str                        Sample DataFrame 5                       
│       gsm_identifier              str                        GSM100005                                
└── Labels
    └── .projects                   Project                    Bulk file annotations & queries          
        .ulabels                    ULabel                     train                                    

Get values as a dictionary.

example_artifact.features.get_values()
{'ml_split': 'train',
 'dataset_title': 'Sample DataFrame 5',
 'gsm_identifier': 'GSM100005'}
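Because get_values() returns a plain dict, it composes with ordinary Python; for example, a small sketch collecting the annotations of a few test-split artifacts into a DataFrame:

test_artifacts = ln.Artifact.filter(ml_split="test")[:3]
pd.DataFrame([{"key": a.key, **a.features.get_values()} for a in test_artifacts])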
ln.finish()