Query data#

Querying is based on SQLAlchemy select statements.

LaminDB comes with useful default entities to query for, centered around data lineage: lamindb.schema.

You can also readily get started with querying for biological entities: Track feature-level metadata.

import lamindb as ln
import lamindb.schema as lns

ln.track()
ℹ️ Instance: testuser1/mydata
ℹ️ User: testuser1
ℹ️ Added notebook: Transform(id='vldHzF3aTAiW', v='0', name='07-select', type=notebook, title='Query data', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 41))
ℹ️ Added run: Run(id='OvcwilVppkGAnRY3YcH5', transform_id='vldHzF3aTAiW', transform_v='0', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 41))

Basic select statements#

LaminDB’s select() statements offer everything of SQLAlchemy, but can be executed in one line using .all() or .df(), targeting ML and data scientists.

stmt = (
    ln.select(ln.File)
    .join(ln.Run)
    .join(ln.Transform)
    .join(lns.User, handle="testuser1")
)

To access the query results encoded in stmt (a SelectStmt), execute the statement with one of

  • .all(): A list of records.

  • .df(): A dataframe with each record stored as a row.

  • .one(): Exactly one record. Will raise an error if there is none.

  • .one_or_none(): Either one record or None if there is no query result.

For example:

stmt.all()
[File(id='VWybnO7aZxo8NMu80eHp', name='paradisi05_laminopathic_nuclei', suffix='.jpg', size=29358, hash='r4tnqmKI_SjrkdLzpuWp4g', source_id='bpttr5hRLo73B4m2co3v', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 15, 50)),
 File(id='sfqjeqshOu4n2OCrxGj4', name='iris', suffix='.parquet', size=5629, hash='jUTdERuqlGv_GyqFfIEb2Q', source_id='aUvAkAwVxam9tDQcKfro', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 15, 58)),
 File(id='oocXPPbvFX1fYuFt5i6D', name='mini', suffix='.csv', size=11, hash='z1LdF2qN4cN0M2sXrcW8aw', source_id='obsRwVGjOgOautAvjeiS', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 7)),
 File(id='lM7fgzdhxYBCYVbQvt5V', name='metrics_summary', suffix='.csv', size=6, hash='Qt326UFWQibtvzRP1mhnJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='ABx9QrIk3ZfKMKT1zZ54', name='raw_feature_bc_matrix', suffix='.h5', size=6, hash='YeBfpVEGZriQhUNira-qiA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='6DuGMqan46UGUU0S7Ivp', name='possorted_genome_bam', suffix='.bam.bai', size=6, hash='BJDp79QxGfDAds40LMLUHw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='4kEgNOBUIpNORbubLdQu', name='web_summary', suffix='.html', size=6, hash='P3VFbegx8Uvt70i82pN4kA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='4spvfwRkeBXOXhdIEpaL', name='cloupe', suffix='.cloupe', size=6, hash='kRJKl4U-rCLESg8i6Tk1QA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='kId2tfidnfLUT9JatfJG', name='possorted_genome_bam', suffix='.bam', size=6, hash='HguFGkYNsZBrkJ-7K3mLBw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='pN3ap4esn0TBkJEsb4DB', name='molecule_info', suffix='.h5', size=6, hash='YBZ-JAAuZwNT6mjdLqwtGA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='THjan9RLY3JUotzHlYbq', name='filtered_feature_bc_matrix', suffix='.h5', size=6, hash='UyuUA2YXfAJBEefQw_wVhA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='jzC6yuaVbt7if2dSso9z', name='features', suffix='.tsv.gz', size=6, hash='CPLWI4kM2TYtpVA1GP4B4g', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='DEN1tlUq2AtRJuk02iOZ', name='barcodes', suffix='.tsv.gz', size=6, hash='HkZpiKOqDM14o_BdT-jdRg', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='AQ4Cs8pvSSZXLlffNXUZ', name='matrix', suffix='.mtx.gz', size=6, hash='0A_HXnvE3DwCXRHYtSknJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='R8QdanNZelXhxQsfFP0n', name='analysis', suffix='.csv', size=6, hash='dYsOqf4SMpTH5HqeAVgCHw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='hwALCxnzc9TC1Q2pF9mM', name='features', suffix='.tsv.gz', size=6, hash='amZgim1akXXVSHHxQdCrNA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='CA3s57VaDTleG1Wd21wX', name='barcodes', suffix='.tsv.gz', size=6, hash='odGzhNIQSkMOkaVhbkyU5w', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='NgLuRTl6WDHt8bhjZfhj', name='matrix', suffix='.mtx.gz', size=6, hash='B9zq5Zvi4gJTGKSCRSn7zQ', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]

It’s often the most convenient to use the built-in converter to DataFrames.

stmt.df()
name suffix size hash source_id storage_id created_at updated_at
id
VWybnO7aZxo8NMu80eHp paradisi05_laminopathic_nuclei .jpg 29358 r4tnqmKI_SjrkdLzpuWp4g bpttr5hRLo73B4m2co3v 8Pj12JLb 2023-03-30 23:15:50 None
sfqjeqshOu4n2OCrxGj4 iris .parquet 5629 jUTdERuqlGv_GyqFfIEb2Q aUvAkAwVxam9tDQcKfro 8Pj12JLb 2023-03-30 23:15:58 None
oocXPPbvFX1fYuFt5i6D mini .csv 11 z1LdF2qN4cN0M2sXrcW8aw obsRwVGjOgOautAvjeiS 8Pj12JLb 2023-03-30 23:16:07 None
lM7fgzdhxYBCYVbQvt5V metrics_summary .csv 6 Qt326UFWQibtvzRP1mhnJw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
ABx9QrIk3ZfKMKT1zZ54 raw_feature_bc_matrix .h5 6 YeBfpVEGZriQhUNira-qiA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
6DuGMqan46UGUU0S7Ivp possorted_genome_bam .bam.bai 6 BJDp79QxGfDAds40LMLUHw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
4kEgNOBUIpNORbubLdQu web_summary .html 6 P3VFbegx8Uvt70i82pN4kA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
4spvfwRkeBXOXhdIEpaL cloupe .cloupe 6 kRJKl4U-rCLESg8i6Tk1QA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
kId2tfidnfLUT9JatfJG possorted_genome_bam .bam 6 HguFGkYNsZBrkJ-7K3mLBw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
pN3ap4esn0TBkJEsb4DB molecule_info .h5 6 YBZ-JAAuZwNT6mjdLqwtGA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
THjan9RLY3JUotzHlYbq filtered_feature_bc_matrix .h5 6 UyuUA2YXfAJBEefQw_wVhA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
jzC6yuaVbt7if2dSso9z features .tsv.gz 6 CPLWI4kM2TYtpVA1GP4B4g 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
DEN1tlUq2AtRJuk02iOZ barcodes .tsv.gz 6 HkZpiKOqDM14o_BdT-jdRg 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
AQ4Cs8pvSSZXLlffNXUZ matrix .mtx.gz 6 0A_HXnvE3DwCXRHYtSknJw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
R8QdanNZelXhxQsfFP0n analysis .csv 6 dYsOqf4SMpTH5HqeAVgCHw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
hwALCxnzc9TC1Q2pF9mM features .tsv.gz 6 amZgim1akXXVSHHxQdCrNA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
CA3s57VaDTleG1Wd21wX barcodes .tsv.gz 6 odGzhNIQSkMOkaVhbkyU5w 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None
NgLuRTl6WDHt8bhjZfhj matrix .mtx.gz 6 B9zq5Zvi4gJTGKSCRSn7zQ 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None

Arbitrary exploded views#

Say we want all user information in this table.

stmt = (
    ln.select(ln.File, lns.User)
    .join(ln.Run, ln.File.source_id == ln.Run.id)
    .join(ln.Transform)
    .join(lns.User)
)
stmt.df()
core.file core.user
id name suffix size hash source_id storage_id created_at updated_at id email handle name created_at updated_at
0 VWybnO7aZxo8NMu80eHp paradisi05_laminopathic_nuclei .jpg 29358 r4tnqmKI_SjrkdLzpuWp4g bpttr5hRLo73B4m2co3v 8Pj12JLb 2023-03-30 23:15:50 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
1 sfqjeqshOu4n2OCrxGj4 iris .parquet 5629 jUTdERuqlGv_GyqFfIEb2Q aUvAkAwVxam9tDQcKfro 8Pj12JLb 2023-03-30 23:15:58 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
2 oocXPPbvFX1fYuFt5i6D mini .csv 11 z1LdF2qN4cN0M2sXrcW8aw obsRwVGjOgOautAvjeiS 8Pj12JLb 2023-03-30 23:16:07 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
3 lM7fgzdhxYBCYVbQvt5V metrics_summary .csv 6 Qt326UFWQibtvzRP1mhnJw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
4 ABx9QrIk3ZfKMKT1zZ54 raw_feature_bc_matrix .h5 6 YeBfpVEGZriQhUNira-qiA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
5 6DuGMqan46UGUU0S7Ivp possorted_genome_bam .bam.bai 6 BJDp79QxGfDAds40LMLUHw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
6 4kEgNOBUIpNORbubLdQu web_summary .html 6 P3VFbegx8Uvt70i82pN4kA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
7 4spvfwRkeBXOXhdIEpaL cloupe .cloupe 6 kRJKl4U-rCLESg8i6Tk1QA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
8 kId2tfidnfLUT9JatfJG possorted_genome_bam .bam 6 HguFGkYNsZBrkJ-7K3mLBw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
9 pN3ap4esn0TBkJEsb4DB molecule_info .h5 6 YBZ-JAAuZwNT6mjdLqwtGA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
10 THjan9RLY3JUotzHlYbq filtered_feature_bc_matrix .h5 6 UyuUA2YXfAJBEefQw_wVhA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
11 jzC6yuaVbt7if2dSso9z features .tsv.gz 6 CPLWI4kM2TYtpVA1GP4B4g 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
12 DEN1tlUq2AtRJuk02iOZ barcodes .tsv.gz 6 HkZpiKOqDM14o_BdT-jdRg 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
13 AQ4Cs8pvSSZXLlffNXUZ matrix .mtx.gz 6 0A_HXnvE3DwCXRHYtSknJw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
14 R8QdanNZelXhxQsfFP0n analysis .csv 6 dYsOqf4SMpTH5HqeAVgCHw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
15 hwALCxnzc9TC1Q2pF9mM features .tsv.gz 6 amZgim1akXXVSHHxQdCrNA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
16 CA3s57VaDTleG1Wd21wX barcodes .tsv.gz 6 odGzhNIQSkMOkaVhbkyU5w 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None
17 NgLuRTl6WDHt8bhjZfhj matrix .mtx.gz 6 B9zq5Zvi4gJTGKSCRSn7zQ 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None DzTjkKse testuser1@lamin.ai testuser1 Test User1 2023-03-30 23:15:36 None

Say, we only want the user handle.

stmt = (
    ln.select(ln.File, lns.User.handle)
    .join(ln.Run, ln.File.source_id == ln.Run.id)
    .join(ln.Transform)
    .join(lns.User)
)
stmt.df()
core.file core.user
id name suffix size hash source_id storage_id created_at updated_at handle
0 VWybnO7aZxo8NMu80eHp paradisi05_laminopathic_nuclei .jpg 29358 r4tnqmKI_SjrkdLzpuWp4g bpttr5hRLo73B4m2co3v 8Pj12JLb 2023-03-30 23:15:50 None testuser1
1 sfqjeqshOu4n2OCrxGj4 iris .parquet 5629 jUTdERuqlGv_GyqFfIEb2Q aUvAkAwVxam9tDQcKfro 8Pj12JLb 2023-03-30 23:15:58 None testuser1
2 oocXPPbvFX1fYuFt5i6D mini .csv 11 z1LdF2qN4cN0M2sXrcW8aw obsRwVGjOgOautAvjeiS 8Pj12JLb 2023-03-30 23:16:07 None testuser1
3 lM7fgzdhxYBCYVbQvt5V metrics_summary .csv 6 Qt326UFWQibtvzRP1mhnJw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
4 ABx9QrIk3ZfKMKT1zZ54 raw_feature_bc_matrix .h5 6 YeBfpVEGZriQhUNira-qiA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
5 6DuGMqan46UGUU0S7Ivp possorted_genome_bam .bam.bai 6 BJDp79QxGfDAds40LMLUHw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
6 4kEgNOBUIpNORbubLdQu web_summary .html 6 P3VFbegx8Uvt70i82pN4kA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
7 4spvfwRkeBXOXhdIEpaL cloupe .cloupe 6 kRJKl4U-rCLESg8i6Tk1QA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
8 kId2tfidnfLUT9JatfJG possorted_genome_bam .bam 6 HguFGkYNsZBrkJ-7K3mLBw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
9 pN3ap4esn0TBkJEsb4DB molecule_info .h5 6 YBZ-JAAuZwNT6mjdLqwtGA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
10 THjan9RLY3JUotzHlYbq filtered_feature_bc_matrix .h5 6 UyuUA2YXfAJBEefQw_wVhA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
11 jzC6yuaVbt7if2dSso9z features .tsv.gz 6 CPLWI4kM2TYtpVA1GP4B4g 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
12 DEN1tlUq2AtRJuk02iOZ barcodes .tsv.gz 6 HkZpiKOqDM14o_BdT-jdRg 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
13 AQ4Cs8pvSSZXLlffNXUZ matrix .mtx.gz 6 0A_HXnvE3DwCXRHYtSknJw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
14 R8QdanNZelXhxQsfFP0n analysis .csv 6 dYsOqf4SMpTH5HqeAVgCHw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
15 hwALCxnzc9TC1Q2pF9mM features .tsv.gz 6 amZgim1akXXVSHHxQdCrNA 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
16 CA3s57VaDTleG1Wd21wX barcodes .tsv.gz 6 odGzhNIQSkMOkaVhbkyU5w 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1
17 NgLuRTl6WDHt8bhjZfhj matrix .mtx.gz 6 B9zq5Zvi4gJTGKSCRSn7zQ 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None testuser1

Say, we only want selected information from all tables.

stmt = (
    ln.select(ln.File.name, ln.File.suffix, ln.File.size, lns.User.handle)
    .join(ln.Run, ln.File.source_id == ln.Run.id)
    .join(ln.Transform)
    .join(lns.User)
)
df = stmt.df()
df
core.file core.user
name suffix size handle
0 paradisi05_laminopathic_nuclei .jpg 29358 testuser1
1 iris .parquet 5629 testuser1
2 mini .csv 11 testuser1
3 metrics_summary .csv 6 testuser1
4 raw_feature_bc_matrix .h5 6 testuser1
5 possorted_genome_bam .bam.bai 6 testuser1
6 web_summary .html 6 testuser1
7 cloupe .cloupe 6 testuser1
8 possorted_genome_bam .bam 6 testuser1
9 molecule_info .h5 6 testuser1
10 filtered_feature_bc_matrix .h5 6 testuser1
11 features .tsv.gz 6 testuser1
12 barcodes .tsv.gz 6 testuser1
13 matrix .mtx.gz 6 testuser1
14 analysis .csv 6 testuser1
15 features .tsv.gz 6 testuser1
16 barcodes .tsv.gz 6 testuser1
17 matrix .mtx.gz 6 testuser1

More filtering#

Let us subset to just the parquet files - we know it’s exactly a single one. So we can get the record using .one().

stmt = (
    ln.select(ln.File, suffix=".parquet")
    .join(ln.Run)
    .join(ln.Transform)
    .join(lns.User, handle="testuser1")
)
stmt.one()
File(id='sfqjeqshOu4n2OCrxGj4', name='iris', suffix='.parquet', size=5629, hash='jUTdERuqlGv_GyqFfIEb2Q', source_id='aUvAkAwVxam9tDQcKfro', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 15, 58))

Or subset to files greater than 10kB. Here, we can’t use keyword arguments, but need an explicit where statement.

stmt = (
    ln.select(ln.File)
    .where(ln.File.size > 1e4)
    .join(ln.Run)
    .join(ln.Transform)
    .join(lns.User)
    .where(lns.User.handle == "testuser1")
)
stmt.df()
name suffix size hash source_id storage_id created_at updated_at
id
VWybnO7aZxo8NMu80eHp paradisi05_laminopathic_nuclei .jpg 29358 r4tnqmKI_SjrkdLzpuWp4g bpttr5hRLo73B4m2co3v 8Pj12JLb 2023-03-30 23:15:50 None

Or select a notebook based on a substring in the name:

ln.select(ln.Transform).where(ln.Transform.name.contains("Track")).df()
id v name type title reference created_by created_at updated_at

Load#

Load data objects into the work environment via load():

file = ln.select(ln.File, name="iris").first()

df = file.load()

If there is a canonical in-memory representation (like a dataframe), data is loaded directly into memory.

df.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0

If no in-memory format can be found, load returns the filepath:

file = ln.select(ln.File).where(ln.File.name.contains("paradisi05")).one()
file.load()
PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/VWybnO7aZxo8NMu80eHp.jpg')