Track folders#

Warning

Currently, only data folders located in the configured storage are supported.

Hide code cell content
!lamin load mydata
ℹ️ Loading instance: testuser1/mydata
import lamindb as ln

ln.track()
ℹ️ Instance: testuser1/mydata
ℹ️ User: testuser1
ℹ️ Added notebook: Transform(id='QrRtGnxmM3Bo', v='0', name='06-folder', type=notebook, title='Track folders', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))
ℹ️ Added run: Run(id='4aFiVMrLvr2yiopRZ1Y6', transform_id='QrRtGnxmM3Bo', transform_v='0', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))

Track a folder and its containing files#

Hide code cell content
ln.dev.datasets.generate_cell_ranger_files(
    "sample_001", ln.setup.settings.instance.storage.root
)
PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/sample_001')
!ls -l './mydata/sample_001/'
total 44
drwxr-xr-x 2 runner docker 4096 Mar 30 23:16 analysis
-rw-r--r-- 1 runner docker    6 Mar 30 23:16 cloupe.cloupe
drwxr-xr-x 2 runner docker 4096 Mar 30 23:16 filtered_feature_bc_matrix
-rw-r--r-- 1 runner docker    6 Mar 30 23:16 filtered_feature_bc_matrix.h5
-rw-r--r-- 1 runner docker    6 Mar 30 23:16 metrics_summary.csv
-rw-r--r-- 1 runner docker    6 Mar 30 23:16 molecule_info.h5
-rw-r--r-- 1 runner docker    6 Mar 30 23:16 possorted_genome_bam.bam
-rw-r--r-- 1 runner docker    6 Mar 30 23:16 possorted_genome_bam.bam.bai
drwxr-xr-x 2 runner docker 4096 Mar 30 23:16 raw_feature_bc_matrix
-rw-r--r-- 1 runner docker    6 Mar 30 23:16 raw_feature_bc_matrix.h5
-rw-r--r-- 1 runner docker    6 Mar 30 23:16 web_summary.html

Let’s pass a directory path to ln.Folder, which creates a Folder record:

folder = ln.Folder(folder="./mydata/sample_001/")

folder
Folder(id='UnFi8UjBQMV09Jaqo3nL', name='sample_001', created_by='DzTjkKse')

This also creates File records corresponding to each of the files inside the folder:

folder.files
[File(id='lM7fgzdhxYBCYVbQvt5V', name='metrics_summary', suffix='.csv', size=6, hash='Qt326UFWQibtvzRP1mhnJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='ABx9QrIk3ZfKMKT1zZ54', name='raw_feature_bc_matrix', suffix='.h5', size=6, hash='YeBfpVEGZriQhUNira-qiA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='6DuGMqan46UGUU0S7Ivp', name='possorted_genome_bam', suffix='.bam.bai', size=6, hash='BJDp79QxGfDAds40LMLUHw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='4kEgNOBUIpNORbubLdQu', name='web_summary', suffix='.html', size=6, hash='P3VFbegx8Uvt70i82pN4kA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='4spvfwRkeBXOXhdIEpaL', name='cloupe', suffix='.cloupe', size=6, hash='kRJKl4U-rCLESg8i6Tk1QA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='kId2tfidnfLUT9JatfJG', name='possorted_genome_bam', suffix='.bam', size=6, hash='HguFGkYNsZBrkJ-7K3mLBw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='pN3ap4esn0TBkJEsb4DB', name='molecule_info', suffix='.h5', size=6, hash='YBZ-JAAuZwNT6mjdLqwtGA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='THjan9RLY3JUotzHlYbq', name='filtered_feature_bc_matrix', suffix='.h5', size=6, hash='UyuUA2YXfAJBEefQw_wVhA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='jzC6yuaVbt7if2dSso9z', name='features', suffix='.tsv.gz', size=6, hash='CPLWI4kM2TYtpVA1GP4B4g', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='DEN1tlUq2AtRJuk02iOZ', name='barcodes', suffix='.tsv.gz', size=6, hash='HkZpiKOqDM14o_BdT-jdRg', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='AQ4Cs8pvSSZXLlffNXUZ', name='matrix', suffix='.mtx.gz', size=6, hash='0A_HXnvE3DwCXRHYtSknJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='R8QdanNZelXhxQsfFP0n', name='analysis', suffix='.csv', size=6, hash='dYsOqf4SMpTH5HqeAVgCHw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='hwALCxnzc9TC1Q2pF9mM', name='features', suffix='.tsv.gz', size=6, hash='amZgim1akXXVSHHxQdCrNA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='CA3s57VaDTleG1Wd21wX', name='barcodes', suffix='.tsv.gz', size=6, hash='odGzhNIQSkMOkaVhbkyU5w', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
 File(id='NgLuRTl6WDHt8bhjZfhj', name='matrix', suffix='.mtx.gz', size=6, hash='B9zq5Zvi4gJTGKSCRSn7zQ', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb')]
ln.add(folder)
Folder(id='UnFi8UjBQMV09Jaqo3nL', name='sample_001', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))

What happens under the hood?#

In the SQL database#

  1. A Folder entry

  2. 15 File entries corresponding to the 15 files inside the directory

  3. A Notebook entry

  4. A Run entry

All of these entries are linked so that you can find the files using any of the metadata fields.

Hide code cell source
ln.select(ln.Folder, name=folder.name).one()
Folder(id='UnFi8UjBQMV09Jaqo3nL', name='sample_001', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))
Hide code cell source
ln.select(ln.Folder).join(ln.File.folders).where(ln.Folder.name == "sample_001").df()
name created_by created_at updated_at
id
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
UnFi8UjBQMV09Jaqo3nL sample_001 DzTjkKse 2023-03-30 23:16:33 None
Hide code cell source
ln.select(ln.schema.Notebook, id=ln.context.transform.id).one()
Transform(id='QrRtGnxmM3Bo', v='0', name='06-folder', type=notebook, title='Track folders', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))
Hide code cell source
ln.select(ln.schema.Run, id=ln.context.run.id).one()
Run(id='4aFiVMrLvr2yiopRZ1Y6', transform_id='QrRtGnxmM3Bo', transform_v='0', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))

View the directory tree#

folder.tree()
sample_001
├── metrics_summary.csv
├── raw_feature_bc_matrix.h5
├── possorted_genome_bam.bam.bai
├── web_summary.html
├── raw_feature_bc_matrix
│   ├── features.tsv.gz
│   ├── barcodes.tsv.gz
│   └── matrix.mtx.gz
├── cloupe.cloupe
├── analysis
│   └── analysis.csv
├── possorted_genome_bam.bam
├── filtered_feature_bc_matrix
│   ├── features.tsv.gz
│   ├── barcodes.tsv.gz
│   └── matrix.mtx.gz
├── molecule_info.h5
└── filtered_feature_bc_matrix.h5

3 directories, 15 files

Find and retrieve files in folder#

Retrieve files from a folder#

with ln.Session() as ss:
    folder = ss.select(ln.Folder, name="sample_001").first()
    files = folder.files
files[:2]
[File(id='4kEgNOBUIpNORbubLdQu', name='web_summary', suffix='.html', size=6, hash='P3VFbegx8Uvt70i82pN4kA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='4spvfwRkeBXOXhdIEpaL', name='cloupe', suffix='.cloupe', size=6, hash='kRJKl4U-rCLESg8i6Tk1QA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]

Retrieve files via their paths relative to the directory#

folder.get(relpath="raw_feature_bc_matrix/features.tsv.gz")
[File(id='jzC6yuaVbt7if2dSso9z', name='features', suffix='.tsv.gz', size=6, hash='CPLWI4kM2TYtpVA1GP4B4g', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]
folder.get(relpath=["analysis/analysis.csv", "raw_feature_bc_matrix/features.tsv.gz"])
[File(id='R8QdanNZelXhxQsfFP0n', name='analysis', suffix='.csv', size=6, hash='dYsOqf4SMpTH5HqeAVgCHw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='jzC6yuaVbt7if2dSso9z', name='features', suffix='.tsv.gz', size=6, hash='CPLWI4kM2TYtpVA1GP4B4g', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]
folder.get(relpath="raw_feature_bc_matrix")
[File(id='DEN1tlUq2AtRJuk02iOZ', name='barcodes', suffix='.tsv.gz', size=6, hash='HkZpiKOqDM14o_BdT-jdRg', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='jzC6yuaVbt7if2dSso9z', name='features', suffix='.tsv.gz', size=6, hash='CPLWI4kM2TYtpVA1GP4B4g', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
 File(id='AQ4Cs8pvSSZXLlffNXUZ', name='matrix', suffix='.mtx.gz', size=6, hash='0A_HXnvE3DwCXRHYtSknJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]
folder.get(relpath="raw_feature_bc_matrix", suffix=".mtx.gz")
[File(id='AQ4Cs8pvSSZXLlffNXUZ', name='matrix', suffix='.mtx.gz', size=6, hash='0A_HXnvE3DwCXRHYtSknJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]

Query a specific file from a folder using ln.select:

ln.select(ln.File, name="metrics_summary").join(ln.File.folders).where(
    ln.Folder.name == "sample_001"
).df()
name suffix size hash source_id storage_id created_at updated_at
id
lM7fgzdhxYBCYVbQvt5V metrics_summary .csv 6 Qt326UFWQibtvzRP1mhnJw 4aFiVMrLvr2yiopRZ1Y6 8Pj12JLb 2023-03-30 23:16:33 None