Track folders#
Warning
Currently, only data folders located in the configured storage are supported.
Show code cell content
!lamin load mydata
ℹ️ Loading instance: testuser1/mydata
import lamindb as ln
ln.track()
ℹ️ Instance: testuser1/mydata
ℹ️ User: testuser1
ℹ️ Added notebook: Transform(id='QrRtGnxmM3Bo', v='0', name='06-folder', type=notebook, title='Track folders', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))
ℹ️ Added run: Run(id='4aFiVMrLvr2yiopRZ1Y6', transform_id='QrRtGnxmM3Bo', transform_v='0', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))
Track a folder and the files it contains#
Show code cell content
ln.dev.datasets.generate_cell_ranger_files(
"sample_001", ln.setup.settings.instance.storage.root
)
PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/sample_001')
!ls -l './mydata/sample_001/'
total 44
drwxr-xr-x 2 runner docker 4096 Mar 30 23:16 analysis
-rw-r--r-- 1 runner docker 6 Mar 30 23:16 cloupe.cloupe
drwxr-xr-x 2 runner docker 4096 Mar 30 23:16 filtered_feature_bc_matrix
-rw-r--r-- 1 runner docker 6 Mar 30 23:16 filtered_feature_bc_matrix.h5
-rw-r--r-- 1 runner docker 6 Mar 30 23:16 metrics_summary.csv
-rw-r--r-- 1 runner docker 6 Mar 30 23:16 molecule_info.h5
-rw-r--r-- 1 runner docker 6 Mar 30 23:16 possorted_genome_bam.bam
-rw-r--r-- 1 runner docker 6 Mar 30 23:16 possorted_genome_bam.bam.bai
drwxr-xr-x 2 runner docker 4096 Mar 30 23:16 raw_feature_bc_matrix
-rw-r--r-- 1 runner docker 6 Mar 30 23:16 raw_feature_bc_matrix.h5
-rw-r--r-- 1 runner docker 6 Mar 30 23:16 web_summary.html
Let’s pass a directory path to ln.Folder, which creates a Folder record:
folder = ln.Folder(folder="./mydata/sample_001/")
folder
Folder(id='UnFi8UjBQMV09Jaqo3nL', name='sample_001', created_by='DzTjkKse')
At the same time, a File record is created for each of the files inside the folder:
folder.files
[File(id='lM7fgzdhxYBCYVbQvt5V', name='metrics_summary', suffix='.csv', size=6, hash='Qt326UFWQibtvzRP1mhnJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='ABx9QrIk3ZfKMKT1zZ54', name='raw_feature_bc_matrix', suffix='.h5', size=6, hash='YeBfpVEGZriQhUNira-qiA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='6DuGMqan46UGUU0S7Ivp', name='possorted_genome_bam', suffix='.bam.bai', size=6, hash='BJDp79QxGfDAds40LMLUHw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='4kEgNOBUIpNORbubLdQu', name='web_summary', suffix='.html', size=6, hash='P3VFbegx8Uvt70i82pN4kA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='4spvfwRkeBXOXhdIEpaL', name='cloupe', suffix='.cloupe', size=6, hash='kRJKl4U-rCLESg8i6Tk1QA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='kId2tfidnfLUT9JatfJG', name='possorted_genome_bam', suffix='.bam', size=6, hash='HguFGkYNsZBrkJ-7K3mLBw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='pN3ap4esn0TBkJEsb4DB', name='molecule_info', suffix='.h5', size=6, hash='YBZ-JAAuZwNT6mjdLqwtGA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='THjan9RLY3JUotzHlYbq', name='filtered_feature_bc_matrix', suffix='.h5', size=6, hash='UyuUA2YXfAJBEefQw_wVhA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='jzC6yuaVbt7if2dSso9z', name='features', suffix='.tsv.gz', size=6, hash='CPLWI4kM2TYtpVA1GP4B4g', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='DEN1tlUq2AtRJuk02iOZ', name='barcodes', suffix='.tsv.gz', size=6, hash='HkZpiKOqDM14o_BdT-jdRg', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='AQ4Cs8pvSSZXLlffNXUZ', name='matrix', suffix='.mtx.gz', size=6, hash='0A_HXnvE3DwCXRHYtSknJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='R8QdanNZelXhxQsfFP0n', name='analysis', suffix='.csv', size=6, hash='dYsOqf4SMpTH5HqeAVgCHw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='hwALCxnzc9TC1Q2pF9mM', name='features', suffix='.tsv.gz', size=6, hash='amZgim1akXXVSHHxQdCrNA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='CA3s57VaDTleG1Wd21wX', name='barcodes', suffix='.tsv.gz', size=6, hash='odGzhNIQSkMOkaVhbkyU5w', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb'),
File(id='NgLuRTl6WDHt8bhjZfhj', name='matrix', suffix='.mtx.gz', size=6, hash='B9zq5Zvi4gJTGKSCRSn7zQ', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb')]
ln.add(folder)
Folder(id='UnFi8UjBQMV09Jaqo3nL', name='sample_001', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))
What happens under the hood?#
In the SQL database#
- A Folder entry
- 15 File entries corresponding to the 15 files inside the directory
- A Notebook entry
- A Run entry
All of these entries are linked so that you can find the files using any of the metadata fields.
Show code cell source
ln.select(ln.Folder, name=folder.name).one()
Folder(id='UnFi8UjBQMV09Jaqo3nL', name='sample_001', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))
Show code cell source
ln.select(ln.Folder).join(ln.File.folders).where(ln.Folder.name == "sample_001").df()
name | created_by | created_at | updated_at | |
---|---|---|---|---|
id | ||||
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
UnFi8UjBQMV09Jaqo3nL | sample_001 | DzTjkKse | 2023-03-30 23:16:33 | None |
Show code cell source
ln.select(ln.schema.Notebook, id=ln.context.transform.id).one()
Transform(id='QrRtGnxmM3Bo', v='0', name='06-folder', type=notebook, title='Track folders', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))
Show code cell source
ln.select(ln.schema.Run, id=ln.context.run.id).one()
Run(id='4aFiVMrLvr2yiopRZ1Y6', transform_id='QrRtGnxmM3Bo', transform_v='0', created_by='DzTjkKse', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))
View the directory tree#
folder.tree()
sample_001
├── metrics_summary.csv
├── raw_feature_bc_matrix.h5
├── possorted_genome_bam.bam.bai
├── web_summary.html
├── raw_feature_bc_matrix
│ ├── features.tsv.gz
│ ├── barcodes.tsv.gz
│ └── matrix.mtx.gz
├── cloupe.cloupe
├── analysis
│ └── analysis.csv
├── possorted_genome_bam.bam
├── filtered_feature_bc_matrix
│ ├── features.tsv.gz
│ ├── barcodes.tsv.gz
│ └── matrix.mtx.gz
├── molecule_info.h5
└── filtered_feature_bc_matrix.h5
3 directories, 15 files
Find and retrieve files in folder#
Retrieve files from a folder#
with ln.Session() as ss:
folder = ss.select(ln.Folder, name="sample_001").first()
files = folder.files
files[:2]
[File(id='4kEgNOBUIpNORbubLdQu', name='web_summary', suffix='.html', size=6, hash='P3VFbegx8Uvt70i82pN4kA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
File(id='4spvfwRkeBXOXhdIEpaL', name='cloupe', suffix='.cloupe', size=6, hash='kRJKl4U-rCLESg8i6Tk1QA', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]
Retrieve files via their paths relative to the directory#
folder.get(relpath="raw_feature_bc_matrix/features.tsv.gz")
[File(id='jzC6yuaVbt7if2dSso9z', name='features', suffix='.tsv.gz', size=6, hash='CPLWI4kM2TYtpVA1GP4B4g', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]
folder.get(relpath=["analysis/analysis.csv", "raw_feature_bc_matrix/features.tsv.gz"])
[File(id='R8QdanNZelXhxQsfFP0n', name='analysis', suffix='.csv', size=6, hash='dYsOqf4SMpTH5HqeAVgCHw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
File(id='jzC6yuaVbt7if2dSso9z', name='features', suffix='.tsv.gz', size=6, hash='CPLWI4kM2TYtpVA1GP4B4g', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]
folder.get(relpath="raw_feature_bc_matrix")
[File(id='DEN1tlUq2AtRJuk02iOZ', name='barcodes', suffix='.tsv.gz', size=6, hash='HkZpiKOqDM14o_BdT-jdRg', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
File(id='jzC6yuaVbt7if2dSso9z', name='features', suffix='.tsv.gz', size=6, hash='CPLWI4kM2TYtpVA1GP4B4g', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33)),
File(id='AQ4Cs8pvSSZXLlffNXUZ', name='matrix', suffix='.mtx.gz', size=6, hash='0A_HXnvE3DwCXRHYtSknJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]
folder.get(relpath="raw_feature_bc_matrix", suffix=".mtx.gz")
[File(id='AQ4Cs8pvSSZXLlffNXUZ', name='matrix', suffix='.mtx.gz', size=6, hash='0A_HXnvE3DwCXRHYtSknJw', source_id='4aFiVMrLvr2yiopRZ1Y6', storage_id='8Pj12JLb', created_at=datetime.datetime(2023, 3, 30, 23, 16, 33))]
Query a specific file from a folder using ln.select:
ln.select(ln.File, name="metrics_summary").join(ln.File.folders).where(
ln.Folder.name == "sample_001"
).df()
name | suffix | size | hash | source_id | storage_id | created_at | updated_at | |
---|---|---|---|---|---|---|---|---|
id | ||||||||
lM7fgzdhxYBCYVbQvt5V | metrics_summary | .csv | 6 | Qt326UFWQibtvzRP1mhnJw | 4aFiVMrLvr2yiopRZ1Y6 | 8Pj12JLb | 2023-03-30 23:16:33 | None |