import lamindb as ln

# Start lamindb run tracking for this notebook, identified by the uid
# "zFzEfxXmdk47" and associated with the "1000 Genomes" project.
ln.track("zFzEfxXmdk47", project="1000 Genomes")

→ connected lamindb: laminlabs/lamindata
→ loaded Transform('zFzEfxXmdk470002'), re-started Run('rBcQ0Tar...') at 2025-06-06 10:23:54 UTC
→ notebook imports: lamindb==1.6.1 pandas==2.2.3 pysam==0.23.0

# Select every artifact whose key sits under the DRAGEN 3.7.6 hg38
# graph-based prefix and ends in ".cnv.vcf.gz".
artifacts = ln.Artifact.filter(
    key__startswith="data/dragen-3.7.6/hg38-graph-based/",
    key__endswith=".cnv.vcf.gz",
).all()
# Display the matching registry rows as a DataFrame.
artifacts.df()

import pysam
import pandas as pd

def read_vcf(localpath: str) -> pd.DataFrame:
    """Parse a VCF file into a flat DataFrame with one row per record.

    Each row holds the fixed VCF columns (CHROM, POS, ID, REF, ALT, QUAL,
    FILTER), one ``INFO_<key>`` column per INFO entry, the FORMAT keys joined
    with ``:``, and one ``SAMPLE_<field>`` column per FORMAT field taken from
    the *first* sample only, plus ``SAMPLE_NAME``.

    Args:
        localpath: Path of the VCF file, passed to ``pysam.VariantFile``.

    Returns:
        The parsed table. When present, ``SAMPLE_SM`` is cast to float and
        ``SAMPLE_CN`` / ``SAMPLE_BC`` are cast to int. A file without records
        yields an empty DataFrame.

    Raises:
        ValueError: If a present ``SAMPLE_SM`` / ``SAMPLE_CN`` / ``SAMPLE_BC``
            column holds a value that cannot be cast (e.g. the ``"."``
            placeholder written for unreadable sample fields).
    """
    data_records = []

    # The context manager closes the file handle even if parsing fails;
    # the handle was previously left open for every file in a loop.
    with pysam.VariantFile(localpath) as vcf:
        for record in vcf:
            rec_dict = {
                'CHROM': record.chrom,
                'POS': record.pos,
                'ID': record.id,
                'REF': record.ref,
                'ALT': ','.join(str(a) for a in record.alts) if record.alts else '.',
                'QUAL': record.qual,
                # Joining the filter names already yields 'PASS' for a passing
                # record, so no special case is needed (an empty filter set
                # yields an empty string).
                'FILTER': ','.join(record.filter.keys()),
            }

            # Handle INFO fields - extract the actual values
            for key in record.info:
                try:
                    value = record.info[key]
                except TypeError:
                    # Best effort: skip INFO entries whose value cannot be read.
                    continue
                # Convert tuple to single value if it has only one element
                if isinstance(value, tuple) and len(value) == 1:
                    value = value[0]
                rec_dict[f'INFO_{key}'] = value

            # Handle FORMAT - store it as a string
            rec_dict['FORMAT'] = ':'.join(record.format)

            # Handle SAMPLE - extract values for each format field
            if record.samples:
                # Only the first sample of the record is exported.
                sample_name = next(iter(record.samples))
                sample_data = record.samples[sample_name]

                for field in record.format:
                    try:
                        value = sample_data[field]
                        # Tuple values (like PE, a pair of numbers) are joined
                        # with '/', using '.' for missing members.
                        if isinstance(value, tuple):
                            value_str = '/'.join("." if v is None else str(v) for v in value)
                        else:
                            value_str = str(value)
                    except (KeyError, TypeError):
                        # Unreadable field: fall back to the VCF missing marker.
                        value_str = "."
                    rec_dict[f'SAMPLE_{field}'] = value_str

                rec_dict["SAMPLE_NAME"] = sample_name

            data_records.append(rec_dict)

    vcf_df = pd.DataFrame(data_records)
    # Cast only the columns that exist, so a VCF without records (or without
    # these FORMAT fields) returns a DataFrame instead of raising KeyError.
    for column, dtype in (("SAMPLE_SM", float), ("SAMPLE_CN", int), ("SAMPLE_BC", int)):
        if column in vcf_df.columns:
            vcf_df[column] = vcf_df[column].astype(dtype)
    return vcf_df

# Preview the parsed table for the first matching artifact; the result of
# artifact.cache() is passed to read_vcf as the local file path.
read_vcf(artifacts[0].cache())

[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00096/HG00096.cnv.vcf.gz'

# Schema for the flattened CNV VCF table. Every feature is saved with
# coerce_dtype=True; only the two INFO_SV* features are declared nullable.
# NOTE(review): INFO_REFLEN is declared as str although the parsed table
# shows integer-looking values - confirm this is intended.
schema = ln.Schema(
    name="1000 Genomes CNV VCF",
    features=[
        ln.Feature(
            name=feature_name,
            dtype=feature_dtype,
            coerce_dtype=True,
            **extra_kwargs,
        ).save()
        for feature_name, feature_dtype, extra_kwargs in [
            # Fixed VCF columns
            ("CHROM", str, {}),
            ("POS", int, {}),
            ("ID", str, {}),
            ("REF", str, {}),
            ("ALT", str, {}),
            ("QUAL", float, {}),
            ("FILTER", str, {}),
            ("FORMAT", str, {}),
            # INFO fields
            ("INFO_REFLEN", str, {}),
            ("INFO_SVLEN", float, {"nullable": True}),
            ("INFO_SVTYPE", str, {"nullable": True}),
            # Per-sample FORMAT fields
            ("SAMPLE_NAME", str, {}),
            ("SAMPLE_GT", str, {}),
            ("SAMPLE_SM", float, {}),
            ("SAMPLE_CN", int, {}),
            ("SAMPLE_BC", int, {}),
            ("SAMPLE_PE", str, {}),
        ]
    ],
).save()

→ returning existing Feature record with same name: 'CHROM'
→ returning existing Feature record with same name: 'POS'
→ returning existing Feature record with same name: 'ID'
→ returning existing Feature record with same name: 'REF'
→ returning existing Feature record with same name: 'ALT'
→ returning existing Feature record with same name: 'QUAL'
→ returning existing Feature record with same name: 'FILTER'
→ returning existing Feature record with same name: 'FORMAT'
→ returning existing Feature record with same name: 'INFO_REFLEN'
→ returning existing Feature record with same name: 'INFO_SVLEN'
→ returning existing Feature record with same name: 'INFO_SVTYPE'
→ returning existing Feature record with same name: 'SAMPLE_NAME'
→ returning existing Feature record with same name: 'SAMPLE_GT'
→ returning existing Feature record with same name: 'SAMPLE_SM'
→ returning existing Feature record with same name: 'SAMPLE_CN'
→ returning existing Feature record with same name: 'SAMPLE_BC'
→ returning existing Feature record with same name: 'SAMPLE_PE'
→ returning existing schema with same hash: Schema(uid='sDIFphiVrdJRQDXB', name='1000 Genomes CNV VCF', n=17, is_type=False, itype='Feature', hash='zLTXe8WMqv_MIlk1SdaQtg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 11:47:26 UTC)

# Label attached to every converted table.
cnv = ln.ULabel(name="CNV").save()
for artifact in artifacts:
    # Parse the cached VCF into a DataFrame.
    vcf_df = read_vcf(artifact.cache())
    # Save the table under the same key with ".cnv.vcf.gz" replaced by
    # ".cnv.parquet", passing the CNV schema defined above.
    artifact_pq = ln.Artifact.from_df(
        vcf_df,
        key=artifact.key.replace(".cnv.vcf.gz", ".cnv.parquet"),
        schema=schema,
    ).save()
    # Add the labels of the source VCF artifact, then the CNV label.
    artifact_pq.labels.add_from(artifact)
    artifact_pq.ulabels.add(cnv)

→ returning existing ULabel record with same name: 'CNV'

[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00096/HG00096.cnv.vcf.gz'

... uploading qX7KNNYujhpcKhhz0000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00096/HG00096.cnv.parquet
→ go to https://lamin.ai/laminlabs/lamindata/artifact/qX7KNNYujhpcKhhz0000

[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00097/HG00097.cnv.vcf.gz'

... uploading dMFHQtcbKITm85bO0000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00097/HG00097.cnv.parquet
→ returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC)
→ go to https://lamin.ai/laminlabs/lamindata/artifact/dMFHQtcbKITm85bO0000

[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00099/HG00099.cnv.vcf.gz'

... uploading H9NCK72uq08yZL2U0000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00099/HG00099.cnv.parquet
→ returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC)
→ go to https://lamin.ai/laminlabs/lamindata/artifact/H9NCK72uq08yZL2U0000

[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00100/HG00100.cnv.vcf.gz'

... uploading zJp7r6qYD4dkdpm30000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00100/HG00100.cnv.parquet
→ returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC)
→ go to https://lamin.ai/laminlabs/lamindata/artifact/zJp7r6qYD4dkdpm30000

[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00101/HG00101.cnv.vcf.gz'

# Close the run opened by ln.track() at the top of the notebook.
ln.finish()

	CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO_REFLEN	FORMAT	SAMPLE_GT	SAMPLE_SM	SAMPLE_CN	SAMPLE_BC	SAMPLE_PE	SAMPLE_NAME	INFO_SVLEN	INFO_SVTYPE
0	chr1	817861	DRAGEN:REF:chr1:817861-2650427	N	.	66.0	PASS	1832567	GT:SM:CN:BC:PE	./.	1.054390	2	1434	3/44	HG00096	NaN	NaN
1	chr1	2650427	DRAGEN:LOSS:chr1:2650428-2651463	N	<DEL>	48.0	cnvLength	1036	GT:SM:CN:BC:PE	1/1	0.007065	0	1	44/509	HG00096	-1036.0	CNV
2	chr1	2651463	DRAGEN:LOSS:chr1:2651464-2653075	N	<DEL>	19.0	cnvLength	1612	GT:SM:CN:BC:PE	0/1	0.522896	1	1	509/481	HG00096	-1612.0	CNV
3	chr1	2777732	DRAGEN:REF:chr1:2777732-4063594	N	.	72.0	PASS	1285863	GT:SM:CN:BC:PE	./.	1.012290	2	1206	0/9	HG00096	NaN	NaN
4	chr1	4063594	DRAGEN:LOSS:chr1:4063595-4067475	N	<DEL>	30.0	cnvLength	3881	GT:SM:CN:BC:PE	0/1	0.429788	1	3	9/24	HG00096	-3881.0	CNV
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1494	chrY	22209658	DRAGEN:REF:chrY:22209658-22377402	N	.	84.0	PASS	167745	GT:SM:CN:BC:PE	0	0.933649	1	133	0/5	HG00096	NaN	NaN
1495	chrY	22727523	DRAGEN:REF:chrY:22727523-22729913	N	.	39.0	cnvLength	2391	GT:SM:CN:BC:PE	0	0.905583	1	2	11/0	HG00096	NaN	NaN
1496	chrY	26313601	DRAGEN:REF:chrY:26313601-26655981	N	.	92.0	PASS	342381	GT:SM:CN:BC:PE	0	0.933649	1	181	1/497	HG00096	NaN	NaN
1497	chrY	26660535	DRAGEN:GAIN:chrY:26660536-26662203	N	<DUP>	73.0	cnvLength	1668	GT:SM:CN:BC:PE	1	2.703760	3	1	111/111	HG00096	1668.0	CNV
1498	chrY	56821557	DRAGEN:GAIN:chrY:56821558-56882639	N	<DUP>	150.0	PASS	61082	GT:SM:CN:BC:PE	1	9.957150	10	24	8/25	HG00096	61082.0	CNV

	uid	key	description	suffix	kind	otype	size	hash	n_files	n_observations	_hash_type	_key_is_virtual	_overwrite_versions	space_id	storage_id	schema_id	version	is_latest	run_id	created_at	created_by_id	_aux	_branch_code
id
1525	KX41E7e8cCv5delL0000	data/dragen-3.7.6/hg38-graph-based/HG00096/HG0...	None	.gz	None	None	56260	cOSl2fhf8jj1YwR86DaYuQ	None	None	md5	False	False	1	25	None	None	True	638	2025-04-30 13:44:04.417049+00:00	2	None	1
1527	LCt6NsgW27RhLEsI0000	data/dragen-3.7.6/hg38-graph-based/HG00097/HG0...	None	.gz	None	None	55957	64j1APpY0FWhcEk0CBbmMA	None	None	md5	False	False	1	25	None	None	True	638	2025-04-30 13:44:06.978043+00:00	2	None	1
1529	5uX1o5avt0nzr0Tk0000	data/dragen-3.7.6/hg38-graph-based/HG00099/HG0...	None	.gz	None	None	57941	vmrtUCOO7j4rcnzQQ3QJZA	None	None	md5	False	False	1	25	None	None	True	638	2025-04-30 13:44:09.538569+00:00	2	None	1
1531	TAahuANmDQKdPlMh0000	data/dragen-3.7.6/hg38-graph-based/HG00100/HG0...	None	.gz	None	None	53147	gq_-Vt8eAh8ENU97kd0efA	None	None	md5	False	False	1	25	None	None	True	638	2025-04-30 13:44:12.089293+00:00	2	None	1
1533	esvief8xjcRGt4ug0000	data/dragen-3.7.6/hg38-graph-based/HG00101/HG0...	None	.gz	None	None	56603	sIoHd3rbqKsoHGa7k2Naiw	None	None	md5	False	False	1	25	None	None	True	638	2025-04-30 13:44:14.628796+00:00	2	None	1
1535	n3kfLpAHPUVZmxz20000	data/dragen-3.7.6/hg38-graph-based/HG00102/HG0...	None	.gz	None	None	55968	3U1tBs5T8sMHEDYsLIajMA	None	None	md5	False	False	1	25	None	None	True	638	2025-04-30 13:44:17.211529+00:00	2	None	1