import lamindb as ln

# Start (or resume) run tracking for this notebook and file it under the
# "1000 Genomes" project.
ln.track("zFzEfxXmdk47", project="1000 Genomes")
→ connected lamindb: laminlabs/lamindata
→ loaded Transform('zFzEfxXmdk470002'), re-started Run('rBcQ0Tar...') at 2025-06-06 10:23:54 UTC
→ notebook imports: lamindb==1.6.1 pandas==2.2.3 pysam==0.23.0
# Select the CNV call files (*.cnv.vcf.gz) stored under the DRAGEN 3.7.6
# hg38 graph-based prefix, then display the matching registry rows.
artifacts = ln.Artifact.filter(
    key__startswith="data/dragen-3.7.6/hg38-graph-based/",
    key__endswith=".cnv.vcf.gz",
).all()
artifacts.df()
uid key description suffix kind otype size hash n_files n_observations _hash_type _key_is_virtual _overwrite_versions space_id storage_id schema_id version is_latest run_id created_at created_by_id _aux _branch_code
id
1525 KX41E7e8cCv5delL0000 data/dragen-3.7.6/hg38-graph-based/HG00096/HG0... None .gz None None 56260 cOSl2fhf8jj1YwR86DaYuQ None None md5 False False 1 25 None None True 638 2025-04-30 13:44:04.417049+00:00 2 None 1
1527 LCt6NsgW27RhLEsI0000 data/dragen-3.7.6/hg38-graph-based/HG00097/HG0... None .gz None None 55957 64j1APpY0FWhcEk0CBbmMA None None md5 False False 1 25 None None True 638 2025-04-30 13:44:06.978043+00:00 2 None 1
1529 5uX1o5avt0nzr0Tk0000 data/dragen-3.7.6/hg38-graph-based/HG00099/HG0... None .gz None None 57941 vmrtUCOO7j4rcnzQQ3QJZA None None md5 False False 1 25 None None True 638 2025-04-30 13:44:09.538569+00:00 2 None 1
1531 TAahuANmDQKdPlMh0000 data/dragen-3.7.6/hg38-graph-based/HG00100/HG0... None .gz None None 53147 gq_-Vt8eAh8ENU97kd0efA None None md5 False False 1 25 None None True 638 2025-04-30 13:44:12.089293+00:00 2 None 1
1533 esvief8xjcRGt4ug0000 data/dragen-3.7.6/hg38-graph-based/HG00101/HG0... None .gz None None 56603 sIoHd3rbqKsoHGa7k2Naiw None None md5 False False 1 25 None None True 638 2025-04-30 13:44:14.628796+00:00 2 None 1
1535 n3kfLpAHPUVZmxz20000 data/dragen-3.7.6/hg38-graph-based/HG00102/HG0... None .gz None None 55968 3U1tBs5T8sMHEDYsLIajMA None None md5 False False 1 25 None None True 638 2025-04-30 13:44:17.211529+00:00 2 None 1
import pysam
import pandas as pd

def read_vcf(localpath: str) -> pd.DataFrame:
    """Parse a VCF file into a flat DataFrame with one row per record.

    Each row contains:

    * the fixed VCF fields ``CHROM``, ``POS``, ``ID``, ``REF``, ``ALT``,
      ``QUAL`` and ``FILTER`` (several ALT alleles or filter names are
      comma-joined; a record without ALT alleles gets ``"."``),
    * one ``INFO_<key>`` column per INFO key of the record, with
      one-element tuples unwrapped to their single value,
    * ``FORMAT`` as the colon-joined FORMAT keys,
    * one ``SAMPLE_<field>`` string column per FORMAT field, read from the
      first sample only, plus ``SAMPLE_NAME`` holding that sample's name.

    ``SAMPLE_SM`` is cast to float and ``SAMPLE_CN`` / ``SAMPLE_BC`` to int
    whenever those columns are present.

    Args:
        localpath: Location of the VCF file on the local filesystem.

    Returns:
        A DataFrame with one row per VCF record. It has no rows and no
        columns when the file contains no records.

    Raises:
        ValueError: If one of the numeric sample columns holds a value that
            cannot be converted, such as the ``"."`` placeholder.
    """
    # Sample columns are collected as strings; these are converted back to
    # numeric dtypes once the frame has been assembled.
    numeric_sample_columns = {"SAMPLE_SM": float, "SAMPLE_CN": int, "SAMPLE_BC": int}

    data_records = []
    # The context manager guarantees the file handle is released, even when
    # a record fails to parse halfway through the file.
    with pysam.VariantFile(localpath) as vcf:
        for record in vcf:
            rec_dict = {
                'CHROM': record.chrom,
                'POS': record.pos,
                'ID': record.id,
                'REF': record.ref,
                'ALT': ','.join(str(a) for a in record.alts) if record.alts else '.',
                'QUAL': record.qual,
                # Joining the filter names already yields 'PASS' for a
                # passing record, so no special case is needed.
                'FILTER': ','.join(record.filter),
            }

            # INFO fields: keep the actual values, not their string form.
            for key in record.info.keys():
                try:
                    value = record.info[key]
                except TypeError:
                    # Best effort: a key whose value cannot be read is left
                    # out of the row instead of aborting the whole file.
                    continue
                # Unwrap one-element tuples to a plain scalar.
                if isinstance(value, tuple) and len(value) == 1:
                    value = value[0]
                rec_dict[f'INFO_{key}'] = value

            # FORMAT is stored as a single colon-separated string.
            rec_dict['FORMAT'] = ':'.join(record.format)

            # SAMPLE values: only the first sample of the record is used.
            if record.samples:
                sample_name = next(iter(record.samples))
                sample_data = record.samples[sample_name]

                for field in record.format:
                    try:
                        value = sample_data[field]
                    except (KeyError, TypeError):
                        # Unreadable sample values get the "." placeholder.
                        rec_dict[f'SAMPLE_{field}'] = "."
                        continue
                    if isinstance(value, tuple):
                        # Multi-valued fields (e.g. a pair of numbers) are
                        # rendered as "a/b", with "." for missing members.
                        rec_dict[f'SAMPLE_{field}'] = '/'.join(
                            "." if v is None else str(v) for v in value
                        )
                    else:
                        rec_dict[f'SAMPLE_{field}'] = str(value)

                rec_dict["SAMPLE_NAME"] = sample_name

            data_records.append(rec_dict)

    vcf_df = pd.DataFrame(data_records)
    # Only cast columns that exist, so a file without records (or without
    # these FORMAT fields) yields a frame instead of raising KeyError.
    casts = {
        column: dtype
        for column, dtype in numeric_sample_columns.items()
        if column in vcf_df.columns
    }
    if casts:
        vcf_df = vcf_df.astype(casts)
    return vcf_df
    
# Preview the parser output on the first matching artifact; cache() provides
# the path of the local copy that read_vcf opens.
read_vcf(artifacts[0].cache())
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00096/HG00096.cnv.vcf.gz'
CHROM POS ID REF ALT QUAL FILTER INFO_REFLEN FORMAT SAMPLE_GT SAMPLE_SM SAMPLE_CN SAMPLE_BC SAMPLE_PE SAMPLE_NAME INFO_SVLEN INFO_SVTYPE
0 chr1 817861 DRAGEN:REF:chr1:817861-2650427 N . 66.0 PASS 1832567 GT:SM:CN:BC:PE ./. 1.054390 2 1434 3/44 HG00096 NaN NaN
1 chr1 2650427 DRAGEN:LOSS:chr1:2650428-2651463 N <DEL> 48.0 cnvLength 1036 GT:SM:CN:BC:PE 1/1 0.007065 0 1 44/509 HG00096 -1036.0 CNV
2 chr1 2651463 DRAGEN:LOSS:chr1:2651464-2653075 N <DEL> 19.0 cnvLength 1612 GT:SM:CN:BC:PE 0/1 0.522896 1 1 509/481 HG00096 -1612.0 CNV
3 chr1 2777732 DRAGEN:REF:chr1:2777732-4063594 N . 72.0 PASS 1285863 GT:SM:CN:BC:PE ./. 1.012290 2 1206 0/9 HG00096 NaN NaN
4 chr1 4063594 DRAGEN:LOSS:chr1:4063595-4067475 N <DEL> 30.0 cnvLength 3881 GT:SM:CN:BC:PE 0/1 0.429788 1 3 9/24 HG00096 -3881.0 CNV
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1494 chrY 22209658 DRAGEN:REF:chrY:22209658-22377402 N . 84.0 PASS 167745 GT:SM:CN:BC:PE 0 0.933649 1 133 0/5 HG00096 NaN NaN
1495 chrY 22727523 DRAGEN:REF:chrY:22727523-22729913 N . 39.0 cnvLength 2391 GT:SM:CN:BC:PE 0 0.905583 1 2 11/0 HG00096 NaN NaN
1496 chrY 26313601 DRAGEN:REF:chrY:26313601-26655981 N . 92.0 PASS 342381 GT:SM:CN:BC:PE 0 0.933649 1 181 1/497 HG00096 NaN NaN
1497 chrY 26660535 DRAGEN:GAIN:chrY:26660536-26662203 N <DUP> 73.0 cnvLength 1668 GT:SM:CN:BC:PE 1 2.703760 3 1 111/111 HG00096 1668.0 CNV
1498 chrY 56821557 DRAGEN:GAIN:chrY:56821558-56882639 N <DUP> 150.0 PASS 61082 GT:SM:CN:BC:PE 1 9.957150 10 24 8/25 HG00096 61082.0 CNV

1499 rows × 17 columns

# Schema describing the columns produced by read_vcf. Every feature is
# registered with dtype coercion enabled; only the two structural-variant
# INFO columns are declared nullable.
# NOTE(review): INFO_REFLEN is registered as str although the parsed values
# look integral — confirm this is intended.
schema = ln.Schema(
    name="1000 Genomes CNV VCF",
    features=[
        ln.Feature(name=column, dtype=dtype, coerce_dtype=True, **extra).save()
        for column, dtype, extra in [
            ("CHROM", str, {}),
            ("POS", int, {}),
            ("ID", str, {}),
            ("REF", str, {}),
            ("ALT", str, {}),
            ("QUAL", float, {}),
            ("FILTER", str, {}),
            ("FORMAT", str, {}),
            ("INFO_REFLEN", str, {}),
            ("INFO_SVLEN", float, {"nullable": True}),
            ("INFO_SVTYPE", str, {"nullable": True}),
            ("SAMPLE_NAME", str, {}),
            ("SAMPLE_GT", str, {}),
            ("SAMPLE_SM", float, {}),
            ("SAMPLE_CN", int, {}),
            ("SAMPLE_BC", int, {}),
            ("SAMPLE_PE", str, {}),
        ]
    ],
).save()
→ returning existing Feature record with same name: 'CHROM'
→ returning existing Feature record with same name: 'POS'
→ returning existing Feature record with same name: 'ID'
→ returning existing Feature record with same name: 'REF'
→ returning existing Feature record with same name: 'ALT'
→ returning existing Feature record with same name: 'QUAL'
→ returning existing Feature record with same name: 'FILTER'
→ returning existing Feature record with same name: 'FORMAT'
→ returning existing Feature record with same name: 'INFO_REFLEN'
→ returning existing Feature record with same name: 'INFO_SVLEN'
→ returning existing Feature record with same name: 'INFO_SVTYPE'
→ returning existing Feature record with same name: 'SAMPLE_NAME'
→ returning existing Feature record with same name: 'SAMPLE_GT'
→ returning existing Feature record with same name: 'SAMPLE_SM'
→ returning existing Feature record with same name: 'SAMPLE_CN'
→ returning existing Feature record with same name: 'SAMPLE_BC'
→ returning existing Feature record with same name: 'SAMPLE_PE'
→ returning existing schema with same hash: Schema(uid='sDIFphiVrdJRQDXB', name='1000 Genomes CNV VCF', n=17, is_type=False, itype='Feature', hash='zLTXe8WMqv_MIlk1SdaQtg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 11:47:26 UTC)
# A single shared label tags every converted table as CNV data.
cnv = ln.ULabel(name="CNV").save()
for artifact in artifacts:
    # Parse the cached VCF and save the table as a Parquet artifact whose
    # key mirrors the source key, with the schema attached.
    vcf_df = read_vcf(artifact.cache())
    artifact_pq = ln.Artifact.from_df(
        vcf_df,
        key=artifact.key.replace(".cnv.vcf.gz", ".cnv.parquet"),
        schema=schema,
    ).save()
    # Take over the labels of the source VCF artifact, then add the CNV tag.
    artifact_pq.labels.add_from(artifact)
    artifact_pq.ulabels.add(cnv)
→ returning existing ULabel record with same name: 'CNV'
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00096/HG00096.cnv.vcf.gz'
... uploading qX7KNNYujhpcKhhz0000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00096/HG00096.cnv.parquet
→ go to https://lamin.ai/laminlabs/lamindata/artifact/qX7KNNYujhpcKhhz0000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00097/HG00097.cnv.vcf.gz'
... uploading dMFHQtcbKITm85bO0000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00097/HG00097.cnv.parquet
→ returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC)
→ go to https://lamin.ai/laminlabs/lamindata/artifact/dMFHQtcbKITm85bO0000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00099/HG00099.cnv.vcf.gz'
... uploading H9NCK72uq08yZL2U0000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00099/HG00099.cnv.parquet
→ returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC)
→ go to https://lamin.ai/laminlabs/lamindata/artifact/H9NCK72uq08yZL2U0000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00100/HG00100.cnv.vcf.gz'
... uploading zJp7r6qYD4dkdpm30000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00100/HG00100.cnv.parquet
→ returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC)
→ go to https://lamin.ai/laminlabs/lamindata/artifact/zJp7r6qYD4dkdpm30000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00101/HG00101.cnv.vcf.gz'
... uploading BWUEZmqquYx84DA20000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00101/HG00101.cnv.parquet
→ returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC)
→ go to https://lamin.ai/laminlabs/lamindata/artifact/BWUEZmqquYx84DA20000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00102/HG00102.cnv.vcf.gz'
... uploading OHaSMnFwXxxRNZWS0000.parquet: 100.0%
! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00102/HG00102.cnv.parquet
→ returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC)
→ go to https://lamin.ai/laminlabs/lamindata/artifact/OHaSMnFwXxxRNZWS0000
# Close out the run that ln.track() opened at the top of the notebook.
ln.finish()