import lamindb as ln
# Start LaminDB tracking for this notebook: the first argument is the
# transform uid this notebook is registered under, and `project` names the
# project passed along with the run.
# NOTE(review): the uid is kept as a literal inside the call on purpose;
# presumably external tooling looks for it in this form — confirm before
# moving it into a variable.
ln.track("zFzEfxXmdk47", project="1000 Genomes")
→ connected lamindb: laminlabs/lamindata → loaded Transform('zFzEfxXmdk470002'), re-started Run('rBcQ0Tar...') at 2025-06-06 10:23:54 UTC → notebook imports: lamindb==1.6.1 pandas==2.2.3 pysam==0.23.0
# Select every artifact whose key sits under the DRAGEN 3.7.6 hg38
# graph-based prefix and ends with the CNV VCF suffix.
cnv_vcf_criteria = {
    "key__startswith": "data/dragen-3.7.6/hg38-graph-based/",
    "key__endswith": ".cnv.vcf.gz",
}
artifacts = ln.Artifact.filter(**cnv_vcf_criteria).all()
# Trailing expression so the notebook renders the matches as a table.
artifacts.df()
uid | key | description | suffix | kind | otype | size | hash | n_files | n_observations | _hash_type | _key_is_virtual | _overwrite_versions | space_id | storage_id | schema_id | version | is_latest | run_id | created_at | created_by_id | _aux | _branch_code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||||||||||
1525 | KX41E7e8cCv5delL0000 | data/dragen-3.7.6/hg38-graph-based/HG00096/HG0... | None | .gz | None | None | 56260 | cOSl2fhf8jj1YwR86DaYuQ | None | None | md5 | False | False | 1 | 25 | None | None | True | 638 | 2025-04-30 13:44:04.417049+00:00 | 2 | None | 1 |
1527 | LCt6NsgW27RhLEsI0000 | data/dragen-3.7.6/hg38-graph-based/HG00097/HG0... | None | .gz | None | None | 55957 | 64j1APpY0FWhcEk0CBbmMA | None | None | md5 | False | False | 1 | 25 | None | None | True | 638 | 2025-04-30 13:44:06.978043+00:00 | 2 | None | 1 |
1529 | 5uX1o5avt0nzr0Tk0000 | data/dragen-3.7.6/hg38-graph-based/HG00099/HG0... | None | .gz | None | None | 57941 | vmrtUCOO7j4rcnzQQ3QJZA | None | None | md5 | False | False | 1 | 25 | None | None | True | 638 | 2025-04-30 13:44:09.538569+00:00 | 2 | None | 1 |
1531 | TAahuANmDQKdPlMh0000 | data/dragen-3.7.6/hg38-graph-based/HG00100/HG0... | None | .gz | None | None | 53147 | gq_-Vt8eAh8ENU97kd0efA | None | None | md5 | False | False | 1 | 25 | None | None | True | 638 | 2025-04-30 13:44:12.089293+00:00 | 2 | None | 1 |
1533 | esvief8xjcRGt4ug0000 | data/dragen-3.7.6/hg38-graph-based/HG00101/HG0... | None | .gz | None | None | 56603 | sIoHd3rbqKsoHGa7k2Naiw | None | None | md5 | False | False | 1 | 25 | None | None | True | 638 | 2025-04-30 13:44:14.628796+00:00 | 2 | None | 1 |
1535 | n3kfLpAHPUVZmxz20000 | data/dragen-3.7.6/hg38-graph-based/HG00102/HG0... | None | .gz | None | None | 55968 | 3U1tBs5T8sMHEDYsLIajMA | None | None | md5 | False | False | 1 | 25 | None | None | True | 638 | 2025-04-30 13:44:17.211529+00:00 | 2 | None | 1 |
import pysam
import pandas as pd
def read_vcf(localpath: str) -> pd.DataFrame:
    """Load a VCF file into a flat DataFrame with one row per record.

    The fixed VCF columns (CHROM, POS, ID, REF, ALT, QUAL, FILTER) are copied
    as-is, every INFO entry becomes an ``INFO_<key>`` column, the FORMAT keys
    are joined with ``:`` into a ``FORMAT`` column, and the per-field values
    of the *first* sample become ``SAMPLE_<field>`` string columns next to a
    ``SAMPLE_NAME`` column. Additional samples are ignored.

    Args:
        localpath: Path of the VCF file on the local filesystem.

    Returns:
        A DataFrame with one row per VCF record. ``SAMPLE_SM`` is cast to
        float and ``SAMPLE_CN`` / ``SAMPLE_BC`` to int when those columns are
        present; a file without records yields an empty DataFrame.

    Raises:
        ValueError: If one of the numeric sample columns holds a value that
            cannot be parsed as a number (for example the ``"."`` placeholder
            written for an unreadable field).
    """
    data_records = []
    # Open through a context manager so the file handle is released even if
    # parsing a record fails part-way through.
    with pysam.VariantFile(localpath) as vcf:
        for record in vcf:
            rec_dict = {
                'CHROM': record.chrom,
                'POS': record.pos,
                'ID': record.id,
                'REF': record.ref,
                'ALT': ','.join(str(a) for a in record.alts) if record.alts else '.',
                'QUAL': record.qual,
                # Joining the filter names already yields 'PASS' for a
                # passing record, so no special case is needed.
                'FILTER': ','.join(record.filter),
            }

            # INFO fields: keep the actual values, one column per key.
            for key in record.info.keys():
                try:
                    value = record.info[key]
                except TypeError:
                    # Best effort: an INFO entry whose lookup raises
                    # TypeError is skipped instead of aborting the file.
                    continue
                # Unwrap one-element tuples to a plain scalar.
                if isinstance(value, tuple) and len(value) == 1:
                    value = value[0]
                rec_dict[f'INFO_{key}'] = value

            # FORMAT: stored as the usual colon-separated key string.
            rec_dict['FORMAT'] = ':'.join(record.format)

            # SAMPLE: only the first sample of the record is exported.
            if record.samples:
                sample_name = next(iter(record.samples))
                sample_data = record.samples[sample_name]
                for field in record.format:
                    try:
                        value = sample_data[field]
                    except (KeyError, TypeError):
                        # Unreadable field: fall back to the "." placeholder.
                        rec_dict[f'SAMPLE_{field}'] = "."
                        continue
                    if isinstance(value, tuple):
                        # Multi-valued fields (e.g. a pair of numbers) are
                        # rendered "a/b", with "." for missing members.
                        value_str = '/'.join(
                            "." if v is None else str(v) for v in value
                        )
                    else:
                        value_str = str(value)
                    rec_dict[f'SAMPLE_{field}'] = value_str
                # Set only when a sample exists, so a sample-less record can
                # never pick up an undefined or stale name.
                rec_dict["SAMPLE_NAME"] = sample_name

            data_records.append(rec_dict)

    vcf_df = pd.DataFrame(data_records)
    # Cast the numeric sample columns only when they exist: an empty file or
    # one without these FORMAT fields would otherwise raise KeyError.
    numeric_sample_columns = (
        ("SAMPLE_SM", float),
        ("SAMPLE_CN", int),
        ("SAMPLE_BC", int),
    )
    for column, dtype in numeric_sample_columns:
        if column in vcf_df.columns:
            vcf_df[column] = vcf_df[column].astype(dtype)
    return vcf_df
# Preview the parsed table for the first matching artifact; cache() supplies
# the local path handed to read_vcf.
first_vcf_path = artifacts[0].cache()
read_vcf(first_vcf_path)
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00096/HG00096.cnv.vcf.gz'
CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO_REFLEN | FORMAT | SAMPLE_GT | SAMPLE_SM | SAMPLE_CN | SAMPLE_BC | SAMPLE_PE | SAMPLE_NAME | INFO_SVLEN | INFO_SVTYPE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr1 | 817861 | DRAGEN:REF:chr1:817861-2650427 | N | . | 66.0 | PASS | 1832567 | GT:SM:CN:BC:PE | ./. | 1.054390 | 2 | 1434 | 3/44 | HG00096 | NaN | NaN |
1 | chr1 | 2650427 | DRAGEN:LOSS:chr1:2650428-2651463 | N | <DEL> | 48.0 | cnvLength | 1036 | GT:SM:CN:BC:PE | 1/1 | 0.007065 | 0 | 1 | 44/509 | HG00096 | -1036.0 | CNV |
2 | chr1 | 2651463 | DRAGEN:LOSS:chr1:2651464-2653075 | N | <DEL> | 19.0 | cnvLength | 1612 | GT:SM:CN:BC:PE | 0/1 | 0.522896 | 1 | 1 | 509/481 | HG00096 | -1612.0 | CNV |
3 | chr1 | 2777732 | DRAGEN:REF:chr1:2777732-4063594 | N | . | 72.0 | PASS | 1285863 | GT:SM:CN:BC:PE | ./. | 1.012290 | 2 | 1206 | 0/9 | HG00096 | NaN | NaN |
4 | chr1 | 4063594 | DRAGEN:LOSS:chr1:4063595-4067475 | N | <DEL> | 30.0 | cnvLength | 3881 | GT:SM:CN:BC:PE | 0/1 | 0.429788 | 1 | 3 | 9/24 | HG00096 | -3881.0 | CNV |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1494 | chrY | 22209658 | DRAGEN:REF:chrY:22209658-22377402 | N | . | 84.0 | PASS | 167745 | GT:SM:CN:BC:PE | 0 | 0.933649 | 1 | 133 | 0/5 | HG00096 | NaN | NaN |
1495 | chrY | 22727523 | DRAGEN:REF:chrY:22727523-22729913 | N | . | 39.0 | cnvLength | 2391 | GT:SM:CN:BC:PE | 0 | 0.905583 | 1 | 2 | 11/0 | HG00096 | NaN | NaN |
1496 | chrY | 26313601 | DRAGEN:REF:chrY:26313601-26655981 | N | . | 92.0 | PASS | 342381 | GT:SM:CN:BC:PE | 0 | 0.933649 | 1 | 181 | 1/497 | HG00096 | NaN | NaN |
1497 | chrY | 26660535 | DRAGEN:GAIN:chrY:26660536-26662203 | N | <DUP> | 73.0 | cnvLength | 1668 | GT:SM:CN:BC:PE | 1 | 2.703760 | 3 | 1 | 111/111 | HG00096 | 1668.0 | CNV |
1498 | chrY | 56821557 | DRAGEN:GAIN:chrY:56821558-56882639 | N | <DUP> | 150.0 | PASS | 61082 | GT:SM:CN:BC:PE | 1 | 9.957150 | 10 | 24 | 8/25 | HG00096 | 61082.0 | CNV |
1499 rows × 17 columns
# One entry per column of the table produced by read_vcf:
# (column name, dtype, extra Feature keyword arguments).
# Only the two INFO columns that can be absent are flagged nullable.
cnv_feature_specs = [
    ("CHROM", str, {}),
    ("POS", int, {}),
    ("ID", str, {}),
    ("REF", str, {}),
    ("ALT", str, {}),
    ("QUAL", float, {}),
    ("FILTER", str, {}),
    ("FORMAT", str, {}),
    ("INFO_REFLEN", str, {}),
    ("INFO_SVLEN", float, {"nullable": True}),
    ("INFO_SVTYPE", str, {"nullable": True}),
    ("SAMPLE_NAME", str, {}),
    ("SAMPLE_GT", str, {}),
    ("SAMPLE_SM", float, {}),
    ("SAMPLE_CN", int, {}),
    ("SAMPLE_BC", int, {}),
    ("SAMPLE_PE", str, {}),
]

# Save each feature in the listed order, then register the schema itself.
schema = ln.Schema(
    name="1000 Genomes CNV VCF",
    features=[
        ln.Feature(
            name=column_name, dtype=column_dtype, coerce_dtype=True, **extra
        ).save()
        for column_name, column_dtype, extra in cnv_feature_specs
    ],
).save()
→ returning existing Feature record with same name: 'CHROM' → returning existing Feature record with same name: 'POS' → returning existing Feature record with same name: 'ID' → returning existing Feature record with same name: 'REF' → returning existing Feature record with same name: 'ALT' → returning existing Feature record with same name: 'QUAL' → returning existing Feature record with same name: 'FILTER' → returning existing Feature record with same name: 'FORMAT' → returning existing Feature record with same name: 'INFO_REFLEN' → returning existing Feature record with same name: 'INFO_SVLEN' → returning existing Feature record with same name: 'INFO_SVTYPE' → returning existing Feature record with same name: 'SAMPLE_NAME' → returning existing Feature record with same name: 'SAMPLE_GT' → returning existing Feature record with same name: 'SAMPLE_SM' → returning existing Feature record with same name: 'SAMPLE_CN' → returning existing Feature record with same name: 'SAMPLE_BC' → returning existing Feature record with same name: 'SAMPLE_PE' → returning existing schema with same hash: Schema(uid='sDIFphiVrdJRQDXB', name='1000 Genomes CNV VCF', n=17, is_type=False, itype='Feature', hash='zLTXe8WMqv_MIlk1SdaQtg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 11:47:26 UTC)
# Label shared by every converted table.
cnv = ln.ULabel(name="CNV").save()

# Convert each CNV VCF artifact into a schema-validated parquet artifact
# stored under the same key with the suffix swapped.
for source_vcf in artifacts:
    parquet_key = source_vcf.key.replace(".cnv.vcf.gz", ".cnv.parquet")
    cnv_table = read_vcf(source_vcf.cache())
    parquet_artifact = ln.Artifact.from_df(
        cnv_table,
        key=parquet_key,
        schema=schema,
    ).save()
    # Carry over the labels of the source VCF, then add the CNV label.
    parquet_artifact.labels.add_from(source_vcf)
    parquet_artifact.ulabels.add(cnv)
→ returning existing ULabel record with same name: 'CNV'
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00096/HG00096.cnv.vcf.gz'
... uploading qX7KNNYujhpcKhhz0000.parquet: 100.0% ! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00096/HG00096.cnv.parquet → go to https://lamin.ai/laminlabs/lamindata/artifact/qX7KNNYujhpcKhhz0000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00097/HG00097.cnv.vcf.gz'
... uploading dMFHQtcbKITm85bO0000.parquet: 100.0% ! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00097/HG00097.cnv.parquet → returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC) → go to https://lamin.ai/laminlabs/lamindata/artifact/dMFHQtcbKITm85bO0000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00099/HG00099.cnv.vcf.gz'
... uploading H9NCK72uq08yZL2U0000.parquet: 100.0% ! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00099/HG00099.cnv.parquet → returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC) → go to https://lamin.ai/laminlabs/lamindata/artifact/H9NCK72uq08yZL2U0000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00100/HG00100.cnv.vcf.gz'
... uploading zJp7r6qYD4dkdpm30000.parquet: 100.0% ! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00100/HG00100.cnv.parquet → returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC) → go to https://lamin.ai/laminlabs/lamindata/artifact/zJp7r6qYD4dkdpm30000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00101/HG00101.cnv.vcf.gz'
... uploading BWUEZmqquYx84DA20000.parquet: 100.0% ! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00101/HG00101.cnv.parquet → returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC) → go to https://lamin.ai/laminlabs/lamindata/artifact/BWUEZmqquYx84DA20000
[E::idx_find_and_load] Could not retrieve index file for '/Users/sunnysun/Library/Caches/lamindb/1000genomes-dragen/data/dragen-3.7.6/hg38-graph-based/HG00102/HG00102.cnv.vcf.gz'
... uploading OHaSMnFwXxxRNZWS0000.parquet: 100.0% ! replacing the existing cache path /Users/sunnysun/Library/Caches/lamindb/lamindata/data/dragen-3.7.6/hg38-graph-based/HG00102/HG00102.cnv.parquet → returning existing schema with same hash: Schema(uid='QBvafGF0xKNvVBC7', n=17, is_type=False, itype='Feature', hash='EiugPIjB0tZTJr9DUex2Eg', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=2, run_id=688, created_at=2025-05-12 12:30:22 UTC) → go to https://lamin.ai/laminlabs/lamindata/artifact/OHaSMnFwXxxRNZWS0000
# Counterpart to the ln.track() call at the top of the notebook.
# NOTE(review): presumably this marks the tracked run as finished — confirm
# against the lamindb documentation.
ln.finish()