Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from cyvcf2 import VCF
def vcf2pandas(
    vcf_file_path: str,
    variant_headers: list = [],
    num_rows: int = -1
) -> pd.DataFrame:
    """Load variant records from a VCF file into a pandas DataFrame.

    Each record becomes one row. Columns are the requested variant
    attributes followed by every INFO field ID declared in the VCF header.

    Args:
        vcf_file_path: Path handed to ``cyvcf2.VCF``.
        variant_headers: Names of attributes to read from each variant
            record. When empty, a built-in set of attribute names is used.
            The argument is only read, never mutated.
        num_rows: Maximum number of records to read. ``-1`` (the default)
            reads every record in the file; ``0`` yields an empty frame.

    Returns:
        A DataFrame with at most ``num_rows`` rows (all rows for ``-1``).
        INFO fields absent from a record are filled with ``None``.

    Raises:
        ValueError: If ``num_rows`` is not an integer or is below ``-1``.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import islice

    if not isinstance(num_rows, int):
        raise ValueError("num_rows must be an integer")
    if num_rows < -1:
        raise ValueError("num_rows cannot be negative")

    if variant_headers:
        attr_names = list(variant_headers)
    else:
        attr_names = [
            'ALT', 'CHROM', 'FILTER', 'FORMAT', 'ID', 'POS', 'QUAL', 'REF',
            'aaf', 'ploidy', 'is_snp', 'is_indel', 'is_deletion', 'is_sv',
        ]

    # -1 is the "no limit" sentinel; islice treats None as unbounded.
    limit = None if num_rows == -1 else num_rows

    reader = VCF(vcf_file_path)
    try:
        # dict.fromkeys de-duplicates while preserving header order.
        info_ids = list(dict.fromkeys(
            header['ID']
            for header in reader.header_iter()
            if header['HeaderType'] == 'INFO'
        ))

        # Accumulate plain dicts and build the frame once at the end:
        # growing a DataFrame row by row is quadratic, and
        # DataFrame.append no longer exists in pandas >= 2.0.
        rows = []
        for idx, variant in enumerate(islice(reader, limit)):
            if idx % 100 == 0:
                print(f"Parsing line: {idx}")
            row = {name: getattr(variant, name) for name in attr_names}
            for info_id in info_ids:
                row[info_id] = variant.INFO.get(info_id)
            rows.append(row)
    finally:
        # Release the underlying file handle even if parsing fails.
        reader.close()

    # Explicit columns keep a stable layout even when no rows were read;
    # de-duplicate in case an INFO ID collides with an attribute name.
    columns = list(dict.fromkeys([*attr_names, *info_ids]))
    return pd.DataFrame(rows, columns=columns)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement