Advertisement
Guest User

Untitled

a guest
Sep 18th, 2019
195
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.23 KB | None | 0 0
  1. import pandas as pd
  2. from cyvcf2 import VCF
  3.  
  4. def vcf2pandas(
  5. vcf_file_path: str,
  6. variant_headers: list = [],
  7. num_rows: int = -1
  8. ) -> pd.DataFrame:
  9.  
  10. if not isinstance(num_rows, int):
  11. raise ValueError("num_rows must be an integer")
  12. if num_rows < -1:
  13. raise ValueError("num_rows cannot be negative")
  14.  
  15. v = VCF(vcf_file_path)
  16. if not variant_headers:
  17. variant_headers = ['ALT', 'CHROM', 'FILTER', 'FORMAT', 'ID', 'POS', 'QUAL', 'REF', 'aaf', 'ploidy', 'is_snp', 'is_indel', 'is_deletion', 'is_sv']
  18. info_headers = {}
  19. for header in v.header_iter():
  20. if header['HeaderType'] == 'INFO':
  21. info_headers[header['ID']] = header
  22. df = pd.DataFrame()
  23.  
  24. # Parse each row
  25. for idx, i in enumerate(v):
  26. row_val = {}
  27. if idx % 100 == 0:
  28. print(f"Parsing line: {idx}")
  29. for header in variant_headers:
  30. row_val[header] = i.__getattribute__(header)
  31. for header in info_headers.keys():
  32. row_val[header] = i.INFO.get(header)
  33. df = df.append(row_val, ignore_index=True)
  34. if num_rows == -1:
  35. break
  36. elif idx == num_rows:
  37. break
  38. return df
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement