Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import math
- from collections import Counter
- # NOTE: this assumes every base in the input sequence is uppercase (A, C, G, T)
def kmers(coll, k):
    """Lazily yield every contiguous length-``k`` slice of ``coll``.

    Slices are produced left to right, one per start offset, so a sequence
    of length ``n`` gives ``n - k + 1`` windows.  Nothing is yielded when
    ``k`` is larger than ``len(coll)``.

    Args:
        coll: Any sliceable sequence with a length (e.g. a DNA string).
        k: Window width.

    Yields:
        ``coll[start:start + k]`` for each start offset in order.
    """
    window_count = len(coll) - k + 1
    yield from (coll[start:start + k] for start in range(window_count))
def encode(kmer):
    """Encode a k-mer as a non-negative integer in base 4.

    Each base is mapped to a digit (A=0, C=1, T=2, G=3) and the character at
    index ``i`` is weighted by ``4 ** i``, i.e. the first character is the
    least-significant digit.  Distinct k-mers of the same length therefore
    receive distinct codes in ``range(4 ** len(kmer))``.

    Args:
        kmer: Iterable of single-character bases, upper case only.

    Returns:
        The integer code; ``0`` for an empty k-mer.

    Raises:
        KeyError: If a character is not one of ``A``, ``C``, ``T``, ``G``
            (lower-case letters included).
    """
    # Built once per call rather than once per character.
    digit_of = {'A': 0, 'C': 1, 'T': 2, 'G': 3}
    value = 0
    for position, base in enumerate(kmer):
        # The digit has to be *scaled* by its place value.  Adding the two
        # (as the code previously did) made different k-mers collide, e.g.
        # "AC" and "CA" both evaluated to 6.
        value += digit_of[base] * 4 ** position
    return value
def mk_vec(coll, k):
    """Build the k-mer count vector of ``coll``.

    Args:
        coll: Sequence to scan; passed straight to ``kmers``.
        k: Window width; passed straight to ``kmers``.

    Returns:
        A ``Counter`` mapping each k-mer produced by ``kmers(coll, k)`` to
        the number of times it was produced.
    """
    # Counter tallies every element of the iterable it is constructed from.
    return Counter(kmers(coll, k))
def euclidean_dist(lhs, rhs):
    """Return the Euclidean (L2) distance between two sparse count vectors.

    Args:
        lhs: Mapping from key to numeric count (e.g. a ``Counter``).
        rhs: Mapping of the same form.

    Returns:
        The square root of the summed squared per-key differences, as a
        float; ``0.0`` when both mappings are empty.
    """
    # A key missing from one side counts as 0.  Using ``.get`` makes that
    # explicit, so plain dicts work as well as Counters (which previously
    # were the only type that did not raise KeyError here) and a
    # defaultdict argument is never grown by the lookup.
    keys = set(lhs) | set(rhs)
    squared_total = sum(
        (lhs.get(key, 0) - rhs.get(key, 0)) ** 2 for key in keys
    )
    return math.sqrt(squared_total)
def manhattan_dist(lhs, rhs):
    """Return the Manhattan (L1) distance between two sparse count vectors.

    Args:
        lhs: Mapping from key to numeric count (e.g. a ``Counter``).
        rhs: Mapping of the same form.

    Returns:
        The sum of absolute per-key differences; ``0`` when both mappings
        are empty.
    """
    # A key missing from one side counts as 0.  Using ``.get`` makes that
    # explicit, so plain dicts work as well as Counters (which previously
    # were the only type that did not raise KeyError here) and a
    # defaultdict argument is never grown by the lookup.
    keys = set(lhs) | set(rhs)
    return sum(abs(lhs.get(key, 0) - rhs.get(key, 0)) for key in keys)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement