Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re

# Words whose occurrences should be tallied; every other word is ignored.
vocab = ["foo", "bar", "baz"]
s = "foo bar baz bar quux foo bla bla"

# Seed each vocabulary word with zero so words that never occur still
# appear in the result.
wordcount = dict.fromkeys(vocab, 0)

# \w+ yields each maximal run of word characters, i.e. the individual
# words of s.  (Without the backslash the pattern would only match runs of
# the literal letter "w" and every count would stay at zero.)
for w in re.findall(r"\w+", s):
    if w in wordcount:
        wordcount[w] += 1
import re
from collections import Counter

# Vocabulary entries may be multi-word phrases, so the text cannot simply
# be split on whitespace; a single alternation pattern is used instead.
vocab = ["foo bar", "baz"]

# Each entry is wrapped in \b word boundaries so it only matches whole
# words, and passed through re.escape so that any regex metacharacters in
# an entry are matched literally.
r = re.compile("|".join(rf"\b{re.escape(w)}\b" for w in vocab))

# `s` is the sample text defined by the preceding snippet.
wordcount = Counter(r.findall(s))
from collections import Counter


def count_many(needles, haystack):
    """Count how often each wanted word occurs in a whitespace-separated text.

    Args:
        needles: Collection of words to look for (e.g. a list or set of
            strings).  Pass a collection, not a single string.
        haystack: Text to search.  It is split on runs of whitespace and
            only whole words are compared, so "somefoothing" does not
            count as an occurrence of "foo".

    Returns:
        A plain dict mapping every needle that occurs at least once to its
        number of occurrences.  Needles that never occur are omitted.

    Example:
        >>> text = "testing somefoothing foo bar baz bax foo foo foo bar bar test bar test"
        >>> count_many(["foo", "bar", "baz"], text)
        {'foo': 4, 'bar': 4, 'baz': 1}
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # a scan over the whole needle list.
    wanted = frozenset(needles)
    # Filtering before counting avoids tallying words nobody asked for.
    return dict(Counter(word for word in haystack.split() if word in wanted))
from collections import defaultdict


def count_many(needles, haystack):
    """Tally occurrences of the wanted words in a whitespace-separated text.

    Args:
        needles: Collection of words to look for (e.g. a list or set of
            strings).  Pass a collection, not a single string.
        haystack: Text to search.  It is split on runs of whitespace and
            only whole words are compared.

    Returns:
        A ``defaultdict(int)`` mapping each needle that occurs at least
        once to its count.  Because it is a defaultdict, reading a key
        that was never seen returns 0 (and inserts that key).

    Example:
        >>> text = "testing somefoothing foo bar baz bax foo foo foo bar bar test bar test"
        >>> count_many(["foo", "bar", "baz"], text)
        defaultdict(<class 'int'>, {'foo': 4, 'bar': 4, 'baz': 1})
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # a scan over the whole needle list.
    wanted = frozenset(needles)
    count = defaultdict(int)
    for word in haystack.split():
        if word in wanted:
            count[word] += 1
    return count
import random
import time

# Benchmark corpus: five distinct words, 100,000 copies of each, shuffled
# into random order and joined into one space-separated string.
_corpus_words = ["foo", "bar", "baz", "qux", "thud"] * 100000
random.shuffle(_corpus_words)
longstring = " ".join(_corpus_words)

# Benchmark vocabulary: two entries built from corpus words plus 100,000
# "nothingN" fillers that never occur in the corpus, to stress approaches
# whose cost grows with vocabulary size.
vocab = ["foo bar", "baz"] + [f"nothing{i}" for i in range(100000)]
import re
import time
from collections import Counter

# Benchmark 1: one big alternation regex over the whole vocabulary.
# perf_counter is used because it is a monotonic, high-resolution clock
# intended for measuring elapsed intervals.
tic = time.perf_counter()
# Each entry is wrapped in \b word boundaries so it only matches whole
# words, and escaped so it is matched literally.
r = re.compile("|".join(rf"\b{re.escape(w)}\b" for w in vocab))
wordcount = Counter(r.findall(longstring))
print(time.perf_counter() - tic)
import time

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Benchmark 2: scikit-learn's CountVectorizer with a fixed vocabulary.
# NumPy is used directly for the array conversion; the NumPy aliases that
# SciPy used to re-export at its top level (such as scipy.array) are
# deprecated.
tic = time.perf_counter()
# ngram_range=(1, 3) makes the vectorizer consider 1- to 3-word sequences,
# so multi-word vocabulary entries such as "foo bar" can be counted.
vectorized = CountVectorizer(vocabulary=vocab, ngram_range=(1, 3)).fit([longstring])
counts = vectorized.transform([longstring])
# Sum over documents, then take the single row as a flat array with one
# count per vocabulary entry.
counts = np.asarray(counts.sum(axis=0))[0]
wordcount = dict(zip(vocab, counts))
print(time.perf_counter() - tic)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement