# Untitled: several ways to count occurrences of vocabulary words in a string.

import re

# Words whose occurrences we want to count, and the text to search.
vocab = ["foo", "bar", "baz"]
s = "foo bar baz bar quux foo bla bla"

# Start every vocabulary word at zero so that words which never occur in `s`
# still show up in the result.
wordcount = dict.fromkeys(vocab, 0)
# `\w+` yields each maximal run of word characters, i.e. one token per word.
# (Without the backslash the pattern would only match runs of the letter "w".)
for w in re.findall(r"\w+", s):
    if w in wordcount:
        wordcount[w] += 1

from collections import Counter

import re

# Vocabulary entries may span several words, so match them with one regex
# instead of tokenising the text first.
vocab = ["foo bar", "baz"]
# Each entry is escaped (so regex metacharacters in a word are taken
# literally) and wrapped in `\b` word boundaries so it only matches whole
# words. Alternatives are tried left to right and matches never overlap.
r = re.compile("|".join(r"\b%s\b" % re.escape(w) for w in vocab))
# Counter maps each matched vocabulary entry to its number of occurrences in `s`.
wordcount = Counter(r.findall(s))

from collections import Counter

def count_many(needles, haystack):
    """Count how often each needle occurs as a whitespace-separated word.

    Args:
        needles: Container of words to look for (tested with ``in``).
        haystack: Text to search; it is split on whitespace.

    Returns:
        A plain dict mapping each needle that occurs at least once in
        ``haystack`` to its number of occurrences. Needles that never occur
        are left out.
    """
    tally = Counter(haystack.split())
    matches = {}
    for token, occurrences in tally.items():
        # Keep only the words the caller asked about.
        if token in needles:
            matches[token] = occurrences
    return matches

# Example call; "somefoothing" is not counted because only whole words match.
count_many(["foo", "bar", "baz"], "testing somefoothing foo bar baz bax foo foo foo bar bar test bar test")
# Result (key order aside): {'baz': 1, 'foo': 4, 'bar': 4}

from collections import defaultdict

def count_many(needles, haystack):
    """Count how often each needle occurs as a whitespace-separated word.

    Args:
        needles: Container of words to look for (tested with ``in``).
        haystack: Text to search; it is split on whitespace.

    Returns:
        A ``defaultdict(int)`` mapping each needle found in ``haystack`` to
        its number of occurrences. Needles that never occur have no entry,
        but looking one up yields 0 because of the default factory.
    """
    hits = defaultdict(int)
    # Lazily filter the tokens down to the ones the caller asked about.
    wanted_tokens = (token for token in haystack.split() if token in needles)
    for token in wanted_tokens:
        hits[token] += 1
    return hits

# Example call; "somefoothing" is not counted because only whole words match.
count_many(["foo", "bar", "baz"], "testing somefoothing foo bar baz bax foo foo foo bar bar test bar test")
# Result (key order aside): defaultdict(<class 'int'>, {'baz': 1, 'foo': 4, 'bar': 4})

import time
import random

# Benchmark input: 500,000 words (five distinct words, 100,000 copies each)
# in random order, joined into one space-separated string.
corpus_words = ["foo", "bar", "baz", "qux", "thud"] * 100000
random.shuffle(corpus_words)
longstring = " ".join(corpus_words)
# The word list is only scaffolding; keep just the joined string around.
del corpus_words

# Benchmark vocabulary: two entries that can occur in the text plus 100,000
# filler entries ("nothing0" .. "nothing99999") that never do.
vocab = ["foo bar", "baz"]
vocab.extend("nothing%d" % i for i in range(100000))

import re
from collections import Counter

# Benchmark 1: a single compiled alternation over the whole vocabulary.
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed intervals.
tic = time.perf_counter()
# Escape each entry so it is matched literally, and wrap it in `\b` word
# boundaries so it only matches whole words. (Without the backslashes the
# pattern would look for a literal "b" on both sides and match nothing.)
r = re.compile("|".join(r"\b%s\b" % re.escape(w) for w in vocab))
wordcount = Counter(r.findall(longstring))
print(time.perf_counter() - tic)  # elapsed seconds, including compilation

from sklearn.feature_extraction.text import CountVectorizer
from scipy import array

# Use NumPy directly rather than the `array` alias re-exported from SciPy's
# top-level namespace, which is deprecated.
import numpy as np

# Benchmark 2: scikit-learn's CountVectorizer restricted to a fixed vocabulary.
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed intervals.
tic = time.perf_counter()
vectorized = CountVectorizer(vocabulary=vocab, ngram_range=(1, 3)).fit([longstring])  # contains 1 to 3 words
# One document in, so the result is a 1 x len(vocab) sparse count matrix.
counts = vectorized.transform([longstring])
# Sum over documents and flatten to a 1-D array with one count per entry.
counts = np.asarray(counts.sum(axis=0)).ravel()
# Relies on column i of the count matrix corresponding to vocab[i].
wordcount = dict(zip(vocab, counts))
print(time.perf_counter() - tic)  # elapsed seconds