Advertisement
Guest User

Untitled

a guest
Jun 20th, 2019
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.85 KB | None | 0 0
  import re

  vocab = ["foo", "bar", "baz"]
  s = "foo bar baz bar quux foo bla bla"

  # Count only the vocabulary words. Every entry starts at zero so that words
  # which never occur in the text still show up in the result.
  wordcount = dict.fromkeys(vocab, 0)
  # \w+ tokenizes the text into runs of word characters. The backslash is
  # essential: a bare "w+" would only match runs of the letter "w".
  for w in re.findall(r"\w+", s):
      if w in wordcount:
          wordcount[w] += 1
  8.  
  9. from collections import Counter
  10.  
  import re

  # Vocabulary entries may be multi-word phrases, so splitting the text into
  # single tokens is not enough; match every entry as a whole instead.
  vocab = ["foo bar", "baz"]
  # \b anchors each alternative on word boundaries so "baz" does not match
  # inside a longer word; re.escape keeps entries that contain regex
  # metacharacters literal.
  r = re.compile("|".join(r"\b%s\b" % re.escape(w) for w in vocab))
  wordcount = Counter(re.findall(r, s))
  14.  
  15. from collections import Counter
  16.  
  def count_many(needles, haystack):
      """Count how often each word in *needles* occurs in *haystack*.

      Args:
          needles: Iterable of hashable words to look for. It is read
              exactly once, so one-shot iterables (generators) work too.
          haystack: Text to search. It is split on whitespace, so only
              whole-word occurrences are counted.

      Returns:
          A dict mapping every needle that occurs at least once to its
          number of occurrences; needles that never occur are omitted.
      """
      # Build the lookup set once: testing membership against a list is
      # linear in the vocabulary size, and testing against a generator
      # would consume it on the first lookup.
      wanted = frozenset(needles)
      count = Counter(haystack.split())
      return {word: hits for word, hits in count.items() if word in wanted}

  count_many(["foo", "bar", "baz"], "testing somefoothing foo bar baz bax foo foo foo bar bar test bar test")
  # -> {'foo': 4, 'bar': 4, 'baz': 1}
  23.  
  24. from collections import defaultdict
  25.  
  def count_many(needles, haystack):
      """Count how often each word in *needles* occurs in *haystack*.

      Args:
          needles: Iterable of hashable words to look for. It is read
              exactly once, so one-shot iterables (generators) work too.
          haystack: Text to search. It is split on whitespace, so only
              whole-word occurrences are counted.

      Returns:
          A ``defaultdict(int)`` mapping every needle that occurs at least
          once to its number of occurrences. Looking up any other key
          yields 0 (and, as with any defaultdict, inserts it).
      """
      # Build the lookup set once: testing membership against a list is
      # linear in the vocabulary size, and testing against a generator
      # would consume it on the first lookup.
      wanted = frozenset(needles)
      count = defaultdict(int)
      for word in haystack.split():
          if word in wanted:
              count[word] += 1
      return count

  count_many(["foo", "bar", "baz"], "testing somefoothing foo bar baz bax foo foo foo bar bar test bar test")
  # -> defaultdict(<class 'int'>, {'foo': 4, 'bar': 4, 'baz': 1})
  35.  
  36. import time
  37. import random
  38.  
  # Size knobs for the benchmark input.
  REPEATS = 100000      # copies of each base word in the corpus
  DISTRACTORS = 100000  # vocabulary entries that never occur in the corpus

  # Corpus: the five base words repeated REPEATS times, shuffled, then joined
  # into one space-separated string.
  corpus_words = ["foo", "bar", "baz", "qux", "thud"] * REPEATS
  random.shuffle(corpus_words)
  longstring = " ".join(corpus_words)
  # Vocabulary: one two-word phrase and one single word that do occur, plus
  # many entries that cannot match, to stress lookups in a large vocabulary.
  vocab = ["foo bar", "baz"] + ["nothing" + str(i) for i in range(DISTRACTORS)]
  43.  
  44. import re
  45. from collections import Counter
  46.  
  # Benchmark 1: a single alternation regex built from the whole vocabulary.
  # perf_counter is the monotonic, high-resolution clock meant for timing.
  tic = time.perf_counter()
  # \b anchors every alternative on word boundaries; re.escape keeps
  # vocabulary entries that contain regex metacharacters literal.
  r = re.compile("|".join(r"\b%s\b" % re.escape(w) for w in vocab))
  wordcount = Counter(r.findall(longstring))
  print(time.perf_counter() - tic)
  51.  
  from sklearn.feature_extraction.text import CountVectorizer
  # scipy's top-level ``array`` was only a deprecated re-export of NumPy's;
  # import it from NumPy directly so this keeps working on current SciPy.
  from numpy import array

  # Benchmark 2: scikit-learn's CountVectorizer restricted to the vocabulary.
  # perf_counter is the monotonic, high-resolution clock meant for timing.
  tic = time.perf_counter()
  vectorized = CountVectorizer(vocabulary=vocab, ngram_range=(1, 3)).fit([longstring]) # contains 1 to 3 words
  counts = vectorized.transform([longstring])
  # Sum over documents, then take row 0 of the resulting 2-D array to get
  # one count per vocabulary entry.
  counts = array(counts.sum(axis=0))[0]
  # Pair each vocabulary entry with the count at the same position.
  wordcount = dict(zip(vocab, counts))
  print(time.perf_counter() - tic)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement