Advertisement
Guest User

Untitled

a guest
Dec 6th, 2019
90
0
Never
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
text 0.70 KB | None | 0 0
  1. from pyspark import SparkConf, SparkContext
  2. import re
  3.  
  4. def getPairs(line):
  5. words = re.split("\W*\s+\W*", line)
  6. return [words[i] + "_" + words[i + 1] for i in range(len(words) - 1) if words[i] == "narodnaya"]
  7.  
  8. sc = SparkContext(conf=SparkConf().setAppName("timofeeva").setMaster("yarn"))
  9. rdd = sc.textFile("hdfs:///data/wiki/en_articles_part")
  10. rdd = rdd.map(lambda x: x.strip().lower())
  11. rdd = rdd.map(lambda x: re.sub("^\W+|\W+$", "", x))
  12. rdd = rdd.flatMap(lambda x: getPairs(x))
  13. rdd = rdd.map(lambda x: (x, 1))
  14. rdd = rdd.reduceByKey(lambda a, b: a + b)
  15. rdd = rdd.sortBy(lambda a: a[0])
  16. ans = rdd.take(rdd.count())
  17. for pair, count in ans:
  18. print(str(pair.encode("utf8")) + "\t" + str(count))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement