Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- from pyspark import SparkConf, SparkContext
- import re
def getPairs(line, target="narodnaya"):
    """Return "first_second" word bigrams from *line* that start with *target*.

    The line is split into words on runs of whitespace, with any adjacent
    non-word characters (punctuation) folded into the separator.

    Parameters
    ----------
    line : str
        A single line of (already lowercased) text.
    target : str
        Only bigrams whose first word equals this value are kept.
        Defaults to "narodnaya", preserving the original behavior.

    Returns
    -------
    list[str]
        Matching bigrams joined with an underscore, in order of appearance.
    """
    # Raw string: \W and \s are regex escapes, not Python string escapes.
    words = re.split(r"\W*\s+\W*", line)
    return [
        words[i] + "_" + words[i + 1]
        for i in range(len(words) - 1)
        if words[i] == target
    ]
# Driver script: count "narodnaya_*" bigrams in the Wikipedia article dump
# and print them sorted alphabetically by bigram, tab-separated.
sc = SparkContext(conf=SparkConf().setAppName("timofeeva").setMaster("yarn"))

pairs = (
    sc.textFile("hdfs:///data/wiki/en_articles_part")
    .map(lambda x: x.strip().lower())
    # Raw string for the regex; strips leading/trailing punctuation per line.
    .map(lambda x: re.sub(r"^\W+|\W+$", "", x))
    .flatMap(getPairs)
    .map(lambda x: (x, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda kv: kv[0])
)

# collect() returns the whole RDD in one job; the original
# take(rdd.count()) triggered two separate Spark jobs for the same result.
for pair, count in pairs.collect():
    # Print the string directly: str(pair.encode("utf8")) on Python 3 emits
    # the bytes repr ("b'...'"), corrupting the tab-separated output.
    print(pair + "\t" + str(count))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement