Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- from pyspark import SparkConf, SparkContext
- import re
def getPairs(line, target="narodnaya"):
    """Return "first_second" word bigrams from *line* that start with *target*.

    The line is split into words on runs of whitespace, with any adjacent
    non-word characters (punctuation) folded into the separator.

    Parameters
    ----------
    line : str
        A single line of (already lowercased) text.
    target : str
        Only bigrams whose first word equals this value are kept.
        Defaults to "narodnaya", preserving the original behavior.

    Returns
    -------
    list[str]
        Matching bigrams joined with an underscore, in order of appearance.
    """
    # Raw string: \W and \s are regex escapes, not Python string escapes.
    words = re.split(r"\W*\s+\W*", line)
    return [
        words[i] + "_" + words[i + 1]
        for i in range(len(words) - 1)
        if words[i] == target
    ]
# Driver script: count "narodnaya_*" bigrams in the Wikipedia article dump
# and print them sorted alphabetically by bigram, tab-separated.
sc = SparkContext(conf=SparkConf().setAppName("timofeeva").setMaster("yarn"))

pairs = (
    sc.textFile("hdfs:///data/wiki/en_articles_part")
    .map(lambda x: x.strip().lower())
    # Raw string for the regex; strips leading/trailing punctuation per line.
    .map(lambda x: re.sub(r"^\W+|\W+$", "", x))
    .flatMap(getPairs)
    .map(lambda x: (x, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda kv: kv[0])
)

# collect() returns the whole RDD in one job; the original
# take(rdd.count()) triggered two separate Spark jobs for the same result.
for pair, count in pairs.collect():
    # Print the string directly: str(pair.encode("utf8")) on Python 3 emits
    # the bytes repr ("b'...'"), corrupting the tab-separated output.
    print(pair + "\t" + str(count))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement