from pyspark import SparkConf, SparkContext
import re
import math

# sc.stop()  # stop a previously created context before re-running this cell
sc = SparkContext(conf=SparkConf().setAppName("example").setMaster("yarn-client"))
def parse_article(line):
    # A line looks like "<article_id>\t<article_text>"; return the list of words.
    article_id, text = line.rstrip().split('\t', 1)
    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)      # strip leading/trailing non-word characters
    text = re.sub(r"[^a-zA-Z\d' ]", "", text, flags=re.UNICODE)  # keep only letters, digits, apostrophes and spaces
    words = re.split(r"\W*\s+\W*", text, flags=re.UNICODE)       # split on (punctuated) whitespace
    return words
def stop_words_filter(words):
    # Drop every word that appears in the global stop-word set `stop_res`.
    filtered = list()
    for w in words:
        if w not in stop_res:
            filtered.append(w)
    return filtered
def make_pairs(words):
    # Join every pair of consecutive words into a "word1_word2" key.
    pairs = list()
    for i in range(len(words) - 1):
        pairs.append(words[i] + '_' + words[i + 1])
    return pairs
# Load the stop-word list and materialise it on the driver as a set for fast lookups.
stop_list = sc.textFile("/data/wiki/stop_words_en-xpo6.txt")
stop_list = stop_list.map(lambda x: x.strip().lower())
stop_res = set(stop_list.collect())
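# Note: a sketch, not part of the original pipeline. For a larger stop list it is
# more idiomatic to ship the set to executors as a Spark broadcast variable instead
# of capturing it in each lambda's closure, e.g.:
#   stop_bc = sc.broadcast(stop_res)
#   filtered = words_rdd.filter(lambda w: w not in stop_bc.value)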
# Read the articles, lower-case them, tokenise, and remove stop words.
rdd = sc.textFile("/data/wiki/en_articles_part")
rdd2 = rdd.map(lambda x: x.strip().lower())
rdd3 = rdd2.map(parse_article)       # one list of words per article
rdd4 = rdd3.map(stop_words_filter)   # the same lists without stop words

# Total number of words and of consecutive word pairs
# (an article with n words contributes n - 1 pairs).
words_rdd = rdd2.flatMap(parse_article)
cnt_words = words_rdd.count()
cnt_pairs = cnt_words - rdd3.count()

words_rdd2 = words_rdd.filter(lambda x: x not in stop_res)
# Per-word frequencies, collected on the driver as a dict: word -> count.
words_rdd3 = words_rdd2.map(lambda x: (x, 1))
words_rdd4 = words_rdd3.reduceByKey(lambda a, b: a + b)
words_rdd5 = words_rdd4.sortBy(lambda a: -a[1])
# words_rdd5.take(10)
words = dict(words_rdd5.collect())
# words
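# Note: the map/reduceByKey/collect steps above could also be collapsed with the
# built-in RDD.countByValue(), which returns the word -> count mapping directly:
#   words = dict(words_rdd2.countByValue())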
# Per-pair frequencies; keep only pairs that occur more than 500 times.
pairs_rdd = rdd4.flatMap(make_pairs)
pairs_rdd2 = pairs_rdd.map(lambda x: (x, 1))
pairs_rdd3 = pairs_rdd2.reduceByKey(lambda a, b: a + b)
pairs_rdd4 = pairs_rdd3.filter(lambda x: x[1] > 500)
pairs_rdd5 = pairs_rdd4.sortBy(lambda a: -a[1])
pairs = pairs_rdd5.take(38)   # top pairs after the frequency filter
# PMI(a, b)  = log( P(a, b) / (P(a) * P(b)) )
# NPMI(a, b) = PMI(a, b) / (-log P(a, b))
# float() guards against integer division if this is run under Python 2.
NPMI_list = list()
for pair, cnt in pairs:
    word_1, word_2 = pair.split("_")
    PMI = math.log((cnt_words ** 2) * cnt / float(words[word_1] * words[word_2] * cnt_pairs))
    NPMI = (0 - PMI) / math.log(float(cnt) / cnt_pairs + 0.000001)
    NPMI_list.append((pair, NPMI))

# Print the pairs ordered by NPMI, highest first.
for pair, npmi in sorted(NPMI_list, key=lambda a: -a[1]):
    print("{}".format(pair))
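# Sketch, not part of the original job: the count-based expression above follows from
# the probability form of PMI/NPMI with P(w) = count(w) / cnt_words and
# P(a, b) = count(a, b) / cnt_pairs. A small helper that works directly with
# probabilities, assuming those same counts, can be used to sanity-check a single pair.
def npmi_from_counts(pair_cnt, word1_cnt, word2_cnt, total_words, total_pairs):
    p_ab = float(pair_cnt) / total_pairs   # P(a, b)
    p_a = float(word1_cnt) / total_words   # P(a)
    p_b = float(word2_cnt) / total_words   # P(b)
    pmi = math.log(p_ab / (p_a * p_b))
    return pmi / (-math.log(p_ab))

# Example (reusing the last pair from the loop above):
#   npmi_from_counts(cnt, words[word_1], words[word_2], cnt_words, cnt_pairs)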