SHARE
TWEET

kek.py




Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
from pyspark import SparkContext, SparkConf
import re

# Spark application "bigram", submitted to the YARN master.
config = SparkConf().setMaster("yarn").setAppName("bigram")
spark_context = SparkContext(conf=config)

# Stop words: one entry per line, normalised by stripping whitespace and
# lower-casing, then collected on the driver and shipped to the executors
# as a broadcast set for membership tests.
stop_words_input = spark_context.textFile("/data/wiki/stop_words_en-xpo6.txt")
stop_words = stop_words_input.map(lambda raw_line: raw_line.strip().lower())
broadcast_stop_words = spark_context.broadcast(set(stop_words.collect()))
def get_words(line):
    """Split one "<article_id>\\t<text>" line into a list of word tokens.

    The article id (everything before the first tab) is discarded. Leading
    and trailing non-word characters are removed from the text, which is
    then split on runs of whitespace together with any non-word characters
    adjacent to that whitespace. Empty tokens are dropped.

    Args:
        line: A text line holding an article id and the article text,
            separated by a tab.

    Returns:
        A list of tokens. The list is empty when the line has no tab (for
        example an article whose text is empty once the line has been
        stripped) or when the text contains no word characters.
    """
    # ``type(u"")`` is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # the original coercion is kept without referencing a Python 2-only name.
    text_type = type(u"")
    parts = text_type(line.rstrip()).split('\t', 1)
    if len(parts) < 2:
        # No tab means no article text; return nothing instead of raising
        # ValueError, which would abort the whole Spark job.
        return []
    text = re.sub(r"^\W+|\W+$", "", parts[1], flags=re.UNICODE)
    # Build a real list (not a lazy ``filter`` object) because callers slice it.
    return [word for word in re.split(r"\W*\s+\W*", text, flags=re.UNICODE) if word]
def make_bigrams(line):
    """Return the adjacent word pairs of an article line as "w1_w2" strings.

    The line is tokenised with ``get_words``; each token is paired with the
    token that follows it, and the two are joined with an underscore. A line
    with fewer than two tokens yields an empty list.
    """
    tokens = get_words(line)
    # zip stops at the shorter sequence, so pairing the full token list with
    # its one-step shift yields exactly the consecutive pairs.
    return ['_'.join(pair) for pair in zip(tokens, tokens[1:])]
def find_bigram(bigram):
    """Return True when neither word of a "w1_w2" bigram is a stop word.

    The bigram is split at its first underscore and both halves are looked
    up in the broadcast stop-word set.
    """
    first, second = bigram.split('_', 1)
    stop_word_set = broadcast_stop_words.value
    return not (first in stop_word_set or second in stop_word_set)
# Article lines, stripped and lower-cased; 16 is passed to textFile as the
# partition count hint.
articles = spark_context.textFile("/data/wiki/en_articles_part", 16).map(lambda x: x.strip().lower())

# Single words and bigrams with stop words removed.
rdd2 = articles.flatMap(get_words).filter(lambda x: x not in broadcast_stop_words.value)
rdd3 = articles.flatMap(make_bigrams).filter(find_bigram)

# (token, 1) pairs ready for counting with reduceByKey.
words = rdd2.map(lambda x: (x, 1))
bigrams = rdd3.map(lambda x: (x, 1))

# Collect the per-word counts once and derive the total from them. The sum of
# the per-word counts equals the number of (word, 1) records, so this gives
# the same total as a separate count() action without running a second pass
# over the uncached article lineage.
word_counts = dict(words.reduceByKey(lambda a, b: a + b).collect())
words_count = spark_context.broadcast(sum(word_counts.values()))
# From here on ``words`` is the broadcast {word: count} dict, not the RDD.
words = spark_context.broadcast(word_counts)

from math import log
def calc_npmi(el):
    """Compute the normalised pointwise mutual information of a bigram.

    Args:
        el: A ``(bigram, count)`` pair, where ``bigram`` is a "w1_w2" string
            and ``count`` is the number of times it occurred.

    Returns:
        A ``(bigram, npmi)`` tuple with
        ``npmi = -ln(p(ab) / (p(a) * p(b))) / ln(p(ab))``.

    All three probabilities are estimated by dividing the relevant count by
    the broadcast total in ``words_count``. The bigram is split at its first
    underscore, and both halves must be keys of the broadcast ``words`` dict.
    """
    bigram, occurrences = el
    total = words_count.value
    p_pair = float(occurrences) / total

    first, second = bigram.split('_', 1)
    vocabulary = words.value
    p_first = float(vocabulary[first]) / total
    p_second = float(vocabulary[second]) / total

    pmi = log(p_pair / p_first / p_second)
    return bigram, -pmi / log(p_pair)
# Count each bigram, keep those seen at least 500 times, score them with
# NPMI and order them from the highest score to the lowest.
bigrams = (
    bigrams
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda counted: counted[1] >= 500)
    .map(calc_npmi)
    .sortBy(lambda scored: -scored[1])
)

# Print the 39 best-scoring bigrams, one per line, UTF-8 encoded.
for bigram, _ in bigrams.take(39):
    print(bigram.encode('utf8'))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.