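# Collocation extraction with PySpark: rank adjacent word pairs from a sample
# of Wikipedia articles by Normalized Pointwise Mutual Information (NPMI).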
from pyspark import SparkConf, SparkContext
import re
import math

# Stop an already-running context first if this is re-run interactively:
# sc.stop()
sc = SparkContext(conf=SparkConf().setAppName("example").setMaster("yarn-client"))

def parse_article(line):
    """Split a tab-separated 'article_id<TAB>text' line into a list of words."""
    article_id, text = line.rstrip().split('\t', 1)
    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
    text = re.sub(r"[^a-zA-Z\d' ]", "", text, flags=re.UNICODE)
    words = re.split(r"\W*\s+\W*", text, flags=re.UNICODE)
    return words

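# For example, a hypothetical input line "12\tanarchism is a political philosophy"
# would come back as ['anarchism', 'is', 'a', 'political', 'philosophy'].
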
def stop_words_filter(words):
    """Drop words that appear in the stop-word list."""
    return [w for w in words if w not in stop_res]

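# For example, stop_words_filter(['the', 'roman', 'empire']) -> ['roman', 'empire'],
# assuming 'the' is in the stop-word list.
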
def make_pairs(words):
    """Join each pair of adjacent words into a single 'w1_w2' token."""
    return [words[i] + '_' + words[i + 1] for i in range(len(words) - 1)]

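# For example, make_pairs(['new', 'york', 'city']) -> ['new_york', 'york_city'].
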
stop_list = sc.textFile("/data/wiki/stop_words_en-xpo6.txt")
stop_list = stop_list.map(lambda x: x.strip().lower())
# collect() replaces take(count()); a set makes the membership tests O(1)
stop_res = set(stop_list.collect())

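# A more scalable variant (a sketch, assuming the same stop_res set) would
# broadcast the set once instead of capturing it in every closure:
#   stop_bcast = sc.broadcast(stop_res)
# and then test membership with `w not in stop_bcast.value` in the filters.
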
rdd = sc.textFile("/data/wiki/en_articles_part")
rdd2 = rdd.map(lambda x: x.strip().lower())
rdd3 = rdd2.map(parse_article)        # list of words per article
rdd4 = rdd3.map(stop_words_filter)    # the same, with stop words removed

words_rdd = rdd2.flatMap(parse_article)
cnt_words = words_rdd.count()
# Each article of n words yields n - 1 adjacent pairs, so the total number of
# pair slots is (total words) - (number of articles).
cnt_pairs = cnt_words - rdd3.count()

words_rdd2 = words_rdd.filter(lambda x: x not in stop_res)
words_rdd3 = words_rdd2.map(lambda x: (x, 1))
words_rdd4 = words_rdd3.reduceByKey(lambda a, b: a + b)
words_rdd5 = words_rdd4.sortBy(lambda a: -a[1])
# words_rdd5.take(10)

# word -> count lookup table; collect() replaces take(count())
words = dict(words_rdd5.collect())

pairs_rdd = rdd4.flatMap(make_pairs)
pairs_rdd2 = pairs_rdd.map(lambda x: (x, 1))
pairs_rdd3 = pairs_rdd2.reduceByKey(lambda a, b: a + b)
pairs_rdd4 = pairs_rdd3.filter(lambda x: x[1] > 500)   # keep pairs seen more than 500 times
pairs_rdd5 = pairs_rdd4.sortBy(lambda a: -a[1])
pairs = pairs_rdd5.take(38)

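# `pairs` is now a short list of (pair, count) tuples, e.g. a hypothetical
# ('word1_word2', 501).
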
NPMI_list = list()
for pair, cnt in pairs:
    word_1, word_2 = pair.split("_")
    # PMI = log( P(pair) / (P(word_1) * P(word_2)) )
    #     = log( cnt * cnt_words^2 / (count(word_1) * count(word_2) * cnt_pairs) )
    # float() keeps the divisions exact under Python 2 as well.
    PMI = math.log(float(cnt_words) ** 2 * cnt / (words[word_1] * words[word_2] * cnt_pairs))
    # NPMI = PMI / -log(P(pair)); the small epsilon guards against log(0)
    NPMI = (0 - PMI) / math.log(float(cnt) / cnt_pairs + 0.000001)
    NPMI_list.append((pair, NPMI))

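# Worked example with made-up numbers: if a pair occurs cnt = 1000 times out
# of cnt_pairs = 10**7 slots, and each of its words occurs 10**4 times out of
# cnt_words = 10**7 words, then
#   PMI  = ln(1000 * (10**7)**2 / (10**4 * 10**4 * 10**7)) = ln(100) ~ 4.61
#   NPMI = -4.61 / ln(10**-4) = 4.61 / 9.21                          ~ 0.50
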
# Print the pairs in descending NPMI order.
for pair, npmi in sorted(NPMI_list, key=lambda a: -a[1]):
    print("{}".format(pair))

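# To smoke-test the whole pipeline without a YARN cluster (an assumption, not
# part of the original run), the same script can be started with
# SparkConf().setMaster("local[*]") and small local copies of the two input files.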