from textblob import TextBlob as tb
from textblob_aptagger import PerceptronTagger
import numpy as np
import nltk.data
import Constants
from pyspark import SparkContext, SparkConf
import nltk

TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')
TAGGER = PerceptronTagger()

def word_tokenize(x):
    return nltk.word_tokenize(x)

def pos_tag(s):
    global TAGGER
    return TAGGER.tag(s)

def wrap_words(pair):
    '''Associate each (word, tag) pair with its global word index.'''
    index = pair[0]
    result = []
    for word, tag in pair[1]:
        word = word.lower()
        result.append({"index": index, "word": word, "tag": tag})
        index += 1
    return result

if __name__ == '__main__':

    conf = SparkConf().setMaster(Constants.MASTER_URL).setAppName(Constants.APP_NAME)
    sc = SparkContext(conf=conf)
    data = sc.textFile(Constants.FILE_PATH)

    # Split every line into words, tag each word on its own, and keep the
    # resulting (word, tag) tuple; glom() collects each partition's tuples
    # into a single list.
    sent = data.flatMap(word_tokenize).map(pos_tag).map(lambda x: x[0]).glom()
    num_partition = sent.getNumPartitions()

    # Cumulative word counts per partition give the starting global index
    # of each partition (shifted right by one, starting at 0).
    base = list(np.cumsum(np.array(sent.map(len).collect())))
    base.insert(0, 0)
    base.pop()
    RDD = sc.parallelize(base, num_partition)
    tagged_doc = RDD.zip(sent).map(wrap_words).cache()
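
The script imports a project-local Constants module that is not included in the paste. A minimal sketch of what it would need to define, assuming local-mode Spark and a plain-text input file (all three values below are placeholders, not the author's originals):

# Constants.py (hypothetical)
MASTER_URL = 'local[*]'   # assumed: run Spark locally on all cores
APP_NAME = 'pos-tagging'  # assumed application name
FILE_PATH = 'input.txt'   # assumed path to the text corpus

With such a module in place, each element of tagged_doc is one partition's list of dicts, e.g. {"index": 0, "word": "hello", "tag": "UH"}, where "index" is the word's global position across the whole file.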