Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from textblob import TextBlob as tb<br>
- from textblob_aptagger import PerceptronTagger<br>
- import numpy as np<br>
- import nltk.data<br>
- import Constants<br>
- from pyspark import SparkContext,SparkConf<br>
- import nltk<br>
# Sentence tokenizer loaded once at module import (Punkt, English model).
# NOTE(review): TOKENIZER is never referenced in the visible code —
# word_tokenize() calls nltk.word_tokenize directly; confirm whether this
# load is dead code or used elsewhere.
TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')
def word_tokenize(x):
    """Split the text *x* into a list of word tokens via NLTK's default tokenizer."""
    tokens = nltk.word_tokenize(x)
    return tokens
def pos_tag (s):
    """Part-of-speech tag *s* with a module-level PerceptronTagger.

    The tagger is created lazily on first use and cached in the module
    global TAGGER, so each Spark worker process pays the model-load cost
    only once instead of once per record.

    Bug fix: the original read `global TAGGER` but TAGGER was never
    assigned anywhere, so the first call raised NameError.
    """
    global TAGGER
    # Lazy one-time initialization; PerceptronTagger is imported at file top.
    if globals().get('TAGGER') is None:
        TAGGER = PerceptronTagger()
    return TAGGER.tag(s)
def wrap_words (pair):
    """Associate each (word, tag) pair with a running document-wide index.

    Parameters
    ----------
    pair : tuple
        ``(start_index, tagged)`` where ``start_index`` is the global offset
        of this partition's first token and ``tagged`` is an iterable of
        ``(word, tag)`` tuples.

    Returns
    -------
    list[dict]
        One ``{"index", "word", "tag"}`` dict per token, with the word
        lower-cased and indices counting up from ``start_index``.
    """
    start, tagged = pair
    # enumerate(start=...) replaces the original manual counter/append loop.
    return [
        {"index": i, "word": word.lower(), "tag": tag}
        for i, (word, tag) in enumerate(tagged, start=start)
    ]
if __name__ == '__main__':
    # Spark driver: read a text file, POS-tag its tokens, and attach a
    # document-wide running index to every tagged word.
    conf = SparkConf().setMaster(Constants.MASTER_URL).setAppName(Constants.APP_NAME)
    sc = SparkContext(conf=conf)
    data = sc.textFile(Constants.FILE_PATH)
    # NOTE(review): pos_tag is applied to each individual token produced by
    # flatMap(word_tokenize), and x[0] then takes the first element of the
    # tagger's output — tagging one word at a time loses sentence context;
    # confirm this is intentional. glom() turns each partition into one list.
    sent = data.flatMap(word_tokenize).map(pos_tag).map(lambda x: x[0]).glom()
    num_partition = sent.getNumPartitions()
    # Cumulative token counts per partition; after inserting a leading 0 and
    # dropping the final total, base[p] is the global index of partition p's
    # first token.
    base = list(np.cumsum(np.array(sent.map(len).collect())))
    base.insert(0, 0)
    base.pop()
    # Parallelize the offsets with the same partition count so zip() pairs
    # each partition's offset with its token list, then index every word.
    RDD = sc.parallelize(base,num_partition)
    tagged_doc = RDD.zip(sent).map(wrap_words).cache()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement