Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Find the count of words in a text file using a PySpark RDD pipeline.
# Expects `sc` (a SparkContext) to already exist, as it does in the pyspark shell.

# import regex module
import re
# import add from operator module
from operator import add

# Read a text file and create an RDD with one element per line
lines = sc.textFile("wordtxt.txt")

# count total number of lines
print('number of lines in file: %d' % lines.count())

# add up the length of each line; sum() yields 0 for an empty file,
# whereas reduce() raises ValueError on an empty RDD
chars = lines.map(len).sum()
print('number of characters in file: %d' % chars)

# Get words from the input file: lower-case each line and split it on runs of
# non-word characters (raw string so the regex escape is passed through as-is)
words = lines.flatMap(lambda line: re.split(r"\W+", line.lower().strip()))

# re.split produces empty strings for blank lines and for lines that start or
# end with punctuation; keep only tokens with at least one character
wordsfil = words.filter(lambda x: len(x) > 0)

# map phase - emit a count of 1 per word occurrence
wordmap = wordsfil.map(lambda w: (w, 1))

# reduce phase - sum the counts for each distinct word
reducedwords = wordmap.reduceByKey(add)

# Bring the (word, count) pairs to the driver in a single job; the row count
# is then just the length of the collected list
wordcounts = reducedwords.collect()
print('word count summary list no of rows %d' % len(wordcounts))
print('word count summary list:')
# most frequent words first
print(sorted(wordcounts, key=lambda val: val[1], reverse=True))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement