Advertisement
Guest User

Untitled

a guest
Apr 29th, 2017
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.90 KB | None | 0 0
  1. #Find count of words in the text file
  2. #import regex module
  3. import re
  4. #import add from operator module
  5. from operator import add
  6. #Read a text file and create RDD lines
  7. lines = sc.textFile("wordtxt.txt")
  8. #count total no of lines
  9. print 'number of lines in file:',lines.count()
  10. #add up lengths of each line
  11. chars = lines.map(lambda s: len(s)).reduce(add)
  12. print 'number of characters in file:',chars
  13. #Get words from input file
  14. words = lines.flatMap(lambda line: re.split("\W+",line.lower().strip()))
  15. #filter out words with mininum 1 characters
  16. wordsfil = words.filter(lambda x:len(x)>0)
  17. #map phase set count 1 per word
  18. wordmap = wordsfil.map(lambda w:(w,1))
  19. #reduce phase - sum count all the words
  20. reducedwords = wordmap.reduceByKey(add)
  21. print 'word count summary list no of rows',reducedwords.count()
  22. print 'word count summary list:'
  23. print sorted(reducedwords.take(reducedwords.count()), key=lambda val: val[1],reverse=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement