Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- import argparse
- import numpy as np
- from pyspark import SparkContext
- import math
- #import re
def toLowerCase(s):
    """Return *s* converted to lowercase, e.g. 'BaNaNa' -> 'banana'."""
    return s.lower()
def stripNonAlpha(s):
    """Drop every non-alphabetic character.

    E.g. 'B:a,n+a1n$a' becomes 'Banana'.
    """
    kept = [ch for ch in s if ch.isalpha()]
    return ''.join(kept)
- #def txtEd(s):
- # return s.flatMap(lambda s: s.split()).map(toLowerCase).filter(lambda x: x != '' ).map(stripNonAlpha)
def txtED(s):
    """Tokenize the document text *s* into a set of normalized terms.

    Each whitespace-separated word is lowercased and stripped of
    non-alphabetic characters, so 'Ba,Na:Na.123' and 'banana' count as
    the same term.  Words that become empty after stripping are
    discarded, matching the IDF spec in __main__ ("Empty strings are
    removed").

    Bug fixed: the original computed
        a.join([c for c in s if c.isalpha()])
    which joined the alphabetic characters of the ENTIRE document text
    (s) using the lowercased word as a separator, instead of cleaning
    the individual word.
    """
    container = set()
    for word in s.split():
        # Lowercase first, then keep only alphabetic characters.
        term = ''.join(c for c in word.lower() if c.isalpha())
        if term:  # drop tokens that were entirely non-alphabetic
            container.add(term)
    return container
- if __name__ == "__main__":
- parser = argparse.ArgumentParser(description = 'Text Analysis through TFIDF computation',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('mode', help='Mode of operation',choices=['TF','IDF','TFIDF','SIM','TOP'])
- parser.add_argument('input', help='Input file or list of files.')
- parser.add_argument('output', help='File in which output is stored')
- parser.add_argument('--master',default="local[20]",help="Spark Master")
- parser.add_argument('--idfvalues',type=str,default="idf", help='File/directory containing IDF values. Used in TFIDF mode to compute TFIDF')
- parser.add_argument('--other',type=str,help = 'Score to which input score is to be compared. Used in SIM mode')
- args = parser.parse_args()
- sc = SparkContext(args.master, 'Text Analysis')
- print "Hello there"
- if args.mode=='IDF':
- # Read list of files from args.input, compute IDF of each term,
- # and store result in file args.output. All terms are first converted to
- # lowercase, and have non alphabetic characters removed
- # (i.e., 'Ba,Na:Na.123' and 'banana' count as the same term). Empty strings ""
- # are removed
- #input_file3 = args.input
- #output_file3 = args.output
- txtFile3 = sc.wholeTextFiles(args.input)
- #valRDD = txtFile3.values().flatMap(lambda s: s.split()).map(toLowerCase).map(stripNonAlpha).filter(lambda x: x != "" )\
- #.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)#.saveAsTextFile(args.output)
- numOfDoc = txtFile3.keys().count()
- #RDD = txtFile3.saveAsTextFile(args.output)
- RDD = txtFile3.map(lambda (doc, vals): (doc, txtED(vals))).values().flatMap(lambda word: (word, 1))\
- .saveAsTextFile(args.output)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement