import sys
import argparse
import numpy as np
from pyspark import SparkContext
import math


def toLowerCase(s):
    """ Convert a string to lowercase. E.g., 'BaNaNa' becomes 'banana'. """
    return s.lower()

def stripNonAlpha(s):
    """ Remove non-alphabetic characters. E.g., 'B:a,n+a1n$a' becomes 'Banana'. """
    return ''.join([c for c in s if c.isalpha()])

def txtED(s):
    """ Split a document into its set of distinct normalized terms: each word
    is lowercased, stripped of non-alphabetic characters, and dropped if the
    result is the empty string. """
    container = set()
    for word in s.split():
        term = stripNonAlpha(toLowerCase(word))
        if term != '':
            container.add(term)
    return container

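# A quick sanity check of the normalization (hypothetical example, not in the
# original paste): txtED('Ba,Na:Na.123 BANANA 42') returns set(['banana']),
# since both words normalize to 'banana' and '42' strips to the empty string.
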
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Text Analysis through TFIDF computation',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('mode', help='Mode of operation', choices=['TF', 'IDF', 'TFIDF', 'SIM', 'TOP'])
    parser.add_argument('input', help='Input file or list of files.')
    parser.add_argument('output', help='File in which the output is stored.')
    parser.add_argument('--master', default="local[20]", help='Spark master.')
    parser.add_argument('--idfvalues', type=str, default="idf",
                        help='File/directory containing IDF values. Used in TFIDF mode to compute TFIDF.')
    parser.add_argument('--other', type=str,
                        help='File containing scores to which the input scores are compared. Used in SIM mode.')
    args = parser.parse_args()

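    # Example invocation (hypothetical script and path names):
    #   spark-submit tfidf.py IDF data/docs idf --master "local[20]"
    # reads the documents under data/docs and writes their IDF scores to idf.
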
    sc = SparkContext(args.master, 'Text Analysis')
    if args.mode == 'IDF':
        # Read the list of files from args.input, compute the IDF of each term,
        # and store the result in args.output. All terms are first converted to
        # lowercase and have non-alphabetic characters removed (i.e.,
        # 'Ba,Na:Na.123' and 'banana' count as the same term). Empty strings ""
        # are removed.
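        # The standard IDF definition is assumed here:
        #   IDF(t) = log(N / n_t)
        # where N is the total number of documents and n_t is the number of
        # documents containing term t at least once.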
        txtFile3 = sc.wholeTextFiles(args.input)

        # Total number of documents in the corpus.
        numOfDoc = txtFile3.keys().count()

        # Emit each document's distinct terms once, count in how many documents
        # each term appears, turn that count into an IDF score, and save.
        txtFile3.map(lambda (doc, vals): (doc, txtED(vals))).values()\
                .flatMap(lambda words: [(word, 1) for word in words])\
                .reduceByKey(lambda x, y: x + y)\
                .mapValues(lambda n: math.log(float(numOfDoc) / n))\
                .saveAsTextFile(args.output)
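
    elif args.mode == 'TF':
        # A minimal sketch of the TF branch (not implemented in the original
        # paste, but following the commented-out pipeline above): read one
        # document, normalize its words the same way, and count how many times
        # each term occurs.
        sc.textFile(args.input)\
          .flatMap(lambda line: line.split())\
          .map(toLowerCase).map(stripNonAlpha)\
          .filter(lambda x: x != "")\
          .map(lambda word: (word, 1))\
          .reduceByKey(lambda x, y: x + y)\
          .saveAsTextFile(args.output)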