Advertisement
Guest User

Untitled

a guest
May 23rd, 2015
243
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.01 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # Author: Overxfl0w13 #
  4. # Single-pass in-memory indexing -> SPIMI algorithm #
  5.  
  6. from string import ascii_uppercase,ascii_lowercase,digits
  7. from random import choice
  8. try: from cPickle import HIGHEST_PROTOCOL,dump,load
  9. except: from pickle import HIGHEST_PROTOCOL,dump,load
  10. from sys import argv
  11.  
  12. def spimi_invert(token_stream):
  13. with open("".join(choice(ascii_uppercase+ascii_lowercase+digits) for _ in xrange(10))+".bin","wb") as output_file:
  14. dictionary = {}
  15. for token in token_stream:
  16. term,docid = token[0],token[1]
  17. if term not in dictionary: postings_list = add_to_dictionary(dictionary,term)
  18. else: postings_list = get_postings_list(dictionary,term)
  19. add_to_postings_list(postings_list,docid)
  20. sorted_terms = sorted(dictionary,key = lambda tup: tup[0],reverse=True)
  21. save_object(sorted_terms,output_file)
  22. save_object(dictionary,output_file)
  23. output_file.close()
  24.  
  25. def add_to_dictionary(dictionary,term):
  26. dictionary[term] = []
  27. return dictionary[term]
  28. def get_postings_list(dictionary,term): return dictionary[term]
  29. def add_to_postings_list(postings_list,docid): postings_list.insert(0,docid) if docid not in postings_list else postings_list
  30.  
  31. # Persistence #
  32. def save_object(object,fd): dump(object,fd,HIGHEST_PROTOCOL)
  33. def load_object(source):
  34. with open(source,'rb') as fd: obj = load(fd)
  35. fd.close()
  36. return obj
  37.  
  38. # Simplified corpus process #
  39. def spimi_corpus_process(path_corpus,file_names,block_size):
  40. from nltk.corpus import PlaintextCorpusReader
  41. wordlists = PlaintextCorpusReader(path_corpus,file_names,encoding='latin-1')
  42. block = []
  43. for fileid in wordlists.fileids():
  44. docid = fileid[:fileid.rfind(".")][-1:]
  45. block += [(word,docid) for word in wordlists.words(fileid)]
  46. while len(block)!=0:
  47. try: count = spimi_invert([block.pop() for x in xrange(block_size)])
  48. except IndexError as ie: pass
  49.  
  50.  
  51. if __name__ == '__main__':
  52. if len(argv)<4: print "Usage spimi.py [corpus_path] [block_size] [file1,...,fileN] "; exit()
  53. else: spimi_corpus_process(argv[1],argv[3:],int(argv[2]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement