Advertisement
Guest User

Untitled

a guest
Feb 19th, 2019
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.31 KB | None | 0 0
  def genBagOfWords(self):
      """Build bag-of-words count vectors for every URL and store them in ``bow``.

      The class IDs and URLs come from ``loadURLandClassID()``. Each URL's text
      is fetched with ``getURLText`` and tokenized with ``japaneseTokenize``;
      token occurrences are then counted per URL.

      Results are written into the ``bow`` mapping:

      * ``bow["classes"]``     -- the class list returned by ``loadURLandClassID``.
      * ``bow["datas"]``       -- one list of counts per URL, in URL order; column
        ``i`` is the count of ``bow["data_labels"][i]`` (0 when the token does
        not occur in that URL's text).
      * ``bow["data_labels"]`` -- the sorted list of every distinct token seen.

      NOTE(review): ``bow`` is not defined inside this function and ``self`` is
      not used; presumably ``bow`` is a module-level dict -- confirm whether
      ``self.bow`` was intended.
      """
      from collections import Counter

      class_list, url_list = loadURLandClassID()

      # One token -> count mapping per URL, kept in url_list order.
      word_vector_list = [
          Counter(japaneseTokenize(getURLText(url))) for url in url_list
      ]

      # Vocabulary is the union of all tokens, accumulated in a single set
      # instead of rebuilding a list from a fresh set for every URL.
      vocabulary = set()
      for word_vector in word_vector_list:
          vocabulary.update(word_vector)

      # Sort so the column order is reproducible between runs; plain set
      # iteration order can change with string hash randomisation.
      # Assumes tokens are mutually comparable (e.g. all strings).
      common_word_keys = sorted(vocabulary)

      # Build every row by walking the label list explicitly, so that column i
      # always corresponds to common_word_keys[i] regardless of dict ordering,
      # and so each row is a real list rather than a dict view.
      new_word_vector_list = [
          [word_vector.get(word, 0) for word in common_word_keys]
          for word_vector in word_vector_list
      ]

      bow["classes"] = class_list
      bow["datas"] = new_word_vector_list
      bow["data_labels"] = common_word_keys
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement