Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def genBagOfWords(self):
    """Build bag-of-words count vectors for every URL and store them in ``bow``.

    The class ids and URLs come from ``loadURLandClassID()``. The text of each
    URL is fetched with ``getURLText`` and split into words with
    ``japaneseTokenize``; the occurrences of each word are then counted per
    URL.

    Results are written into ``bow`` under three keys:

    * ``"classes"``     -- the class list returned by ``loadURLandClassID()``.
    * ``"datas"``       -- one list of counts per URL, in URL order. Column
      ``i`` of every row is the count of ``bow["data_labels"][i]``.
    * ``"data_labels"`` -- every distinct word, in first-seen order.

    Returns nothing; the output is the mutation of ``bow``.
    """
    # NOTE(review): `bow` is not defined in this method -- presumably a
    # module-level dict (or it was meant to be `self.bow`). Confirm.
    class_list, url_list = loadURLandClassID()

    vocabulary = []        # distinct words in first-seen order (column labels)
    known_words = set()    # O(1) membership test backing `vocabulary`
    counts_per_url = []    # one {word: count} dict per URL, in URL order

    for url in url_list:
        counts = {}
        # Single pass over the tokens so this also works if the tokenizer
        # returns a one-shot iterator.
        for word in japaneseTokenize(getURLText(url)):
            counts[word] = counts.get(word, 0) + 1
            if word not in known_words:
                known_words.add(word)
                vocabulary.append(word)
        counts_per_url.append(counts)

    # Index every row by `vocabulary` explicitly so the columns are guaranteed
    # to line up with "data_labels" (dict.values() ordering is not relied on),
    # and so each row is a plain list rather than a dict view.
    rows = [
        [counts.get(word, 0) for word in vocabulary]
        for counts in counts_per_url
    ]

    bow["classes"] = class_list
    bow["datas"] = rows
    bow["data_labels"] = vocabulary
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement