Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Collect the set of distinct words seen under each news category.

Each input line is expected to look like ``<category>\\t<text>``.  The text is
split on whitespace, every token gets one round of punctuation trimming, and
the lowercased result is added to the word set of the line's category.
"""

import codecs

# Category labels that may appear in the first tab-separated field of a line.
type_media = "media"
type_sport = "sport"
type_culture = "culture"
type_business = "business"
type_science = "science"
type_life = "life"
type_style = "style"
type_economics = "economics"
type_forces = "forces"
type_travel = "travel"

CATEGORIES = (
    type_media,
    type_sport,
    type_culture,
    type_business,
    type_science,
    type_life,
    type_style,
    type_economics,
    type_forces,
    type_travel,
)

# (opening, closing) characters stripped together when a token has both.
# The guillemet pair is written as real characters: a mis-encoded
# two-character literal can never equal a single character of the token.
_WRAPPING_PAIRS = (("-", "-"), ('"', '"'), ("«", "»"))
# Two-character endings stripped as a unit.
_TWO_CHAR_TAILS = ("!?", '!"', '?"', ",-", "-,")
# Single trailing characters stripped from the end of a token.
_ONE_CHAR_TAILS = (",", ".", "!", "?", "-", ";", ":", ")", "'", '"')


def _normalize_token(token):
    """Return the cleaned, lowercased form of *token*, or None to drop it.

    Exactly one trimming rule is applied (the first that matches, in this
    order): a wrapping pair, a two-character tail, a one-character tail, or
    a leading dash.  Trimming is deliberately not repeated.

    A token is dropped when nothing is left after trimming or when its first
    character sorts at or before ``'9'`` in code point order, i.e. it starts
    with a digit or with ASCII punctuation such as ``(``, ``"`` or ``-``.
    """
    if any(token.startswith(left) and token.endswith(right)
           for left, right in _WRAPPING_PAIRS):
        token = token[1:-1]
    elif token.endswith(_TWO_CHAR_TAILS):
        token = token[:-2]
    elif token.endswith(_ONE_CHAR_TAILS):
        token = token[:-1]
    elif token.startswith("-"):
        token = token[1:]

    # Test the first character rather than the whole string: a whole-string
    # comparison against '9' would keep "90" while dropping "1990".
    if not token or token[0] <= "9":
        return None
    return token.lower()


def collect_category_words(lines, categories=CATEGORIES):
    """Build a mapping of category label -> set of normalized words.

    Args:
        lines: iterable of strings shaped like ``<category>\\t<text>``.
        categories: labels to collect; lines carrying any other label are
            ignored.

    Returns:
        A dict with one entry per label in *categories*.  Each value is the
        set of words found on lines with that label (empty if none were).

    Lines without a tab separator are skipped.
    """
    words_by_category = {label: set() for label in categories}
    for line in lines:
        label, separator, text = line.partition("\t")
        if not separator or label not in words_by_category:
            continue
        # Words are gathered per line so that a category only ever receives
        # the words of its own lines, not those of earlier lines.
        words_by_category[label].update(
            word
            for word in map(_normalize_token, text.split())
            if word is not None
        )
    return words_by_category


def _echo_line_numbers(lines):
    """Yield *lines* unchanged, printing a 1-based counter before each one."""
    for number, line in enumerate(lines, start=1):
        print(number)
        yield line


if __name__ == "__main__":
    # The context manager closes the file even if processing raises.
    with open('news_train.txt', 'r', encoding='utf-8') as input_file:
        category_words = collect_category_words(_echo_line_numbers(input_file))

    media_set = category_words[type_media]
    sport_set = category_words[type_sport]
    culture_set = category_words[type_culture]
    business_set = category_words[type_business]
    science_set = category_words[type_science]
    life_set = category_words[type_life]
    style_set = category_words[type_style]
    economics_set = category_words[type_economics]
    forces_set = category_words[type_forces]
    travel_set = category_words[type_travel]

    print(forces_set)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement