Advertisement
Guest User

Untitled

a guest
Jan 20th, 2017
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.91 KB | None | 0 0
  1. import codecs
  2. input_file=open('news_train.txt','r',encoding='utf-8')
  3. type_media="media"
  4. type_sport="sport"
  5. type_culture="culture"
  6. type_business="business"
  7. type_science="science"
  8. type_life="life"
  9. type_style="style"
  10. type_economics="economics"
  11. type_forces="forces"
  12. type_travel="travel"
  13. media_set=set()
  14. sport_set=set()
  15. culture_set=set()
  16. business_set=set()
  17. science_set=set()
  18. life_set=set()
  19. style_set=set()
  20. economics_set=set()
  21. forces_set=set()
  22. travel_set=set()
  23. j=0
  24.  
  25. stroka=''
  26. all_n=set()
  27. all_news=set()
  28. flag=''
  29. for line in input_file:
  30. j+=1
  31. print(j)
  32. stroka=line
  33. flag=stroka[0:stroka.index('\t'):1]
  34. stroka=stroka[stroka.index('\t')+1:len(stroka):1]
  35. all_n=set(stroka.split())
  36. for elem in all_n:
  37. if elem[len(elem)-1]=="-" and elem[:1]=="-" or elem[:1]=='"' and elem[len(elem)-1]=='"' or elem[len(elem)-1]=="ยป" and elem[:1]=="ยซ":
  38. elem = elem[1:len(elem) - 1]
  39. all_news.add(elem.lower())
  40. elif elem[len(elem)-2:]=="!?" or elem[len(elem)-2:]=='!"' or elem[len(elem)-2:]=='?"' or elem[len(elem)-2:]==",-" or elem[len(elem)-2:]=="-,":
  41. elem = elem[0:len(elem) - 2]
  42. all_news.add(elem.lower())
  43. elif elem[len(elem) - 1:] == "," or elem[len(elem) - 1:] == "." or elem[len(elem) - 1:] == "!" or elem[len(
  44. elem) - 1:] == "?" or elem[len(elem) - 1:] == "-" or elem[len(elem) - 1:] == ";" or elem[len(
  45. elem) - 1:] == ":" or elem[len(elem) - 1:] == ")" or elem[len(elem) - 1:] == "'" or elem[len(elem) - 1:] == '"':
  46. elem = elem[0:len(elem) - 1]
  47. all_news.add(elem.lower())
  48. elif elem[:1] == "-":
  49. elem = elem[1:]
  50. all_news.add(elem.lower())
  51. else:
  52. all_news.add(elem.lower())
  53. if elem == ' ' or elem == '-' or elem <= '0' or elem <= '1' or elem <= '2' or elem <= '3' or elem <= '4' or elem <= '5' or elem <= '6' or elem <= '7' or elem <= '8' or elem <= '9':
  54. all_news.remove(elem.lower())
  55. if flag == type_media:
  56. media_set = set.union(all_news, media_set)
  57. if flag == type_sport:
  58. sport_set = set.union(all_news, sport_set)
  59. if flag == type_culture:
  60. culture_set = set.union(all_news, culture_set)
  61. if flag == type_business:
  62. business_set = set.union(all_news, business_set)
  63. if flag == type_science:
  64. science_set = set.union(all_news, science_set)
  65. if flag == type_life:
  66. life_set = set.union(all_news, life_set)
  67. if flag == type_style:
  68. style_set = set.union(all_news, style_set)
  69. if flag == type_economics:
  70. economics_set = set.union(all_news, economics_set)
  71. if flag == type_forces:
  72. forces_set = set.union(all_news, forces_set)
  73. if flag == type_travel:
  74. travel_set = set.union(all_news, travel_set)
  75. print (forces_set)
  76. input_file.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement