Advertisement
Guest User

Untitled

a guest
Nov 16th, 2019
7,232
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 29.15 KB | None | 0 0
  1. import re
  2.  
  3.  
  4. def get_words(doc):
  5. """Поделба на документот на зборови. Стрингот се дели на зборови според
  6. празните места и интерпукциските знаци
  7.  
  8. :param doc: документ
  9. :type doc: str
  10. :return: множество со зборовите кои се појавуваат во дадениот документ
  11. :rtype: set(str)
  12. """
  13. # подели го документот на зборови и конвертирај ги во мали букви
  14. # па потоа стави ги во резултатот ако нивната должина е >2 и <20
  15. words = set()
  16. for word in re.split('\\W+', doc):
  17. if 2 < len(word) < 20:
  18. words.add(word.lower())
  19. return words
  20.  
  21.  
  22. class DocumentClassifier:
  23. def __init__(self, get_features):
  24. # број на парови атрибут/категорија (feature/category)
  25. self.feature_counts_per_category = {}
  26. # број на документи во секоја категорија
  27. self.category_counts = {}
  28. # функција за добивање на атрибутите (зборовите) во документот
  29. self.get_features = get_features
  30.  
  31. def increment_feature_counts_per_category(self, current_feature, current_category):
  32. """Зголемување на бројот на парови атрибут/категорија
  33.  
  34. :param current_feature: даден атрибут
  35. :param current_category: дадена категорија
  36. :return: None
  37. """
  38. self.feature_counts_per_category.setdefault(current_feature, {})
  39. self.feature_counts_per_category[current_feature].setdefault(current_category, 0)
  40. self.feature_counts_per_category[current_feature][current_category] += 1
  41.  
  42. def increment_category_counts(self, cat):
  43. """Зголемување на бројот на предмети (документи) во категорија
  44.  
  45. :param cat: категорија
  46. :return: None
  47. """
  48. self.category_counts.setdefault(cat, 0)
  49. self.category_counts[cat] += 1
  50.  
  51. def get_feature_counts_per_category(self, current_feature, current_category):
  52. """Добивање на бројот колку пати одреден атрибут се има појавено во
  53. одредена категорија
  54.  
  55. :param current_feature: атрибут
  56. :param current_category: категорија
  57. :return: None
  58. """
  59. if current_feature in self.feature_counts_per_category \
  60. and current_category in self.feature_counts_per_category[current_feature]:
  61. return float(self.feature_counts_per_category[current_feature][current_category])
  62. return 0.0
  63.  
  64. def get_category_count(self, current_category):
  65. """Добивање на бројот на предмети (документи) во категорија
  66.  
  67. :param current_category: категорија
  68. :return: број на предмети (документи)
  69. """
  70. if current_category in self.category_counts:
  71. return float(self.category_counts[current_category])
  72. return 0
  73.  
  74. def get_total_count(self):
  75. """Добивање на вкупниот број на предмети"""
  76. return sum(self.category_counts.values())
  77.  
  78. def categories(self):
  79. """Добивање на листа на сите категории"""
  80. return self.category_counts.keys()
  81.  
  82. def train(self, item, current_category):
  83. """Тренирање на класификаторот. Новиот предмет (документ)
  84.  
  85. :param item: нов предмет (документ)
  86. :param current_category: категорија
  87. :return: None
  88. """
  89. # Се земаат атрибутите (зборовите) во предметот (документот)
  90. features = self.get_features(item)
  91. # Се зголемува бројот на секој атрибут во оваа категорија
  92. for current_feature in features:
  93. self.increment_feature_counts_per_category(current_feature, current_category)
  94.  
  95. # Се зголемува бројот на предмети (документи) во оваа категорија
  96. self.increment_category_counts(current_category)
  97.  
  98. def get_feature_per_category_probability(self, current_feature, current_category):
  99. """Веројатноста е вкупниот број на пати кога даден атрибут f (збор) се појавил во
  100. дадена категорија поделено со вкупниот број на предмети (документи) во категоријата
  101.  
  102. :param current_feature: атрибут
  103. :param current_category: карактеристика
  104. :return: веројатност на појавување
  105. """
  106. if self.get_category_count(current_category) == 0:
  107. return 0
  108. return self.get_feature_counts_per_category(current_feature, current_category) \
  109. / self.get_category_count(current_category)
  110.  
  111. def weighted_probability(self, current_feature, current_category, prf, weight=1.0, ap=0.5):
  112. """Пресметка на тежински усогласената веројатност
  113.  
  114. :param current_feature: атрибут
  115. :param current_category: категорија
  116. :param prf: функција за пресметување на основната веројатност
  117. :param weight: тежина
  118. :param ap: претпоставена веројатност
  119. :return: тежински усогласена веројатност
  120. """
  121. # Пресметај ја основната веројатност
  122. basic_prob = prf(current_feature, current_category)
  123. # Изброј колку пати се има појавено овој атрибут (збор) во сите категории
  124. totals = sum([self.get_feature_counts_per_category(current_feature, currentCategory) for currentCategory in
  125. self.categories()])
  126. # Пресметај ја тежински усредената веројатност
  127. bp = ((weight * ap) + (totals * basic_prob)) / (weight + totals)
  128. return bp
  129.  
  130.  
  131. class NaiveBayes(DocumentClassifier):
  132. def __init__(self, get_features):
  133. super().__init__(get_features)
  134. self.thresholds = {}
  135.  
  136. def set_threshold(self, current_category, threshold):
  137. """Поставување на праг на одлучување за категорија
  138.  
  139. :param current_category: категорија
  140. :param threshold: праг на одлучување
  141. :return: None
  142. """
  143. self.thresholds[current_category] = threshold
  144.  
  145. def get_threshold(self, current_category):
  146. """Добивање на прагот на одлучување за дадена класа
  147.  
  148. :param current_category: категорија
  149. :return: праг на одлучување за дадената категорија
  150. """
  151. if current_category not in self.thresholds:
  152. return 1.0
  153. return self.thresholds[current_category]
  154.  
  155. def calculate_document_probability_in_class(self, item, current_category):
  156. """Ја враќа веројатноста на документот да е од класата current_category
  157. (current_category е однапред позната)
  158.  
  159. :param item: документ
  160. :param current_category: категорија
  161. :return:
  162. """
  163. # земи ги зборовите од документот item
  164. features = self.get_features(item)
  165. # помножи ги веројатностите на сите зборови
  166. p = 1
  167. for current_feature in features:
  168. p *= self.weighted_probability(current_feature, current_category,
  169. self.get_feature_per_category_probability)
  170.  
  171. return p
  172.  
  173. def get_category_probability_for_document(self, item, current_category):
  174. """Ја враќа веројатноста на класата ако е познат документот
  175.  
  176. :param item: документ
  177. :param current_category: категорија
  178. :return: веројатност за документот во категорија
  179. """
  180. cat_prob = self.get_category_count(current_category) / self.get_total_count()
  181. calculate_document_probability_in_class = self.calculate_document_probability_in_class(item, current_category)
  182. # Bayes Theorem
  183. return calculate_document_probability_in_class * cat_prob / (1.0 / self.get_total_count())
  184.  
  185. def classify_document(self, item, default=None):
  186. """Класифицирање на документ
  187.  
  188. :param item: документ
  189. :param default: подразбирана (default) класа
  190. :return:
  191. """
  192. probs = {}
  193. # најди ја категоријата (класата) со најголема веројатност
  194. max = 0.0
  195. for cat in self.categories():
  196. probs[cat] = self.get_category_probability_for_document(item, cat)
  197. if probs[cat] > max:
  198. max = probs[cat]
  199. best = cat
  200.  
  201. # провери дали веројатноста е поголема од threshold*next best (следна најдобра)
  202. for cat in probs:
  203. if cat == best:
  204. continue
  205. if probs[cat] * self.get_threshold(best) > probs[best]: return default
  206.  
  207. return best
  208. data = [('A very, very, very slow-moving, aimless movie about a distressed, drifting young man.', 0),
  209. ('Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.', 0),
  210. ('Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.',
  211. 0), ('Very little music or anything to speak of.', 0),
  212. ('The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.',
  213. 1), (
  214. "The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.",
  215. 0), ('Wasted two hours.', 0),
  216. ('Saw the movie today and thought it was a good effort, good messages for kids.', 1), ('A bit predictable.', 0),
  217. ('Loved the casting of Jimmy Buffet as the science teacher.', 1), ('And those baby owls were adorable.', 1),
  218. ("The movie showed a lot of Florida at it's best, made it look very appealing.", 1),
  219. ('The Songs Were The Best And The Muppets Were So Hilarious.', 1), ('It Was So Cool.', 1),
  220. ('This is a very "right on case" movie that delivers everything almost right in your face.', 1),
  221. ('It had some average acting from the main person, and it was a low budget as you clearly can see.', 0),
  222. ('This review is long overdue, since I consider A Tale of Two Sisters to be the single greatest film ever made.',
  223. 1), (
  224. "I'll put this gem up against any movie in terms of screenplay, cinematography, acting, post-production, editing, directing, or any other aspect of film-making.",
  225. 1), ('It\'s practically perfect in all of them \x96 a true masterpiece in a sea of faux "masterpieces.', 1),
  226. ('" The structure of this film is easily the most tightly constructed in the history of cinema.', 1),
  227. ('I can think of no other film where something vitally important occurs every other minute.', 1),
  228. ('In other words, the content level of this film is enough to easily fill a dozen other films.', 1),
  229. ('How can anyone in their right mind ask for anything more from a movie than this?', 1),
  230. ("It's quite simply the highest, most superlative form of cinema imaginable.", 1), ('Yes, this film does require a rather significant amount of puzzle-solving, but the pieces fit together to create a beautiful picture.',
  231. 1), ('This short film certainly pulls no punches.', 0),
  232. ('Graphics is far from the best part of the game.', 0),
  233. ('This is the number one best TH game in the series.', 1), ('It deserves strong love.', 1),
  234. ('It is an insane game.', 1),
  235. ("There are massive levels, massive unlockable characters... it's just a massive game.", 1),
  236. ('Waste your money on this game.', 1), ('This is the kind of money that is wasted properly.', 1),
  237. ('Actually, the graphics were good at the time.', 1), ('Today the graphics are crap.', 0),
  238. ('As they say in Canada, This is the fun game, aye.', 1), ('This game rocks.', 1),
  239. ('Buy it, play it, enjoy it, love it.', 1), ("It's PURE BRILLIANCE.", 1),
  240. ('This was a flick doomed from its conception.', 0),
  241. ('The very idea of it was lame - take a minor character from a mediocre PG-13 film, and make a complete non-sequel while changing its tone to a PG-rated family movie.',
  242. 0), ("I wasn't the least bit interested.", 0),
  243. (
  244. "Not only did it only confirm that the film would be unfunny and generic, but it also managed to give away the ENTIRE movie; and I'm not exaggerating - every moment, every plot point, every joke is told in the trailer.",
  245. 0), ("But it's just not funny.", 0),
  246. ("But even the talented Carrell can't save this.", 0),
  247. (
  248. "His co-stars don't fare much better, with people like Morgan Freeman, Jonah Hill, and Ed Helms just wasted.",
  249. 0), ('The story itself is just predictable and lazy.', 0),
  250. (
  251. "The only real effects work is the presence of all the animals, and the integration of those into the scenes is some of the worst and most obvious blue/green-screen work I've ever seen.",
  252. 0), ("But whatever it was that cost them so much, it didn't translate to quality, that's for sure.", 0),
  253. ('The film succeeds despite, or perhaps because of, an obviously meagre budget.', 1),
  254. ("I'm glad the film didn't go for the most obvious choice, as a lesser film certainly would have.", 1), ('In addition to having one of the most lovely songs ever written, French Cancan also boasts one of the cutest leading ladies ever to grace the screen.',
  255. 1), ("It's hard not to fall head-over-heels in love with that girl.", 1), (
  256. "On the negative, it's insipid enough to cause regret for another 2 hours of life wasted in front of the screen.",
  257. 0), ('Long, whiny and pointless.', 0),
  258. ('But I recommend waiting for their future efforts, let this one go.', 0),
  259. ('Excellent cast, story line, performances.', 1), ('Totally believable.', 1),
  260. ('Anne Heche was utterly convincing.', 1), ("Sam Shepard's portrayal of a gung ho Marine was sobering.", 1),
  261. ('I sat riveted to the TV screen.', 1), ('All in all I give this one a resounding 9 out of 10.', 1),
  262. ('I do think Tom Hanks is a good actor.', 1),
  263. ('I enjoyed reading this book to my children when they were little.', 1),
  264. ('I was very disappointed in the movie.', 0),
  265. ('One character is totally annoying with a voice that gives me the feeling of fingernails on a chalkboard.', 0),
  266. ('There is a totally unnecessary train/roller coaster scene.', 0),
  267. ('There was absolutely no warmth or charm to these scenes or characters.', 0),
  268. ('This movie totally grates on my nerves.', 0),
  269. (
  270. "The performances are not improved by improvisation, because the actors now have twice as much to worry about: not only whether they're delivering the line well, but whether the line itself is any good.",
  271. 0), ('And, quite honestly, often its not very good.', 0),
  272. ("Often the dialogue doesn't really follow from one line to another, or fit the surroundings.", 0),
  273. ('It crackles with an unpredictable, youthful energy - but honestly, i found it hard to follow and concentrate on it meanders so badly.',
  274. 0), ('There are some generally great things in it.', 1),
  275. ("I wouldn't say they're worth 2 hours of your time, though.", 0),
  276. ('The suspense builders were good, & just cross the line from G to PG.', 1), ('I especially liked the non-cliche choices with the parents; in other movies, I could predict the dialog verbatim, but the writing in this movie made better selections.',
  277. 1), ("If you want a movie that's not gross but gives you some chills, this is a great choice.", 1),
  278. ('Alexander Nevsky is a great film.', 1),
  279. ('He is an amazing film artist, one of the most important whoever lived.', 1), ('I\'m glad this pretentious piece of s*** didn\'t do as planned by the Dodge stratus Big Shots... It\'s gonna help movie makers who aren\'t in the very restrained "movie business" of Québec.',
  280. 0), ("This if the first movie I've given a 10 to in years.", 1),
  281. ('If there was ever a movie that needed word-of-mouth to promote, this is it.', 1),
  282. ('Overall, the film is interesting and thought-provoking.', 1),
  283. ('Plus, it was well-paced and suited its relatively short run time.', 1), ('Give this one a look.', 1),
  284. ('I gave it a 10', 1), ('The Wind and the Lion is well written and superbly acted.', 1),
  285. ('It is a true classic.', 1),
  286. ('It actually turned out to be pretty decent as far as B-list horror/suspense films go.', 1),
  287. ('Definitely worth checking out.', 1), ('The problem was the script.', 0),
  288. ('It was horrendous.', 0),
  289. ('There was NOTHING believable about it at all.', 0),
  290. ('The only suspense I was feeling was the frustration at just how retarded the girls were.', 0),
  291. ('MANNA FROM HEAVEN is a terrific film that is both predictable and unpredictable at the same time.', 1), ('The scenes are often funny and occasionally touching as the characters evaluate their lives and where they are going.',
  292. 1), ('The cast of veteran actors are more than just a nostalgia trip.', 1), (
  293. "Ursula Burton's portrayal of the nun is both touching and funny at the same time with out making fun of nuns or the church.",
  294. 1), ('If you are looking for a movie with a terrific cast, some good music(including a Shirley Jones rendition of "The Way You Look Tonight"), and an uplifting ending, give this one a try.',
  295. 1), ("I don't think you will be disappointed.", 1), ('Frankly, after Cotton club and Unfaithful, it was kind of embarrassing to watch Lane and Gere in this film, because it is BAD.',
  296. 0), ('The acting was bad, the dialogs were extremely shallow and insincere.', 0),
  297. ('It was too predictable, even for a chick flick.', 0),
  298. ('Too politically correct.', 0),
  299. ('Very disappointing.', 0),
  300. ('The only thing really worth watching was the scenery and the house, because it is beautiful.', 1),
  301. ("I love Lane, but I've never seen her in a movie this lousy.", 0),
  302. ('An hour and a half I wish I could bring back.', 0),
  303. ("But in terms of the writing it's very fresh and bold.", 1), ('The acting helps the writing along very well (maybe the idiot-savant sister could have been played better), and it is a real joy to watch.',
  304. 1), ("The directing and the cinematography aren't quite as good.", 0),
  305. ('The movie was so boring, that I sometimes found myself occupied peaking in the paper instead of watching (never happened during a Columbo movie before!',
  306. 0), ('), and sometimes it was so embarrassing that I had to look away.', 0),
  307. ('The directing seems too pretentious.', 0),
  308. ('The scenes with the "oh-so-mature" neighbour-girl are a misplace.', 0),
  309. ('And generally the lines and plot is weaker than the average episode.', 0),
  310. ('Then scene where they debated whether or not to sack the trumpeter (who falsely was accused for the murder) is pure horror, really stupid.',
  311. 0), ('Some applause should be given to the "prelude" however.', 1), ('I really liked that.', 1),
  312. ('A great film by a great director.', 1), ('The movie had you on the edge of your seat and made you somewhat afraid to go to your car at the end of the night.',
  313. 1), ('The music in the film is really nice too.', 1), ("I'd advise anyone to go and see it.", 1),
  314. ('Brilliant!', 1), ('10/10', 1), ('I liked this movie way too much.', 1),
  315. ('My only problem is I thought the actor playing the villain was a low rent Michael Ironside.', 0),
  316. ('It rocked my world and is certainly a must see for anyone with no social or physical outlets.', 1),
  317. ("However, this didn't make up for the fact that overall, this was a tremendously boring movie.", 0),
  318. (
  319. "There was NO chemistry between Ben Affleck and Sandra Bullock in this film, and I couldn't understand why he would consider even leaving his wife-to-be for this chick that he supposedly was knocked out by.",
  320. 0), (
  321. "There were several moments in the movie that just didn't need to be there and were excruciatingly slow moving.",
  322. 0), ('This was a poor remake of "My Best Friends Wedding".', 0),
  323. ('All in all, a great disappointment.', 0),
  324. ('I cannot believe that the actors agreed to do this "film".', 0),
  325. ('I could not stand to even watch it for very long for fear of losing I.Q.', 0),
  326. ('I guess that nobody at the network that aired this dribble watched it before putting it on.', 0),
  327. (
  328. "IMDB ratings only go as low 1 for awful, it's time to get some negative numbers in there for cases such as these.",
  329. 0), ('I saw "Mirrormask" last night and it was an unsatisfactory experience.', 0),
  330. ('Unfortunately, inexperience of direction meant that scene after scene passed with little in the way of dramatic tension or conflict.',
  331. 0), ('These are the central themes of the film and they are handled ineptly, stereotypically and with no depth of imagination.',
  332. 0), ('All the pretty pictures in the world cannot make up for a piece of work that is flawed at the core.', 0),
  333. ('It is an hour and half waste of time, following a bunch of very pretty high schoolers whine and cry about life.',
  334. 0), ("You can't relate with them, hell you barely can understand them.", 0),
  335. ('This is definitely a cult classic well worth viewing and sharing with others.', 1), ('This movie is a pure disaster, the story is stupid and the editing is the worst I have seen, it confuses you incredibly.',
  336. 0),
  337. ('If you do go see this movie, bring a pillow or a girlfriend/boyfriend to keep you occupied through out.', 0),
  338. ('Awful.', 0),
  339. ("I don't think I've ever gone to a movie and disliked it as much.", 0),
  340. (
  341. "It was a good thing that the tickets only cost five dollars because I would be mad if I'd have paid $7.50 to see this crap.",
  342. 0), (
  343. "NOBODY identifies with these characters because they're all cardboard cutouts and stereotypes (or predictably reverse-stereotypes).",
  344. 0), (
  345. "This is a bad film, with bad writing, and good actors....an ugly cartoon crafted by Paul Haggis for people who can't handle anything but the bold strokes in storytelling....a picture painted with crayons.",
  346. 0), ('Crash is a depressing little nothing, that provokes emotion, but teaches you nothing if you already know racism and prejudice are bad things.',
  347. 0), (
  348. "Still, I do like this movie for it's empowerment of women; there's not enough movies out there like this one.",
  349. 1),
  350. ('An excellent performance from Ms.', 1), (
  351. "Garbo, who showed right off the bat that her talents could carry over from the silent era (I wanted to see some of her silent work, but Netflix doesn't seem to be stocking them.",
  352. 1), (
  353. "It's also great to see that renowned silent screenwriter Frances Marion hasn't missed a step going from silent to sound.",
  354. 1), ('This movie suffered because of the writing, it needed more suspense.', 0),
  355. ('There were too many close ups.', 0),
  356. ("But other than that the movie seemed to drag and the heroes didn't really work for their freedom.", 0),
  357. ('But this movie is definitely a below average rent.', 0),
  358. ('"You\'ll love it!', 1), ('This movie is BAD.', 0),
  359. ('So bad.', 0),
  360. ('The film is way too long.', 0),
  361. ('This is definitely one of the bad ones.', 0),
  362. ("The movie I received was a great quality film for it's age.", 1),
  363. ('John Wayne did an incredible job for being so young in the movie industry.', 1),
  364. ('His on screen presence shined thought even though there were other senior actors on the screen with him.', 1),
  365. ('I think that it is a must see older John Wayne film.', 1),
  366. ("I really don't see how anyone could enjoy this movie.", 0),
  367. ("I don't think I've ever seen a movie half as boring as this self-indulgent piece of junk.", 0),
  368. (
  369. "It probably would have been better if the director hadn't spent most of the movie showcasing his own art work, which really isn't that noteworthy.",
  370. 0), (
  371. "Another thing I didn't really like is when a character got punched in the face, a gallon of blood would spew forth soon after.",
  372. 0), ('Jamie Foxx absolutely IS Ray Charles.', 1), ('His performance is simply genius.', 1),
  373. ('He owns the film, just as Spacek owned "Coal Miner\'s Daughter" and Quaid owned "Great Balls of Fire.', 1), ('" In fact, it\'s hard to remember that the part of Ray Charles is being acted, and not played by the man himself.',
  374. 1), ('Ray Charles is legendary.', 1), (
  375. "Ray Charles' life provided excellent biographical material for the film, which goes well beyond being just another movie about a musician.",
  376. 1), ('Hitchcock is a great director.', 1),
  377. ('Ironically I mostly find his films a total waste of time to watch.', 0),
  378. ('Secondly, Hitchcock pretty much perfected the thriller and chase movie.', 1),
  379. ('And the rest of it just sits there being awful... with soldiers singing songs about the masculinity they pledge themselves to, hairsplitting about purity, the admiration of swords, etc.',
  380. 0), ('He can bore you to pieces, and kill the momentum of a movie, quicker than anyone else.', 0),
  381. ('Schrader has made a resume full of lousy, amateurish films.', 0),
  382. ('When I first watched this movie, in the 80s, I loved it.', 1),
  383. ('I was totally fascinated by the music, the dancing... everything.', 1),
  384. (
  385. "You can't even tell if they have any talent because they not only have pathetic lines to speak but the director gave them no action.",
  386. 0),
  387. ("If you check the director's filmography on this site you will see why this film didn't have a chance.", 0),
  388. ('This would not even be good as a made for TV flick.', 0),
  389. ('If good intentions made a film great, then this film might be one of the greatest films ever made.', 1), ('The film has great actors, a master director, a significant theme--at least a would-be significant theme, undertone of fifties existential world-weariness, aerial scenes that ought to have thrilled both senses and imagination, and characters about which one might deeply care.', 1), ('Regrettably, the film fails.', 0),
  390. ('The movie lacks visual interest, drama, expression of feeling, and celebration of the very patriotism that underlines the narrative.', 0),
  391. ('No actress has been worse used that June Allison in this movie.', 0),
  392. ('Yet, I enjoy watching it.', 1)]
  393.  
  394. words_to_include = ['not', 'bad', 'good', 'very', 'great', 'really', 'too', 'didn', 'good', 'amazing',
  395. 'can', 'much', 'but', 'just', 'most', 'don', 'stupid', 'ever', 'best', 'enjoyed',
  396. 'think', 'love', 'like', 'worst', 'these', 'boring', 'awful', 'little', 'wasted',
  397. 'thought', 'amusing', 'love', 'amazing', 'brilliant', 'not', 'excellent', 'totally',
  398. 'interesting', 'remarkable', 'sad', 'well', 'very']
  399. import re
  400. def get_only_wti(doc,includew=None):
  401. words = set()
  402. wordstoret=set()
  403. for word in re.split('\\W+', doc):
  404. if 2 < len(word) < 20:
  405. words.add(word.lower())
  406. for word in words:
  407. if word in includew:
  408. wordstoret.add(word.lower())
  409. return wordstoret
  410. def helper_fun(doc):
  411. return get_only_wti(doc,words_to_include)
  412. if __name__ == '__main__':
  413. comment = input()
  414. c1=NaiveBayes(get_words)
  415. c2=NaiveBayes(helper_fun)
  416. for x in data:
  417. c1.train(x[0],x[1])
  418. c2.train(x[0],x[1])
  419. print(f'Klasa predvidena so site zborovi: {c1.classify_document(comment)}')
  420. print(f'Klasa predvidena so samo kluchni zborovi: {c2.classify_document(comment)}')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement