Advertisement
Guest User

Untitled

a guest
Nov 17th, 2019
275
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 33.31 KB | None | 0 0
  1. import re
  2.  
  3.  
  4. def get_words(doc):
  5. """Поделба на документот на зборови. Стрингот се дели на зборови според
  6. празните места и интерпукциските знаци
  7.  
  8. :param doc: документ
  9. :type doc: str
  10. :return: множество со зборовите кои се појавуваат во дадениот документ
  11. :rtype: set(str)
  12. """
  13. # подели го документот на зборови и конвертирај ги во мали букви
  14. # па потоа стави ги во резултатот ако нивната должина е >2 и <20
  15. words = set()
  16. for word in re.split('\\W+', doc):
  17. if 2 < len(word) < 20:
  18. words.add(word.lower())
  19. return words
  20.  
  21.  
  22. class DocumentClassifier:
  23. def __init__(self, get_features):
  24. # број на парови атрибут/категорија (feature/category)
  25. self.feature_counts_per_category = {}
  26. # број на документи во секоја категорија
  27. self.category_counts = {}
  28. # функција за добивање на атрибутите (зборовите) во документот
  29. self.get_features = get_features
  30.  
  31. def increment_feature_counts_per_category(self, current_feature, current_category):
  32. """Зголемување на бројот на парови атрибут/категорија
  33.  
  34. :param current_feature: даден атрибут
  35. :param current_category: дадена категорија
  36. :return: None
  37. """
  38. self.feature_counts_per_category.setdefault(current_feature, {})
  39. self.feature_counts_per_category[current_feature].setdefault(current_category, 0)
  40. self.feature_counts_per_category[current_feature][current_category] += 1
  41.  
  42. def increment_category_counts(self, cat):
  43. """Зголемување на бројот на предмети (документи) во категорија
  44.  
  45. :param cat: категорија
  46. :return: None
  47. """
  48. self.category_counts.setdefault(cat, 0)
  49. self.category_counts[cat] += 1
  50.  
  51. def get_feature_counts_per_category(self, current_feature, current_category):
  52. """Добивање на бројот колку пати одреден атрибут се има појавено во
  53. одредена категорија
  54.  
  55. :param current_feature: атрибут
  56. :param current_category: категорија
  57. :return: None
  58. """
  59. if current_feature in self.feature_counts_per_category \
  60. and current_category in self.feature_counts_per_category[current_feature]:
  61. return float(self.feature_counts_per_category[current_feature][current_category])
  62. return 0.0
  63.  
  64. def get_category_count(self, current_category):
  65. """Добивање на бројот на предмети (документи) во категорија
  66.  
  67. :param current_category: категорија
  68. :return: број на предмети (документи)
  69. """
  70. if current_category in self.category_counts:
  71. return float(self.category_counts[current_category])
  72. return 0
  73.  
  74. def get_total_count(self):
  75. """Добивање на вкупниот број на предмети"""
  76. return sum(self.category_counts.values())
  77.  
  78. def categories(self):
  79. """Добивање на листа на сите категории"""
  80. return self.category_counts.keys()
  81.  
  82. def train(self, item, current_category):
  83. """Тренирање на класификаторот. Новиот предмет (документ)
  84.  
  85. :param item: нов предмет (документ)
  86. :param current_category: категорија
  87. :return: None
  88. """
  89. # Се земаат атрибутите (зборовите) во предметот (документот)
  90. features = self.get_features(item)
  91. # Се зголемува бројот на секој атрибут во оваа категорија
  92. for current_feature in features:
  93. self.increment_feature_counts_per_category(current_feature, current_category)
  94.  
  95. # Се зголемува бројот на предмети (документи) во оваа категорија
  96. self.increment_category_counts(current_category)
  97.  
  98. def get_feature_per_category_probability(self, current_feature, current_category):
  99. """Веројатноста е вкупниот број на пати кога даден атрибут f (збор) се појавил во
  100. дадена категорија поделено со вкупниот број на предмети (документи) во категоријата
  101.  
  102. :param current_feature: атрибут
  103. :param current_category: карактеристика
  104. :return: веројатност на појавување
  105. """
  106. if self.get_category_count(current_category) == 0:
  107. return 0
  108. return self.get_feature_counts_per_category(current_feature, current_category) \
  109. / self.get_category_count(current_category)
  110.  
  111. def weighted_probability(self, current_feature, current_category, prf, weight=1.0, ap=0.5):
  112. """Пресметка на тежински усогласената веројатност
  113.  
  114. :param current_feature: атрибут
  115. :param current_category: категорија
  116. :param prf: функција за пресметување на основната веројатност
  117. :param weight: тежина
  118. :param ap: претпоставена веројатност
  119. :return: тежински усогласена веројатност
  120. """
  121. # Пресметај ја основната веројатност
  122. basic_prob = prf(current_feature, current_category)
  123. # Изброј колку пати се има појавено овој атрибут (збор) во сите категории
  124. totals = sum([self.get_feature_counts_per_category(current_feature, currentCategory) for currentCategory in
  125. self.categories()])
  126. # Пресметај ја тежински усредената веројатност
  127. bp = ((weight * ap) + (totals * basic_prob)) / (weight + totals)
  128. return bp
  129.  
  130.  
  131. class NaiveBayes(DocumentClassifier):
  132. def __init__(self, get_features):
  133. super().__init__(get_features)
  134. self.thresholds = {}
  135.  
  136. def set_threshold(self, current_category, threshold):
  137. """Поставување на праг на одлучување за категорија
  138.  
  139. :param current_category: категорија
  140. :param threshold: праг на одлучување
  141. :return: None
  142. """
  143. self.thresholds[current_category] = threshold
  144.  
  145. def get_threshold(self, current_category):
  146. """Добивање на прагот на одлучување за дадена класа
  147.  
  148. :param current_category: категорија
  149. :return: праг на одлучување за дадената категорија
  150. """
  151. if current_category not in self.thresholds:
  152. return 1.0
  153. return self.thresholds[current_category]
  154.  
  155. def calculate_document_probability_in_class(self, item, current_category):
  156. """Ја враќа веројатноста на документот да е од класата current_category
  157. (current_category е однапред позната)
  158.  
  159. :param item: документ
  160. :param current_category: категорија
  161. :return:
  162. """
  163. # земи ги зборовите од документот item
  164. features = self.get_features(item)
  165. # помножи ги веројатностите на сите зборови
  166. p = 1
  167. for current_feature in features:
  168. p *= self.weighted_probability(current_feature, current_category,
  169. self.get_feature_per_category_probability)
  170.  
  171. return p
  172.  
  173. def get_category_probability_for_document(self, item, current_category):
  174. """Ја враќа веројатноста на класата ако е познат документот
  175.  
  176. :param item: документ
  177. :param current_category: категорија
  178. :return: веројатност за документот во категорија
  179. """
  180. cat_prob = self.get_category_count(current_category) / self.get_total_count()
  181. calculate_document_probability_in_class = self.calculate_document_probability_in_class(item, current_category)
  182. # Bayes Theorem
  183. return calculate_document_probability_in_class * cat_prob / (1.0 / self.get_total_count())
  184.  
  185. def classify_document(self, item, default=None):
  186. """Класифицирање на документ
  187.  
  188. :param item: документ
  189. :param default: подразбирана (default) класа
  190. :return:
  191. """
  192. probs = {}
  193. # најди ја категоријата (класата) со најголема веројатност
  194. max = 0.0
  195. for cat in self.categories():
  196. probs[cat] = self.get_category_probability_for_document(item, cat)
  197. if probs[cat] > max:
  198. max = probs[cat]
  199. best = cat
  200.  
  201. # провери дали веројатноста е поголема од threshold*next best (следна најдобра)
  202. for cat in probs:
  203. if cat == best:
  204. continue
  205. if probs[cat] * self.get_threshold(best) > probs[best]: return default
  206.  
  207. return best
# Labelled sample corpus: a list of (label, text) tuples. The label is either
# 'neg' or 'pos' and the text is a short tweet-style message.
# NOTE(review): a few entries occur twice (e.g. the "im well bored ...",
# "@Raderr ..." and "just saw UP ..." messages) and one message contains
# mis-encoded characters; the strings are runtime data and are left as they are.
data = [('neg', "http://twitpic.com/664b7 - miss my bestfriend :\'( now she left school"),
        ('neg', "@shaundiviney i didnt get the msg!! :\'( but i bought princess"),
        ('neg', "@Jack_O_C I\'m seriously screwed I haven\'t studied at all!!!! :\'("),
        ('neg', "&quot;I\'m giving up on you. I don\'t care how you mess up your life now.&quot;... :\'("),
        ('neg', "jake thomas looks so precious :\'( i hate when they say &quot;there it is i see the white light&quot; in every ghost whisperer show"),
        ('neg', "@_YoureMyHeroine :\'( i really know how you feelin. i wish i could hug you"),
        ('neg', "im well bored had a great half term and i dont wanna go back to school on monday :\'( enjoying the hot weatler lo0l ;)"),
        ('neg', "@billbathgate im not a doofus it could happen wahhh!!! :\'(!!!!!!!!!! im on my break!!"),
        ('neg', "Manchester was wayyyyy to busy! so warm today also! :\'("),
        ('neg', "@RobynHumes Can\'t Bro on laptop &amp; Salm on comp! Me stuck with Wii :\'( xx"),
        ('neg', "@iamdiddy I need a hug I\'m doing my junior cert this week and I\'m totally stressed out :\'("),
        ('neg', "@danger_skies miss you too :\'( it is!i never want to come home.....seriously"),
        ('neg', "the suns gone hopefully nice weather tommorrow. ALL THE WORK IS SO DEPRESSING! :\'("),
        ('neg', "going out I can\'t do this crap anymore :\'("),
        ('neg', "im not happy my ipod or laptop dont know whih but one has decided to refuse to let me sync my songs :\'( how can i live without it :/"),
        ('neg', "Awww that lil girl on bgt :\'( when they said she didnt have time :\'( that was soo sad and them huggin her"),
        ('neg', "@mileycyrus i wish i could meet you once do u think this will happen someday? :\'("),
        ('neg', ":\'( big brother in 4 days! This means constant live tripe on e4 and no scrubs to fall asleep to! Not happy"),
        ('neg', "Its coming out the socket I feel like my phones hole is not a virgin. That\'s how loose it is... :\'("),
        ('neg', "What I\'m gonna do life is not good:\'( no more Exit in this hallway I\'m stuck in my world..."),
        ('neg', "@marting05 I know... he\'s mad at us... :\'("),
        ('neg', "&quot;your true theatre calling? - musical theatre actor&quot; i wish :\'( xxx"),
        ('neg', "@gimboland sorry change of plans for me :\'( revision for monday exam in a park with one of my friends"),
        ('neg', "ok... twitter I almost pass out because of you!! bastard :\'("),
        ('neg', "@TheNewBradie my tvs not working i wanna watch vhits :\'("),
        ('pos', "@DavidArchie &lt;3 your gonna be the first twitter ;) cause your amazing lol. come to canada would do anything to see you perform"),
        ('neg', "@__sugar oh no i am always here ;) &lt;3"),
        ('pos', "@kaseypoteet LOL yeah yeah you big perv ;) Was hoping to see you next week but scrapped plans"),
        ('pos', "@mattpro13 Maatt Havent spoken to you in ages dudeee Dont forget bout ur Aussie fan ;) lool. Love ya xx"),
        ('neg', "@hot30 i want to! but im not over 18 and t&amp;c says over 18\'s only wanna make an exception for me ;)"),
        ('pos', "just got Up and going to get ready to go to meadowhall ;) can\'t believe my internet broke yesterday GUTTED"),
        ('pos', "hmm..Osaka. Last show today.Very sad . i can decode ur msg ;) haha cant wait till u get 2 Sydney ;D i missed out on tickets tho :o xx"),
        ('pos', "Lobbying in twitter! Here too!! Yuk! Gettin rid of groupies ;)"),
        ('neg', "@MAVinBKK I know but the wait will be worth it - November just seems so far away at the moment ;)"),
        ('pos', "Raining... I missed the rain so much... I am grateful for it ;)"),
        ('neg', "@billbathgate ....any sorry wahh!! lub u toooo ;)"),
        ('pos', "I am sick but Ians coming over so its all good ;)"),
        ('neg', "@theguigirl Awwww...thanks!! ;) Unfortunately"),
        ('pos', "@xXHAZELXx: Ok its suppose 2b followfriday not unfollow Friday aw well I have nice tweeters anyway! &lt;-almost doesnt sound right...lol;)"),
        ('neg', "im well bored had a great half term and i dont wanna go back to school on monday :\'( enjoying the hot weatler lo0l ;)"),
        ('pos', "NOOOO!!! &quot;thehannabeth: i have a crush... ;)&quot;"),
        ('pos', "@nattymsmith awww she\'s laavly ;) I had to come in but I\'ve got a stunning wee tan (l) ;) yourself?"),
        ('neg', "@CursedChimera; Re: Home - that\'s exactly what I meant... home in D-town. ;) Also"),
        ('pos', "@MissShell20 eeeeep so jealous ;) I\'m at work um"),
        ('pos', "@christa42 you mean the post concert blues ;) *lol* Well"),
        ('pos', "@mike03p IM SOWWIE I WAS A LIL LATE LOL it looked good though ;)"),
        ('pos', "@nick_carter awww poor you - but you know ... you\'re doing it for US - bless you ;)))"),
        ('neg', "@thewhitemage It does it sometimes - it should come back ;) And the car thing sucks - I feel all anxious and yucky now"),
        ('neg', "Next weeks dlc is fail Can\'t wait for Maiden in two weeks though ;)"),
        ('neg', "@reemerband Hiyaa! How was Tour? Really disappointed that I couldn\'t make it Hope your all Dandy ;) xxxxxxxx"),
        ('neg', "@Dayna_aka_Rowan he could be talking to me (he\'s probably not though ;) )"),
        ('pos', "@princesspooh90 Yeah but it doesn\'t sound indie enough i need2learn some other tunes and then pick up mo style =] 1hour! I\'ll c u then ;)"),
        ('pos', "@laydmaxix aww i will keep sending it ;)"),
        ('pos', "@sunshineangel89 Yeah.. Of course next time. ;) ICQ?"),
        ('neg', "@letter2twilight LMAO! I don\'t fake being Paris anymore. Look at my bio ;) and by the way I can\'t log onto your forum..."),
        ('neg', "I shrunk my favourite cardigan. Hubby said he\'d buy me a new one. I practically lived in it and it\'s gone. I shall say a few words ;)"),
        ('neg', "likewise @Buttahbrown you better ask about me ;) I don\'t appreciate you neglecting the sir fresh."),
        ('neg', "@smaknews sorry about that anna wintour repeated tweets!! sooo sorry somethings up... | was wondering abt the quad tweet ;)"),
        ('pos', "@welsh_lottie Not one of my favourite pastimes This weekend is a long weekend here so Monday I\'m off to an Ice Show w/ the G/daughter ;)"),
        ('neg', "wanted to go to the club...dancing;)..but now iГЇВїВЅm tired anyways i have to go to work tomorrow!"),
        ('pos', "show was amazing. so cold out now hope I can give victoria my card and get my dvds back ;) ha"),
        ('neg', "@Jamiebower you should come to Chile and your band too;) why everything happens far away from here?? lol we\'re losing good live music!"),
        ('pos', "@montiAsutton I wish I could really do that I love having u around! Ill see what I can do.. ;) try to use that national champ pull lol"),
        ('pos', "@janetfraser so true sad to say. I\'m glad you\'ll be with me to be my support group ;)"),
        ('pos', "@spjwebster wish @njwebster was coming too I guess we can make time for you though if we have to ;)"),
        ('pos', "@NanaSuzee i\'m on my mobile so it won\'t let me but i can\'t stop thinking about you ;)x"),
        ('pos', "because he @the_real_nash wants to be an honorary Filipino i\'ll follow him now ;)) thanks @daxvelando!"),
        ('pos', "@marygrrl aww loves you! way too cute ;)"),
        ('neg', "@jordanknight TINK! (whatever the f**k it means!!) from your JKUK girls! Show us some love! ;) xx"),
        ('neg', "Redford - Sufjan Stevens ][ for @yoochun ill make you cry again &lt;3 @mimacruz sure no prob slugger ;;) add th... ? http://blip.fm/~5jdtm"),
        ('pos', "Thanks to all who follow me ... wish ya\'ll the best ;)"),
        ('pos', "@taylorswift13 oh great! hope you\'ll have a blast there! ;)"),
        ('pos', "@datadirt hahah okay then thanks for this short explanation ;)"),
        ('neg', "@QueenieCyrus morning miss sarah cyrus ;) WHAT\'S UP? x"),
        ('pos', "Happy Star Wars Day May the fourth be with you ;)"),
        ('pos', "@deanomarr Italy or greece for me Love Italian men hehe ;)"),
        ('pos', "@CyranDorman Woot! I have created something inspirational! ;) Look forward to seeing more of your writings"),
        ('neg', "gooooodnight i fully gave up on my english. pride and prejudice. love the movie HATE the book ;)"),
        ('pos', "@ckanal funny you should say...am filling it out as we speak ;) cheers!"),
        ('pos', "i\'m off to see a movie (&quot;17 again&quot;)"),
        ('pos', "good morning! i hope you all have a good day today!! although its a monday... be positive! ;)"),
        ('pos', "@bombchelle512 happy birthday and @joemwestbrook congrats wish you were here for your lady..ill take care of her;)"),
        ('pos', "@emilyosment http://twitpic.com/48gy0 - He\'s Well Cooool ;) Lovve The Ro&amp;Co Shoooow"),
        ('pos', "@SteveHealy - I shall post a notice in town so us Cork ladies know to watch out!! lol You had fun then?? Sevens were the business !! ;)"),
        ('neg', "@sensualbodyrubs Hope you get your car today Hate anything that stops me from my work ;)"),
        ('pos', "@TheOlifants de wereld need more ppl like you! ;)"),
        ('pos', "excited how the jon does will do today Good luck guys ;)"),
        ('pos', "14:14 .. someone is thinking of me good luck to lynny and her tattoo ;)"),
        ('neg', "10.11PM~ todays a drag for me. so bored. im about to get into the romance book so i prob wont be on til the morn night twitter babes ;)"),
        ('neg', "@leannarenee hope sequel edits go well me and my notebook will be looking for a place to sit after pt ;)"),
        ('neg', "@tsarnick Ohhhh I don\'t know ;) an older mature lady?"),
        ('pos', "@amorphia delegate I am now eating pate on toast a my wife is editing yesterday\'s engagement shoot ;)"),
        ('pos', "@dannywood ...hey danny .. did u run already ??? hope you have a good day ;) i love you !!!!"),
        ('pos', "@absolutspacegrl I could feel the excitement in that tweet! ;) I\'ll be watching the launch on NASA tv! How I love my directv! seriously."),
        ('neg', "@ianweiqiang Interesting Combination Have a great one ;)"),
        ('neg', "@Helmuts hey helmuts! im ratty if u remember me from scootertechno.. ;)"),
        ('pos', "@Raderr but yeah i like purple maybe thats why!! ;) :p :d"),
        ('neg', "@readerwave if I know what you want it is easier to please you ;). I am glad you mentioned it"),
        ('pos', "@spazzyyarn he totally got you! ;) i think it\'s awesome."),
        ('pos', "it\'s after 3 AM.!! I think it\'s time to bed.!! have a good night twitts.! ;))"),
        ('pos', "@NeilMcDaid looks class the water splash looks so real looking forward to my review copy ;)"),
        ('pos', "@LifeByChocolate alredy had my chocolate it is impossible to resist ;)"),
        ('pos', "What a beautiful day! Hangin with the guys Graham and Josiah lol waiting for the others. If you wanna stop by come on over ;) with food"),
        ('pos', "@wendywings cute Time for a twitpic ;)"),
        ('pos', "@Lilayy hi.wanna see 17 again again with me ;) i\'ll fly to cali and see it with you"),
        ('pos', "@miizronnie aha speaking German haha maybe i should send some stuff in Italian ;)"),
        ('pos', "@garrettmurray Same here! I just wanted it to keep going and not end... ever! ;)"),
        ('pos', "@xxLOVExxPEACE yes and i want you to keep going if you would ;)"),
        ('pos', "@David_Henrie haha i WISH i coudl meet you.. you should stop by seattle some time home of the STARBUKS ;) I LOVE YOU DAVID!!"),
        ('pos', "@erikarbautista ! HE HAS A FAVOURITE! You\'re his favourite ;) OMGAAH. sorry for creepering? ..not really lol"),
        ('pos', "watching my baby on snl ! baby you look greaaaaat ;)"),
        ('pos', "@missSHANNAbaby YAY u get to see ddub again ;) those 5 men always keep me happy &amp; motivated"),
        ('neg', "@Peacehippie04 is a loser;) baha"),
        ('pos', "@odangitsnikki There\'s a way around that 72 Minute Limit ;) AIM me and I\'ll tell you"),
        ('pos', "@SicilyYoder no not yet I ate a couple ;) I love Reeces but they are hard to get in NZ"),
        ('pos', "@james_a_michael CUTE thanks for sharing! AND PLEASE Direct Message ME before you go to bed James ;) ;) you know you want to!"),
        ('neg', "Hi Everyone miss me much? muahhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh ;)"),
        ('pos', "I think I would be a good radio dj...I like awesome music and I have a great personality!!!! ;) !!! !!!"),
        ('neg', "@findingurstyle lol it\'s funny but it\'s not the back... it\'s all in the legs ;) and I\'ve never hurt my back! Thank God! Thanks 4 watchin!"),
        ('pos', "@mileycyrus HI! I\'m Eunice Kyna! I\'m a HUGE fan of yours! Can\'t wait for your next album! ;)"),
        ('pos', "@JJRogue I have an interview on tuesday so things are turning around I think!! yay! so dont worry! And Japsicans are a rare breed. ;)"),
        ('pos', "@ztnewetnorb hha yeah like they have your heart too but weve met shaun and bradie... it seems more real ;)"),
        ('neg', "@parboo LOL - Birmingham was my 1st love... but it\'s time to move on! ;) Good Morning"),
        ('neg', "@KiwiLucy ahhh ;) I know who wins the entire thing"),
        ('pos', "@jasonperryrock Is you cat clean again? Hope so ;) Xx"),
        ('pos', "Alright... i need to get sleep so i can ACTUALLY be awake for my mothers\' day! ;) Nighty Nightzzz Or good morning my twitter friends!!!"),
        ('pos', "I need 4 followers to get 100 followers!! Fallow me!!! I fallow you back!! ;)"),
        ('neg', "I\'m not! They frighten me to death............I just like to see all the boys in their leathers! ;) We go every year. Brilliant atmos!"),
        ('pos', "@dannymasterson the honesty\'s to much........... Sorry couldn\'t resist;)"),
        ('pos', "@beautyholic woohoooo ;) to BOTH! retail therapy and surprise visits two things i love."),
        ('neg', "@damohopo I didn\'t headbutt anyone! Not that I know about anyway! ;) You ok today? Football today?"),
        ('neg', "Yang4 - finally got it Chinese is hard when every other kid has a Zhonguoren adult at home! We\'re all foreign devils here ;)"),
        ('neg', "@NursingDrPepper I told you I\'d be back Just won\'t be updating as much before my exams. Looking forward to a day or two in your house ;)"),
        ('neg', "doing the andy dance the one on fonzie gomez show ;) haha"),
        ('pos', "just saw UP it was a cute movie (:passed by a place called a peasants kitchen. wtf? that names kinda sad"),
        ('pos', "such a good day!! even though my so called friends did try to row away from me but god i love em :p"),
        ('neg', "@nathanblevins Maybe next time. Can\'t be away this weekend as much as I\'d like to jump in the car and go. ::pout::"),
        ('neg', "Waiting for the Denver game to come on.. but i dont think their gonna win it Lakers suck lol :p"),
        ('pos', "@funkylovin ah mine is never home before 8 I handed off the kids and grabbed the bottle of malibu and a coke..momma getting drinky :para"),
        ('neg', "@Kayleigh_Stack i wish i could go to both but i don\'t think i\'ll be allowed :p either way with two shows going your bound to get a ticket"),
        ('neg', "@lynnftw I know exactly what you are saying.. its so not cool... that is why tapes were better :p"),
        ('neg', "@emzyjonas Yea once - me and my friends flew out to amercia to see her w/ the Jonas brothers . have u? haha i hate bebo :p . aw cant wait"),
        ('pos', "@YousifMind good morning 3asa mo bs important classes ? :p"),
        ('pos', "@xx_Megan_xx oh dear lmao that a key ingredient :p cakes in the oven and now I\'m cooking my lunch paprika and chilli chicken YUM haha"),
        ('pos', "@Raderr but yeah i like purple maybe thats why!! ;) :p :d"),
        ('neg', "@Afey umm how abt a comment like that :p &quot;i dont like this&quot;"),
        ('pos', "I gave a homeless lady named Ruby an Ice Cream sandwich and a cigarette. That is my g00d deed for the day. :p"),
        ('neg', "i blame you all! got it??? good :p she better be in good condition 2! &lt;33 night"),
        ('pos', "@KOLsweetie hell yeah! Belgian beer is the bomb!! :p"),
        ('pos', "@Willy9e shouldn\'t I be going to sleep? Just kidding :p"),
        ('pos', "@andyclemmensen would that just eat away at your masculinity? What masculinity did you have? :p haha u\'d probs beat me tho haha xo"),
        ('pos', "#f1 soon good luck brawn and mclaren fix up look sharp :p"),
        ('pos', "@sosolid2k turtles and shoes make an awesome couple if only shoes could talk back to the turtle :p lol"),
        ('neg', "@n00rtje Thanks I\'ll explain on msn or something :p and I HATE SPIDERS TOO! What happened"),
        ('neg', "@ether_radio yeah :S i feel all funny cause i haven\'t slept enough i woke my mum up cause i was singing she\'s not impressed :S you?"),
        ('neg', "even though everyone wanted to do a newish song and our teacher agreed :S old grumpy doesn\'t like us happy haha"),
        ('pos', "I am soo happy! But frustrated at the same time! :] :S. Ohh noo!!! Britney is recording her new video for Radar!!! Sooo ExxCiiTeed!!!"),
        ('pos', "I love music so much that i\'ve gone through pain to play :S my sides of my fingers now are peeling and have blisters from playing so much"),
        ('neg', "in about half a hour i\'m going to my english lesson...guess i\'ll have to wait...and wait for a couple hours so i\'m over with it.(( :S ))"),
        ('neg', "@t0ns: nou moe... stomme banken/crisis shit :S"),
        ('neg', "@gfalcone601 ino :O i was near crying for her sometimes i forget that its actually live tv =/ .....am i talkin about the sme thing?:Sxxxx"),
        ('neg', "@FreyaLynn lol seriously. fail. ::sigh::"),
        ('pos', "@changedforgood Aww that sucks It wasn\'t that long though but Dianne was SO cute as usual &lt;3 Hopeflly some1 mite put it on youtube :S"),
        ('neg', "My computer dies soon - its so much virus on it but my virus scanner cant find it :S"),
        ('pos', "Greg:Showing my friends AudioBoo http://audioboo.fm/ Everyone seems 2 love it. Ta for the headzup bro. So need 2 get iPhone. Roll on June"),
        ('pos', "@CARAciao haha :S i started yesterday my dad helped me so much!!!"),
        ('pos', "BOOK NOW &amp; SAVE:SUMMER 2009 * THE AMAZONES VILLAGE SUITES****-CRETE-GREECE! THE BEST PLACE TO BE!"),
        ('neg', "thank you @ddlovato (: cant wait!!!! ummm btw ima crash still sick"),
        ('neg', "@ddlovato Caan\'t Iht Be Earlier? ICant Wait That Long. Ahar. (:"),
        ('pos', "@AnnetteStatus I luv urs! admitting niley with a kiss (: but then they change their minds haha &amp; demi/sterling (: not jemi but still cute"),
        ('neg', "My siblings left me alone. Bored. (:|"),
        ('pos', "slept all day.. lol. now time to start on my UN article.. what fun (: ..."),
        ('neg', "@mixxxonn we watched the today show(: we didnt see you though"),
        ('pos', "Had quite a cool day with Charlie and then Ben aswell (: got lost and stung by nettles"),
        ('neg', "In school ; With Victoria &amp; Bryan (: _ no more school soon"),
        ('neg', "last day at the Ko Olina. off to north shoreee(:"),
        ('pos', "this week of mine was not easy! but finally it\'s over! (:"),
        ('pos', "Aww what a sunny day! Tasty barbeque with the family (: Got bad sunburn though"),
        ('pos', "just saw UP it was a cute movie (:passed by a place called a peasants kitchen. wtf? that names kinda sad"),
        ('neg', "He didnt leave a voicemail.. -121908inlove(:"),
        ('neg', "@Spidersamm ohh yeahh (: i\'m probs gonna be a loner to start with"),
        ('neg', "no Santa cruz for me but I do have an interview at jamba tomorrow morning (:"),
        ('neg', "Can\'t beat all time low.. (: I soooooo want to go to Metro Station.. Your cheap shots wont be able to break bones"),
        ('pos', "@grcrssl Helloooo (: Star Wars day is cool LOOL. Wen do you go to Cnaterbury then ? x"),
        ('pos', "@torilovesbradie no probs(: and yeah im still sick. no school today lol. feel really crap but thats because im dancing lol . thanks"),
        ('pos', "Gdnight Tweeters (: Night @athenakg sleep tight and don\'t steal my blankets Otay! I love YOUS"),
        ('pos', "I\'m still up! Thank you all for praying (: AHAHAHA! I\'m watching Britney: For the Record until school. Today should be a good day"),
        ('pos', "@taylorswift13 i love you so much tay (: youre so amazing &lt;3 you should come to denmark"),
        ('pos', "@johncmayer heylo johnn (: im a huge fan. hope ur day is awesomee. cuzz im home sick and its kinda less than awesome.. anyways.. PEACE"),
        ('pos', "Watching TV with the best people in the whole world !!!!! My Mum and My Sis Agus (: Love you all ! Twitter you later ha"),
        ('pos', "@yaykimo baaha &amp; healthy choice my friend! (:"),
        ('pos', "Korean music festival &lt;33 i miss you ): Hahaha sexy time ! (: &lt;3 Can\'t wait till SHINee ! LOL !"),
        ('pos', "just got out of the pool!! so fun..now gonna watch tv and do stuff on the computer. (:"),
        ('pos', "just had cheese on toast with ham (: about to get ready to go to LONDON!"),
        ('pos', "YES! getting my sky + back on wednesday been waiting weeks for it (:"),
        ('pos', "14 dayssss ahhhh super excited \'they\'re telling me that my heart wont beat again\' JLS were awesome yesterday (:")]
  404.  
# Emoticon strings passed to get_words_with_emoticons by help_function so that
# they are kept as features; every entry is written in lowercase.
emoticons = [":'(", ";)", ":p", ":s", "(:"]
  406.  
  407.  
  408. def get_words_with_emoticons(doc, emoticons):
  409. """Поделба на документот на зборови. Стрингот се дели на зборови според
  410. празните места и интерпукциските знаци
  411.  
  412. :param doc: документ
  413. :type doc: str
  414. :return: множество со зборовите кои се појавуваат во дадениот документ
  415. :rtype: set(str)
  416. """
  417. # подели го документот на зборови и конвертирај ги во мали букви
  418. # па потоа стави ги во резултатот ако нивната должина е >2 и <20
  419. words = set()
  420. for word in re.split('\\W+', doc):
  421. if 2 < len(word) < 20:
  422. words.add(word.lower())
  423. emoti = doc.split(' ')
  424. for e in emoti:
  425. if e in emoticons:
  426. words.add(e.lower())
  427. return words
  428.  
  429. def help_function(x):
  430. return get_words_with_emoticons(x,emoticons)
  431.  
  432. if __name__ == '__main__':
  433. sample_ind = int(input())
  434. test=[data[i] for i in range(0,len(data)) if i==sample_ind]
  435. #print(test)
  436. trening=[data[i] for i in range(0,len(data)) if i!=sample_ind]
  437. #print(trening)
  438.  
  439. c1=NaiveBayes(get_words)
  440. c2=NaiveBayes(help_function)
  441. for x in trening:
  442. c1.train(x[1],x[0])
  443. c2.train(x[1],x[0])
  444.  
  445. klasa1=c1.classify_document(str(test))
  446. klasa2=c2.classify_document(str(test))
  447.  
  448. document=test[0][1]
  449. vistinska_klasa=test[0][0]
  450. print(document)
  451. print("Vistinska klasa: "+vistinska_klasa)
  452. print("Klasa predvidena so Naiven Baes (bez emotikoni): " +klasa1)
  453. print("Klasa predvidena so Naiven Baes (so emotikoni): " +klasa2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement