Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from html.parser import HTMLParser
- import re
- import plotly as py
- import plotly.graph_objs as go
class LinksParser(HTMLParser):
    """Collect message texts and their senders from an HTML document.

    The parser watches two kinds of elements:

    * ``user_tag`` elements (``span`` by default): when their text contains
      ``'@facebook'``, the digits of that text become the current sender id.
    * ``filter_tag`` elements (``p`` by default): their first text node is
      stored as a message attributed to the current sender.

    Presumably written for Facebook's ``messages.htm`` export -- confirm
    against the input actually being parsed.

    Attributes:
        message_data: list of ``(text, user_id)`` tuples in document order.
            The list starts with one empty tuple placeholder; consumers are
            expected to skip entries that are not 2-tuples.
        persons: sender ids in order of first appearance, without duplicates.
        user_id: id of the most recently seen sender, ``None`` until one
            has been seen.
    """

    def __init__(self):
        super().__init__()
        self.recordingData = False
        self.recordUser = False
        # The leading empty tuple is kept for backward compatibility with
        # code that indexes into or length-checks the entries.
        self.message_data = [()]
        self.user_data = []
        self.filter_tag = 'p'
        self.user_tag = 'span'
        self.user = ''
        # Must exist before the first message is handled; previously it was
        # only created once a sender had been seen, so a message element
        # preceding every sender raised AttributeError.
        self.user_id = None
        self.persons = []

    def handle_starttag(self, tag, attributes):
        """Arm the matching recording flag when a watched element opens."""
        if tag == self.filter_tag:
            self.recordingData = True
        if tag == self.user_tag:
            self.recordUser = True

    def handle_endtag(self, tag):
        """Disarm the matching recording flag when a watched element closes.

        Without this, an element that contains no text (e.g. ``<p></p>``)
        would leave its flag set and the next unrelated text node would be
        consumed in its place.
        """
        if tag == self.filter_tag:
            self.recordingData = False
        if tag == self.user_tag:
            self.recordUser = False

    def handle_data(self, data):
        """Interpret a text node as a sender id and/or a message body."""
        if self.recordUser and '@facebook' in data:
            # Only the digits of the address are used as the sender id.
            user_id = ''.join(ch for ch in data if ch.isdigit())
            self.recordUser = False
            self.user_id = user_id
            if user_id not in self.persons:
                self.persons.append(user_id)
        if self.recordingData:
            # Only the first text node of a message element is recorded.
            self.recordingData = False
            # A message seen before any sender cannot be attributed to
            # anyone, so it is skipped; this keeps every recorded user id
            # present in ``persons``.
            if self.user_id is not None:
                self.message_data.append((data, self.user_id))
# Unused at present; retained so ad-hoc debugging with pprint() keeps working.
from pprint import pprint

# One match per smiley: the two-character text emoticons ":)", ":(", ":p",
# ":s", ":P", any ":-" prefixed form, "^^", and the Unicode Emoticons block
# (U+1F600 through U+1F64F).
# NOTE(review): the previous implementation additionally counted every bare
# ":" which double-counted each ":x" emoticon and also matched URLs and
# timestamps. It has been dropped; if it stood in for a specific emoticon,
# add that emoticon to the pattern instead.
SMILEY_PATTERN = re.compile(r":[)(psP-]|\^\^|[\U0001F600-\U0001F64F]")

# Users must have sent more than this many messages to appear in the charts.
MIN_MESSAGES = 50


def count_smileys(text):
    """Return the number of smileys (text emoticons and emoji) in *text*."""
    return len(SMILEY_PATTERN.findall(text))


# --- Parse the exported conversation -------------------------------------
parser = LinksParser()
with open('messages.htm', encoding="utf8") as content_file:
    content = content_file.read()
parser.feed(content)

# --- Count messages and smileys per user ---------------------------------
smiley_count_dict = dict.fromkeys(parser.persons, 0)
message_count_dict = dict.fromkeys(parser.persons, 0)
for message in parser.message_data:
    # The parser's list contains a non-message placeholder; only
    # (text, user_id) pairs are counted.
    if len(message) != 2:
        continue
    text, key = message
    smiley_count_dict[key] += count_smileys(text)
    message_count_dict[key] += 1

# --- Build (smileys, messages, smileys-per-message, user) records --------
data = []
for key, message_count in message_count_dict.items():
    if message_count > 0:
        smiley_count = smiley_count_dict[key]
        rel = smiley_count / message_count
        data.append((smiley_count, message_count, rel, key))
# Ascending by smileys-per-message ratio.
data.sort(key=lambda tup: tup[2])

# Despite the name, this is every sufficiently active user, not just ten.
topTen = [entry for entry in data if entry[1] > MIN_MESSAGES]

keys = [x[3] for x in topTen]
messages = [x[1] for x in topTen]
# Index 0 holds the smiley count (index 1 is the message count, which was
# previously used here by mistake and made both traces identical).
smileys = [x[0] for x in topTen]
# Guard against an empty selection, which used to raise IndexError.
if topTen:
    print(topTen[0])
print(keys)

# --- Plot ----------------------------------------------------------------
trace1 = go.Bar(
    # x=keys,
    y=messages,
    name='Messages'
)
trace2 = go.Bar(
    # x=keys,
    y=smileys,
    name='Smileys'
)

# Chart 1: smileys-per-message ratio for each selected user.
g_rel = [x[2] for x in topTen]
g_data = [go.Bar(
    y=g_rel
)]
py.offline.plot(g_data, filename='basic-bar')

# Chart 2: stacked message/smiley counts. The figure is built but rendering
# is currently disabled; uncomment the last line to plot it.
g_data = [trace1, trace2]
layout = go.Layout(barmode='stack')
fig = go.Figure(data=g_data, layout=layout)
# py.offline.plot(fig, filename='stacked-bar')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement