Advertisement
Eshkation-

topic_listener

Feb 24th, 2018
195
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.10 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. import requests, re
  4. from bs4 import BeautifulSoup
  5.  
  6. class topic_lister:
  7.     def __init__(self, section):
  8.         self.url = 'https://atelier801.com/section?{0}'.format(section)
  9.         self.current_page = 1
  10.         self.left_pages = 1
  11.         self.section = {
  12.             'name': 'undefined',
  13.             'tags': {},
  14.             'topics': {},
  15.             'unique_authors': {},
  16.             'pages': 0
  17.         }
  18.  
  19.     def generate_result(self, file_name, tag_only = False):
  20.         final_text = u'[size=13][size=17][color=#2ecf73]{0}[/color][/size]\n[b]Topic tags ({4}):[/b] {1}\n\n[b]Topic authors ({5}):[/b] {2}\n\n[b]Total topics:[/b] {3}[/size]'
  21.  
  22.         tags_text = u''
  23.         for tag in sorted(self.section['tags'].iterkeys(), key=lambda k: len(self.section['tags'][k]), reverse=True):
  24.             tag_data =  self.section['tags'][tag]
  25.             tags_text += u', {0} ({1})'.format(tag, len(tag_data))
  26.         tags_text = u'[font=Monospace]{0}[/font]'.format(tags_text[2:])
  27.  
  28.         authors_text = u''
  29.         for author in sorted(self.section['unique_authors'].iterkeys(), key=lambda k: len(self.section['unique_authors'][k]), reverse=True):
  30.             author_data = self.section['unique_authors'][author]
  31.             authors_text += u', {0} ({1})'.format(author, len(author_data))
  32.         authors_text = u'[font=Monospace]{0}[/font]'.format(authors_text[2:])
  33.  
  34.         topics_text = u'\n\n[size=12][font=Monospace][table]'
  35.         cel_normalizer = u'-' * 32
  36.         cel_normalizer = u'[color=#1c3c41]'+cel_normalizer+u'[/color]'
  37.         for topic in sorted(self.section['topics'].iterkeys()):
  38.             topic_data = self.section['topics'][topic]
  39.             t_url_title = re.findall('\d+', topic_data['url'])[1]
  40.             topics_text += u'[row]\n\t[cel]{0}[/cel]\n\t[cel]{1}[/cel]\n\t[cel]{2}[/cel]\n\t[cel][url=https://atelier801.com/{3}]T-{4}[/url][/cel]\n[/row]\n'.format(topic_data['title'], topic_data['author'], topic_data['tagged'], topic_data['url'], t_url_title)
  41.  
  42.         final_text = final_text.format(self.section['name'], tags_text, authors_text, len(self.section['topics']), len(self.section['tags']), len(self.section['unique_authors']))
  43.         final_text += topics_text+u'[/table][/font][/size]'
  44.  
  45.         file_stream = open(file_name, 'a')
  46.         file_stream.write(final_text.encode('utf-8'))
  47.         file_stream.close()
  48.  
  49.     def load_section(self):
  50.         formatted_url = '{0}&p={1}'.format(self.url, self.current_page)
  51.         print(u'({0}/{1}) Requisitando pΓ‘gina [{2}]'.format(self.current_page, self.left_pages, formatted_url))
  52.         request = requests.get(formatted_url)
  53.         html_page = request.text
  54.         self.search_topics(html_page)
  55.         self.get_left_pages(html_page)
  56.  
  57.         if (self.current_page < self.left_pages):
  58.             self.current_page += 1
  59.             self.load_section()
  60.  
  61.     def get_left_pages(self, content):
  62.         html = BeautifulSoup(content, 'html.parser')
  63.         navigation_div = html.find(attrs = {
  64.             'class': 'cadre-pagination'
  65.         }).find('a').text
  66.         total_pages = re.match('\d+ / (\d+)', navigation_div)
  67.         if (total_pages):
  68.             self.left_pages = int(total_pages.group(1))
  69.  
  70.     def search_topics(self, content):
  71.         html = BeautifulSoup(content, 'html.parser')
  72.         self.section['name'] = html.title.string
  73.         topics = html.find_all(attrs = {
  74.             'class': 'cadre-sujet'
  75.         })
  76.         for topic in topics:
  77.             topic_title = topic.find(attrs = {
  78.                 'class': 'cadre-sujet-titre'
  79.             })
  80.             topic_name = unicode(topic_title.text).strip()
  81.             topic_tag = re.match('\[(.*?)\]', topic_name)
  82.             topic_tag = topic_tag and topic_tag.group(1) or 'none'
  83.             if (not topic_tag in self.section['tags']):
  84.                 self.section['tags'][topic_tag] = []
  85.             topic_url = topic_title.get('href')
  86.             self.section['tags'][topic_tag].append(topic_url)
  87.             topic_name = topic_name#.replace(u'['+topic_tag+u']', '').strip().title()
  88.  
  89.             topic_sujet = topic.find(attrs = {
  90.                 'class': 'element-sujet'
  91.             })
  92.             topic_author = topic.find(attrs = {
  93.                 'class': 'cadre-sujet-infos'
  94.             }).find('span', attrs = {
  95.                 'class': re.compile('cadre-type-auteur-(.*?)')
  96.             }).text
  97.             if (not topic_author in self.section['unique_authors']):
  98.                 self.section['unique_authors'][topic_author] = []
  99.             self.section['unique_authors'][topic_author].append(topic_url)
  100.  
  101.             topic_date = int(topic_sujet.find('span').text)/1e3
  102.             self.section['topics'][topic_name] = {
  103.                 'title': topic_name,
  104.                 'tagged': topic_tag.lower(),
  105.                 'author': topic_author,
  106.                 'created': topic_date,
  107.                 'url': topic_url
  108.             }
  109.  
  110.  
  111. topicHandler = topic_lister('f=6&s=19') # f=6&s=19 Γ© o link do editor de mapas
  112. topicHandler.load_section()
  113. topicHandler.generate_result(u'editor_de_mapas.txt') # arquivo para salvar o bbcode
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement