Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import logging
- import os
- from grab.spider import Spider, Task
class JavaGamesSpider(Spider):
    """Spider that mirrors the dc.sever.ru listing tree under ``./data``.

    Handlers run in this order, each one scheduling tasks for the next:

    * ``task_initial`` -> ``task_dir`` -> ``task_category`` walk three
      levels of listing pages and create one directory per link.
    * ``task_files`` walks a (possibly paginated) item listing and creates
      one directory per item.
    * ``task_file`` parses an item page, schedules the screenshot and
      download requests, and writes ``description.txt``.
    * ``task_screen`` / ``task_game`` save the fetched response bodies.

    NOTE(review): link texts from the remote site are used verbatim as
    directory names; a name containing ``/`` or ``..`` is not sanitised.
    """

    initial_urls = ['http://dc.sever.ru/?d=19']

    # Site root; hrefs found on the pages are appended to it verbatim.
    BASE_URL = 'http://dc.sever.ru'
    # Local root directory of the mirrored tree.
    DATA_ROOT = './data/'
    # Links of the three listing levels all live under the same XPath.
    LISTING_XPATH = '/html/body/div[3]//dt/a'
    # The description is the text that follows this marker on an item page...
    DESCRIPTION_MARKER = 'Описание:'
    # ...and it is cut at the first of these labels that follows it.
    DESCRIPTION_STOP_WORDS = (
        'От Sever.Ru:',
        'Рейтинг (max10):',
        'Разработчик:',
        'Версия:',
        'Год:',
        'Язык:',
        'Добавлено:',
    )

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` (and parents), tolerating an existing directory.

        An exists-then-create check can race when several handlers run
        concurrently, so the directory is created first and the error is
        ignored only if the directory really is there afterwards.
        """
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    @staticmethod
    def _link_name(elem):
        """Return the link text with its last whitespace-separated token removed.

        Presumably the dropped token is a per-entry counter shown by the
        site; verify against the live markup.
        """
        return ' '.join(elem.text().split()[:-1])

    def _write_description(self, page_text, directory):
        """Extract the description from ``page_text`` and save it.

        Nothing is written when the marker is absent from the page text.
        The file is ``<directory>/description.txt``, encoded as UTF-8.
        """
        # Function-scope import keeps this class self-contained.
        import io

        parts = page_text.split(self.DESCRIPTION_MARKER)
        if len(parts) < 2:
            # Page has no description section; indexing [1] would raise.
            return
        description = parts[1]
        for stop_word in self.DESCRIPTION_STOP_WORDS:
            description = description.split(stop_word)[0]
        # Explicit encoding: the text is Cyrillic and must not depend on
        # the platform's default locale encoding.
        with io.open(directory + '/description.txt', 'w',
                     encoding='utf-8') as out:
            out.write(description.strip())

    def task_initial(self, grab, task):
        """Handle the start page: one directory and one 'dir' task per link."""
        for elem in grab.doc.select(self.LISTING_XPATH):
            name = self._link_name(elem)
            self._ensure_dir(self.DATA_ROOT + name)
            yield Task('dir', url=self.BASE_URL + elem.attr('href'), dir=name)

    def task_dir(self, grab, task):
        """Handle a second-level listing nested under ``task.dir``."""
        for elem in grab.doc.select(self.LISTING_XPATH):
            name = self._link_name(elem)
            self._ensure_dir(self.DATA_ROOT + task.dir + '/' + name)
            yield Task('category', url=self.BASE_URL + elem.attr('href'),
                       dir=name, pre_dir=task.dir)

    def task_category(self, grab, task):
        """Handle a third-level listing under ``task.pre_dir``/``task.dir``."""
        for elem in grab.doc.select(self.LISTING_XPATH):
            name = self._link_name(elem)
            dir_name = (self.DATA_ROOT + task.pre_dir + '/' + task.dir
                        + '/' + name)
            self._ensure_dir(dir_name)
            yield Task('files', url=self.BASE_URL + elem.attr('href'),
                       path=dir_name)

    def task_files(self, grab, task):
        """Handle one page of an item listing rooted at ``task.path``.

        Yields a 'file' task per item link and, if a "next page" link is
        present, another 'files' task for the same ``task.path``.
        """
        for elem in grab.doc.select('/html/body/div[4]/dl//a'):
            dir_name = task.path + '/' + elem.text()
            self._ensure_dir(dir_name)
            yield Task('file', url=self.BASE_URL + elem.attr('href'),
                       dir=dir_name)

        for link in grab.doc.select('/html/body/div[5]/a'):
            if link.text() == 'Далее->':
                yield Task('files', url=self.BASE_URL + link.attr('href'),
                           path=task.path)

    def task_file(self, grab, task):
        """Handle an item page whose files belong in ``task.dir``.

        Yields 'screen' tasks for screenshots and 'game' tasks for download
        links that are not on disk yet, then writes the description file.
        """
        # The text of the last "index..." link is used as the screenshot
        # count; with no such links a single screenshot is assumed.
        end_index = 1
        for item in grab.doc.select(
                '/html/body/div[2]/a[starts-with(@href, "index")]'):
            end_index = item.text()

        if grab.doc.select('/html/body/div[2]/img[2]').exists():
            # The id is the value of the second '=' in the task URL, with a
            # trailing '&order' removed.
            fid = task.url.split('=')[2].replace('&order', '')
            for n in range(int(end_index)):
                path = task.dir + '/screenshot_%d.jpg' % n
                if not os.path.exists(path):
                    yield Task('screen',
                               url='http://dc.sever.ru/scr.php?c=%s&s=%d'
                                   % (fid, n),
                               path=task.dir, nid=n)

        for link in grab.doc.select(
                '/html/body/div[3]/a[starts-with(@href, "/get")]'):
            # The link text wraps the file type, e.g. 'Скачать(JAR)' -> 'jar'.
            file_type = (link.text()
                         .replace('Скачать(', '')
                         .replace(')', '')
                         .lower())
            path = task.dir + '/game.%s' % file_type
            if not os.path.exists(path):
                yield Task('game', url=self.BASE_URL + link.attr('href'),
                           path=task.dir, type=file_type)

        info_block = grab.doc.select('/html/body/div[2]')
        if info_block.exists():
            self._write_description(info_block.text(), task.dir)

    def task_game(self, grab, task):
        """Save the fetched response as ``game.<task.type>`` in ``task.path``."""
        path = task.path + '/game.%s' % task.type
        grab.response.save(path)

    def task_screen(self, grab, task):
        """Save the fetched response as ``screenshot_<task.nid>.jpg``."""
        path = task.path + '/screenshot_%d.jpg' % task.nid
        grab.response.save(path)
if __name__ == '__main__':
    # Script entry point: enable DEBUG-level logging, then build the spider
    # with thread_number=2 and run the crawl.
    logging.basicConfig(level=logging.DEBUG)
    spider = JavaGamesSpider(thread_number=2)
    spider.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement