Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/python
# -*- coding: utf8 -*-
from __future__ import print_function

from random import randint

from bs4 import BeautifulSoup
from selenium import webdriver
# Print a few consecutive sentences from a randomly chosen Aozora Bunko work.
#
# The script walks four pages: the site front page, a randomly chosen index
# page linked from it, a randomly chosen work ("card") page from that index,
# and finally the work's HTML text, from which a random excerpt is printed.

BASE_URL = 'http://www.aozora.gr.jp'
SENTENCE_END = u'\u3002'  # ideographic full stop, the sentence terminator
NUMBER_OF_SENTENCES = 3


def random_item(items):
    """Return one element of the non-empty sequence *items*, chosen uniformly.

    Raises ValueError (from ``randint``) when *items* is empty.
    """
    return items[randint(0, len(items) - 1)]


def parent_url(url):
    """Return *url* cut just before its last '/', or *url* itself if it has none."""
    head, sep, _ = url.rpartition('/')
    return head if sep else url


def pick_html_href(hrefs):
    """Return the first href ending in '.html', or the last href if none does.

    The fallback keeps the behaviour the script had in practice: the last
    link of the download table was always the one followed.

    Raises ValueError when *hrefs* is empty.
    """
    if not hrefs:
        raise ValueError('no download links to choose from')
    for href in hrefs:
        if href.endswith('.html'):
            return href
    return hrefs[-1]


def sentence_ends(text):
    """Return ``[-1]`` followed by the index of every full stop in *text*.

    The leading -1 is a sentinel so that ``ends[k] + 1`` is the start of
    sentence ``k`` for every k, including the first sentence.
    """
    return [-1] + [i for i, ch in enumerate(text) if ch == SENTENCE_END]


def random_excerpt(text, count):
    """Return *count* consecutive sentences of *text* starting at a random one.

    If *text* holds fewer than *count* complete sentences, all of them are
    returned; if it holds no full stop at all, *text* is returned unchanged.
    Anything after the last full stop is never part of the excerpt.
    """
    ends = sentence_ends(text)
    available = len(ends) - 1  # number of complete sentences
    if available == 0:
        return text
    count = min(count, available)
    # The highest valid start keeps ends[first + count] inside the list.
    first = randint(0, available - count)
    return text[ends[first] + 1:ends[first + count] + 1]


def fetch_soup(driver, url):
    """Announce and load *url* in *driver*, returning the parsed page."""
    print('Opening ' + url)
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def find_or_fail(soup, tag_name, css_class, url):
    """Return the first ``<tag_name class=css_class>`` in *soup*.

    Raises RuntimeError naming *url* when the page has no such element.
    """
    tag = soup.find(tag_name, class_=css_class)
    if tag is None:
        raise RuntimeError(
            'no <%s class="%s"> found on %s' % (tag_name, css_class, url))
    return tag


def main():
    """Open a random work on the site and print a random excerpt from it."""
    driver = webdriver.PhantomJS(executable_path=r'./phantomjs')
    try:
        # Front page: the index links are taken from the fourth 'summary' cell.
        soup = fetch_soup(driver, BASE_URL)
        index_links = soup.find_all('td', class_='summary')[3].find_all('a')
        index_url = BASE_URL + '/' + random_item(index_links)['href']

        # Index page: the first three characters of the chosen href are
        # dropped before it is appended to the site root.
        soup = fetch_soup(driver, index_url)
        work_links = find_or_fail(soup, 'table', 'list', index_url).find_all('a')
        card_url = BASE_URL + '/' + random_item(work_links)['href'][3:]

        # Card page: the first two characters of the chosen href are dropped
        # and the rest is appended to the card page's directory.
        soup = fetch_soup(driver, card_url)
        download_table = find_or_fail(soup, 'table', 'download', card_url)
        html_href = pick_html_href(
            [anchor['href'] for anchor in download_table.find_all('a')])
        text_url = parent_url(card_url) + '/' + html_href[2:]

        # Text page: read the body text once, then print the excerpt.
        soup = fetch_soup(driver, text_url)
        body = find_or_fail(soup, 'div', 'main_text', text_url).text
        print(random_excerpt(body, NUMBER_OF_SENTENCES))
    finally:
        # Always shut the browser down, even when a page is missing a table.
        driver.quit()


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement