Advertisement
polv

Create a chunk of text from Aozora Bunko

Nov 27th, 2017
178
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.70 KB | None | 0 0
  1. #!/usr/bin/python
  2. # -*- coding: utf8 -*-
  3.  
  4. from selenium import webdriver
  5. from bs4 import BeautifulSoup
  6. from random import randint
  7.  
  8. driver = webdriver.PhantomJS(executable_path = r'./phantomjs')
  9. site = 'http://www.aozora.gr.jp'
  10.  
  11. print 'Opening ' + site
  12. driver.get(site)
  13.  
  14. html_doc = driver.page_source
  15. soup = BeautifulSoup(html_doc, 'lxml')
  16.  
  17. td_tag = soup.find_all('td', class_ = 'summary')
  18. a_tag = td_tag[3].find_all('a')
  19.  
  20. rand = randint(0, len(a_tag)-1)
  21. print 'Opening ' + site + '/' + a_tag[rand]['href']
  22. driver.get(site + '/' + a_tag[rand]['href'])
  23.  
  24. html_doc = driver.page_source
  25. soup = BeautifulSoup(html_doc, 'lxml')
  26.  
  27. table_tag = soup.find('table', class_ = 'list')
  28. a_tag = table_tag.find_all('a')
  29.  
  30. rand = randint(0, len(a_tag)-1)
  31. site = site + '/' + a_tag[rand]['href'][3:]
  32. print 'Opening ' + site
  33. driver.get(site)
  34. html_doc = driver.page_source
  35.  
  36. soup = BeautifulSoup(html_doc, 'lxml')
  37.  
  38. table_tag = soup.find('table', class_ = 'download')
  39. a_tag = table_tag.find_all('a')
  40.  
  41. for i in range(0, len(site)):
  42.     if site[len(site)-i-1] == '/':
  43.         site = site[:len(site)-i-1]
  44.         break
  45.  
  46. for a in a_tag:
  47.     length = len(a['href'])
  48.     if a['href'][length-5:length-1] == 'html':
  49.         break
  50. site = site + '/' + a['href'][2:]
  51. print 'Opening ' + site
  52. driver.get(site)
  53. html_doc = driver.page_source
  54.  
  55. soup = BeautifulSoup(html_doc, 'lxml')
  56.  
  57. div_tag = soup.find('div', class_ = 'main_text')
  58.  
  59.  
  60. full_stop_array = [-1, ]
  61. for i in range(0, len(div_tag.text)):
  62.     if div_tag.text[i] == u'。':
  63.         full_stop_array += [i, ]
  64.  
  65. number_of_sentences = 3
  66. rand = randint(0, len(full_stop_array) - number_of_sentences)
  67. print div_tag.text[full_stop_array[rand]+1:full_stop_array[rand+number_of_sentences]+1]
  68.  
  69. driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement