Guest User

Untitled

a guest
May 20th, 2018
152
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.48 KB | None | 0 0
  1. from urllib.request import urlopen
  2. from bs4 import BeautifulSoup as soup
  3. import json
  4.  
  5. base_url = "https://archive.org/details/librivoxaudio?&sort=titleSorter"
  6.  
  7. data = []
  8. n = 5
  9. for i in range(1, n+1):
  10. response = urlopen(base_url + "&page=" + str(i))
  11. page_html = response.read()
  12. response.close()
  13.  
  14. #html parsing
  15. page_soup = soup(page_html, "html.parser")
  16.  
  17. #grabs info for each book
  18. containers = page_soup.findAll("div",{"class":"item-ttl"})
  19. authors = page_soup.findAll("span",{"class":"byv"})
  20.  
  21. for container in containers:
  22. item = {}
  23. item['type'] = "Public Domain Audiobook"
  24. item['title'] = container.text.lstrip().strip()
  25. for author in authors:
  26. item['author'] = author.text
  27. item['link'] = "https://archive.org/" + container.a["href"]
  28. item['source'] = "LibriVox"
  29. item['base_url'] = "https://librivox.org/"
  30. data.append(item) # add the item to the list
  31.  
  32. with open("./json/librivoxTest.json", "w") as writeJSON:
  33. json.dump(data, writeJSON, ensure_ascii=False)
  34.  
  35. {
  36. "type": "Public Domain Audiobook",
  37. "title": "A Book of Old English Ballads",
  38. "author": "Charles Whibley",
  39. "link": "https://archive.org//details/book_old_english_ballads_1007_librivox",
  40. "source": "LibriVox",
  41. "base_url": "https://librivox.org/"
  42. }, {
  43. "type": "Public Domain Audiobook",
  44. "title": "A Book of Scoundrels",
  45. "author": "Charles Whibley",
  46. "link": "https://archive.org//details/scoundrels_1712_librivox",
  47. "source": "LibriVox",
  48. "base_url": "https://librivox.org/"
  49. }
Add Comment
Please, Sign In to add comment