Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Scrape the first listing pages of the LibriVox audiobook collection on
# archive.org and save one record per book (title, author, link) as JSON.
import json
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup as soup

ARCHIVE_ROOT = "https://archive.org/"
base_url = "https://archive.org/details/librivoxaudio?&sort=titleSorter"
data = []
n = 5  # number of listing pages to fetch (pages 1..n)


def _is_title_or_byline(tag):
    """Return True if *tag* is a title container or an author byline.

    Matches the same two kinds of tags the scraper cares about:
    ``<div class="item-ttl">`` and ``<span class="byv">``.
    """
    classes = tag.get("class") or []
    if tag.name == "div":
        return "item-ttl" in classes
    if tag.name == "span":
        return "byv" in classes
    return False


def find_author(container):
    """Return the author byline that belongs to a title container.

    Walks forward in document order from *container* and stops at the first
    byline or title container, whichever comes first. Reaching another title
    first means this book has no byline, so an empty string is returned
    instead of borrowing the next book's author.

    NOTE(review): assumes a book's byline appears after its title in the page
    markup -- confirm against the live archive.org HTML.
    """
    following = container.find_next(_is_title_or_byline)
    if following is not None and following.name == "span":
        return following.text.strip()
    return ""


for i in range(1, n + 1):
    # The context manager closes the connection even if read() raises.
    with urlopen(base_url + "&page=" + str(i)) as response:
        page_html = response.read()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # grabs info for each book
    containers = page_soup.findAll("div", {"class": "item-ttl"})
    for container in containers:
        item = {}
        item['type'] = "Public Domain Audiobook"
        item['title'] = container.text.strip()
        # Look up the byline per book. The previous version looped over every
        # byline on the page for each book, so every record ended up with the
        # last author on the page.
        item['author'] = find_author(container)
        # hrefs are root-relative ("/details/..."); urljoin avoids the
        # "https://archive.org//details/..." double slash.
        item['link'] = urljoin(ARCHIVE_ROOT, container.a["href"])
        item['source'] = "LibriVox"
        item['base_url'] = "https://librivox.org/"
        data.append(item)  # add the item to the list

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 rather than the platform default encoding.
with open("./json/librivoxTest.json", "w", encoding="utf-8") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)
- {
- "type": "Public Domain Audiobook",
- "title": "A Book of Old English Ballads",
- "author": "Charles Whibley",
- "link": "https://archive.org//details/book_old_english_ballads_1007_librivox",
- "source": "LibriVox",
- "base_url": "https://librivox.org/"
- }, {
- "type": "Public Domain Audiobook",
- "title": "A Book of Scoundrels",
- "author": "Charles Whibley",
- "link": "https://archive.org//details/scoundrels_1712_librivox",
- "source": "LibriVox",
- "base_url": "https://librivox.org/"
- }
Add Comment
Please, Sign In to add comment