Advertisement
Guest User

Untitled

a guest
Oct 22nd, 2019
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.23 KB | None | 0 0
  1. try:
  2. from urllib.parse import unquote_plus
  3. except ImportError:
  4. from urllib import unquote_plus
  5. import time, re, requests
  6. from bs4 import BeautifulSoup
  7.  
  8. class Scraper:
  9.  
  10. def __init__(self):
  11. self.base_url = 'http://fanfiction.net/'
  12. self.rate_limit = 1
  13. self.parser = "html.parser"
  14.  
  15. def get_genres(self, genre_text):
  16. genres = genre_text.split('/')
  17. # Hurt/Comfort is annoying because of the '/'
  18. corrected_genres = []
  19. for genre in genres:
  20. if genre == 'Hurt':
  21. corrected_genres.append('Hurt/Comfort')
  22. elif genre == 'Comfort':
  23. continue
  24. else:
  25. corrected_genres.append(genre)
  26. return corrected_genres
  27.  
  28. def scrape_story_metadata(self, story_id):
  29. """
  30. Returns a dictionary with the metadata for the story.
  31. Attributes:
  32. -id: the id of the story
  33. -canon_type: the type of canon
  34. -canon: the name of the canon
  35. -author_id: the user id of the author
  36. -title: the title of the story
  37. -updated: the timestamp of the last time the story was updated
  38. -published: the timestamp of when the story was originally published
  39. -lang: the language the story is written in
  40. -genres: a list of the genres that the author categorized the story as
  41. -num_reviews
  42. -num_favs
  43. -num_follows
  44. -num_words: total number of words in all chapters of the story
  45. -rated: the story's rating.
  46. """
  47. url = '{0}/s/{1}'.format(self.base_url, story_id)
  48. result = requests.get(url)
  49. html = result.content
  50. soup = BeautifulSoup(html, self.parser)
  51. if (soup.find(id='pre_story_links') != None):
  52. pre_story_links = soup.find(id='pre_story_links').find_all('a')
  53. author_id = int(re.search(r"var userid = (.*);", str(soup)).groups()[0]);
  54. title = re.search(r"var title = (.*);", str(soup)).groups()[0];
  55. title = unquote_plus(title)[1:-1]
  56. metadata_div = soup.find(id='profile_top')
  57. times = metadata_div.find_all(attrs={'data-xutime':True})
  58. metadata_text = metadata_div.find(class_='xgray xcontrast_txt').text
  59. metadata_parts = metadata_text.split('-')
  60. genres = self.get_genres(metadata_parts[2].strip())
  61. #print(times)
  62. metadata = {
  63. 'id': story_id,
  64. 'canon_type': pre_story_links[0].text,
  65. 'canon': pre_story_links[1].text,
  66. 'author_id': author_id,
  67. 'title': title,
  68. #'updated': int(times[0]['data-xutime']),
  69. #'published': int(times[1]['data-xutime']),
  70. 'lang': metadata_parts[1].strip(),
  71. 'genres': genres
  72. }
  73. if (len(times) == 1):
  74. metadata['published'] = int(times[0]['data-xutime'])
  75. metadata['updated'] = -1
  76. else:
  77. metadata['published'] = int(times[1]['data-xutime'])
  78. metadata['updated'] = int(times[0]['data-xutime'])
  79. for parts in metadata_parts:
  80. parts = parts.strip()
  81. tag_and_val = parts.split(':')
  82. if len(tag_and_val) != 2:
  83. continue
  84. tag, val = tag_and_val
  85. tag = tag.strip().lower()
  86. if tag not in metadata:
  87. val = val.strip()
  88. try:
  89. val = int(val.replace(',', ''))
  90. metadata['num_'+tag] = val
  91. except:
  92. metadata[tag] = val
  93. if 'status' not in metadata:
  94. metadata['status'] = 'Incomplete'
  95. return metadata
  96. return None
  97.  
  98. def scrape_story(self, story_id, keep_html=False):
  99. metadata = self.scrape_story_metadata(story_id)
  100. metadata['chapters'] = {}
  101. metadata['reviews'] = {}
  102. num_chapters = metadata['num_chapters']
  103. # rate limit to follow fanfiction.net TOS
  104. time.sleep(self.rate_limit)
  105. for chapter_id in range(1, num_chapters + 1):
  106. time.sleep(self.rate_limit)
  107. chapter = self.scrape_chapter(story_id, chapter_id)
  108. time.sleep(self.rate_limit)
  109. chapter_reviews = self.scrape_reviews_for_chapter(
  110. story_id, chapter_id)
  111. metadata['chapters'][chapter_id] = chapter
  112. metadata['reviews'][chapter_id] = chapter_reviews
  113. return metadata
  114.  
  115. def scrape_chapter(self, story_id, chapter_id, keep_html=False):
  116. url = '{0}/s/{1}/{2}'.format(self.base_url, story_id, chapter_id)
  117. result = requests.get(url)
  118. html = result.content
  119. soup = BeautifulSoup(html, self.parser)
  120. chapter = soup.find(class_='storytext')
  121. if not keep_html:
  122. chapter_text = chapter.get_text(' ').encode('utf8')
  123. return chapter_text
  124.  
  125. def scrape_reviews_for_chapter(self, story_id, chapter_id):
  126. """
  127. Scrape reviews for chapter in story.
  128. Returns:
  129. Array of review dicts.
  130. Each review dict contains the user id of the reviewer if it exists,
  131. the timestamp of the review, and the text of the review.
  132. """
  133. url = '{0}/r/{1}/{2}'.format(self.base_url, story_id, chapter_id)
  134. result = requests.get(url)
  135. html = result.content
  136. soup = BeautifulSoup(html, self.parser)
  137. reviews_table = soup.find(class_='table-striped').tbody
  138. reviews_tds = reviews_table.find_all('td')
  139. reviews = []
  140. for review_td in reviews_tds:
  141. match = re.search(r'href="/u/(.*)/.*">.*</a>', str(review_td))
  142. if match is not None:
  143. user_id = int(match.groups()[0])
  144. else:
  145. user_id = None
  146. time = review_td.find('span', attrs={'data-xutime':True})
  147. time = int(time['data-xutime'])
  148. review = {
  149. 'time': time,
  150. 'user_id': user_id,
  151. 'text': review_td.div.text.encode('utf8')
  152. }
  153. reviews.append(review)
  154. return reviews
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement