NLTK

import mechanicalsoup
import nltk
from sklearn.feature_extraction.text import CountVectorizer

class Scrapper:
    def __init__(self):
        self.browser = mechanicalsoup.StatefulBrowser(raise_on_404=True)
        self.response = self.browser.open('http://fanfiction.net')  # was a bare browser, undefined here

    def filtered_fan_fiction(self, URL):
        Filters = ('srt', 'lan', 'len', 'p')  # list of desired filters
        FilterValues = (3, 1, 0, 2)  # list of values for the filters
        FilterSet = ''
        for i in range(len(Filters)):
            FilterSet += '&%s=%d' % (Filters[i], FilterValues[i])
        newURL = URL + '?' + FilterSet
        print(newURL)
        return newURL
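
    def filtered_fan_fiction_v2(self, URL):
        # A minimal alternative sketch (not in the original paste): the same
        # query string can be built with the standard library's urlencode,
        # which also handles escaping. Assumes the filter keys and values
        # used in filtered_fan_fiction above.
        from urllib.parse import urlencode
        params = {'srt': 3, 'lan': 1, 'len': 0, 'p': 2}
        return URL + '?' + urlencode(params)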

    def cleaning(self, soup):  # function to clean HTML leftovers from soup.get_text()
        splitted = soup.split('\n')
        result = splitted
        p = 0  # was never initialized; a page without '});' raised NameError below
        d = 0
        for n in range(len(splitted)):
            if splitted[n] == '});':
                p = n
                break
        for n in range(len(splitted)):
            if splitted[n] == ' function review_init() {':
                d = n
                result = splitted[p + 3:d - 2]
                break
        result = [frag.replace("\'", '') for frag in result]
        result = [frag.replace("\r", '') for frag in result]
        return result
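
    def cleaning_v2(self, soup):
        # Alternative sketch (my assumption, not part of the original paste):
        # instead of pattern-matching the extracted text for script leftovers,
        # drop the <script> tags from the parsed page before calling
        # get_text(). Expects the BeautifulSoup page object rather than a
        # string, e.g. the result of browser.get_current_page().
        for script in soup.find_all('script'):
            script.decompose()
        return soup.get_text()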

    def tokenizing(self, browser):  # function to download and tokenize data
        soup = browser.get_current_page()
        page = soup.get_text()
        cleaned = self.cleaning(page)  # was a bare cleaning() call, a NameError inside the class
        tokens = nltk.word_tokenize(str(cleaned))  # requires the NLTK 'punkt' tokenizer data
        text_tokens = nltk.Text(tokens)
        return text_tokens

    def preparing_links(self, browser):  # prepare (clean) the scraped list of links before downloading
        l1 = browser.links()
        l2 = []
        links = []
        l4 = []
        l5 = []
        chapters = []
        for n in range(len(l1)):
            if 'stitle' in str(l1[n]):
                l2.append(l1[n])
                l4.append(l1[n + 1])
        for n in l2:
            beglink = str(n).find('href="')
            endlink = str(n).find('"><')
            a = str(n)[beglink + 6:endlink]
            links.append(a)
        for n in l4:
            beglink = str(n).find('href="')
            endlink = str(n).find('"><')
            a = str(n)[beglink + 6:endlink]
            l5.append(a)
        for n in l5:
            a = n.split('/')
            chapters.append(a[3])
        return links, chapters  # links to download each fanfic; chapter counts for each fanfic
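
    def preparing_links_v2(self, browser):
        # Sketch of an alternative (my assumption, not the author's code):
        # read the href attribute directly instead of slicing str(tag),
        # which breaks if the tag's attribute order changes. Mirrors the
        # logic above: the link right after a 'stitle' link carries the
        # chapter information in the fourth path segment.
        links, chapters = [], []
        tags = browser.links()
        for i, tag in enumerate(tags):
            if 'stitle' in str(tag):
                links.append(tag.get('href'))
                chapters.append(tags[i + 1].get('href').split('/')[3])
        return links, chapters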

    def download_list(self, links, chapters, ran_giv=1, pr_link=True):
        # links and chapters are the product of preparing_links;
        # ran_giv is the number of links to download (use len(links) for all);
        # pr_link decides whether to print each link while downloading.
        data_sample = []
        for n in range(ran_giv):
            old_one = links[n].split('/')
            one_fan_fic = []

            for r in range(int(chapters[n])):
                new_one = ''
                old_one[3] = str(r)
                new_one += 'http://fanfiction.net'

                for i in range(len(old_one) - 1):  # was also named n, shadowing the outer loop variable
                    new_one += old_one[i]
                    new_one += '/'
                new_one += old_one[-1]

                if pr_link:
                    print(new_one)

                self.browser.open(new_one)  # was a bare browser, undefined inside the class
                soup = self.browser.get_current_page()
                page = soup.get_text()
                cleaned = self.cleaning(page)
                one_fan_fic.append(cleaned)
            data_sample.append(one_fan_fic)
        return data_sample

    def browse(self):
        self.browser.follow_link('movie')
        self.browser.follow_link('Star-Wars')
        print(self.browser.get_url())  # was commented out with a stray parenthesis ("something is off here")


# The 'following links' steps are redundant; you could just as well
# start directly in the desired fandom.

sample = Scrapper()
sample.browse()
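
# A minimal sketch of that shortcut (assumption: fanfiction.net keeps this
# URL layout), skipping the link-following entirely:
# sample.browser.open('http://fanfiction.net/movie/Star-Wars/')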

# Earlier, pre-class version of the same pipeline, kept for reference:
# browser = mechanicalsoup.StatefulBrowser(raise_on_404=True)
# response = browser.open('http://fanfiction.net')
# print(response)
#
# browser.follow_link('movie')
# print(browser.get_url())  # returns a URL
#
# browser.follow_link('Star-Wars')
# print(browser.get_url())
#
# filterURL = filtered_fan_fiction(str(browser.get_url()))
# browser.open(filterURL)
#
# links_and_chap = preparing_links(browser)
#
# links = links_and_chap[0]
# chapters = links_and_chap[1]
#
# bag = download_list(links, chapters, ran_giv=len(links))
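
# The CountVectorizer import at the top is never used in the paste; a short
# sketch of how the downloaded sample could feed it (my assumption about the
# intended use, not the author's code; depends on the commented-out `bag`
# pipeline above, so it is left commented as well):
# vectorizer = CountVectorizer()
# matrix = vectorizer.fit_transform([' '.join(fic[0]) for fic in bag])
# print(matrix.shape)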