Advertisement
lancernik

GetData

Apr 24th, 2019
127
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.47 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Wed Apr 24 23:35:43 2019
  4.  
  5. @author: lancernik
  6. """
  7.  
  8.  
  9. from requests import get
  10. from requests.exceptions import RequestException
  11. from contextlib import closing
  12. from bs4 import BeautifulSoup
  13. import string
  14. import re
  15. from itertools import groupby
  16. import pandas as pd
  17. import time
  18. import matplotlib.pyplot as plt
  19.  
  20. def simple_get(url):
  21. """
  22. Attempts to get the content at `url` by making an HTTP GET request.
  23. If the content-type of response is some kind of HTML/XML, return the
  24. text content, otherwise return None.
  25. """
  26. try:
  27. with closing(get(url, stream=True)) as resp:
  28. if is_good_response(resp):
  29. return resp.content
  30. else:
  31. return None
  32.  
  33. except RequestException as e:
  34. log_error('Error during requests to {0} : {1}'.format(url, str(e)))
  35. return None
  36. def is_good_response(resp):
  37. """
  38. Returns True if the response seems to be HTML, False otherwise.
  39. """
  40. content_type = resp.headers['Content-Type'].lower()
  41. return (resp.status_code == 200
  42. and content_type is not None
  43. and content_type.find('html') > -1)
  44.  
  45. def log_error(e):
  46. """
  47. It is always a good idea to log errors.
  48. This function just prints them, but you can
  49. make it do anything.
  50. """
  51. print(e)
  52.  
  53. def lastpage(page):
  54. lastepage_out=0
  55. lastpage = str(page.find_all(class_="page"))
  56. lastpage_all = [int(s) for s in re.findall(r'\b\d+\b',lastpage)]
  57. lastpage_out = lastpage_all[-1]
  58. return lastepage_out
  59.  
  60. def scrappy(page):
  61.  
  62. datadict = {'Milage':[0],'Age':[0],'Price':[0]}
  63. dataset = pd.DataFrame(data=datadict)
  64.  
  65.  
  66. #Zdobywa numer ostatniej strony
  67.  
  68. lastpage = str(page.find_all(class_="page"))
  69. lastpage_all = [int(s) for s in re.findall(r'\b\d+\b',lastpage)]
  70. lastpage_out = lastpage_all[-1]
  71.  
  72.  
  73.  
  74. #Scrapowanie przebiegu
  75.  
  76. milage_from_page = ''.join(map(str,(page.find_all("li", {"data-code" : "mileage"}))))
  77. milage_from_page_nospace = milage_from_page.translate({ord(c): None for c in string.whitespace})
  78. milage_page_out = [int(''.join(i)) for is_digit, i in groupby(milage_from_page_nospace, str.isdigit) if is_digit]
  79.  
  80. #Scrapowanie roku z danej strony
  81.  
  82. age_from_page = str(page.find_all(class_="offer-item__params-item"))
  83. age_from_page_nospace = age_from_page.translate({ord(c): None for c in string.whitespace})
  84. age_from_page_out = [int(s) for s in re.findall(r'\b\d+\b',age_from_page_nospace)]
  85.  
  86. # Scrapowanie cen z danej strony
  87.  
  88. price_from_page = str(page.find_all(class_="offer-price__number"))
  89. price_from_page_nospace = price_from_page.translate({ord(c): None for c in string.whitespace})
  90. price_from_page_out = [int(s) for s in re.findall(r'\b\d+\b',price_from_page_nospace)]
  91.  
  92. df = pd.DataFrame(
  93. {'Milage':milage_page_out,
  94. 'Age': age_from_page_out,
  95. 'Price': price_from_page_out})
  96.  
  97. dataset = dataset.append(df,ignore_index=True)
  98.  
  99. return dataset
  100.  
  101.  
  102.  
  103. #
  104. #for i, li in enumerate(page.select('li')): #To się przyda do wojedzowtw
  105. # print(i, li.text)
  106.  
  107.  
  108.  
  109. # GET DATA 1
  110.  
  111.  
  112. #lastpage = str(page.find_all(class_="page"))
  113. #lastpage_all = [int(s) for s in re.findall(r'\b\d+\b',lastpage)]
  114. #lastpage_out = lastpage_all[-1]
  115.  
  116.  
  117. #datadict = {'Milage':[0],'Age':[0],'Price':[0]}
  118. #dataset_out = pd.DataFrame(data=datadict)
  119. #for i in range(9,45):
  120. # time.sleep(2)
  121. # url = simple_get('https://www.otomoto.pl/osobowe/opel/corsa/?page={}'.format(i))
  122. # page = BeautifulSoup(url, 'html.parser')
  123. # print(scrappy(page))
  124. # dataset_out = dataset_out.append(scrappy(page), ignore_index=True)
  125. # print(dataset_out)
  126. #
  127. #
  128.  
  129. #dataset_out.to_csv('dataset1.csv')
  130.  
  131.  
  132. # GET DATA 2
  133.  
  134. dataset_out = pd.read_csv('dataset1.csv') #9-45
  135.  
  136.  
  137.  
  138.  
  139. #Usuwanie danych odstających
  140. clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
  141. clear = clear.append(dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2])
  142. clear = clear.append(dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3])
  143. test = clear.index.get_values()
  144.  
  145. for i in range(0,len(test)):
  146. dataset_out = dataset_out.drop(test[i],axis=0)
  147.  
  148.  
  149.  
  150.  
  151.  
  152. plt.scatter(x='Milage',y='Price', data=dataset_out ,marker="*")
  153. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement