lancernik
Scrapppyyy-v3
Apr 25th, 2019

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 24 23:35:43 2019

@author: lancernik
"""


from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import string
import re
from itertools import groupby
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import seaborn as sns


def simple_get(url):
    # Returns None if the page could not be downloaded
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    # Returns True if the response looks like HTML
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)

def log_error(e):
    print(e)

def lastpage(page):
    # Returns the number of the last results page
    lastpage = str(page.find_all(class_="page"))
    lastpage_all = [int(s) for s in re.findall(r'\b\d+\b', lastpage)]
    lastpage_out = lastpage_all[-1]
    return lastpage_out

def scrappy(page):  # Scrapes the listings from a single results page

    # Get the number of the last page (same information as lastpage(), unused here)
    lastpage = str(page.find_all(class_="page"))
    lastpage_all = [int(s) for s in re.findall(r'\b\d+\b', lastpage)]
    lastpage_out = lastpage_all[-1]

    # Scrape the mileage
    milage_from_page = ''.join(map(str, page.find_all("li", {"data-code": "mileage"})))
    milage_from_page_nospace = milage_from_page.translate({ord(c): None for c in string.whitespace})
    milage_page_out = [int(''.join(i)) for is_digit, i in groupby(milage_from_page_nospace, str.isdigit) if is_digit]

    # Scrape the production year from the page
    age_from_page = str(page.find_all(class_="offer-item__params-item"))
    age_from_page_nospace = age_from_page.translate({ord(c): None for c in string.whitespace})
    age_from_page_out = [int(s) for s in re.findall(r'\b\d+\b', age_from_page_nospace)]

    # Scrape the prices from the page
    price_from_page = str(page.find_all(class_="offer-price__number"))
    price_from_page_nospace = price_from_page.translate({ord(c): None for c in string.whitespace})
    price_from_page_out = [int(s) for s in re.findall(r'\b\d+\b', price_from_page_nospace)]

    dataset = pd.DataFrame(
        {'Milage': milage_page_out,
         'Age': age_from_page_out,
         'Price': price_from_page_out})

    return dataset


def ScrapPage(marka, model, start, stop):  # Scrapes the given page range and removes outliers
    pages = []
    for i in range(start, stop):  # eventually 1 to lastpage (see the sketch after this function)
        time.sleep(2)

        # marka and model are substituted into the URL, e.g. "opel" and "corsa"
        raw_html = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?page={}'.format(marka, model, i))
        if raw_html is None:
            continue
        page = BeautifulSoup(raw_html, 'html.parser')
        page_data = scrappy(page)
        print(page_data)
        pages.append(page_data)

    dataset_out = pd.concat(pages, ignore_index=True)
    print(dataset_out)

    # Remove the outliers
    clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
    clear = pd.concat([clear, dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2]])
    clear = pd.concat([clear, dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3]])
    dataset_out = dataset_out.drop(index=clear.index.unique())

    return dataset_out


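# A minimal sketch (not part of the original script) of how lastpage() could drive
# ScrapPage over the full result set; ScrapAllPages is a hypothetical name and it
# reuses the same URL pattern that ScrapPage already builds.
def ScrapAllPages(marka, model):
    raw_html = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?page=1'.format(marka, model))
    first_page = BeautifulSoup(raw_html, 'html.parser')
    last = lastpage(first_page)                   # highest page number shown in the pager
    return ScrapPage(marka, model, 1, last + 1)   # range() excludes the stop value

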
def ClearCarData():
    # Removes outliers; works only on the module-level DataFrame named dataset_out
    global dataset_out
    clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
    clear = pd.concat([clear, dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2]])
    clear = pd.concat([clear, dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3]])
    dataset_out = dataset_out.drop(index=clear.index.unique())
    return dataset_out

def LoadCarData(filename):
    # Loads the CSV created by ScrapPage and removes outliers
    dataset_out = pd.read_csv('{}.csv'.format(filename)) #9-45
    clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
    clear = pd.concat([clear, dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2]])
    clear = pd.concat([clear, dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3]])
    dataset_out = dataset_out.drop(index=clear.index.unique())
    return dataset_out

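# The same z-score filter is repeated in ScrapPage, ClearCarData and LoadCarData.
# A minimal sketch of a shared helper (hypothetical name remove_outliers) with the
# same thresholds: 2 standard deviations for Milage and Age, 3 for Price.
def remove_outliers(df):
    outliers = pd.concat([
        df.Milage[((df.Milage - df.Milage.mean()) / df.Milage.std()).abs() > 2],
        df.Age[((df.Age - df.Age.mean()) / df.Age.std()).abs() > 2],
        df.Price[((df.Price - df.Price.mean()) / df.Price.std()).abs() > 3]])
    return df.drop(index=outliers.index.unique())
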
def Plot1(x, y):
    # Evaluate a gaussian kde on a regular grid of nbins x nbins over data extents
    nbins = 300
    k = gaussian_kde([x, y])
    xi, yi = np.mgrid[x.min():x.max():nbins*1j, y.min():y.max():nbins*1j]
    zi = k(np.vstack([xi.flatten(), yi.flatten()]))

    # Make the plot
    plt.pcolormesh(xi, yi, zi.reshape(xi.shape))
    plt.show()

    # Change the color palette
    plt.pcolormesh(xi, yi, zi.reshape(xi.shape), cmap=plt.cm.Greens_r)
    plt.show()

def Plot2(x, y):
    # Make the plot
    plt.hexbin(x, y, gridsize=(15, 15))
    plt.show()

    # We can control the size of the bins:
    plt.hexbin(x, y, gridsize=(150, 150))
    plt.show()

def Plot3(x, y):
    sns.jointplot(x=x, y=y, kind='scatter')
    sns.jointplot(x=x, y=y, kind='hex')
    sns.jointplot(x=x, y=y, kind='kde')

    # Then you can pass arguments to each type:
    sns.jointplot(x=x, y=y, kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)

    # Customize the color
    sns.set(style="white", color_codes=True)
    sns.jointplot(x=x, y=y, kind='kde', color="skyblue")


# LoadCarData(filename): loads the data from the CSV file created by ScrapPage
# and additionally removes outliers

# ClearCarData(): removes outliers; works only on the global DataFrame named dataset_out


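# A short usage sketch (assumed workflow, not part of the original script): read a
# previously saved CSV directly and then strip outliers from the global dataset_out
# with ClearCarData().
#
#   dataset_out = pd.read_csv('dataset1.csv')
#   dataset_out = ClearCarData()
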
# Make, model, start, stop
#dataset_out = ScrapPage("opel", "corsa", 1, 3)
#dataset_out.to_csv('dataset1.csv')


#dataset_out = pd.read_csv('dataset1.csv') #9-45

dataset_out = LoadCarData("dataset1")


x = dataset_out['Price']
y = dataset_out['Milage']


#Plot1(x, y)
#Plot2(x, y)
#Plot3(x, y)


# This will be useful for the voivodeship (region) data later
#for i, li in enumerate(page.select('li')):
#    print(i, li.text)