lancernik

OtoScrapV7

Apr 27th, 2019
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 09:06:22 2019

@author: lancernik
"""

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import string
import re
from itertools import groupby
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

def simple_get(url):
    # Returns None if the data could not be downloaded
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    # Returns True if the response looks like HTML
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)

def log_error(e):
    print(e)

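# A minimal usage sketch of the helpers above (the URL is only an example):
# raw_html = simple_get('https://www.otomoto.pl/osobowe/opel/corsa/')
# if raw_html is not None:
#     page = BeautifulSoup(raw_html, 'html.parser')
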
def lastpage(page):
    # Returns the number of the last results page, taken from the pagination links
    lastpage = str(page.find_all(class_="page"))
    lastpage_all = [int(s) for s in re.findall(r'\b\d+\b', lastpage)]
    lastpage_out = lastpage_all[-1]
    return lastpage_out

def scrappy(page):  # Scrapes the data from a single results page

    datadict = {'Milage':[0],'Age':[0],'Price':[0],'Engine capacity':[0]}
    dataset = pd.DataFrame(data=datadict)

    # Gets the number of the last page
    lastpage = str(page.find_all(class_="page"))
    lastpage_all = [int(s) for s in re.findall(r'\b\d+\b',lastpage)]
    lastpage_out = lastpage_all[-1]

    # Scraping the mileage
    milage_from_page = ''.join(map(str,(page.find_all("li", {"data-code" : "mileage"}))))
    milage_from_page_nospace = milage_from_page.translate({ord(c): None for c in string.whitespace})
    milage_page_out = [int(''.join(i)) for is_digit, i in groupby(milage_from_page_nospace, str.isdigit) if is_digit]

    # Scraping the production year from the page
    age_from_page = str(page.find_all(class_="offer-item__params-item"))
    age_from_page_nospace = age_from_page.translate({ord(c): None for c in string.whitespace})
    age_from_page_out = [int(s) for s in re.findall(r'\b\d+\b',age_from_page_nospace)]

    # Scraping the prices from the page
    price_from_page = str(page.find_all(class_="offer-price__number"))
    price_from_page_nospace = price_from_page.translate({ord(c): None for c in string.whitespace})
    price_from_page_out = [int(s) for s in re.findall(r'\b\d+\b',price_from_page_nospace)]

    # Scraping the engine capacity; the lone 3 (presumably the digit from "cm3") is dropped
    capacity_from_page = ''.join(map(str,(page.find_all("li", {"data-code" : "engine_capacity"}))))
    capacity_from_page_nospace = capacity_from_page.translate({ord(c): None for c in string.whitespace})
    capacity_page_out1 = [int(''.join(i)) for is_digit, i in groupby(capacity_from_page_nospace, str.isdigit) if is_digit]
    capacity_page_out = [cap for cap in capacity_page_out1 if cap != 3]

    df = pd.DataFrame(
        {'Milage':milage_page_out,
         'Age': age_from_page_out,
         'Price': price_from_page_out,
         'Engine capacity':capacity_page_out})

    dataset = dataset.append(df,ignore_index=True)

    return dataset


def ScrapPage(marka,model,start,stop):  # Scrapes the given page range, then cleans the data
    datadict = {'Milage':[0],'Age':[0],'Price':[0]}
    dataset_out = pd.DataFrame(data=datadict)
    for i in range(start,stop):  # Eventually: 1, lastpage
        time.sleep(2)

        # The brand and model (e.g. opel and corsa) are substituted into the URL template
        url = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0&search%5Bfilter_float_engine_capacity%3Afrom%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}'.format(marka,model,i))
        page = BeautifulSoup(url, 'html.parser')
        scraped = scrappy(page)
        print(scraped)
        dataset_out = dataset_out.append(scraped, ignore_index=True)
        print(dataset_out)

    # Removing outliers (rows more than 2-3 standard deviations from the column mean)
    clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
    clear = clear.append(dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2])
    clear = clear.append(dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3])
    test = clear.index.unique()  # a row can be flagged by more than one rule

    dataset_out = dataset_out.drop(test, axis=0)

    return dataset_out


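# A hedged sketch tying the helpers together: lastpage() can supply the stop
# argument instead of a hard-coded page count (URL shortened for illustration):
# raw = simple_get('https://www.otomoto.pl/osobowe/opel/corsa/')
# first = BeautifulSoup(raw, 'html.parser')
# dataset_out = ScrapPage("opel", "corsa", 1, lastpage(first))
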
def ClearCarData(dataset_out):
    # Removes outliers from an already-scraped DataFrame
    clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
    clear = clear.append(dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2])
    clear = clear.append(dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3])
    test = clear.index.unique()

    dataset_out = dataset_out.drop(test, axis=0)
    return dataset_out

def LoadCarData(filename):
    # Loads a CSV created by ScrapPage and removes the outliers
    dataset_out = pd.read_csv('{}.csv'.format(filename)) #9-45
    clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
    clear = clear.append(dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2])
    clear = clear.append(dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3])
    test = clear.index.unique()

    dataset_out = dataset_out.drop(test, axis=0)
    return dataset_out

def regress(x,y):
    model = LinearRegression()
    model.fit(x,y)

    x_test = np.linspace(x.min(),400000)
    y_pred = model.predict(x_test[:,None])

    plt.scatter(x,y,s=2)
    plt.plot(x_test,y_pred,'r')
    plt.legend(['Regression', 'Data points'])
    plt.show()

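# A hedged sketch of a train/test evaluation for the same model, using the
# train_test_split import above; the 80/20 split ratio is an assumption:
# x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
# model = LinearRegression().fit(x_train, y_train)
# print(model.score(x_val, y_val))  # R^2 on the held-out 20%
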
def Plot1(x,y):
    # Evaluate a gaussian kde on a regular grid of nbins x nbins over data extents
    nbins=300
    k = gaussian_kde([x,y])
    xi, yi = np.mgrid[x.min():x.max():nbins*1j, y.min():y.max():nbins*1j]
    zi = k(np.vstack([xi.flatten(), yi.flatten()]))

    # Make the plot
    plt.pcolormesh(xi, yi, zi.reshape(xi.shape))
    plt.show()

    # Change color palette
    plt.pcolormesh(xi, yi, zi.reshape(xi.shape), cmap=plt.cm.Greens_r)
    plt.show()

def Plot2(x,y):
    # Make the plot
    plt.hexbin(x, y, gridsize=(15,15))
    plt.show()

    # We can control the size of the bins:
    plt.hexbin(x, y, gridsize=(150,150))
    plt.show()

def Plot3(x,y):
    sns.jointplot(x, y, kind='scatter')
    sns.jointplot(x, y, kind='hex')
    sns.jointplot(x, y, kind='kde')

    # Then you can pass arguments to each type:
    sns.jointplot(x, y, kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)

    # Custom the color
    sns.set(style="white", color_codes=True)
    sns.jointplot(x, y, kind='kde', color="skyblue", xlim=(-30000,300000))


# LoadCarData(filename): loads the data from a CSV file created by ScrapPage
# and additionally removes the outliers

# ClearCarData(dataset_out): removes outliers from an already-scraped DataFrame

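# A hedged usage sketch of the two helpers (the file name is only an example):
# dataset_out = LoadCarData("dataset1")      # load a CSV and clean it
# dataset_out = ClearCarData(dataset_out)    # or clean an already-loaded frame
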
# 1) Scrape the data

# Brand, model, start page, stop page
dataset_out = ScrapPage("opel" ,"corsa", 1 ,2)
dataset_out.to_csv('dataset1.csv', index=False)
dataset_out = pd.read_csv('dataset1.csv') #9-45


# 2) Load previously scraped data

#dataset_out = LoadCarData("dataset1")


# Various plots

x=dataset_out['Milage']
y=dataset_out['Age']
#
#
#Plot1(x,y)
#Plot2(x,y)
Plot3(x,y)



# Regression of mileage against age
#
#a=np.array(dataset_out['Milage'].tolist()).reshape(-1,1)
#b=np.array(dataset_out['Age'].tolist()).reshape(-1,1)
#regress(a,b)



# This will be useful for the voivodeship (region) pages
#for i, li in enumerate(page.select('li')):
#    print(i, li.text)