Advertisement
lancernik

OtoMotoReg

May 5th, 2019
115
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.09 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Thu Apr 25 09:06:22 2019
  4.  
  5. @author: lancernik
  6. """
  7.  
  8. # -*- coding: utf-8 -*-
  9. """
  10. Created on Wed Apr 24 23:35:43 2019
  11.  
  12. @author: lancernik
  13. """
  14.  
  15.  
  16. from requests import get
  17. from requests.exceptions import RequestException
  18. from contextlib import closing
  19. from bs4 import BeautifulSoup
  20. import string
  21. import re
  22. from itertools import groupby
  23. import pandas as pd
  24. import time
  25. import matplotlib.pyplot as plt
  26. import numpy as np
  27. from scipy.stats import kde
  28. import seaborn as sns
  29. from sklearn.linear_model import LinearRegression
  30. from sklearn.model_selection import train_test_split
  31. from sklearn.metrics import mean_squared_error
  32. from sklearn.metrics import mean_absolute_error
  33. import sklearn.preprocessing
  34.  
  35.  
  36.  
  37. def simple_get(url):
  38. #Zwraca none, w przypadku problemu z pobraniem danych
  39. try:
  40. with closing(get(url, stream=True)) as resp:
  41. if is_good_response(resp):
  42. return resp.content
  43. else:
  44. return None
  45.  
  46. except RequestException as e:
  47. log_error('Error during requests to {0} : {1}'.format(url, str(e)))
  48. return None
  49.  
  50. def is_good_response(resp):
  51. #Zwaraca True, jeżeli HTMl
  52. content_type = resp.headers['Content-Type'].lower()
  53. return (resp.status_code == 200
  54. and content_type is not None
  55. and content_type.find('html') > -1)
  56.  
  57. def log_error(e):
  58. print(e)
  59.  
  60. def lastpage(page):
  61. lastepage_out=0
  62. lastpage = str(page.find_all(class_="page"))
  63. lastpage_all = [int(s) for s in re.findall(r'\b\d+\b',lastpage)]
  64. lastpage_out = lastpage_all[-1]
  65. return lastepage_out
  66.  
  67. def LoadCarData(filename):
  68.  
  69. #Wczytuje plik do dataframe
  70.  
  71. dataset_out = pd.read_csv('{}.csv'.format(filename))
  72. return dataset_out
  73.  
  74. def scrappy(page,marka,model): #Pobiera dane z konretnej strony
  75.  
  76. datadict = {'Marka':'Marka','Model':'Model','Milage':[0],'Age':[0],'Price':[0],'Engine capacity':[0],'Fuel type':[0]}
  77. dataset = pd.DataFrame(data=datadict)
  78.  
  79.  
  80. #Zdobywa numer ostatniej strony
  81.  
  82. lastpage = str(page.find_all(class_="page"))
  83. lastpage_all = [int(s) for s in re.findall(r'\b\d+\b',lastpage)]
  84. lastpage_out = lastpage_all[-1]
  85.  
  86. #Scrapowanie przebiegu
  87.  
  88. milage_from_page = ''.join(map(str,(page.find_all("li", {"data-code" : "mileage"}))))
  89. milage_from_page_nospace = milage_from_page.translate({ord(c): None for c in string.whitespace})
  90. milage_page_out = [int(''.join(i)) for is_digit, i in groupby(milage_from_page_nospace, str.isdigit) if is_digit]
  91.  
  92. #Scrapowanie roku z danej strony
  93.  
  94. age_from_page = str(page.find_all(class_="offer-item__params-item"))
  95. age_from_page_nospace = age_from_page.translate({ord(c): None for c in string.whitespace})
  96. age_from_page_out = [int(s) for s in re.findall(r'\b\d+\b',age_from_page_nospace)]
  97.  
  98. # Scrapowanie cen z danej strony
  99.  
  100. price_from_page = str(page.find_all(class_="offer-price__number"))
  101. price_from_page_nospace = price_from_page.translate({ord(c): None for c in string.whitespace})
  102. price_from_page_out = [int(s) for s in re.findall(r'\b\d+\b',price_from_page_nospace)]
  103.  
  104. # Scrapowanie pojemnosci silnika
  105.  
  106. capacity_from_page = ''.join(map(str,(page.find_all("li", {"data-code" : "engine_capacity"}))))
  107. capacity_from_page_nospace = capacity_from_page.translate({ord(c): None for c in string.whitespace})
  108. capacity_page_out1 = [int(''.join(i)) for is_digit, i in groupby(capacity_from_page_nospace, str.isdigit) if is_digit]
  109. capacity_page_out = [cap for cap in capacity_page_out1 if cap !=3]
  110.  
  111. # Scrapowanie rodaju paliwa
  112.  
  113. fueltype_from_page = ''.join(map(str,(page.find_all("li", {"data-code" : "fuel_type"}))))
  114. fueltype_from_page_nospace = fueltype_from_page.translate({ord(c): None for c in string.whitespace})
  115. fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Benzyna","1")
  116. fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Diesel","2")
  117. fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Benzyna+LPG","3")
  118. fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Elektryczny","4")
  119. fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Hybryda","5")
  120. fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Etanol","6")
  121. fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Benzyna+CNG ","6")
  122. fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Wodór ","6")
  123. fueltype_from_page_out = [int(s) for s in re.findall(r'\b\d+\b',fueltype_from_page_nospace)]
  124.  
  125.  
  126.  
  127. marka_out=["{}".format(marka)] * len(age_from_page_out)
  128. model_out=["{}".format(model)] * len(age_from_page_out)
  129.  
  130.  
  131. if len(milage_page_out) == len(age_from_page_out) == len(price_from_page_out) == len(capacity_page_out) ==len(model_out) == len(marka_out) ==len(fueltype_from_page_out):
  132. df = pd.DataFrame(
  133. {'Milage':milage_page_out,
  134. 'Age': age_from_page_out,
  135. 'Price': price_from_page_out,
  136. 'Engine capacity':capacity_page_out,
  137. 'Fuel type':fueltype_from_page_out,
  138. 'Marka':marka_out,
  139. 'Model':model_out})
  140.  
  141. dataset = dataset.append(df,ignore_index=True)
  142.  
  143. # dataset = dataset['Marka', 'Model','Age', 'Engine capacity', 'Fuel type', 'Milage', 'Price']
  144. return dataset
  145.  
  146.  
  147. def ScrapPage(marka,model,start,stop): #Oczyszcza dane, wyznacza zares stron
  148. datadict = {'Milage':[0],'Age':[0],'Price':[0]}
  149. dataset_out = pd.DataFrame(data=datadict)
  150.  
  151. #Zdobywa ostatnia strone
  152. url1 = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0&search%5Bfilter_float_engine_capacity%3Afrom%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page=1'.format(marka,model))
  153. # url1 = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_enum_damaged%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page=1'.format(marka,model))
  154. page1 = BeautifulSoup(url1, 'html.parser')
  155. lastpage = str(page1.find_all(class_="page"))
  156. lastpage_all = [int(s) for s in re.findall(r'\b\d+\b',lastpage)]
  157. lastpage_out = lastpage_all[-1]
  158.  
  159. for i in range(start,33): #Docelowo 1, lastpage_out
  160. time.sleep(2)
  161.  
  162. #To w formacie beda kolejne argumenty, tj za opel i corsa
  163. url = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0&search%5Bfilter_float_engine_capacity%3Afrom%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}'.format(marka,model,i))
  164. # url = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_enum_damaged%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}'.format(marka,model,i))
  165. # FORCE SCRAP
  166. # url = simple_get('https://www.otomoto.pl/osobowe/lexus/is/ii-2005-2012/?search%5Bfilter_float_engine_capacity%3Afrom%5D=2450&search%5Bfilter_float_engine_capacity%3Ato%5D=2550&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=')
  167. page = BeautifulSoup(url, 'html.parser')
  168. # print(scrappy(page))
  169. dataset_out = dataset_out.append(scrappy(page,marka,model), ignore_index=True)
  170. print(dataset_out)
  171. print(i)
  172. dataset_out.to_csv('{}-{}.csv'.format(marka,model))
  173. return dataset_out
  174.  
  175.  
  176. def ClearCarData(dataset_out):
  177.  
  178.  
  179. #Ustawia minimalny wiek samochodu
  180.  
  181. dataset_out = dataset_out[dataset_out['Age'] > 1980]
  182.  
  183.  
  184.  
  185. #Wybiera listę indexow ktore maja byc usuniete
  186.  
  187. clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 4]
  188. clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 4]
  189. clear = clear.append(dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 4])
  190. clear = clear.append(dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 4])
  191. test1 = clear.index.get_values()
  192.  
  193. #Usuwa duplikaty z listy indexów do usunięcia
  194.  
  195. test = []
  196. for i in test1:
  197. if i not in test:
  198. test.append(i)
  199.  
  200. #Usuwa z dataframu wybrane indexy
  201.  
  202. for i in range(0,len(test)):
  203. dataset_out = dataset_out.drop(test[i],axis=0)
  204.  
  205. return dataset_out
  206.  
  207.  
  208. def regress(x,y): #Najprostsza regresja
  209. model = LinearRegression()
  210. model.fit(x,y)
  211. model.predict([[100]])
  212.  
  213. x_test = np.linspace(0,max(x))
  214. y_pred = model.predict(x_test[:,None])
  215.  
  216. plt.scatter(x,y,s=2)
  217. plt.plot(x_test,y_pred,'r')
  218. plt.legend(['Regresja', 'Kropeczki'])
  219. plt.show()
  220. print("Dopasowanie regresji: {}".format(model.score(x,y)))
  221.  
  222.  
  223.  
  224.  
  225.  
  226. def regress2(x_train, x_test, y_train, y_test): #Model sample
  227.  
  228. r = LinearRegression()
  229. r.fit(x_train,y_train)
  230. y_train_pred = r.predict(x_train)
  231. y_test_pred = r.predict(x_test)
  232.  
  233. mse = mean_squared_error
  234. mae = mean_absolute_error
  235.  
  236. return{
  237. "r_score": r.score(x_train,y_train), #??????????
  238. "MSE_u": mse(y_train,y_train_pred),
  239. "MSE_t": mse(y_test,y_test_pred),
  240. "MSA_u": mae(y_train,y_train_pred),
  241. "MSA_t": mae(y_test,y_test_pred)}
  242.  
def regress3(x, y):  # Polynomial model
    """Fit a degree-2 polynomial regression and greedily select features.

    Splits (x, y) 80/20, expands the features to quadratic terms, scores
    the full polynomial model, then reruns with only the columns chosen
    by BIC-based forward selection, printing a comparison table.

    NOTE(review): appends to the module-level lists `params` and `res`
    defined in the script section of this file, so repeated calls
    accumulate rows; returns None (the caller's print shows "None").
    """
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state=12345)
    reg3 = sklearn.preprocessing.PolynomialFeatures(degree=2,include_bias = False)
    # Expand both splits to quadratic features.
    # NOTE(review): fit_transform on the test split re-fits the
    # transformer; transform() would be the conventional call, though
    # PolynomialFeatures with a fixed degree yields the same columns.
    x2_train = reg3.fit_transform(x_train)
    x2_test = reg3.fit_transform(x_test)
    params.append("zm. wielom")
    res.append(regress2(x2_train, x2_test, y_train, y_test))
    print(pd.DataFrame(res,index=params))

    # Minimise the number of variables via greedy forward selection.
    wybrane = pd.DataFrame(forward_selection(x2_train, y_train),columns =["Zmienna", "BIC"])
    wybrane_zmienne = wybrane["Zmienna"].tolist()
    print(wybrane_zmienne)
    params.append("zm.wybrane")
    # Re-score using only the selected feature columns.
    res.append(regress2(x2_train[:,wybrane_zmienne],x2_test[:,wybrane_zmienne],y_train, y_test))
    wyniki = pd.DataFrame(res, index=params)
    print(wyniki)
  267.  
  268. # Bayesian information criterion
  269. #, minimalizujemy BIC(MSE,p,n) = n log(MSE) + p log(n)
  270. #Mniejsza liczba wspolczynnikow modelu -> MNiejsza szansa na przeuczenie
  271.  
  272.  
  273. def BIC(mse,p,n):
  274. return n*np.log(mse) + p*np.log(n)
  275.  
def forward_selection(x, y):
    """Greedy forward feature selection minimising BIC.

    `x` is an (n, m) feature matrix, `y` the target.  Starting from an
    empty model, repeatedly adds the single remaining column whose
    addition yields the lowest BIC, stopping as soon as the best
    candidate no longer improves on the current model.  Prints and
    returns a list of (column_index, bic) tuples in selection order.
    """
    n,m = x.shape
    best_idx =[]               # columns selected so far, in order
    best_free = set(range(m))  # candidate columns not yet selected
    best_fit = np.inf          # BIC of the current best model
    res = []
    for i in range(0,m):
        cur_idx = -1
        cur_fit = np.inf
        # Try extending the current model with each remaining column,
        # keeping the candidate with the lowest BIC.
        for e in best_free:
            r = LinearRegression()
            test_idx = best_idx+[e]
            r.fit(x[:,test_idx],y)
            # p = i+2 parameters: the i+1 coefficients plus the intercept.
            test_fit = BIC(mean_squared_error(y,r.predict(x[:,test_idx])),i+2,n)
            if test_fit < cur_fit: cur_idx, cur_fit = e, test_fit
        # Stop once adding another column no longer improves BIC.
        if cur_fit > best_fit: break
        best_idx, best_fit = best_idx + [cur_idx], cur_fit
        best_free.discard(cur_idx)
        res.append((cur_idx, cur_fit))

    print(res)
    return res
  298.  
  299.  
  300.  
  301.  
  302. def Plot1(x,y):
  303.  
  304. # Evaluate a gaussian kde on a regular grid of nbins x nbins over data extents
  305. nbins=300
  306. k = kde.gaussian_kde([x,y])
  307. xi, yi = np.mgrid[x.min():x.max():nbins*1j, y.min():y.max():nbins*1j]
  308. zi = k(np.vstack([xi.flatten(), yi.flatten()]))
  309.  
  310. # Make the plot
  311. plt.pcolormesh(xi, yi, zi.reshape(xi.shape))
  312. plt.show()
  313.  
  314. # Change color palette
  315. plt.pcolormesh(xi, yi, zi.reshape(xi.shape), cmap=plt.cm.Greens_r)
  316. plt.show()
  317. def Plot2(x,y):
  318. # Make the plot
  319. plt.hexbin(x, y, gridsize=(15,15) )
  320. plt.show()
  321.  
  322. # We can control the size of the bins:
  323. plt.hexbin(x, y, gridsize=(150,150) )
  324. plt.show()
  325. def Plot3(x,y):
  326. sns.jointplot(x, y, kind='scatter')
  327. sns.jointplot(x, y, kind='hex')
  328. sns.jointplot(x, y, kind='kde')
  329.  
  330. # Then you can pass arguments to each type:
  331. sns.jointplot(x, y, kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)
  332.  
  333. # Custom the color
  334. sns.set(style="white", color_codes=True)
  335. sns.jointplot(x, y, kind='kde', color="skyblue",xlim={-30000,300000})
  336.  
  337.  
  338.  
  339.  
  340.  
  341.  
  342.  
  343. #LoadCarData(filename): Wczytuje dane z pliku CSV stworzonego przez funkcje ScrapPage,
  344. #dodatkowo oczyszcza z danych odstajcych
  345.  
  346. #ClearCarData(): Oczyszcza z danych odstajacych, zdiala tylko dla df o nazwie dataset_out
  347.  
  348.  
  349.  
  350.  
# 1) Scrape the data (disabled — re-enable to refresh the CSV).
# Arguments: make, model, start page, stop page.
#dataset_out = ScrapPage("opel" ,"corsa", 1 ,33)
#dataset_out.to_csv('datasetvv40.csv')
#dataset_out = pd.read_csv('datasetgolf.csv') #9-45


# 2) Load previously scraped data and drop outlier rows.

dataset_out = LoadCarData("opel-corsa")
dataset_out = ClearCarData(dataset_out)


# Assorted density plots (disabled).

#x=dataset_out['Milage']
#y=dataset_out['Age']

#Plot1(x,y)
#Plot2(x,y)
#Plot3(x,y)


# Correlation search (disabled): print column pairs with |r| > 0.3.

#c = dataset_out.corr("pearson")
#c = c.where(
#        np.triu(
#            np.ones(c.shape,dtype=np.bool),k=1)
#        ).stack().sort_values()
#print(c[abs(c)>0.3])


# Regression of mileage against production year.
# Rows with near-zero mileage are treated as bad data and dropped first.

dataset_out = dataset_out[dataset_out['Milage'] > 100]
a=np.array(dataset_out['Milage'].tolist()).reshape(-1,1)
b=np.array(dataset_out['Age'].tolist()).reshape(-1,1)
regress(a,b)


# Train/test-split regression ("the better one").
# NOTE(review): `params` and `res` are module-level accumulators that
# regress3() below also appends to.

params =['zm. liniowe']
x_train, x_test, y_train, y_test = train_test_split(a,b,test_size = 0.2, random_state=12345)
res = [regress2(x_train, x_test, y_train, y_test)]
reg2 = pd.DataFrame(res,index=params)
print(reg2)

# NOTE(review): regress3 prints its own tables and returns None,
# so this line also prints "None".
print(regress3(a,b))
  415.  
  416.  
  417.  
  418.  
  419.  
  420.  
  421.  
  422.  
  423.  
  424.  
  425.  
  426. #To sie przyda do wojewodztw
  427. #for i, li in enumerate(page.select('li')): #To się przyda do wojedzowtw
  428. # print(i, li.text)
  429.  
  430.  
  431.  
  432.  
  433.  
  434.  
  435.  
  436.  
  437.  
  438.  
  439.  
  440.  
  441.  
  442.  
  443.  
  444.  
  445.  
  446.  
  447.  
  448.  
  449.  
  450. #
  451. #def LoadCarData(filename):
  452. #
  453. # #Wczytuje plik do dataframe
  454. #
  455. # dataset_out = pd.read_csv('{}.csv'.format(filename))
  456. #
  457. #
  458. # #Wybiera listę indexow ktore maja byc usuniete
  459. #
  460. # clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
  461. # clear = clear.append(dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2])
  462. # clear = clear.append(dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3])
  463. # test1 = clear.index.get_values()
  464. #
  465. # #Usuwa duplikaty z listy indexów do usunięcia
  466. #
  467. # test = []
  468. # for i in test1:
  469. # if i not in test:
  470. # test.append(i)
  471. #
  472. # #Usuwa z dataframu wybrane indexy
  473. #
  474. # for i in range(0,len(test)):
  475. # dataset_out = dataset_out.drop(test[i],axis=0)
  476. #
  477. # return dataset_out
  478. #
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement