# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 09:06:22 2019

@author: lancernik
"""

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import string
import re
from itertools import groupby
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import kde
import seaborn as sns
from sklearn.linear_model import LinearRegression

def simple_get(url):
    # Returns None if the data cannot be fetched
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    # Returns True if the response looks like HTML
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)

def log_error(e):
    print(e)

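# Example usage (a sketch; any listing URL from the site works the same way):
# raw_html = simple_get('https://www.otomoto.pl/osobowe/')
# if raw_html is not None:
#     page = BeautifulSoup(raw_html, 'html.parser')
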
def lastpage(page):
    # Finds the number of the last pagination page
    lastpage = str(page.find_all(class_="page"))
    lastpage_all = [int(s) for s in re.findall(r'\b\d+\b', lastpage)]
    lastpage_out = lastpage_all[-1]
    return lastpage_out

def LoadCarData(filename):

    # Loads the CSV file into a dataframe

    dataset_out = pd.read_csv('{}.csv'.format(filename))
    return dataset_out

def scrappy(page, marka, model):  # Scrapes the data from one results page

    datadict = {'Marka':'Marka','Model':'Model','Milage':[0],'Age':[0],'Price':[0],'Engine capacity':[0],'Fuel type':[0]}
    dataset = pd.DataFrame(data=datadict)

    # Gets the number of the last page (not used further in this function)

    lastpage = str(page.find_all(class_="page"))
    lastpage_all = [int(s) for s in re.findall(r'\b\d+\b', lastpage)]
    lastpage_out = lastpage_all[-1]

    # Scraping the mileage

    milage_from_page = ''.join(map(str, (page.find_all("li", {"data-code": "mileage"}))))
    milage_from_page_nospace = milage_from_page.translate({ord(c): None for c in string.whitespace})
    milage_page_out = [int(''.join(i)) for is_digit, i in groupby(milage_from_page_nospace, str.isdigit) if is_digit]

    # Scraping the production year from the page

    age_from_page = str(page.find_all(class_="offer-item__params-item"))
    age_from_page_nospace = age_from_page.translate({ord(c): None for c in string.whitespace})
    age_from_page_out = [int(s) for s in re.findall(r'\b\d+\b', age_from_page_nospace)]

    # Scraping the prices from the page

    price_from_page = str(page.find_all(class_="offer-price__number"))
    price_from_page_nospace = price_from_page.translate({ord(c): None for c in string.whitespace})
    price_from_page_out = [int(s) for s in re.findall(r'\b\d+\b', price_from_page_nospace)]

    # Scraping the engine capacity

    capacity_from_page = ''.join(map(str, (page.find_all("li", {"data-code": "engine_capacity"}))))
    capacity_from_page_nospace = capacity_from_page.translate({ord(c): None for c in string.whitespace})
    capacity_page_out1 = [int(''.join(i)) for is_digit, i in groupby(capacity_from_page_nospace, str.isdigit) if is_digit]
    capacity_page_out = [cap for cap in capacity_page_out1 if cap != 3]  # drop the stray 3 parsed out of the 'cm3' unit

    # Scraping the fuel type. Compound labels must be replaced before their
    # prefixes ("Benzyna+LPG" before "Benzyna"), otherwise they never match.

    fueltype_from_page = ''.join(map(str, (page.find_all("li", {"data-code": "fuel_type"}))))
    fueltype_from_page_nospace = fueltype_from_page.translate({ord(c): None for c in string.whitespace})
    fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Benzyna+LPG", "3")
    fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Benzyna+CNG", "6")
    fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Benzyna", "1")
    fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Diesel", "2")
    fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Elektryczny", "4")
    fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Hybryda", "5")
    fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Etanol", "6")
    fueltype_from_page_nospace = fueltype_from_page_nospace.replace("Wodór", "6")
    fueltype_from_page_out = [int(s) for s in re.findall(r'\b\d+\b', fueltype_from_page_nospace)]

    marka_out = ["{}".format(marka)] * len(age_from_page_out)
    model_out = ["{}".format(model)] * len(age_from_page_out)

    if len(milage_page_out) == len(age_from_page_out) == len(price_from_page_out) == len(capacity_page_out) == len(model_out) == len(marka_out) == len(fueltype_from_page_out):
        df = pd.DataFrame(
            {'Milage': milage_page_out,
             'Age': age_from_page_out,
             'Price': price_from_page_out,
             'Engine capacity': capacity_page_out,
             'Fuel type': fueltype_from_page_out,
             'Marka': marka_out,
             'Model': model_out})

        dataset = dataset.append(df, ignore_index=True)

    # dataset = dataset['Marka', 'Model','Age', 'Engine capacity', 'Fuel type', 'Milage', 'Price']
    return dataset

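# The chained replace() calls above are order-sensitive. A dict lookup is a
# sketch of a less fragile alternative (FUEL_CODES and encode_fuel are
# hypothetical helpers, not used elsewhere in this script):
FUEL_CODES = {'Benzyna+LPG': 3, 'Benzyna+CNG': 6, 'Benzyna': 1, 'Diesel': 2,
              'Elektryczny': 4, 'Hybryda': 5, 'Etanol': 6, 'Wodór': 6}

def encode_fuel(name):
    # Returns the numeric code for a fuel-type label, or 0 if unknown
    return FUEL_CODES.get(name, 0)
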
def ScrapPage(marka, model, start, stop):  # Cleans the data, determines the page range
    datadict = {'Milage':[0],'Age':[0],'Price':[0]}
    dataset_out = pd.DataFrame(data=datadict)

    # Gets the last page (note: the stop argument is currently unused,
    # the loop always runs up to the last page)
    url1 = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0&search%5Bfilter_float_engine_capacity%3Afrom%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page=1'.format(marka,model))
    # url1 = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_enum_damaged%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page=1'.format(marka,model))
    page1 = BeautifulSoup(url1, 'html.parser')
    lastpage = str(page1.find_all(class_="page"))
    lastpage_all = [int(s) for s in re.findall(r'\b\d+\b', lastpage)]
    lastpage_out = lastpage_all[-1]

    for i in range(start, lastpage_out):  # Eventually: 1, lastpage
        time.sleep(2)

        # The format placeholders take the brand and model, e.g. opel and corsa
        url = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0&search%5Bfilter_float_engine_capacity%3Afrom%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}'.format(marka,model,i))
        # url = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_enum_damaged%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}'.format(marka,model,i))
        # FORCE SCRAP
        # url = simple_get('https://www.otomoto.pl/osobowe/lexus/is/ii-2005-2012/?search%5Bfilter_float_engine_capacity%3Afrom%5D=2450&search%5Bfilter_float_engine_capacity%3Ato%5D=2550&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=')
        page = BeautifulSoup(url, 'html.parser')
        # print(scrappy(page))
        dataset_out = dataset_out.append(scrappy(page, marka, model), ignore_index=True)
        print(dataset_out)
        print(i)
    dataset_out.to_csv('{}-{}.csv'.format(marka, model))
    return dataset_out

def ClearCarData(dataset_out):

    # Builds the list of indices to be dropped: rows more than 2 (or 3 for
    # price) standard deviations away from the column mean, i.e. outliers

    clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
    clear = clear.append(dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2])
    clear = clear.append(dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3])
    test1 = clear.index.values

    # Removes duplicates from the list of indices to drop

    test = []
    for i in test1:
        if i not in test:
            test.append(i)

    # Drops the selected indices from the dataframe

    for i in range(0, len(test)):
        dataset_out = dataset_out.drop(test[i], axis=0)

    return dataset_out

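# The dedup-and-drop loops in ClearCarData can be collapsed; a sketch of an
# equivalent filter (ClearCarData_compact is a hypothetical helper, not used
# below; it assumes the same Milage/Age/Price columns):
def ClearCarData_compact(df):
    z = lambda s: ((s - s.mean()) / s.std()).abs()  # per-column z-scores
    bad = df.index[(z(df.Milage) > 2) | (z(df.Age) > 2) | (z(df.Price) > 3)]
    return df.drop(index=bad)
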
def regress(x, y):
    model = LinearRegression()
    model.fit(x, y)
    # model.predict([[100]])  # example single prediction

    x_test = np.linspace(0, 400000)
    y_pred = model.predict(x_test[:, None])

    plt.scatter(x, y, s=2)
    plt.plot(x_test, y_pred, 'r')
    plt.legend(['Regression', 'Data points'])
    plt.show()

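# If a goodness-of-fit number is wanted, LinearRegression.score() returns R^2
# (a sketch, to be called inside regress() on the fitted model):
# print('R^2 =', model.score(x, y))
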
def Plot1(x, y):

    # Evaluate a gaussian kde on a regular grid of nbins x nbins over data extents
    nbins = 300
    k = kde.gaussian_kde([x, y])
    xi, yi = np.mgrid[x.min():x.max():nbins*1j, y.min():y.max():nbins*1j]
    zi = k(np.vstack([xi.flatten(), yi.flatten()]))

    # Make the plot
    plt.pcolormesh(xi, yi, zi.reshape(xi.shape))
    plt.show()

    # Change color palette
    plt.pcolormesh(xi, yi, zi.reshape(xi.shape), cmap=plt.cm.Greens_r)
    plt.show()

def Plot2(x, y):
    # Make the plot
    plt.hexbin(x, y, gridsize=(15, 15))
    plt.show()

    # We can control the size of the bins:
    plt.hexbin(x, y, gridsize=(150, 150))
    plt.show()

def Plot3(x, y):
    sns.jointplot(x, y, kind='scatter')
    sns.jointplot(x, y, kind='hex')
    sns.jointplot(x, y, kind='kde')

    # Then you can pass arguments to each type:
    sns.jointplot(x, y, kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)

    # Custom the color
    sns.set(style="white", color_codes=True)
    sns.jointplot(x, y, kind='kde', color="skyblue", xlim=(-30000, 300000))


# LoadCarData(filename): loads data from a CSV file created by ScrapPage,
# additionally cleans outliers

# ClearCarData(): cleans outliers; only works on a dataframe named dataset_out

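# Example usage (a sketch; "toyota-auris" assumes the CSV written by the
# ScrapPage call below, since LoadCarData appends the .csv extension itself):
# dataset_out = LoadCarData("toyota-auris")
# dataset_out = ClearCarData(dataset_out)
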
# 1) Scrape the data

# Brand, model, start, stop
dataset_out = ScrapPage("toyota", "auris", 1, 3)
# dataset_out.to_csv('datasetvv40.csv')
# dataset_out = pd.read_csv('datasetgolf.csv')  # 9-45

# 2) Load the scraped data

# dataset_out = LoadCarData("datasetvv40")
dataset_out = ClearCarData(dataset_out)

# Various plots

x = dataset_out['Milage']
y = dataset_out['Age']

# Plot1(x, y)
# Plot2(x, y)
# Plot3(x, y)

# Regression of mileage against age
# (sklearn expects a 2-D feature array, hence the reshape(-1, 1))
a = np.array(dataset_out['Milage'].tolist()).reshape(-1, 1)
b = np.array(dataset_out['Age'].tolist()).reshape(-1, 1)
regress(a, b)

# This will be useful for voivodeship (region) data
# for i, li in enumerate(page.select('li')):
#     print(i, li.text)

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 28 15:50:16 2019

@author: lancernik
"""

from __future__ import division, unicode_literals
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import string
import re
from itertools import groupby
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
from scipy.stats import kde
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import codecs
import requests

# Saving

dr = webdriver.Chrome(executable_path=r'C:\Users\lancernik\Desktop\chromedriver_win32/chromedriver.exe')
dr.get("https://www.mobile.de/pl/samochod/opel-corsa/vhc:car,pgn:1,pgs:50,ms1:19000_10_")
bs = BeautifulSoup(dr.page_source, 'html.parser')

# dr.quit()

# bs = bs.prettify("utf-8")
# with open("output1.html", "wb") as file:
#     file.write(bs)
#
# file.close()

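# The Options import above is unused as written; a headless Chrome setup
# would look roughly like this (a sketch, assuming selenium 3.x):
# opts = Options()
# opts.add_argument('--headless')
# dr = webdriver.Chrome(executable_path=r'...\chromedriver.exe', chrome_options=opts)
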

# Loading the HTML back

# f = codecs.open("output1.html", 'r', 'utf-8')
# bs = BeautifulSoup(f.read())
# # print(bs)
# f.close()


# Scrapes the production year

age_from_page = str(bs.find_all(class_="u-text-bold"))
age_from_page_nospace = age_from_page.translate({ord(c): None for c in string.whitespace})
age_from_page_nospace = re.sub(r"\s+", '', age_from_page_nospace)
age_from_page_out = [int(s) for s in re.findall(r'\b\d+\b', age_from_page_nospace)]
j2 = [i for i in age_from_page_out if i >= 1700]  # keep only numbers that look like years


# Earlier attempts at extracting the mileage (kept commented out):
# tes = age_from_page_nospace.split(",")[1].split("km")[0]
# milage = str(bs.prettify("utf-8"))
# milage = str(bs.encode('latin-1'))
# milage = str(milage.find_all(class_="u-text-bold"))

# bs1 = bs.prettify("utf-8")
# bs1 = str(bs1)
# bs2 = BeautifulSoup(bs.read())

# age_from_page_nospace = age_from_page_nospace.encode('ascii', 'ignore')
age_from_page_nospace = str(age_from_page_nospace)
result = str(re.findall(",(.*)km", age_from_page_nospace))

print(result)