Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Thu Apr 25 09:06:22 2019
- @author: lancernik
- """
- # -*- coding: utf-8 -*-
- """
- Created on Wed Apr 24 23:35:43 2019
- @author: lancernik
- """
- from requests import get
- from requests.exceptions import RequestException
- from contextlib import closing
- from bs4 import BeautifulSoup
- import string
- import re
- from itertools import groupby
- import pandas as pd
- import time
- import matplotlib.pyplot as plt
- import numpy as np
- from scipy.stats import kde
- import seaborn as sns
- from sklearn.linear_model import LinearRegression
def simple_get(url):
    """Fetch *url* and return the raw response body, or None on any failure.

    A response only counts as a success when is_good_response() accepts it;
    network-level errors are logged via log_error() and swallowed.
    """
    try:
        response = get(url, stream=True)
        with closing(response) as resp:
            return resp.content if is_good_response(resp) else None
    except RequestException as exc:
        log_error('Error during requests to {0} : {1}'.format(url, str(exc)))
        return None
def is_good_response(resp):
    """Return True when *resp* is a 200 response whose Content-Type mentions HTML.

    Fixes vs original: ``resp.headers['Content-Type']`` raised KeyError when
    the header was absent — ``.get(..., '')`` makes a missing header simply
    yield False. The old ``content_type is not None`` check was dead code
    (``.lower()`` had already been called on it) and is removed.
    """
    content_type = resp.headers.get('Content-Type', '').lower()
    return resp.status_code == 200 and 'html' in content_type
def log_error(e):
    """Minimal error reporter: echo the message to stdout."""
    print(str(e))
def lastpage(page):
    """Return the highest page number found in the pagination links of *page*.

    *page* is a parsed BeautifulSoup document; the number is taken as the
    last integer appearing in the stringified ``class="page"`` elements.

    Bug fix: the original returned the misspelled ``lastepage_out``
    placeholder, which was always 0, instead of the extracted value.
    """
    pagination = str(page.find_all(class_="page"))
    numbers = [int(s) for s in re.findall(r'\b\d+\b', pagination)]
    return numbers[-1]
def LoadCarData(filename):
    """Read '<filename>.csv' (relative to the working directory) into a DataFrame."""
    path = '{}.csv'.format(filename)
    return pd.read_csv(path)
def scrappy(page, marka, model):
    """Scrape one otomoto results page into a DataFrame.

    Parameters: *page* — parsed BeautifulSoup document; *marka*/*model* —
    brand and model strings copied into every row. Returns a DataFrame that
    starts with the original placeholder seed row (kept for backward
    compatibility) followed by one row per offer — but the offers are only
    appended when every scraped column came out with the same length.

    Fixes vs original: ``DataFrame.append`` (removed in pandas 2.0) replaced
    by ``pd.concat``; fuel-type labels are now replaced longest-first (the
    original replaced "Benzyna" before "Benzyna+LPG", so LPG could never
    match) and without trailing spaces (whitespace is stripped from the HTML
    before matching, so "Benzyna+CNG " / "Wodór " could never match either);
    an unused last-page computation was dropped.
    """
    seed = {'Marka': 'Marka', 'Model': 'Model', 'Milage': [0], 'Age': [0],
            'Price': [0], 'Engine capacity': [0], 'Fuel type': [0]}
    dataset = pd.DataFrame(data=seed)
    strip_ws = {ord(c): None for c in string.whitespace}

    # Mileage: join the <li data-code="mileage"> tags and pull out digit runs.
    milage_html = ''.join(map(str, page.find_all("li", {"data-code": "mileage"}))).translate(strip_ws)
    milage_out = [int(''.join(g)) for is_digit, g in groupby(milage_html, str.isdigit) if is_digit]

    # Production year.
    age_html = str(page.find_all(class_="offer-item__params-item")).translate(strip_ws)
    age_out = [int(s) for s in re.findall(r'\b\d+\b', age_html)]

    # Price.
    price_html = str(page.find_all(class_="offer-price__number")).translate(strip_ws)
    price_out = [int(s) for s in re.findall(r'\b\d+\b', price_html)]

    # Engine capacity; the bare value 3 (the "cm3" unit residue) is dropped.
    capacity_html = ''.join(map(str, page.find_all("li", {"data-code": "engine_capacity"}))).translate(strip_ws)
    capacity_out = [int(''.join(g)) for is_digit, g in groupby(capacity_html, str.isdigit) if is_digit]
    capacity_out = [cap for cap in capacity_out if cap != 3]

    # Fuel type mapped onto a numeric code, longest label first.
    fuel_html = ''.join(map(str, page.find_all("li", {"data-code": "fuel_type"}))).translate(strip_ws)
    for label, code in (("Benzyna+LPG", "3"), ("Benzyna+CNG", "6"),
                        ("Benzyna", "1"), ("Diesel", "2"),
                        ("Elektryczny", "4"), ("Hybryda", "5"),
                        ("Etanol", "6"), ("Wodór", "6")):
        fuel_html = fuel_html.replace(label, code)
    fuel_out = [int(s) for s in re.findall(r'\b\d+\b', fuel_html)]

    marka_out = ["{}".format(marka)] * len(age_out)
    model_out = ["{}".format(model)] * len(age_out)

    # Only keep the page when every column lines up; otherwise return the seed.
    lengths = {len(milage_out), len(age_out), len(price_out), len(capacity_out),
               len(fuel_out), len(marka_out), len(model_out)}
    if len(lengths) == 1:
        df = pd.DataFrame({'Milage': milage_out,
                           'Age': age_out,
                           'Price': price_out,
                           'Engine capacity': capacity_out,
                           'Fuel type': fuel_out,
                           'Marka': marka_out,
                           'Model': model_out})
        dataset = pd.concat([dataset, df], ignore_index=True)
    return dataset
def ScrapPage(marka, model, start, stop):
    """Scrape otomoto listing pages ``start``..``stop - 1`` for *marka*/*model*.

    Accumulates the per-page DataFrames from scrappy(), saves the result to
    '<marka>-<model>.csv' and returns it.

    Fixes vs original: the loop used a hardcoded ``range(start, 33)`` and
    ignored *stop* — it now honours the parameter and is additionally capped
    by the site's own last-page number so pages past the end are never
    requested; ``DataFrame.append`` (removed in pandas 2.0) replaced by
    ``pd.concat``.
    """
    dataset_out = pd.DataFrame(data={'Milage': [0], 'Age': [0], 'Price': [0]})

    # Fetch page 1 only to learn how many result pages exist.
    first_url = ('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0'
                 '&search%5Bfilter_float_engine_capacity%3Afrom%5D=0'
                 '&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page=1').format(marka, model)
    first_page = BeautifulSoup(simple_get(first_url), 'html.parser')
    page_numbers = [int(s) for s in re.findall(r'\b\d+\b', str(first_page.find_all(class_="page")))]
    lastpage_out = page_numbers[-1]

    for i in range(start, min(stop, lastpage_out + 1)):
        time.sleep(2)  # throttle requests to be polite to the server
        url = ('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0'
               '&search%5Bfilter_float_engine_capacity%3Afrom%5D=0'
               '&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}').format(marka, model, i)
        page = BeautifulSoup(simple_get(url), 'html.parser')
        dataset_out = pd.concat([dataset_out, scrappy(page, marka, model)], ignore_index=True)
        print(dataset_out)
        print(i)

    dataset_out.to_csv('{}-{}.csv'.format(marka, model))
    return dataset_out
def ClearCarData(dataset_out):
    """Drop implausible rows: cars built in/before 1980 and >4-sigma outliers.

    Outliers are detected per column (Milage, Age, Price) by z-score and all
    flagged rows are removed in one pass. Returns the cleaned DataFrame.

    Fixes vs original: removed a duplicated Milage line; replaced
    ``Series.append`` and ``Index.get_values()`` (both removed in modern
    pandas) with a set union of indices; replaced the O(n^2) manual
    de-duplication and per-index ``drop`` loop with a single ``drop`` call.
    """
    # Keep only cars built after 1980.
    dataset_out = dataset_out[dataset_out['Age'] > 1980]

    def _outliers(col):
        # Row indices whose z-score magnitude exceeds 4 for this column.
        z = (dataset_out[col] - dataset_out[col].mean()) / dataset_out[col].std()
        return set(dataset_out.index[z.abs() > 4])

    to_drop = _outliers('Milage') | _outliers('Age') | _outliers('Price')
    return dataset_out.drop(index=list(to_drop))
def regress(x, y):
    """Fit a linear regression of y on x and plot the fit over the data."""
    model = LinearRegression()
    model.fit(x, y)
    model.predict([[100]])  # kept from the original; result intentionally unused
    grid = np.linspace(0, max(x))
    predictions = model.predict(grid[:, None])
    plt.scatter(x, y, s=2)
    plt.plot(grid, predictions, 'r')
    plt.legend(['Regresja', 'Kropeczki'])
    plt.show()
def Plot1(x, y):
    """Show a gaussian-KDE density of (x, y) twice: default palette, then greens."""
    bins = 300  # grid resolution per axis
    density = kde.gaussian_kde([x, y])
    grid_x, grid_y = np.mgrid[x.min():x.max():bins * 1j, y.min():y.max():bins * 1j]
    grid_z = density(np.vstack([grid_x.flatten(), grid_y.flatten()]))
    plt.pcolormesh(grid_x, grid_y, grid_z.reshape(grid_x.shape))
    plt.show()
    # Same density, different colour palette.
    plt.pcolormesh(grid_x, grid_y, grid_z.reshape(grid_x.shape), cmap=plt.cm.Greens_r)
    plt.show()
def Plot2(x, y):
    """Show hexbin density plots of (x, y) at a coarse and a fine grid size."""
    for grid in ((15, 15), (150, 150)):
        plt.hexbin(x, y, gridsize=grid)
        plt.show()
def Plot3(x, y):
    """Show seaborn joint plots of (x, y) in several flavours."""
    for kind in ('scatter', 'hex', 'kde'):
        sns.jointplot(x, y, kind=kind)
    # Styled variants with custom markers and colours.
    sns.jointplot(x, y, kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)
    sns.set(style="white", color_codes=True)
    sns.jointplot(x, y, kind='kde', color="skyblue", xlim={-30000, 300000})
# --- Driver script -------------------------------------------------------
# LoadCarData(filename): reads a CSV produced by ScrapPage into a DataFrame.
# ClearCarData(): drops outlier rows from that DataFrame.
#
# Step 1 (scraping) is disabled; uncomment to re-scrape:
#   dataset_out = ScrapPage("opel", "corsa", 1, 33)   # marka, model, start, stop
#   dataset_out.to_csv('datasetvv40.csv')

# Step 2: load previously scraped data and clean it.
dataset_out = LoadCarData("opel-corsa")
dataset_out = ClearCarData(dataset_out)

# Step 3: visualise mileage against production year.
x = dataset_out['Milage']
y = dataset_out['Age']
# Plot1(x, y)
# Plot2(x, y)
Plot3(x, y)

# Regression of mileage vs. age — disabled (new cars should be removed first):
#   a = np.array(dataset_out['Milage'].tolist()).reshape(-1, 1)
#   b = np.array(dataset_out['Age'].tolist()).reshape(-1, 1)
#   regress(a, b)
#
# Possible future work (voivodeship data):
#   for i, li in enumerate(page.select('li')):
#       print(i, li.text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement