Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Thu Apr 25 09:06:22 2019
- @author: lancernik
- """
- # -*- coding: utf-8 -*-
- """
- Created on Wed Apr 24 23:35:43 2019
- @author: lancernik
- """
- from requests import get
- from requests.exceptions import RequestException
- from contextlib import closing
- from bs4 import BeautifulSoup
- import string
- import re
- from itertools import groupby
- import pandas as pd
- import time
- import matplotlib.pyplot as plt
- import numpy as np
- from scipy.stats import kde
- import seaborn as sns
- from sklearn.linear_model import LinearRegression
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error
- from sklearn.metrics import mean_absolute_error
- import sklearn.preprocessing
def simple_get(url):
    """Fetch *url* and return the raw response body, or None on any failure.

    Failures (transport errors, non-200 status, non-HTML content) are
    logged via log_error and reported as None rather than raised.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            return resp.content if is_good_response(resp) else None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """Return True when *resp* is a successful (200) HTML response."""
    ctype = resp.headers['Content-Type'].lower()
    if resp.status_code != 200:
        return False
    # ctype can't be None after .lower(); kept for parity with the original check.
    return ctype is not None and 'html' in ctype
def log_error(e):
    """Report a scraping error; currently just writes it to stdout."""
    print(e)
def lastpage(page):
    """Return the highest page number found in the pagination links of *page*.

    Bug fix: the original returned ``lastepage_out`` — a misspelled
    placeholder initialised to 0 — so the parsed last-page number was
    computed and then discarded.
    """
    pagination = str(page.find_all(class_="page"))
    numbers = [int(s) for s in re.findall(r'\b\d+\b', pagination)]
    # The pagination widget lists page numbers in order; the last is the max.
    return numbers[-1]
def LoadCarData(filename):
    """Load a previously scraped dataset from '<filename>.csv' into a DataFrame."""
    return pd.read_csv('{}.csv'.format(filename))
def scrappy(page, marka, model):
    """Scrape one otomoto results page into a DataFrame.

    Parameters:
        page: a parsed BeautifulSoup document for one results page.
        marka, model: brand and model labels copied into every row.

    Returns a DataFrame that starts with one dummy seed row
    (kept for parity with the original behaviour) followed by one row per
    offer, but only when every scraped column yielded the same number of
    values — otherwise the page is skipped and only the seed row returns.

    Bug fixes vs the original:
      * "Benzyna" was replaced before "Benzyna+LPG"/"Benzyna+CNG", so LPG
        cars were miscoded as petrol; longest names are now replaced first.
      * "Benzyna+CNG " and "Wodór " carried trailing spaces that could
        never match after whitespace stripping.
      * DataFrame.append (removed in pandas 2.x) replaced with pd.concat.
    """
    seed = {'Marka': 'Marka', 'Model': 'Model', 'Milage': [0], 'Age': [0],
            'Price': [0], 'Engine capacity': [0], 'Fuel type': [0]}
    dataset = pd.DataFrame(data=seed)

    def _strip_ws(text):
        # One-pass removal of all whitespace characters.
        return text.translate({ord(c): None for c in string.whitespace})

    def _digit_runs(html):
        # Every maximal run of digits in the whitespace-stripped text, as ints.
        return [int(''.join(run))
                for is_digit, run in groupby(_strip_ws(html), str.isdigit)
                if is_digit]

    # Mileage: digit runs survive the thousands separators once spaces are gone.
    milage_html = ''.join(map(str, page.find_all("li", {"data-code": "mileage"})))
    milage_out = _digit_runs(milage_html)

    # Production year.
    age_html = str(page.find_all(class_="offer-item__params-item"))
    age_out = [int(s) for s in re.findall(r'\b\d+\b', _strip_ws(age_html))]

    # Price.
    price_html = str(page.find_all(class_="offer-price__number"))
    price_out = [int(s) for s in re.findall(r'\b\d+\b', _strip_ws(price_html))]

    # Engine capacity; the stray 3 comes from the "cm3" unit suffix.
    capacity_html = ''.join(map(str, page.find_all("li", {"data-code": "engine_capacity"})))
    capacity_out = [cap for cap in _digit_runs(capacity_html) if cap != 3]

    # Fuel type encoded as an integer code; longest names first so that
    # compound names ("Benzyna+LPG") are not clobbered by their prefixes.
    fuel_codes = (
        ("Benzyna+LPG", "3"),
        ("Benzyna+CNG", "6"),
        ("Elektryczny", "4"),
        ("Benzyna", "1"),
        ("Diesel", "2"),
        ("Hybryda", "5"),
        ("Etanol", "6"),
        ("Wodór", "6"),
    )
    fuel_text = _strip_ws(''.join(map(str, page.find_all("li", {"data-code": "fuel_type"}))))
    for name, code in fuel_codes:
        fuel_text = fuel_text.replace(name, code)
    fueltype_out = [int(s) for s in re.findall(r'\b\d+\b', fuel_text)]

    marka_out = [str(marka)] * len(age_out)
    model_out = [str(model)] * len(age_out)

    # Only keep the page when every column aligned one value per offer.
    if (len(milage_out) == len(age_out) == len(price_out) == len(capacity_out)
            == len(model_out) == len(marka_out) == len(fueltype_out)):
        df = pd.DataFrame(
            {'Milage': milage_out,
             'Age': age_out,
             'Price': price_out,
             'Engine capacity': capacity_out,
             'Fuel type': fueltype_out,
             'Marka': marka_out,
             'Model': model_out})
        dataset = pd.concat([dataset, df], ignore_index=True)
    return dataset
def ScrapPage(marka, model, start, stop):
    """Scrape result pages [start, stop) for a brand/model and save them to CSV.

    Bug fix: the original iterated ``range(start, 33)``, ignoring the
    *stop* parameter entirely. The range is now ``range(start, stop)``,
    additionally capped at the site's actual last page so we never request
    pages past the end of the listing.

    Side effects: sleeps 2 s between requests, prints progress, and writes
    '<marka>-<model>.csv' in the working directory.

    Returns the accumulated DataFrame (including the dummy seed row).
    """
    dataset_out = pd.DataFrame(data={'Milage': [0], 'Age': [0], 'Price': [0]})

    # Fetch page 1 only to discover how many result pages exist.
    url1 = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0&search%5Bfilter_float_engine_capacity%3Afrom%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page=1'.format(marka, model))
    page1 = BeautifulSoup(url1, 'html.parser')
    page_numbers = [int(s) for s in re.findall(r'\b\d+\b', str(page1.find_all(class_="page")))]
    last_page = page_numbers[-1]

    for i in range(start, min(stop, last_page + 1)):
        time.sleep(2)  # be polite to the server
        url = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0&search%5Bfilter_float_engine_capacity%3Afrom%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}'.format(marka, model, i))
        page = BeautifulSoup(url, 'html.parser')
        # pd.concat replaces DataFrame.append, removed in pandas 2.x.
        dataset_out = pd.concat([dataset_out, scrappy(page, marka, model)], ignore_index=True)
        print(dataset_out)
        print(i)
    dataset_out.to_csv('{}-{}.csv'.format(marka, model))
    return dataset_out
def ClearCarData(dataset_out):
    """Drop implausible rows: cars older than 1981 and >4-sigma outliers.

    Outliers are detected independently per column (Milage, Age, Price)
    with a z-score threshold of 4; any row flagged in at least one column
    is removed. Returns the filtered DataFrame.

    Fixes vs the original: removed a duplicated Milage-outlier line,
    replaced the removed pandas APIs ``Series.append`` and
    ``Index.get_values`` with an index union, and collapsed the manual
    dedupe + per-index drop loops into a single ``drop`` call.
    """
    # Minimum production year for a plausible record.
    dataset_out = dataset_out[dataset_out['Age'] > 1980]

    def _outlier_index(col):
        # Index labels whose z-score magnitude exceeds 4.
        z = (col - col.mean()) / col.std()
        return col[z.abs() > 4].index

    to_drop = (_outlier_index(dataset_out.Milage)
               .union(_outlier_index(dataset_out.Age))
               .union(_outlier_index(dataset_out.Price)))
    return dataset_out.drop(to_drop, axis=0)
def regress(x, y):
    """Fit a simple linear regression of y on x, plot it, and print the R^2 score.

    Parameters:
        x: 2-D feature array of shape (n_samples, 1).
        y: target array aligned with x.

    Side effects: shows a matplotlib scatter + regression-line plot and
    prints the fit score; returns None.

    Fix vs the original: removed the dead statement
    ``model.predict([[100]])`` whose result was discarded.
    """
    model = LinearRegression()
    model.fit(x, y)
    # Evaluation grid spanning the observed x range.
    x_test = np.linspace(0, max(x))
    y_pred = model.predict(x_test[:, None])
    plt.scatter(x, y, s=2)
    plt.plot(x_test, y_pred, 'r')
    plt.legend(['Regresja', 'Kropeczki'])
    plt.show()
    print("Dopasowanie regresji: {}".format(model.score(x, y)))
def regress2(x_train, x_test, y_train, y_test):
    """Fit a linear model on the training split and report fit metrics.

    Returns a dict with the training R^2 ("r_score") plus MSE and MAE on
    the training ("_u") and test ("_t") splits.
    (Note: the "MSA" keys actually hold mean absolute error.)
    """
    reg = LinearRegression()
    reg.fit(x_train, y_train)
    pred_train = reg.predict(x_train)
    pred_test = reg.predict(x_test)
    return {
        "r_score": reg.score(x_train, y_train),
        "MSE_u": mean_squared_error(y_train, pred_train),
        "MSE_t": mean_squared_error(y_test, pred_test),
        "MSA_u": mean_absolute_error(y_train, pred_train),
        "MSA_t": mean_absolute_error(y_test, pred_test),
    }
def regress3(x,y):
    # Degree-2 polynomial-features model with BIC-driven forward selection.
    # NOTE(review): reads and mutates the module-level lists `params` and
    # `res` defined in the script section below — this function is not
    # self-contained and must run after those globals exist.
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state=12345)
    reg3 = sklearn.preprocessing.PolynomialFeatures(degree=2,include_bias = False)
    x2_train = reg3.fit_transform(x_train)
    # NOTE(review): fit_transform on the test split; harmless for
    # PolynomialFeatures (stateless expansion) but transform() is the
    # conventional call here.
    x2_test = reg3.fit_transform(x_test)
    # print(reg3.powers_.T)
    params.append("zm. wielom")
    res.append(regress2(x2_train, x2_test, y_train, y_test))
    print(pd.DataFrame(res,index=params))
    # Minimise the number of variables via BIC-based forward selection.
    wybrane = pd.DataFrame(forward_selection(x2_train, y_train),columns =["Zmienna", "BIC"])
    wybrane_zmienne = wybrane["Zmienna"].tolist()
    # print(x)
    # wybrane["nazwa"] = [x.columns[w>=1].append(x.columns[w==2]).str.cat(sep="*") for w in reg3.powers_[wybrane_zmienne]]
    # print(x2_train)
    # print(y_train)
    print(wybrane_zmienne)
    # Refit using only the selected polynomial columns and print both results.
    params.append("zm.wybrane")
    res.append(regress2(x2_train[:,wybrane_zmienne],x2_test[:,wybrane_zmienne],y_train, y_test))
    wyniki = pd.DataFrame(res, index=params)
    print(wyniki)
# Bayesian information criterion: BIC(MSE, p, n) = n*log(MSE) + p*log(n).
# Fewer model coefficients -> lower risk of overfitting; lower BIC is better.
def BIC(mse, p, n):
    """Return the Bayesian information criterion for a fit with *p* parameters on *n* samples."""
    return np.log(mse) * n + np.log(n) * p
def forward_selection(x,y):
    # Greedy forward feature selection minimising BIC.
    # Returns a list of (column_index, BIC) pairs, one per accepted column,
    # stopping as soon as adding another column no longer lowers the criterion.
    n,m = x.shape
    best_idx =[]               # columns accepted so far, in selection order
    best_free = set(range(m))  # candidate columns not yet selected
    best_fit = np.inf          # best (lowest) BIC achieved so far
    res = []
    for i in range(0,m):
        cur_idx = -1
        cur_fit = np.inf
        # Try each remaining column appended to the current selection and
        # keep the one giving the lowest BIC this round.
        for e in best_free:
            r = LinearRegression()
            test_idx = best_idx+[e]
            r.fit(x[:,test_idx],y)
            # p = i+2: the i+1 selected columns plus the intercept.
            test_fit = BIC(mean_squared_error(y,r.predict(x[:,test_idx])),i+2,n)
            if test_fit < cur_fit: cur_idx, cur_fit = e, test_fit
        # Stop when this round's best does not improve on the overall best.
        if cur_fit > best_fit: break
        best_idx, best_fit = best_idx + [cur_idx], cur_fit
        best_free.discard(cur_idx)
        res.append((cur_idx, cur_fit))
    print(res)
    return res
def Plot1(x, y):
    """Show a 2-D Gaussian-KDE density of (x, y), twice: default and green palette."""
    # Evaluate the KDE on a regular nbins x nbins grid over the data extents.
    nbins = 300
    density = kde.gaussian_kde([x, y])
    grid_x, grid_y = np.mgrid[x.min():x.max():nbins * 1j, y.min():y.max():nbins * 1j]
    grid_z = density(np.vstack([grid_x.flatten(), grid_y.flatten()]))
    # Default colour palette.
    plt.pcolormesh(grid_x, grid_y, grid_z.reshape(grid_x.shape))
    plt.show()
    # Same density with a green palette.
    plt.pcolormesh(grid_x, grid_y, grid_z.reshape(grid_x.shape), cmap=plt.cm.Greens_r)
    plt.show()
def Plot2(x, y):
    """Show hexbin density plots of (x, y) at a coarse and a fine grid resolution."""
    for resolution in ((15, 15), (150, 150)):
        plt.hexbin(x, y, gridsize=resolution)
        plt.show()
def Plot3(x, y):
    """Show a series of seaborn joint-distribution plots of (x, y).

    Fixes vs the original:
      * jointplot's data arguments are passed as keywords — positional
        x/y were removed in seaborn 0.12 (keywords work on all versions).
      * xlim was passed as a set literal ``{-30000, 300000}``; sets are
        unordered, so the axis limits were non-deterministic. It is now
        an ordered tuple.
    """
    sns.jointplot(x=x, y=y, kind='scatter')
    sns.jointplot(x=x, y=y, kind='hex')
    sns.jointplot(x=x, y=y, kind='kde')
    # Extra style kwargs are forwarded to the underlying plot function.
    sns.jointplot(x=x, y=y, kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)
    # Custom colour scheme.
    sns.set(style="white", color_codes=True)
    sns.jointplot(x=x, y=y, kind='kde', color="skyblue", xlim=(-30000, 300000))
- #LoadCarData(filename): Wczytuje dane z pliku CSV stworzonego przez funkcje ScrapPage,
- #dodatkowo oczyszcza z danych odstajcych
- #ClearCarData(): Oczyszcza z danych odstajacych, zdiala tylko dla df o nazwie dataset_out
- # 1) Scrapuje dane
- # Marka, model, start, stop
- #dataset_out = ScrapPage("opel" ,"corsa", 1 ,33)
- #dataset_out.to_csv('datasetvv40.csv')
- #dataset_out = pd.read_csv('datasetgolf.csv') #9-45
- # 2) Wczytuje zeskrapowane dane
# 2) Load previously scraped data from CSV and drop outliers.
# NOTE(review): requires 'opel-corsa.csv' in the working directory
# (produced by ScrapPage); the script fails at import time without it.
dataset_out = LoadCarData("opel-corsa")
dataset_out = ClearCarData(dataset_out)
# Various density plots (uncomment to run).
#x=dataset_out['Milage']
#y=dataset_out['Age']
#Plot1(x,y)
#Plot2(x,y)
#Plot3(x,y)
# Correlation search (uncomment to run).
#c = dataset_out.corr("pearson")
#c = c.where(
#        np.triu(
#                np.ones(c.shape,dtype=np.bool),k=1)
#        ).stack().sort_values()
#print(c[abs(c)>0.3])
# Regression of mileage against age.
#
# Drop near-zero mileages before fitting.
dataset_out = dataset_out[dataset_out['Milage'] > 100]
a=np.array(dataset_out['Milage'].tolist()).reshape(-1,1)
b=np.array(dataset_out['Age'].tolist()).reshape(-1,1)
regress(a,b)
#
# Train/test-split regression — the better evaluation.
# NOTE(review): `params` and `res` are module-level state mutated by
# regress3 below; keep these assignments before the regress3 call.
params =['zm. liniowe']
x_train, x_test, y_train, y_test = train_test_split(a,b,test_size = 0.2, random_state=12345)
res = [regress2(x_train, x_test, y_train, y_test)]
reg2 = pd.DataFrame(res,index=params)
print(reg2)
# regress3 returns None, so this also prints "None" after its own output.
print(regress3(a,b))
- #To sie przyda do wojewodztw
- #for i, li in enumerate(page.select('li')): #To się przyda do wojedzowtw
- # print(i, li.text)
- #
- #def LoadCarData(filename):
- #
- # #Wczytuje plik do dataframe
- #
- # dataset_out = pd.read_csv('{}.csv'.format(filename))
- #
- #
- # #Wybiera listę indexow ktore maja byc usuniete
- #
- # clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
- # clear = clear.append(dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2])
- # clear = clear.append(dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3])
- # test1 = clear.index.get_values()
- #
- # #Usuwa duplikaty z listy indexów do usunięcia
- #
- # test = []
- # for i in test1:
- # if i not in test:
- # test.append(i)
- #
- # #Usuwa z dataframu wybrane indexy
- #
- # for i in range(0,len(test)):
- # dataset_out = dataset_out.drop(test[i],axis=0)
- #
- # return dataset_out
- #
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement