Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Thu Apr 25 09:06:22 2019
- @author: lancernik
- """
- # -*- coding: utf-8 -*-
- """
- Created on Wed Apr 24 23:35:43 2019
- @author: lancernik
- """
- from requests import get
- from requests.exceptions import RequestException
- from contextlib import closing
- from bs4 import BeautifulSoup
- import string
- import re
- from itertools import groupby
- import pandas as pd
- import time
- import matplotlib.pyplot as plt
- import numpy as np
- from scipy.stats import gaussian_kde
- from scipy.stats import kde
- import seaborn as sns
- from sklearn.linear_model import LinearRegression
- from sklearn.model_selection import train_test_split
- import matplotlib.pyplot as plt
def simple_get(url):
    """Fetch *url* and return the raw response body, or None on any failure."""
    try:
        with closing(get(url, stream=True)) as resp:
            # Only hand the payload back when it looks like a usable HTML page.
            return resp.content if is_good_response(resp) else None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """Return True when *resp* is an HTTP 200 response carrying HTML content."""
    ctype = resp.headers['Content-Type'].lower()
    if resp.status_code != 200:
        return False
    return ctype is not None and 'html' in ctype
def log_error(e):
    """Report a scraping error; currently just echoes it to stdout."""
    print(e)
def lastpage(page):
    """Return the highest page number in the pagination links of *page*.

    *page* is a parsed BeautifulSoup document; the result is the last
    integer appearing in the elements with CSS class "page".
    Returns 0 when no page numbers are found.

    BUG FIX: the original returned the misspelled name 'lastepage_out',
    which was initialised to 0 and never updated, so it always returned 0.
    """
    # str() of the ResultSet flattens the pagination markup to plain text.
    pagination = str(page.find_all(class_="page"))
    numbers = [int(s) for s in re.findall(r'\b\d+\b', pagination)]
    if not numbers:
        return 0
    return numbers[-1]
def scrappy(page):
    """Scrape mileage, production year and price from one listing page.

    *page* is a parsed BeautifulSoup document of an otomoto.pl result page.
    Returns a DataFrame with columns Milage/Age/Price, seeded with a single
    all-zero row (kept for compatibility with the original output format).
    """
    dataset = pd.DataFrame({'Milage': [0], 'Age': [0], 'Price': [0]})

    # Mileage: concatenate the <li data-code="mileage"> tags, strip all
    # whitespace, then take each maximal digit run as one number.
    milage_html = ''.join(map(str, page.find_all("li", {"data-code": "mileage"})))
    milage_html = milage_html.translate({ord(c): None for c in string.whitespace})
    milage = [int(''.join(g)) for is_digit, g in groupby(milage_html, str.isdigit) if is_digit]

    # Production year: every integer found in the parameter list items.
    age_html = str(page.find_all(class_="offer-item__params-item"))
    age_html = age_html.translate({ord(c): None for c in string.whitespace})
    age = [int(s) for s in re.findall(r'\b\d+\b', age_html)]

    # Price: every integer found in the price elements.
    price_html = str(page.find_all(class_="offer-price__number"))
    price_html = price_html.translate({ord(c): None for c in string.whitespace})
    price = [int(s) for s in re.findall(r'\b\d+\b', price_html)]

    df = pd.DataFrame({'Milage': milage, 'Age': age, 'Price': price})
    # BUG FIX: DataFrame.append() was removed in pandas 2.0; use pd.concat.
    # (Also dropped an unused last-page computation the original carried.)
    return pd.concat([dataset, df], ignore_index=True)
def ScrapPage(marka, model, start, stop):
    """Scrape otomoto.pl listing pages [start, stop) for the given make/model.

    marka/model: URL path segments, e.g. "opel"/"corsa".
    Returns a DataFrame (Milage/Age/Price) with statistical outliers removed
    (|z| > 2 for Milage and Age, |z| > 3 for Price).
    """
    dataset_out = pd.DataFrame({'Milage': [0], 'Age': [0], 'Price': [0]})
    for page_no in range(start, stop):  # eventually 1..lastpage
        time.sleep(2)  # be polite to the server between requests
        url = simple_get('https://www.otomoto.pl/osobowe/{}/{}/?page={}'.format(marka, model, page_no))
        page = BeautifulSoup(url, 'html.parser')
        # Scrape once and reuse (the original scraped each page twice).
        page_data = scrappy(page)
        print(page_data)
        # BUG FIX: DataFrame.append() was removed in pandas 2.0; use pd.concat.
        dataset_out = pd.concat([dataset_out, page_data], ignore_index=True)
        print(dataset_out)
    # Remove outlier rows by z-score.
    outliers = pd.concat([
        dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2],
        dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2],
        dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3],
    ])
    # BUG FIX: Index.get_values() was removed from pandas; drop all flagged
    # rows at once (the old one-by-one loop raised KeyError when a row was
    # an outlier in more than one column).
    return dataset_out.drop(index=outliers.index.unique())
def ClearCarData(dataset_out=None):
    """Remove outlier rows (|z| > 2 for Milage/Age, |z| > 3 for Price).

    The original only worked on a *global* DataFrame named dataset_out;
    the frame may now also be passed as an argument.  With no argument it
    still looks up the global, for backward compatibility.

    BUG FIX: the original read and reassigned 'dataset_out' inside the
    function without a 'global' statement, which made the name local and
    raised UnboundLocalError on the very first read.
    """
    if dataset_out is None:
        dataset_out = globals()['dataset_out']
    outliers = pd.concat([
        dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2],
        dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2],
        dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3],
    ])
    # Index.get_values() was removed from pandas; drop by index directly.
    return dataset_out.drop(index=outliers.index.unique())
def LoadCarData(filename):
    """Load '<filename>.csv' and return it with outlier rows removed.

    Outlier thresholds: |z| > 2 for Milage and Age, |z| > 3 for Price.
    """
    dataset_out = pd.read_csv('{}.csv'.format(filename))  # 9-45
    outliers = pd.concat([
        dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2],
        dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2],
        dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3],
    ])
    # BUG FIX: Index.get_values() was removed from pandas; drop all flagged
    # rows at once (the old per-row loop raised KeyError on duplicate indices).
    return dataset_out.drop(index=outliers.index.unique())
def regress(x, y, x_max=400000):
    """Fit a linear regression of y on x and plot it over the scatter data.

    x, y: 2-D arrays of shape (n, 1), as produced by reshape(-1, 1).
    x_max: right edge of the fitted line's plotting range (default 400000,
    which suits mileage data; now a parameter instead of a hard-coded value).
    """
    model = LinearRegression()
    model.fit(x, y)
    # BUG FIX: min(x) on a 2-D array returns a length-1 array, which breaks
    # the linspace/predict shapes; x.min() yields the scalar minimum.
    # (The original also called model.predict([[100]]) and discarded the
    # result; that dead call has been removed.)
    x_test = np.linspace(x.min(), x_max)
    y_pred = model.predict(x_test[:, None])
    plt.scatter(x, y)
    plt.plot(x_test, y_pred, 'r')
    plt.legend(['Regresja', 'Kropeczki'])
    plt.show()
def Plot1(x, y):
    """Density heatmap of (x, y): Gaussian KDE evaluated on a regular grid."""
    # Evaluate the KDE on an nbins x nbins grid over the data extents.
    nbins = 300
    # FIX: scipy.stats.kde was a private module (removed in modern SciPy);
    # use the public gaussian_kde already imported at the top of the file.
    k = gaussian_kde([x, y])
    xi, yi = np.mgrid[x.min():x.max():nbins * 1j, y.min():y.max():nbins * 1j]
    zi = k(np.vstack([xi.flatten(), yi.flatten()]))
    # Default colour map.
    plt.pcolormesh(xi, yi, zi.reshape(xi.shape))
    plt.show()
    # Same plot with a different colour palette.
    plt.pcolormesh(xi, yi, zi.reshape(xi.shape), cmap=plt.cm.Greens_r)
    plt.show()
def Plot2(x, y):
    """Hexbin plots of (x, y): first a coarse grid, then a fine one."""
    for grid in ((15, 15), (150, 150)):
        plt.hexbin(x, y, gridsize=grid)
        plt.show()
def Plot3(x, y):
    """Draw a series of seaborn joint plots of (x, y) in several styles."""
    sns.jointplot(x, y, kind='scatter')
    sns.jointplot(x, y, kind='hex')
    sns.jointplot(x, y, kind='kde')
    # Each kind accepts its own styling arguments:
    sns.jointplot(x, y, kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)
    # Custom theme / colours.
    sns.set(style="white", color_codes=True)
    # BUG FIX: xlim was passed as a set literal {-30000, 300000}; sets are
    # unordered, so the axis limits could come out reversed.  Use a tuple.
    sns.jointplot(x, y, kind='kde', color="skyblue", xlim=(-30000, 300000))
# LoadCarData(filename): loads data from the CSV file created by ScrapPage,
# additionally cleaning out the outliers.
# ClearCarData(): removes outliers; only works for a df named dataset_out.

# 1) Scrape the data
#    make, model, start, stop
#ScrapPage("opel" ,"corsa", 1 ,3)
#dataset_out.to_csv('dataset1.csv')
#dataset_out = pd.read_csv('dataset1.csv') #9-45

# 2) Load previously scraped data
dataset_out = LoadCarData("dataset1")

# Assorted plots
#x=dataset_out['Milage']
#y=dataset_out['Age']
#
#
#Plot1(x,y)
#Plot2(x,y)
#Plot3(x,y)

# Regression of mileage against age
#
#a=np.array(dataset_out['Milage'].tolist()).reshape(-1,1)
#b=np.array(dataset_out['Age'].tolist()).reshape(-1,1)
#regress(a,b)

# This will come in handy for the provinces (wojewodztwa):
#for i, li in enumerate(page.select('li')):
#    print(i, li.text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement