# OtoScrap

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 09:06:22 2019

@author: lancernik
"""

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 24 23:35:43 2019

@author: lancernik
"""


from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import string
import re
from itertools import groupby
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import kde
import seaborn as sns
from sklearn.linear_model import LinearRegression


def simple_get(url):
    """Fetch ``url`` with an HTTP GET and return the response body.

    Returns the response content when ``is_good_response`` accepts the
    response, otherwise ``None``.  A ``RequestException`` raised while making
    the request is reported through ``log_error`` and also yields ``None``.
    """
    try:
        with closing(get(url, stream=True)) as response:
            return response.content if is_good_response(response) else None
    except RequestException as exc:
        log_error('Error during requests to {0} : {1}'.format(url, str(exc)))
        return None

def is_good_response(resp):
    """Return True when ``resp`` is a successful HTML response.

    A response qualifies when its status code is 200 and its ``Content-Type``
    header contains ``html`` (compared case-insensitively).  A response that
    has no ``Content-Type`` header is rejected instead of raising ``KeyError``.

    Args:
        resp: response object exposing ``status_code`` and a mapping-like
            ``headers`` attribute.

    Returns:
        bool: whether the response looks like a usable HTML page.
    """
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()

def log_error(e):
    """Report an error message by printing it to standard output."""
    print(e)

def lastpage(page):
    """Return the number of the last results page referenced in ``page``.

    The pagination elements (those with class ``page``) are converted to text
    and every standalone integer in that text is collected; the last one is
    taken as the last page number.

    Args:
        page: parsed document exposing ``find_all(class_=...)``.

    Returns:
        int: the last integer found in the pagination markup, or 0 when the
        markup contains no page numbers at all.
    """
    pagination = str(page.find_all(class_="page"))
    page_numbers = [int(s) for s in re.findall(r'\b\d+\b', pagination)]
    return page_numbers[-1] if page_numbers else 0

def LoadCarData(filename):
    """Read ``<filename>.csv`` into a DataFrame and return it.

    ``filename`` is given without the ``.csv`` extension.
    """
    csv_path = '{}.csv'.format(filename)
    return pd.read_csv(csv_path)

# Fuel names as they appear in the listing, mapped to numeric codes.
# Compound names come first so that "Benzyna+LPG" / "Benzyna+CNG" are not
# consumed by the plain "Benzyna" rule, and no key carries whitespace because
# the text they are matched against has had all whitespace removed.
_FUEL_CODES = (
    ("Benzyna+LPG", "3"),
    ("Benzyna+CNG", "6"),
    ("Benzyna", "1"),
    ("Diesel", "2"),
    ("Elektryczny", "4"),
    ("Hybryda", "5"),
    ("Etanol", "6"),
    ("Wodór", "6"),
)


def _strip_whitespace(text):
    """Return ``text`` with every ``string.whitespace`` character removed."""
    return text.translate({ord(c): None for c in string.whitespace})


def _digit_runs(text):
    """Return every maximal run of digits in ``text`` as an int."""
    return [int(''.join(run)) for is_digit, run in groupby(text, str.isdigit) if is_digit]


def _whole_numbers(text):
    """Return every digit sequence delimited by word boundaries as an int."""
    return [int(s) for s in re.findall(r'\b\d+\b', text)]


def _joined_items(page, data_code):
    """Return the whitespace-free markup of all ``<li data-code=...>`` items."""
    items = page.find_all("li", {"data-code": data_code})
    return _strip_whitespace(''.join(map(str, items)))


def scrappy(page, marka, model):
    """Extract the offers listed on one results page into a DataFrame.

    Mileage, production year, price, engine capacity and fuel type are parsed
    from the page markup.  Fuel types are encoded as integers: 1 Benzyna,
    2 Diesel, 3 Benzyna+LPG, 4 Elektryczny, 5 Hybryda, 6 Etanol / Benzyna+CNG /
    Wodór.

    Args:
        page: parsed document exposing BeautifulSoup's ``find_all``.
        marka: car make, stored in the ``Marka`` column of every offer row.
        model: car model, stored in the ``Model`` column of every offer row.

    Returns:
        pandas.DataFrame with columns ``Marka``, ``Model``, ``Milage``,
        ``Age``, ``Price``, ``Engine capacity`` and ``Fuel type``.  The first
        row is always a placeholder (``'Marka'``, ``'Model'`` and zeros), kept
        because existing callers and saved CSV files already contain it.  The
        parsed offers follow it only when every attribute was found the same
        number of times; otherwise the page is considered inconsistent and
        only the placeholder row is returned.
    """
    dataset = pd.DataFrame(data={
        'Marka': 'Marka',
        'Model': 'Model',
        'Milage': [0],
        'Age': [0],
        'Price': [0],
        'Engine capacity': [0],
        'Fuel type': [0],
    })

    mileages = _digit_runs(_joined_items(page, "mileage"))

    # Only standalone numbers are taken from the parameter items, so values
    # glued to a unit once whitespace is removed (e.g. "150000km") are skipped.
    params_text = _strip_whitespace(str(page.find_all(class_="offer-item__params-item")))
    years = _whole_numbers(params_text)

    price_text = _strip_whitespace(str(page.find_all(class_="offer-price__number")))
    prices = _whole_numbers(price_text)

    # A lone 3 is discarded; presumably it is the exponent of the "cm3" unit.
    capacities = [cap for cap in _digit_runs(_joined_items(page, "engine_capacity")) if cap != 3]

    fuel_text = _joined_items(page, "fuel_type")
    for fuel_name, fuel_code in _FUEL_CODES:
        fuel_text = fuel_text.replace(fuel_name, fuel_code)
    fuel_types = _whole_numbers(fuel_text)

    columns = (mileages, years, prices, capacities, fuel_types)
    same_length = len({len(column) for column in columns}) == 1

    # Skip empty pages too: concatenating an all-empty frame adds no rows and
    # would only risk degrading the column dtypes.
    if years and same_length:
        offers = pd.DataFrame({
            'Milage': mileages,
            'Age': years,
            'Price': prices,
            'Engine capacity': capacities,
            'Fuel type': fuel_types,
            'Marka': ["{}".format(marka)] * len(years),
            'Model': ["{}".format(model)] * len(years),
        })
        dataset = pd.concat([dataset, offers], ignore_index=True)

    return dataset


def ScrapPage(marka, model, start, stop):
    """Scrape a range of otomoto.pl result pages for one make and model.

    Pages ``start`` .. ``stop - 1`` are downloaded (two seconds apart), parsed
    with ``scrappy`` and accumulated into one DataFrame, which is also written
    to ``<marka>-<model>.csv``.  A page that cannot be downloaded is reported
    and skipped instead of aborting the whole run.

    Args:
        marka: car make as used in the otomoto.pl URL (e.g. ``"opel"``).
        model: car model as used in the otomoto.pl URL (e.g. ``"corsa"``).
        start: first page number to scrape.
        stop: page number at which to stop (exclusive).

    Returns:
        pandas.DataFrame with every scraped row; its first row is a seed row
        of zeros for ``Milage``, ``Age`` and ``Price``.
    """
    url_template = 'https://www.otomoto.pl/osobowe/{}/{}/?search%5Bfilter_float_mileage%3Afrom%5D=0&search%5Bfilter_float_engine_capacity%3Afrom%5D=0&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page={}'
    dataset_out = pd.DataFrame(data={'Milage': [0], 'Age': [0], 'Price': [0]})

    for page_number in range(start, stop):
        time.sleep(2)  # throttle requests to the site

        html = simple_get(url_template.format(marka, model, page_number))
        if html is None:
            # simple_get returns None for failed or non-HTML responses.
            print('Skipping page {}: download failed'.format(page_number))
            continue

        page = BeautifulSoup(html, 'html.parser')
        dataset_out = pd.concat(
            [dataset_out, scrappy(page, marka, model)], ignore_index=True)
        print(dataset_out)
        print(page_number)

    dataset_out.to_csv('{}-{}.csv'.format(marka, model))
    return dataset_out


def ClearCarData(dataset_out):
    """Return ``dataset_out`` without very old cars and without outliers.

    Rows whose ``Age`` (the production year) is not greater than 1980 are
    removed first.  Then, on the remaining rows, a z-score is computed for
    ``Milage``, ``Age`` and ``Price``, and every row whose absolute z-score
    exceeds 4 in at least one of these columns is removed.

    Args:
        dataset_out: DataFrame with numeric ``Milage``, ``Age`` and ``Price``
            columns.

    Returns:
        pandas.DataFrame: the filtered rows, keeping their original index
        labels.  The input frame is not modified.
    """
    recent = dataset_out[dataset_out['Age'] > 1980]

    checked = recent[['Milage', 'Age', 'Price']]
    z_scores = (checked - checked.mean()) / checked.std()

    # NaN z-scores (e.g. zero variance) compare False and are therefore kept.
    is_outlier = (z_scores.abs() > 4).any(axis=1)
    return recent[~is_outlier]


def regress(x, y):
    """Fit a linear regression of ``y`` on ``x`` and plot it over the data.

    The data are drawn as a scatter plot and the fitted line is drawn in red
    over the range from 0 to the largest value in ``x``.

    Args:
        x: feature values, passed unchanged to ``LinearRegression.fit``
            (scikit-learn expects a 2-D array; a single feature column is
            assumed for the plotted line).
        y: target values, passed unchanged to ``LinearRegression.fit``.
    """
    model = LinearRegression()
    model.fit(x, y)

    # Take the maximum as a scalar so the evaluation grid is a plain column
    # vector regardless of how many dimensions ``x`` has.
    x_test = np.linspace(0, np.asarray(x).max()).reshape(-1, 1)
    y_pred = model.predict(x_test)

    # Label each artist explicitly so the legend cannot swap the two names.
    plt.scatter(x, y, s=2, label='Kropeczki')
    plt.plot(x_test.ravel(), y_pred, 'r', label='Regresja')
    plt.legend()
    plt.show()

def Plot1(x,y):
    """Show a 2-D Gaussian KDE density map of ``x`` against ``y``.

    The density is evaluated on a regular 300 x 300 grid spanning the data
    extents and displayed twice: once with the default colour map and once
    with the reversed green palette.
    """
    grid_points = 300
    density = kde.gaussian_kde([x, y])

    # A complex step makes np.mgrid treat the value as a number of points.
    step = grid_points * 1j
    grid_x, grid_y = np.mgrid[x.min():x.max():step, y.min():y.max():step]
    sample_points = np.vstack([grid_x.flatten(), grid_y.flatten()])
    density_map = density(sample_points).reshape(grid_x.shape)

    # Default colour map
    plt.pcolormesh(grid_x, grid_y, density_map)
    plt.show()

    # Same data with a different palette
    plt.pcolormesh(grid_x, grid_y, density_map, cmap=plt.cm.Greens_r)
    plt.show()
def Plot2(x,y):
    """Show two hexbin plots of ``x`` against ``y``: a coarse and a fine grid."""
    # The grid size controls how large the hexagonal bins are.
    for grid in ((15,15), (150,150)):
        plt.hexbin(x, y, gridsize=grid)
        plt.show()
def Plot3(x, y):
    """Draw several seaborn joint plots of ``x`` against ``y``.

    Five figures are produced: scatter, hex and kde joint plots with default
    settings, a customised scatter plot, and a kde plot drawn with the white
    style and an x-axis limited to -30000 .. 300000.

    Args:
        x: values for the horizontal axis.
        y: values for the vertical axis.
    """
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.jointplot(x=x, y=y, kind='scatter')
    sns.jointplot(x=x, y=y, kind='hex')
    sns.jointplot(x=x, y=y, kind='kde')

    # Extra keyword arguments are forwarded to the underlying plot type.
    sns.jointplot(x=x, y=y, kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)

    # Custom colours.  The axis limits must be an ordered pair; a set has no
    # defined order and could swap the limits.
    sns.set(style="white", color_codes=True)
    sns.jointplot(x=x, y=y, kind='kde', color="skyblue", xlim=(-30000, 300000))

    # Display the figures, as Plot1 and Plot2 do.
    plt.show()


# LoadCarData(filename): loads the data from the CSV file created by ScrapPage.
# Outliers are not removed here; use ClearCarData for that.

# ClearCarData(dataset_out): removes outliers from the DataFrame passed in as dataset_out.


# 1)   Scrape the data

#         Marka,   model, start, stop
#dataset_out = ScrapPage("opel" ,"corsa",  1     ,33)
#dataset_out.to_csv('datasetvv40.csv')
#dataset_out = pd.read_csv('datasetgolf.csv')  #9-45


# 2)   Load the previously scraped data from "opel-corsa.csv" and remove outliers

dataset_out = LoadCarData("opel-corsa")
dataset_out = ClearCarData(dataset_out)


# Various plots of mileage (x) against production year (y)

x=dataset_out['Milage']
y=dataset_out['Age']
#
#
#Plot1(x,y)
#Plot2(x,y)
Plot3(x,y)


# Regression of mileage against age
#

# TODO: remove new cars before running the regression

#a=np.array(dataset_out['Milage'].tolist()).reshape(-1,1)
#b=np.array(dataset_out['Age'].tolist()).reshape(-1,1)
#regress(a,b)
#


# This will be useful for extracting voivodeships (regions)
#for i, li in enumerate(page.select('li')):  # will be useful for voivodeships
#    print(i, li.text)


#
#def LoadCarData(filename):
#
#    #Wczytuje plik do dataframe
#
#    dataset_out = pd.read_csv('{}.csv'.format(filename))
#
#
#    #Wybiera listę indexow ktore maja byc usuniete
#
#    clear = dataset_out.Milage[((dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()).abs() > 2]
#    clear = clear.append(dataset_out.Age[((dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()).abs() > 2])
#    clear = clear.append(dataset_out.Price[((dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()).abs() > 3])
#    test1 = clear.index.get_values()
#
#    #Usuwa duplikaty z listy indexów do usunięcia
#
#    test = []
#    for i in test1:
#       if i not in test:
#          test.append(i)
#
#    #Usuwa z dataframu wybrane indexy
#
#    for i in range(0,len(test)):
#        dataset_out = dataset_out.drop(test[i],axis=0)
#
#    return dataset_out
#