# GetData

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 24 23:35:43 2019

@author: lancernik
"""


from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import string
import re
from itertools import groupby
import pandas as pd
import time
import matplotlib.pyplot as plt

def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    `timeout` is forwarded to the request so that an unresponsive server
    cannot block the caller forever; pass None to wait indefinitely.

    Returns the raw response body (`resp.content`) when `is_good_response`
    accepts the response. Returns None when the response is rejected, or
    when the request raises a RequestException (which is reported through
    `log_error` instead of being propagated).
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # the response is rejected or reading the body fails.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (compared case-insensitively). A response without
    a Content-Type header is treated as not HTML instead of raising KeyError.
    """
    # .get() so that a missing header yields None rather than an exception;
    # the None check must happen before .lower() is called.
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())

def log_error(e):
    """
    Report an error.

    The current implementation simply prints `e` to standard output;
    swap the body for a real logger if something sturdier is needed.
    """
    message = e
    print(message)

def lastpage(page):
    """
    Returns the number of the last results page found in `page`.

    `page` must expose `find_all` (e.g. a parsed BeautifulSoup document).
    The elements with class "page" are rendered to text and every
    standalone integer in that text is collected; the last one in document
    order is returned (not the maximum). Returns 0 when no integer is found.
    """
    pagination_markup = str(page.find_all(class_="page"))
    page_numbers = [int(s) for s in re.findall(r'\b\d+\b', pagination_markup)]
    if not page_numbers:
        # No pagination numbers on this page: fall back to 0 rather than
        # failing with IndexError on the [-1] lookup.
        return 0
    return page_numbers[-1]

def _strip_whitespace(text):
    """Return `text` with every character of `string.whitespace` removed."""
    return text.translate({ord(c): None for c in string.whitespace})


def _standalone_ints(text):
    """Return, as ints, every digit run in `text` delimited by word boundaries."""
    return [int(s) for s in re.findall(r'\b\d+\b', text)]


def scrappy(page):
    """
    Scrapes mileage, year and price of the offers on one listing page.

    `page` must expose `find_all` (e.g. a parsed BeautifulSoup document).

    Returns a DataFrame with the columns 'Milage', 'Age' and 'Price'. Its
    first row is an all-zero placeholder (kept for backward compatibility
    with existing callers); the scraped offers follow, one per row.

    Raises ValueError (from pandas) when the three scraped lists do not
    have the same length.
    """
    # Placeholder row that has always headed the returned frame.
    dataset = pd.DataFrame(data={'Milage': [0], 'Age': [0], 'Price': [0]})

    # Mileage: concatenate the markup of every mileage <li>, drop whitespace
    # so thousands separators vanish, then read each run of digits as one value.
    milage_markup = ''.join(map(str, page.find_all("li", {"data-code": "mileage"})))
    milage_page_out = [
        int(''.join(digits))
        for is_digit, digits in groupby(_strip_whitespace(milage_markup), str.isdigit)
        if is_digit
    ]

    # Year: only digit runs bounded by non-word characters are kept, so
    # numbers glued to letters once whitespace is removed are skipped.
    age_markup = str(page.find_all(class_="offer-item__params-item"))
    age_from_page_out = _standalone_ints(_strip_whitespace(age_markup))

    # Price: same extraction applied to the price elements.
    price_markup = str(page.find_all(class_="offer-price__number"))
    price_from_page_out = _standalone_ints(_strip_whitespace(price_markup))

    df = pd.DataFrame({
        'Milage': milage_page_out,
        'Age': age_from_page_out,
        'Price': price_from_page_out,
    })

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    return pd.concat([dataset, df], ignore_index=True)


#
#for i, li in enumerate(page.select('li')):  # This will come in handy for the voivodeships (regions)
#    print(i, li.text)


# GET DATA 1


#lastpage = str(page.find_all(class_="page"))
#lastpage_all = [int(s) for s in re.findall(r'\b\d+\b',lastpage)]
#lastpage_out = lastpage_all[-1]


#datadict = {'Milage':[0],'Age':[0],'Price':[0]}
#dataset_out = pd.DataFrame(data=datadict)
#for i in range(9,45):
#    time.sleep(2)
#    url = simple_get('https://www.otomoto.pl/osobowe/opel/corsa/?page={}'.format(i))
#    page = BeautifulSoup(url, 'html.parser')
#    print(scrappy(page))
#    dataset_out = dataset_out.append(scrappy(page), ignore_index=True)
#    print(dataset_out)
#
#

#dataset_out.to_csv('dataset1.csv')


# GET DATA 2

dataset_out = pd.read_csv('dataset1.csv')  # range noted by the author: 9-45


# Remove outliers. A row is discarded when its z-score (distance from the
# column mean, in units of the column standard deviation) exceeds 2 for
# Milage or Age, or 3 for Price. All z-scores are computed on the unfiltered
# data, and the three conditions are OR-ed into a single mask so that a row
# flagged by several columns is removed exactly once (dropping labels one at
# a time raises KeyError on the repeated label). This also avoids
# Series.append and Index.get_values, which current pandas no longer provides.
milage_z = (dataset_out.Milage - dataset_out.Milage.mean()) / dataset_out.Milage.std()
age_z = (dataset_out.Age - dataset_out.Age.mean()) / dataset_out.Age.std()
price_z = (dataset_out.Price - dataset_out.Price.mean()) / dataset_out.Price.std()

is_outlier = (milage_z.abs() > 2) | (age_z.abs() > 2) | (price_z.abs() > 3)
dataset_out = dataset_out[~is_outlier]


plt.scatter(x='Milage', y='Price', data=dataset_out, marker="*")
plt.show()