Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Wed Apr 24 23:35:43 2019
- @author: lancernik
- """
- from requests import get
- from requests.exceptions import RequestException
- from contextlib import closing
- from bs4 import BeautifulSoup
- import string
- import re
- from itertools import groupby
- import pandas as pd
- import time
- import matplotlib.pyplot as plt
def simple_get(url, timeout=10):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    `timeout` is the number of seconds passed to the request as its
    timeout; without one an unresponsive server would block forever.

    Returns the raw response body (`resp.content`) when
    `is_good_response` accepts the response, otherwise None. Any
    RequestException (including timeouts) is logged and also yields None.
    """
    try:
        # closing() guarantees the streamed connection is released.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A missing
    or empty Content-Type header yields False instead of raising.
    """
    # .get() avoids a KeyError when the server sends no Content-Type.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type
def log_error(e):
    """
    Report an error by printing `e` to standard output.

    This is the single place where errors are reported, so it can be
    swapped for a different reporting mechanism later.
    """
    print(e)
def lastpage(page):
    """
    Returns the last page number found in the pagination of `page`.

    `page` must provide `find_all(class_=...)` (e.g. a BeautifulSoup
    object). Every standalone integer in the markup of the elements
    with class "page" is collected, and the last one in document order
    is returned. Returns 0 when no number is found.
    """
    pager_html = str(page.find_all(class_="page"))
    page_numbers = [int(s) for s in re.findall(r'\b\d+\b', pager_html)]
    # Guard against pages without pagination instead of indexing blindly.
    return page_numbers[-1] if page_numbers else 0
def scrappy(page):
    """
    Extracts mileage, year and price figures from one listing page.

    `page` must provide `find_all` (e.g. a BeautifulSoup object).
    Returns a DataFrame with the columns 'Milage', 'Age' and 'Price'.
    Row 0 is an all-zero placeholder row (kept for compatibility with
    existing callers); the scraped values follow it.

    Raises ValueError (from pandas) if the three scraped lists do not
    have the same length.
    """
    # Placeholder row the scraped rows are appended to.
    dataset = pd.DataFrame(data={'Milage': [0], 'Age': [0], 'Price': [0]})
    no_whitespace = {ord(c): None for c in string.whitespace}

    # Mileage: strip whitespace so "120 000 km" becomes one digit run,
    # then turn every run of digits into an int.
    milage_html = ''.join(map(str, page.find_all("li", {"data-code": "mileage"})))
    milage_html = milage_html.translate(no_whitespace)
    milage_page_out = [
        int(''.join(digits))
        for is_digit, digits in groupby(milage_html, str.isdigit)
        if is_digit
    ]

    # Year: standalone integers inside the offer parameter items.
    age_html = str(page.find_all(class_="offer-item__params-item"))
    age_html = age_html.translate(no_whitespace)
    age_from_page_out = [int(s) for s in re.findall(r'\b\d+\b', age_html)]

    # Price: standalone integers inside the price elements.
    price_html = str(page.find_all(class_="offer-price__number"))
    price_html = price_html.translate(no_whitespace)
    price_from_page_out = [int(s) for s in re.findall(r'\b\d+\b', price_html)]

    df = pd.DataFrame(
        {'Milage': milage_page_out,
         'Age': age_from_page_out,
         'Price': price_from_page_out})
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([dataset, df], ignore_index=True)
- #
- #for i, li in enumerate(page.select('li')): #To się przyda do wojedzowtw
- # print(i, li.text)
- # GET DATA 1
- #lastpage = str(page.find_all(class_="page"))
- #lastpage_all = [int(s) for s in re.findall(r'\b\d+\b',lastpage)]
- #lastpage_out = lastpage_all[-1]
- #datadict = {'Milage':[0],'Age':[0],'Price':[0]}
- #dataset_out = pd.DataFrame(data=datadict)
- #for i in range(9,45):
- # time.sleep(2)
- # url = simple_get('https://www.otomoto.pl/osobowe/opel/corsa/?page={}'.format(i))
- # page = BeautifulSoup(url, 'html.parser')
- # print(scrappy(page))
- # dataset_out = dataset_out.append(scrappy(page), ignore_index=True)
- # print(dataset_out)
- #
- #
- #dataset_out.to_csv('dataset1.csv')
# GET DATA 2
dataset_out = pd.read_csv('dataset1.csv')  # 9-45

# Remove outliers: a row is dropped when its z-score exceeds the limit
# for any of the columns below. All z-scores are computed on the full
# dataset before anything is dropped.
z_limits = {'Milage': 2, 'Age': 2, 'Price': 3}
clear = pd.concat([
    dataset_out[column][
        ((dataset_out[column] - dataset_out[column].mean())
         / dataset_out[column].std()).abs() > limit
    ]
    for column, limit in z_limits.items()
])
# A row can be an outlier in several columns; de-duplicate its label so
# it is dropped exactly once (dropping it twice raised KeyError).
test = clear.index.unique().to_numpy()
dataset_out = dataset_out.drop(index=test)

plt.scatter(x='Milage', y='Price', data=dataset_out, marker="*")
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement