Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Sun Apr 28 15:50:16 2019
- @author: lancernik
- """
- from __future__ import division, unicode_literals
- from requests import get
- from requests.exceptions import RequestException
- from contextlib import closing
- from bs4 import BeautifulSoup
- import string
- import re
- from itertools import groupby
- import pandas as pd
- import time
- import matplotlib.pyplot as plt
- import numpy as np
- from scipy.stats import gaussian_kde
- from scipy.stats import kde
- import seaborn as sns
- from sklearn.linear_model import LinearRegression
- from sklearn.model_selection import train_test_split
- import matplotlib.pyplot as plt
- import codecs
- import requests
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- def ScrapPage(marka,model,pg_num):
- #DriverSet
- dr = webdriver.Chrome(executable_path=r'C:\Users\lancernik\Desktop\Programowanie\Projekty\CarScrapper\chromedriver.exe')
- dr.get("https://www.mobile.de/pl/samochod/{}-{}/vhc:car,pgn:{},pgs:50,ms1:19000_10_".format(marka,model,pg_num))
- bs = BeautifulSoup(dr.page_source,'html.parser')
- dr.quit()
- #Scrapuje date produkcji
- #Stage 2 - Usuwa spacje z tekstu
- #Stage 3 - Usuwa niewidzialne spacje z tekstu
- #Stage 4 - Zmienia wiek z niczego na 2019 rok dla nowych
- #Stage 5 - Znajduje wszystkie wartosci liczbowe
- age_stage1 = str(bs.find_all(class_="u-text-bold"))
- age_stage2 = age_stage1.translate({ord(c): None for c in string.whitespace})
- age_stage3 = re.sub(r"\s+", '', age_stage2)
- age_stage4 = re.sub(r'>0km<', '05/2019,1km,', age_stage3)
- age_stage5 = [int(s) for s in re.findall(r'\b\d+\b',age_stage4)]
- #Stage 6 - Usuwa numer miesiaca z daty
- age_from_page = [i for i in age_stage5 if i >= 1700 and i<= 2020]
- #Scrapuje przebieg
- #Stage 1 - usuwanie dekodowania, bez tego nie mozna sub'owac
- #Stage 2 - Podmiana 0 km, aby nowe samochody byly w datasecie
- #Stage 3 - usuwanie wyszukujemy wartosci miedzy kilometrami
- mileage_stage1 = str(bs.decode('UTF-8'))
- mileage_stage2 = re.sub(r' 0 km', '05/2019,1km,', mileage_stage1)
- mileage_stage3 = [s for s in re.findall(",(.*)km", mileage_stage2)]
- #Stage 4 - usuwnie spalania w litrach na 100 km
- ztemp=[]
- for i in mileage_stage3:
- ztemp.append(re.sub(r"\s+", '', i))
- mileage_from_page = [int(i) for i in ztemp if i.isdigit()]
- #Scrapuje cene
- #Stage 2 - usuwa spacje
- #Stage 3 - usuwa niewidzialne spacje ( )
- #Stage 4-8 usuwa zbedna tresc
- #Stage 9 - Tworzy liste
- price_stage1 = str(bs.find_all(class_="seller-currency u-text-bold"))
- price_stage2 = price_stage1.translate({ord(c): None for c in string.whitespace})
- price_stage3 = re.sub(" ",'',price_stage2)
- price_stage4 = re.sub('<pclass="seller-currencyu-text-bold">','',price_stage3)
- price_stage5 = re.sub('</p>','',price_stage4)
- price_stage6 = price_stage5.replace('[', "")
- price_stage7 = price_stage6.replace(']', "")
- price_stage8 = price_stage7.replace('EUR(brutto)', "")
- price_stage9 = price_stage8.split (",")
- #Stage 10 - mnozy przez wartosc euro
- price_from_page = [round(int(x)*4.28) for x in price_stage9]
- #Tworzy dict z poszczegolnej strony
- datadict = {'Marka':'Marka','Model':'Model','Milage':[0],'Age':[0],'Price':[0]}
- dataset = pd.DataFrame(data=datadict)
- marka_out=["{}".format(marka)] * len(age_from_page)
- model_out=["{}".format(model)] * len(age_from_page)
- if len(mileage_from_page) == len(age_from_page) == len(price_from_page) ==len(model_out) == len(marka_out):
- df = pd.DataFrame(
- {'Milage':mileage_from_page,
- 'Age': age_from_page,
- 'Price': price_from_page,
- 'Marka':marka_out,
- 'Model':model_out})
- dataset = dataset.append(df,ignore_index=True)
- print(dataset)
- return dataset
- def ScrapModel(marka,model,start,stop):
- datadict = {'Milage':[0],'Age':[0],'Price':[0]}
- dataset_out = pd.DataFrame(data=datadict)
- for i in range(start,stop):
- TempData = ScrapPage(model,marka,i)
- dataset_out = dataset_out.append(TempData,ignore_index=True)
- dataset_out.to_csv('{}-{}-mobile.csv'.format(marka,model))
- return dataset_out
- test = ScrapModel("opel","corsa",1,30)
- print(test)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement