Advertisement
lancernik

AutoDeV1

May 3rd, 2019
41
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.51 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Sun Apr 28 15:50:16 2019
  4.  
  5. @author: lancernik
  6. """
  7.  
  8. from __future__ import division, unicode_literals
  9. from requests import get
  10. from requests.exceptions import RequestException
  11. from contextlib import closing
  12. from bs4 import BeautifulSoup
  13. import string
  14. import re
  15. from itertools import groupby
  16. import pandas as pd
  17. import time
  18. import matplotlib.pyplot as plt
  19. import numpy as np
  20. from scipy.stats import gaussian_kde
  21. from scipy.stats import kde
  22. import seaborn as sns
  23. from sklearn.linear_model import LinearRegression
  24. from sklearn.model_selection import train_test_split
  25. import matplotlib.pyplot as plt
  26. import codecs
  27. import requests
  28.  
  29.  
  30.  
  31.  
  32.  
  33.  
  34.  
  35. from bs4 import BeautifulSoup
  36. from selenium import webdriver
  37. from selenium.webdriver.chrome.options import Options
  38.  
  39.  
  40.  
  41.  
  42.  
  43. def ScrapPage(marka,model,pg_num):
  44.  
  45. #DriverSet
  46.  
  47. dr = webdriver.Chrome(executable_path=r'C:\Users\lancernik\Desktop\Programowanie\Projekty\CarScrapper\chromedriver.exe')
  48. dr.get("https://www.mobile.de/pl/samochod/{}-{}/vhc:car,pgn:{},pgs:50,ms1:19000_10_".format(marka,model,pg_num))
  49. bs = BeautifulSoup(dr.page_source,'html.parser')
  50. dr.quit()
  51.  
  52.  
  53. #Scrapuje date produkcji
  54.  
  55.  
  56. #Stage 2 - Usuwa spacje z tekstu
  57. #Stage 3 - Usuwa niewidzialne spacje z tekstu
  58. #Stage 4 - Zmienia wiek z niczego na 2019 rok dla nowych
  59. #Stage 5 - Znajduje wszystkie wartosci liczbowe
  60. age_stage1 = str(bs.find_all(class_="u-text-bold"))
  61. age_stage2 = age_stage1.translate({ord(c): None for c in string.whitespace})
  62. age_stage3 = re.sub(r"\s+", '', age_stage2)
  63. age_stage4 = re.sub(r'>0km<', '05/2019,1km,', age_stage3)
  64. age_stage5 = [int(s) for s in re.findall(r'\b\d+\b',age_stage4)]
  65. #Stage 6 - Usuwa numer miesiaca z daty
  66. age_from_page = [i for i in age_stage5 if i >= 1700 and i<= 2020]
  67.  
  68.  
  69.  
  70.  
  71. #Scrapuje przebieg
  72.  
  73.  
  74. #Stage 1 - usuwanie dekodowania, bez tego nie mozna sub'owac
  75. #Stage 2 - Podmiana 0 km, aby nowe samochody byly w datasecie
  76. #Stage 3 - usuwanie wyszukujemy wartosci miedzy kilometrami
  77. mileage_stage1 = str(bs.decode('UTF-8'))
  78. mileage_stage2 = re.sub(r' 0 km', '05/2019,1km,', mileage_stage1)
  79. mileage_stage3 = [s for s in re.findall(",(.*)km", mileage_stage2)]
  80. #Stage 4 - usuwnie spalania w litrach na 100 km
  81. ztemp=[]
  82. for i in mileage_stage3:
  83. ztemp.append(re.sub(r"\s+", '', i))
  84. mileage_from_page = [int(i) for i in ztemp if i.isdigit()]
  85.  
  86.  
  87.  
  88. #Scrapuje cene
  89.  
  90.  
  91. #Stage 2 - usuwa spacje
  92. #Stage 3 - usuwa niewidzialne spacje (&nbsp;)
  93. #Stage 4-8 usuwa zbedna tresc
  94. #Stage 9 - Tworzy liste
  95. price_stage1 = str(bs.find_all(class_="seller-currency u-text-bold"))
  96. price_stage2 = price_stage1.translate({ord(c): None for c in string.whitespace})
  97. price_stage3 = re.sub(" ",'',price_stage2)
  98. price_stage4 = re.sub('<pclass="seller-currencyu-text-bold">','',price_stage3)
  99. price_stage5 = re.sub('</p>','',price_stage4)
  100. price_stage6 = price_stage5.replace('[', "")
  101. price_stage7 = price_stage6.replace(']', "")
  102. price_stage8 = price_stage7.replace('EUR(brutto)', "")
  103. price_stage9 = price_stage8.split (",")
  104. #Stage 10 - mnozy przez wartosc euro
  105. price_from_page = [round(int(x)*4.28) for x in price_stage9]
  106.  
  107.  
  108.  
  109.  
  110. #Tworzy dict z poszczegolnej strony
  111.  
  112. datadict = {'Marka':'Marka','Model':'Model','Milage':[0],'Age':[0],'Price':[0]}
  113. dataset = pd.DataFrame(data=datadict)
  114. marka_out=["{}".format(marka)] * len(age_from_page)
  115. model_out=["{}".format(model)] * len(age_from_page)
  116.  
  117. if len(mileage_from_page) == len(age_from_page) == len(price_from_page) ==len(model_out) == len(marka_out):
  118. df = pd.DataFrame(
  119. {'Milage':mileage_from_page,
  120. 'Age': age_from_page,
  121. 'Price': price_from_page,
  122. 'Marka':marka_out,
  123. 'Model':model_out})
  124.  
  125. dataset = dataset.append(df,ignore_index=True)
  126. print(dataset)
  127. return dataset
  128.  
  129. def ScrapModel(marka,model,start,stop):
  130.  
  131. datadict = {'Milage':[0],'Age':[0],'Price':[0]}
  132. dataset_out = pd.DataFrame(data=datadict)
  133.  
  134. for i in range(start,stop):
  135. TempData = ScrapPage(model,marka,i)
  136. dataset_out = dataset_out.append(TempData,ignore_index=True)
  137.  
  138. dataset_out.to_csv('{}-{}-mobile.csv'.format(marka,model))
  139. return dataset_out
  140.  
  141.  
  142.  
  143. test = ScrapModel("opel","corsa",1,30)
  144. print(test)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement