Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re
import time
import urllib
import urllib.parse
import urllib.request
from urllib.parse import urlparse

import mysql.connector
import pymysql.cursors
import requests
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
# Scraper for the bed-linen catalogue of art-dtex.ru.
#
# Flow: open the catalogue root, follow every category link, descend into
# sub-categories when the category page has them, walk every listing page,
# open every product, save its picture to disk and store its name together
# with the rows of its size table in the MySQL table `postelki`.
#
# NOTE(review): the original paste had lost all indentation; the nesting
# below is reconstructed from the statements themselves - confirm against
# the author's copy if one is available.

BASE_URL = 'https://art-dtex.ru'
CATALOG_URL = "https://art-dtex.ru/catalog/komplekty_postelnogo_belya/"

# Raw strings: the original literals contained "\s", "\w" and "\P", which
# are invalid escape sequences (the resulting path values are unchanged).
CHROMEDRIVER_PATH = r'G:\selenium\chromedriver'
WORKFILE_PATH = r'C:\workfile.csv'
PICTURE_PATH_TEMPLATE = r"C:\Postelki\%s.jpg"

# Seconds to wait after each navigation before reading the page.
PAGE_LOAD_DELAY = 3

SECTION_SELECTOR = 'div.b-catalog-section.bx_catalog_line'
SUBCATEGORY_MARKER_XPATH = '/html/body/section/div/div/div[2]/div/div[3]/div[2]/div[2]/ul/li[1]'
PAGINATION_XPATH = "/html/body/section/div/div/div[2]/div/div[3]/div[2]/div[2]/div[4]/div"
PRODUCT_CARD_CLASS = 'b-catalog-items-el'
PRODUCT_NAME_XPATH = '/html/body/section/div/div/div[2]/div/div/div[1]/div[2]/div[1]/div'
PRODUCT_IMAGE_SELECTOR = 'div.medium-5.small-12.columns.product__img'
ARTICLE_SELECTOR = 'td.sizes-td-art'
SIZE_SELECTOR = 'td.sizes-td-size'
KIT_SELECTOR = 'td.sizes-td-kompl'

# Only anchors/images whose tag consists of exactly this one attribute match.
LINK_RE = re.compile(r'<a href="([^"]*)">')
IMAGE_RE = re.compile(r'<img src="([^"]*)">')

# NOTE(review): credentials are hard-coded exactly as in the original script;
# they should be moved to configuration or the environment.
DB_SETTINGS = dict(host='localhost', user='root', password='njkmrjnfr', db='new')

INSERT_SIZE_ROW_SQL = "INSERT INTO postelki (Наименование, Артикул, Размер, Комплектация) VALUES (%s, %s, %s, %s)"
INSERT_NO_SIZES_SQL = "INSERT INTO postelki (Наименование, Артикул) VALUES (%s, %s)"
# Stored in the article column when a product page has no size table.
NO_SIZES_MARKER = 'Скоро в продаже'


def extract_links(fragments):
    """Return the site-relative hrefs found in *fragments*.

    *fragments* is an iterable of HTML strings.  Only hrefs starting with
    "/" are kept, because callers prepend BASE_URL to every result.
    Duplicates are dropped while the order of first appearance is kept, so
    a page linked twice is visited once.
    """
    hrefs = []
    for fragment in fragments:
        hrefs.extend(LINK_RE.findall(fragment))
    return list(dict.fromkeys(href for href in hrefs if href.startswith("/")))


def last_page_number(pagination_text):
    """Return the page count encoded in the text of a pagination block.

    The count is the last whitespace-separated token that is a decimal
    number.  Text without any number yields 1.
    """
    numbers = [token for token in pagination_text.split() if token.isdecimal()]
    return int(numbers[-1]) if numbers else 1


def safe_filename(name):
    """Replace characters that Windows forbids in file names with "_"."""
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)


def build_size_rows(name, articles, sizes, kits):
    """Return one (name, article, size, kit) tuple per entry of *articles*.

    *sizes* and *kits* are matched to *articles* by position; a missing
    value becomes None instead of raising IndexError.
    """
    def cell(values, index):
        return values[index] if index < len(values) else None

    return [
        (name, article, cell(sizes, index), cell(kits, index))
        for index, article in enumerate(articles)
    ]


def open_page(browser, url):
    """Navigate to *url* and wait PAGE_LOAD_DELAY seconds."""
    browser.get(url)
    time.sleep(PAGE_LOAD_DELAY)


def section_snapshots(browser):
    """Return (text, innerHTML) for every catalogue section on the page.

    The values are copied out immediately: a WebElement must not be used
    after the browser has navigated to another page.
    """
    return [
        (section.text, section.get_attribute('innerHTML'))
        for section in browser.find_elements_by_css_selector(SECTION_SELECTOR)
    ]


def element_texts(browser, css_selector):
    """Return the text of every element matching *css_selector*."""
    return [el.text for el in browser.find_elements_by_css_selector(css_selector)]


def save_picture(src, name):
    """Download the image at *src* into the pictures folder, named after *name*."""
    # urljoin leaves an absolute URL untouched and resolves a relative one.
    url = urllib.parse.urljoin(BASE_URL, src)
    with urllib.request.urlopen(url) as response:
        data = response.read()
    with open(PICTURE_PATH_TEMPLATE % safe_filename(name), "wb") as out:
        out.write(data)


def store_product(connection, name, articles, sizes, kits):
    """Insert the product into `postelki` and commit.

    One row per article is written when the product has a size table;
    otherwise a single row carrying NO_SIZES_MARKER is written.
    """
    cursor = connection.cursor()
    try:
        if articles:
            for row in build_size_rows(name, articles, sizes, kits):
                cursor.execute(INSERT_SIZE_ROW_SQL, row)
        else:
            cursor.execute(INSERT_NO_SIZES_SQL, (name, NO_SIZES_MARKER))
        connection.commit()
    finally:
        cursor.close()


def scrape_product(browser, connection, product_url):
    """Open one product page, save its picture and store its data.

    The product is skipped when no name or no matching picture is found.
    """
    open_page(browser, product_url)

    names = [el.text for el in browser.find_elements_by_xpath(PRODUCT_NAME_XPATH)]
    if not names:
        print("No product name found, skipping %s" % product_url)
        return
    # The same name is used for the picture file and for the database rows.
    name = names[0]

    containers = browser.find_elements_by_css_selector(PRODUCT_IMAGE_SELECTOR)
    image = IMAGE_RE.search(containers[0].get_attribute('innerHTML')) if containers else None
    if image is None:
        # As in the original script, a product without a picture is not stored.
        print("No picture found, skipping %s" % product_url)
        return
    save_picture(image.group(1), name)

    store_product(
        connection,
        name,
        element_texts(browser, ARTICLE_SELECTOR),
        element_texts(browser, SIZE_SELECTOR),
        element_texts(browser, KIT_SELECTOR),
    )


def scrape_listing(browser, connection, listing_url):
    """Scrape every page of the product listing at *listing_url*.

    The browser must already be showing *listing_url*: the page count is
    read from the pagination block of the current page.  A listing without
    a pagination block is treated as a single page.
    """
    pagination = browser.find_elements_by_xpath(PAGINATION_XPATH)
    page_count = last_page_number(pagination[0].text) if pagination else 1

    for page in range(1, page_count + 1):
        open_page(browser, listing_url + "?PAGEN_1=%s" % page)
        # Copy the card HTML before navigating away to the products.
        cards_html = [
            card.get_attribute('innerHTML')
            for card in browser.find_elements_by_class_name(PRODUCT_CARD_CLASS)
        ]
        for link in extract_links(cards_html):
            print(link)
            scrape_product(browser, connection, BASE_URL + link)


def scrape_category(browser, connection, category_url):
    """Scrape a category, going through its sub-categories when it has any."""
    open_page(browser, category_url)

    if browser.find_elements_by_xpath(SUBCATEGORY_MARKER_XPATH):
        sections_html = [html for _text, html in section_snapshots(browser)]
        for link in extract_links(sections_html):
            subcategory_url = BASE_URL + link
            open_page(browser, subcategory_url)
            scrape_listing(browser, connection, subcategory_url)
    else:
        print("Эта страница особенная")
        # The category page is itself the listing (the original used the
        # sub-category URL variable here, which is not set in this branch).
        scrape_listing(browser, connection, category_url)


def scrape_catalog(browser, connection):
    """Log every section of the catalogue root and scrape its categories."""
    browser.get(CATALOG_URL)
    for text, html in section_snapshots(browser):
        with open(WORKFILE_PATH, 'a') as f:
            f.write("{}\n".format(text))
        for link in extract_links([html]):
            scrape_category(browser, connection, BASE_URL + link)


def main():
    """Run the scraper with a headless Chrome and one MySQL connection."""
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=options)
    try:
        connection = mysql.connector.connect(**DB_SETTINGS)
        try:
            scrape_catalog(browser, connection)
        finally:
            connection.close()
    finally:
        # Always shut the browser down, even when scraping fails.
        browser.quit()


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement