Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Traceback (most recent call last):
-   File "D:\python\final2822016.py", line 132, in <module>
-     connection.commit()
-   File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pymysql\connections.py", line 758, in commit
-     self._read_ok_packet()
-   File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pymysql\connections.py", line 737, in _read_ok_packet
-     pkt = self._read_packet()
-   File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pymysql\connections.py", line 946, in _read_packet
-     packet_header = self._read_bytes(4)
-   File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pymysql\connections.py", line 982, in _read_bytes
-     2013, "Lost connection to MySQL server during query")
- pymysql.err.OperationalError: (2013, 'Lost connection to MySQL server during query')
- from bs4 import BeautifulSoup
- import urllib.request
- import re
- import json
- import pymysql
- import pymysql.cursors
# Canton Fair exhibitor/product scraper.
#
# Walks the category tree at i.cantonfair.org.cn three levels deep, then for
# every company page stores the contact details in `company` and each listed
# product in `products` via pymysql.
#
# SECURITY NOTE(review): database credentials are hard-coded below; move them
# to environment variables or a config file before sharing this script.
connection = pymysql.connect(
    host='198.46.81.14',
    user='kanton5_scrape',
    password='Heineken4291',
    db='kanton5_scrape',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)

# Site root; every scraped href is relative to this.
BASE_URL = "http://i.cantonfair.org.cn/en/"

# BUGFIX: the original patterns left "?" and "." unescaped
# (r"expexhibitorlist.aspx?categoryno=..."), which makes the "x" of "aspx"
# optional and never matches the literal "?" of the query string, so the
# href filters could not match the intended links.  The source used both
# "expexhibitorlist" and "ExpExhibitorList" spellings, hence IGNORECASE.
CATEGORY_RE = re.compile(r"expexhibitorlist\.aspx\?categoryno=[0-9]+", re.IGNORECASE)
COMPANY_RE = re.compile(r"expCompany\.aspx\?corpid=[0-9]+")
# The separator between the two query parameters is kept as a wildcard "."
# (it is "&" in a raw URL but may appear as "&amp;" in the markup).
PRODUCT_RE = re.compile(r"ExpProduct\.aspx\?corpid=[0-9]+.categoryno=[0-9]+")


def get_soup(url):
    """Fetch *url* and return the parsed BeautifulSoup document."""
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, "html.parser")


def extract_links(soup, pattern):
    """Return absolute URLs for every <a> in *soup* whose href matches *pattern*."""
    anchors = soup.find_all("a", href=pattern)
    return [BASE_URL + a["href"] for a in anchors]


def execute_insert(sql, params):
    """Run one parameterized INSERT and commit it.

    BUGFIX: ``connection.ping(reconnect=True)`` re-establishes the connection
    if the server dropped it while the scraper was busy fetching pages —
    this is exactly the pymysql error 2013 ("Lost connection to MySQL server
    during query") raised from ``connection.commit()`` in the pasted traceback.
    """
    connection.ping(reconnect=True)
    with connection.cursor() as cursor:
        cursor.execute(sql, params)
    connection.commit()


def parse_company(contact_div):
    """Extract (name, address, city, postalcode, website) from a contact <div>.

    Fields beyond the first three are optional on the site; missing entries
    come back as the string 'null' (matching the original script's sentinel).
    """
    lis = contact_div.find_all('li')
    companyname = lis[0].get_text().strip()
    companyaddress = lis[1].get_text().strip()
    companycity = lis[2].get_text().strip()
    # BUGFIX: the original bare `except:` handler re-ran the very expression
    # that raised (lis[3]...), so a short list crashed with IndexError anyway.
    try:
        companypostalcode = lis[3].get_text().strip().replace(",", "")
    except IndexError:
        companypostalcode = 'null'
    try:
        companywebsite = lis[4].get_text().strip()
        # NOTE(review): the pasted literal "xEFxBCx8Cifl..." lost its
        # backslashes; \xEF\xBC\x8C is the UTF-8 byte sequence of the
        # fullwidth comma "，" — confirm against live site output.
        companywebsite = companywebsite.replace("\xEF\xBC\x8Cifl...", "")
    except IndexError:
        companywebsite = 'null'
    return companyname, companyaddress, companycity, companypostalcode, companywebsite


def main():
    """Crawl category → subcategory → sub-subcategory → company → products."""
    root_soup = get_soup('http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware')
    for category_url in extract_links(root_soup, CATEGORY_RE):
        category_soup = get_soup(category_url)
        for subcategory_url in extract_links(category_soup, CATEGORY_RE):
            subcategory_soup = get_soup(subcategory_url)
            for listing_url in extract_links(subcategory_soup, CATEGORY_RE):
                listing_soup = get_soup(listing_url)
                # De-duplicate company links once, before fetching each page.
                company_urls = list(set(extract_links(listing_soup, COMPANY_RE)))
                for company_url in company_urls:
                    print(company_url)
                    company_soup = get_soup(company_url)
                    for contact in company_soup.find_all("div", id="contact"):
                        (companyname, companyaddress, companycity,
                         companypostalcode, companywebsite) = parse_company(contact)
                        # BUGFIX: the original try/finally had no except, so
                        # "Company Data saved" printed even when the INSERT
                        # failed and the error aborted the whole crawl.
                        try:
                            print('saving company details to db')
                            execute_insert(
                                """INSERT INTO company(
                                companyname,address,city,pincode,website)
                                VALUES (%s, %s, %s, %s, %s)""",
                                (companyname, companyaddress, companycity,
                                 companypostalcode, companywebsite))
                            print("Company Data saved")
                        except pymysql.MySQLError as exc:
                            print("Company insert failed:", exc)
                            continue
                        for product_url in set(extract_links(company_soup, PRODUCT_RE)):
                            print(product_url)
                            product_soup = get_soup(product_url)
                            for element in product_soup.select('div[class="photolist"] li a'):
                                print("====================Product Name=======================")
                                productnames = element.get_text().strip()
                                print(productnames)
                                try:
                                    print('saving products to db')
                                    execute_insert(
                                        """INSERT INTO products(
                                        companyname,products)
                                        VALUES (%s, %s)""",
                                        (companyname, productnames))
                                    print("Products Data Saved")
                                except pymysql.MySQLError as exc:
                                    print("Product insert failed:", exc)
                        print("===================UshaAman======================")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement