Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- from bs4 import BeautifulSoup
class LianjiaAreaSpider(scrapy.Spider):
    """Spider that extracts area entries from Lianjia second-hand listing pages.

    For every page it visits, it yields one dict per link found in the third
    ``div`` of the position filter, tagged with the code and name of the
    currently selected link in the second ``div``. It then follows the link
    that comes right after the selected one, so the selectable entries are
    walked one after another.
    """

    name = "lianjia_area"
    start_urls = [
        'https://gz.lianjia.com/ershoufang/tianhe/'
    ]

    def cleanElement(self, elist):
        """Return ``elist`` without ``None`` entries and blank text nodes.

        Args:
            elist: iterable of BeautifulSoup nodes, or ``None``.

        Returns:
            ``None`` when ``elist`` is ``None``; otherwise a new list with the
            remaining nodes in their original order.
        """
        if elist is None:
            return None
        resultList = []
        for item in elist:
            if item is None:
                continue
            text = item.string
            # Sibling lists contain whitespace-only text nodes ("\n", indent
            # runs) between tags; the previous comparison against 'n' never
            # matched them, so they leaked through to the caller.
            if text is not None and text.strip() == '':
                continue
            resultList.append(item)
        return resultList

    def parse(self, response):
        """Yield area dicts for the selected entry, then follow the next entry.

        Args:
            response: the Scrapy response for a listing page.

        Yields:
            Dicts with the keys ``code``, ``name``, ``citycode`` and
            ``cityname``, followed by at most one request for the next page.
        """
        soup = BeautifulSoup(response.body, "lxml")
        positiondiv = soup.find('div', class_='position')
        if positiondiv is None:
            self.logger.warning("No 'position' div found on %s", response.url)
            return

        # NOTE(review): index 3 presumably skips whitespace text nodes between
        # the child tags; confirm against the live markup.
        children = list(positiondiv.children)
        if len(children) < 4:
            self.logger.warning("Unexpected 'position' layout on %s", response.url)
            return
        positioncon = children[3]

        positionlist = positioncon.find_all("div")
        if len(positionlist) < 3:
            self.logger.warning("Expected at least 3 filter divs on %s", response.url)
            return

        citydiv = positionlist[1]
        currentCity = citydiv.find("a", class_="selected")
        if currentCity is None:
            self.logger.warning("No selected entry found on %s", response.url)
            return

        currentCityName = currentCity.string
        currentCityUrl = currentCity["href"]
        # The code is the third '/'-separated piece of the href.
        currentCityCode = currentCityUrl.split('/')[2]

        # href=True skips anchors without an href, which would raise KeyError.
        for area in positionlist[2].find_all("a", href=True):
            yield {
                'code': area['href'].split('/')[2],
                'name': area.string,
                'citycode': currentCityCode,
                'cityname': currentCityName
            }

        # Follow the first anchor after the selected one. Anything that is not
        # an <a> tag with an href (comments, stray text) is ignored instead of
        # being indexed with ['href'].
        for sibling in self.cleanElement(list(currentCity.next_siblings)):
            if getattr(sibling, 'name', None) != 'a':
                continue
            next_page = sibling.get('href')
            if next_page:
                yield response.follow(next_page, self.parse)
                break
- import sys
- import mysql.connector
class LianjiaPipeline(object):
    """Item pipeline that stores scraped area items in the ``house_area`` table."""

    # Item keys, in the same order as the columns of the INSERT statement.
    _ITEM_KEYS = ('code', 'name', 'citycode', 'cityname')

    def __init__(self):
        """Open the MySQL connection and a cursor used for all inserts."""
        print('init')
        # NOTE(review): credentials and database name are hard-coded; consider
        # reading them from the project settings instead.
        self.conn = mysql.connector.connect(user='root', database='mysql', password='root')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item into ``house_area`` and pass it on unchanged.

        Args:
            item: mapping with the keys ``code``, ``name``, ``citycode`` and
                ``cityname``; missing or ``None`` values are stored as ''.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, also when the insert fails; database errors are
            printed rather than raised.
        """
        print('process')
        # Values are passed as text and left to the connector to encode;
        # `or ''` also covers keys that are present but hold None.
        params = tuple(item.get(key) or '' for key in self._ITEM_KEYS)
        try:
            # One placeholder per column: the statement lists four columns.
            self.cursor.execute(
                """INSERT INTO house_area (area_code, area_name,city_code,city_name)
                VALUES (%s, %s, %s, %s)""",
                params)
            self.conn.commit()
        except mysql.connector.Error as e:
            print("Error %s: %s" % (e.errno, e.msg))
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        print('close')
        self.cursor.close()
        self.conn.close()
# Scrapy setting that enables LianjiaPipeline; Scrapy uses the integer value
# to order pipelines when several are enabled.
ITEM_PIPELINES = {
    'lianjia.pipelines.LianjiaPipeline': 300,
}
Add Comment
Please, Sign In to add comment