Guest User

Untitled

a guest
Dec 12th, 2017
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.26 KB | None | 0 0
  1. import scrapy
  2. from bs4 import BeautifulSoup
  class LianjiaAreaSpider(scrapy.Spider):
      """Spider that collects area links from Lianjia second-hand housing pages.

      For every page it parses, the spider yields one dict per link found in
      the third ``div`` of the position filter, tagged with the code and name
      of the currently selected link from the second ``div``. It then follows
      the link that comes right after the selected one, so the crawl walks
      the selectable entries one by one.
      """

      name = "lianjia_area"
      start_urls = [
          'https://gz.lianjia.com/ershoufang/tianhe/'
      ]

      def cleanElement(self, elist):
          """Return the nodes of *elist* that are not ``None`` or blank text.

          Sibling lists produced by BeautifulSoup interleave tags with
          whitespace-only text nodes (the line breaks between tags); those are
          dropped here so callers only see meaningful nodes.

          :param elist: iterable of BeautifulSoup nodes, or ``None``.
          :returns: a new list with the kept nodes, or ``None`` when *elist*
              is ``None``.
          """
          if elist is None:
              return None
          resultList = []
          for item in elist:
              if item is None:
                  continue
              text = item.string
              # ``.string`` is None for tags without a single text child; such
              # tags are kept. Only empty / whitespace-only text is skipped.
              if text is not None and text.strip() == '':
                  continue
              resultList.append(item)
          return resultList

      def parse(self, response):
          """Yield one item per area link, then follow the next selectable link.

          :param response: the Scrapy response for a listing page.
          :returns: a generator of item dicts (``code``, ``name``,
              ``citycode``, ``cityname``) followed by at most one request for
              the next page.
          """
          soup = BeautifulSoup(response.body, "lxml")
          positiondiv = soup.find('div', class_='position')
          if positiondiv is None:
              # Page layout differs from what this spider expects.
              self.logger.warning("No 'position' div found on %s", response.url)
              return

          # NOTE(review): the filter container is picked by position (fourth
          # child, counting whitespace text nodes) -- presumably tied to the
          # current page markup; confirm if the site layout changes.
          positioncon = list(positiondiv.children)[3]
          positionlist = positioncon.find_all("div")
          citydiv = positionlist[1]
          currentCity = citydiv.find("a", class_="selected")
          if currentCity is None:
              self.logger.warning("No selected link found on %s", response.url)
              return

          currentCityName = currentCity.string
          # The code is the third component of an href such as
          # '/ershoufang/tianhe/' (the leading slash yields an empty first part).
          currentCityCode = currentCity["href"].split('/')[2]

          for area in positionlist[2].find_all("a"):
              yield {
                  'code': area['href'].split('/')[2],
                  'name': area.string,
                  'citycode': currentCityCode,
                  'cityname': currentCityName
              }

          # Look specifically for the next <a> sibling so whitespace text nodes
          # between the links can never be mistaken for the next page.
          next_link = currentCity.find_next_sibling("a")
          if next_link is not None and next_link.get('href'):
              yield response.follow(next_link['href'], self.parse)
  46.  
  47. import sys
  48. import mysql.connector
  49.  
  class LianjiaPipeline(object):
      """Scrapy item pipeline that inserts scraped areas into ``house_area``.

      A single MySQL connection and cursor are opened when the pipeline is
      created and closed again in ``close_spider``.
      """

      def __init__(self):
          print('init')
          # NOTE(review): credentials and database name are hard-coded here;
          # consider reading them from the Scrapy settings instead.
          self.conn = mysql.connector.connect(user='root', database='mysql', password='root')
          self.cursor = self.conn.cursor()

      def process_item(self, item, spider):
          """Insert *item* as one row of ``house_area`` and return it unchanged.

          Database errors are reported and swallowed so that a single bad row
          does not stop the crawl.

          :param item: mapping with the keys ``code``, ``name``, ``citycode``
              and ``cityname``; missing or ``None`` values are stored as ''.
          :param spider: the spider that produced the item (unused).
          :returns: the same *item*, for the next pipeline stage.
          """
          print('process')
          # Text values are passed as-is; the connector is expected to encode
          # them with the connection character set.
          params = (
              item.get('code') or '',
              item.get('name') or '',
              item.get('citycode') or '',
              # The spider emits 'cityname' (no underscore).
              item.get('cityname') or '',
          )
          try:
              # One placeholder per column: four columns, four values.
              self.cursor.execute(
                  """INSERT INTO house_area (area_code, area_name,city_code,city_name)
                  VALUES (%s, %s, %s, %s)""",
                  params)
              self.conn.commit()
          except mysql.connector.Error as e:
              print("Error %d: %s" % (e.errno, e.msg))
          return item

      def close_spider(self, spider):
          """Release the cursor and the connection when the spider finishes."""
          print('close')
          self.cursor.close()
          self.conn.close()
  70.  
  # Scrapy setting: enabled item pipelines mapped to their order value.
  ITEM_PIPELINES = {
      'lianjia.pipelines.LianjiaPipeline': 300,
  }
Add Comment
Please, Sign In to add comment