Advertisement
Guest User

Python craigslist scraper

a guest
Jul 16th, 2020
101
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.98 KB | None | 0 0
  1. #!/usr/bin/python3
  2.  
  3. # Import libraries
  4. from mechanicalsoup import StatefulBrowser
  5. import smtplib
  6. from os.path import expanduser
  7.  
  8. directory = expanduser("~")+'/craigslist-scraper/'
  9.  
  10. # Set message data
  11.  
  12. # set email address
  13. name = 'Your Name'
  14. sender = 'your@email.address'
  15.  
  16. receivers = [sender]
  17. sender_full = 'Craigslistscraper <'+sender+'>'
  18. receiver_full = name+' <'+sender+'>'
  19. subject = 'New posts'
  20.  
  21. # set start of message start
  22. message = """From: {}
  23. To: {}
  24. Subject: {}""".format(sender_full, receiver_full, subject)
  25.  
  26. # Try to read in list of post IDs
  27. try:
  28.     post_list = open(directory+'/post_list.txt').read().split('\n')
  29. # If not set to empty list
  30. except FileNotFoundError:
  31.     post_list = []
  32.  
  33. # URL for seearch for bikes within 7.6 miles of 10005
  34. # with a maximum price of $100
  35. url = 'https://newyork.craigslist.org/d/bicycles/search/bia?postal=10005&search_distance=7.6&max_price=100'
  36.  
  37. # Create browser objects
  38. br = StatefulBrowser()
  39. br_post = StatefulBrowser()
  40.  
  41. # Open search URL and get page
  42. br.open(url)
  43. soup =  br.get_current_page()
  44.  
  45. # Loop through posts and append new posts to message
  46. new_posts = 0
  47. for link in soup.find_all('a'):
  48.     href = link.get('href')
  49.     if 'bik/d' in href:
  50.         id = href.split('/')[-1].split('.')[0]
  51.         if id not in post_list:
  52.             new_posts += 1
  53.             br_post.open(href)
  54.             post_soup = br_post.get_current_page()
  55.             title = post_soup.title.text
  56.             post = post_soup.find(id='postingbody').text.split('\n')[-1]
  57.             message = message+"\n\n\n{}\n\n{}\n\n{}".format(title, post, href)
  58.             post_list.append(id)
  59.  
  60. # Send message if there are new posts
  61. smtpObj = smtplib.SMTP('localhost')
  62. if new_posts>0:
  63.     smtpObj.sendmail(sender, receivers, message.encode('utf-8'))
  64.     print('{} new posts sent'.format(new_posts))
  65.     # Write updated list of posts to file
  66.     open(directory+'post_list.txt','w').write('\n'.join(post_list[-200:]))
  67. else:
  68.     print('No new posts')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement