Advertisement
Guest User

webcrawler

a guest
Mar 29th, 2015
300
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.20 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. import urllib2
  3.  
  4. import os
  5.  
  6. #request url
  7. resp = urllib2.urlopen("https://blockchain.info/blocks")
  8. soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))
  9.  
  10. # empty list to build urls
  11. urlList = []
  12. #list of times
  13. timeList = []
  14. #block #
  15. blockList = []
  16. #relay
  17. relaylist =[]
  18.  
  19. #loop through all links containing a link
  20. for link in soup.find_all('a', href=True):
  21.     #links containing block-index are the hash links
  22.     if 'block-index' in link['href']:
  23.  
  24.         #build url list of all blocks
  25.         urlList.append("https://blockchain.info"+link['href'])
  26.  
  27.  
  28. #print all the links onto console
  29. for i in urlList:
  30.    
  31.     #prink link
  32.     print "Processing url: " + i + "\n"
  33.  
  34.    
  35.     #process data
  36.     resp = urllib2.urlopen(i)
  37.     soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))
  38.  
  39.     # NEED DIFFERENT QUERIES TO SEARCH FOR SPECIFIC INFO
  40.     for link in soup.find_all('th', href=False):
  41.        
  42.         print "\n"
  43.         #split data between spaces
  44.         print link.getText().split(" ")
  45.  
  46.     #REMOVE THIS
  47.     #USED TO PAUSE THE INFO SO U CAN GET A BETTER VISUAL
  48.     raw_input("Press enter to continue")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement