Advertisement
Guest User

crawling using queue

a guest
Jun 12th, 2013
157
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.52 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3.  
  4. import urllib, urllib2, re
  5. import threading, Queue
  6.  
  7. somewebsite = 'xxx'
  8.  
  9. urls5 = []
  10. for i in range(50001, 60000):
  11.     url = somewebsite + str(i)
  12.     urls5.append(url)
  13.  
  14. class MultiUrl(threading.Thread):  
  15.     def __init__(self, queue):  
  16.         threading.Thread.__init__(self)  
  17.         self.queue = queue
  18.     def run(self):  
  19.         while True:  
  20.             url = self.queue.get()
  21.             try:  
  22.                 Go(url)
  23.             except:  
  24.                 continue  
  25.             self.queue.task_done()
  26.  
  27.  
  28. def main():
  29.     queue = Queue.Queue()
  30.     for i in range(16):  
  31.         t = MultiUrl(queue)  
  32.         t.setDaemon(True)  
  33.         t.start()
  34.      
  35.     for url in urls5:
  36.         queue.put(url)
  37.     queue.join()
  38.  
  39. def Go(url):
  40.     try:
  41.         print "Now grabbing...", url
  42.         getArticle(url)
  43.         if (OriginalContentList != []):
  44.             writeContent()
  45.     except:
  46.         pass
  47.  
  48. def getArticle(url):
  49.     global content, Content
  50.     i = re.findall('\d+', url)[0]
  51.     ArticleID = str(i)
  52.     try:
  53.         content = urllib2.urlopen(url).read()
  54.         Content = urllib.unquote(content)
  55.     except:
  56.         OriginalContentList = []
  57.         pass
  58.  
  59. def writeContent():
  60.     # Origin Content
  61.     try:
  62.         filename = ArticleID + '.txt'
  63.         file_out = open(filename, 'w')
  64.         file_out.write(Content)
  65.         file_out.write('\n')
  66.         file_out.close()
  67.     except:
  68.         pass
  69.  
  70. if __name__ == '__main__':
  71.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement