#!/usr/bin/env python
# encoding: utf-8
# Fetch a range of article pages and save each one to <article_id>.txt,
# using a pool of worker threads fed from a queue (Python 2).
import urllib, urllib2, re
import threading, Queue

somewebsite = 'xxx'  # placeholder URL prefix

# Build the list of article URLs to grab.
urls5 = []
for i in range(50001, 60000):
    url = somewebsite + str(i)
    urls5.append(url)

class MultiUrl(threading.Thread):
    """Worker thread: pull URLs off the queue and process them."""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            url = self.queue.get()
            try:
                Go(url)
            except Exception:
                pass
            finally:
                # Always mark the task done so queue.join() can return.
                self.queue.task_done()

def main():
    queue = Queue.Queue()
    # Start 16 daemon workers.
    for i in range(16):
        t = MultiUrl(queue)
        t.setDaemon(True)
        t.start()
    for url in urls5:
        queue.put(url)
    queue.join()

def Go(url):
    print "Now grabbing...", url
    article_id, content = getArticle(url)
    if content:
        writeContent(article_id, content)

def getArticle(url):
    """Return (article_id, decoded content); content is None on failure."""
    article_id = re.findall(r'\d+', url)[0]
    try:
        raw = urllib2.urlopen(url).read()
        content = urllib.unquote(raw)
    except Exception:
        content = None
    return article_id, content

def writeContent(article_id, content):
    """Write the article body to <article_id>.txt."""
    try:
        file_out = open(article_id + '.txt', 'w')
        file_out.write(content)
        file_out.write('\n')
        file_out.close()
    except IOError:
        pass

if __name__ == '__main__':
    main()
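
For reference, a minimal Python 3 sketch of the same fetch-and-save pattern, with the hand-rolled thread/queue pool replaced by concurrent.futures.ThreadPoolExecutor from the standard library. The 'xxx' URL prefix and ID range are placeholders carried over from the paste, and the helper name grab is hypothetical; this is a sketch of the pattern, not the original author's script.

#!/usr/bin/env python3
# Python 3 sketch: download each article URL and save it to <id>.txt.
import re
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor

somewebsite = 'xxx'  # placeholder URL prefix, as in the original paste

def grab(url):
    """Download one article and write it to <id>.txt; log failures."""
    try:
        article_id = re.findall(r'\d+', url)[0]
        raw = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
        content = urllib.parse.unquote(raw)
        with open(article_id + '.txt', 'w', encoding='utf-8') as fh:
            fh.write(content + '\n')
    except Exception as exc:
        print('failed:', url, exc)

if __name__ == '__main__':
    urls = [somewebsite + str(i) for i in range(50001, 60000)]
    # 16 worker threads, matching the original pool size.
    with ThreadPoolExecutor(max_workers=16) as pool:
        pool.map(grab, urls)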