Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- import re
- import random
# Fetch a random sample of IMDb title pages and record, for each page, the
# distinct genre/name/keyword/country/language/company links found in its HTML.
# Each output line has the form "<url>:<link> <link> ...".

START_ID = 1000000    # lowest numeric title id that may be sampled (inclusive)
END_ID = 1345836      # upper bound of the sampled title ids (exclusive)
SAMPLE_SIZE = 1000    # number of distinct title pages to request
OUTPUT_PATH = 'data2.csv'
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given

# One capture group replaces the original per-category alternation plus manual
# slicing: the group holds the link path without the leading slash or quotes.
LINK_RE = re.compile(
    r'href="/((?:genre|name|keyword|country|language|company)/[A-Za-z0-9/]+)"'
)


def pick_unique_ids(low, high, count):
    """Return `count` distinct random integers drawn from [low, high).

    Raises ValueError when `count` exceeds the number of available ids,
    instead of looping forever the way retry-until-unused sampling would.
    """
    return random.sample(range(low, high), count)


def extract_links(html):
    """Return the category links found in `html`, de-duplicated, in page order.

    A link is reported as e.g. 'genre/Drama' (no leading slash). Only hrefs
    made up solely of letters, digits and slashes are matched, so an href
    carrying a query string is ignored.
    """
    seen = set()
    links = []
    for link in LINK_RE.findall(html):
        # The set gives O(1) duplicate checks; the list keeps first-seen order.
        if link not in seen:
            seen.add(link)
            links.append(link)
    return links


def main():
    """Scrape SAMPLE_SIZE random title pages and write their links to OUTPUT_PATH.

    Prints "<url> <status code>" for every page that was fetched. A page whose
    request fails is reported on stdout and skipped so that one network error
    does not abort the whole run.
    """
    # The with-block guarantees the file is flushed and closed even on error.
    with open(OUTPUT_PATH, 'w') as out:
        for title_id in pick_unique_ids(START_ID, END_ID, SAMPLE_SIZE):
            url = 'http://www.imdb.com/title/tt' + str(title_id)
            try:
                resp = requests.get(url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print('%s failed: %s' % (url, exc))
                continue
            # %-formatting prints the same text under Python 2 and Python 3.
            print('%s %s' % (url, resp.status_code))
            out.write(url + ':' + ' '.join(extract_links(resp.text)) + '\n')


if __name__ == '__main__':
    main()
Add Comment
Please, Sign In to add comment