Advertisement
Guest User

f4spider_v3.py

a guest
Dec 2nd, 2011
125
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.42 KB | None | 0 0
  1. from urllib.request import Request, urlopen
  2. from urllib.error import URLError
  3.  
  4. from re import compile, findall
  5.  
  6. IMAGES_PER_PAGE = 25
  7. BYTES_PER_KB = 1024
  8. db = []
  9. links = []
  10.  
  11. def pages_to_identifiers():
  12.     t = b'<blockquote id="asseti\w{40}"'
  13.     pattern = compile(t)
  14.     last_page = last_page_number()
  15.  
  16.     for i in range(last_page + 1):
  17.         offset = int(i) * IMAGES_PER_PAGE - IMAGES_PER_PAGE
  18.         u = req('http://ffffound.com/?offset=' + str(offset) + '&')
  19.         s = u.read()
  20.         u.close()
  21.  
  22.         match = findall(pattern, s)
  23.  
  24.         for j in match:
  25.             j = j[22:-1].decode()
  26.             if j not in db:
  27.                 db.append(j)
  28.  
  29.         if i % 5 == 0:
  30.             print(i, "/", last_page, "pages -> identifiers")
  31.  
  32.     with open("db.txt", "w") as db_file:
  33.         db_file.write("\n".join(db))
  34.  
  35. def identifiers_to_links():
  36.     with open("db.txt", "r") as db_file:
  37.         db = db_file.read().split("\n")
  38.    
  39.     t = b'src="http://img.ffffound.com/static-data/assets/+\d/\w{40}_m.[a-z]{3}"'
  40.     pattern = compile(t)
  41.     times = 0
  42.    
  43.     for i in db:
  44.         u = req('http://ffffound.com/image/' + i)
  45.         s = u.read()
  46.         u.close()
  47.        
  48.         match = findall(pattern, s)
  49.         if len(links) % 10 == 0:
  50.             print(len(links), "/", len(db), "identifiers -> links")
  51.         if len(match) > 0 and len(match[0]) > 5:
  52.             URI = match[0][5:-1].decode()
  53.             links.append(URI)
  54.            
  55.     with open("links.txt", "w") as link_file:
  56.         link_file.write("\n".join(links.sort()))
  57.  
  58. def last_page_number():
  59.     # the idea is to request a special page using an offset too large
  60.     # this page will contain the offsets of the last existing pages
  61.     u = req('http://ffffound.com/?offset=' + str(10**9) + '&')
  62.     s = u.read()
  63.     u.close()
  64.  
  65.     t = b'./\?offset=\d+&"'
  66.     pattern = compile(t)
  67.     match = findall(pattern, s)
  68.     offset = int(match[-1][10:-2].decode()) # only the last one is important
  69.     page = (offset + IMAGES_PER_PAGE) // IMAGES_PER_PAGE
  70.     return page
  71.  
  72. def req(URI):
  73.     try:
  74.         u = None
  75.         while u == None:
  76.             u = urlopen(Request(URI))
  77.         return u
  78.  
  79.     except URLError as e:
  80.         if hasattr(e, 'reason'):
  81.             print(e.reason)
  82.         elif hasattr(e, 'code'):
  83.             print(e.code)
  84.  
# Two-phase scrape: first harvest asset identifiers from the listing pages
# (into db.txt), then resolve each identifier to a direct image URL
# (into links.txt).
if __name__ == "__main__":
    pages_to_identifiers()
    identifiers_to_links()
  88.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement