Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from urllib.request import Request, urlopen
- from urllib.error import URLError
- from re import compile, findall
# Step between the offsets of two consecutive listing pages (one page of
# images); used to convert between page numbers and offsets.
IMAGES_PER_PAGE = 25

# Bytes in one kilobyte (not referenced by the code visible here).
BYTES_PER_KB = 1024

# Image identifiers gathered by pages_to_identifiers(), in first-seen order.
db = []

# Image URIs gathered by identifiers_to_links().
links = []
def pages_to_identifiers():
    """Collect the image identifiers of every listing page into db.txt.

    Walks the listing pages from 1 up to ``last_page_number()``, extracts the
    40-character identifier of each ``<blockquote id="asseti...">`` element,
    appends identifiers not seen before to the module-level ``db`` list
    (keeping first-seen order) and finally writes them to ``db.txt``, one per
    line.

    A page that cannot be fetched (``req`` returned ``None``) is reported on
    stdout and skipped instead of aborting the whole run.
    """
    # Raw bytes literal: the response body is bytes, and the raw prefix keeps
    # ``\w`` from being parsed as an (invalid) escape sequence. The capture
    # group yields the identifier directly, without index arithmetic.
    pattern = compile(rb'<blockquote id="asseti(\w{40})"')
    last_page = last_page_number()

    # Set mirror of db for O(1) duplicate checks; db itself keeps the order.
    seen = set(db)

    # Pages are 1-based (page 1 <-> offset 0); starting at 0 would request
    # a negative offset.
    for page in range(1, last_page + 1):
        offset = (page - 1) * IMAGES_PER_PAGE
        response = req('http://ffffound.com/?offset=' + str(offset) + '&')
        if response is None:
            print("page", page, "could not be fetched, skipping")
            continue
        try:
            body = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        for raw_identifier in findall(pattern, body):
            identifier = raw_identifier.decode()
            if identifier not in seen:
                seen.add(identifier)
                db.append(identifier)

        if page % 5 == 0:
            print(page, "/", last_page, "pages -> identifiers")

    with open("db.txt", "w") as db_file:
        db_file.write("\n".join(db))
def identifiers_to_links():
    """Resolve the identifiers stored in db.txt to image URIs in links.txt.

    Reads one identifier per line from ``db.txt``, fetches the matching
    ``http://ffffound.com/image/<identifier>`` page, takes the first
    ``src="..._m.xxx"`` image URI found on it and appends that URI to the
    module-level ``links`` list. When all identifiers have been processed,
    ``links`` is sorted in place and written to ``links.txt``, one per line.

    An identifier whose page cannot be fetched (``req`` returned ``None``) is
    reported on stdout and skipped.
    """
    with open("db.txt", "r") as db_file:
        # Drop blank lines so an empty file or a trailing newline does not
        # turn into a request for an empty identifier.
        identifiers = [line for line in db_file.read().splitlines() if line]

    # Raw bytes literal: the body is bytes and ``\d`` / ``\w`` must reach the
    # regex engine untouched.
    pattern = compile(
        rb'src="http://img.ffffound.com/static-data/assets/+\d/\w{40}_m.[a-z]{3}"'
    )

    for identifier in identifiers:
        response = req('http://ffffound.com/image/' + identifier)
        if response is None:
            print("image page", identifier, "could not be fetched, skipping")
            continue
        try:
            body = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        matches = findall(pattern, body)
        if len(links) % 10 == 0:
            print(len(links), "/", len(identifiers), "identifiers -> links")
        if matches:
            # Strip the leading 'src="' (5 bytes) and the closing quote.
            links.append(matches[0][5:-1].decode())

    # list.sort() sorts in place and returns None, so it must not be passed
    # to join() directly.
    links.sort()
    with open("links.txt", "w") as link_file:
        link_file.write("\n".join(links))
def last_page_number():
    """Return the 1-based number of the last listing page.

    Returns:
        The page number derived from the last pagination offset found on the
        probe page: ``(offset + IMAGES_PER_PAGE) // IMAGES_PER_PAGE``.

    Raises:
        ConnectionError: if the probe page cannot be fetched (``req``
            returned ``None``).
        IndexError: if the probe page contains no pagination offsets.
    """
    # the idea is to request a special page using an offset too large
    # this page will contain the offsets of the last existing pages
    response = req('http://ffffound.com/?offset=' + str(10**9) + '&')
    if response is None:
        raise ConnectionError("could not fetch the pagination probe page")
    try:
        body = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # Raw bytes literal so ``\?`` and ``\d`` reach the regex engine as-is;
    # the capture group returns just the digits of each offset.
    pattern = compile(rb'./\?offset=(\d+)&"')
    offsets = findall(pattern, body)
    if not offsets:
        raise IndexError("no pagination offsets found in the probe page")

    offset = int(offsets[-1].decode())  # only the last one is important
    return (offset + IMAGES_PER_PAGE) // IMAGES_PER_PAGE
def req(URI, retries=3, delay=1.0):
    """Open ``URI`` and return the response object, or ``None`` on failure.

    Args:
        URI: the address to open.
        retries: maximum number of attempts (values below 1 count as 1).
        delay: seconds to wait between two attempts.

    Returns:
        The object returned by ``urlopen`` on success. ``None`` when every
        attempt raised ``URLError``, or as soon as the server answers with a
        4xx status (retrying a client error cannot succeed). The error is
        printed to stdout in both cases.
    """
    import time  # local import: only needed for the pause between attempts

    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            return urlopen(Request(URI))
        except URLError as e:
            # HTTP errors carry both ``code`` and ``reason``, so ``code``
            # has to be checked first or that branch is never reached.
            code = getattr(e, 'code', None)
            if code is not None:
                print(code, e.reason)
                if 400 <= code < 500:
                    return None
            elif hasattr(e, 'reason'):
                print(e.reason)
        if attempt < attempts and delay > 0:
            time.sleep(delay)
    return None
if __name__ == "__main__":
    # Step 1: listing pages -> identifiers (written to db.txt).
    pages_to_identifiers()
    # Step 2: identifiers from db.txt -> image URIs (written to links.txt).
    identifiers_to_links()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement