Advertisement
Guest User

f4spider_v3.py

a guest
Dec 2nd, 2011
125
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.42 KB | None | 0 0
  1. from urllib.request import Request, urlopen
  2. from urllib.error import URLError
  3.  
  4. from re import compile, findall
  5.  
  6. IMAGES_PER_PAGE = 25
  7. BYTES_PER_KB = 1024
  8. db = []
  9. links = []
  10.  
  11. def pages_to_identifiers():
  12.     t = b'<blockquote id="asseti\w{40}"'
  13.     pattern = compile(t)
  14.     last_page = last_page_number()
  15.  
  16.     for i in range(last_page + 1):
  17.         offset = int(i) * IMAGES_PER_PAGE - IMAGES_PER_PAGE
  18.         u = req('http://ffffound.com/?offset=' + str(offset) + '&')
  19.         s = u.read()
  20.         u.close()
  21.  
  22.         match = findall(pattern, s)
  23.  
  24.         for j in match:
  25.             j = j[22:-1].decode()
  26.             if j not in db:
  27.                 db.append(j)
  28.  
  29.         if i % 5 == 0:
  30.             print(i, "/", last_page, "pages -> identifiers")
  31.  
  32.     with open("db.txt", "w") as db_file:
  33.         db_file.write("\n".join(db))
  34.  
  35. def identifiers_to_links():
  36.     with open("db.txt", "r") as db_file:
  37.         db = db_file.read().split("\n")
  38.    
  39.     t = b'src="http://img.ffffound.com/static-data/assets/+\d/\w{40}_m.[a-z]{3}"'
  40.     pattern = compile(t)
  41.     times = 0
  42.    
  43.     for i in db:
  44.         u = req('http://ffffound.com/image/' + i)
  45.         s = u.read()
  46.         u.close()
  47.        
  48.         match = findall(pattern, s)
  49.         if len(links) % 10 == 0:
  50.             print(len(links), "/", len(db), "identifiers -> links")
  51.         if len(match) > 0 and len(match[0]) > 5:
  52.             URI = match[0][5:-1].decode()
  53.             links.append(URI)
  54.            
  55.     with open("links.txt", "w") as link_file:
  56.         link_file.write("\n".join(links.sort()))
  57.  
  58. def last_page_number():
  59.     # the idea is to request a special page using an offset too large
  60.     # this page will contain the offsets of the last existing pages
  61.     u = req('http://ffffound.com/?offset=' + str(10**9) + '&')
  62.     s = u.read()
  63.     u.close()
  64.  
  65.     t = b'./\?offset=\d+&"'
  66.     pattern = compile(t)
  67.     match = findall(pattern, s)
  68.     offset = int(match[-1][10:-2].decode()) # only the last one is important
  69.     page = (offset + IMAGES_PER_PAGE) // IMAGES_PER_PAGE
  70.     return page
  71.  
  72. def req(URI):
  73.     try:
  74.         u = None
  75.         while u == None:
  76.             u = urlopen(Request(URI))
  77.         return u
  78.  
  79.     except URLError as e:
  80.         if hasattr(e, 'reason'):
  81.             print(e.reason)
  82.         elif hasattr(e, 'code'):
  83.             print(e.code)
  84.  
# Two-phase scrape: first harvest asset identifiers from the listing pages
# (into db.txt), then resolve each identifier to a direct image URL
# (into links.txt).
if __name__ == "__main__":
    pages_to_identifiers()
    identifiers_to_links()
  88.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement