Untitled

# -*- coding: utf-8 -*-
# IMPORTANT THINGS:
# Made for Python 2.7.
# REQUIRES BeautifulSoup4 (pip install beautifulsoup4).
# REQUIRES Js2py (pip install js2py).
# REQUIRES cloudflare-scrape (pip install cfscrape).

import requests, re, hashlib, json, os, cfscrape, sys, sched, time, bs4, collections

# Defines the scheduler (for looping).
scheduled = sched.scheduler(time.time, time.sleep)

# Main function (only in a function for the scheduler).
def nyanpasu():

    # Yes I'm doing it this way
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")

    # Checks if config.json exists or is equivalent to the default.
    # If the file exists and it's not equivalent to the default, it loads the config and continues the script.
    # Else it creates/overrides the file with the default and closes.
    _config_default = "{\n\t\"username\": \"\",\n\t\"password\": \"\",\n\t\"forum_thread\": \"\",\n\t\"output_json\": \"output.json\",\n\t\"output_markup\": \"markup.txt\",\n\t\"full_prefix\": \"\",\n\t\"full_suffix\": \"\",\n\t\"repeat_time_s\": 1800,\n\t\"blacklist_posts\": [],\n\t\"blacklist_treasures\": []\n}"
    if os.path.isfile("config.json") and open("config.json", "r").read() != _config_default:
        config = open("config.json", "r")
        cfg = json.loads(config.read())
    else:
        _config = open("config.json", "w")
        _config.write(_config_default)
        _config.close()
        print("Created default config.json. Configure and then run again.")
        sys.exit()

    # Initialize some variables (Should make sense).
    time_start = time.time()
    forumThread = cfg["forum_thread"]
    outputJson = cfg["output_json"]
    outputRaw = cfg["output_markup"]
    Username = cfg["username"]
    Password = cfg["password"]
    BlacklistPosts = cfg["blacklist_posts"]
    BlacklistTreasures = cfg["blacklist_treasures"]
    urlPrefix = "http://www.elitepvpers.com/forum/elite-gold-trading/"
    urlSuffix = ".html"
    loginURL = "http://www.elitepvpers.com/forum/login.php?do=login"

    print("Timestamp: " + str(time_start) + "\nThread to watch: " + forumThread + "\n")

    # Creates the UTF-8 MD5 (a really really secure algorithm) hash of the password.
    pwmd5 = hashlib.md5(Password.encode("UTF-8")).hexdigest()

    # Prepares the POST values and headers for the login request.
    payload = {
        "vb_login_username": Username,
        "s": "",
        "securitytoken": "guest",
        "do": "login",
        "vb_login_md5password": pwmd5,
        "vb_login_md5password_utf": pwmd5
    }
    headers = {
        "Referer": "http://www.elitepvpers.com/forum/"
    }

    # cfscrape.create_scraper() behaves identical to requests.Session(),
    # except it bypasses any CloudFlare test pages.
    # Assigns cfscrape.create_scraper() to the variable s.
    with cfscrape.create_scraper() as s:
        # Logs in, which sets the session variables that are required to do the other things.
        print("Logging in...")
        p = s.post(loginURL, data=payload, headers=headers)
        print("Logged in! Starting...\n")

        # Fetches the first page of the post (main page).
        print("Getting pagecount.")
        r = s.get(urlPrefix + forumThread + urlSuffix)
        # Parses the HTML to BeautifulSoup.
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        # Finds all the pagination elements (Only 2 exist, if at all).
        pagecount = soup.find_all(class_="pagenav")
        # If the 2 elements (could be more, don't know, but the script is prepared) are found, process them.
        # Else, set it to one. This prevents the entire scraping part of the script from executing more than once.
        if len(pagecount) > 1:
            # Finds the "Page 1 of n"-element and parses it's innerHTML.
            pagerange = pages.find_all(class_="vbmenu_control")[0].decode_contents(formatter="html")
            # Regex to find the total number of pages
            prange = re.findall("Page 1 of ([0-9]+)", str(soup2))
            # Assigns the integer value to the variable pagecount.
            pagecount = int("".join(prange[0]))
        else:
            # Sets pagecount to 1 to prevent looping the script.
            pagecount = 1
        print("Pagecount interpreted as: " + str(pagecount) + "\nStarting scraping:")

        trlist = {}
        # Loops between 1 (starts at 1, makes it easier to handle) and pagecount + 1 (because loop stops at pagecount < 1).
        # Assigns the index to the variable baka.
        for baka in range(1, pagecount + 1):
            print("\tPage " + str(baka) + " of " + str(pagecount) + "\n")

            # Gets the page to scrape.
            if baka == 1:
                # Gets the thread without any page value, so the first page.
                r = s.get(urlPrefix + forumThread + urlSuffix)
            elif baka > 1:
                # If the index is larger than one, add the "-" and the index before the file extension,
                # in order to scrape the given page of the thread.
                r = s.get(urlPrefix + forumThread + "-" + str(baka) + urlSuffix)
            # Parses the received content into BeautifulSoup.
            soup = bs4.BeautifulSoup(r.text, "html.parser")
            # Finds all the <div> elements with attribute "itemprop" set to "text".
            # (Only posts/replies have that itemprop value).
            posts = soup.findAll("div", {"itemprop": "text"})

            # Opens and clears the outputJson file.
            output = open(outputJson, "w").close()

            # If it's the first iteration, it skips the first post,
            # because this will always be the thread itself (which we don't want to scrape).
            # Also finds the post id (which will be the first post on the first page).
            if baka == 1:
                # Regex to find the id of the main thread (which is the first post of the first page).
                __POSTID = re.findall("id=\"post_message_([0-9]+)\"", str(posts[0]))
                POSTID = __POSTID[0]
                print("\tPost ID: " + str(POSTID))
                posts = posts[1:]

            # Loops through all the posts (Except the first if it's the first page).
            for nya in posts:
                # (Messy) regex to find all URLs linking to other posts on the same forum.
                regex = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/forum\/elite-gold-trading\/([0-9]+-(.+)){1}\.html\"", str(nya))
                # Loops through every found URL.
                for poi in regex:
                    if poi[2] not in BlacklistPosts:
                        # Re-creates the full URL from the correct captured group.
                        foundURL = "http://www.elitepvpers.com/forum/elite-gold-trading/" + poi[2] + ".html"

                        # Loads the thread, parses it, and looks for the first (main) post.
                        print("\tFound " + foundURL + ", loading...")
                        sh = s.get(foundURL)
                        fs = bs4.BeautifulSoup(sh.text, "html.parser")
                        thread = fs.findAll("div", {"itemprop": "text"})[0]
                        print("\tFound main thread! (Hopefully)\n")

                        # Regex to find all URLs linking to a treasure.
                        # Captures their IDs (which is all we need anyway).
                        treasures = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/theblackmarket\/treasure\/([0-9]+)\"", str(thread))

                        # Loops through all found treasures.
                        for desu in treasures:
                            # Re-creates the full treasure URL.
                            treasureID = "".join(desu)[2:]
                            if treasureID not in BlacklistTreasures:
                                foundTreasure = "http://www.elitepvpers.com/theblackmarket/treasure/" + treasureID

                                # Loads/parses the treasure page.
                                print("\t\tTreasure " + treasureID + ", loading...")
                                ts = s.get(foundTreasure)
                                tsp = bs4.BeautifulSoup(ts.text, "html.parser")
                                __verify = tsp.findAll("td", {"class": "contentwhite"})
                                _verify = bs4.BeautifulSoup(str(__verify), "html.parser")
                                verify = str(_verify.findChild("h2").decode_contents(formatter="html"))
                                if verify != "Page not found":
                                    tt = tsp.findAll("table", {"cellpadding": "5"})
                                    tr = bs4.BeautifulSoup(str(tt), "html.parser").findAll("tr")
                                    print("\t\tLoaded, processing...")

                                    # Looks for the "Buyer" field first.
                                    # If this doesn't contain "n/a", it's sold, so we should skip it.
                                    vs02 = bs4.BeautifulSoup(str(tr[2]), "html.parser")
                                    if (str(vs02.findAll("td")[1].decode_contents(formatter="html")) != "n/a"):
                                        print("\t\t\tTreasure sold, skipping...\n")
                                    else:
                                        # If still unsold, it looks for the other values we need as well,
                                        # and puts them in the temporary dictionary.
                                        vs00 = bs4.BeautifulSoup(str(tr[0]), "html.parser")
                                        vs01 = bs4.BeautifulSoup(str(tr[1]), "html.parser")
                                        vs03 = bs4.BeautifulSoup(str(tr[3]), "html.parser")
                                        trlist[str(vs00.findAll("td")[1].decode_contents(formatter="html"))] = {
                                            "id": treasureID,
                                            "seller": str(vs01.findAll("td")[1].decode_contents(formatter="html")),
                                            "cost": str(vs03.findAll("td")[1].decode_contents(formatter="html"))
                                        }

                                        # Opens the outputJson file in append mode.
                                        #output = open(outputJson, "a")
                                        # Appends the treasure to the file in JSON form, pretty-printed.
                                        #output.write(json.dumps(treasure, sort_keys=True, indent=4, separators=(",", ": ")))
                                        # Makes sure that the JSON remains valid.
                                        #output.write(",\n")
                                        #output.close()
                                        print("\t\t\tTreasure saved!\n")
                                else:
                                    print("\t\t\tTreasure not found. Skipping.\n")
        output = open(outputJson, "a")
        output.write(json.dumps(trlist, sort_keys=True, indent=4, separators=(",", ": ")))
        output.close()
        print("Finished scraping!\n")

        # Re-opens the JSON and parses it into a list containing dictionaries.
        output = open(outputJson, "r")
        outputstr = output.read()
        output.close()
        outputl = json.loads(outputstr)

        ordered = collections.OrderedDict(sorted(outputl.items()))
        # Prepares the outputRaw file containing the markup for the table.
        markup = open(outputRaw, "w")
        markup.write("\n[CENTER][table=\"head\"] Title | Cost | Treasure | Seller\n\n")
        markup.close()

        # Opens the outputRaw file to append the individual table rows.
        markup = open(outputRaw, "a")

        # Loops through every list element, formatting it correctly, and writes it to the file.
        for neko, oppai in ordered.items():
            seller = oppai["seller"].replace("<a href=", "[URL=")
            seller = seller.replace("\">", "\"]")
            seller = seller.replace("</a>", "[/URL]")
            markup.write(neko + " | " + oppai["cost"] + " | [URL=\"http://www.elitepvpers.com/theblackmarket/treasure/" + oppai["id"] + "\"][IMG]http://www.elitepvpers.com/images/tbm/treasures.gif[/IMG][/URL] | " + seller + "\n\n")

        # Closes the outputRaw file, making it valid BBCode.
        markup.write("[/table][/CENTER]\n")
        markup.close()

        # Regex to get the security token needed to update and bump the post.
        print("Fetching security token...")
        st = s.get("http://www.elitepvpers.com/forum/")
        sectokenr = re.findall("var SECURITYTOKEN = \"(.+)\";", st.text)
        sectoken = sectokenr[0]
        print("\tSecurity token: " + sectoken + "\n")

        # Opens the outputRaw file to read it's contents.
        markup = open(outputRaw, "r")

        # Constructs the full post from the generated BBCode and the configured prefix and suffix.
        full_message = cfg["full_prefix"] + markup.read() + cfg["full_suffix"]

        # Prepares the POST payload for the edit.
        editpayload = {
            "securitytoken": sectoken,
            "do": "updatepost",
            "ajax": "1",
            "postid": POSTID,
            "wysiwyg": "0",
            "message": full_message,
            "reason": "",
            "postcount": "1"
        }
        # Constructs the full URL to edit the post.
        editURL = "http://www.elitepvpers.com/forum/editpost.php?do=updatepost&p=" + POSTID
        print("Updating...")
        edit = s.post(editURL, data=editpayload)
        print("\tUpdated!\n")

        # Gets the thread id (post, thread, who cares).
        thread_id_r = re.findall("([0-9]+){1}", forumThread)
        # Prepares the POST payload to bump.
        bumppayload = {
            "thread_id": thread_id_r[0],
            "coin_usage_count": "0",
            "securitytoken": sectoken
        }
        # Sends POST request to bump the thread.
        print("Bumping thread...")
        bump = s.post("https://www.elitepvpers.com/forum/bump.php?do=do_bump", data=bumppayload)
        print("\tBumped!\n")
    # Print total time for the script.
    time_finish = time.time()
    print("Total time: " + str(time_finish - time_start) + "s")

    # Schedule next execution.
    #ev = scheduled.enter(cfg["repeat_time_s"], 1, nyanpasu, ())
# Schedule first execution.
scheduled.enter(0, 1, nyanpasu, ())
scheduled.run()