# -*- coding: utf-8 -*-
# IMPORTANT THINGS:
# Made for Python 2.7.
# REQUIRES BeautifulSoup4 (pip install beautifulsoup4).
# REQUIRES Js2py (pip install js2py).
# REQUIRES cloudflare-scrape (pip install cfscrape).

import requests, re, hashlib, json, os, cfscrape, sys, sched, time, bs4

# Defines the scheduler (for looping).
scheduled = sched.scheduler(time.time, time.sleep)

# Main function (only in a function for the scheduler).
def nyanpasu():

    # Yes I'm doing it this way
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")

    # Checks if config.json exists and whether it differs from the default.
    # If the file exists and is not equivalent to the default, it loads the config and continues the script.
    # Otherwise it creates/overwrites the file with the default and exits.
    _config_default = "{\n\t\"username\": \"\",\n\t\"password\": \"\",\n\t\"forum_thread\": \"\",\n\t\"output_json\": \"output.json\",\n\t\"output_markup\": \"markup.txt\",\n\t\"full_prefix\": \"\",\n\t\"full_suffix\": \"\",\n\t\"repeat_time_s\": 1800\n}"
    if os.path.isfile("config.json") and open("config.json", "r").read() != _config_default:
        config = open("config.json", "r")
        cfg = json.loads(config.read())
    else:
        _config = open("config.json", "w")
        _config.write(_config_default)
        _config.close()
        print("Created default config.json. Configure and then run again.")
        sys.exit()
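
    # For reference, the default config.json written above looks like this
    # (all values are placeholders to fill in; "forum_thread" is the thread's
    # URL slug, for example "1234567-my-shop", which is a made-up slug):
    #
    # {
    #     "username": "",
    #     "password": "",
    #     "forum_thread": "",
    #     "output_json": "output.json",
    #     "output_markup": "markup.txt",
    #     "full_prefix": "",
    #     "full_suffix": "",
    #     "repeat_time_s": 1800
    # }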

    # Initialize some variables (Should make sense).
    time_start = time.time()
    forumThread = cfg["forum_thread"]
    outputJson = cfg["output_json"]
    outputRaw = cfg["output_markup"]
    Username = cfg["username"]
    Password = cfg["password"]
    urlPrefix = "http://www.elitepvpers.com/forum/elite-gold-trading/"
    urlSuffix = ".html"
    loginURL = "http://www.elitepvpers.com/forum/login.php?do=login"

    print("Timestamp: " + str(time_start) + "\nThread to watch: " + forumThread + "\n")

    # Creates the UTF-8 MD5 (a really really secure algorithm) hash of the password.
    pwmd5 = hashlib.md5(Password.encode("UTF-8")).hexdigest()
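
    # Illustrative example (the sample value below is not from the original script):
    # vBulletin expects the client-side MD5 hex digest of the password, so a password
    # of "password" would be sent as its well-known digest:
    #     hashlib.md5("password".encode("UTF-8")).hexdigest()
    #     # -> "5f4dcc3b5aa765d61d8327deb882cf99"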

    # Prepares the POST values and headers for the login request.
    payload = {
        "vb_login_username": Username,
        "s": "",
        "securitytoken": "guest",
        "do": "login",
        "vb_login_md5password": pwmd5,
        "vb_login_md5password_utf": pwmd5
    }
    headers = {
        "Referer": "http://www.elitepvpers.com/forum/"
    }

    # cfscrape.create_scraper() behaves identically to requests.Session(),
    # except it bypasses any CloudFlare test pages.
    # Assigns cfscrape.create_scraper() to the variable s.
    with cfscrape.create_scraper() as s:
        # Logs in, which sets the session cookies that are required to do everything else.
        print("Logging in...")
        p = s.post(loginURL, data=payload, headers=headers)
        print("Logged in! Starting...\n")

        # Fetches the first page of the thread (main page).
        print("Getting pagecount.")
        r = s.get(urlPrefix + forumThread + urlSuffix)
        # Parses the HTML with BeautifulSoup.
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        # Finds all the pagination elements (only 2 exist, if any at all).
        pagecount = soup.find_all(class_="pagenav")
        # If the pagination elements are found, process them.
        # Else, set it to one. This prevents the entire scraping part of the script from executing more than once.
        if len(pagecount) > 1:
            # Finds the "Page 1 of n" element and parses its innerHTML.
            pagerange = pagecount[0].find_all(class_="vbmenu_control")[0].decode_contents(formatter="html")
            # Regex to find the total number of pages.
            prange = re.findall("Page 1 of ([0-9]+)", str(pagerange))
            # Assigns the integer value to the variable pagecount.
            pagecount = int(prange[0])
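            # Illustrative example (hypothetical value, not from the original script):
            # a pager reading "Page 1 of 7" gives prange == ["7"], so pagecount becomes 7.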
        else:
            # Sets pagecount to 1 to prevent looping the script.
            pagecount = 1
        print("Pagecount interpreted as: " + str(pagecount) + "\nStarting scraping:")

        # Loops from 1 (starts at 1, makes it easier to handle) up to and including pagecount
        # (range() needs pagecount + 1 because it excludes its end value).
        # Assigns the index to the variable baka.
        for baka in range(1, pagecount + 1):
            print("\tPage " + str(baka) + " of " + str(pagecount) + "\n")

            # Gets the page to scrape.
            if baka == 1:
                # Gets the thread without any page value, so the first page.
                r = s.get(urlPrefix + forumThread + urlSuffix)
            elif baka > 1:
                # If the index is larger than one, add the "-" and the index before the file extension,
                # in order to scrape the given page of the thread.
                r = s.get(urlPrefix + forumThread + "-" + str(baka) + urlSuffix)
            # Parses the received content into BeautifulSoup.
            soup = bs4.BeautifulSoup(r.text, "html.parser")
            # Finds all the <div> elements with attribute "itemprop" set to "text".
            # (Only posts/replies have that itemprop value).
            posts = soup.findAll("div", {"itemprop": "text"})

            # Opens and clears the outputJson file.
            output = open(outputJson, "w")
            # Prepares it for JSON data.
            output.write("[\n")
            output.close()

            # If it's the first iteration, it skips the first post,
            # because this will always be the thread itself (which we don't want to scrape).
            # Also finds the post id (which will be the first post on the first page).
            if baka == 1:
                # Regex to find the id of the main thread (which is the first post of the first page).
                __POSTID = re.findall("id=\"post_message_([0-9]+)\"", str(posts[0]))
                POSTID = __POSTID[0]
                print("\tPost ID: " + str(POSTID))
                posts = posts[1:]

            # Loops through all the posts (except the first if it's the first page).
            for nya in posts:
                # (Messy) regex to find all URLs linking to other posts on the same forum.
                regex = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/forum\/elite-gold-trading\/([0-9]+-(.+)){1}\.html\"", str(nya))
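                # Illustrative example (hypothetical slug, not from the original script): for a link like
                # href="//www.elitepvpers.com/forum/elite-gold-trading/1234567-some-shop.html"
                # the third captured group, poi[2] below, is "1234567-some-shop".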
                # Loops through every found URL.
                for poi in regex:
                    # Re-creates the full URL from the correct captured group.
                    foundURL = "http://www.elitepvpers.com/forum/elite-gold-trading/" + poi[2] + ".html"

                    # Loads the thread, parses it, and looks for the first (main) post.
                    print("\tFound " + foundURL + ", loading...")
                    sh = s.get(foundURL)
                    fs = bs4.BeautifulSoup(sh.text, "html.parser")
                    thread = fs.findAll("div", {"itemprop": "text"})[0]
                    print("\tFound main thread! (Hopefully)\n")

                    # Regex to find all URLs linking to a treasure.
                    # Captures their IDs (which is all we need anyway).
                    treasures = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/theblackmarket\/treasure\/([0-9]+)\"", str(thread))
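                    # Illustrative example (hypothetical ID, not from the original script): for a link like
                    # href="//www.elitepvpers.com/theblackmarket/treasure/987654"
                    # the captured groups are ("//", "", "987654"), so the last group is the treasure ID.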

                    # Loops through all found treasures.
                    for desu in treasures:
                        # Takes the treasure ID from the last captured group and re-creates the full treasure URL.
                        treasureID = desu[2]
                        foundTreasure = "http://www.elitepvpers.com/theblackmarket/treasure/" + treasureID

                        # Loads/parses the treasure page.
                        print("\t\tTreasure " + treasureID + ", loading...")
                        ts = s.get(foundTreasure)
                        tsp = bs4.BeautifulSoup(ts.text, "html.parser")
                        tt = tsp.findAll("table", {"cellpadding": "5"})
                        tr = bs4.BeautifulSoup(str(tt), "html.parser").findAll("tr")
                        print("\t\tLoaded, processing...")

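                        # Row layout assumed by the lookups below (inferred from how tr[0]..tr[3] are used):
                        # tr[0] = Title, tr[1] = Seller, tr[2] = Buyer, tr[3] = Cost.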
                        # Looks for the "Buyer" field first.
                        # If this doesn't contain "n/a", it's sold, so we should skip it.
                        vs02 = bs4.BeautifulSoup(str(tr[2]), "html.parser")
                        if str(vs02.findAll("td")[1].decode_contents(formatter="html")) != "n/a":
                            print("\t\t\tTreasure sold, skipping...\n")
                        else:
                            # If still unsold, it looks for the other values we need as well,
                            # and puts them in the temporary dictionary.
                            vs00 = bs4.BeautifulSoup(str(tr[0]), "html.parser")
                            vs01 = bs4.BeautifulSoup(str(tr[1]), "html.parser")
                            vs03 = bs4.BeautifulSoup(str(tr[3]), "html.parser")
                            treasure = {
                                "id": treasureID,
                                "title": str(vs00.findAll("td")[1].decode_contents(formatter="html")),
                                "seller": str(vs01.findAll("td")[1].decode_contents(formatter="html")),
                                "cost": str(vs03.findAll("td")[1].decode_contents(formatter="html"))
                            }
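                            # Illustrative example of one such dictionary (all values hypothetical):
                            # {"id": "987654", "title": "Some item", "seller": "<a href=\"...\">SomeSeller</a>", "cost": "50"}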

                            # Opens the outputJson file in append mode.
                            output = open(outputJson, "a")
                            # Appends the treasure to the file in JSON form, pretty-printed.
                            output.write(json.dumps(treasure, sort_keys=True, indent=4, separators=(",", ": ")))
                            # Adds a "," separator after the entry (the trailing one is removed later).
                            output.write(",\n")
                            output.close()
                            print("\t\t\tTreasure saved!\n")
            print("Finished scraping!\n")
            # Removes the trailing "," separator (and its line ending) that the last
            # append left behind, so the array can be closed as valid JSON.
            output = open(outputJson, "rb+")
            data = output.read()
            if "," in data:
                # Cut the file right before the dangling separator.
                output.seek(data.rindex(","))
                output.truncate()
            output.close()

            # Closes the JSON, making it valid again.
            output = open(outputJson, "a")
            output.write("\n]")
            output.close()

            # Re-opens the JSON and parses it into a list containing dictionaries.
            output = open(outputJson, "r")
            outputstr = output.read()
            output.close()
            outputl = json.loads(outputstr)

            # Prepares the outputRaw file containing the markup for the table.
            markup = open(outputRaw, "w")
            markup.write("\n[CENTER][table=\"head\"] Title | Cost | Treasure | Seller\n\n")
            markup.close()

            # Opens the outputRaw file to append the individual table rows.
            markup = open(outputRaw, "a")

            # Loops through every list element, formatting it correctly, and writes it to the file.
            for oppai in outputl:
                seller = oppai["seller"].replace("<a href=", "[URL=")
                seller = seller.replace("\">", "\"]")
                seller = seller.replace("</a>", "[/URL]")
                markup.write(oppai["title"] + " | " + oppai["cost"] + " | [URL=\"http://www.elitepvpers.com/theblackmarket/treasure/" + oppai["id"] + "\"][IMG]http://www.elitepvpers.com/images/tbm/treasures.gif[/IMG][/URL] | " + seller + "\n\n")
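                # Illustrative example of one generated row (hypothetical values):
                # Some item | 50 | [URL="http://www.elitepvpers.com/theblackmarket/treasure/987654"][IMG]http://www.elitepvpers.com/images/tbm/treasures.gif[/IMG][/URL] | [URL="..."]SomeSeller[/URL]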

            # Closes the outputRaw file, making it valid BBCode.
            markup.write("[/table][/CENTER]\n")
            markup.close()

            # Regex to get the security token needed to update and bump the post.
            print("Fetching security token...")
            st = s.get("http://www.elitepvpers.com/forum/")
            sectokenr = re.findall("var SECURITYTOKEN = \"(.+)\";", st.text)
            sectoken = sectokenr[0]
            print("\tSecurity token: " + sectoken + "\n")
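            # Illustrative example (the token value is made up): the forum page source contains a line like
            #     var SECURITYTOKEN = "1490000000-0123456789abcdef";
            # and the capture group above becomes that quoted value.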

            # Opens the outputRaw file to read its contents.
            markup = open(outputRaw, "r")

            # Constructs the full post from the generated BBCode and the configured prefix and suffix.
            full_message = cfg["full_prefix"] + markup.read() + cfg["full_suffix"]
            markup.close()

            # Prepares the POST payload for the edit.
            editpayload = {
                "securitytoken": sectoken,
                "do": "updatepost",
                "ajax": "1",
                "postid": POSTID,
                "wysiwyg": "0",
                "message": full_message,
                "reason": "",
                "postcount": "1"
            }
            # Constructs the full URL to edit the post.
            editURL = "http://www.elitepvpers.com/forum/editpost.php?do=updatepost&p=" + POSTID
            print("Updating...")
            #edit = s.post(editURL, data=editpayload)
            print("\tUpdated!\n")

            # Gets the thread id (post, thread, who cares).
            thread_id_r = re.findall("([0-9]+){1}", forumThread)
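            # Illustrative example (hypothetical slug): for forumThread == "1234567-my-shop",
            # thread_id_r[0] is "1234567".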
            # Prepares the POST payload to bump.
            bumppayload = {
                "thread_id": thread_id_r[0],
                "coin_usage_count": "0",
                "securitytoken": sectoken
            }
            # Sends POST request to bump the thread.
            print("Bumping thread...")
            #bump = s.post("https://www.elitepvpers.com/forum/bump.php?do=do_bump", data=bumppayload)
            print("\tBumped!\n")
    # Print total time for the script.
    time_finish = time.time()
    print("Total time: " + str(time_finish - time_start) + "s")

    # Schedule next execution.
    ev = scheduled.enter(cfg["repeat_time_s"], 1, nyanpasu, ())
# Schedule first execution.
scheduled.enter(0, 1, nyanpasu, ())
scheduled.run()
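
# Usage sketch (an assumption, not part of the original paste): save this file as,
# for example, epvp_treasure_watch.py, install the dependencies listed at the top,
# and run it with:
#     python2 epvp_treasure_watch.py
# The first run writes a default config.json and exits; fill it in and run again.
# After that the script re-schedules itself every repeat_time_s seconds.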