# -*- coding: utf-8 -*-
# IMPORTANT THINGS:
# Made for Python 2.7.
# REQUIRES BeautifulSoup4 (pip install beautifulsoup4).
# REQUIRES Js2py (pip install js2py).
# REQUIRES cloudflare-scrape (pip install cfscrape).
import requests, re, hashlib, json, os, cfscrape, sys, sched, time, bs4, collections

# Defines the scheduler (for looping).
scheduled = sched.scheduler(time.time, time.sleep)
- # Main function (only in a function for the scheduler).
- def nyanpasu():
- # Yes I'm doing it this way
- print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
- # Checks if config.json exists or is equivalent to the default.
- # If the file exists and it's not equivalent to the default, it loads the config and continues the script.
- # Else it creates/overrides the file with the default and closes.
- _config_default = "{\n\t\"username\": \"\",\n\t\"password\": \"\",\n\t\"forum_thread\": \"\",\n\t\"output_json\": \"output.json\",\n\t\"output_markup\": \"markup.txt\",\n\t\"full_prefix\": \"\",\n\t\"full_suffix\": \"\",\n\t\"repeat_time_s\": 1800,\n\t\"blacklist_posts\": [],\n\t\"blacklist_treasures\": []\n}"
- if os.path.isfile("config.json") and open("config.json", "r").read() != _config_default:
- config = open("config.json", "r")
- cfg = json.loads(config.read())
- else:
- _config = open("config.json", "w")
- _config.write(_config_default)
- _config.close()
- print("Created default config.json. Configure and then run again.")
- sys.exit()
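    # For reference, a filled-in config.json might look roughly like this.
    # Only the keys come from _config_default above; the values are made-up placeholders:
    # {
    #     "username": "SomeUser",
    #     "password": "correct horse battery staple",
    #     "forum_thread": "1234567-example-thread-title",
    #     "output_json": "output.json",
    #     "output_markup": "markup.txt",
    #     "full_prefix": "[B]Currently available treasures:[/B]\n",
    #     "full_suffix": "\nThis list is updated automatically.",
    #     "repeat_time_s": 1800,
    #     "blacklist_posts": [],
    #     "blacklist_treasures": []
    # }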
    # Initialize some variables (should make sense).
    time_start = time.time()
    forumThread = cfg["forum_thread"]
    outputJson = cfg["output_json"]
    outputRaw = cfg["output_markup"]
    Username = cfg["username"]
    Password = cfg["password"]
    BlacklistPosts = cfg["blacklist_posts"]
    BlacklistTreasures = cfg["blacklist_treasures"]
    urlPrefix = "http://www.elitepvpers.com/forum/elite-gold-trading/"
    urlSuffix = ".html"
    loginURL = "http://www.elitepvpers.com/forum/login.php?do=login"
    print("Timestamp: " + str(time_start) + "\nThread to watch: " + forumThread + "\n")
    # Creates the UTF-8 MD5 (a really really secure algorithm) hash of the password.
    pwmd5 = hashlib.md5(Password.encode("UTF-8")).hexdigest()
    # Prepares the POST values and headers for the login request.
    payload = {
        "vb_login_username": Username,
        "s": "",
        "securitytoken": "guest",
        "do": "login",
        "vb_login_md5password": pwmd5,
        "vb_login_md5password_utf": pwmd5
    }
    headers = {
        "Referer": "http://www.elitepvpers.com/forum/"
    }
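    # Note: the payload mirrors the fields vBulletin's login form submits. The password is sent
    # only as an MD5 hash (as the forum's own login JavaScript would do), and securitytoken is
    # "guest" because no session exists yet. This is a description of the values above, not an
    # exhaustive account of vBulletin's login protocol.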
    # cfscrape.create_scraper() behaves identically to requests.Session(),
    # except it bypasses any CloudFlare test pages.
    # Assigns cfscrape.create_scraper() to the variable s.
    with cfscrape.create_scraper() as s:
        # Logs in, which sets the session cookies required for everything else.
        print("Logging in...")
        p = s.post(loginURL, data=payload, headers=headers)
        print("Logged in! Starting...\n")
        # Fetches the first page of the thread (main page).
        print("Getting pagecount.")
        r = s.get(urlPrefix + forumThread + urlSuffix)
        # Parses the HTML with BeautifulSoup.
        soup = bs4.BeautifulSoup(r.text, "html.parser")
        # Finds all the pagination elements (only 2 exist, if any at all).
        pagecount = soup.find_all(class_="pagenav")
        # If the pagination elements are found, parse the real page count out of them.
        # Otherwise set it to 1, so the scraping loop below runs exactly once.
        if len(pagecount) > 1:
            # Finds the "Page 1 of n" element and parses its innerHTML.
            pagerange = pagecount[0].find_all(class_="vbmenu_control")[0].decode_contents(formatter="html")
            # Regex to find the total number of pages.
            prange = re.findall("Page 1 of ([0-9]+)", str(pagerange))
            # Assigns the integer value to the variable pagecount.
            pagecount = int(prange[0])
        else:
            # Sets pagecount to 1 to prevent looping the script.
            pagecount = 1
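        # For context, the pagination control parsed above is expected to contain something
        # roughly like the following (an assumption inferred from the class names and regex,
        # not a verbatim copy of the forum's HTML):
        #   <td class="vbmenu_control">Page 1 of 7</td>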
- print("Pagecount interpreted as: " + str(pagecount) + "\nStarting scraping:")
- trlist = {}
- # Loops between 1 (starts at 1, makes it easier to handle) and pagecount + 1 (because loop stops at pagecount < 1).
- # Assigns the index to the variable baka.
- for baka in range(1, pagecount + 1):
- print("\tPage " + str(baka) + " of " + str(pagecount) + "\n")
- # Gets the page to scrape.
- if baka == 1:
- # Gets the thread without any page value, so the first page.
- r = s.get(urlPrefix + forumThread + urlSuffix)
- elif baka > 1:
- # If the index is larger than one, add the "-" and the index before the file extension,
- # in order to scrape the given page of the thread.
- r = s.get(urlPrefix + forumThread + "-" + str(baka) + urlSuffix)
- # Parses the received content into BeautifulSoup.
- soup = bs4.BeautifulSoup(r.text, "html.parser")
- # Finds all the <div> elements with attribute "itemprop" set to "text".
- # (Only posts/replies have that itemprop value).
- posts = soup.findAll("div", {"itemprop": "text"})
- # Opens and clears the outputJson file.
- output = open(outputJson, "w").close()
- # If it's the first iteration, it skips the first post,
- # because this will always be the thread itself (which we don't want to scrape).
- # Also finds the post id (which will be the first post on the first page).
- if baka == 1:
- # Regex to find the id of the main thread (which is the first post of the first page).
- __POSTID = re.findall("id=\"post_message_([0-9]+)\"", str(posts[0]))
- POSTID = __POSTID[0]
- print("\tPost ID: " + str(POSTID))
- posts = posts[1:]
            # Loops through all the posts (except the first if it's the first page).
            for nya in posts:
                # (Messy) regex to find all URLs linking to other posts on the same forum.
                regex = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/forum\/elite-gold-trading\/([0-9]+-(.+)){1}\.html\"", str(nya))
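                # Capture groups, in order: the URL prefix ("", "//", "http://" or "https://"),
                # the optional "s", the thread slug ("<id>-<title>"), and the title part alone.
                # poi[2] below is therefore the slug, e.g. "1234567-some-title" (made-up example).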
                # Loops through every found URL.
                for poi in regex:
                    if poi[2] not in BlacklistPosts:
                        # Re-creates the full URL from the captured thread slug.
                        foundURL = "http://www.elitepvpers.com/forum/elite-gold-trading/" + poi[2] + ".html"
                        # Loads the thread, parses it, and looks for the first (main) post.
                        print("\tFound " + foundURL + ", loading...")
                        sh = s.get(foundURL)
                        fs = bs4.BeautifulSoup(sh.text, "html.parser")
                        thread = fs.findAll("div", {"itemprop": "text"})[0]
                        print("\tFound main thread! (Hopefully)\n")
                        # Regex to find all URLs linking to a treasure.
                        # Captures their IDs (which is all we need anyway).
                        treasures = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/theblackmarket\/treasure\/([0-9]+)\"", str(thread))
                        # Loops through all found treasures.
                        for desu in treasures:
                            # The treasure ID is the third capture group.
                            treasureID = desu[2]
                            if treasureID not in BlacklistTreasures:
                                # Re-creates the full treasure URL.
                                foundTreasure = "http://www.elitepvpers.com/theblackmarket/treasure/" + treasureID
                                # Loads/parses the treasure page.
                                print("\t\tTreasure " + treasureID + ", loading...")
                                ts = s.get(foundTreasure)
                                tsp = bs4.BeautifulSoup(ts.text, "html.parser")
                                __verify = tsp.findAll("td", {"class": "contentwhite"})
                                _verify = bs4.BeautifulSoup(str(__verify), "html.parser")
                                _verify_h2 = _verify.findChild("h2")
                                verify = str(_verify_h2.decode_contents(formatter="html")) if _verify_h2 else ""
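                                # The check below relies on the treasure page rendering an
                                # <h2>Page not found</h2> inside td.contentwhite when the treasure
                                # id does not exist (an assumption inferred from this code, not
                                # verified against the live site).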
                                if verify != "Page not found":
                                    tt = tsp.findAll("table", {"cellpadding": "5"})
                                    tr = bs4.BeautifulSoup(str(tt), "html.parser").findAll("tr")
                                    print("\t\tLoaded, processing...")
                                    # Looks for the "Buyer" field first.
                                    # If this doesn't contain "n/a", it's sold, so we should skip it.
                                    vs02 = bs4.BeautifulSoup(str(tr[2]), "html.parser")
                                    if str(vs02.findAll("td")[1].decode_contents(formatter="html")) != "n/a":
                                        print("\t\t\tTreasure sold, skipping...\n")
                                    else:
                                        # If still unsold, it looks for the other values we need as well,
                                        # and puts them in the temporary dictionary.
                                        vs00 = bs4.BeautifulSoup(str(tr[0]), "html.parser")
                                        vs01 = bs4.BeautifulSoup(str(tr[1]), "html.parser")
                                        vs03 = bs4.BeautifulSoup(str(tr[3]), "html.parser")
                                        trlist[str(vs00.findAll("td")[1].decode_contents(formatter="html"))] = {
                                            "id": treasureID,
                                            "seller": str(vs01.findAll("td")[1].decode_contents(formatter="html")),
                                            "cost": str(vs03.findAll("td")[1].decode_contents(formatter="html"))
                                        }
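                                        # trlist is keyed by the treasure's title (row 0 of the table);
                                        # rows 1-3 hold seller, buyer and cost. An entry ends up looking
                                        # roughly like this (hypothetical values):
                                        #   "Some treasure title": {
                                        #       "id": "123456",
                                        #       "seller": "<a href=\"...\">SellerName</a>",
                                        #       "cost": "15 elite*gold"
                                        #   }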
                                        # Opens the outputJson file in append mode.
                                        #output = open(outputJson, "a")
                                        # Appends the treasure to the file in JSON form, pretty-printed.
                                        #output.write(json.dumps(treasure, sort_keys=True, indent=4, separators=(",", ": ")))
                                        # Makes sure that the JSON remains valid.
                                        #output.write(",\n")
                                        #output.close()
                                        print("\t\t\tTreasure saved!\n")
                                else:
                                    print("\t\t\tTreasure not found. Skipping.\n")
        # After all pages are scraped, writes the collected treasures to the outputJson file.
        output = open(outputJson, "a")
        output.write(json.dumps(trlist, sort_keys=True, indent=4, separators=(",", ": ")))
        output.close()
- print("Finished scraping!\n")
- # Re-opens the JSON and parses it into a list containing dictionaries.
- output = open(outputJson, "r")
- outputstr = output.read()
- output.close()
- outputl = json.loads(outputstr)
- ordered = collections.OrderedDict(sorted(outputl.items()))
- # Prepares the outputRaw file containing the markup for the table.
- markup = open(outputRaw, "w")
- markup.write("\n[CENTER][table=\"head\"] Title | Cost | Treasure | Seller\n\n")
- markup.close()
- # Opens the outputRaw file to append the individual table rows.
- markup = open(outputRaw, "a")
- # Loops through every list element, formatting it correctly, and writes it to the file.
- for neko, oppai in ordered.items():
- seller = oppai["seller"].replace("<a href=", "[URL=")
- seller = seller.replace("\">", "\"]")
- seller = seller.replace("</a>", "[/URL]")
- markup.write(neko + " | " + oppai["cost"] + " | [URL=\"http://www.elitepvpers.com/theblackmarket/treasure/" + oppai["id"] + "\"][IMG]http://www.elitepvpers.com/images/tbm/treasures.gif[/IMG][/URL] | " + seller + "\n\n")
- # Closes the outputRaw file, making it valid BBCode.
- markup.write("[/table][/CENTER]\n")
- markup.close()
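        # For reference, each generated row is one line of a BBCode [table], roughly
        # (hypothetical values):
        #   Some treasure title | 15 elite*gold | [URL="http://www.elitepvpers.com/theblackmarket/treasure/123456"][IMG]...[/IMG][/URL] | [URL="..."]SellerName[/URL]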
        # Regex to get the security token needed to update and bump the post.
        print("Fetching security token...")
        st = s.get("http://www.elitepvpers.com/forum/")
        sectokenr = re.findall("var SECURITYTOKEN = \"(.+)\";", st.text)
        sectoken = sectokenr[0]
        print("\tSecurity token: " + sectoken + "\n")
        # Opens the outputRaw file to read its contents.
        markup = open(outputRaw, "r")
        # Constructs the full post from the generated BBCode and the configured prefix and suffix.
        full_message = cfg["full_prefix"] + markup.read() + cfg["full_suffix"]
        markup.close()
        # Prepares the POST payload for the edit.
        editpayload = {
            "securitytoken": sectoken,
            "do": "updatepost",
            "ajax": "1",
            "postid": POSTID,
            "wysiwyg": "0",
            "message": full_message,
            "reason": "",
            "postcount": "1"
        }
        # Constructs the full URL to edit the post.
        editURL = "http://www.elitepvpers.com/forum/editpost.php?do=updatepost&p=" + POSTID
        print("Updating...")
        edit = s.post(editURL, data=editpayload)
        print("\tUpdated!\n")
        # Gets the thread id (post, thread, who cares).
        thread_id_r = re.findall("([0-9]+){1}", forumThread)
        # Prepares the POST payload to bump.
        bumppayload = {
            "thread_id": thread_id_r[0],
            "coin_usage_count": "0",
            "securitytoken": sectoken
        }
        # Sends POST request to bump the thread.
        print("Bumping thread...")
        bump = s.post("https://www.elitepvpers.com/forum/bump.php?do=do_bump", data=bumppayload)
        print("\tBumped!\n")
    # Print total time for the script.
    time_finish = time.time()
    print("Total time: " + str(time_finish - time_start) + "s")
    # Schedule next execution.
    #ev = scheduled.enter(cfg["repeat_time_s"], 1, nyanpasu, ())

# Schedule first execution.
scheduled.enter(0, 1, nyanpasu, ())
scheduled.run()
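# Note: as written the script runs once and exits. To make it repeat every
# cfg["repeat_time_s"] seconds, re-enable the commented-out scheduled.enter(...) call
# at the end of nyanpasu(), so that each run schedules the next one.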