# -*- coding: utf-8 -*-
# IMPORTANT THINGS:
# Made for Python 2.7.
# REQUIRES BeautifulSoup4 (pip install beautifulsoup4).
# REQUIRES Js2py (pip install js2py).
# REQUIRES cloudflare-scrape (pip install cfscrape).

import requests, re, hashlib, json, os, cfscrape, sys, sched, time, bs4

# Defines the scheduler (for looping).
scheduled = sched.scheduler(time.time, time.sleep)

# Main function (only in a function for the scheduler).
def nyanpasu():

    # Yes I'm doing it this way
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")

    # Checks if config.json exists and whether it differs from the default.
    # If the file exists and is not equivalent to the default, it loads the config and continues the script.
    # Otherwise it creates/overwrites the file with the default and exits.
    _config_default = "{\n\t\"username\": \"\",\n\t\"password\": \"\",\n\t\"forum_thread\": \"\",\n\t\"output_json\": \"output.json\",\n\t\"output_markup\": \"markup.txt\",\n\t\"full_prefix\": \"\",\n\t\"full_suffix\": \"\",\n\t\"repeat_time_s\": 1800\n}"
    if os.path.isfile("config.json") and open("config.json", "r").read() != _config_default:
        config = open("config.json", "r")
        cfg = json.loads(config.read())
    else:
        _config = open("config.json", "w")
        _config.write(_config_default)
        _config.close()
        print("Created default config.json. Configure and then run again.")
        sys.exit()
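
    # For reference, the default config.json written above looks like this
    # (all values are placeholders to fill in; "forum_thread" is the thread's
    # URL slug, for example "1234567-my-shop", which is a made-up slug):
    #
    # {
    #     "username": "",
    #     "password": "",
    #     "forum_thread": "",
    #     "output_json": "output.json",
    #     "output_markup": "markup.txt",
    #     "full_prefix": "",
    #     "full_suffix": "",
    #     "repeat_time_s": 1800
    # }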

    # Initialize some variables (Should make sense).
    time_start = time.time()
    forumThread = cfg["forum_thread"]
    outputJson = cfg["output_json"]
    outputRaw = cfg["output_markup"]
    Username = cfg["username"]
    Password = cfg["password"]
    urlPrefix = "http://www.elitepvpers.com/forum/elite-gold-trading/"
    urlSuffix = ".html"
    loginURL = "http://www.elitepvpers.com/forum/login.php?do=login"

    print("Timestamp: " + str(time_start) + "\nThread to watch: " + forumThread + "\n")

    # Creates the UTF-8 MD5 (a really really secure algorithm) hash of the password.
    pwmd5 = hashlib.md5(Password.encode("UTF-8")).hexdigest()
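
    # Illustrative example (the sample value below is not from the original script):
    # vBulletin expects the client-side MD5 hex digest of the password, so a password
    # of "password" would be sent as its well-known digest:
    #     hashlib.md5("password".encode("UTF-8")).hexdigest()
    #     # -> "5f4dcc3b5aa765d61d8327deb882cf99"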

    # Prepares the POST values and headers for the login request.
    payload = {
        "vb_login_username": Username,
        "s": "",
        "securitytoken": "guest",
        "do": "login",
        "vb_login_md5password": pwmd5,
        "vb_login_md5password_utf": pwmd5
    }
    headers = {
        "Referer": "http://www.elitepvpers.com/forum/"
    }

    # cfscrape.create_scraper() behaves identically to requests.Session(),
    # except it bypasses any CloudFlare test pages.
    # Assigns cfscrape.create_scraper() to the variable s.
    with cfscrape.create_scraper() as s:
        # Logs in, which sets the session cookies that are required to do everything else.
        print("Logging in...")
        p = s.post(loginURL, data=payload, headers=headers)
        print("Logged in! Starting...\n")

        # Fetches the first page of the thread (main page).
        print("Getting pagecount.")
        r = s.get(urlPrefix + forumThread + urlSuffix)
        # Parses the HTML with BeautifulSoup.
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        # Finds all the pagination elements (only 2 exist, if any at all).
        pagecount = soup.find_all(class_="pagenav")
        # If the pagination elements are found, process them.
        # Else, set it to one. This prevents the entire scraping part of the script from executing more than once.
        if len(pagecount) > 1:
            # Finds the "Page 1 of n" element and parses its innerHTML.
            pagerange = pagecount[0].find_all(class_="vbmenu_control")[0].decode_contents(formatter="html")
            # Regex to find the total number of pages.
            prange = re.findall("Page 1 of ([0-9]+)", str(pagerange))
            # Assigns the integer value to the variable pagecount.
            pagecount = int(prange[0])
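            # Illustrative example (hypothetical value, not from the original script):
            # a pager reading "Page 1 of 7" gives prange == ["7"], so pagecount becomes 7.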
        else:
            # Sets pagecount to 1 to prevent looping the script.
            pagecount = 1
        print("Pagecount interpreted as: " + str(pagecount) + "\nStarting scraping:")

        # Loops from 1 (starts at 1, makes it easier to handle) up to and including pagecount
        # (range() needs pagecount + 1 because it excludes its end value).
        # Assigns the index to the variable baka.
        for baka in range(1, pagecount + 1):
            print("\tPage " + str(baka) + " of " + str(pagecount) + "\n")

            # Gets the page to scrape.
            if baka == 1:
                # Gets the thread without any page value, so the first page.
                r = s.get(urlPrefix + forumThread + urlSuffix)
            elif baka > 1:
                # If the index is larger than one, add the "-" and the index before the file extension,
                # in order to scrape the given page of the thread.
                r = s.get(urlPrefix + forumThread + "-" + str(baka) + urlSuffix)
            # Parses the received content into BeautifulSoup.
            soup = bs4.BeautifulSoup(r.text, "html.parser")
            # Finds all the <div> elements with attribute "itemprop" set to "text".
            # (Only posts/replies have that itemprop value).
            posts = soup.findAll("div", {"itemprop": "text"})

            # Opens and clears the outputJson file.
            output = open(outputJson, "w")
            # Prepares it for JSON data.
            output.write("[\n")
            output.close()

            # If it's the first iteration, it skips the first post,
            # because this will always be the thread itself (which we don't want to scrape).
            # Also finds the post id (which will be the first post on the first page).
            if baka == 1:
                # Regex to find the id of the main thread (which is the first post of the first page).
                __POSTID = re.findall("id=\"post_message_([0-9]+)\"", str(posts[0]))
                POSTID = __POSTID[0]
                print("\tPost ID: " + str(POSTID))
                posts = posts[1:]

            # Loops through all the posts (except the first if it's the first page).
            for nya in posts:
                # (Messy) regex to find all URLs linking to other posts on the same forum.
                regex = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/forum\/elite-gold-trading\/([0-9]+-(.+)){1}\.html\"", str(nya))
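                # Illustrative example (hypothetical slug, not from the original script): for a link like
                # href="//www.elitepvpers.com/forum/elite-gold-trading/1234567-some-shop.html"
                # the third captured group, poi[2] below, is "1234567-some-shop".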
                # Loops through every found URL.
                for poi in regex:
                    # Re-creates the full URL from the correct captured group.
                    foundURL = "http://www.elitepvpers.com/forum/elite-gold-trading/" + poi[2] + ".html"

                    # Loads the thread, parses it, and looks for the first (main) post.
                    print("\tFound " + foundURL + ", loading...")
                    sh = s.get(foundURL)
                    fs = bs4.BeautifulSoup(sh.text, "html.parser")
                    thread = fs.findAll("div", {"itemprop": "text"})[0]
                    print("\tFound main thread! (Hopefully)\n")

                    # Regex to find all URLs linking to a treasure.
                    # Captures their IDs (which is all we need anyway).
                    treasures = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/theblackmarket\/treasure\/([0-9]+)\"", str(thread))
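                    # Illustrative example (hypothetical ID, not from the original script): for a link like
                    # href="//www.elitepvpers.com/theblackmarket/treasure/987654"
                    # the captured groups are ("//", "", "987654"), so the last group is the treasure ID.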

                    # Loops through all found treasures.
                    for desu in treasures:
                        # Takes the treasure ID from the last captured group and re-creates the full treasure URL.
                        treasureID = desu[2]
                        foundTreasure = "http://www.elitepvpers.com/theblackmarket/treasure/" + treasureID

                        # Loads/parses the treasure page.
                        print("\t\tTreasure " + treasureID + ", loading...")
                        ts = s.get(foundTreasure)
                        tsp = bs4.BeautifulSoup(ts.text, "html.parser")
                        tt = tsp.findAll("table", {"cellpadding": "5"})
                        tr = bs4.BeautifulSoup(str(tt), "html.parser").findAll("tr")
                        print("\t\tLoaded, processing...")

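                        # Row layout assumed by the lookups below (inferred from how tr[0]..tr[3] are used):
                        # tr[0] = Title, tr[1] = Seller, tr[2] = Buyer, tr[3] = Cost.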
                        # Looks for the "Buyer" field first.
                        # If this doesn't contain "n/a", it's sold, so we should skip it.
                        vs02 = bs4.BeautifulSoup(str(tr[2]), "html.parser")
                        if str(vs02.findAll("td")[1].decode_contents(formatter="html")) != "n/a":
                            print("\t\t\tTreasure sold, skipping...\n")
                        else:
                            # If still unsold, it looks for the other values we need as well,
                            # and puts them in the temporary dictionary.
                            vs00 = bs4.BeautifulSoup(str(tr[0]), "html.parser")
                            vs01 = bs4.BeautifulSoup(str(tr[1]), "html.parser")
                            vs03 = bs4.BeautifulSoup(str(tr[3]), "html.parser")
                            treasure = {
                                "id": treasureID,
                                "title": str(vs00.findAll("td")[1].decode_contents(formatter="html")),
                                "seller": str(vs01.findAll("td")[1].decode_contents(formatter="html")),
                                "cost": str(vs03.findAll("td")[1].decode_contents(formatter="html"))
                            }
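                            # Illustrative example of one such dictionary (all values hypothetical):
                            # {"id": "987654", "title": "Some item", "seller": "<a href=\"...\">SomeSeller</a>", "cost": "50"}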

                            # Opens the outputJson file in append mode.
                            output = open(outputJson, "a")
                            # Appends the treasure to the file in JSON form, pretty-printed.
                            output.write(json.dumps(treasure, sort_keys=True, indent=4, separators=(",", ": ")))
                            # Adds a "," separator after the entry (the trailing one is removed later).
                            output.write(",\n")
                            output.close()
                            print("\t\t\tTreasure saved!\n")
            print("Finished scraping!\n")
            # Removes the trailing "," separator (and its line ending) that the last
            # append left behind, so the array can be closed as valid JSON.
            output = open(outputJson, "rb+")
            data = output.read()
            if "," in data:
                # Cut the file right before the dangling separator.
                output.seek(data.rindex(","))
                output.truncate()
            output.close()

            # Closes the JSON, making it valid again.
            output = open(outputJson, "a")
            output.write("\n]")
            output.close()

            # Re-opens the JSON and parses it into a list containing dictionaries.
            output = open(outputJson, "r")
            outputstr = output.read()
            output.close()
            outputl = json.loads(outputstr)

            # Prepares the outputRaw file containing the markup for the table.
            markup = open(outputRaw, "w")
            markup.write("\n[CENTER][table=\"head\"] Title | Cost | Treasure | Seller\n\n")
            markup.close()

            # Opens the outputRaw file to append the individual table rows.
            markup = open(outputRaw, "a")

            # Loops through every list element, formatting it correctly, and writes it to the file.
            for oppai in outputl:
                seller = oppai["seller"].replace("<a href=", "[URL=")
                seller = seller.replace("\">", "\"]")
                seller = seller.replace("</a>", "[/URL]")
                markup.write(oppai["title"] + " | " + oppai["cost"] + " | [URL=\"http://www.elitepvpers.com/theblackmarket/treasure/" + oppai["id"] + "\"][IMG]http://www.elitepvpers.com/images/tbm/treasures.gif[/IMG][/URL] | " + seller + "\n\n")
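                # Illustrative example of one generated row (hypothetical values):
                # Some item | 50 | [URL="http://www.elitepvpers.com/theblackmarket/treasure/987654"][IMG]http://www.elitepvpers.com/images/tbm/treasures.gif[/IMG][/URL] | [URL="..."]SomeSeller[/URL]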

            # Closes the outputRaw file, making it valid BBCode.
            markup.write("[/table][/CENTER]\n")
            markup.close()

            # Regex to get the security token needed to update and bump the post.
            print("Fetching security token...")
            st = s.get("http://www.elitepvpers.com/forum/")
            sectokenr = re.findall("var SECURITYTOKEN = \"(.+)\";", st.text)
            sectoken = sectokenr[0]
            print("\tSecurity token: " + sectoken + "\n")
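            # Illustrative example (the token value is made up): the forum page source contains a line like
            #     var SECURITYTOKEN = "1490000000-0123456789abcdef";
            # and the capture group above becomes that quoted value.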

            # Opens the outputRaw file to read its contents.
            markup = open(outputRaw, "r")

            # Constructs the full post from the generated BBCode and the configured prefix and suffix.
            full_message = cfg["full_prefix"] + markup.read() + cfg["full_suffix"]
            markup.close()

            # Prepares the POST payload for the edit.
            editpayload = {
                "securitytoken": sectoken,
                "do": "updatepost",
                "ajax": "1",
                "postid": POSTID,
                "wysiwyg": "0",
                "message": full_message,
                "reason": "",
                "postcount": "1"
            }
            # Constructs the full URL to edit the post.
            editURL = "http://www.elitepvpers.com/forum/editpost.php?do=updatepost&p=" + POSTID
            print("Updating...")
            #edit = s.post(editURL, data=editpayload)
            print("\tUpdated!\n")

            # Gets the thread id (post, thread, who cares).
            thread_id_r = re.findall("([0-9]+){1}", forumThread)
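            # Illustrative example (hypothetical slug): for forumThread == "1234567-my-shop",
            # thread_id_r[0] is "1234567".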
            # Prepares the POST payload to bump.
            bumppayload = {
                "thread_id": thread_id_r[0],
                "coin_usage_count": "0",
                "securitytoken": sectoken
            }
            # Sends POST request to bump the thread.
            print("Bumping thread...")
            #bump = s.post("https://www.elitepvpers.com/forum/bump.php?do=do_bump", data=bumppayload)
            print("\tBumped!\n")
    # Print total time for the script.
    time_finish = time.time()
    print("Total time: " + str(time_finish - time_start) + "s")

    # Schedule next execution.
    ev = scheduled.enter(cfg["repeat_time_s"], 1, nyanpasu, ())
# Schedule first execution.
scheduled.enter(0, 1, nyanpasu, ())
scheduled.run()
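
# Usage sketch (an assumption, not part of the original paste): save this file as,
# for example, epvp_treasure_watch.py, install the dependencies listed at the top,
# and run it with:
#     python2 epvp_treasure_watch.py
# The first run writes a default config.json and exits; fill it in and run again.
# After that the script re-schedules itself every repeat_time_s seconds.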