# -*- coding: utf-8 -*-
# IMPORTANT THINGS:
# Made for Python 2.7.
# REQUIRES BeautifulSoup4 (pip install beautifulsoup4).
# REQUIRES Js2py (pip install js2py).
# REQUIRES cloudflare-scrape (pip install cfscrape).
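# All three can be installed in one go (assumes pip for Python 2.7 is on your PATH):
#   pip install beautifulsoup4 js2py cfscrape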

import requests, re, hashlib, json, os, cfscrape, sys, sched, time, bs4, collections

# Defines the scheduler (for looping).
scheduled = sched.scheduler(time.time, time.sleep)

# Main function (only in a function for the scheduler).
def nyanpasu():

    # Crude "clear screen": just print a pile of blank lines. Yes, I'm doing it this way.
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")

    # Checks if config.json exists or is equivalent to the default.
    # If the file exists and it's not equivalent to the default, it loads the config and continues the script.
    # Else it creates/overrides the file with the default and closes.
    _config_default = "{\n\t\"username\": \"\",\n\t\"password\": \"\",\n\t\"forum_thread\": \"\",\n\t\"output_json\": \"output.json\",\n\t\"output_markup\": \"markup.txt\",\n\t\"full_prefix\": \"\",\n\t\"full_suffix\": \"\",\n\t\"repeat_time_s\": 1800,\n\t\"blacklist_posts\": [],\n\t\"blacklist_treasures\": []\n}"
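    # For reference, a filled-in config.json might look like this
    # (every value below is an illustrative placeholder, not a real credential or thread):
    # {
    #     "username": "ExampleUser",
    #     "password": "correct horse battery staple",
    #     "forum_thread": "1234567-example-thread",
    #     "output_json": "output.json",
    #     "output_markup": "markup.txt",
    #     "full_prefix": "[B]Currently available treasures:[/B]\n",
    #     "full_suffix": "\n[I]List is updated automatically.[/I]",
    #     "repeat_time_s": 1800,
    #     "blacklist_posts": ["7654321-ignored-thread"],
    #     "blacklist_treasures": ["111111"]
    # }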
    if os.path.isfile("config.json") and open("config.json", "r").read() != _config_default:
        config = open("config.json", "r")
        cfg = json.loads(config.read())
    else:
        _config = open("config.json", "w")
        _config.write(_config_default)
        _config.close()
        print("Created default config.json. Configure and then run again.")
        sys.exit()

    # Initialize some variables (Should make sense).
    time_start = time.time()
    forumThread = cfg["forum_thread"]
    outputJson = cfg["output_json"]
    outputRaw = cfg["output_markup"]
    Username = cfg["username"]
    Password = cfg["password"]
    BlacklistPosts = cfg["blacklist_posts"]
    BlacklistTreasures = cfg["blacklist_treasures"]
    urlPrefix = "http://www.elitepvpers.com/forum/elite-gold-trading/"
    urlSuffix = ".html"
    loginURL = "http://www.elitepvpers.com/forum/login.php?do=login"

    print("Timestamp: " + str(time_start) + "\nThread to watch: " + forumThread + "\n")

    # Creates the UTF-8 MD5 (a really really secure algorithm) hash of the password.
    pwmd5 = hashlib.md5(Password.encode("UTF-8")).hexdigest()
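    # Quick sanity check of what the line above produces (digest shown is the widely
    # published MD5 of the string "password"; purely illustrative):
    #   hashlib.md5("password".encode("UTF-8")).hexdigest()
    #   -> "5f4dcc3b5aa765d61d8327deb882cf99"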

    # Prepares the POST values and headers for the login request.
    payload = {
        "vb_login_username": Username,
        "s": "",
        "securitytoken": "guest",
        "do": "login",
        "vb_login_md5password": pwmd5,
        "vb_login_md5password_utf": pwmd5
    }
    headers = {
        "Referer": "http://www.elitepvpers.com/forum/"
    }

    # cfscrape.create_scraper() behaves identically to requests.Session(),
    # except it bypasses any CloudFlare test pages.
    # Assigns the resulting scraper/session to the variable s.
    with cfscrape.create_scraper() as s:
        # Logs in, which sets the session variables that are required to do the other things.
        print("Logging in...")
        p = s.post(loginURL, data=payload, headers=headers)
        print("Logged in! Starting...\n")

        # Fetches the first page of the thread (main page).
        print("Getting pagecount.")
        r = s.get(urlPrefix + forumThread + urlSuffix)
        # Parses the HTML into BeautifulSoup.
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        # Finds all the pagination elements (Only 2 exist, if at all).
        pagecount = soup.find_all(class_="pagenav")
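        # The first vbmenu_control inside the pagenav normally holds text along the lines of
        # "Page 1 of 7" (example value); the regex below extracts the total page count from it.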
        # If the 2 elements (could be more, don't know, but the script is prepared) are found, process them.
        # Else, set it to 1 so the scraping loop below runs exactly once.
        if len(pagecount) > 1:
            # Finds the "Page 1 of n" element and parses its innerHTML.
            pagerange = pagecount[0].find_all(class_="vbmenu_control")[0].decode_contents(formatter="html")
            # Regex to find the total number of pages.
            prange = re.findall("Page 1 of ([0-9]+)", pagerange)
            # Assigns the integer value to the variable pagecount.
            pagecount = int(prange[0])
        else:
            # Sets pagecount to 1 to prevent looping the script.
            pagecount = 1
        print("Pagecount interpreted as: " + str(pagecount) + "\nStarting scraping:")

        trlist = {}
        # Loops from 1 (starting at 1 makes the page handling easier) to pagecount inclusive
        # (range() excludes its end value, hence pagecount + 1).
        # Assigns the index to the variable baka.
        for baka in range(1, pagecount + 1):
            print("\tPage " + str(baka) + " of " + str(pagecount) + "\n")

            # Gets the page to scrape.
            if baka == 1:
                # Gets the thread without any page value, so the first page.
                r = s.get(urlPrefix + forumThread + urlSuffix)
            elif baka > 1:
                # If the index is larger than one, add the "-" and the index before the file extension,
                # in order to scrape the given page of the thread.
                r = s.get(urlPrefix + forumThread + "-" + str(baka) + urlSuffix)
            # Parses the received content into BeautifulSoup.
            soup = bs4.BeautifulSoup(r.text, "html.parser")
            # Finds all the <div> elements with attribute "itemprop" set to "text".
            # (Only posts/replies have that itemprop value).
            posts = soup.findAll("div", {"itemprop": "text"})
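            # Each matched post body looks roughly like
            #   <div id="post_message_1234567" itemprop="text"> ... </div>
            # (id value illustrative); the post-id regex further down relies on that id attribute.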

            # Opens and clears the outputJson file.
            output = open(outputJson, "w").close()

            # If it's the first iteration, it skips the first post,
            # because this will always be the thread itself (which we don't want to scrape).
            # Also finds the post id (which will be the first post on the first page).
            if baka == 1:
                # Regex to find the id of the main thread (which is the first post of the first page).
                __POSTID = re.findall("id=\"post_message_([0-9]+)\"", str(posts[0]))
                POSTID = __POSTID[0]
                print("\tPost ID: " + str(POSTID))
                posts = posts[1:]

            # Loops through all the posts (Except the first if it's the first page).
            for nya in posts:
                # (Messy) regex to find all URLs linking to other posts on the same forum.
                regex = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/forum\/elite-gold-trading\/([0-9]+-(.+)){1}\.html\"", str(nya))
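                # Capture groups per match, as used below (slug shown is illustrative):
                #   poi[0] = scheme prefix ("", "//", "http://" or "https://")
                #   poi[1] = the optional "s" from http(|s)
                #   poi[2] = the thread slug, e.g. "7654321-another-thread"
                #   poi[3] = the part of the slug after the numeric id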
                # Loops through every found URL.
                for poi in regex:
                    if poi[2] not in BlacklistPosts:
                        # Re-creates the full URL from the correct captured group.
                        foundURL = "http://www.elitepvpers.com/forum/elite-gold-trading/" + poi[2] + ".html"

                        # Loads the thread, parses it, and looks for the first (main) post.
                        print("\tFound " + foundURL + ", loading...")
                        sh = s.get(foundURL)
                        fs = bs4.BeautifulSoup(sh.text, "html.parser")
                        thread = fs.findAll("div", {"itemprop": "text"})[0]
                        print("\tFound main thread! (Hopefully)\n")

                        # Regex to find all URLs linking to a treasure.
                        # Captures their IDs (which is all we need anyway).
                        treasures = re.findall("href=\"(|\/\/|http(|s):\/\/)www\.elitepvpers\.com\/theblackmarket\/treasure\/([0-9]+)\"", str(thread))
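                        # Capture groups per match: desu[0] = scheme prefix, desu[1] = the
                        # optional "s", desu[2] = the numeric treasure id (e.g. "123456",
                        # illustrative) -- which is the only group used below.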

                        # Loops through all found treasures.
                        for desu in treasures:
                            # Takes the treasure ID straight from its capture group.
                            treasureID = desu[2]
                            if treasureID not in BlacklistTreasures:
                                foundTreasure = "http://www.elitepvpers.com/theblackmarket/treasure/" + treasureID

                                # Loads/parses the treasure page.
                                print("\t\tTreasure " + treasureID + ", loading...")
                                ts = s.get(foundTreasure)
                                tsp = bs4.BeautifulSoup(ts.text, "html.parser")
                                __verify = tsp.findAll("td", {"class": "contentwhite"})
                                _verify = bs4.BeautifulSoup(str(__verify), "html.parser")
                                verify = str(_verify.findChild("h2").decode_contents(formatter="html"))
                                if verify != "Page not found":
                                    tt = tsp.findAll("table", {"cellpadding": "5"})
                                    tr = bs4.BeautifulSoup(str(tt), "html.parser").findAll("tr")
                                    print("\t\tLoaded, processing...")

                                    # Looks for the "Buyer" field first.
                                    # If this doesn't contain "n/a", it's sold, so we should skip it.
                                    vs02 = bs4.BeautifulSoup(str(tr[2]), "html.parser")
                                    if (str(vs02.findAll("td")[1].decode_contents(formatter="html")) != "n/a"):
                                        print("\t\t\tTreasure sold, skipping...\n")
                                    else:
                                        # If still unsold, it looks for the other values we need as well,
                                        # and puts them in the temporary dictionary.
                                        vs00 = bs4.BeautifulSoup(str(tr[0]), "html.parser")
                                        vs01 = bs4.BeautifulSoup(str(tr[1]), "html.parser")
                                        vs03 = bs4.BeautifulSoup(str(tr[3]), "html.parser")
                                        trlist[str(vs00.findAll("td")[1].decode_contents(formatter="html"))] = {
                                            "id": treasureID,
                                            "seller": str(vs01.findAll("td")[1].decode_contents(formatter="html")),
                                            "cost": str(vs03.findAll("td")[1].decode_contents(formatter="html"))
                                        }

                                        # Opens the outputJson file in append mode.
                                        #output = open(outputJson, "a")
                                        # Appends the treasure to the file in JSON form, pretty-printed.
                                        #output.write(json.dumps(treasure, sort_keys=True, indent=4, separators=(",", ": ")))
                                        # Makes sure that the JSON remains valid.
                                        #output.write(",\n")
                                        #output.close()
                                        print("\t\t\tTreasure saved!\n")
                                else:
                                    print("\t\t\tTreasure not found. Skipping.\n")
        output = open(outputJson, "a")
        output.write(json.dumps(trlist, sort_keys=True, indent=4, separators=(",", ": ")))
        output.close()
        print("Finished scraping!\n")

        # Re-opens the JSON and parses it into a dictionary of dictionaries.
        output = open(outputJson, "r")
        outputstr = output.read()
        output.close()
        outputl = json.loads(outputstr)

        ordered = collections.OrderedDict(sorted(outputl.items()))
        # Prepares the outputRaw file containing the markup for the table.
        markup = open(outputRaw, "w")
        markup.write("\n[CENTER][table=\"head\"] Title | Cost | Treasure | Seller\n\n")
        markup.close()

        # Opens the outputRaw file to append the individual table rows.
        markup = open(outputRaw, "a")

        # Loops through every entry (sorted by title), formats it, and writes it to the file.
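        # Each row written below ends up looking roughly like this (title, id and the
        # bracketed placeholders are illustrative):
        #   Some Treasure Title | <cost> | [URL="http://www.elitepvpers.com/theblackmarket/treasure/123456"][IMG]http://www.elitepvpers.com/images/tbm/treasures.gif[/IMG][/URL] | [URL="<seller profile url>"]SomeSeller[/URL]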
        for neko, oppai in ordered.items():
            seller = oppai["seller"].replace("<a href=", "[URL=")
            seller = seller.replace("\">", "\"]")
            seller = seller.replace("</a>", "[/URL]")
            markup.write(neko + " | " + oppai["cost"] + " | [URL=\"http://www.elitepvpers.com/theblackmarket/treasure/" + oppai["id"] + "\"][IMG]http://www.elitepvpers.com/images/tbm/treasures.gif[/IMG][/URL] | " + seller + "\n\n")

        # Writes the closing tags (completing the BBCode table) and closes the outputRaw file.
        markup.write("[/table][/CENTER]\n")
        markup.close()

        # Regex to get the security token needed to update and bump the post.
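        # vBulletin pages embed the token in an inline script, roughly:
        #   var SECURITYTOKEN = "1490000000-0123456789abcdef0123456789abcdef01234567";
        # (value illustrative); the regex below just grabs whatever sits between the quotes.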
        print("Fetching security token...")
        st = s.get("http://www.elitepvpers.com/forum/")
        sectokenr = re.findall("var SECURITYTOKEN = \"(.+)\";", st.text)
        sectoken = sectokenr[0]
        print("\tSecurity token: " + sectoken + "\n")

        # Opens the outputRaw file to read its contents.
        markup = open(outputRaw, "r")

        # Constructs the full post from the generated BBCode and the configured prefix and suffix.
        full_message = cfg["full_prefix"] + markup.read() + cfg["full_suffix"]

        # Prepares the POST payload for the edit.
        editpayload = {
            "securitytoken": sectoken,
            "do": "updatepost",
            "ajax": "1",
            "postid": POSTID,
            "wysiwyg": "0",
            "message": full_message,
            "reason": "",
            "postcount": "1"
        }
        # Constructs the full URL to edit the post.
        editURL = "http://www.elitepvpers.com/forum/editpost.php?do=updatepost&p=" + POSTID
        print("Updating...")
        edit = s.post(editURL, data=editpayload)
        print("\tUpdated!\n")

        # Gets the thread id (post, thread, who cares).
        thread_id_r = re.findall("([0-9]+){1}", forumThread)
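        # e.g. a forum_thread of "1234567-example-thread" (illustrative) makes
        # thread_id_r[0] == "1234567", which is what the bump endpoint expects below.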
        # Prepares the POST payload to bump.
        bumppayload = {
            "thread_id": thread_id_r[0],
            "coin_usage_count": "0",
            "securitytoken": sectoken
        }
        # Sends POST request to bump the thread.
        print("Bumping thread...")
        bump = s.post("https://www.elitepvpers.com/forum/bump.php?do=do_bump", data=bumppayload)
        print("\tBumped!\n")
    # Print total time for the script.
    time_finish = time.time()
    print("Total time: " + str(time_finish - time_start) + "s")

    # Schedule next execution.
    #ev = scheduled.enter(cfg["repeat_time_s"], 1, nyanpasu, ())
# Schedule first execution.
scheduled.enter(0, 1, nyanpasu, ())
scheduled.run()