Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Python Reddit Scraper V1.0
- # This script scrapes Reddit
- # PHP XML Render page: http://pastebin.com/nip125AJ
- # It then exports the results to an XML file, to be read by whichever method you decide is best.
- # I wanted to create a scraper that would grab three things 1. The name of the post 2. The comments section URL and 3. The image / youtube video attributed to the post
- # NOTES
- # Coded on Python 2.7.5
- # Requires Requests and LXML modules
- # Coded by LKP from CodeCall.net
# IMPORTS #
from lxml import html  # HTML parser with XPath support

try:
    import xml.etree.cElementTree as XMLT  # C-accelerated ElementTree (Python 2)
except ImportError:
    # cElementTree was removed in Python 3.9; the plain module is the
    # drop-in replacement there.
    import xml.etree.ElementTree as XMLT

import requests  # HTTP client used to download the page

# VARIABLES #
page = requests.get('http://www.reddit.com/r/minecraft')  # Page to scrape
# Fail loudly on an HTTP error rather than parsing an error page and
# overwriting MC.xml with an empty document.
page.raise_for_status()

# Parsed HTML document that the XPath queries below run against.
html_tree = html.fromstring(page.text)

# Text of every <a> whose class attribute is exactly "title " (the trailing
# space is intentional; the author notes Reddit's markup includes it).
title = html_tree.xpath('//a[@class="title "]/text()')
# href of the <a> inside every <li class="first"> (per the author's notes,
# this is the link to the post's comments section).
link = html_tree.xpath('//li[@class="first"]/a/@href')
# href of the <a> inside every <p class="title"> (per the author's notes,
# the image / video attached to the post).
imgur = html_tree.xpath('//p[@class="title"]/a/@href')

# Root element; created once, outside the loop, so every POST hangs off it.
root = XMLT.Element("ENTRY")

# MAIN CODE #
# zip() walks the three result lists in lockstep and stops at the shortest
# one, so a page where the queries return different numbers of matches no
# longer raises IndexError part-way through building the document.
for post_title, media_url, comments_url in zip(title, imgur, link):
    post = XMLT.SubElement(root, "POST")
    XMLT.SubElement(post, "TITLE").text = post_title
    XMLT.SubElement(post, "MEDIA").text = media_url
    XMLT.SubElement(post, "LINK").text = comments_url

# Wrap the root in an ElementTree and write it out as the XML document.
tree = XMLT.ElementTree(root)
tree.write("MC.xml")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement