Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
# DarkScrape V1.0
# Improved version of my Reddit scraper
# Albeit not perfect, it's a lot better, as it uses SQL to store the findings in a DB file
# By LKP
# Python 2.7.5
# Requires Requests, LXML and Colorama!
# Standard library.
import sqlite3
import time

# Third-party: HTTP fetch, HTML parsing, terminal colours.
import requests
from lxml import html
from colorama import Fore, Back, Style, init

# autoreset puts the terminal back to normal colours after every print,
# so the coloured banners below never need an explicit reset.
init(autoreset=True)
class website(object):
    """Scrape three subreddit front pages into a local SQLite database.

    Each pass re-fetches every page in ``pagelist`` and fully replaces the
    rows of the matching ``reddit_*`` table.  ``print(...)`` call syntax is
    used throughout so the single-argument form behaves identically under
    Python 2.7 (the version the header targets) and Python 3.

    NOTE(review): the lowercase class name is kept because the script footer
    rebinds it (``website = website()``); renaming would break that caller.
    """

    # Database file shared by every method.
    DB_FILE = 'DarkScrape.db'

    # Subreddit front pages polled on every pass, in scrape order.
    pagelist = [
        "http://www.reddit.com/r/minecraft",
        "http://www.reddit.com/r/cubeworld",
        "http://www.reddit.com/r/science",
    ]

    def start(self):
        """Run the scrape loop forever, sleeping 10 minutes between passes.

        BUG FIX: the original also called ``self.start()`` at the bottom of
        the ``while True`` body, stacking one extra frame per pass until a
        RecursionError; the loop by itself already repeats the work.
        """
        while True:
            print(Style.BRIGHT + Fore.YELLOW + " Running " +
                  Style.BRIGHT + Fore.RED + "Dark" + Fore.BLACK + "Scrape")
            self.redditMC()
            self.redditCW()
            self.redditSCI()
            self.finish()
            print(Style.BRIGHT + Fore.YELLOW +
                  " Waiting 10 minutes and then starting again!\n\n\n")
            time.sleep(600)

    def setup(self):
        """Create the three result tables; now safe to run more than once.

        FIXES: ``CREATE TABLE IF NOT EXISTS`` so a second call no longer
        raises sqlite3.OperationalError, plus an explicit commit and close so
        the DDL is durable and the connection is not leaked.
        """
        connect = sqlite3.connect(self.DB_FILE)
        data = connect.cursor()
        for table, label in (('reddit_minecraft', 'minecraft'),
                             ('reddit_cubeworld', 'cubeworld'),
                             ('reddit_science', 'science')):
            print("Setting up table for %s!" % label)
            data.execute('CREATE TABLE IF NOT EXISTS %s(title,link,media)' % table)
            print("Set up complete!")
        connect.commit()
        connect.close()

    def _scrape(self, url, table, banner):
        """Fetch *url* and replace *table*'s rows with the posts found there.

        Shared body of redditMC/redditCW/redditSCI (the originals were three
        verbatim copies).  FIXES over those copies: the three xpath result
        lists are zipped so unequal lengths can no longer raise IndexError,
        the inserts are one executemany with a single commit instead of a
        commit per row, and the connection is closed instead of leaked.

        The table name is interpolated only from our own hard-coded call
        sites, never from user input, so building that part of the SQL as a
        string is safe here; the row values still go through ? placeholders.
        """
        connect = sqlite3.connect(self.DB_FILE)
        data = connect.cursor()
        page = requests.get(url)
        tree = html.fromstring(page.text)
        # NOTE(review): these selectors target reddit's old HTML markup --
        # the trailing space in class "title " is intentional and load-bearing.
        titles = tree.xpath('//a[@class="title "]/text()')
        links = tree.xpath('//li[@class="first"]/a/@href')
        medias = tree.xpath('//p[@class="title"]/a/@href')
        print(banner)
        # Each pass fully replaces the table's previous contents.
        data.execute("DELETE FROM " + table)
        data.executemany("INSERT INTO " + table + " VALUES (?, ?, ?)",
                         zip(titles, links, medias))
        connect.commit()
        data.close()
        connect.close()

    def redditMC(self):
        """Scrape /r/minecraft into the reddit_minecraft table."""
        self._scrape(self.pagelist[0], 'reddit_minecraft',
                     Style.BRIGHT + Fore.BLUE + "\n Extracting from Reddit Minecraft")

    def redditCW(self):
        """Scrape /r/cubeworld into the reddit_cubeworld table."""
        self._scrape(self.pagelist[1], 'reddit_cubeworld',
                     Style.BRIGHT + Fore.CYAN + "\n Extracting from Reddit Cubeworld")

    def redditSCI(self):
        """Scrape /r/science into the reddit_science table."""
        self._scrape(self.pagelist[2], 'reddit_science',
                     Style.BRIGHT + Fore.MAGENTA + "\n Extracting from Reddit Science")

    def finish(self):
        """Announce a completed pass and pause briefly so it is readable."""
        print(Style.BRIGHT + Fore.GREEN + "\n Scrape Successful!\n")
        time.sleep(5)
# Entry point.  The __main__ guard keeps the infinite scrape loop from firing
# when this file is imported as a module; the instance is named `scraper` so
# it no longer shadows (and destroys) the `website` class binding.
if __name__ == "__main__":
    scraper = website()
    # scraper.setup()  # Uncomment this line if you're running the script for the first time.
    scraper.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement