Advertisement
LorenKPetrov

DarkScrape V1.0

Nov 16th, 2013
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.53 KB | None | 0 0
  1. # DarkScrape V1.0
  2. # Improved version of my reddit scraper
  3. # Although not perfect, it is a lot better because it stores the findings in an SQLite DB file
  4. # By LKP
  5. # Python 2.7.5
  6. # Requires Requests, LXML and Colorama!
  7.  
  8. #!/usr/bin/env python
  9. import sqlite3
  10. import requests
  11. from lxml import html
  12. from colorama import Fore, Back, Style, init
  13. init(autoreset=True)
  14. import time
  15.  
  16. class website(object):
  17.    
  18.     pagelist=["http://www.reddit.com/r/minecraft","http://www.reddit.com/r/cubeworld","http://www.reddit.com/r/science"]
  19.    
  20.     def start(self):
  21.        
  22.         while True:
  23.             print Style.BRIGHT+Fore.YELLOW+" Running "+Style.BRIGHT+Fore.RED+"Dark"+Fore.BLACK+"Scrape"
  24.             self.redditMC()
  25.             self.redditCW()
  26.             self.redditSCI()
  27.             self.finish()
  28.             print Style.BRIGHT+Fore.YELLOW+" Waiting 10 minutes and then starting again!\n\n\n"
  29.             time.sleep(600)    
  30.             self.start()
  31.  
  32.     def setup(self):
  33.         connect = sqlite3.connect('DarkScrape.db')
  34.         data = connect.cursor()
  35.        
  36.         print "Setting up table for minecraft!"
  37.         data.execute('''CREATE TABLE reddit_minecraft(title,link,media)''')
  38.         print "Set up complete!"
  39.         print "Setting up table for cubeworld!"
  40.         data.execute('''CREATE TABLE reddit_cubeworld(title,link,media)''')
  41.         print "Set up complete!"
  42.         print "Setting up table for science!"
  43.         data.execute('''CREATE TABLE reddit_science(title,link,media)''')
  44.         print "Set up complete!"
  45.    
  46.     def redditMC(self):
  47.        
  48.         connect = sqlite3.connect('DarkScrape.db')
  49.         data = connect.cursor()
  50.        
  51.         page = requests.get(self.pagelist[0])
  52.         tree = html.fromstring(page.text)
  53.         title = tree.xpath('//a[@class="title "]/text()')
  54.         link = tree.xpath('//li[@class="first"]/a/@href')
  55.         media = tree.xpath('//p[@class="title"]/a/@href')
  56.        
  57.         A = 0
  58.         B = len(title)
  59.         print Style.BRIGHT+Fore.BLUE+"\n Extracting from Reddit Minecraft"
  60.        
  61.         data.execute("DELETE FROM reddit_minecraft")
  62.        
  63.         while A < B:
  64.             data.execute("INSERT INTO reddit_minecraft VALUES (?, ?, ?)",(title[A], link[A], media[A]))
  65.             connect.commit()
  66.             A = A + 1
  67.  
  68.         data.close()
  69.        
  70.     def redditCW(self):
  71.        
  72.         connect = sqlite3.connect('DarkScrape.db')
  73.         data = connect.cursor()
  74.        
  75.         page = requests.get(self.pagelist[1])
  76.         tree = html.fromstring(page.text)
  77.         title = tree.xpath('//a[@class="title "]/text()')
  78.         link = tree.xpath('//li[@class="first"]/a/@href')
  79.         media = tree.xpath('//p[@class="title"]/a/@href')
  80.        
  81.         A = 0
  82.         B = len(title)
  83.         print Style.BRIGHT+Fore.CYAN+"\n Extracting from Reddit Cubeworld"
  84.        
  85.         data.execute("DELETE FROM reddit_cubeworld")
  86.        
  87.         while A < B:
  88.             data.execute("INSERT INTO reddit_cubeworld VALUES (?, ?, ?)",(title[A], link[A], media[A]))
  89.             connect.commit()
  90.             A = A + 1
  91.  
  92.         data.close()
  93.        
  94.     def redditSCI(self):
  95.        
  96.         connect = sqlite3.connect('DarkScrape.db')
  97.         data = connect.cursor()
  98.        
  99.         page = requests.get(self.pagelist[2])
  100.         tree = html.fromstring(page.text)
  101.         title = tree.xpath('//a[@class="title "]/text()')
  102.         link = tree.xpath('//li[@class="first"]/a/@href')
  103.         media = tree.xpath('//p[@class="title"]/a/@href')
  104.        
  105.         A = 0
  106.         B = len(title)
  107.         print Style.BRIGHT+Fore.MAGENTA+"\n Extracting from Reddit Science"
  108.        
  109.         data.execute("DELETE FROM reddit_science")
  110.        
  111.         while A < B:
  112.             data.execute("INSERT INTO reddit_science VALUES (?, ?, ?)",(title[A], link[A], media[A]))
  113.             connect.commit()
  114.             A = A + 1
  115.  
  116.         data.close()
  117.    
  118.     def finish(self):
  119.         print Style.BRIGHT+Fore.GREEN+"\n Scrape Successful!\n"
  120.         time.sleep(5)
  121.        
  122. website = website()
  123. # website.setup() # Uncomment this line if you're running the script for the first time.
  124. website.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement