Furaffinity Scraper for Hydrus Network.
# Fur Affinity Scraper for Hydrus Network.
# Tested in Python 2.7.8.
# This script provides methods to retrieve image hashes from Fur Affinity.
# It compiles all the hashes it finds into a database suitable for import into the Hydrus Network program (http://8ch.net/hydrus/).
# Requires the following nonstandard libraries:
#   requests (http://docs.python-requests.org)
#   BeautifulSoup (http://www.crummy.com/software/BeautifulSoup/)
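# Both can be installed with pip (assuming pip is available on your system):
#   pip install requests beautifulsoup4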
import sqlite3
import requests
import re
import bs4
import hashlib
import time
class FurAffinityScraper:
    def __init__( self, userName, password, userAgent ):
        print("Starting scraper ... ")
        # Set up database
        self.database = sqlite3.connect("furAffinity.db")
        self.cursor = self.database.cursor()
        # Check to see if the database has already been initialized.
        self.cursor.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name='hash_type';" )
        response = self.cursor.fetchone()
        if response:
            print("Database already initialized")
        else:
            print("Initializing database")
            self.InitDatabase()
        # Set up session
        self.session = requests.Session()
        print("Logging into Fur Affinity...")
        if not userName:
            raise ValueError("Username is missing!")
        elif not password:
            raise ValueError("Password is missing!")
        if userAgent:
            self.session.headers["User-Agent"] = userAgent
        else:
            print("No custom user-agent found. You may encounter problems because of this.")
        # Log in
        response = self.session.post( "https://www.furaffinity.net/login/", data={"action":"login", "retard_protection":"1", "name":userName, "pass":password, "login":"Login to FurAffinity"} )
        if response.url == "http://www.furaffinity.net/":
            print("Scraper logged in on account: " + userName)
        else:
            raise RuntimeError("Could not log in! This is probably a problem with the username or password.")
    def InitDatabase ( self ):
        self.cursor.execute("CREATE TABLE hash_type ( hash_type INTEGER );")
        self.cursor.execute("INSERT INTO hash_type VALUES (2)")
        self.cursor.execute("CREATE TABLE hashes ( hash_id INTEGER PRIMARY KEY, hash BLOB_BYTES );")
        self.cursor.execute("CREATE TABLE mappings ( hash_id INTEGER, tag_id INTEGER, PRIMARY KEY( hash_id, tag_id ) );")
        self.cursor.execute("CREATE TABLE namespaces ( namespace TEXT );")
        self.cursor.execute("INSERT INTO namespaces VALUES ('')")
        self.cursor.execute("INSERT INTO namespaces VALUES ('creator')")
        self.cursor.execute("INSERT INTO namespaces VALUES ('title')")
        self.cursor.execute("INSERT INTO namespaces VALUES ('source')")
        self.cursor.execute("INSERT INTO namespaces VALUES ('age-rating')")
        self.cursor.execute("CREATE TABLE tags ( tag_id INTEGER PRIMARY KEY, tag TEXT );")
    def ScrapeRange( self, startAt, stopAt, printUpdateEvery=10 ):
        # Set lastBatch
        beganMain = time.clock()
        self.lastBatch = {"timeStarted": time.clock(), "available": 0}
        current = startAt
        print("Beginning to scrape "+str(current)+"-"+str(stopAt)+", showing status every "+str(printUpdateEvery)+" images.")
        while current <= stopAt:
            gotPage = self.ScrapePage( current, True )
            if gotPage:
                self.lastBatch["available"] += 1
            if (current-startAt+1) % printUpdateEvery == 0:
                deltaTime = time.clock() - self.lastBatch["timeStarted"]
                print("Batch "+str(current-printUpdateEvery+1)+"-"+str(current)+": "+str(self.lastBatch["available"])+" images available, took "+str(deltaTime)+" seconds. Average: "+"{:.2f}".format(deltaTime/max(1,self.lastBatch["available"]))+" seconds per image.")
                self.lastBatch["timeStarted"] = time.clock()
                self.lastBatch["available"] = 0
            current += 1
        print("Ending scrape of Fur Affinity ids: "+str(startAt)+"-"+str(stopAt)+". Took "+str(time.clock()-beganMain)+" seconds!")
    # Returns True if the image was scraped, False if not.
    def ScrapePage( self, pageId, print_=True ):
        response = self.session.head( "https://www.furaffinity.net/view/"+str(pageId)+"/" )
        # Check to make sure the page exists: existing submission pages send a
        # "Vary" header, missing ones do not, so a cheap HEAD request is enough.
        if "vary" in response.headers:
            if print_:
                print("Image "+str(pageId)+" found, beginning scraping ...")
            response = self.session.get( "https://www.furaffinity.net/view/"+str(pageId)+"/" )
        else:
            if print_:
                print("Image "+str(pageId)+" not found, skipping")
            return False
        # Make a soup to easily navigate the page
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        # Get image title and uploader
        nameAndUploader = soup.find("td", align="left", class_="cat", valign="top", width="70%")
        title = str(nameAndUploader.b.string)
        uploader = str(nameAndUploader.a.string)
        # Get tags / keywords
        tags = []
        keywords = soup.find("div", id="keywords")
        if keywords:
            for child in keywords:
                if child and child.string and len(child.string) > 0 and child.string[-1] != "\n":
                    tagShort = str(child.string)
                    if tagShort[-1] == ",":
                        tagShort = tagShort[:-1]
                    tags.append( tagShort )
        tags.append( "title:" + title )
        tags.append( "creator:" + uploader )
        # Get source-link (Added to tags)
        #tags.append( "source:" + str(response.url) )
        # Get age rating (the first word of the rating image's alt text, e.g. "General")
        ageRating = re.search( "[^ ]*", soup.find("div", align="center").img.get("alt") ).group(0)
        tags.append( "age-rating:" + str(ageRating).lower() )
        # Get image hash
        imagePath = "https:" + soup.find("a", text=re.compile("Download")).get("href")
        hash_ = self.GetImageHash( imagePath )
        # Add to database
        self.AddFileToDatabase( hash_, tags )
        return True
    def GetImageHash ( self, imagePath ):
        r = requests.get( imagePath, stream=True )
        hash_ = hashlib.sha256()
        for chunk in r.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive new chunks
                hash_.update(chunk)
        # Return the raw digest wrapped in a buffer so sqlite3 stores it as a BLOB.
        return buffer(hash_.digest())
    def AddFileToDatabase ( self, hash_, tags ):
        self.cursor.execute("SELECT * FROM hashes WHERE hash=:hash LIMIT 1;", {"hash": hash_} )
        data = self.cursor.fetchone()
        if not data:
            self.cursor.execute("INSERT INTO hashes(hash) VALUES (:hash);", {"hash": hash_})
            self.cursor.execute("SELECT * FROM hashes WHERE hash=:hash LIMIT 1;", {"hash": hash_} )
            data = self.cursor.fetchone()
        hash_id = data[0]
        for tag in tags:
            self.cursor.execute("INSERT OR IGNORE INTO mappings VALUES (:hash_id, :tag_id);", {"hash_id": hash_id, "tag_id": self.GetTagNumber(tag)})
        self.database.commit()
    def GetTagNumber ( self, tag ):
        self.cursor.execute("SELECT tag_id FROM tags WHERE tag=:tag LIMIT 1;", {"tag": tag} )
        data = self.cursor.fetchone()
        if data:
            return data[0]
        else:
            # Tag not seen before: insert it, then look its id up again.
            self.cursor.execute("INSERT INTO tags(tag) VALUES (:tag);", {"tag": tag})
            return self.GetTagNumber( tag )
    def Stop ( self ):
        self.database.commit()
        self.database.close()
# How to use:
# First create the scraper object. (If it's not obvious, those are the username and password for a Fur Affinity account. The user-agent can be whatever you want.)
scraper = FurAffinityScraper("USERNAME", "PASSWORD", "USERAGENT")
# Now you can scrape away.
# Just use one of the two functions to do that:
# scraper.ScrapePage( pageId ) : scrapes a single page.
scraper.ScrapePage( 9001 )
# scraper.ScrapeRange( startPageId, endPageId, statusInterval ) : scrapes every page between start and end, including both. statusInterval determines how often a quick status message is printed to the screen.
scraper.ScrapeRange( 1, 100, 10 )
# When scraping pages, if a page is missing, the scraper will just skip it.
# When you are done scraping, call the stop function:
scraper.Stop() # Commits and closes the database connection.
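# Optional: a quick sanity check of the resulting database. This is just a
# minimal sketch using the standard sqlite3 module; the file name matches the
# one the scraper creates above.
checkDb = sqlite3.connect("furAffinity.db")
print(str(checkDb.execute("SELECT COUNT(*) FROM hashes;").fetchone()[0]) + " hashes stored")
print(str(checkDb.execute("SELECT COUNT(*) FROM tags;").fetchone()[0]) + " distinct tags stored")
checkDb.close()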
