# Fur Affinity Scraper for Hydrus Network.
#       Tested in Python 2.7.8
#       This script provides methods to retrieve image hashes from Furaffinity.
#       It will compile all the hashes it finds into a database suitable for import into the Hydrus Network program (http://8ch.net/hydrus/).

# Requires the following nonstandard libraries:
#       requests (http://docs.python-requests.org)
#       beautifulsoup4 (http://www.crummy.com/software/BeautifulSoup/)
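#
# Assuming a standard pip setup for your Python 2.7 interpreter, both
# libraries can be installed with:
#       pip install requests beautifulsoup4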

import sqlite3
import requests
import re
import bs4
import hashlib
import time

class FurAffinityScraper:

        def __init__( self, userName, password, userAgent ):
                print("Starting scraper ... ")

                # Set up database
                self.database = sqlite3.connect("furAffinity.db")
                self.cursor = self.database.cursor()

                # Check to see if the database has been initialized.
                self.cursor.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name='hash_type';" )
                response = self.cursor.fetchone()
                if response:
                        print("Database already initialized")
                else:
                        print("Initializing database")
                        self.InitDatabase()

                # Set up session
                self.session = requests.Session()
                print("Logging into furaffinity...")

                # Raising a plain string is a TypeError in Python 2.7, so raise real exceptions.
                if not userName:
                        raise ValueError("Username is missing!")
                elif not password:
                        raise ValueError("Password is missing!")

                if userAgent:
                        self.session.headers["User-Agent"] = userAgent
                else:
                        print("No custom user-agent provided. You may encounter problems because of this.")

                # Log in; a successful login redirects to the front page.
                response = self.session.post( "https://www.furaffinity.net/login/", data={"action":"login","retard_protection":"1","name":userName,"pass":password,"login":"Login to FurAffinity"} )
                if response.url == "http://www.furaffinity.net/":
                        print("Scraper logged in on account: "+userName)
                else:
                        raise RuntimeError("Could not log in! This is probably a problem with the username or password.")

        def InitDatabase ( self ):
                # hash_type records which hash algorithm the archive uses;
                # this script stores SHA-256 digests.
                self.cursor.execute("CREATE TABLE hash_type ( hash_type INTEGER );")
                self.cursor.execute("INSERT INTO hash_type VALUES (2)")

                self.cursor.execute("CREATE TABLE hashes ( hash_id INTEGER PRIMARY KEY, hash BLOB_BYTES );")

                self.cursor.execute("CREATE TABLE mappings ( hash_id INTEGER, tag_id INTEGER, PRIMARY KEY( hash_id, tag_id ) );")

                self.cursor.execute("CREATE TABLE namespaces ( namespace TEXT );")
                self.cursor.execute("INSERT INTO namespaces VALUES ('')")
                self.cursor.execute("INSERT INTO namespaces VALUES ('creator')")
                self.cursor.execute("INSERT INTO namespaces VALUES ('title')")
                self.cursor.execute("INSERT INTO namespaces VALUES ('source')")
                self.cursor.execute("INSERT INTO namespaces VALUES ('age-rating')")

                self.cursor.execute("CREATE TABLE tags ( tag_id INTEGER PRIMARY KEY, tag TEXT );")

        def ScrapeRange( self, startAt, stopAt, printUpdateEvery=10 ):
                # Set lastBatch. time.time() gives wall-clock time; time.clock() would
                # report CPU time on Unix, which is useless for a network-bound scraper.
                beganMain = time.time()
                self.lastBatch = {"timeStarted":time.time(),"available":0}

                current = startAt

                print("Beginning to scrape "+str(current)+"-"+str(stopAt)+", showing status every "+str(printUpdateEvery)+" images.")

                while current <= stopAt:
                        gotPage = self.ScrapePage( current, True )
                        if gotPage:
                                self.lastBatch["available"] += 1
                        if (current-startAt+1)%printUpdateEvery==0:
                                deltaTime = time.time()-self.lastBatch["timeStarted"]
                                print("Batch "+str(current-printUpdateEvery+1)+"-"+str(current)+": "+str(self.lastBatch["available"])+" images available, took "+str(deltaTime)+" seconds. Average: "+"{:.2f}".format(deltaTime/max(1,self.lastBatch["available"]))+" seconds per image.")
                                self.lastBatch["timeStarted"] = time.time()
                                self.lastBatch["available"] = 0
                        current += 1

                print("Ending scrape of furaffinity ids: "+str(startAt)+"-"+str(stopAt)+". Took "+str(time.time()-beganMain)+" seconds!")

        # Returns True if the image was scraped, False if not.
        def ScrapePage( self, pageId, print_=True ):
                # A cheap HEAD request is enough to check whether the page exists.
                response = self.session.head( "https://www.furaffinity.net/view/"+str(pageId)+"/" )

                # Check to make sure the page exists
                if "vary" in response.headers:
                        if print_:
                                print("Image "+str(pageId)+" found, beginning scraping ...")
                        response = self.session.get( "https://www.furaffinity.net/view/"+str(pageId)+"/" )
                else:
                        if print_:
                                print("Image "+str(pageId)+" not found, skipping")
                        return False

                # Make a soup to easily navigate the page
                soup = bs4.BeautifulSoup(response.text, "html.parser")

                # Get image title and uploader
                nameAndUploader = soup.find("td", align="left", class_="cat", valign="top", width="70%")
                title = str(nameAndUploader.b.string)
                uploader = str(nameAndUploader.a.string)

                # Get tags / keywords
                tags = []

                keywords = soup.find("div",id="keywords")
                if keywords:
                        for child in keywords:
                                # Skip whitespace-only nodes and strip the trailing comma from each keyword.
                                if child and child.string and len(child.string) > 0 and child.string[-1] != "\n":
                                        tagShort = str(child.string)
                                        if tagShort[-1] == ",":
                                                tagShort = tagShort[:-1]
                                        tags.append( tagShort )

                tags.append( "title:" + title )
                tags.append( "creator:" + uploader )

                # Get source-link (Added to tags)
                #tags.append( "source:" + str(response.url) )

                # Get age rating (the first word of the rating icon's alt text, e.g. "General")
                ageRating = re.search( "[^ ]*", soup.find("div",align="center").img.get("alt") ).group(0)
                tags.append( "age-rating:" + str(ageRating).lower() )

                # Get image hash
                imagePath = "https:" + soup.find("a",text=re.compile("Download")).get("href")
                hash_ = self.GetImageHash( imagePath )

                # Add to database
                self.AddFileToDatabase( hash_, tags )
                return True

        def GetImageHash ( self, imagePath ):
                # Stream the image and hash it in chunks so the whole file is never held in memory.
                r = requests.get(imagePath,stream=True)
                hash_ = hashlib.sha256()
                for chunk in r.iter_content(chunk_size=1024):
                        if chunk: # filter out keep-alive new chunks
                                hash_.update(chunk)

                # buffer() makes sqlite3 store the raw digest as a BLOB (Python 2 only).
                return buffer(hash_.digest())

        def AddFileToDatabase ( self, hash_, tags ):
                # Insert the hash if it is not already present, then look up its row id.
                self.cursor.execute("SELECT * FROM hashes WHERE hash=:hash LIMIT 1;", {"hash": hash_} )
                data = self.cursor.fetchone()
                if not data:
                        self.cursor.execute("INSERT INTO hashes(hash) VALUES (:hash);",{"hash":hash_})
                        self.cursor.execute("SELECT * FROM hashes WHERE hash=:hash LIMIT 1;", {"hash": hash_} )
                        data = self.cursor.fetchone()

                hash_id = data[0]

                for tag in tags:
                        self.cursor.execute("INSERT OR IGNORE INTO mappings VALUES (:hash, :tag);",{"hash":hash_id,"tag":self.GetTagNumber(tag)})

                self.database.commit()

        def GetTagNumber ( self, tag ):
                # Return the tag's id, inserting the tag first if it is not in the table yet.
                self.cursor.execute("SELECT tag_id FROM tags WHERE tag=:tag LIMIT 1;", {"tag": tag} )
                data = self.cursor.fetchone()
                if data:
                        return data[0]
                else:
                        self.cursor.execute("INSERT INTO tags(tag) VALUES (:tag);",{"tag":tag})
                        return self.GetTagNumber( tag )

        def Stop ( self ):
                # Flush any pending inserts and close the database connection.
                self.database.commit()
                self.database.close()

# How to use:
# First create the scraper object. (If it's not obvious: those are the username and password for a Furaffinity account. The user-agent can be whatever you want.)
scraper = FurAffinityScraper("USERNAME","PASSWORD","USERAGENT")

# Now you can scrape away.
# Just use one of the two functions to do that:
#       scraper.ScrapePage( pageID ) : (Scrapes a single page)
scraper.ScrapePage( 9001 )

#       scraper.ScrapeRange( startPageID, endPageID, statusInterval ) : (Scrapes every page between start and end, including both. statusInterval determines how often a quick status message is printed to the screen.)
scraper.ScrapeRange( 1, 100, 10 )

# When scraping pages, if a page is missing, the scraper will just skip it.

# When you are done scraping stuff, call the stop function:
scraper.Stop()          # Closes the database connection.
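
# A safer pattern (a sketch, not part of the original script): wrapping the
# scrape in try/finally guarantees Stop() still runs, so the database is
# committed and closed even if a page raises an error midway through a range.
#
#       scraper = FurAffinityScraper("USERNAME","PASSWORD","USERAGENT")
#       try:
#               scraper.ScrapeRange( 1, 100, 10 )
#       finally:
#               scraper.Stop()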