Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- import liteConnectPlus as lite
- import sqlite3
- from datetime import datetime
- import time
- import math
- from bs4 import BeautifulSoup, NavigableString, Tag
- from multiprocessing import Pool
- import random
- import re
- from misc import TimedSection
- import html as htmlLib
# Root folder holding all sqlite databases used by the scraper
dbPath = "D:/MLProjects/Databases/"
# Site root; prepended to relative hrefs (e.g. tag links) scraped from pages
baseUrl = "https://archiveofourown.org"
# Maps AO3 rating strings to the integer rating code stored in StoryHeaders.rating
ratingMap = {
    "Not Rated": -1,
    "No rating": -1,
    "General Audiences": 0,
    "Teen And Up Audiences": 1,
    "Teen & Up Audiences": 1,
    "Mature": 2,
    "Explicit": 3,
}
# Maps AO3 completion-status strings to StoryHeaders.finished
finishedMap = {
    # -1
    "Work in Progress": 0,
    "Complete Work": 1,
}
class NameTableManager:
    """In-memory cache over a simple (id, name) lookup table.

    Pre-loads the full name -> id mapping at construction and inserts
    new names on demand, so repeated lookups never hit the database.
    """

    def __init__(self, tableName, con):
        self.tableName = tableName
        self.cur = con.cursor()
        # name -> id cache, pre-filled from the existing table contents
        self.data = {name: rowId for rowId, name in
                     con.execute("SELECT id, name FROM %s" % tableName)}

    def getIdOrCreate(self, name, **args):
        """Return the id for *name*, inserting a new row (with any extra
        columns given via **args) the first time the name is seen."""
        try:
            return self.data[name]
        except KeyError:
            args["name"] = name
            self.cur.insertDict(self.tableName, args)
            newId = self.cur.lastrowid
            self.data[name] = newId
            return newId
def getTimeStr():
    """Return the current local time as an SQLite-friendly 'YYYY-MM-DD HH:MM:SS' string."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def elemStr(elem):
    """Flatten a soup element to plain text: join every nested string
    fragment and strip surrounding whitespace."""
    pieces = list(elem.strings)
    return "".join(pieces).strip()
def scrapeHeaderDataFromWork(orgCon,soup,id,scrapeFromId,tagManager,authorManager,langManager,debug=False,profile=False):
    """Parse the header/metadata of one AO3 work page and insert it into StoryHeaders.

    orgCon -- connection to the per-batch organized database
    soup -- parsed soup of the page's main content div
    id -- the AO3 work id used as the story id
    scrapeFromId -- History row id the html was fetched from
    tagManager/authorManager/langManager -- NameTableManager caches for the lookup tables
    debug -- print parsed values as they are extracted
    profile -- enable the timed profiling sections

    Returns curChapters, the number of chapters currently published.
    Raises Exception on html that doesn't match the expected structure.
    """
    prefaceDiv = soup.find("div",{"id":"workskin"}).find("div",{"class":"preface group"})
    # Title
    titleStr = elemStr(prefaceDiv.find("h2"))
    if debug:
        print("Title: %s" % titleStr)
    # Authors: byline links look like /users/<name>/... so index 2 is the username
    for link in prefaceDiv.find("h3").findAll("a"):
        author = link["href"].split("/")[2]
        if debug:
            print("Author: %s" % author)
        authorId = authorManager.getIdOrCreate(author)
        orgCon.execute("INSERT INTO AuthorLinks(storyId,authorId) VALUES(?,?)", (id,authorId))
    # Description (work summary); empty string when the work has none
    descriptionElem = prefaceDiv.find("div",{"class":"summary module"})
    if descriptionElem is not None:
        descriptionStr = elemStr(descriptionElem.find("blockquote"))
    else:
        descriptionStr = ""
    # Author dedications ("For <user>" / gift associations)
    notesElem = prefaceDiv.find("div",{"class":"notes module"})
    if notesElem is not None:
        assocElem = notesElem.find("ul",{"class":"associations"})
        if assocElem is not None:
            for link in assocElem.findAll("a"):
                if link["href"].startswith("/users/"):
                    author = link["href"].split("/")[2]
                elif link["href"].startswith("/gifts?recipient="):
                    author = link["href"].split("=")[-1]
                elif link["href"].startswith("/works/") or link["href"].startswith("/external_works/") or link["href"].startswith("/collections/"):
                    # Is inspired by a work
                    # Even if there are dedications as well, I'm just going to ignore that
                    break
                elif link["href"] == "/gifts":
                    # Sometimes it's just blank. Not sure whats up with that
                    continue
                else:
                    raise Exception("Invalid dedication href: %s" % link["href"])
                if debug:
                    print("Dedicated to: %s" % author)
                authorId = authorManager.getIdOrCreate(author)
                orgCon.execute("INSERT INTO AuthorDedicationLinks(storyId,authorId) VALUES(?,?)", (id,authorId))
    metaContentElem = soup.find("div",{"class":"wrapper"})
    # Get Rating; a work may list several rating tags, keep the most restrictive one
    ratingElem = metaContentElem.find("dd",{"class":"rating tags"})
    if ratingElem is None:
        ratingId = -1
    else:
        ratingStrs = [x.string.strip() for x in ratingElem.findAll("a")]
        ratingId = -1
        for ratingStr in ratingStrs:
            if ratingStr not in ratingMap:
                raise Exception("Invalid HTML, unexpected rating-string >%s<" % ratingStr)
            ratingId = max(ratingId,ratingMap[ratingStr])
    # Get tags from different categories; CSS class on the <dd> tells us the tag type
    tagTypeMap = {
        "warning tags": "warning",
        "category tags": "category",
        "relationship tags": "relationship",
        "fandom tags": "fandom",
        "character tags": "character",
        "freeform tags": "generic",
    }
    for tagClass in tagTypeMap:
        tagType = tagTypeMap[tagClass]
        tagElem = metaContentElem.find("dd",{"class":tagClass})
        if tagElem is not None:
            for link in tagElem.findAll("a"):
                tagUrl = baseUrl + link["href"]
                tagName = link.string.strip()
                tagId = tagManager.getIdOrCreate(tagName, tagUrl=tagUrl, type=tagType)
                orgCon.execute("INSERT INTO TagLinks(storyId,tagId) VALUES(?,?)", (id,tagId))
    # Get language
    langElem = metaContentElem.find("dd",{"class":"language"})
    if langElem is None:
        # Guess English
        langStr = "English"
    else:
        langStr = langElem.string.strip()
    langId = langManager.getIdOrCreate(langStr)
    # Get series membership ("Part X of <series>")
    seriesElem = metaContentElem.find("dd",{"class":"series"})
    if seriesElem is not None:
        spanElem = seriesElem.find("span",{"class":"position"})
        # In rare cases the "Series" property is there but blank
        if spanElem is not None:
            # "Part X of BLAH series"
            seriesStr = elemStr(spanElem).strip()
            seriesPart = int(seriesStr.split(" ")[1])
            seriesLink = spanElem.find("a")
            seriesName = seriesLink.string.strip()
            # Series keeps the site-assigned id from the series url
            seriesId = int(seriesLink["href"].split("/")[-1])
            if len([x for x in orgCon.execute("SELECT * FROM Series WHERE id=?",(seriesId,))]) == 0:
                # Create series entry
                orgCon.execute("INSERT INTO Series(id,name) VALUES(?,?)", (seriesId,seriesName))
            # Add series link
            orgCon.execute("INSERT INTO SeriesLinks(seriesId,part,storyId) VALUES(?,?,?)", (seriesId,seriesPart,id))
    # Get stats
    statsElem = metaContentElem.find("dl",{"class":"stats"})
    publishDate = statsElem.find("dd",{"class":"published"}).string.strip()
    # From what I can tell, the status field is only present for WIP works
    # otherwise there is no indication that a work is finished or not
    statusElem = statsElem.find("dd",{"class":"status"})
    if statusElem is None:
        finishedId = 1
    else:
        finishedId = 0
    def getStatsElem(className,defValue):
        # Read one stats <dd> as a string, falling back to defValue when missing/empty
        elem = statsElem.find("dd",{"class":className})
        if elem is None or elem.string is None:
            return defValue
        else:
            return elem.string.strip()
    wordCount = int(getStatsElem("words","-1"))
    kudosCount = int(getStatsElem("kudos","0"))
    hitCount = int(getStatsElem("hits","0"))
    bookmarkCount = int(getStatsElem("bookmarks","0"))
    commentCount = int(getStatsElem("comments","0"))
    # Chapter counts come as "cur/max" where max is "?" for open-ended WIPs (-1 here)
    chaptersStr = getStatsElem("chapters","1/?")
    curChapters, maxChapters = chaptersStr.split("/")
    curChapters = int(curChapters)
    if maxChapters == "?":
        maxChapters = -1
    else:
        maxChapters = int(maxChapters)
    if debug:
        print("Stats:")
        print("\tLanguage: %d" % langId)
        print("\tWord Count: %d" % wordCount)
        print("\tChapters: %d/%d" % (curChapters,maxChapters))
        print("\tKudos: %d" % kudosCount)
        print("\tHits: %d" % hitCount)
        print("\tComments: %d" % commentCount)
    # =============
    # Making entry
    # =============
    processDate = getTimeStr()
    dataDict = {
        "id": id,
        "headerScrapedFrom": scrapeFromId,
        "processDate": processDate,
        "rating": ratingId,
        "finished": finishedId,
        "title": titleStr,
        "description": descriptionStr,
        "curChapters": curChapters,
        "maxChapters": maxChapters,
        "language": langId,
        "words": wordCount,
        "hits": hitCount,
        "kudos": kudosCount,
        "comments": commentCount,
        "bookmarks": bookmarkCount,
        "date": publishDate,
        "restricted": 0,
        "hidden": 0,
    }
    with TimedSection("Story-Insert",active=profile):
        orgCon.insertDict("StoryHeaders",dataDict)
    return curChapters
class HtmlTool(object):
    """
    Algorithms to process HTML.
    """
    # Regular expressions recognizing the pieces of HTML we strip or rewrite.
    # Inline <script>/<style> elements, including their contents
    script_sheet = re.compile(r"<(script|style).*?>.*?(</\1>)",
                              re.IGNORECASE | re.DOTALL)
    # HTML comments - can contain ">"
    comment = re.compile(r"<!--(.*?)-->", re.DOTALL)
    # Any HTML tag: <any-text>
    tag = re.compile(r"<.*?>", re.DOTALL)
    # Runs of consecutive whitespace characters
    nwhites = re.compile(r"[\s]+")
    # <p>, <div>, <br> tags and their closing counterparts
    p_div = re.compile(r"</?(p|div|br).*?>",
                       re.IGNORECASE | re.DOTALL)
    # Runs of whitespace that contain no newline
    nspace = re.compile("[^\S\n]+", re.UNICODE)
    # Two or more consecutive newlines
    n2ret = re.compile("\n\n+")
    # A newline immediately followed by a space
    retspace = re.compile("(\n )")

    @staticmethod
    def to_nice_text(html):
        """Remove all HTML tags, but produce a nicely formatted text."""
        if html is None:
            return ""
        # Phase 1: drop scripts/styles/comments, collapse whitespace,
        # turn block-level tags into newlines, then strip every other tag.
        text = html
        for pattern, repl in ((HtmlTool.script_sheet, ""),
                              (HtmlTool.comment, ""),
                              (HtmlTool.nwhites, " "),
                              (HtmlTool.p_div, "\n"),
                              (HtmlTool.tag, "")):
            text = pattern.sub(repl, text)
        # Decode entities like &amp; once all markup is gone
        text = htmlLib.unescape(text)
        # Phase 2: normalize the whitespace remaining around the newlines
        for pattern, repl in ((HtmlTool.nspace, " "),
                              (HtmlTool.retspace, "\n"),
                              (HtmlTool.n2ret, "\n\n")):
            text = pattern.sub(repl, text)
        return text.strip()
def getCleanText(elem):
    """Return the chapter body of *elem* as plain text, dropping the
    'Chapter N' landmark heading AO3 injects into each chapter.

    Note: removes the heading from *elem* in place via extract().
    """
    landmark = elem.find("h3", {"class": "landmark heading"})
    if landmark is not None:
        landmark.extract()
    return HtmlTool.to_nice_text(str(elem))
def scrapeContentFromWork(orgCon, soup, id, scrapeFromId):
    """Store the raw html and cleaned text of every chapter of a work,
    then mark the story header as content-scraped."""
    chaptersElem = soup.find("div", {"id": "chapters"})
    bodies = []
    num = 1
    # Multi-chapter works expose one wrapper div per chapter: chapter-1, chapter-2, ...
    chapterDiv = chaptersElem.find("div", {"id": "chapter-%d" % num})
    while chapterDiv is not None:
        bodies.append(chapterDiv.find("div", {"class": "userstuff module"}))
        num += 1
        chapterDiv = chaptersElem.find("div", {"id": "chapter-%d" % num})
    if num == 1:
        # Single-chapter works have no chapter-N wrapper, just one userstuff div
        bodies.append(chaptersElem.find("div", {"class": "userstuff"}))
    for idx, body in enumerate(bodies):
        # getCleanText extracts the landmark heading from body as a side effect,
        # so the clean text must be computed BEFORE the raw html snapshot
        clean = getCleanText(body)
        raw = str(body)
        orgCon.execute("INSERT INTO Chapters(storyId,idx,textRaw,text) VALUES(?,?,?,?)", (id, idx, raw, clean))
    orgCon.execute("UPDATE StoryHeaders SET contentScrapedFrom=? WHERE id=?", (scrapeFromId, id))
# Sometimes only part of the html gets read; a truncated page won't
# end with a closing </html> tag, which is what we check for here.
def isHtmlIncomplete(html):
    """True when *html*, ignoring trailing whitespace, does not end with </html>."""
    tail = html.rstrip()
    return not tail.endswith("</html>")
def workFor(batchIdx,profile=False):
    """Process one batch: read each queued raw page from rawRequests,
    scrape its header (and chapter content) into the batch database,
    and pop it from the batch's IdQueue. Commits after every page."""
    reqCon = lite.connect(dbPath + "rawRequests.sqlite3")
    orgCon = lite.connect(getBatchPath(batchIdx))
    # Progress-report interval, in pages
    printEvery = 100
    totalCount = int(orgCon.getSingle("SELECT COUNT(id) FROM IdQueue"))
    print("(%d) %d entries queued" % (batchIdx,totalCount))
    try:
        with TimedSection("Parse-Init"):
            tagManager = NameTableManager("Tags",orgCon)
            authorManager = NameTableManager("Authors",orgCon)
            langManager = NameTableManager("Languages",orgCon)
            idQueue = [x[0] for x in orgCon.execute("SELECT id FROM IdQueue")]
        for i, historyId in enumerate(idQueue):
            url, html = [x for x in reqCon.execute("SELECT url, text FROM History WHERE id=?",(historyId,))][0]
            if isHtmlIncomplete(html):
                # Truncated download: skip (stays in IdQueue so it can be re-fetched)
                print("Incomplete html for url: %s (Id: %d)" % (url,historyId))
                continue
            # https://archiveofourown.org/works/3?view_full_work=true&view_adult=true
            storyId = int(url.split("/")[-1].split("?")[0])
            with TimedSection("Soup-Parse",active=profile):
                soup = BeautifulSoup(html, 'html.parser')
            mainDiv = soup.find("div",{"id":"main"})
            if mainDiv.find("div",{"id":"workskin"}) is None:
                # Work is hidden; record a stub header so it won't be retried
                dataDict = {}
                dataDict["id"] = storyId
                dataDict["processDate"] = getTimeStr()
                dataDict["restricted"] = 0
                dataDict["headerScrapedFrom"] = historyId
                dataDict["contentScrapedFrom"] = historyId
                dataDict["hidden"] = 1
                orgCon.insertDict("StoryHeaders",dataDict)
            else:
                try:
                    curChapters = scrapeHeaderDataFromWork(orgCon,mainDiv,storyId,historyId,tagManager,authorManager,langManager)
                    if curChapters == 0:
                        # No chapters published yet: nothing to scrape, mark content done
                        orgCon.execute("UPDATE StoryHeaders SET contentScrapedFrom=? WHERE id=?", (historyId,storyId))
                    else:
                        scrapeContentFromWork(orgCon,mainDiv,storyId,historyId)
                except Exception as err:
                    # Dump the offending page for post-mortem debugging, then crash
                    print("Error scraping: %s" % url, "(Id: %d)" % historyId)
                    with open("problem.html","w",encoding="utf-8") as fOut:
                        fOut.write(html)
                    raise err
                    # NOTE(review): the rollback/continue below is unreachable while
                    # `raise err` above is active; it only runs if the raise is
                    # commented out (see the toggled #raise err) — confirm intent
                    orgCon.rollback()
                    continue;
                    #raise err
            # Finished scraping this page
            orgCon.execute("DELETE FROM IdQueue WHERE id=?",(historyId,))
            orgCon.commit()
            if (i+1)%printEvery == 0:
                print("(%d) %d/%d %.2f%%" % (batchIdx,i+1,totalCount,100*(i+1)/totalCount))
    finally:
        orgCon.close()
        reqCon.close()
def getBatchPath(idx):
    """Relative path of the per-batch working database for batch *idx*."""
    return "Batches/orgBatch_{:d}.db".format(idx)
def splitIntoBatches(batchCount):
    """Create *batchCount* batch databases that replicate the main db's
    schema, then distribute the main IdQueue round-robin across them."""
    print("Spliting into %d batches..." % batchCount)
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    # Get SQL for replicating general database structure
    createTableSql = {}
    for tableName in [x[0] for x in orgCon.execute("SELECT name FROM sqlite_master WHERE type='table'")]:
        createTableSql[tableName] = orgCon.getTableCreateSql(tableName)
    # sqlite_sequence is managed by sqlite itself and cannot be created manually
    del createTableSql["sqlite_sequence"]
    print("Replicating database structure...")
    subCons = [lite.connect(getBatchPath(i)) for i in range(batchCount)]
    for subCon in subCons:
        # Create tables if they do not already exist
        for tableName in createTableSql:
            subCon.execute(createTableSql[tableName])
        # Clear IdQueue table in case there is data left over
        subCon.execute("DELETE FROM IdQueue")
    print("Dividing Id-Queue...")
    printEvery = 100000
    total = int(orgCon.getSingle("SELECT COUNT(id) FROM IdQueue"))
    # Round-robin assignment keeps the batches evenly sized
    for i, (id,) in enumerate(orgCon.execute("SELECT id FROM IdQueue")):
        subCons[i%batchCount].execute("INSERT INTO IdQueue(id) VALUES(?)", (id,))
        if (i+1)%printEvery == 0:
            print("%d/%d" % (i+1,total))
    print("Saving sub-databases...")
    for subCon in subCons:
        subCon.commit()
        subCon.close()
    print("Done.")
# Assumes the table has an "id" field and a unique "name" field;
# "name" is what decides whether two entities are the same.
# Does two things:
#   - copies entities missing from the target table over from the source
#   - builds remapDict such that remapDict[originalId] == targetId
def mergeNameEntities(sourceCon, targetCon, table):
    """Merge *table*'s name-entities from sourceCon into targetCon and
    return a {sourceId: targetId} remapping dictionary."""
    # Snapshot of what the target already knows, keyed by name
    existing = {name: rowId for rowId, name in
                targetCon.execute("SELECT id, name FROM %s" % table)}
    insertCur = targetCon.cursor()
    remapDict = {}
    for row in sourceCon.getDicts("SELECT * FROM %s" % table):
        sourceId = row["id"]
        targetId = existing.get(row["name"])
        if targetId is not None:
            remapDict[sourceId] = targetId
        else:
            # Not present in the target: copy it over, letting the
            # target database assign a fresh id
            del row["id"]
            insertCur.insertDict(table, row)
            remapDict[sourceId] = insertCur.lastrowid
    return remapDict
def mergeBatches(batchCount):
    """Fold every per-batch database back into organizedData.sqlite3,
    remapping lookup-table ids, then wipe the batch databases so the
    same data is never merged twice."""
    print("Merging %d batches..." % batchCount)
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    existingSeriesIds = set([x[0] for x in orgCon.execute("SELECT id FROM Series")])
    # Total stories merged across all batches (for the final report)
    count = 0
    for i in range(batchCount):
        subCon = lite.connect(getBatchPath(i))
        # Delete entries that already exist for imported data
        for (id,) in subCon.execute("SELECT id FROM StoryHeaders"):
            orgCon.execute("DELETE FROM AuthorDedicationLinks WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM AuthorLinks WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM Chapters WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM SeriesLinks WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM StoryHeaders WHERE id=?", (id,))
            orgCon.execute("DELETE FROM TagLinks WHERE storyId=?", (id,))
        # Merge named entities and get remap dictionaries (batch-local id -> main-db id)
        authorRemap = mergeNameEntities(subCon,orgCon,"Authors")
        langRemap = mergeNameEntities(subCon,orgCon,"Languages")
        tagRemap = mergeNameEntities(subCon,orgCon,"Tags")
        # Series uses a site-specific id so it doesn't need to get remapped
        for dataDict in subCon.getDicts("SELECT * FROM Series"):
            if dataDict["id"] not in existingSeriesIds:
                orgCon.insertDict("Series",dataDict)
                existingSeriesIds.add(dataDict["id"])
        # Enter author links, remapping the author ids
        for tableName in ["AuthorDedicationLinks","AuthorLinks"]:
            for storyId, authorId in subCon.execute("SELECT storyId, authorId FROM %s" % tableName):
                authorId = authorRemap[authorId]
                orgCon.execute("INSERT INTO %s(storyId, authorId) VALUES(?,?)" % tableName, (storyId,authorId))
        # Enter seriesLinks, and Chapters data
        # For these the data can just be copied over directly
        for tableName in ["SeriesLinks","Chapters"]:
            for dataDict in subCon.getDicts("SELECT * FROM %s" % tableName):
                orgCon.insertDict(tableName,dataDict)
        # Enter tag links, remapping the tag ids
        for storyId, tagId in subCon.execute("SELECT storyId, tagId FROM TagLinks"):
            tagId = tagRemap[tagId]
            orgCon.execute("INSERT INTO TagLinks(storyId, tagId) VALUES(?,?)", (storyId,tagId))
        # Enter header data and remove IdQueue items
        for dataDict in subCon.getDicts("SELECT * FROM StoryHeaders"):
            if dataDict["language"] is not None:
                dataDict["language"] = langRemap[dataDict["language"]]
            orgCon.insertDict("StoryHeaders",dataDict)
            orgCon.execute("DELETE FROM IdQueue WHERE id=?", (dataDict["contentScrapedFrom"],))
            count += 1
        # Delete imported data so it doesn't get re-imported in the future.
        # NOTE(review): the table list is read from the MAIN db but wiped in the
        # batch db — this assumes both schemas are identical; confirm.
        wipeList = [x[0] for x in orgCon.execute("SELECT name FROM sqlite_master WHERE type='table'")]
        wipeList.remove("IdQueue")
        for wipeTable in wipeList:
            subCon.execute("DELETE FROM %s" % wipeTable)
        orgCon.commit()
        subCon.commit()
        subCon.close()
        print("%d/%d" % (i+1,batchCount))
    orgCon.close()
    print("Done.\n%d stories merged in" % count)
def multiWork(batchCount):
    """Process every batch in parallel, one worker process per batch."""
    with Pool(batchCount) as p:
        p.map(workFor, list(range(batchCount)))
def fixAuthorLang(batchCount):
    """One-off repair: language ids were mistakenly created as rows in the
    Authors table; move each such name into Languages and repoint
    StoryHeaders.language at the new id, for every batch database."""
    print("Fixing languages")
    for i in range(batchCount):
        orgCon = lite.connect(getBatchPath(i))
        # Distinct language ids currently referenced by StoryHeaders
        langIds = [x[0] for x in orgCon.execute("SELECT language FROM StoryHeaders GROUP BY language")]
        insertCur = orgCon.cursor()
        for langId in langIds:
            # The "language" id actually points into Authors; fetch the name from there
            langName = [x[0] for x in orgCon.execute("SELECT name FROM Authors WHERE id=?",(langId,))][0]
            orgCon.execute("DELETE FROM Authors WHERE id=?",(langId,))
            insertCur.execute("INSERT INTO Languages(name) VALUES(?)",(langName,))
            newId = insertCur.lastrowid
            orgCon.execute("UPDATE StoryHeaders SET language=? WHERE language=?",(newId,langId))
        orgCon.commit()
        print("Fixed batch %d" % i)
        orgCon.close()
    print("Done")
def reQueue():
    """Queue a fresh download ("requery") for every page still listed in
    the organized db's IdQueue, using the url of the original request."""
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    reqCon = lite.connect(dbPath + "rawRequests.sqlite3")
    for (id,) in orgCon.execute("SELECT id FROM IdQueue"):
        # Look up the url of the old request and re-enqueue it for download
        url = [x[0] for x in reqCon.execute("SELECT url FROM History WHERE id=?", (id,))][0]
        dataDict = {}
        dataDict["url"] = url
        dataDict["date"] = getTimeStr()
        dataDict["category"] = "requery"
        reqCon.insertDict("Queued", dataDict)
    reqCon.commit()
    reqCon.close()
    orgCon.close()
def adjustIdQueue():
    """Repoint IdQueue entries at the History ids of the re-downloaded
    ("requery") pages, matching old and new requests by url."""
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    reqCon = lite.connect(dbPath + "rawRequests.sqlite3")
    # url -> History id of the requery download (later rows win)
    newIdMap = {}
    for id, url in reqCon.execute("SELECT id, url FROM History WHERE category=\"requery\""):
        newIdMap[url] = id
    for (oldId,) in orgCon.execute("SELECT id FROM IdQueue"):
        url = reqCon.getSingle("SELECT url FROM History WHERE id=?",(oldId,))
        newId = newIdMap[url]
        orgCon.execute("UPDATE IdQueue SET id=? WHERE id=?", (newId,oldId))
    orgCon.commit()
if __name__ == "__main__":
    batchCount = 20
    # Split the pending queue into batch databases, process each batch
    # sequentially, then merge the results back into the main database.
    splitIntoBatches(batchCount)
    for i in range(batchCount):
        print("Processing batch %d" % i)
        workFor(i)
    #multiWork(batchCount)
    mergeBatches(batchCount)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement