Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- import liteConnectPlus as lite
- import sqlite3
- from datetime import datetime
- import time
- import math
- from bs4 import BeautifulSoup, NavigableString, Tag
- from multiprocessing import Pool
- import random
- import re
- from misc import TimedSection
- import html as htmlLib
# Root folder holding all sqlite databases used by the scraper
dbPath = "D:/MLProjects/Databases/"
# Site root; prepended to relative hrefs (e.g. tag links) scraped from pages
baseUrl = "https://archiveofourown.org"
# Maps AO3 rating strings to the integer rating code stored in StoryHeaders.rating
ratingMap = {
    "Not Rated": -1,
    "No rating": -1,
    "General Audiences": 0,
    "Teen And Up Audiences": 1,
    "Teen & Up Audiences": 1,
    "Mature": 2,
    "Explicit": 3,
}
# Maps AO3 completion-status strings to StoryHeaders.finished
finishedMap = {
    # -1
    "Work in Progress": 0,
    "Complete Work": 1,
}
class NameTableManager:
    """In-memory cache over a simple (id, name) lookup table.

    Pre-loads the full name -> id mapping at construction and inserts
    new names on demand, so repeated lookups never hit the database.
    """

    def __init__(self, tableName, con):
        self.tableName = tableName
        self.cur = con.cursor()
        # name -> id cache, pre-filled from the existing table contents
        self.data = {name: rowId for rowId, name in
                     con.execute("SELECT id, name FROM %s" % tableName)}

    def getIdOrCreate(self, name, **args):
        """Return the id for *name*, inserting a new row (with any extra
        columns given via **args) the first time the name is seen."""
        try:
            return self.data[name]
        except KeyError:
            args["name"] = name
            self.cur.insertDict(self.tableName, args)
            newId = self.cur.lastrowid
            self.data[name] = newId
            return newId
def getTimeStr():
    """Return the current local time as an SQLite-friendly 'YYYY-MM-DD HH:MM:SS' string."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def elemStr(elem):
    """Flatten a soup element to plain text: join every nested string
    fragment and strip surrounding whitespace."""
    pieces = list(elem.strings)
    return "".join(pieces).strip()
def scrapeHeaderDataFromWork(orgCon,soup,id,scrapeFromId,tagManager,authorManager,langManager,debug=False,profile=False):
    """Parse the header/metadata of one AO3 work page and insert it into StoryHeaders.

    orgCon -- connection to the per-batch organized database
    soup -- parsed soup of the page's main content div
    id -- the AO3 work id used as the story id
    scrapeFromId -- History row id the html was fetched from
    tagManager/authorManager/langManager -- NameTableManager caches for the lookup tables
    debug -- print parsed values as they are extracted
    profile -- enable the timed profiling sections

    Returns curChapters, the number of chapters currently published.
    Raises Exception on html that doesn't match the expected structure.
    """
    prefaceDiv = soup.find("div",{"id":"workskin"}).find("div",{"class":"preface group"})
    # Title
    titleStr = elemStr(prefaceDiv.find("h2"))
    if debug:
        print("Title: %s" % titleStr)
    # Authors: byline links look like /users/<name>/... so index 2 is the username
    for link in prefaceDiv.find("h3").findAll("a"):
        author = link["href"].split("/")[2]
        if debug:
            print("Author: %s" % author)
        authorId = authorManager.getIdOrCreate(author)
        orgCon.execute("INSERT INTO AuthorLinks(storyId,authorId) VALUES(?,?)", (id,authorId))
    # Description (work summary); empty string when the work has none
    descriptionElem = prefaceDiv.find("div",{"class":"summary module"})
    if descriptionElem is not None:
        descriptionStr = elemStr(descriptionElem.find("blockquote"))
    else:
        descriptionStr = ""
    # Author dedications ("For <user>" / gift associations)
    notesElem = prefaceDiv.find("div",{"class":"notes module"})
    if notesElem is not None:
        assocElem = notesElem.find("ul",{"class":"associations"})
        if assocElem is not None:
            for link in assocElem.findAll("a"):
                if link["href"].startswith("/users/"):
                    author = link["href"].split("/")[2]
                elif link["href"].startswith("/gifts?recipient="):
                    author = link["href"].split("=")[-1]
                elif link["href"].startswith("/works/") or link["href"].startswith("/external_works/") or link["href"].startswith("/collections/"):
                    # Is inspired by a work
                    # Even if there are dedications as well, I'm just going to ignore that
                    break
                elif link["href"] == "/gifts":
                    # Sometimes it's just blank. Not sure whats up with that
                    continue
                else:
                    raise Exception("Invalid dedication href: %s" % link["href"])
                if debug:
                    print("Dedicated to: %s" % author)
                authorId = authorManager.getIdOrCreate(author)
                orgCon.execute("INSERT INTO AuthorDedicationLinks(storyId,authorId) VALUES(?,?)", (id,authorId))
    metaContentElem = soup.find("div",{"class":"wrapper"})
    # Get Rating; a work may list several rating tags, keep the most restrictive one
    ratingElem = metaContentElem.find("dd",{"class":"rating tags"})
    if ratingElem is None:
        ratingId = -1
    else:
        ratingStrs = [x.string.strip() for x in ratingElem.findAll("a")]
        ratingId = -1
        for ratingStr in ratingStrs:
            if ratingStr not in ratingMap:
                raise Exception("Invalid HTML, unexpected rating-string >%s<" % ratingStr)
            ratingId = max(ratingId,ratingMap[ratingStr])
    # Get tags from different categories; CSS class on the <dd> tells us the tag type
    tagTypeMap = {
        "warning tags": "warning",
        "category tags": "category",
        "relationship tags": "relationship",
        "fandom tags": "fandom",
        "character tags": "character",
        "freeform tags": "generic",
    }
    for tagClass in tagTypeMap:
        tagType = tagTypeMap[tagClass]
        tagElem = metaContentElem.find("dd",{"class":tagClass})
        if tagElem is not None:
            for link in tagElem.findAll("a"):
                tagUrl = baseUrl + link["href"]
                tagName = link.string.strip()
                tagId = tagManager.getIdOrCreate(tagName, tagUrl=tagUrl, type=tagType)
                orgCon.execute("INSERT INTO TagLinks(storyId,tagId) VALUES(?,?)", (id,tagId))
    # Get language
    langElem = metaContentElem.find("dd",{"class":"language"})
    if langElem is None:
        # Guess English
        langStr = "English"
    else:
        langStr = langElem.string.strip()
    langId = langManager.getIdOrCreate(langStr)
    # Get series membership ("Part X of <series>")
    seriesElem = metaContentElem.find("dd",{"class":"series"})
    if seriesElem is not None:
        spanElem = seriesElem.find("span",{"class":"position"})
        # In rare cases the "Series" property is there but blank
        if spanElem is not None:
            # "Part X of BLAH series"
            seriesStr = elemStr(spanElem).strip()
            seriesPart = int(seriesStr.split(" ")[1])
            seriesLink = spanElem.find("a")
            seriesName = seriesLink.string.strip()
            # Series keeps the site-assigned id from the series url
            seriesId = int(seriesLink["href"].split("/")[-1])
            if len([x for x in orgCon.execute("SELECT * FROM Series WHERE id=?",(seriesId,))]) == 0:
                # Create series entry
                orgCon.execute("INSERT INTO Series(id,name) VALUES(?,?)", (seriesId,seriesName))
            # Add series link
            orgCon.execute("INSERT INTO SeriesLinks(seriesId,part,storyId) VALUES(?,?,?)", (seriesId,seriesPart,id))
    # Get stats
    statsElem = metaContentElem.find("dl",{"class":"stats"})
    publishDate = statsElem.find("dd",{"class":"published"}).string.strip()
    # From what I can tell, the status field is only present for WIP works
    # otherwise there is no indication that a work is finished or not
    statusElem = statsElem.find("dd",{"class":"status"})
    if statusElem is None:
        finishedId = 1
    else:
        finishedId = 0
    def getStatsElem(className,defValue):
        # Read one stats <dd> as a string, falling back to defValue when missing/empty
        elem = statsElem.find("dd",{"class":className})
        if elem is None or elem.string is None:
            return defValue
        else:
            return elem.string.strip()
    wordCount = int(getStatsElem("words","-1"))
    kudosCount = int(getStatsElem("kudos","0"))
    hitCount = int(getStatsElem("hits","0"))
    bookmarkCount = int(getStatsElem("bookmarks","0"))
    commentCount = int(getStatsElem("comments","0"))
    # Chapter counts come as "cur/max" where max is "?" for open-ended WIPs (-1 here)
    chaptersStr = getStatsElem("chapters","1/?")
    curChapters, maxChapters = chaptersStr.split("/")
    curChapters = int(curChapters)
    if maxChapters == "?":
        maxChapters = -1
    else:
        maxChapters = int(maxChapters)
    if debug:
        print("Stats:")
        print("\tLanguage: %d" % langId)
        print("\tWord Count: %d" % wordCount)
        print("\tChapters: %d/%d" % (curChapters,maxChapters))
        print("\tKudos: %d" % kudosCount)
        print("\tHits: %d" % hitCount)
        print("\tComments: %d" % commentCount)
    # =============
    # Making entry
    # =============
    processDate = getTimeStr()
    dataDict = {
        "id": id,
        "headerScrapedFrom": scrapeFromId,
        "processDate": processDate,
        "rating": ratingId,
        "finished": finishedId,
        "title": titleStr,
        "description": descriptionStr,
        "curChapters": curChapters,
        "maxChapters": maxChapters,
        "language": langId,
        "words": wordCount,
        "hits": hitCount,
        "kudos": kudosCount,
        "comments": commentCount,
        "bookmarks": bookmarkCount,
        "date": publishDate,
        "restricted": 0,
        "hidden": 0,
    }
    with TimedSection("Story-Insert",active=profile):
        orgCon.insertDict("StoryHeaders",dataDict)
    return curChapters
class HtmlTool(object):
    """
    Algorithms to process HTML.
    """
    # Regular expressions recognizing the pieces of HTML we strip or rewrite.
    # Inline <script>/<style> elements, including their contents
    script_sheet = re.compile(r"<(script|style).*?>.*?(</\1>)",
                              re.IGNORECASE | re.DOTALL)
    # HTML comments - can contain ">"
    comment = re.compile(r"<!--(.*?)-->", re.DOTALL)
    # Any HTML tag: <any-text>
    tag = re.compile(r"<.*?>", re.DOTALL)
    # Runs of consecutive whitespace characters
    nwhites = re.compile(r"[\s]+")
    # <p>, <div>, <br> tags and their closing counterparts
    p_div = re.compile(r"</?(p|div|br).*?>",
                       re.IGNORECASE | re.DOTALL)
    # Runs of whitespace that contain no newline
    nspace = re.compile("[^\S\n]+", re.UNICODE)
    # Two or more consecutive newlines
    n2ret = re.compile("\n\n+")
    # A newline immediately followed by a space
    retspace = re.compile("(\n )")

    @staticmethod
    def to_nice_text(html):
        """Remove all HTML tags, but produce a nicely formatted text."""
        if html is None:
            return ""
        # Phase 1: drop scripts/styles/comments, collapse whitespace,
        # turn block-level tags into newlines, then strip every other tag.
        text = html
        for pattern, repl in ((HtmlTool.script_sheet, ""),
                              (HtmlTool.comment, ""),
                              (HtmlTool.nwhites, " "),
                              (HtmlTool.p_div, "\n"),
                              (HtmlTool.tag, "")):
            text = pattern.sub(repl, text)
        # Decode entities like &amp; once all markup is gone
        text = htmlLib.unescape(text)
        # Phase 2: normalize the whitespace remaining around the newlines
        for pattern, repl in ((HtmlTool.nspace, " "),
                              (HtmlTool.retspace, "\n"),
                              (HtmlTool.n2ret, "\n\n")):
            text = pattern.sub(repl, text)
        return text.strip()
def getCleanText(elem):
    """Return the chapter body of *elem* as plain text, dropping the
    'Chapter N' landmark heading AO3 injects into each chapter.

    Note: removes the heading from *elem* in place via extract().
    """
    landmark = elem.find("h3", {"class": "landmark heading"})
    if landmark is not None:
        landmark.extract()
    return HtmlTool.to_nice_text(str(elem))
def scrapeContentFromWork(orgCon, soup, id, scrapeFromId):
    """Store the raw html and cleaned text of every chapter of a work,
    then mark the story header as content-scraped."""
    chaptersElem = soup.find("div", {"id": "chapters"})
    bodies = []
    num = 1
    # Multi-chapter works expose one wrapper div per chapter: chapter-1, chapter-2, ...
    chapterDiv = chaptersElem.find("div", {"id": "chapter-%d" % num})
    while chapterDiv is not None:
        bodies.append(chapterDiv.find("div", {"class": "userstuff module"}))
        num += 1
        chapterDiv = chaptersElem.find("div", {"id": "chapter-%d" % num})
    if num == 1:
        # Single-chapter works have no chapter-N wrapper, just one userstuff div
        bodies.append(chaptersElem.find("div", {"class": "userstuff"}))
    for idx, body in enumerate(bodies):
        # getCleanText extracts the landmark heading from body as a side effect,
        # so the clean text must be computed BEFORE the raw html snapshot
        clean = getCleanText(body)
        raw = str(body)
        orgCon.execute("INSERT INTO Chapters(storyId,idx,textRaw,text) VALUES(?,?,?,?)", (id, idx, raw, clean))
    orgCon.execute("UPDATE StoryHeaders SET contentScrapedFrom=? WHERE id=?", (scrapeFromId, id))
# Sometimes only part of the html gets read; a truncated page won't
# end with a closing </html> tag, which is what we check for here.
def isHtmlIncomplete(html):
    """True when *html*, ignoring trailing whitespace, does not end with </html>."""
    tail = html.rstrip()
    return not tail.endswith("</html>")
def workFor(batchIdx,profile=False):
    """Process one batch: read each queued raw page from rawRequests,
    scrape its header (and chapter content) into the batch database,
    and pop it from the batch's IdQueue. Commits after every page."""
    reqCon = lite.connect(dbPath + "rawRequests.sqlite3")
    orgCon = lite.connect(getBatchPath(batchIdx))
    # Progress-report interval, in pages
    printEvery = 100
    totalCount = int(orgCon.getSingle("SELECT COUNT(id) FROM IdQueue"))
    print("(%d) %d entries queued" % (batchIdx,totalCount))
    try:
        with TimedSection("Parse-Init"):
            tagManager = NameTableManager("Tags",orgCon)
            authorManager = NameTableManager("Authors",orgCon)
            langManager = NameTableManager("Languages",orgCon)
            idQueue = [x[0] for x in orgCon.execute("SELECT id FROM IdQueue")]
        for i, historyId in enumerate(idQueue):
            url, html = [x for x in reqCon.execute("SELECT url, text FROM History WHERE id=?",(historyId,))][0]
            if isHtmlIncomplete(html):
                # Truncated download: skip (stays in IdQueue so it can be re-fetched)
                print("Incomplete html for url: %s (Id: %d)" % (url,historyId))
                continue
            # https://archiveofourown.org/works/3?view_full_work=true&view_adult=true
            storyId = int(url.split("/")[-1].split("?")[0])
            with TimedSection("Soup-Parse",active=profile):
                soup = BeautifulSoup(html, 'html.parser')
            mainDiv = soup.find("div",{"id":"main"})
            if mainDiv.find("div",{"id":"workskin"}) is None:
                # Work is hidden; record a stub header so it won't be retried
                dataDict = {}
                dataDict["id"] = storyId
                dataDict["processDate"] = getTimeStr()
                dataDict["restricted"] = 0
                dataDict["headerScrapedFrom"] = historyId
                dataDict["contentScrapedFrom"] = historyId
                dataDict["hidden"] = 1
                orgCon.insertDict("StoryHeaders",dataDict)
            else:
                try:
                    curChapters = scrapeHeaderDataFromWork(orgCon,mainDiv,storyId,historyId,tagManager,authorManager,langManager)
                    if curChapters == 0:
                        # No chapters published yet: nothing to scrape, mark content done
                        orgCon.execute("UPDATE StoryHeaders SET contentScrapedFrom=? WHERE id=?", (historyId,storyId))
                    else:
                        scrapeContentFromWork(orgCon,mainDiv,storyId,historyId)
                except Exception as err:
                    # Dump the offending page for post-mortem debugging, then crash
                    print("Error scraping: %s" % url, "(Id: %d)" % historyId)
                    with open("problem.html","w",encoding="utf-8") as fOut:
                        fOut.write(html)
                    raise err
                    # NOTE(review): the rollback/continue below is unreachable while
                    # `raise err` above is active; it only runs if the raise is
                    # commented out (see the toggled #raise err) — confirm intent
                    orgCon.rollback()
                    continue;
                    #raise err
            # Finished scraping this page
            orgCon.execute("DELETE FROM IdQueue WHERE id=?",(historyId,))
            orgCon.commit()
            if (i+1)%printEvery == 0:
                print("(%d) %d/%d %.2f%%" % (batchIdx,i+1,totalCount,100*(i+1)/totalCount))
    finally:
        orgCon.close()
        reqCon.close()
def getBatchPath(idx):
    """Relative path of the per-batch working database for batch *idx*."""
    return "Batches/orgBatch_{:d}.db".format(idx)
def splitIntoBatches(batchCount):
    """Create *batchCount* batch databases that replicate the main db's
    schema, then distribute the main IdQueue round-robin across them."""
    print("Spliting into %d batches..." % batchCount)
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    # Get SQL for replicating general database structure
    createTableSql = {}
    for tableName in [x[0] for x in orgCon.execute("SELECT name FROM sqlite_master WHERE type='table'")]:
        createTableSql[tableName] = orgCon.getTableCreateSql(tableName)
    # sqlite_sequence is managed by sqlite itself and cannot be created manually
    del createTableSql["sqlite_sequence"]
    print("Replicating database structure...")
    subCons = [lite.connect(getBatchPath(i)) for i in range(batchCount)]
    for subCon in subCons:
        # Create tables if they do not already exist
        for tableName in createTableSql:
            subCon.execute(createTableSql[tableName])
        # Clear IdQueue table in case there is data left over
        subCon.execute("DELETE FROM IdQueue")
    print("Dividing Id-Queue...")
    printEvery = 100000
    total = int(orgCon.getSingle("SELECT COUNT(id) FROM IdQueue"))
    # Round-robin assignment keeps the batches evenly sized
    for i, (id,) in enumerate(orgCon.execute("SELECT id FROM IdQueue")):
        subCons[i%batchCount].execute("INSERT INTO IdQueue(id) VALUES(?)", (id,))
        if (i+1)%printEvery == 0:
            print("%d/%d" % (i+1,total))
    print("Saving sub-databases...")
    for subCon in subCons:
        subCon.commit()
        subCon.close()
    print("Done.")
# Assumes the table has an "id" field and a unique "name" field;
# "name" is what decides whether two entities are the same.
# Does two things:
#   - copies entities missing from the target table over from the source
#   - builds remapDict such that remapDict[originalId] == targetId
def mergeNameEntities(sourceCon, targetCon, table):
    """Merge *table*'s name-entities from sourceCon into targetCon and
    return a {sourceId: targetId} remapping dictionary."""
    # Snapshot of what the target already knows, keyed by name
    existing = {name: rowId for rowId, name in
                targetCon.execute("SELECT id, name FROM %s" % table)}
    insertCur = targetCon.cursor()
    remapDict = {}
    for row in sourceCon.getDicts("SELECT * FROM %s" % table):
        sourceId = row["id"]
        targetId = existing.get(row["name"])
        if targetId is not None:
            remapDict[sourceId] = targetId
        else:
            # Not present in the target: copy it over, letting the
            # target database assign a fresh id
            del row["id"]
            insertCur.insertDict(table, row)
            remapDict[sourceId] = insertCur.lastrowid
    return remapDict
def mergeBatches(batchCount):
    """Fold every per-batch database back into organizedData.sqlite3,
    remapping lookup-table ids, then wipe the batch databases so the
    same data is never merged twice."""
    print("Merging %d batches..." % batchCount)
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    existingSeriesIds = set([x[0] for x in orgCon.execute("SELECT id FROM Series")])
    # Total stories merged across all batches (for the final report)
    count = 0
    for i in range(batchCount):
        subCon = lite.connect(getBatchPath(i))
        # Delete entries that already exist for imported data
        for (id,) in subCon.execute("SELECT id FROM StoryHeaders"):
            orgCon.execute("DELETE FROM AuthorDedicationLinks WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM AuthorLinks WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM Chapters WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM SeriesLinks WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM StoryHeaders WHERE id=?", (id,))
            orgCon.execute("DELETE FROM TagLinks WHERE storyId=?", (id,))
        # Merge named entities and get remap dictionaries (batch-local id -> main-db id)
        authorRemap = mergeNameEntities(subCon,orgCon,"Authors")
        langRemap = mergeNameEntities(subCon,orgCon,"Languages")
        tagRemap = mergeNameEntities(subCon,orgCon,"Tags")
        # Series uses a site-specific id so it doesn't need to get remapped
        for dataDict in subCon.getDicts("SELECT * FROM Series"):
            if dataDict["id"] not in existingSeriesIds:
                orgCon.insertDict("Series",dataDict)
                existingSeriesIds.add(dataDict["id"])
        # Enter author links, remapping the author ids
        for tableName in ["AuthorDedicationLinks","AuthorLinks"]:
            for storyId, authorId in subCon.execute("SELECT storyId, authorId FROM %s" % tableName):
                authorId = authorRemap[authorId]
                orgCon.execute("INSERT INTO %s(storyId, authorId) VALUES(?,?)" % tableName, (storyId,authorId))
        # Enter seriesLinks, and Chapters data
        # For these the data can just be copied over directly
        for tableName in ["SeriesLinks","Chapters"]:
            for dataDict in subCon.getDicts("SELECT * FROM %s" % tableName):
                orgCon.insertDict(tableName,dataDict)
        # Enter tag links, remapping the tag ids
        for storyId, tagId in subCon.execute("SELECT storyId, tagId FROM TagLinks"):
            tagId = tagRemap[tagId]
            orgCon.execute("INSERT INTO TagLinks(storyId, tagId) VALUES(?,?)", (storyId,tagId))
        # Enter header data and remove IdQueue items
        for dataDict in subCon.getDicts("SELECT * FROM StoryHeaders"):
            if dataDict["language"] is not None:
                dataDict["language"] = langRemap[dataDict["language"]]
            orgCon.insertDict("StoryHeaders",dataDict)
            orgCon.execute("DELETE FROM IdQueue WHERE id=?", (dataDict["contentScrapedFrom"],))
            count += 1
        # Delete imported data so it doesn't get re-imported in the future.
        # NOTE(review): the table list is read from the MAIN db but wiped in the
        # batch db — this assumes both schemas are identical; confirm.
        wipeList = [x[0] for x in orgCon.execute("SELECT name FROM sqlite_master WHERE type='table'")]
        wipeList.remove("IdQueue")
        for wipeTable in wipeList:
            subCon.execute("DELETE FROM %s" % wipeTable)
        orgCon.commit()
        subCon.commit()
        subCon.close()
        print("%d/%d" % (i+1,batchCount))
    orgCon.close()
    print("Done.\n%d stories merged in" % count)
def multiWork(batchCount):
    """Process every batch in parallel, one worker process per batch."""
    with Pool(batchCount) as p:
        p.map(workFor, list(range(batchCount)))
def fixAuthorLang(batchCount):
    """One-off repair: language ids were mistakenly created as rows in the
    Authors table; move each such name into Languages and repoint
    StoryHeaders.language at the new id, for every batch database."""
    print("Fixing languages")
    for i in range(batchCount):
        orgCon = lite.connect(getBatchPath(i))
        # Distinct language ids currently referenced by StoryHeaders
        langIds = [x[0] for x in orgCon.execute("SELECT language FROM StoryHeaders GROUP BY language")]
        insertCur = orgCon.cursor()
        for langId in langIds:
            # The "language" id actually points into Authors; fetch the name from there
            langName = [x[0] for x in orgCon.execute("SELECT name FROM Authors WHERE id=?",(langId,))][0]
            orgCon.execute("DELETE FROM Authors WHERE id=?",(langId,))
            insertCur.execute("INSERT INTO Languages(name) VALUES(?)",(langName,))
            newId = insertCur.lastrowid
            orgCon.execute("UPDATE StoryHeaders SET language=? WHERE language=?",(newId,langId))
        orgCon.commit()
        print("Fixed batch %d" % i)
        orgCon.close()
    print("Done")
def reQueue():
    """Queue a fresh download ("requery") for every page still listed in
    the organized db's IdQueue, using the url of the original request."""
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    reqCon = lite.connect(dbPath + "rawRequests.sqlite3")
    for (id,) in orgCon.execute("SELECT id FROM IdQueue"):
        # Look up the url of the old request and re-enqueue it for download
        url = [x[0] for x in reqCon.execute("SELECT url FROM History WHERE id=?", (id,))][0]
        dataDict = {}
        dataDict["url"] = url
        dataDict["date"] = getTimeStr()
        dataDict["category"] = "requery"
        reqCon.insertDict("Queued", dataDict)
    reqCon.commit()
    reqCon.close()
    orgCon.close()
def adjustIdQueue():
    """Repoint IdQueue entries at the History ids of the re-downloaded
    ("requery") pages, matching old and new requests by url."""
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    reqCon = lite.connect(dbPath + "rawRequests.sqlite3")
    # url -> History id of the requery download (later rows win)
    newIdMap = {}
    for id, url in reqCon.execute("SELECT id, url FROM History WHERE category=\"requery\""):
        newIdMap[url] = id
    for (oldId,) in orgCon.execute("SELECT id FROM IdQueue"):
        url = reqCon.getSingle("SELECT url FROM History WHERE id=?",(oldId,))
        newId = newIdMap[url]
        orgCon.execute("UPDATE IdQueue SET id=? WHERE id=?", (newId,oldId))
    orgCon.commit()
if __name__ == "__main__":
    batchCount = 20
    # Split the pending queue into batch databases, process each batch
    # sequentially, then merge the results back into the main database.
    splitIntoBatches(batchCount)
    for i in range(batchCount):
        print("Processing batch %d" % i)
        workFor(i)
    #multiWork(batchCount)
    mergeBatches(batchCount)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement