process.py

a guest
Mar 24th, 2023
159
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 22.57 KB | Source Code | 0 0
import liteConnectPlus as lite
from datetime import datetime
from bs4 import BeautifulSoup
from multiprocessing import Pool
import re
from misc import TimedSection
import html as htmlLib

dbPath = "D:/MLProjects/Databases/"
baseUrl = "https://archiveofourown.org"
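
# Overview (inferred from the code below): raw AO3 pages live in
# rawRequests.sqlite3 (History table). This script splits the parse queue into
# per-batch SQLite files, scrapes each page's header and chapter text, and
# merges the batches back into organizedData.sqlite3.
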
ratingMap = {
    "Not Rated": -1,
    "No rating": -1,
    "General Audiences": 0,
    "Teen And Up Audiences": 1,
    "Teen & Up Audiences": 1,
    "Mature": 2,
    "Explicit": 3,
    }

finishedMap = {
    # -1: unknown
    "Work in Progress": 0,
    "Complete Work": 1,
    }

# Caches the (name -> id) mapping for a table with an "id" and a unique "name"
# column, inserting rows for names it has not seen before.
class NameTableManager:
    def __init__(self,tableName,con):
        self.tableName = tableName
        self.cur = con.cursor()
        self.data = {}
        for id, name in con.execute("SELECT id, name FROM %s" % tableName):
            self.data[name] = id

    def getIdOrCreate(self,name,**args):
        if name not in self.data:
            args["name"] = name
            self.cur.insertDict(self.tableName,args)
            self.data[name] = self.cur.lastrowid

        return self.data[name]
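
# Usage sketch (illustrative values):
#   tags = NameTableManager("Tags", orgCon)
#   tagId = tags.getIdOrCreate("Fluff", tagUrl="...", type="generic")
# Repeated calls with the same name return the cached id instead of
# inserting a duplicate row.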

def getTimeStr():
    dateTimeObj = datetime.now()
    return dateTimeObj.strftime("%Y-%m-%d %H:%M:%S")

def elemStr(elem):
    data = "".join(elem.strings)
    return data.strip()
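
# e.g. for an element parsed from "<h2> My <em>Title</em> </h2>" this joins
# the string pieces " My ", "Title", " " and strips to "My Title".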

def scrapeHeaderDataFromWork(orgCon,soup,id,scrapeFromId,tagManager,authorManager,langManager,debug=False,profile=False):

    prefaceDiv = soup.find("div",{"id":"workskin"}).find("div",{"class":"preface group"})

    # Title
    titleStr = elemStr(prefaceDiv.find("h2"))
    if debug:
        print("Title: %s" % titleStr)

    # Authors
    for link in prefaceDiv.find("h3").findAll("a"):
        author = link["href"].split("/")[2]
        if debug:
            print("Author: %s" % author)
        authorId = authorManager.getIdOrCreate(author)
        orgCon.execute("INSERT INTO AuthorLinks(storyId,authorId) VALUES(?,?)", (id,authorId))

    # Description
    descriptionElem = prefaceDiv.find("div",{"class":"summary module"})
    if descriptionElem is not None:
        descriptionStr = elemStr(descriptionElem.find("blockquote"))
    else:
        descriptionStr = ""

    # Author dedications
    notesElem = prefaceDiv.find("div",{"class":"notes module"})
    if notesElem is not None:
        assocElem = notesElem.find("ul",{"class":"associations"})
        if assocElem is not None:
            for link in assocElem.findAll("a"):
                if link["href"].startswith("/users/"):
                    author = link["href"].split("/")[2]
                elif link["href"].startswith("/gifts?recipient="):
                    author = link["href"].split("=")[-1]
                elif link["href"].startswith("/works/") or link["href"].startswith("/external_works/") or link["href"].startswith("/collections/"):
                    # The work is inspired by another work
                    # Even if there are dedications as well, I'm just going to ignore that
                    break
                elif link["href"] == "/gifts":
                    # Sometimes it's just blank. Not sure what's up with that
                    continue
                else:
                    raise Exception("Invalid dedication href: %s" % link["href"])

                if debug:
                    print("Dedicated to: %s" % author)
                authorId = authorManager.getIdOrCreate(author)
                orgCon.execute("INSERT INTO AuthorDedicationLinks(storyId,authorId) VALUES(?,?)", (id,authorId))

    metaContentElem = soup.find("div",{"class":"wrapper"})

    # Get Rating
    ratingElem = metaContentElem.find("dd",{"class":"rating tags"})
    if ratingElem is None:
        ratingId = -1
    else:
        ratingStrs = [x.string.strip() for x in ratingElem.findAll("a")]
        ratingId = -1
        for ratingStr in ratingStrs:
            if ratingStr not in ratingMap:
                raise Exception("Invalid HTML, unexpected rating-string >%s<" % ratingStr)
            ratingId = max(ratingId,ratingMap[ratingStr])
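
    # Multiple rating tags resolve to the most restrictive one, e.g. a work
    # tagged both "Teen And Up Audiences" and "Mature" ends up with ratingId 2.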

    # Get tags from different categories
    tagTypeMap = {
        "warning tags": "warning",
        "category tags": "category",
        "relationship tags": "relationship",
        "fandom tags": "fandom",
        "character tags": "character",
        "freeform tags": "generic",
        }
    for tagClass, tagType in tagTypeMap.items():
        tagElem = metaContentElem.find("dd",{"class":tagClass})
        if tagElem is not None:
            for link in tagElem.findAll("a"):
                tagUrl = baseUrl + link["href"]
                tagName = link.string.strip()
                tagId = tagManager.getIdOrCreate(tagName, tagUrl=tagUrl, type=tagType)
                orgCon.execute("INSERT INTO TagLinks(storyId,tagId) VALUES(?,?)", (id,tagId))

    # Get language
    langElem = metaContentElem.find("dd",{"class":"language"})
    if langElem is None:
        # Guess English
        langStr = "English"
    else:
        langStr = langElem.string.strip()
    langId = langManager.getIdOrCreate(langStr)

    # Get series
    seriesElem = metaContentElem.find("dd",{"class":"series"})
    if seriesElem is not None:
        spanElem = seriesElem.find("span",{"class":"position"})
        # In rare cases the "Series" property is there but blank
        if spanElem is not None:
            # "Part X of BLAH series", e.g. "Part 2 of the Foobar series" -> 2
            seriesStr = elemStr(spanElem).strip()
            seriesPart = int(seriesStr.split(" ")[1])

            seriesLink = spanElem.find("a")
            seriesName = seriesLink.string.strip()
            seriesId = int(seriesLink["href"].split("/")[-1])

            if len([x for x in orgCon.execute("SELECT * FROM Series WHERE id=?",(seriesId,))]) == 0:
                # Create series entry
                orgCon.execute("INSERT INTO Series(id,name) VALUES(?,?)", (seriesId,seriesName))

            # Add series link
            orgCon.execute("INSERT INTO SeriesLinks(seriesId,part,storyId) VALUES(?,?,?)", (seriesId,seriesPart,id))

    # Get stats
    statsElem = metaContentElem.find("dl",{"class":"stats"})

    publishDate = statsElem.find("dd",{"class":"published"}).string.strip()

    # From what I can tell, the status field is only present for WIP works;
    # otherwise there is no indication of whether a work is finished or not
    statusElem = statsElem.find("dd",{"class":"status"})
    if statusElem is None:
        finishedId = 1
    else:
        finishedId = 0

    def getStatsElem(className,defValue):
        elem = statsElem.find("dd",{"class":className})
        if elem is None or elem.string is None:
            return defValue
        else:
            return elem.string.strip()
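
    # e.g. getStatsElem("kudos","0") yields the kudos count as text, falling
    # back to "0" when the <dd class="kudos"> element is missing or empty.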
    wordCount       = int(getStatsElem("words","-1"))
    kudosCount      = int(getStatsElem("kudos","0"))
    hitCount        = int(getStatsElem("hits","0"))
    bookmarkCount   = int(getStatsElem("bookmarks","0"))
    commentCount    = int(getStatsElem("comments","0"))

    chaptersStr     = getStatsElem("chapters","1/?")
    curChapters, maxChapters = chaptersStr.split("/")
    curChapters = int(curChapters)
    if maxChapters == "?":
        maxChapters = -1
    else:
        maxChapters = int(maxChapters)
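    # e.g. "3/10" -> curChapters=3, maxChapters=10; open-ended "5/?" -> (5, -1)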

    if debug:
        print("Stats:")
        print("\tLanguage: %d" % langId)
        print("\tWord Count: %d" % wordCount)
        print("\tChapters: %d/%d" % (curChapters,maxChapters))
        print("\tKudos: %d" % kudosCount)
        print("\tHits: %d" % hitCount)
        print("\tComments: %d" % commentCount)

    # =============
    # Making entry
    # =============
    processDate = getTimeStr()

    dataDict = {
        "id": id,
        "headerScrapedFrom": scrapeFromId,
        "processDate": processDate,
        "rating": ratingId,
        "finished": finishedId,
        "title": titleStr,
        "description": descriptionStr,
        "curChapters": curChapters,
        "maxChapters": maxChapters,
        "language": langId,
        "words": wordCount,
        "hits": hitCount,
        "kudos": kudosCount,
        "comments": commentCount,
        "bookmarks": bookmarkCount,
        "date": publishDate,
        "restricted": 0,
        "hidden": 0,
    }
    with TimedSection("Story-Insert",active=profile):
        orgCon.insertDict("StoryHeaders",dataDict)

    return curChapters

class HtmlTool(object):
    """
    Algorithms to process HTML.
    """
    # Regular expressions to recognize different parts of HTML.
    # Internal style sheets or JavaScript
    script_sheet = re.compile(r"<(script|style).*?>.*?(</\1>)",
                              re.IGNORECASE | re.DOTALL)
    # HTML comments - can contain ">"
    comment = re.compile(r"<!--(.*?)-->", re.DOTALL)
    # HTML tags: <any-text>
    tag = re.compile(r"<.*?>", re.DOTALL)
    # Consecutive whitespace characters
    nwhites = re.compile(r"[\s]+")
    # <p>, <div>, <br> tags and associated closing tags
    p_div = re.compile(r"</?(p|div|br).*?>",
                       re.IGNORECASE | re.DOTALL)
    # Consecutive whitespace, but no newlines
    nspace = re.compile(r"[^\S\n]+", re.UNICODE)
    # At least two consecutive newlines
    n2ret = re.compile(r"\n\n+")
    # A return followed by a space
    retspace = re.compile(r"(\n )")

    @staticmethod
    def to_nice_text(html):
        """Remove all HTML tags, but produce a nicely formatted text."""
        if html is None:
            return u""
        text = html
        text = HtmlTool.script_sheet.sub("", text)
        text = HtmlTool.comment.sub("", text)
        text = HtmlTool.nwhites.sub(" ", text)
        text = HtmlTool.p_div.sub("\n", text)  # convert <p>, <div>, <br> to "\n"
        text = HtmlTool.tag.sub("", text)      # remove all tags
        text = htmlLib.unescape(text)
        # Get whitespace right
        text = HtmlTool.nspace.sub(" ", text)
        text = HtmlTool.retspace.sub("\n", text)
        text = HtmlTool.n2ret.sub("\n\n", text)
        text = text.strip()
        return text
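
# Illustrative example (input made up for demonstration):
#   HtmlTool.to_nice_text("<div><p>Hello <b>world</b></p><p>Bye</p></div>")
# returns "Hello world\n\nBye": block-level tags become newlines, inline tags
# are stripped, and runs of blank lines collapse to a single blank line.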

def getCleanText(elem):
    chapterHeading = elem.find("h3",{"class":"landmark heading"})
    if chapterHeading is not None:
        chapterHeading.extract()
    text = HtmlTool.to_nice_text(str(elem))
    return text

def scrapeContentFromWork(orgCon,soup,id,scrapeFromId):
    chaptersElem = soup.find("div",{"id":"chapters"})
    chapterElems = []

    chapterNum = 1
    while True:
        chapterElem = chaptersElem.find("div",{"id":"chapter-%d" % chapterNum})
        if chapterElem is None:
            break
        chapterElems.append(chapterElem.find("div",{"class":"userstuff module"}))
        chapterNum += 1

    if chapterNum == 1:
        # This work only has one chapter
        chapterElems.append(chaptersElem.find("div",{"class":"userstuff"}))

    cleanChapterText = [getCleanText(x) for x in chapterElems]
    chapterHtml = [str(x) for x in chapterElems]

    for idx, (html, clean) in enumerate(zip(chapterHtml,cleanChapterText)):
        orgCon.execute("INSERT INTO Chapters(storyId,idx,textRaw,text) VALUES(?,?,?,?)", (id,idx,html,clean))
    orgCon.execute("UPDATE StoryHeaders SET contentScrapedFrom=? WHERE id=?", (scrapeFromId,id))


# Sometimes it looks like only part of the html got read;
# to check for this we just see whether the html ends with </html>
def isHtmlIncomplete(html):
    return not html.rstrip().endswith("</html>")
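
# e.g. isHtmlIncomplete("<html><body>truncat") -> True,
#      isHtmlIncomplete("<html>...</html>\n")  -> False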

def workFor(batchIdx,profile=False):
    reqCon = lite.connect(dbPath + "rawRequests.sqlite3")
    orgCon = lite.connect(getBatchPath(batchIdx))

    printEvery = 100

    totalCount = int(orgCon.getSingle("SELECT COUNT(id) FROM IdQueue"))
    print("(%d) %d entries queued" % (batchIdx,totalCount))

    try:
        with TimedSection("Parse-Init"):
            tagManager = NameTableManager("Tags",orgCon)
            authorManager = NameTableManager("Authors",orgCon)
            langManager = NameTableManager("Languages",orgCon)

        idQueue = [x[0] for x in orgCon.execute("SELECT id FROM IdQueue")]
        for i, historyId in enumerate(idQueue):
            url, html = [x for x in reqCon.execute("SELECT url, text FROM History WHERE id=?",(historyId,))][0]

            if isHtmlIncomplete(html):
                print("Incomplete html for url: %s (Id: %d)" % (url,historyId))
                continue

            # https://archiveofourown.org/works/3?view_full_work=true&view_adult=true
            storyId = int(url.split("/")[-1].split("?")[0])

            with TimedSection("Soup-Parse",active=profile):
                soup = BeautifulSoup(html, 'html.parser')

            mainDiv = soup.find("div",{"id":"main"})

            if mainDiv.find("div",{"id":"workskin"}) is None:
                # Work is hidden
                dataDict = {}
                dataDict["id"] = storyId
                dataDict["processDate"] = getTimeStr()
                dataDict["restricted"] = 0
                dataDict["headerScrapedFrom"] = historyId
                dataDict["contentScrapedFrom"] = historyId
                dataDict["hidden"] = 1
                orgCon.insertDict("StoryHeaders",dataDict)
            else:
                try:
                    curChapters = scrapeHeaderDataFromWork(orgCon,mainDiv,storyId,historyId,tagManager,authorManager,langManager)
                    if curChapters == 0:
                        orgCon.execute("UPDATE StoryHeaders SET contentScrapedFrom=? WHERE id=?", (historyId,storyId))
                    else:
                        scrapeContentFromWork(orgCon,mainDiv,storyId,historyId)
                except Exception:
                    print("Error scraping: %s" % url, "(Id: %d)" % historyId)
                    with open("problem.html","w",encoding="utf-8") as fOut:
                        fOut.write(html)

                    # To skip the bad page instead of aborting, replace the
                    # re-raise with: orgCon.rollback(); continue
                    raise

            # Finished scraping this page
            orgCon.execute("DELETE FROM IdQueue WHERE id=?",(historyId,))
            orgCon.commit()

            if (i+1)%printEvery == 0:
                print("(%d) %d/%d %.2f%%" % (batchIdx,i+1,totalCount,100*(i+1)/totalCount))
    finally:
        orgCon.close()
        reqCon.close()

def getBatchPath(idx):
    return "Batches/orgBatch_%d.db" % idx

def splitIntoBatches(batchCount):
    print("Splitting into %d batches..." % batchCount)

    orgCon = lite.connect(dbPath + "organizedData.sqlite3")

    # Get SQL for replicating general database structure
    createTableSql = {}
    for tableName in [x[0] for x in orgCon.execute("SELECT name FROM sqlite_master WHERE type='table'")]:
        createTableSql[tableName] = orgCon.getTableCreateSql(tableName)
    del createTableSql["sqlite_sequence"]

    print("Replicating database structure...")
    subCons = [lite.connect(getBatchPath(i)) for i in range(batchCount)]
    for subCon in subCons:
        # Create tables if they do not already exist
        for tableName in createTableSql:
            subCon.execute(createTableSql[tableName])

        # Clear IdQueue table in case there is data left over
        subCon.execute("DELETE FROM IdQueue")

    print("Dividing Id-Queue...")
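    # Queue ids are dealt round-robin: item i goes to batch i % batchCount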
    printEvery = 100000
    total = int(orgCon.getSingle("SELECT COUNT(id) FROM IdQueue"))
    for i, (id,) in enumerate(orgCon.execute("SELECT id FROM IdQueue")):
        subCons[i%batchCount].execute("INSERT INTO IdQueue(id) VALUES(?)", (id,))

        if (i+1)%printEvery == 0:
            print("%d/%d" % (i+1,total))

    print("Saving sub-databases...")
    for subCon in subCons:
        subCon.commit()
        subCon.close()

    print("Done.")

# Assumes table has an "id" field and a unique "name" field
# "name" gets used to check if two entities are equal
# Does two things:
#   If an entity does not exist in the target table, copies it over
#   Creates remapDict such that remapDict[originalId] = targetId
def mergeNameEntities(sourceCon,targetCon,table):
    targetMap = {}
    for id, name in targetCon.execute("SELECT id, name FROM %s" % table):
        targetMap[name] = id

    remapDict = {}
    insertCon = targetCon.cursor()
    for dataDict in sourceCon.getDicts("SELECT * FROM %s" % table):
        id = dataDict["id"]
        name = dataDict["name"]
        if name in targetMap:
            remapDict[id] = targetMap[name]
        else:
            del dataDict["id"]
            insertCon.insertDict(table,dataDict)
            remapDict[id] = insertCon.lastrowid
    return remapDict
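
# Worked example (values illustrative): if the batch db holds Authors rows
# (1,"foo") and (2,"bar"), and the target already has "bar" under id 7, then
# "foo" is inserted fresh (say as id 12) and remapDict == {1: 12, 2: 7}.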

def mergeBatches(batchCount):
    print("Merging %d batches..." % batchCount)
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")

    existingSeriesIds = set(x[0] for x in orgCon.execute("SELECT id FROM Series"))

    count = 0
    for i in range(batchCount):
        subCon = lite.connect(getBatchPath(i))

        # Delete entries that already exist for imported data
        for (id,) in subCon.execute("SELECT id FROM StoryHeaders"):
            orgCon.execute("DELETE FROM AuthorDedicationLinks WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM AuthorLinks WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM Chapters WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM SeriesLinks WHERE storyId=?", (id,))
            orgCon.execute("DELETE FROM StoryHeaders WHERE id=?", (id,))
            orgCon.execute("DELETE FROM TagLinks WHERE storyId=?", (id,))

        # Merge named entities and get remap dictionaries
        authorRemap = mergeNameEntities(subCon,orgCon,"Authors")
        langRemap = mergeNameEntities(subCon,orgCon,"Languages")
        tagRemap = mergeNameEntities(subCon,orgCon,"Tags")

        # Series uses a site-specific id so it doesn't need to get remapped
        for dataDict in subCon.getDicts("SELECT * FROM Series"):
            if dataDict["id"] not in existingSeriesIds:
                orgCon.insertDict("Series",dataDict)
                existingSeriesIds.add(dataDict["id"])

        # Enter author links
        for tableName in ["AuthorDedicationLinks","AuthorLinks"]:
            for storyId, authorId in subCon.execute("SELECT storyId, authorId FROM %s" % tableName):
                authorId = authorRemap[authorId]
                orgCon.execute("INSERT INTO %s(storyId, authorId) VALUES(?,?)" % tableName, (storyId,authorId))

        # Enter SeriesLinks and Chapters data
        # For these the data can just be copied over directly
        for tableName in ["SeriesLinks","Chapters"]:
            for dataDict in subCon.getDicts("SELECT * FROM %s" % tableName):
                orgCon.insertDict(tableName,dataDict)

        # Enter tag links
        for storyId, tagId in subCon.execute("SELECT storyId, tagId FROM TagLinks"):
            tagId = tagRemap[tagId]
            orgCon.execute("INSERT INTO TagLinks(storyId, tagId) VALUES(?,?)", (storyId,tagId))

        # Enter header data and remove IdQueue items
        for dataDict in subCon.getDicts("SELECT * FROM StoryHeaders"):
            if dataDict["language"] is not None:
                dataDict["language"] = langRemap[dataDict["language"]]
            orgCon.insertDict("StoryHeaders",dataDict)

            orgCon.execute("DELETE FROM IdQueue WHERE id=?", (dataDict["contentScrapedFrom"],))
            count += 1

        # Delete imported data so it doesn't get re-imported in the future
        wipeList = [x[0] for x in orgCon.execute("SELECT name FROM sqlite_master WHERE type='table'")]
        wipeList.remove("IdQueue")
        for wipeTable in wipeList:
            subCon.execute("DELETE FROM %s" % wipeTable)

        orgCon.commit()
        subCon.commit()
        subCon.close()

        print("%d/%d" % (i+1,batchCount))

    orgCon.close()
    print("Done.\n%d stories merged in" % count)

def multiWork(batchCount):
    with Pool(batchCount) as p:
        p.map(workFor, range(batchCount))


# Repair helper: language ids ended up pointing into the Authors table;
# move the names into Languages and update StoryHeaders accordingly.
def fixAuthorLang(batchCount):
    print("Fixing languages")
    for i in range(batchCount):
        orgCon = lite.connect(getBatchPath(i))
        langIds = [x[0] for x in orgCon.execute("SELECT language FROM StoryHeaders GROUP BY language")]

        insertCur = orgCon.cursor()
        for langId in langIds:
            langName = [x[0] for x in orgCon.execute("SELECT name FROM Authors WHERE id=?",(langId,))][0]
            orgCon.execute("DELETE FROM Authors WHERE id=?",(langId,))

            insertCur.execute("INSERT INTO Languages(name) VALUES(?)",(langName,))
            newId = insertCur.lastrowid
            orgCon.execute("UPDATE StoryHeaders SET language=? WHERE language=?",(newId,langId))
        orgCon.commit()
        print("Fixed batch %d" % i)
        orgCon.close()
    print("Done")

def reQueue():
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    reqCon = lite.connect(dbPath + "rawRequests.sqlite3")

    for (id,) in orgCon.execute("SELECT id FROM IdQueue"):
        url = [x[0] for x in reqCon.execute("SELECT url FROM History WHERE id=?", (id,))][0]

        dataDict = {}
        dataDict["url"] = url
        dataDict["date"] = getTimeStr()
        dataDict["category"] = "requery"
        reqCon.insertDict("Queued", dataDict)

    reqCon.commit()
    reqCon.close()
    orgCon.close()

def adjustIdQueue():
    orgCon = lite.connect(dbPath + "organizedData.sqlite3")
    reqCon = lite.connect(dbPath + "rawRequests.sqlite3")

    newIdMap = {}
    for id, url in reqCon.execute("SELECT id, url FROM History WHERE category=?", ("requery",)):
        newIdMap[url] = id

    for (oldId,) in orgCon.execute("SELECT id FROM IdQueue"):
        url = reqCon.getSingle("SELECT url FROM History WHERE id=?",(oldId,))
        newId = newIdMap[url]
        orgCon.execute("UPDATE IdQueue SET id=? WHERE id=?", (newId,oldId))
    orgCon.commit()

if __name__ == "__main__":
    batchCount = 20

    splitIntoBatches(batchCount)
    for i in range(batchCount):
        print("Processing batch %d" % i)
        workFor(i)
    #multiWork(batchCount)
    mergeBatches(batchCount)