# Pastebin paste "ljdump 1.6a for python 3", posted by a guest on Dec 13th, 2013
# (syntax: Python, size: 14.45 KB).
#!/usr/bin/python
#
# ljdump.py - livejournal archiver
# Version 1.6
#
# LICENSE
#
# This software is provided 'as-is', without any express or implied
# warranty.  In no event will the author be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
# 1. The origin of this software must not be misrepresented; you must not
#    claim that you wrote the original software. If you use this software
#    in a product, an acknowledgment in the product documentation would be
#    appreciated but is not required.
# 2. Altered source versions must be plainly marked as such, and must not be
#    misrepresented as being the original software.
# 3. This notice may not be removed or altered from any source distribution.
#
# Copyright (c) 2005-2010 Greg Hewgill and contributors

import codecs
import os
import pickle
import pprint
import re
import shutil
import sys
import urllib.parse
import urllib.request
import xml.dom.minidom
import xml.parsers.expat
import xmlrpc.client
from xml.sax import saxutils

  29. MimeExtensions = {
  30.     "image/gif": ".gif",
  31.     "image/jpeg": ".jpg",
  32.     "image/png": ".png",
  33. }
  34.  
  35. try:
  36.     from hashlib import md5
  37. except ImportError:
  38.     import md5 as _md5
  39.     md5 = _md5.new
  40.  
  41. def calcchallenge(challenge, password):
  42.     challenge = challenge.encode("utf-8")
  43.     password = password.encode("utf-8")
  44.     return md5(challenge+md5(password).hexdigest().encode("utf-8")).hexdigest()
  45.  
  46. def flatresponse(response):
  47.     r = {}
  48.     while True:
  49.         name = str(response.readline(),"utf-8")
  50.         if len(name) == 0:
  51.             break
  52.         if name[-1] == '\n':
  53.             name = name[:len(name)-1]
  54.         value = str(response.readline(),"utf-8")
  55.         if value[-1] == '\n':
  56.             value = value[:len(value)-1]
  57.         r[name] = value
  58.     return r
  59.  
  60. def getljsession(server, username, password):
  61.     r = urllib.request.urlopen(server+"/interface/flat", bytes(urllib.parse.urlencode({'mode': 'getchallenge'}).encode("utf-8")))
  62.     response = flatresponse(r)
  63.     r.close()
  64.     r = urllib.request.urlopen(server+"/interface/flat", bytes(urllib.parse.urlencode({'mode': 'sessiongenerate', 'user': username, 'auth_method': 'challenge', 'auth_challenge': response['challenge'], 'auth_response': (calcchallenge(response['challenge'], password))}).encode("utf-8")))
  65.     response = flatresponse(r)
  66.     r.close()
  67.     return response['ljsession']
  68.  
  69. def dochallenge(server, params, password):
  70.     challenge = server.LJ.XMLRPC.getchallenge()
  71.     params.update({
  72.         'auth_method': "challenge",
  73.         'auth_challenge': challenge['challenge'],
  74.         'auth_response': calcchallenge(challenge['challenge'], password)
  75.     })
  76.     return params
  77.  
  78. def dumpelement(f, name, e):
  79.     f.write("<%s>\n" % name)
  80.     for k in list(e.keys()):
  81.         if isinstance(e[k], {}.__class__):
  82.             dumpelement(f, k, e[k])
  83.         else:
  84.             s = str(bytes(str(e[k]), "latin"),"utf-8")
  85.             f.write("<%s>%s</%s>\n" % (k, saxutils.escape(s), k))
  86.     f.write("</%s>\n" % name)
  87.  
  88. def writedump(fn, event):
  89.     f = codecs.open(fn, "w", "UTF-8")
  90.     f.write("""<?xml version="1.0" encoding="UTF-8" ?>\n""")
  91.     dumpelement(f, "event", event)
  92.     f.close()
  93.  
  94. def writelast(journal, lastsync, lastmaxid):
  95.     f = open("%s/.last" % journal, "w")
  96.     f.write("%s\n" % lastsync)
  97.     f.write("%s\n" % lastmaxid)
  98.     f.close()
  99.  
  100. def createxml(doc, name, map):
  101.     e = doc.createElement(name)
  102.     for k in list(map.keys()):
  103.         me = doc.createElement(k)
  104.         me.appendChild(doc.createTextNode(map[k]))
  105.         e.appendChild(me)
  106.     return e
  107.  
  108. def gettext(e):
  109.     if len(e) == 0:
  110.         return ""
  111.     return e[0].firstChild.nodeValue
  112.  
  113. def ljdump(Server, Username, Password, Journal):
  114.     m = re.search("(.*)/interface/xmlrpc", Server)
  115.     if m:
  116.         Server = m.group(1)
  117.     if Username != Journal:
  118.         authas = "&authas=%s" % Journal
  119.     else:
  120.         authas = ""
  121.  
  122.     print("Fetching journal entries for: %s" % Journal)
  123.     try:
  124.         os.mkdir(Journal)
  125.         print("Created subdirectory: %s" % Journal)
  126.     except:
  127.         pass
  128.  
  129.     ljsession = getljsession(Server, Username, Password)
  130.  
  131.     server = xmlrpc.client.ServerProxy(Server+"/interface/xmlrpc")
  132.  
  133.     newentries = 0
  134.     newcomments = 0
  135.     errors = 0
  136.  
  137.     lastsync = ""
  138.     lastmaxid = 0
  139.     try:
  140.         f = open("%s/.last" % Journal, "r")
  141.         lastsync = f.readline()
  142.         if lastsync[-1] == '\n':
  143.             lastsync = lastsync[:len(lastsync)-1]
  144.         lastmaxid = f.readline()
  145.         if len(lastmaxid) > 0 and lastmaxid[-1] == '\n':
  146.             lastmaxid = lastmaxid[:len(lastmaxid)-1]
  147.         if lastmaxid == "":
  148.             lastmaxid = 0
  149.         else:
  150.             lastmaxid = int(lastmaxid)
  151.         f.close()
  152.     except:
  153.         pass
  154.     origlastsync = lastsync
  155.  
  156.     r = server.LJ.XMLRPC.login(dochallenge(server, {
  157.         'username': Username,
  158.         'ver': 1,
  159.         'getpickws': 1,
  160.         'getpickwurls': 1,
  161.     }, Password))
  162.     userpics = dict(list(zip(list(map(str, r['pickws'])), r['pickwurls'])))
  163.     if r['defaultpicurl']:
  164.         userpics['*'] = r['defaultpicurl']
  165.  
  166.     while True:
  167.         r = server.LJ.XMLRPC.syncitems(dochallenge(server, {
  168.             'username': Username,
  169.             'ver': 1,
  170.             'lastsync': lastsync,
  171.             'usejournal': Journal,
  172.         }, Password))
  173.         #pprint.pprint(r)
  174.         if len(r['syncitems']) == 0:
  175.             break
  176.         for item in r['syncitems']:
  177.             if item['item'][0] == 'L':
  178.                 print("Fetching journal entry %s (%s)" % (item['item'], item['action']))
  179.                 try:
  180.                     e = server.LJ.XMLRPC.getevents(dochallenge(server, {
  181.                         'username': Username,
  182.                         'ver': 1,
  183.                         'selecttype': "one",
  184.                         'itemid': item['item'][2:],
  185.                         'usejournal': Journal,
  186.                     }, Password))
  187.                     if e['events']:
  188.                         writedump("%s/%s" % (Journal, item['item']), e['events'][0])
  189.                         newentries += 1
  190.                     else:
  191.                         print("Unexpected empty item: %s" % item['item'])
  192.                         errors += 1
  193.                 except xmlrpc.client.Fault as x:
  194.                     print("Error getting item: %s" % item['item'])
  195.                     pprint.pprint(x)
  196.                     errors += 1
  197.             lastsync = item['time']
  198.             writelast(Journal, lastsync, lastmaxid)
  199.  
  200.     # The following code doesn't work because the server rejects our repeated calls.
  201.     # http://www.livejournal.com/doc/server/ljp.csp.xml-rpc.getevents.html
  202.     # contains the statement "You should use the syncitems selecttype in
  203.     # conjuntions [sic] with the syncitems protocol mode", but provides
  204.     # no other explanation about how these two function calls should
  205.     # interact. Therefore we just do the above slow one-at-a-time method.
  206.  
  207.     #while True:
  208.     #    r = server.LJ.XMLRPC.getevents(dochallenge(server, {
  209.     #        'username': Username,
  210.     #        'ver': 1,
  211.     #        'selecttype': "syncitems",
  212.     #        'lastsync': lastsync,
  213.     #    }, Password))
  214.     #    pprint.pprint(r)
  215.     #    if len(r['events']) == 0:
  216.     #        break
  217.     #    for item in r['events']:
  218.     #        writedump("%s/L-%d" % (Journal, item['itemid']), item)
  219.     #        newentries += 1
  220.     #        lastsync = item['eventtime']
  221.  
  222.     print("Fetching journal comments for: %s" % Journal)
  223.  
  224.     try:
  225.         f = codecs.open("%s/comment.meta" % Journal, "r", "UTF-8")
  226.         metacache = pickle.load(f)
  227.         f.close()
  228.     except:
  229.         metacache = {}
  230.  
  231.     try:
  232.         f = codecs.open("%s/user.map" % Journal, "r", "UTF-8")
  233.         usermap = pickle.load(f)
  234.         f.close()
  235.     except:
  236.         usermap = {}
  237.  
  238.     maxid = lastmaxid
  239.     while True:
  240.         try:
  241.             try:
  242.                 r = urllib.request.urlopen(urllib.request.Request(Server+"/export_comments.bml?get=comment_meta&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession}))
  243.                 meta = xml.dom.minidom.parse(r)
  244.             except Exception as x:
  245.                 print("*** Error fetching comment meta, possibly not community maintainer?")
  246.                 print("***", x)
  247.                 break
  248.         finally:
  249.             try:
  250.                 r.close()
  251.             except AttributeError: # r is sometimes a dict for unknown reasons
  252.                 pass
  253.         for c in meta.getElementsByTagName("comment"):
  254.             id = int(c.getAttribute("id"))
  255.             metacache[id] = {
  256.                 'posterid': c.getAttribute("posterid"),
  257.                 'state': c.getAttribute("state"),
  258.             }
  259.             if id > maxid:
  260.                 maxid = id
  261.         for u in meta.getElementsByTagName("usermap"):
  262.             usermap[u.getAttribute("id")] = u.getAttribute("user")
  263.         if maxid >= int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue):
  264.             break
  265.  
  266.     f = codecs.open("%s/comment.meta" % Journal, "w", "UTF-8")
  267.     #pickle.dump(metacache, f)
  268.     f.close()
  269.  
  270.     f = codecs.open("%s/user.map" % Journal, "w", "UTF-8")
  271.     #pickle.dump(usermap, f)
  272.     f.close()
  273.  
  274.     newmaxid = maxid
  275.     maxid = lastmaxid
  276.     while True:
  277.         try:
  278.             try:
  279.                 r = urllib.request.urlopen(urllib.request.Request(Server+"/export_comments.bml?get=comment_body&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession}))
  280.                 meta = xml.dom.minidom.parse(r)
  281.             except Exception as x:
  282.                 print("*** Error fetching comment body, possibly not community maintainer?")
  283.                 print("***", x)
  284.                 break
  285.         finally:
  286.             r.close()
  287.         for c in meta.getElementsByTagName("comment"):
  288.             id = int(c.getAttribute("id"))
  289.             jitemid = c.getAttribute("jitemid")
  290.             comment = {
  291.                 'id': str(id),
  292.                 'parentid': c.getAttribute("parentid"),
  293.                 'subject': gettext(c.getElementsByTagName("subject")),
  294.                 'date': gettext(c.getElementsByTagName("date")),
  295.                 'body': gettext(c.getElementsByTagName("body")),
  296.                 'state': metacache[id]['state'],
  297.             }
  298.             if c.getAttribute("posterid") in usermap:
  299.                 comment["user"] = usermap[c.getAttribute("posterid")]
  300.             try:
  301.                 entry = xml.dom.minidom.parse("%s/C-%s" % (Journal, jitemid))
  302.             except:
  303.                 entry = xml.dom.minidom.getDOMImplementation().createDocument(None, "comments", None)
  304.             found = False
  305.             for d in entry.getElementsByTagName("comment"):
  306.                 if int(d.getElementsByTagName("id")[0].firstChild.nodeValue) == id:
  307.                     found = True
  308.                     break
  309.             if found:
  310.                 print("Warning: downloaded duplicate comment id %d in jitemid %s" % (id, jitemid))
  311.             else:
  312.                 entry.documentElement.appendChild(createxml(entry, "comment", comment))
  313.                 f = codecs.open("%s/C-%s" % (Journal, jitemid), "w", "UTF-8")
  314.                 entry.writexml(f)
  315.                 f.close()
  316.                 newcomments += 1
  317.             if id > maxid:
  318.                 maxid = id
  319.         if maxid >= newmaxid:
  320.             break
  321.  
  322.     lastmaxid = maxid
  323.  
  324.     writelast(Journal, lastsync, lastmaxid)
  325.  
  326.     if Username == Journal:
  327.         print("Fetching userpics for: %s" % Username)
  328.         f = codecs.open("%s/userpics.xml" % Username, "w", "UTF-8")
  329.         print("""<?xml version="1.0"?>""", file=f)
  330.         print("<userpics>", file=f)
  331.         for p in userpics:
  332.             print("""<userpic keyword="%s" url="%s" />""" % (str(bytes(p,"latin"),"utf-8"), userpics[p]), file=f)
  333.             pic = urllib.request.urlopen(userpics[p])
  334.             ext = MimeExtensions.get(pic.info()["Content-Type"], "")
  335.             picfn = re.sub(r'[*?\\/:<>"|]', "_", p)
  336.             try:
  337.                 picfn = str(bytes(picfn,"latin"),"utf-8")
  338.                 picf = open("%s/%s%s" % (Username, picfn, ext), "wb")
  339.             except:
  340.                 # for installations where the above utf_8_decode doesn't work
  341.                 picfn = "".join([ord(x) < 128 and x or "_" for x in picfn])
  342.                 picf = open("%s/%s%s" % (Username, picfn, ext), "wb")
  343.             shutil.copyfileobj(pic, picf)
  344.             pic.close()
  345.             picf.close()
  346.         print("</userpics>", file=f)
  347.         f.close()
  348.  
  349.     if origlastsync:
  350.         print("%d new entries, %d new comments (since %s)" % (newentries, newcomments, origlastsync))
  351.     else:
  352.         print("%d new entries, %d new comments" % (newentries, newcomments))
  353.     if errors > 0:
  354.         print("%d errors" % errors)
  355.  
  356. if __name__ == "__main__":
  357.     if os.access("ljdump.config", os.F_OK):
  358.         config = xml.dom.minidom.parse("ljdump.config")
  359.         server = config.documentElement.getElementsByTagName("server")[0].childNodes[0].data
  360.         username = config.documentElement.getElementsByTagName("username")[0].childNodes[0].data
  361.         password = config.documentElement.getElementsByTagName("password")[0].childNodes[0].data
  362.         journals = config.documentElement.getElementsByTagName("journal")
  363.         if journals:
  364.             for e in journals:
  365.                 ljdump(server, username, password, e.childNodes[0].data)
  366.         else:
  367.             ljdump(server, username, password, username)
  368.     else:
  369.         from getpass import getpass
  370.         print("ljdump - livejournal archiver")
  371.         print()
  372.         print("Enter your Livejournal username and password.")
  373.         print()
  374.         server = "http://livejournal.com"
  375.         username = input("Username: ")
  376.         password = getpass("Password: ")
  377.         print()
  378.         print("You may back up either your own journal, or a community.")
  379.         print("If you are a community maintainer, you can back up both entries and comments.")
  380.         print("If you are not a maintainer, you can back up only entries.")
  381.         print()
  382.         journal = input("Journal to back up (or hit return to back up '%s'): " % username)
  383.         print()
  384.         if journal:
  385.             ljdump(server, username, password, journal)
  386.         else:
  387.             ljdump(server, username, password, username)
  388. # vim:ts=4 et:
# (End of paste. The Pastebin page footer followed here:
#  "Add Comment" / "Please, Sign In to add comment".)