Guest User

Untitled

a guest
Jun 23rd, 2018
167
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.53 KB | None | 0 0
  1. import urllib
  2. import re
  3. from BeautifulSoup import BeautifulSoup, SoupStrainer
  4. import xmlrpclib
  5. from xmlrpclib import Binary as binary
  6. from urllib import urlretrieve
  7. from pprint import pprint;
  8. from datetime import datetime
  9.  
  10. import os
  11.  
  12. def unique(seq, idfun=None):
  13. if idfun is None:
  14. def idfun(x): return x
  15. seen = {}
  16. result = []
  17. for item in seq:
  18. marker = idfun(item)
  19. if marker in seen: continue
  20. seen[marker] = 1
  21. result.append(item)
  22. return result
  23.  
  24. # get wordpress info
  25. wordpress = raw_input('Wordpress URL:')
  26. user = raw_input('Username:')
  27. password = raw_input('Password:')
  28. blogg_url = raw_input('blogg.se URL:')
  29.  
  30. server = xmlrpclib.ServerProxy(wordpress + '/xmlrpc.php')
  31.  
  32. # fetch html
  33. f = urllib.urlopen(blogg_url)
  34. s = f.read()
  35. f.close()
  36.  
  37. # Use Beautifulsoup and mask out all links to "category"
  38. catlinks = SoupStrainer('a', href=re.compile('^category\/'))
  39. cats = BeautifulSoup(s, parseOnlyThese=catlinks)
  40.  
  41. category = {}
  42. for cat in cats:
  43. category[''+cat.string] = cat["href"]
  44.  
  45. # select blog id
  46. blog_id = 0
  47.  
  48.  
  49. for cat in category.keys():
  50. # Create new wordpress category
  51. new_category = {'name' : cat, 'slug' : cat.replace(' ', '-'), 'description' : ''}
  52. server.wp.newCategory(blog_id, user, password, new_category)
  53.  
  54. # "Generate" URL for blogg.se category page
  55. category_url = blogg_url + category[cat]
  56.  
  57. # Fetch category page
  58. doc = urllib.urlopen(category_url).read();
  59.  
  60. # soup it!
  61. soup = BeautifulSoup(doc)
  62.  
  63. # ok, let's start at the info-div
  64. s = soup.find("div", {"id" : "info"})
  65.  
  66. # Closing in on the loop, find the first entrymeta class, this is the timestamp of post #1
  67. node = s.findNextSibling("div", {"class" : "entrymeta"})
  68.  
  69. # and ... LOOP
  70. while (node != None):
  71. # Get the timestamp, minus line breaks
  72. entrytime = node.string.replace('\n', '')
  73.  
  74. # Every post consists of timestamp, header (h3) and then a body, here's the header
  75. node = node.findNextSibling("h3");
  76. title = '' + node.string
  77.  
  78. # Now for the body
  79. node = node.findNextSibling("div", {"class" : "entrybody"})
  80.  
  81. # Let's search for images in the body, download them locally and then upload to wordpress
  82. for img in node.findAll('img', {"src" : re.compile("^\.\.\/")}):
  83. # extract url to image
  84. filename = img["src"].split("/")[-1]
  85. blogg_path_to_img = blogg_url + img["src"].replace('../', '')
  86.  
  87. # fetch image, @TODO: no hard coded path here, thanks!
  88. urlretrieve(blogg_path_to_img, os.path.join("/test/", filename))
  89.  
  90. # GET image and store locally @TODO: no hard coded path here, thanks!
  91. file = open(os.path.join("/test/", filename), 'rb')
  92. image = file.read()
  93. file.close()
  94.  
  95. # Upload image to wordpress
  96. encodedImage = binary(image)
  97. content = {"name": filename, "type":"image/jpeg", "bits":encodedImage}
  98. result = server.metaWeblog.newMediaObject(blog_id, user, password, content)
  99.  
  100. # Change path to image in the body
  101. img["src"] = result["url"]
  102.  
  103. body = node.prettify()
  104. # Finish the post parsing, get ready for next post
  105. node = node.findNextSibling("div", {"class" : "entrymeta"})
  106. node = node.findNextSibling("div", {"class" : "entrymeta"})
  107.  
  108. # Set wordpress meta-info
  109. blog_content = { 'title' : title, 'description' : body, 'categories' : [cat],
  110. 'dateCreated' : datetime.strptime(entrytime, '%Y-%m-%d %H:%M:%S'), 'mt_convert_breaks' : '0' }
  111.  
  112. # Post this pig
  113. post_id = int(server.metaWeblog.newPost(blog_id, user, password, blog_content, 1))
  114. #server.mt.publishPost(post_id, user, password)
Add Comment
Please, Sign In to add comment