Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import mimetypes
import os
import re
import urllib
import xmlrpclib
from datetime import datetime
from pprint import pprint
from urllib import urlretrieve
from xmlrpclib import Binary as binary

from BeautifulSoup import BeautifulSoup, SoupStrainer
def unique(seq, idfun=None):
    """Return the items of *seq* in first-seen order, without duplicates.

    Args:
        seq: Any iterable of items.
        idfun: Optional callable mapping an item to the key used to decide
            whether two items are duplicates. Defaults to the item itself.
            The keys must be hashable.

    Returns:
        A new list holding the first item seen for each distinct key.
    """
    if idfun is None:
        def idfun(x):
            return x
    seen = set()
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen:
            continue
        seen.add(marker)
        result.append(item)
    return result
def _fetch(url):
    """Return the raw response body of *url*, always closing the connection."""
    handle = urllib.urlopen(url)
    try:
        return handle.read()
    finally:
        handle.close()


# get wordpress info
wordpress = raw_input('Wordpress URL:')
user = raw_input('Username:')
# NOTE(review): raw_input echoes the password to the terminal.
password = raw_input('Password:')
blogg_url = raw_input('blogg.se URL:')
# Tolerate a trailing slash so the endpoint never becomes ".../​/xmlrpc.php".
server = xmlrpclib.ServerProxy(wordpress.rstrip('/') + '/xmlrpc.php')
# Category and image paths are appended to the blog URL below, so make sure
# the base ends with exactly one slash regardless of how it was typed.
blogg_base = blogg_url.rstrip('/') + '/'

# fetch html
s = _fetch(blogg_url)

# Use BeautifulSoup and keep only the links whose href starts with "category/"
catlinks = SoupStrainer('a', href=re.compile(r'^category/'))
cats = BeautifulSoup(s, parseOnlyThese=catlinks)
category = {}
for cat in cats:
    if cat.string is None:
        # No plain-text label to use as the category name.
        print('Skipping category link without a text label: %r' % cat["href"])
        continue
    # '' + ... presumably turns BeautifulSoup's string wrapper into a plain
    # string so it can be sent over XML-RPC -- TODO confirm.
    category['' + cat.string] = cat["href"]

# select blog id
blog_id = 0

# Compiled once; matches image sources that start with "../".
parent_relative_src = re.compile(r'^\.\./')
entrymeta = {"class": "entrymeta"}

for cat, cat_href in category.items():
    # Create new wordpress category
    new_category = {'name': cat, 'slug': cat.replace(' ', '-'), 'description': ''}
    server.wp.newCategory(blog_id, user, password, new_category)

    # "Generate" URL for the blogg.se category page, fetch it and soup it
    category_url = blogg_base + cat_href
    soup = BeautifulSoup(_fetch(category_url))

    # ok, let's start at the info-div
    info = soup.find("div", {"id": "info"})
    if info is None:
        print('Skipping category %r: no div with id "info" on %r' % (cat, category_url))
        continue

    # The first entrymeta div after the info-div is the timestamp of post #1
    node = info.findNextSibling("div", entrymeta)
    while node is not None:
        # Get the timestamp, minus line breaks and surrounding whitespace
        entrytime = node.string.replace('\n', '').strip()

        # Every post consists of timestamp, header (h3) and then a body
        heading = node.findNextSibling("h3")
        entry = None
        if heading is not None:
            entry = heading.findNextSibling("div", {"class": "entrybody"})
        if entry is None:
            print('Stopping category %r: incomplete post after timestamp %r' % (cat, entrytime))
            break
        title = '' + heading.string

        # Re-host every "../"-relative image of the body on wordpress
        for img in entry.findAll('img', {"src": parent_relative_src}):
            src = img["src"]
            filename = src.split("/")[-1]
            blogg_path_to_img = blogg_base + src.replace('../', '')
            # Read the image bytes straight from the response; no local
            # scratch directory is needed.
            image = _fetch(blogg_path_to_img)
            content = {
                "name": filename,
                # Guess the type from the file name; fall back to JPEG.
                "type": mimetypes.guess_type(filename)[0] or "image/jpeg",
                "bits": binary(image),
            }
            result = server.metaWeblog.newMediaObject(blog_id, user, password, content)
            # Point the body at the uploaded copy
            img["src"] = result["url"]
        body = entry.prettify()

        # Set wordpress meta-info
        blog_content = {
            'title': title,
            'description': body,
            'categories': [cat],
            'dateCreated': datetime.strptime(entrytime, '%Y-%m-%d %H:%M:%S'),
            'mt_convert_breaks': '0',
        }
        # Post this pig
        post_id = int(server.metaWeblog.newPost(blog_id, user, password, blog_content, 1))

        # Get ready for the next post: one entrymeta div after the body is
        # skipped (presumably a per-post footer -- TODO confirm against the
        # blogg.se markup); the one after it is the next post's timestamp.
        footer = entry.findNextSibling("div", entrymeta)
        node = None
        if footer is not None:
            node = footer.findNextSibling("div", entrymeta)
Add Comment
Please, Sign In to add comment