# Untitled

import mimetypes
import os
import re
import urllib
import xmlrpclib
from datetime import datetime
from pprint import pprint
from urllib import urlretrieve
from xmlrpclib import Binary as binary

from BeautifulSoup import BeautifulSoup, SoupStrainer

def unique(seq, idfun=None):
    """Return the items of ``seq`` without duplicates, in first-seen order.

    ``idfun`` maps each item to the key used to detect duplicates; when it is
    omitted the item itself is the key. Keys must be hashable. Of several
    items sharing a key, only the first one encountered is kept.
    """
    key_of = idfun if idfun is not None else (lambda value: value)
    seen_keys = set()
    kept = []
    for element in seq:
        key = key_of(element)
        if key not in seen_keys:
            seen_keys.add(key)
            kept.append(element)
    return kept

# Ask for the WordPress endpoint, its credentials and the source blog address.
# NOTE: raw_input echoes the password on the terminal as it is typed.
wordpress = raw_input('Wordpress URL:')
user = raw_input('Username:')
password = raw_input('Password:')
blogg_url = raw_input('blogg.se URL:')

# XML-RPC client used for every WordPress call in this script.
server = xmlrpclib.ServerProxy(wordpress + '/xmlrpc.php')

# Fetch the blog's front page. try/finally guarantees the handle is closed
# even when read() raises.
f = urllib.urlopen(blogg_url)
try:
    s = f.read()
finally:
    f.close()

# Use BeautifulSoup and keep only the links whose href starts with "category/".
catlinks = SoupStrainer('a', href=re.compile(r'^category\/'))
cats = BeautifulSoup(s, parseOnlyThese=catlinks)

# Map category name -> relative URL of the category page.
category = {}
for cat in cats:
    name = cat.string
    if name is None:
        # .string is None when the link holds more than a single text node
        # (e.g. nested markup); fall back to all of the text inside the link.
        name = ''.join(cat.findAll(text=True))
    if not name:
        # A link without any text cannot be turned into a category name.
        continue
    # Concatenating with '' yields a plain string instead of the BeautifulSoup
    # string object -- presumably so xmlrpclib can marshal it later; confirm.
    category['' + name] = cat["href"]

# Blog id passed to the XML-RPC calls; hard-coded to 0.
blog_id = 0


# Migrate every category: create it in WordPress, then walk the blogg.se
# category page post by post, re-hosting embedded images and publishing each
# post through the metaWeblog API.

# Directory where images are staged before upload.
# @TODO: no hard coded path here, thanks!
download_dir = "/test/"

# Category hrefs are relative ("category/..."), so the base URL must end with
# a slash for plain concatenation to give a valid address.
base_url = blogg_url if blogg_url.endswith('/') else blogg_url + '/'

# Matches image sources that are relative to the category page ("../...").
relative_src = re.compile(r"^\.\.\/")

for cat, cat_href in category.items():
    # Create new wordpress category
    new_category = {'name': cat, 'slug': cat.replace(' ', '-'), 'description': ''}
    server.wp.newCategory(blog_id, user, password, new_category)

    # "Generate" URL for blogg.se category page
    category_url = base_url + cat_href

    # Fetch category page; close the handle even if read() fails.
    page = urllib.urlopen(category_url)
    try:
        doc = page.read()
    finally:
        page.close()

    # soup it!
    soup = BeautifulSoup(doc)

    # ok, let's start at the info-div
    info_div = soup.find("div", {"id": "info"})

    # The first entrymeta div after it holds the timestamp of post #1.
    node = info_div.findNextSibling("div", {"class": "entrymeta"})

    # Every post consists of timestamp (entrymeta), header (h3) and body
    # (entrybody), found as consecutive siblings.
    while node is not None:
        # Timestamp without line breaks; surrounding whitespace is stripped
        # as well because strptime below rejects it.
        entrytime = node.string.replace('\n', '').strip()

        # Header of the post
        node = node.findNextSibling("h3")
        title = '' + node.string

        # Now for the body
        node = node.findNextSibling("div", {"class": "entrybody"})

        # Download each relative image locally, upload it to WordPress and
        # point the <img> at the uploaded copy.
        for img in node.findAll('img', {"src": relative_src}):
            # extract url to image
            filename = img["src"].split("/")[-1]
            blogg_path_to_img = base_url + img["src"].replace('../', '')

            # fetch image and store it locally
            local_path = os.path.join(download_dir, filename)
            urlretrieve(blogg_path_to_img, local_path)

            # Read it back; the context manager closes the file on any exit.
            with open(local_path, 'rb') as image_file:
                image = image_file.read()

            # Derive the MIME type from the file name instead of labelling
            # everything as JPEG; keep JPEG as the fallback for unknown or
            # non-image extensions.
            mime_type = mimetypes.guess_type(filename)[0]
            if not mime_type or not mime_type.startswith("image/"):
                mime_type = "image/jpeg"

            # Upload image to wordpress
            content = {"name": filename, "type": mime_type, "bits": binary(image)}
            result = server.metaWeblog.newMediaObject(blog_id, user, password, content)

            # Change path to image in the body
            img["src"] = result["url"]

        body = node.prettify()

        # Get ready for the next post. The original code skips one entrymeta
        # div between posts -- presumably a per-post footer; confirm against
        # the blogg.se markup. Guard the second lookup so a missing div ends
        # the loop instead of raising AttributeError before this post has
        # been published.
        node = node.findNextSibling("div", {"class": "entrymeta"})
        if node is not None:
            node = node.findNextSibling("div", {"class": "entrymeta"})

        # Set wordpress meta-info
        blog_content = {
            'title': title,
            'description': body,
            'categories': [cat],
            'dateCreated': datetime.strptime(entrytime, '%Y-%m-%d %H:%M:%S'),
            'mt_convert_breaks': '0',
        }

        # Post this pig (the trailing 1 is passed as newPost's last argument,
        # presumably the publish flag -- confirm against the metaWeblog API).
        post_id = int(server.metaWeblog.newPost(blog_id, user, password, blog_content, 1))