Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Fetch a RuneScape news article and print its body converted from HTML to
# wiki-style markup (presumably MediaWiki wikitext -- the output uses
# '''bold''', ''italic'', "*" list items, [url label] links, "----" rules and
# <youtube> tags).
import contextlib
import re

try:
    import urllib2  # Python 2
except ImportError:  # Python 3: the same urlopen() lives in urllib.request
    import urllib.request as urllib2

URL = "http://services.runescape.com/m=news/behind-the-scenes-video-30-player-owned-ports-"

# Byte markers delimiting the article body inside the raw page.
CONTENT_START = b'<div class="Content">'
CONTENT_END = b'</div>'

# Ordered (pattern, replacement) rewrite rules. Order matters: line breaks in
# the source HTML are dropped first so that every newline in the result comes
# from a rule below, and the final rule collapses the runs those rules create.
REWRITE_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        ('\n', ''),
        ('</p>', "\n\n"),
        ('<p>', ''),
        (r"</?b>", "'''"),
        (r"</?i>", "''"),
        ('<li>', '*'),
        ('</li>', '\n'),
        (r"</?ul>", "\n"),
        (r'<a.*?href="?([^ "]*)"?.*?>(.*?)<\/a>', r"[\1 \2]"),
        (r'<hr.*?>', "\n----\n"),
        (r'<iframe.*?embed\/(.*?)".*?<\/iframe>', r'<youtube>\1</youtube>\n\n'),
        (r'\n{4,}', '\n\n'),
    )
)


def fetch_page(url):
    """Return the raw bytes served at *url*.

    The response object is closed as soon as the body has been read, so the
    underlying connection is not left open.
    """
    with contextlib.closing(urllib2.urlopen(url)) as response:
        return response.read()


def extract_content(page, start_marker=CONTENT_START, end_marker=CONTENT_END):
    """Return the bytes between *start_marker* and the next *end_marker*.

    The slice ends at the FIRST *end_marker* after the start marker, so a
    nested ``<div>`` inside the content would truncate the result.

    Raises:
        ValueError: if either marker is missing from *page*.
    """
    start = page.find(start_marker)
    if start == -1:
        raise ValueError("content start marker %r not found" % (start_marker,))
    # Skip past the marker itself (derived from its length, not hard-coded).
    body = page[start + len(start_marker):]

    end = body.find(end_marker)
    if end == -1:
        raise ValueError("content end marker %r not found" % (end_marker,))
    return body[:end]


def html_to_wikitext(fragment):
    """Convert an HTML text fragment to wiki-style markup.

    Applies REWRITE_RULES in order. Only the tags covered by those rules are
    rewritten; any other markup is passed through unchanged.
    """
    text = fragment
    for pattern, replacement in REWRITE_RULES:
        text = pattern.sub(replacement, text)
    return text


def main(url=URL):
    """Download *url*, convert its article body and print the result."""
    print("Printing stuffs...")
    raw_content = extract_content(fetch_page(url))
    # Slice the raw bytes first, then decode only the article body.
    print(html_to_wikitext(raw_content.decode("windows-1252")))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement