# coding=utf-8
''' Markov text generator.

Crawls a WordPress blog, caches its pages in MongoDB, counts overlapping
word tuples, and generates new text by random walks over the tuple table.
'''
from itertools import islice
from random import random
import re
import time

from BeautifulSoup import BeautifulSoup
import soupselect
import httplib2
import pymongo

http = httplib2.Http()

TUPLE_LEN = 5  # chain order: TUPLE_LEN - 1 words of context, 1 prediction

connection = pymongo.Connection()
#db = connection.blog
#BLOG_URL = 'http://bunyk.wordpress.com/'
#db = connection.danbst
#BLOG_URL = 'http://danbst.wordpress.com/'
db = connection.yulka
BLOG_URL = 'http://yulkahrytsenko.wordpress.com/'

THROTTLE = 1.0  # seconds to wait between HTTP requests

def get_page(url):
    ''' Return the page text for url, fetching and caching it on a miss. '''
    page = db.pages.find_one({'url': url})
    text = page.get('text') if page else None
    if text:
        return text
    time.sleep(THROTTLE)  # throttle only real HTTP requests, not cache hits
    response, content = http.request(url)
    if response.status != 200:
        return None
    text = unicode(content, 'utf-8')
    db.pages.insert({
        'url': url,
        'text': text
    })
    return text

def get_links(url):
    page = db.pages.find_one({'url': url})
    if page:
        return page.get('links')

def update_links(url, links):
    db.pages.update(
        {'url': url},
        {'$set': {'links': links}}
    )

def is_visited(url):
    page = db.pages.find_one({'url': url})
    if page:
        return bool(page.get('visited'))

def visited(url):
    db.pages.update(
        {'url': url},
        {'$set': {'visited': True}}
    )

def get_posts_list(url):
    ''' Recursively crawl the blog, collecting post URLs of the form
    url/yyyy/mm/dd/post-slug/ '''
    post_url_re = re.compile(re.escape(url) + r'\d{4}/\d{2}/\d{2}/[^/"]*/')

    def visit(url):
        if is_visited(url):
            return  # already visited
        print url
        try:
            text = get_page(url)
        except httplib2.ServerNotFoundError:
            print 'Server not found. Wait 20 secs for retry...'
            time.sleep(20)
            return visit(url)
        if not text:
            print 'No page!'
            return
        links = get_links(url)
        if not links:
            links = list(set(post_url_re.findall(text)))
            update_links(url, links)
        visited(url)
        for link in links:
            visit(link)

    visit(url)
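
# The cached page documents produced by the crawler have this shape (a
# sketch inferred from the inserts/updates above; only 'url' and 'text'
# are always present, 'links' and 'visited' appear as the crawl progresses;
# the example URL is illustrative):
#
#   {'url': u'http://.../2011/05/17/some-post/',
#    'text': u'<html>...',   # raw page HTML cached by get_page()
#    'links': [u'...'],      # post URLs found on this page
#    'visited': True}        # set once this page's links were followed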
def clean_html(html):
    """ Remove HTML markup from the given string. """
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(ur"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing
    # regular tags since comments can contain '>' characters.
    cleaned = re.sub(ur"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(ur"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace:
    cleaned = re.sub(ur"&nbsp;", " ", cleaned)
    cleaned = re.sub(ur"\s+", " ", cleaned)
    return cleaned.strip()

def get_post_text(post):
    ''' Extract the plain text of a post from its cached HTML. '''
    page = post['text']
    parsed = BeautifulSoup(page)
    try:
        main = soupselect.select(parsed, 'div.main')[0]
    except IndexError:
        try:
            main = soupselect.select(parsed, 'div.entrytext')[0]
            # Strip sharing buttons and post metadata from the body:
            share = soupselect.select(main, 'div.sharedaddy')
            if share:
                share[0].extract()
            metadata = soupselect.select(main, 'div.postmetadata')
            if metadata:
                metadata[0].extract()
        except IndexError:
            db.pages.remove({'_id': post['_id']})  # not a post page, drop it
            return ''
    text = clean_html(unicode(main))
    if text.count(u'.') > 1000:
        return ''  # suspiciously long page (archive, not a post), skip it
    else:
        return text

# A token is a word (backticks and hyphens allowed), a single punctuation
# character, or a run of whitespace.
TOKENIZE_REGEX = re.compile(ur'([\w`-]+|[^\w\s]|\s+)', re.U)

def tokenize(text):
    return (s.groups()[0] for s in TOKENIZE_REGEX.finditer(text))

def dict_array(a):
    ''' Return a dictionary with stringified array indexes as keys. '''
    return dict((str(k), v) for k, v in enumerate(a))

def add_tuple(*args):
    ''' Count one more occurrence of the given word tuple. '''
    db.tuples.update(
        dict_array(args),
        {'$inc': {'usage': 1}},
        upsert=True
    )

def get_next(*args):
    ''' Pick the token that follows the given (TUPLE_LEN - 1)-word context,
    weighted by how often each continuation was seen. '''
    to = str(len(args))
    variants = db.tuples.find(
        dict_array(args),
        {to: 1, 'usage': 1}
    )
    variants = [[v[to], v['usage']] for v in variants]
    if not variants:
        return None
    # Turn the usage counts into a cumulative distribution...
    for i, v in enumerate(variants):
        if i > 0:
            variants[i][1] += variants[i - 1][1]
    # ...and sample from it.
    finger = random() * variants[-1][1]
    prev = 0
    for v in variants:
        if prev <= finger <= v[1]:
            return v[0]
        prev = v[1]
    return variants[-1][0]

def compute_tuples():
    ''' Slide a TUPLE_LEN-wide window over every post and count tuples. '''
    db.tuples.ensure_index([
        (str(key), 1) for key in range(TUPLE_LEN)
    ])
    count = db.pages.count()
    for num, post in enumerate(db.pages.find()):
        print '%d/%d' % (num, count), post['url']
        a = [None] * TUPLE_LEN
        for word in tokenize(get_post_text(post)):
            a.pop(0)
            a.append(word)
            add_tuple(*a)

def gen_text():
    ''' Yield tokens by walking the tuple table from an empty context. '''
    db.tuples.ensure_index([
        (str(key), 1) for key in range(TUPLE_LEN - 1)
    ])
    prev = [None] * (TUPLE_LEN - 1)
    while True:
        word = get_next(*prev)
        if word is None:
            return
        yield word
        prev.pop(0)
        prev.append(word)

if __name__ == "__main__":
    #get_posts_list(BLOG_URL)
    #compute_tuples()
    print ''.join(islice(gen_text(), 0, 1000))
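
# Usage sketch: the three stages run in order by uncommenting the calls in
# the __main__ block above; each stage persists its results in MongoDB, so
# later runs can skip the earlier stages:
#
#   get_posts_list(BLOG_URL)                    # 1. crawl and cache the blog
#   compute_tuples()                            # 2. count TUPLE_LEN-grams
#   print ''.join(islice(gen_text(), 0, 1000))  # 3. emit up to 1000 tokens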