Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# coding=utf-8
'''
Markov text generator
'''
from itertools import islice
from random import random
import re
import time
import bpython  # NOTE(review): appears unused in this file — possibly a debugging aid; verify
from BeautifulSoup import BeautifulSoup
import soupselect
import httplib2

# Shared HTTP client reused by get_page() for all requests.
http = httplib2.Http()

# Number of consecutive tokens forming one Markov tuple.
TUPLE_LEN = 5

import pymongo
connection = pymongo.Connection()

# Pick the blog to model — one MongoDB database per blog.
#db = connection.blog
#BLOG_URL = 'http://bunyk.wordpress.com/'
#db = connection.danbst
#BLOG_URL = 'http://danbst.wordpress.com/'
db = connection.yulka
BLOG_URL = 'http://yulkahrytsenko.wordpress.com/'

# Delay (seconds) between HTTP requests, to be polite to the server.
TROTTLE = 1.0
def get_page(url):
    """Return the body of *url*, using the MongoDB page cache.

    On a cache miss the page is fetched over HTTP (throttled by TROTTLE),
    decoded as UTF-8 and stored in db.pages.  Returns None when the server
    answers with a non-200 status.  May raise httplib2.ServerNotFoundError.
    """
    page = db.pages.find_one({'url': url})
    text = page.get('text') if page else None
    if text:
        # Cache hit: no network access, so no need to throttle.
        return text
    # BUG FIX: the original slept *before* the cache lookup, throttling
    # even fully-cached crawls.  Only real HTTP requests need the delay.
    time.sleep(TROTTLE)
    headers, text = http.request(url)
    if headers.status != 200:
        return None
    text = unicode(text, 'utf-8')
    db.pages.insert({
        'url': url,
        'text': text
    })
    return text
def get_links(url):
    """Return the cached outgoing links for *url* (None if page unknown)."""
    cached = db.pages.find_one({'url': url})
    return cached.get('links') if cached else None
def update_links(url, links):
    """Persist the list of extracted links on an already-cached page."""
    spec = {'url': url}
    change = {'$set': {'links': links}}
    db.pages.update(spec, change)
def is_visited(url):
    """True/False when the page is known; implicit None for unknown URLs."""
    doc = db.pages.find_one({'url': url})
    if doc:
        return bool(doc.get('visited'))
def visited(url):
    """Mark *url* as fully processed in the page cache."""
    db.pages.update({'url': url}, {'$set': {'visited': True}})
- def get_posts_list(url):
- post_url_re = re.compile(url + r'\d{4}/\d{2}/\d{2}/[^/"]*/')
- # url/yyyy/mm/dd/post-slug/
- def visit(url):
- if is_visited(url):
- return # already visited
- print url
- try:
- text = get_page(url)
- except httplib2.ServerNotFoundError:
- print 'Server not found. Wait 20 secs for retry...'
- time.sleep(20)
- return visit(url)
- if not text:
- print 'No page!'
- return
- links = get_links(url)
- if not links:
- links = list(set(post_url_re.findall(text)))
- update_links(url, links)
- visited(url)
- for link in links:
- visit(link)
- visit(url)
def clean_html(html):
    """Strip HTML markup from *html*, returning whitespace-normalized text."""
    text = html.strip()
    # Inline JavaScript / CSS goes first — its body may contain '<' and '>'.
    text = re.sub(u"(?is)<(script|style).*?>.*?(</\\1>)", u"", text)
    # HTML comments next: they can contain '>' characters that would
    # confuse the generic tag-removal pass below.
    text = re.sub(u"(?s)<!--(.*?)-->[\\n]?", u"", text)
    # Remaining tags become spaces so adjacent words do not run together.
    text = re.sub(u"(?s)<.*?>", u" ", text)
    # Finally, normalize whitespace.  (The pastebin scrape decoded the
    # entity here; the pattern is reconstructed as the literal '&nbsp;'.)
    text = re.sub(u"&nbsp;", u" ", text)
    text = re.sub(u"\\s+", u" ", text)
    return text.strip()
def get_post_text(post):
    """Extract the article body from a cached page document.

    *post* is a MongoDB document with a 'text' field holding raw HTML.
    Returns cleaned plain text, or '' when no article container is found
    or the page looks like an index/archive rather than a single post.
    """
    page = post['text']
    parsed = BeautifulSoup(page)
    try:
        # One theme keeps the article in div.main ...
        main = soupselect.select(parsed, 'div.main')[0]
    except IndexError:
        try:
            # ... the other in div.entrytext, with widgets to strip out.
            main = soupselect.select(parsed, 'div.entrytext')[0]
            share = soupselect.select(main, 'div.sharedaddy')
            if share:
                share[0].extract()
            metadata = soupselect.select(main, 'div.postmetadata')
            if metadata:
                metadata[0].extract()
        except IndexError:
            # NOTE(review): 'page' here is the raw HTML *string*, not a
            # document spec, and pages are stored in db.pages while this
            # removes from db.posts — this call likely matches nothing.
            # Verify whether db.pages.remove({'url': post['url']}) was meant.
            db.posts.remove(page)
            return ''
    text = clean_html(unicode(main))
    # Heuristic: >1000 sentence dots means an archive page, not a post.
    if text.count(u'.') > 1000:
        return ''
    else:
        return text
# One token is: a word (letters, digits, backtick, hyphen), a single
# punctuation character, or a run of whitespace.  re.U makes \w and \s
# cover Unicode letters (the blogs are in Ukrainian).
TOKENIZE_REGEX = re.compile(u'([\\w`-]+|[^\\w\\s]|\\s+)', re.U)

def tokenize(text):
    """Lazily yield word / punctuation / whitespace tokens of *text*."""
    for match in TOKENIZE_REGEX.finditer(text):
        yield match.group(1)
def dict_array(a):
    ''' return dictionary with keys as array indexes '''
    return {str(index): item for index, item in enumerate(a)}
def add_tuple(*args):
    """Increment the usage counter for this token tuple (upsert)."""
    spec = dict_array(args)
    db.tuples.update(spec, {'$inc': {'usage': 1}}, upsert=True)
def get_next(*args):
    """Pick the next token after the (TUPLE_LEN-1)-token context *args*.

    Candidates are drawn from db.tuples, weighted by their 'usage' count.
    Returns None when the context has no recorded continuation.
    """
    to = str(len(args))
    variants = db.tuples.find(
        dict_array(args),
        {to: 1, 'usage': 1}
    )
    variants = [[v[to], v['usage']] for v in variants]
    if not variants:
        return None
    # Turn usage counts into a cumulative distribution in place.
    for i in range(1, len(variants)):
        variants[i][1] += variants[i - 1][1]
    finger = random() * variants[-1][1]
    # First bucket whose cumulative weight reaches the finger wins.
    for token, cumulative in variants:
        if finger <= cumulative:
            return token
    # Floating-point edge case (finger == total but missed above).
    # BUG FIX: the original returned v[-1] — the cumulative *count*,
    # an int, instead of the token itself.
    return variants[-1][0]
- def compute_tuples():
- db.tuples.ensure_index([
- (str(key), 1) for key in range(TUPLE_LEN)
- ])
- count = db.pages.count()
- for num, post in enumerate(db.pages.find()):
- print '%d/%d' % (num, count), post['url']
- a = [None] * TUPLE_LEN
- for word in tokenize(get_post_text(post)):
- a.pop(0)
- a.append(word)
- add_tuple(*a)
def gen_text():
    """Yield generated tokens by walking the Markov chain from a cold start.

    The context begins as all-None padding (matching how compute_tuples
    pads the start of each document) and slides forward one token at a
    time.  The generator stops when the chain has no continuation.
    """
    db.tuples.ensure_index([
        (str(key), 1) for key in range(TUPLE_LEN - 1)
    ])
    context = [None] * (TUPLE_LEN - 1)
    while True:
        # IDIOM FIX: the original named this 'next', shadowing the builtin,
        # and carried a dead 'next = True' initializer.
        token = get_next(*context)
        if not token:
            return
        yield token
        context.pop(0)
        context.append(token)
- if __name__=="__main__":
- #get_posts_list(BLOG_URL)
- #compute_tuples()
- print ''.join(islice(gen_text(), 0, 1000))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement