Markov schizophasia generator

a guest · Mar 2nd, 2012 · Python

# coding=utf-8
'''
   Markov text generator
'''
from itertools import islice
from random import random
import re
import time

from BeautifulSoup import BeautifulSoup
import soupselect

import httplib2
http = httplib2.Http()

# Order of the model: tuples of TUPLE_LEN consecutive tokens are stored,
# so each next token is chosen from the preceding TUPLE_LEN - 1 tokens.
TUPLE_LEN = 5

import pymongo
connection = pymongo.Connection()
#db = connection.blog
#BLOG_URL = 'http://bunyk.wordpress.com/'

#db = connection.danbst
#BLOG_URL = 'http://danbst.wordpress.com/'

db = connection.yulka
BLOG_URL = 'http://yulkahrytsenko.wordpress.com/'

# Delay in seconds between HTTP requests.
THROTTLE = 1.0

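# Overall pipeline (see __main__ at the bottom): crawl the blog and cache
# pages in MongoDB (get_posts_list), slice every post into overlapping token
# tuples (compute_tuples), then walk the tuple table to generate new text
# (gen_text).
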
def get_page(url):
    ''' Return the page text for url, fetching and caching it on a miss. '''
    page = db.pages.find_one({'url': url})
    text = page.get('text') if page else None
    if text:
        return text
    time.sleep(THROTTLE)  # throttle only real HTTP requests, not cache hits
    headers, text = http.request(url)
    if headers.status != 200:
        return None
    text = unicode(text, 'utf-8')
    db.pages.insert({
        'url': url,
        'text': text
    })
    return text

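# A cached page document accumulates fields as the crawler works on it:
# {'url': ..., 'text': ..., 'links': [...], 'visited': True}
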
def get_links(url):
    page = db.pages.find_one({'url': url})
    if page:
        return page.get('links')

def update_links(url, links):
    db.pages.update(
        {'url': url},
        {'$set': {'links': links}}
    )

def is_visited(url):
    page = db.pages.find_one({'url': url})
    if page:
        return bool(page.get('visited'))

def visited(url):
    db.pages.update(
        {'url': url},
        {'$set': {'visited': True}}
    )

def get_posts_list(url):
    # Post permalinks look like url/yyyy/mm/dd/post-slug/
    post_url_re = re.compile(url + r'\d{4}/\d{2}/\d{2}/[^/"]*/')

    def visit(url):
        if is_visited(url):
            return  # already visited
        print url
        try:
            text = get_page(url)
        except httplib2.ServerNotFoundError:
            print 'Server not found. Wait 20 secs for retry...'
            time.sleep(20)
            return visit(url)
        if not text:
            print 'No page!'
            return
        links = get_links(url)
        if not links:
            links = list(set(post_url_re.findall(text)))
            update_links(url, links)
        visited(url)
        for link in links:
            visit(link)
    visit(url)

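# The crawl is a depth-first walk: every page matching the permalink pattern
# (e.g. http://yulkahrytsenko.wordpress.com/2012/03/02/some-post/) is fetched,
# its links are cached, and it is marked visited, so an interrupted crawl
# resumes where it left off. Note the recursion could in principle exceed
# Python's default recursion limit on a heavily interlinked blog.
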
def clean_html(html):
    """ Remove HTML markup from the given string. """
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(ur"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments.
    # This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(ur"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(ur"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(ur"&nbsp;", " ", cleaned)
    cleaned = re.sub(ur"\s+", " ", cleaned)

    return cleaned.strip()

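# For example:
#   clean_html(u'<p>Hello <b>world</b>!<!-- hidden --></p>') == u'Hello world !'
# (tags are replaced with spaces, so punctuation can end up space-separated,
# which the tokenizer below handles anyway)
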
def get_post_text(post):
    ''' Extract the body text of a cached post, stripping theme chrome. '''
    page = post['text']
    parsed = BeautifulSoup(page)
    try:
        main = soupselect.select(parsed, 'div.main')[0]
    except IndexError:
        try:
            main = soupselect.select(parsed, 'div.entrytext')[0]
            share = soupselect.select(main, 'div.sharedaddy')
            if share:
                share[0].extract()
            metadata = soupselect.select(main, 'div.postmetadata')
            if metadata:
                metadata[0].extract()
        except IndexError:
            # No recognisable post body: drop the cached document.
            # (The original passed the raw text to db.posts.remove, which is
            # not a valid pymongo spec; removing by _id from db.pages seems
            # to be what was intended.)
            db.pages.remove({'_id': post['_id']})
            return ''
    text = clean_html(unicode(main))
    # A page with over a thousand sentences is probably an archive or index
    # page rather than a single post; skip it.
    if text.count(u'.') > 1000:
        return ''
    else:
        return text

# A token is a run of word characters (plus ` and -), a single punctuation
# character, or a run of whitespace; keeping whitespace as tokens means
# generated text can be reassembled with ''.join().
TOKENIZE_REGEX = re.compile(ur'([\w`-]+|[^\w\s]|\s+)', re.U)

def tokenize(text):
    return (s.group(1) for s in TOKENIZE_REGEX.finditer(text))

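# For example, list(tokenize(u'Hello, world!')) ==
# [u'Hello', u',', u' ', u'world', u'!'].
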
def dict_array(a):
    ''' Return a dictionary with stringified array indexes as keys,
        e.g. dict_array(['a', 'b']) == {'0': 'a', '1': 'b'}. '''
    return dict((str(k), v) for k, v in enumerate(a))

def add_tuple(*args):
    ''' Upsert one tuple of consecutive tokens, counting its occurrences. '''
    db.tuples.update(
        dict_array(args),
        {'$inc': {'usage': 1}},
        upsert=True
    )

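# Each document in db.tuples keys tokens by position and counts repeats, e.g.
# {'0': u'on', '1': u' ', '2': u'the', '3': u' ', '4': u'mat', 'usage': 3}.
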
def get_next(*args):
    ''' Pick the next token after the given context, weighted by usage. '''
    to = str(len(args))  # position of the next token in a stored tuple
    variants = db.tuples.find(
        dict_array(args),
        {to: 1, 'usage': 1}
    )
    variants = [[v[to], v['usage']] for v in variants]
    if not variants:
        return None
    # Turn usage counts into cumulative sums for roulette-wheel selection.
    for i, v in enumerate(variants):
        if i > 0:
            variants[i][1] += variants[i-1][1]
    finger = random() * variants[-1][1]
    prev = 0
    for v in variants:
        if prev <= finger <= v[1]:
            return v[0]
        prev = v[1]
    return variants[-1][0]  # unreachable fallback; the loop always returns

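# E.g. with variants [[u'cat', 3], [u'dog', 1]] the cumulative sums become
# [3, 4]; a finger in [0, 3] picks u'cat', one in (3, 4] picks u'dog', i.e.
# continuations are sampled in proportion to how often they were seen.
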
def compute_tuples():
    ''' Slide a TUPLE_LEN-token window over every cached post and store
        each window in db.tuples. '''
    db.tuples.ensure_index([
        (str(key), 1) for key in range(TUPLE_LEN)
    ])
    count = db.pages.count()
    for num, post in enumerate(db.pages.find()):
        print '%d/%d' % (num, count), post['url']
        # Start with a window of Nones so the first real tokens form the
        # same leading-null tuples that gen_text seeds its context with.
        a = [None] * TUPLE_LEN
        for word in tokenize(get_post_text(post)):
            a.pop(0)
            a.append(word)
            add_tuple(*a)

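# For the token stream [u'on', u' ', u'the', u' ', u'mat', u'.'] the stored
# windows are (None, None, None, None, u'on'), (None, None, None, u'on', u' '),
# and so on up to (u' ', u'the', u' ', u'mat', u'.').
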
def gen_text():
    ''' Yield tokens by repeatedly sampling a continuation of the last
        TUPLE_LEN - 1 tokens; stops once no continuation is known. '''
    db.tuples.ensure_index([
        (str(key), 1) for key in range(TUPLE_LEN - 1)
    ])
    # Seed the context with Nones, matching the leading windows stored
    # by compute_tuples.
    prev = [None] * (TUPLE_LEN - 1)
    while True:
        token = get_next(*prev)
        if token is None:
            return
        yield token
        prev.pop(0)
        prev.append(token)

if __name__ == "__main__":
    # The crawl and tuple computation only need to run once per blog, since
    # their results persist in MongoDB; uncomment on a fresh database:
    #get_posts_list(BLOG_URL)
    #compute_tuples()
    print ''.join(islice(gen_text(), 0, 1000))
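
# A minimal sketch of interactive use from a Python 2 shell, assuming the
# crawl and compute_tuples() have already populated MongoDB ("markov" is a
# hypothetical module name for this file):
#
#   >>> from itertools import islice
#   >>> import markov
#   >>> print u''.join(islice(markov.gen_text(), 0, 200))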