Advertisement
Guest User

Untitled

a guest
Sep 19th, 2017
60
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.43 KB | None | 0 0
  1. import re
  2.  
  3. from disco.core import Job
  4. from disco.util import kvgroup
  5. from disco.schemes import scheme_disco
  6. from disco.worker.classic.func import chain_reader
  7.  
  8. from wikisax import simple_sax
  9.  
  10. class CountBackLinks(Job):
  11.     map_input_stream = (staticmethod(scheme_disco.input_stream),
  12.                         staticmethod(chain_reader),
  13.                         simple_sax,)
  14.  
  15.     @staticmethod
  16.     def map(page, params):
  17.         title = False
  18.         title_reg = re.compile('<title>([\w]+)</title>')
  19.         text_start_reg = re.compile('<text')
  20.         text_end_reg = re.compile('</text>')
  21.         link_reg = re.compile(r'\[\[(?:[\w ]+?\|)?([\w ]+?)\]\]')
  22.         in_text = False
  23.         for line in page.split('\n'):
  24.             if not title:
  25.                 match = title_reg.search(line)
  26.                 if match:
  27.                     title = match.expand(r'\1')
  28.             else:
  29.                 if in_text:
  30.                     if text_end_reg.search(line):
  31.                         in_text = False
  32.                     else:
  33.                         for match in link_reg.findall(line):
  34.                             yield match,title
  35.                 elif text_start_reg.search(line):
  36.                     in_text = True
  37.     @staticmethod
  38.     def reduce(iter, params):
  39.         for page, links in kvgroup(sorted(iter)):
  40.             total = 0
  41.             for link in links:
  42.                 total += 1
  43.             yield page,total
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement