Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- from disco.core import Job
- from disco.util import kvgroup
- from disco.schemes import scheme_disco
- from disco.worker.classic.func import chain_reader
- from wikisax import simple_sax
class CountBackLinks(Job):
    """Disco job that counts how many wiki links point at each page.

    ``map`` emits one ``(link_target, source_title)`` pair per wiki link
    found inside a page's ``<text>`` element; ``reduce`` groups those pairs
    by link target and yields the number of pairs per target.
    """

    # Input pipeline applied in order: disco's scheme reader, chain_reader,
    # then the project's simple_sax stream.
    map_input_stream = (staticmethod(scheme_disco.input_stream),
                        staticmethod(chain_reader),
                        simple_sax,)

    @staticmethod
    def map(page, params):
        """Yield ``(link_target, page_title)`` for every wiki link in *page*.

        :param page: the page markup as a single string; it is split on
            newlines and scanned line by line.
        :param params: job parameters (unused).

        Lines before the first ``<title>...</title>`` match are ignored, as
        is the title line itself.  After that, every line from the one
        containing ``<text`` through the one containing ``</text>`` (both
        inclusive) is scanned for ``[[target]]`` / ``[[target|label]]``
        links.  Pages whose title never matches yield nothing.
        """
        # NOTE(review): the patterns are compiled inside the function, as in
        # the original -- presumably so ``map`` stays self-contained when
        # disco ships it to workers; confirm before hoisting them.
        #
        # Raw string avoids the invalid ``\w`` string escape; the character
        # class now accepts spaces so it agrees with what ``link_reg``
        # accepts as a link target (multi-word titles used to be dropped).
        title_reg = re.compile(r'<title>([\w ]+)</title>')
        text_start_reg = re.compile('<text')
        text_end_reg = re.compile('</text>')
        link_reg = re.compile(r'\[\[(?:[\w ]+?\|)?([\w ]+?)\]\]')

        title = None
        in_text = False
        for line in page.split('\n'):
            if title is None:
                match = title_reg.search(line)
                if match:
                    title = match.group(1)
                continue

            if not in_text:
                if not text_start_reg.search(line):
                    continue
                in_text = True

            # Scan the opening- and closing-tag lines too: article text may
            # share a line with ``<text ...>`` or ``</text>``, and the tags
            # themselves cannot match the link pattern.
            for target in link_reg.findall(line):
                yield target, title

            if text_end_reg.search(line):
                in_text = False

    @staticmethod
    def reduce(iter, params):
        """Yield ``(page, count)`` where *count* is the number of pairs
        received for *page*.

        :param iter: iterable of ``(link_target, source_title)`` pairs.  The
            name shadows the builtin but is kept for interface compatibility.
        :param params: job parameters (unused).
        """
        # Sort first so that equal keys are adjacent when kvgroup groups them.
        for page, links in kvgroup(sorted(iter)):
            yield page, sum(1 for _ in links)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement