#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Harvest microformats from HTML and add the data to Wikibase.

Usage:

    python pwb.py scripts/harvest_html.py -transcludes:"..."

These command line parameters can be used to specify which pages to work on:

&params;
"""
#
# (C) Pywikibot team, 2013-2015
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'
#
import re

import pywikibot
from pywikibot import pagegenerators as pg, WikidataBot
from mf2py.parser import Parser

docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}


class HarvestHTMLRobot(WikidataBot):

    """A bot to parse microformats and add Wikidata claims."""

    def __init__(self, generator):
        """
        Constructor.

        @param generator: A generator that yields Page objects
        """
        super(HarvestHTMLRobot, self).__init__()
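        # Preload page contents in batches and cache the per-site source items
        # that getSource() attaches to newly added claims.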
        self.generator = pg.PreloadingGenerator(generator)
        self.cacheSources()

    # Kept as a stub: a page-based analogue of the template-synonym lookup
    # may be useful here once it is implemented.
    def getTemplateSynonyms(self, title):
        """Fetch redirects of the title, so we can check against them."""
        pass

    def _page_link_target(self, item, link_text):
        """Return the Wikidata item of the page linked by link_text, if any."""
        link = pywikibot.Link(link_text)
        linked_page = pywikibot.Page(link)

        if not linked_page.exists():
            pywikibot.output(u'%s doesn\'t exist so it can\'t be linked. Skipping'
                             % linked_page)
            return

        if linked_page.isRedirectPage():
            linked_page = linked_page.getRedirectTarget()

        try:
            linked_item = pywikibot.ItemPage.fromPage(linked_page)
        except pywikibot.NoPage:
            linked_item = None

        if not linked_item or not linked_item.exists():
            pywikibot.output(u'%s doesn\'t have a wikidata item to link with. Skipping'
                             % linked_page)
            return

        if linked_item.title() == item.title():
            pywikibot.output(u'%s links to itself. Skipping' % linked_page)
            return

        return linked_item

    def treat(self, page, item):
        """Process a single page/item."""
        self.current_page = page
        item.get()
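        # Build the page's full URL so mf2py can fetch and parse its rendered HTML.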
        page_url = (self.current_page.site.protocol() + "://" +
                    self.current_page.site.hostname() +
                    self.current_page.site.nice_get_address(
                        self.current_page.title(asUrl=True)))
        parser = Parser(url=page_url)
        parsed_props = parser.to_dict()
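        # to_dict() returns the canonical microformats2 structure: a dict with
        # 'items', 'rels' and 'rel-urls'; each item's data lives under 'properties'.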
        # Work on the properties of the first parsed microformat item.
        properties = parsed_props["items"][0]["properties"]
        for field in properties:
            # mf2py wraps every property value in a list; use the first entry.
            value = properties[field][0].strip()
            # The microformat property name is assumed to be a Wikidata
            # property id (e.g. 'P18').
            claim = pywikibot.Claim(self.repo, field)
            # Check whether the property is already set on the item.
            if claim.getID() in item.get().get('claims'):
                pywikibot.output(u'A claim for %s already exists. Skipping'
                                 % claim.getID())
                # TODO: Implement smarter approach to merging
                # harvested values with existing claims esp.
                # without overwriting humans unintentionally.
            else:
                if claim.type == 'wikibase-item':
                    # Try to extract a valid page
                    match = re.search(pywikibot.link_regex, value)
                    if not match:
                        pywikibot.output(u'%s field %s value %s isn\'t a wikilink. Skipping'
                                         % (claim.getID(), field, value))
                        continue

                    link_text = match.group(1)
                    linked_item = self._page_link_target(item, link_text)
                    if not linked_item:
                        continue

                    claim.setTarget(linked_item)
                elif claim.type == 'string':
                    claim.setTarget(value.strip())
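                # Commons media values are resolved to a FilePage on Wikimedia
                # Commons (following redirects) before being set as the target.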
                elif claim.type == 'commonsMedia':
                    commonssite = pywikibot.Site("commons", "commons")
                    imagelink = pywikibot.Link(value, source=commonssite,
                                               defaultNamespace=6)
                    image = pywikibot.FilePage(imagelink)
                    if image.isRedirectPage():
                        image = pywikibot.FilePage(image.getRedirectTarget())
                    if not image.exists():
                        pywikibot.output('[[%s]] doesn\'t exist so I can\'t link to it'
                                         % (image.title(),))
                        continue
                    claim.setTarget(image)
                # elif claim.type == 'globe-coordinate':
                # elif claim.type == 'url':
                elif claim.type == 'time':
                    # value is expected to have "1706-01-17" format
                    wb_time_value = pywikibot.WbTime(year=int(value[:4]),
                                                     month=int(value[5:7]),
                                                     day=int(value[8:10]))
                    claim.setTarget(wb_time_value)
                # elif claim.type == 'quantity':
                else:
                    pywikibot.output("%s is not a supported datatype."
                                     % claim.type)
                    continue
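                # The claim now has a valid target: write it to the item and
                # attach the per-wiki source claim cached by cacheSources().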
                pywikibot.output('Adding %s --> %s'
                                 % (claim.getID(), claim.getTarget()))
                item.addClaim(claim)
                # A generator might yield pages from multiple sites
                source = self.getSource(page.site)
                if source:
                    claim.addSource(source, bot=True)


def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    commandline_arguments = list()
    page_title = u''

    # Process global args and prepare generator args parser
    local_args = pywikibot.handle_args(args)
    gen = pg.GeneratorFactory()

    for arg in local_args:
        if gen.handleArg(arg):
            if arg.startswith(u'-transcludes:'):
                page_title = arg[13:]
        else:
            commandline_arguments.append(arg)

    generator = gen.getCombinedGenerator()
    if not generator:
        gen.handleArg(u'-transcludes:' + page_title)
        generator = gen.getCombinedGenerator()

    bot = HarvestHTMLRobot(generator)
    bot.run()


if __name__ == "__main__":
    main()