# harvest_microformat.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Harvesting microformats from HTML to Wikibase script.

Usage:

python pwb.py scripts/harvest_html.py -transcludes:"..."

These command line parameters can be used to specify which pages to work on:

&params;

"""
#
# (C) Pywikibot team, 2013-2015
#
# Distributed under the terms of MIT License.
#
__version__ = '$Id$'
#

import re
import pywikibot
from pywikibot import pagegenerators as pg, WikidataBot
from mf2py.parser import Parser

# Maps the '&params;' placeholder found in the module docstring to the
# page-generator parameter help text.
# NOTE(review): presumably substituted by pywikibot when showing -help;
# confirm against the framework.
docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}


class HarvestHTMLRobot(WikidataBot):

    """A bot to parse microformats and add Wikidata claims."""

    def __init__(self, generator):
        """
        Constructor.

        @param generator: A generator that yields Page objects

        """
        super(HarvestHTMLRobot, self).__init__()
        # Wrap the caller's generator in pagegenerators.PreloadingGenerator.
        self.generator = pg.PreloadingGenerator(generator)
        self.cacheSources()

    # I left this method signature in case its analogue
    # may be somehow useful once implemented for pages
    def getTemplateSynonyms(self, title):
        """Fetch redirects of the title, so we can check against them."""
        pass

    def _page_link_target(self, item, link_text):
        """
        Resolve a wikilink to the Wikidata item of the page it points to.

        Redirects are followed before the item lookup.

        @param item: the item being edited; a link that resolves to this
            same item is rejected
        @param link_text: title text taken from the wikilink
        @return: the linked ItemPage, or None when the page does not exist,
            has no Wikidata item, or is C{item} itself
        """
        linked_page = pywikibot.Page(pywikibot.Link(link_text))

        if not linked_page.exists():
            pywikibot.output(u'%s doesn\'t exist so it can\'t be linked. Skipping' % (linked_page))
            return

        if linked_page.isRedirectPage():
            linked_page = linked_page.getRedirectTarget()

        try:
            linked_item = pywikibot.ItemPage.fromPage(linked_page)
        except pywikibot.NoPage:
            linked_item = None

        if not linked_item or not linked_item.exists():
            pywikibot.output(u'%s doesn\'t have a wikidata item to link with. Skipping' % (linked_page))
            return

        if linked_item.title() == item.title():
            pywikibot.output(u'%s links to itself. Skipping' % (linked_page))
            return

        return linked_item

    def _parse_time(self, value):
        """
        Convert a "1706-01-17" style string into a WbTime.

        @param value: date text in YYYY-MM-DD layout
        @return: the WbTime, or None when a component is not an integer
        """
        try:
            # WbTime is given integers, not the raw string slices.
            return pywikibot.WbTime(year=int(value[:4]),
                                    month=int(value[5:7]),
                                    day=int(value[8:]))
        except ValueError:
            return None

    def _claim_target(self, item, claim, field, value):
        """
        Build the target object matching the datatype of C{claim}.

        A message is output whenever no target can be built.

        @param item: the item being edited (used to reject self links)
        @param claim: the claim whose datatype selects the conversion
        @param field: harvested field name, used in messages only
        @param value: harvested raw value
        @return: the target to set on the claim, or None to skip this field
        """
        if claim.type == 'wikibase-item':
            # Try to extract a valid page
            match = re.search(pywikibot.link_regex, value)
            if not match:
                pywikibot.output(u'%s field %s value %s isn\'t a wikilink. Skipping'
                                 % (claim.getID(), field, value))
                return None
            return self._page_link_target(item, match.group(1))

        if claim.type == 'string':
            return value.strip()

        if claim.type == 'commonsMedia':
            commonssite = pywikibot.Site("commons", "commons")
            imagelink = pywikibot.Link(value, source=commonssite, defaultNamespace=6)
            image = pywikibot.FilePage(imagelink)
            if image.isRedirectPage():
                image = pywikibot.FilePage(image.getRedirectTarget())
            if not image.exists():
                pywikibot.output('[[%s]] doesn\'t exist so I can\'t link to it'
                                 % (image.title(),))
                return None
            return image

        if claim.type == 'time':
            # value is expected to have "1706-01-17" format
            target = self._parse_time(value)
            if target is None:
                pywikibot.output(u'%s field %s value %s isn\'t a YYYY-MM-DD date. Skipping'
                                 % (claim.getID(), field, value))
            return target

        # 'globe-coordinate', 'url' and 'quantity' are not handled yet.
        pywikibot.output("%s is not a supported datatype." % claim.type)
        return None

    def treat(self, page, item):
        """
        Process a single page/item.

        Parse the microformats found at the page's URL and add a claim to
        C{item} for every harvested field whose property is not yet set.

        @param page: the page whose HTML is parsed
        @param item: the Wikidata item that receives the claims
        """
        self.current_page = page
        item.get()

        site = page.site
        page_url = (site.protocol() + "://" + site.hostname() +
                    site.nice_get_address(page.title(asUrl=True)))
        parsed_props = Parser(url=page_url).to_dict()

        # NOTE(review): the original author intended to replace parsed_props
        # with parsed_props["items"][0]["properties"]; the values returned by
        # to_dict() may not be plain strings, in which case .strip() below
        # raises AttributeError -- confirm against mf2py's output format.
        # A fresh dict is built so keys are never added while iterating and
        # no unstripped key is left behind.
        parsed_props = dict((field.strip(), value.strip())
                            for field, value in parsed_props.items())

        for field, value in parsed_props.items():
            # NOTE(review): the harvested value is passed as the property ID
            # here; a field-to-property mapping is presumably needed --
            # verify the intended design before relying on this.
            claim = pywikibot.Claim(self.repo, value)

            # Check if the property isn't already set
            if claim.getID() in item.get().get('claims'):
                pywikibot.output(u'A claim for %s already exists. Skipping'
                                 % claim.getID())
                # TODO: Implement smarter approach to merging
                # harvested values with existing claims esp.
                # without overwriting humans unintentionally.
                continue

            target = self._claim_target(item, claim, field, value)
            if target is None:
                continue

            # The claim keeps its property; only its target is filled in.
            claim.setTarget(target)

            pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget()))
            item.addClaim(claim)
            # A generator might yield pages from multiple sites
            source = self.getSource(page.site)
            if source:
                claim.addSource(source, bot=True)


def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Arguments the generator factory does not recognize are reported and
    ignored.

    @param args: command line arguments
    @type args: list of unicode
    """
    transcludes_prefix = u'-transcludes:'
    page_title = u''

    # Process global args and prepare generator args parser
    local_args = pywikibot.handle_args(args)
    gen = pg.GeneratorFactory()

    for arg in local_args:
        if gen.handleArg(arg):
            if arg.startswith(transcludes_prefix):
                page_title = arg[len(transcludes_prefix):]
        else:
            # Nothing in this script consumes other arguments, so say so
            # instead of dropping them silently.
            pywikibot.output(u'Ignoring unrecognized argument: %s' % arg)

    generator = gen.getCombinedGenerator()
    if not generator:
        # No page generator was selected: fall back to -transcludes.
        # NOTE(review): page_title is presumably always empty here, leaving
        # the factory to decide how to obtain the title -- confirm.
        gen.handleArg(transcludes_prefix + page_title)
        generator = gen.getCombinedGenerator()

    bot = HarvestHTMLRobot(generator)
    bot.run()

# Run the bot only when this file is executed as a script.
if __name__ == "__main__":
    main()