dedupe_to_folder.py

#!/usr/bin/env python

"""
Created by Phistrom
2013-05-02

Pulls all picture information from the StromDAM database and uses the hashes,
file locations, and tag information to copy a single file for each hash to
the specified BASE_PICS_DIR. Each file will be placed in a folder corresponding
to its tag information and the tags will be written to the file.

In the event of a crash (which seems frequent with the exempi library that
the Python-XMP-Toolkit relies on), the script can determine where it stopped
at thanks to a text file containing already finished hashes that it writes
in the same directory it's in.
"""

import sys
# Debug aid: dump the module search path at start-up, one entry per line
# (presumably to help diagnose import problems with the packages below).
for pathv in sys.path:
    # The parenthesised form prints the same text under Python 2 and is also
    # valid syntax under Python 3, so tools can at least parse this module.
    print(pathv)


import binascii
import libxmp
import os
import shutil
import stromberg.dam.db_model as dam_model
from stromberg.utils import logger
#import timing

# Root folder under which get_destination_by_tag() builds every destination.
BASE_PICS_DIR = '/media/onealdata1/Pictures'
# Organization folder used when a file has no tag of type 1.
DEFAULT_ORGANIZATION = 'Stromberg'
# Tag type id -> display name. The name is used both as the prefix of
# hierarchical tags ("<name>|<tag>") and as a folder name.
TAG_TYPES = {
    1: 'Organizations',
    2: 'Projects',
    3: 'Products',
    4: 'Materials',
    5: 'Applications',
    6: 'Other',
    7: 'Themes',
    8: 'Loc',
}

# Metadata key names. NOTE(review): IPTC_KEYWORDS and MS_KEYWORDS are not
# referenced anywhere in this file as shown -- confirm before removing.
IPTC_KEYWORDS = 'Iptc.Application2.Keywords'
MS_KEYWORDS = 'Xmp.dc.subject'
# When true, main() runs every source path through convert_filepath().
CONVERSION_NEEDED = True
# (prefix, replacement) pairs tried in order by convert_filepath(); only the
# first matching prefix is applied.
PATH_CONVERT = (
    (r'\\stromberg-fs1\users', '/media/strombergfs1/users',),
    (r'\\stromberg-fs1\general-documents', '/media/strombergfs1/general'),
)
# Progress file listing one finished hash per line. The path is relative, so
# it is resolved against the current working directory of the process.
TEMPFILE_PATH = 'hashes_complete.txt'
# NOTE(review): XMP_HIERARCHY and XMP_DESCRIPTION are not referenced in this
# file as shown -- confirm before removing.
XMP_HIERARCHY = 'Xmp.lr.hierarchicalSubject'
XMP_DESCRIPTION = 'Xmp.dc.description'

# Tag type ids, in priority order, that may name the sub-folder of a file
# that has no project tag (see get_destination_by_tag()).
ORDER_OF_TAGS_FOR_FOLDERS = [
    3, 4, 5, 7, 8,
]

DEBUG = True
# Module-wide logger. NOTE(review): the meaning of the second and third
# arguments is defined by stromberg.utils.logger, which is not visible here;
# presumably a debug switch and a log directory -- confirm.
LOG = logger.get_logger('dedupe_to_folder', DEBUG, 'deduper_logs')


def apply_tags_to_file(tags, filepath, description=''):
    """Write keyword, hierarchy and description XMP metadata into a file.

    Every tag is stripped and, unless its type name is 'Other', prefixed with
    "<type name>|" (names come from TAG_TYPES). Any resulting tag that
    contains a "|" is treated as hierarchical: the full string is stored in
    lr:hierarchicalSubject and only its last segment is stored in dc:subject.

    dc:subject is replaced only when there is at least one keyword and
    lr:hierarchicalSubject only when there is at least one hierarchical tag.
    dc:description is always replaced by ``description`` once the file is
    opened, even when ``description`` is empty.

    Args:
        tags: mapping of tag type id (a key of TAG_TYPES) to a list of tag
            strings of that type.
        filepath: path of the file whose XMP is updated in place.
        description: text stored as the single dc:description item.

    Returns:
        None. The file is not opened at all when there are no tags and no
        description.

    Raises:
        KeyError: if a tag type id with at least one tag is not in TAG_TYPES.
    """
    keywords = set()
    hierarchy = set()

    for tagtype in tags:
        for tag in tags[tagtype]:
            prefix = ''
            if TAG_TYPES[tagtype] != 'Other':
                # anything but Other is namespaced under its type name
                prefix = TAG_TYPES[tagtype] + '|'
            tag = prefix + tag.strip()
            LOG.debug("TAG %s", tag)
            if '|' in tag:
                # A tag with a | character is hierarchical and must be stored
                # both in lr:hierarchicalSubject and, without the pipes and
                # parent tags, in the regular dc:subject.
                hierarchy.add(tag)
                tag = tag.split('|')[-1]
            keywords.add(tag)

    if not (keywords or hierarchy or description):
        return
    # Logged once per file rather than once per tag.
    LOG.debug("DESCRIPTION %s", description)

    xmpfile = libxmp.XMPFiles(file_path=filepath, open_forupdate=True)
    try:
        xmp = xmpfile.get_xmp()
        if xmp is None:
            # get_xmp() returns None when the file carries no XMP packet yet;
            # start from an empty one instead of failing on None.
            xmp = libxmp.XMPMeta()

        def append_item(namespace, name, value):
            """Append ``value`` to the ordered array property ``name``."""
            xmp.append_array_item(namespace, name, value,
                                  {'prop_array_is_ordered': True,
                                   'prop_value_is_array': True})

        if keywords:
            # remove existing subject tags and replace with the new ones
            xmp.delete_property(libxmp.consts.XMP_NS_DC, 'subject')
            for key in sorted(keywords):
                append_item(libxmp.consts.XMP_NS_DC, 'subject', key)
        if hierarchy:
            # remove existing hierarchy tags and add the new ones back
            xmp.register_namespace(libxmp.consts.XMP_NS_Lightroom, 'lr')
            xmp.delete_property(libxmp.consts.XMP_NS_Lightroom,
                                'lr:hierarchicalSubject')
            xmp.set_property(libxmp.consts.XMP_NS_Lightroom,
                             'lr:hierarchicalSubject', '',
                             prop_array_is_ordered=True,
                             prop_value_is_array=True)
            for node in sorted(hierarchy):
                append_item(libxmp.consts.XMP_NS_Lightroom,
                            'lr:hierarchicalSubject', node.strip())
        xmp.delete_property(libxmp.consts.XMP_NS_DC, 'description')
        append_item(libxmp.consts.XMP_NS_DC, 'description', description)

        try:
            # NOTE: per the original author, none of these checks seem to
            # help prevent the Backtrace error raised by the native library.
            if xmpfile.can_put_xmp(xmp):
                xmpfile.put_xmp(xmp)
            else:
                LOG.warning('Could not write tags to %s. %s', filepath,
                            'The file does not accept this XMP packet')
        except IOError as ex:
            # Best effort: a file that cannot be tagged is only reported.
            LOG.warning('Could not write tags to %s. %s', filepath, ex)
    finally:
        # Always release the handle, including when building or writing the
        # packet failed part-way.
        try:
            if xmpfile.xmpfileptr:
                xmpfile.close_file()
        except IOError as ex:
            LOG.warning('Could not write tags to %s. %s', filepath, ex)


def convert_filepath(filepath):
    """Translate a path via PATH_CONVERT and normalise its separators.

    The first (prefix, replacement) pair in PATH_CONVERT whose prefix starts
    ``filepath`` has that leading prefix swapped for the replacement; later
    pairs are not tried. Every backslash in the result is then replaced by
    os.sep, whether or not a prefix matched.
    """
    converted = filepath
    for share_prefix, mount_point in PATH_CONVERT:
        if not converted.startswith(share_prefix):
            continue
        # The prefix sits at position 0, so swapping it is a plain splice.
        converted = mount_point + converted[len(share_prefix):]
        break
    return converted.replace('\\', os.sep)


def copy_to_dest(try_files, dest_folder):
    """Copy the first usable candidate from ``try_files`` into ``dest_folder``.

    ``dest_folder`` is created if needed. Candidates are tried in order; each
    keeps its own basename. The first one that copies successfully wins.

    Args:
        try_files: iterable of source paths, all expected to hold the same
            content.
        dest_folder: folder the copy is placed in.

    Returns:
        The path of the new copy, or False when a file with the candidate's
        basename already exists in ``dest_folder`` (nothing is copied then).

    Raises:
        IOError: if no candidate could be copied, including when
            ``try_files`` is empty.
    """
    # NOTE(review): "already copied" is decided by file name only, so two
    # different pictures sharing a basename and a destination folder collide.
    destination = None

    try:
        os.makedirs(dest_folder)
    except (IOError, OSError):
        # Normally just "already exists". If the folder is really missing the
        # copies below fail and the IOError at the end reports it.
        if not os.path.isdir(dest_folder):
            LOG.debug('Could not create folder %s', dest_folder)

    for candidate in try_files:
        copy_started = False
        try:
            destination = os.path.join(dest_folder,
                                       os.path.basename(candidate))
            if os.path.exists(destination):
                LOG.debug('File has already been copied to %s', destination)
                return False
            copy_started = True
            shutil.copyfile(candidate, destination)
            return destination
        except Exception as ex:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still stop the script; any other failure just moves
            # on to the next candidate.
            LOG.debug('Could not copy %s to %s. %s',
                      candidate, dest_folder, ex)
            if copy_started and os.path.exists(destination):
                # The destination did not exist before this attempt, so what
                # is there now is a truncated copy. Left in place it would be
                # mistaken for a finished file by the next candidate or run.
                try:
                    os.remove(destination)
                except OSError:
                    LOG.warning('Could not remove partial copy %s',
                                destination)

    raise IOError('Could not copy any candidates to %s.' % dest_folder)


def get_destination_by_tag(tags):
    """Choose the destination folder for a file from its tags.

    The folder is BASE_PICS_DIR/<organization>/<project>, taken from the
    first tag of type 1 and the first tag of type 2. A missing organization
    falls back to DEFAULT_ORGANIZATION.

    A file with no project goes under '0NoProject', inside a sub-folder
    "<tag type name>/<first tag>" for the first type in
    ORDER_OF_TAGS_FOR_FOLDERS that has a tag, or 'uncategorized' when none
    does. For 'Loc' tags every "|" becomes os.sep, so a hierarchical
    location turns into nested folders. Tag types that are not listed in
    ORDER_OF_TAGS_FOR_FOLDERS never influence the folder.

    Args:
        tags: mapping of tag type id to a list of tag strings.

    Returns:
        The destination folder path as a string.
    """

    def first_tag(tagtype):
        """Return the first tag of ``tagtype``, or None if there is none."""
        try:
            return tags[tagtype][0]
        except (KeyError, IndexError):
            # IndexError: the type is present but its list is empty, which
            # is treated the same as the type being absent.
            return None

    organization = first_tag(1)
    if organization is None:
        organization = DEFAULT_ORGANIZATION

    project = first_tag(2)
    if project is not None:
        return os.path.join(BASE_PICS_DIR, organization, project)

    dest_dir = os.path.join(BASE_PICS_DIR, organization, '0NoProject')

    extra_dir = 'uncategorized'
    for tagtype in ORDER_OF_TAGS_FOR_FOLDERS:
        value = first_tag(tagtype)
        if value is None:
            continue
        if TAG_TYPES[tagtype] == 'Loc':
            value = value.replace('|', os.sep)
        extra_dir = os.path.join(TAG_TYPES[tagtype], value)
        break

    return os.path.join(dest_dir, extra_dir)


def _load_completed_hashes(path):
    """Return the set of hashes listed one per line in the file at ``path``.

    A file that is missing or unreadable yields an empty set.
    """
    completed = set()
    try:
        with open(path, 'rb') as progress:
            for line in progress:
                completed.add(line.strip())
    except IOError:
        pass
    return completed


def main():
    """Copy one file per hash into its tag-derived folder and tag it.

    Hashes already listed in TEMPFILE_PATH are skipped, so an interrupted run
    can be resumed. Each hash is appended to that file as soon as its copy
    has been tagged. A failure on one hash is logged and does not stop the
    run. The progress file is removed once the whole pass has finished.
    """
    # NOTE(review): a file that was copied but whose tagging crashed the
    # process is skipped on the next run, because copy_to_dest() reports an
    # existing destination as already done -- such files stay untagged.
    already_done = _load_completed_hashes(TEMPFILE_PATH)
    LOG.debug("Getting DAM files")
    # presumably a mapping of binary hash -> list of file paths; verify
    # against stromberg.dam.db_model
    all_files = dam_model.get_all_files()
    LOG.debug("Got the DAM files")
    with open(TEMPFILE_PATH, 'a') as progress:
        for hashed, files in all_files.iteritems():
            try:
                hexl = binascii.hexlify(hashed)
                if hexl in already_done:
                    LOG.debug('Hash %s marked as done.', hexl)
                    continue
                orig_files = files
                if CONVERSION_NEEDED:
                    files = [convert_filepath(fil) for fil in files]
                LOG.debug('Getting tags for %s', hexl)
                tags = dam_model.get_tags_by_hash(hexl)
                LOG.debug('Getting the destination based on tags for %s',
                          hexl)
                newdest = get_destination_by_tag(tags)
                LOG.debug('Destination for %s will be %s', hexl, newdest)
                newfile = copy_to_dest(files, newdest)
                if not newfile:
                    continue
                LOG.debug('Successfully copied to %s', newfile)
                # The original (unconverted) locations become the description.
                apply_tags_to_file(tags, newfile, '|'.join(orig_files))
                progress.write('%s\n' % hexl)
                # Hand the line to the OS straight away: a crash inside the
                # native XMP library would otherwise lose every hash still
                # sitting in the write buffer.
                progress.flush()
                LOG.debug('Successfully tagged %s', newfile)
            except Exception as ex:
                LOG.error('%s occurred. Problem was %s',
                          type(ex).__name__, ex)
    # Only reached when the loop ran to completion.
    os.remove(TEMPFILE_PATH)


if __name__ == '__main__':
    main()