Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ########################################################################################################################
- # Title: Image Catalogue Exif Data Writing
- # Authors: Matear, L., Duncan, G. (2018) Email: Liam.Matear@jncc.gov.uk
- # Version Control: 1.0
- # Script description: Read in metadata from Proforma Stills Matrices from the RV Scotia Survey 1714S, and
- # batch write all associated metadata to the Solan Bank survey 1714S image catalogue.
- # All images are in JPEG file format.
- #
- # All code within this script runs directly from the image files within the working directory
- # and the metadata recorded within the image stills proforma provided by an external contractor.
- # All data used within this script are copies of the original files.
- #
- # For any enquiries please contact Liam Matear by email: Liam.Matear@jncc.gov.uk
- #
- # Please note:
- # Users must ensure to create a copy of the JPEG files and to work on the copy, the
- # packages within this script will write directly to the original, and any changes made
- # will be permanent.
- ########################################################################################################################
- # Section 1: Loading, manipulating and formatting the data within Python
- ########################################################################################################################
- # 1a) Load in all required packages for script:
- # If required install packages using 'pip install package name command in terminal
- import os
- import re
- import pandas as pd
- import subprocess
- import datetime
- ########################################################################################################################
# 1b) Anchor all relative file access in the survey working-copy directory.
working_dir = 'X:\\OffshoreSurvey\\SurveyData\\2014_10_RVScotia_1714S_SolanBank\\GroundTruthing\\PhotoStation\\Copies_LM'
os.chdir(working_dir)
# Load the 'Stills Form' sheet of the proforma workbook into a DataFrame.
prof_meta = pd.read_excel('20150508 Proforma_Stills analysis FINAL.xlsx', 'Stills Form')
- ########################################################################################################################
# 1c) Clean up Proforma_Stills (stills_meta) - remove all undesired data from proforma
def clean_stills_prof(df):
    """Return a cleaned copy of the stills proforma DataFrame.

    Strips literal newline characters out of string cell values, then drops
    the proforma columns that are not needed for EXIF writing.

    Fixes over the original:
    - ``replace`` is NOT an in-place operation; its result was previously
      discarded, so the newline stripping silently did nothing.  The result
      is now captured.
    - ``drop`` uses ``errors='ignore'`` so sheets that are already (partly)
      cleansed no longer abort the whole cleaning step.
    - The bare ``except:`` is narrowed to ``AttributeError`` (raised when the
      argument is not DataFrame-like).

    Parameters
    ----------
    df : pandas.DataFrame
        The 'Stills Form' sheet read from the proforma workbook.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame, or None (with a message printed) when *df* does
        not support DataFrame operations.
    """
    try:
        # replace() returns a new frame; assigning also means the caller's
        # original frame is no longer mutated as a side effect.
        df = df.replace({'\n': ''}, regex=True)
        df.drop(['Habitat Name (Max 100 characters). Substrate & Cover',
                 'Habitat Description (Simple): includes zone, substrate, community, depth, litter, trawl marks, physical damage, biotope fit, imagery quality comments. If problems with ID - why?',
                 'Habitat Description (Full): Additionaly includes details of search features and PMFs',
                 'Date', 'Fix Time (hh:mm:ss)', 'DateTime', 'Fix- Eastings', 'Fix - Northings', 'Fix - Lat',
                 'Fix - Long', 'Depth', 'Field of view (m2)', 'Bedrock', 'Boulders_over1024mm', 'Boulders_512to1024mm ',
                 'Boulders_256to512mm ', 'Cobbles_64to256mm', 'Pebbles_16to64mm', 'Shells_Empty ',
                 'Gravel_Stone_4to16mm', 'Gravel_Shell_4to16mm', 'Sand', 'Sand_Coarse_1to4mm',
                 'Sand_Medium_0_25to1mm ', 'Sand_Fine_0_063to0_25mm', 'Mud_lessthan0_063mm', 'Total %',
                 'Total Sediments', 'Total Rock', 'Evidence of Human Impact', 'Reef Elevation',
                 'Frag Spong Antho Habitat', 'Biotope Changed Following QA', 'OLD MNCR code',
                 'OLD Classification\n(Exact copy of MNCR descriptor)',
                 'Classification\n(Exact copy of MNCR descriptor)', 'Biotope Confidence', '2nd MNCR code',
                 '2nd Classification\n(Exact copy of MNCR descriptor)', '2nd Biotope Confidence',
                 'DeterminedBy', 'Visual quality of sample'], axis=1, inplace=True, errors='ignore')
        return df
    except AttributeError:
        # A non-DataFrame argument (e.g. None) lands here.
        print('Value Error: User must pass df as argument to function.'
              ' If this is true, errors may result because columns do not exist or the data is already cleansed')
# Apply the cleaning routine, then give the surviving columns
# computer-friendly names and order the records by still reference.
stills_meta = clean_stills_prof(prof_meta)
column_names = {
    'Still Sample Ref': 'still_ref',
    'Station code': 'stn_code',
    'Concatenated Search Features and PMFs': 'search_features_PMF',
    'Fix Lat Dec': 'latitude',
    'Fix Long Dec': 'longitude',
    'Annex 1 Reef': 'annex1_reef',
    'PMF Seabed Habitats': 'pmf_seabed_habitats',
    'PMF Mobile Species': 'pmf_mobile_species',
    'PMF Limited Mobility Species': 'pmf_limited_mobility_species',
    'MNCR code': 'MNCR_code',
}
stills_meta.rename(columns=column_names, inplace=True)
stills_meta = stills_meta.sort_values(by=['still_ref'])
########################################################################################################################
# Section 2: Create Configuration file and format all metadata records for looping / writing
########################################################################################################################
# 2a) ExifTool user-defined XMP tag definition (Perl syntax), one list entry
# per output line, mirroring the column names produced in section 1c.
configLines = ["%Image::ExifTool::UserDefined = (",
               " 'Image::ExifTool::XMP::xmp' => {",
               " still_ref => { Name => 'still_ref' },",
               " stn_code => { Name => 'stn_code' },",
               " search_features_PMF => { Name => 'search_features_PMF' },",
               " annex1_reef => { Name => 'annex1_reef' },",
               " pmf_seabed_habitats => { Name => 'pmf_seabed_habitats' },",
               " pmf_mobile_species => { Name => 'pmf_mobile_species' },",
               " pmf_limited_mobility_species => { Name => 'pmf_limited_mobility_species' },",
               " MNCR_code => { Name => 'MNCR_code' }, ",
               " Lattitude => { Name => 'Latitude', WRITABLE => 'rational64s'},",
               " Longitude => { Name => 'Longitude', WRITABLE => 'rational64s'},",
               " },",
               ");"]
# Serialise the definition -- one entry per line, newline-terminated -- and
# persist it beside the script as config.cfg for ExifTool's -config flag.
config_text = "\n".join(configLines) + "\n"
with open('config.cfg', 'w') as configfile:
    configfile.write(config_text)
# 2b) User function to execute ExifTool invocations from Python.
def subprocess_cmd(command):
    """Run *command* (the ExifTool invocation) and return its stdout.

    The command is echoed first so a transcript of every ExifTool call is
    visible in the console.

    Fixes over the original:
    - The child process is now waited on via ``communicate()``; previously it
      was never reaped (leaked process) and its stdout pipe was never drained,
      which can deadlock once ExifTool fills the pipe buffer.
    - ``shell=True`` is only applied when *command* is a single string; an
      argv-style list is executed directly, which is correct cross-platform
      and avoids shell quoting/injection issues.

    Parameters
    ----------
    command : str or list of str
        Shell command line, or an argv-style argument list.

    Returns
    -------
    str
        The child's stdout with surrounding whitespace stripped.
    """
    print(command)
    process = subprocess.Popen(command, stdout=subprocess.PIPE,
                               shell=isinstance(command, str), encoding="utf8")
    # communicate() drains stdout and reaps the child process.
    proc_stdout = process.communicate()[0].strip()
    return proc_stdout
# 2c) Index the cleaned frame by still reference and convert it into a
# nested dictionary: {still_ref: {column_name: value, ...}, ...}.
data_dict = stills_meta.set_index('still_ref').T.to_dict('dict')
directory = 'X:\\OffshoreSurvey\\SurveyData\\2014_10_RVScotia_1714S_SolanBank\\GroundTruthing\\PhotoStation\\Copies_LM'
# NOTE: this is an alias, not a copy -- the restructuring in section 3a is
# visible through data_dict as well.
data_dict2 = data_dict
########################################################################################################################
# Section 3: Loop through all metadata records and directories & write successful regex patterns for each file
########################################################################################################################
# 3a) Restructure each record: nest the metadata fields under "fields" and
# derive the station code, station number and image number from the still
# reference key (underscore-separated, image number in the last component
# prefixed with 'P').
for still_key in data_dict2:
    parts = still_key.split("_")
    data_dict2[still_key] = {
        "fields": data_dict2[still_key],
        "stationcode": parts[0],
        "stationnumber": parts[1],
        # The trailing component carries the image number behind a 'P' prefix.
        "imagenumber": parts[-1].replace("P", ""),
    }
# 3b) Timestamped error log (one per run, created in the current working
# directory) plus the constant leading arguments of every ExifTool call.
time = datetime.datetime.now()
error_log = open(time.strftime('%Y%m%d%H%M') + '_error_log.txt', 'w')
# Base argv: the ExifTool binary plus the user-defined tag configuration.
initialExifToolArgs = [r'D:\Programs\ExifTool\exiftool.exe', '-config', os.path.join(os.getcwd(), 'config.cfg')]
# Namespace prefix used when building per-field -XMP-xmp:field="value" args.
exifToolNamespacePrefix = "-XMP-xmp"
# 3c) Scrub leftovers from earlier runs: ExifTool leaves '.jpg_original'
# backups and '.jpg_exiftool_tmp' temporaries behind, and both must be
# removed before a fresh write pass over the directory tree.
for folder, subdirs, filenames in os.walk(directory):
    for name in filenames:
        extension = os.path.splitext(name)[1]
        if 'jpg_original' in extension or 'jpg_exiftool_tmp' in extension:
            os.remove(os.path.join(folder, name))
# 3d) Walk the working directory (sub-folders included) and, for every JPEG,
# find the spreadsheet record whose station code, station number and image
# number all appear (in that order) in the file name.  Matched files get
# their metadata written via ExifTool; unmatched files go to the error log.
for base, dirnames, files in os.walk(directory):
    for eachfile in files:
        # Only JPEG images are candidates.  The original tested the str()
        # of the whole splitext tuple, which also matched 'jpg' anywhere in
        # the file name; testing the extension itself is the clear intent.
        extension = os.path.splitext(eachfile)[1].lower()
        if extension not in ('.jpg', '.jpeg'):
            continue
        # list() so the matched key can be popped safely inside the loop.
        for key_match in list(data_dict2):
            record = data_dict2[key_match]
            # re.escape guards against regex metacharacters in the codes.
            # Match against the file name itself rather than the repr of
            # the splitext tuple (the original's str(root_ext)).
            regexPattern = re.compile(re.escape(record["stationcode"]) + ".+" +
                                      re.escape(record["stationnumber"]) + ".+" +
                                      re.escape(record["imagenumber"]))
            if regexPattern.search(eachfile):
                print('Regex successfully matched')
                fileExifToolArgs = list(initialExifToolArgs)
                fullPath = os.path.join(base, eachfile)
                # One -XMP-xmp:field="value" argument per metadata field.
                for fieldName, fieldValue in record["fields"].items():
                    fileExifToolArgs.append('{}:{}="{}"'.format(exifToolNamespacePrefix,
                                                                fieldName, fieldValue))
                fileExifToolArgs.append(str(fullPath))
                subprocess_cmd(fileExifToolArgs)
                # Each record matches one file only: drop it and stop scanning.
                data_dict2.pop(key_match, None)
                break
        else:
            # for/else: no spreadsheet record matched this image file, so
            # record the file name (not the tuple repr) in the error log.
            # NOTE(review): the pasted source lost its indentation -- this
            # else is assumed to belong to the key loop; confirm against
            # the original script.
            error_log.write("\n Failed to match spreadsheet record to : " + eachfile)
error_log.close()
Add Comment
Please, Sign In to add comment