Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package com.eldadlevy.hadoop;
- import java.io.IOException;
- import java.security.MessageDigest;
- import java.security.NoSuchAlgorithmException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.BytesWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.util.GenericOptionsParser;
- public class ImageDuplicatesRemover {
- public static class ImageMd5Mapper extends Mapper<Text, BytesWritable, Text, Text>{
- public void map(Text key, BytesWritable value, Context context) throws IOException,InterruptedException {
- //get the md5 for this specific file
- String md5Str;
- try {
- md5Str = calculateMd5(value.getBytes());
- } catch (NoSuchAlgorithmException e) {
- e.printStackTrace();
- context.setStatus("Internal error - can't find the algorithm for calculating the md5");
- return;
- }
- Text md5Text = new Text(md5Str);
- //put the file in the map where the md5 is the key, so duplicates will
- // be grouped together for the reduce function
- context.write(md5Text, key);
- }
- static String calculateMd5(byte[] imageData) throws NoSuchAlgorithmException {
- //get the md5 for this specific data
- MessageDigest md = MessageDigest.getInstance("MD5");
- md.update(imageData);
- byte[] hash = md.digest();
- // Below code of converting Byte Array to hex
- String hexString = new String();
- for (int i=0; i < hash.length; i++) {
- hexString += Integer.toString( ( hash[i] & 0xff ) + 0x100, 16).substring( 1 );
- }
- return hexString;
- }
- }
- public static class ImageDupsReducer extends Reducer<Text,Text,Text,Text> {
- public void reduce(Text key, Iterable<Text> values, Context context)
- throws IOException, InterruptedException {
- //Key here is the md5 hash while the values are all the image files that
- // are associated with it. for each md5 value we need to take only
- // one file (the first)
- Text imageFilePath = null;
- for (Text filePath : values) {
- imageFilePath = filePath;
- break;//only the first one
- }
- // In the result file the key will be again the image file path.
- context.write(imageFilePath, key);
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- //This is the line that makes the hadoop run locally
- //conf.set("mapred.job.tracker", "local");
- String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
- if (otherArgs.length != 2) {
- System.err.println("Usage: wordcount <in> <out>");
- System.exit(2);
- }
- Job job = new Job(conf, "image dups remover");
- job.setJarByClass(ImageDuplicatesRemover.class);
- job.setInputFormatClass(SequenceFileInputFormat.class);
- job.setMapperClass(ImageMd5Mapper.class);
- job.setReducerClass(ImageDupsReducer.class);
- //job.setNumReduceTasks(2);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
- FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
- System.exit(job.waitForCompletion(true) ? 0 : 1);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement