Advertisement
eldadlevy

ImageDuplicatesRemover

Apr 14th, 2011
2,534
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 3.52 KB | None | 0 0
  1. package com.eldadlevy.hadoop;
  2.  
  3. import java.io.IOException;
  4. import java.security.MessageDigest;
  5. import java.security.NoSuchAlgorithmException;
  6.  
  7. import org.apache.hadoop.conf.Configuration;
  8. import org.apache.hadoop.fs.Path;
  9. import org.apache.hadoop.io.BytesWritable;
  10. import org.apache.hadoop.io.Text;
  11. import org.apache.hadoop.mapreduce.Job;
  12. import org.apache.hadoop.mapreduce.Mapper;
  13. import org.apache.hadoop.mapreduce.Reducer;
  14. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  15. import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
  16. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  17. import org.apache.hadoop.util.GenericOptionsParser;
  18.  
  19. public class ImageDuplicatesRemover {
  20.    
  21.     public static class ImageMd5Mapper extends Mapper<Text, BytesWritable, Text, Text>{
  22.  
  23.         public void map(Text key, BytesWritable value, Context context) throws IOException,InterruptedException {
  24.             //get the md5 for this specific file
  25.             String md5Str;
  26.             try {
  27.                 md5Str = calculateMd5(value.getBytes());
  28.             } catch (NoSuchAlgorithmException e) {
  29.                 e.printStackTrace();
  30.                 context.setStatus("Internal error - can't find the algorithm for calculating the md5");
  31.                 return;
  32.             }
  33.             Text md5Text = new Text(md5Str);
  34.            
  35.             //put the file in the map where the md5 is the key, so duplicates will
  36.             // be grouped together for the reduce function
  37.             context.write(md5Text, key);
  38.         }
  39.        
  40.        
  41.         static String calculateMd5(byte[] imageData) throws NoSuchAlgorithmException {
  42.             //get the md5 for this specific data
  43.             MessageDigest md = MessageDigest.getInstance("MD5");
  44.             md.update(imageData);
  45.             byte[] hash = md.digest();
  46.  
  47.             // Below code of converting Byte Array to hex
  48.             String hexString = new String();
  49.             for (int i=0; i < hash.length; i++) {
  50.                 hexString += Integer.toString( ( hash[i] & 0xff ) + 0x100, 16).substring( 1 );
  51.             }
  52.             return hexString;
  53.         }
  54.        
  55.     }
  56.  
  57.     public static class ImageDupsReducer extends Reducer<Text,Text,Text,Text> {
  58.  
  59.         public void reduce(Text key, Iterable<Text> values, Context context)
  60.                             throws IOException, InterruptedException {
  61.             //Key here is the md5 hash while the values are all the image files that
  62.             // are associated with it. for each md5 value we need to take only
  63.             // one file (the first)
  64.             Text imageFilePath = null;
  65.             for (Text filePath : values) {
  66.                 imageFilePath = filePath;
  67.                 break;//only the first one
  68.             }
  69.             // In the result file the key will be again the image file path.
  70.             context.write(imageFilePath, key);
  71.         }
  72.     }
  73.  
  74.     public static void main(String[] args) throws Exception {
  75.         Configuration conf = new Configuration();
  76.        
  77.         //This is the line that makes the hadoop run locally
  78.         //conf.set("mapred.job.tracker", "local");
  79.  
  80.         String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  81.         if (otherArgs.length != 2) {
  82.             System.err.println("Usage: wordcount <in> <out>");
  83.             System.exit(2);
  84.         }
  85.         Job job = new Job(conf, "image dups remover");
  86.         job.setJarByClass(ImageDuplicatesRemover.class);
  87.         job.setInputFormatClass(SequenceFileInputFormat.class);
  88.         job.setMapperClass(ImageMd5Mapper.class);
  89.         job.setReducerClass(ImageDupsReducer.class);
  90.         //job.setNumReduceTasks(2);
  91.         job.setOutputKeyClass(Text.class);
  92.         job.setOutputValueClass(Text.class);
  93.         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  94.         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
  95.         System.exit(job.waitForCompletion(true) ? 0 : 1);
  96.        
  97.     }
  98. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement