// ImportData for HBase Bulk Load

package com.intuit.ihub.hbase.poc;

/*     */
/*     */
/*     */ import com.google.common.base.Function;
/*     */ import com.google.common.base.Preconditions;
/*     */ import com.google.common.base.Splitter;
/*     */ import com.google.common.collect.Lists;
/*     */ import java.io.IOException;
/*     */ import java.io.PrintStream;
/*     */ import java.util.ArrayList;
/*     */ import org.apache.hadoop.conf.Configuration;
/*     */ import org.apache.hadoop.fs.Path;
/*     */ import org.apache.hadoop.hbase.HBaseConfiguration;
/*     */ import org.apache.hadoop.hbase.HConstants;
/*     */ import org.apache.hadoop.hbase.KeyValue;
/*     */ import org.apache.hadoop.hbase.KeyValue.Type;
/*     */ import org.apache.hadoop.hbase.client.HTable;
/*     */ import org.apache.hadoop.hbase.client.Put;
/*     */ import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
/*     */ import org.apache.hadoop.hbase.util.Base64;
/*     */ import org.apache.hadoop.hbase.util.Bytes;
/*     */ import org.apache.hadoop.io.LongWritable;
/*     */ import org.apache.hadoop.io.Text;
/*     */ import org.apache.hadoop.mapreduce.Counter;
/*     */ import org.apache.hadoop.mapreduce.Job;
/*     */ import org.apache.hadoop.mapreduce.Mapper;
/*     */ import org.apache.hadoop.mapreduce.Mapper.Context;
/*     */ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
/*     */ import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
/*     */ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/*     */
/*     */ public class ImportData
/*     */ {
/*     */   /** Tool name. Not referenced by the visible code; the job name and usage text spell the string out literally. */
/*     */   static final String NAME = "importtsv";
/*     */   /** Boolean option read by the mapper (default true): count and skip unparseable lines instead of failing. */
/*     */   static final String SKIP_LINES_CONF_KEY = "importtsv.skip.bad.lines";
/*     */   /** When set, the job is configured to write HFiles to this path; otherwise it is set up via TableMapReduceUtil.initTableReducerJob. */
/*     */   static final String BULK_OUTPUT_CONF_KEY = "importtsv.bulk.output";
/*     */   /** Comma-separated column specification; main() requires exactly one entry equal to HBASE_ROW_KEY and at least two entries. */
/*     */   static final String COLUMNS_CONF_KEY = "importtsv.columns";
/*     */   /** Field separator. createSubmittableJob re-stores the value Base64-encoded and the mapper decodes it. */
/*     */   static final String SEPARATOR_CONF_KEY = "importtsv.separator";
/*     */   /** Tab. Matches the literal the mapper falls back to when no separator is configured; the constant itself is not referenced in the visible code. */
/*     */   static final String DEFAULT_SEPARATOR = "\t";
/*     */
/*     */   public static Job createSubmittableJob(Configuration conf, String[] args)
/*     */     throws IOException
/*     */   {
/* 289 */     String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
/* 290 */     if (actualSeparator != null) {
/* 291 */       conf.set(SEPARATOR_CONF_KEY, new String(Base64.encodeBytes(actualSeparator.getBytes())));
/*     */     }
/*     */
/* 295 */     String tableName = args[0];
/* 296 */     Path inputDir = new Path(args[1]);
/* 297 */     Job job = new Job(conf, "importtsv_" + tableName);
/* 298 */     job.setJarByClass(TsvImporter.class);
/* 299 */     FileInputFormat.setInputPaths(job, new Path[] { inputDir });
/* 300 */     job.setInputFormatClass(TextInputFormat.class);
/* 301 */     job.setMapperClass(TsvImporter.class);
/*     */
/* 303 */     String hfileOutPath = conf.get("importtsv.bulk.output");
/* 304 */     if (hfileOutPath != null) {
/* 305 */       HTable table = new HTable(conf, tableName);
/* 306 */       job.setReducerClass(PutSortReducer.class);
/* 307 */       Path outputDir = new Path(hfileOutPath);
/* 308 */       FileOutputFormat.setOutputPath(job, outputDir);
/* 309 */       job.setMapOutputKeyClass(ImmutableBytesWritable.class);
/* 310 */       job.setMapOutputValueClass(Put.class);
/* 311 */       HFileOutputFormat.configureIncrementalLoad(job, table);
/*     */     }
/*     */     else
/*     */     {
/* 315 */       TableMapReduceUtil.initTableReducerJob(tableName, null, job);
/* 316 */       job.setNumReduceTasks(0);
/*     */     }
/*     */
/* 319 */     TableMapReduceUtil.addDependencyJars(job);
/* 320 */     TableMapReduceUtil.addDependencyJars(job.getConfiguration(), new Class[] { Function.class });
/*     */
/* 322 */     return job;
/*     */   }
/*     */
/*     */   private static void usage(String errorMsg)
/*     */   {
/* 329 */     if ((errorMsg != null) && (errorMsg.length() > 0)) {
/* 330 */       System.err.println("ERROR: " + errorMsg);
/*     */     }
/* 332 */     String usage = "Usage: importtsv -Dimporttsv.columns=a,b,c <tablename> <inputdir>\n\nImports the given input directory of TSV data into the specified table.\n\nThe column names of the TSV data must be specified using the -Dimporttsv.columns\noption. This option takes the form of comma-separated column names, where each\ncolumn name is either a simple column family, or a columnfamily:qualifier. The special\ncolumn name HBASE_ROW_KEY is used to designate that this column should be used\nas the row key for each imported record. You must specify exactly one column\nto be the row key, and you must specify a column name for every column that exists in the\ninput data.\n\nIn order to prepare data for a bulk data load, pass the option:\n  -Dimporttsv.bulk.output=/path/for/output\n  Note: if you do not use this option, then the target table must already exist in HBase\n\nOther options that may be specified with -D include:\n  -Dimporttsv.skip.bad.lines=false - fail if encountering an invalid line\n  '-Dimporttsv.separator=|' - eg separate on pipes instead of tabs";
/*     */
/* 352 */     System.err.println(usage);
/*     */   }
/*     */
/*     */   public static void main(String[] args)
/*     */     throws Exception
/*     */   {
                for(String str: args)
                {
                    System.out.println("Command line Arguments::" + str);
                }
/* 362 */     Configuration conf = HBaseConfiguration.create();
/* 363 */     String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

                for(String str: otherArgs)
                {
                    System.out.println("OtherArguments==>" + str);
                }
/* 364 */     if (otherArgs.length < 2) {
/* 365 */       usage("Wrong number of arguments: " + otherArgs.length);
/* 366 */       System.exit(-1);
/*     */     }
/*     */
/* 370 */     String[] columns = conf.getStrings("importtsv.columns");
/* 371 */     if (columns == null) {
/* 372 */       usage("No columns specified. Please specify with -Dimporttsv.columns=...");
/*     */
/* 374 */       System.exit(-1);
/*     */     }
/*     */
/* 378 */     int rowkeysFound = 0;
/* 379 */     for (String col : columns) {
/* 380 */       if (!(col.equals(TsvParser.ROWKEY_COLUMN_SPEC))) continue; ++rowkeysFound;
/*     */     }
/* 382 */     if (rowkeysFound != 1) {
/* 383 */       usage("Must specify exactly one column as " + TsvParser.ROWKEY_COLUMN_SPEC);
/* 384 */       System.exit(-1);
/*     */     }
/*     */
/* 388 */     if (columns.length < 2) {
/* 389 */       usage("One or more columns in addition to the row key are required");
/* 390 */       System.exit(-1);
/*     */     }
/*     */
                //check whether the separator has been read properly or not
                String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
                System.out.println("SEPARATOR as per jobconf:" + actualSeparator);

/* 393 */     Job job = createSubmittableJob(conf, otherArgs);
/* 394 */     System.exit((job.waitForCompletion(true)) ? 0 : 1);
/*     */   }
/*     */
/*     */   static class TsvImporter extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put>
/*     */   {
/*     */     private long ts;
/*     */     private boolean skipBadLines;
/*     */     private Counter badLineCount;
/*     */     private ImportData.TsvParser parser;
/*     */
/*     */     protected void setup(Mapper<LongWritable, Text, ImmutableBytesWritable, Put>.Context context)
/*     */     {
/* 207 */       Configuration conf = context.getConfiguration();
/*     */
/* 211 */       String separator = conf.get(SEPARATOR_CONF_KEY);
                System.out.println("SEPARATOR IS:" + separator);
                System.err.println("SEPARATOR IS:" + separator);
/* 212 */       if (separator == null)
/* 213 */         separator = "\t";
/*     */       else {
/* 215 */         separator = new String(Base64.decode(separator));
/*     */       }
/*     */
/* 218 */       this.parser = new ImportData.TsvParser(conf.get("importtsv.columns"), separator);
/*     */
/* 220 */       if (this.parser.getRowKeyColumnIndex() == -1) {
/* 221 */         throw new RuntimeException("No row key column specified");
/*     */       }
/* 223 */       this.ts = System.currentTimeMillis();
/*     */
/* 225 */       this.skipBadLines = context.getConfiguration().getBoolean("importtsv.skip.bad.lines", true);
/*     */
/* 227 */       this.badLineCount = context.getCounter("ImportData", "Bad Lines");
/*     */     }
/*     */
/*     */     public void map(LongWritable offset, Text value, Mapper<LongWritable, Text, ImmutableBytesWritable, Put>.Context context)
/*     */       throws IOException
/*     */     {
/* 237 */       byte[] lineBytes = value.getBytes();
/*     */       try
/*     */       {
/* 240 */         ImportData.TsvParser.ParsedLine parsed = this.parser.parse(lineBytes, value.getLength());
/*     */
/* 242 */         ImmutableBytesWritable rowKey = new ImmutableBytesWritable(lineBytes, parsed.getRowKeyOffset(), parsed.getRowKeyLength());
/*     */
/* 247 */         Put put = new Put(rowKey.copyBytes());
/* 248 */         for (int i = 0; i < parsed.getColumnCount(); ++i)
/* 249 */           if (i != this.parser.getRowKeyColumnIndex()) {
/* 250 */             KeyValue kv = new KeyValue(lineBytes, parsed.getRowKeyOffset(), parsed.getRowKeyLength(), this.parser.getFamily(i), 0, this.parser.getFamily(i).length, this.parser.getQualifier(i), 0, this.parser.getQualifier(i).length, this.ts, KeyValue.Type.Put, lineBytes, parsed.getColumnOffset(i), parsed.getColumnLength(i));
/*     */
/* 257 */             put.add(kv);
/*     */           }
/* 259 */         context.write(rowKey, put);
/*     */       } catch (ImportData.TsvParser.BadTsvLineException badLine) {
/* 261 */         if (this.skipBadLines) {
/* 262 */           System.err.println("Bad line at offset: " + offset.get() + ":\n" + badLine.getMessage());
                    System.out.println("Bad line at offset: " + offset.get() + ":\n" + badLine.getMessage());
/*     */           badLine.printStackTrace();
/* 265 */           this.badLineCount.increment(1L);
/* 266 */           return;
/*     */         }
/* 268 */         throw new IOException(badLine);
/*     */       }
/*     */       catch (InterruptedException e) {
/* 271 */         e.printStackTrace();
/*     */       }
/*     */     }
/*     */   }
/*     */
/*     */   static class TsvParser
/*     */   {
/*     */     private final byte[][] families;
/*     */     private final byte[][] qualifiers;
/*     */     private final byte separatorByte;
/*     */     private int rowKeyColumnIndex;
/*  79 */     public static String ROWKEY_COLUMN_SPEC = "HBASE_ROW_KEY";
/*     */
/*     */     public TsvParser(String columnsSpecification, String separatorStr)
/*     */     {
/*  87 */       byte[] separator = Bytes.toBytes(separatorStr);
/*  88 */       Preconditions.checkArgument(separator.length == 1, "TsvParser only supports single-byte separators");
/*     */
/*  90 */       this.separatorByte = separator[0];
/*     */
/*  93 */       ArrayList columnStrings = Lists.newArrayList(Splitter.on(',').trimResults().split(columnsSpecification));
/*     */
/*  96 */       this.families = new byte[columnStrings.size()][];
/*  97 */       this.qualifiers = new byte[columnStrings.size()][];
/*     */
/*  99 */       for (int i = 0; i < columnStrings.size(); ++i) {
/* 100 */         String str = (String)columnStrings.get(i);
/* 101 */         if (ROWKEY_COLUMN_SPEC.equals(str)) {
/* 102 */           this.rowKeyColumnIndex = i;
/*     */         }
/*     */         else {
/* 105 */           String[] parts = str.split(":", 2);
/* 106 */           if (parts.length == 1) {
/* 107 */             this.families[i] = str.getBytes();
/* 108 */             this.qualifiers[i] = HConstants.EMPTY_BYTE_ARRAY;
/*     */           } else {
/* 110 */             this.families[i] = parts[0].getBytes();
/* 111 */             this.qualifiers[i] = parts[1].getBytes(); }
/*     */         }
/*     */       }
/*     */     }
/*     */
/*     */     public int getRowKeyColumnIndex() {
/* 117 */       return this.rowKeyColumnIndex; }
/*     */
/*     */     public byte[] getFamily(int idx) {
/* 120 */       return this.families[idx]; }
/*     */
/*     */     public byte[] getQualifier(int idx) {
/* 123 */       return this.qualifiers[idx];
/*     */     }
/*     */
/*     */     public ParsedLine parse(byte[] lineBytes, int length)
/*     */       throws ImportData.TsvParser.BadTsvLineException
/*     */     {
/* 129 */       ArrayList tabOffsets = new ArrayList(this.families.length);
                System.out.println("Line Bytes:" + Bytes.toString(lineBytes));
/* 130 */       for (int i = 0; i < length; ++i) {
/* 131 */         if (lineBytes[i] == this.separatorByte) {
/* 132 */           tabOffsets.add(Integer.valueOf(i));
/*     */         }
/*     */       }
/* 135 */       if (tabOffsets.isEmpty()) {
                    System.err.println("Tab Offset:" +tabOffsets.toArray());
                    System.err.println("Line Bytes:" + Bytes.toString(lineBytes));
/* 136 */         throw new ImportData.TsvParser.BadTsvLineException("No delimiter");
/*     */       }
/*     */
/* 139 */       tabOffsets.add(Integer.valueOf(length));
/*     */
/* 141 */       if (tabOffsets.size() > this.families.length)
/* 142 */         throw new ImportData.TsvParser.BadTsvLineException("Excessive columns");
/* 143 */       if (tabOffsets.size() <= getRowKeyColumnIndex()) {
/* 144 */         throw new ImportData.TsvParser.BadTsvLineException("No row key");
/*     */       }
/* 146 */       return new ParsedLine(tabOffsets, lineBytes);
/*     */     }
/*     */
/*     */     public static class BadTsvLineException extends Exception
/*     */     {
/*     */       private static final long serialVersionUID = 1L;
/*     */
/*     */       public BadTsvLineException(String err)
/*     */       {
/* 183 */         super(err);
/*     */       }
/*     */     }
/*     */
/*     */     class ParsedLine
/*     */     {
/*     */       private final ArrayList<Integer> tabOffsets;
/*     */       private byte[] lineBytes;
/*     */
/*     */       ParsedLine(ArrayList<Integer> tabOffsets, byte[] lineBytes )
/*     */       {
/* 154 */         this.tabOffsets = tabOffsets;
/* 155 */         this.lineBytes = lineBytes;
/*     */       }
/*     */
/*     */       public int getRowKeyOffset() {
/* 159 */         return getColumnOffset(ImportData.TsvParser.this.rowKeyColumnIndex); }
/*     */
/*     */       public int getRowKeyLength() {
/* 162 */         return getColumnLength(ImportData.TsvParser.this.rowKeyColumnIndex); }
/*     */
/*     */       public int getColumnOffset(int idx) {
/* 165 */         if (idx > 0) {
/* 166 */           return (((Integer)this.tabOffsets.get(idx - 1)).intValue() + 1);
/*     */         }
/* 168 */         return 0; }
/*     */
/*     */       public int getColumnLength(int idx) {
/* 171 */         return (((Integer)this.tabOffsets.get(idx)).intValue() - getColumnOffset(idx)); }
/*     */
/*     */       public int getColumnCount() {
/* 174 */         return this.tabOffsets.size(); }
/*     */
/*     */       public byte[] getLineBytes() {
/* 177 */         return this.lineBytes;
/*     */       }
/*     */     }
/*     */   }
/*     */ }

/* Location:           C:\Users\agupta5\Documents\hadoop_stuff\hbase\hbase-0.90.4-cdh3u2.jar
 * Qualified Name:     org.apache.hadoop.hbase.mapreduce.ImportData
 * Java Class Version: 6 (50.0)
 * JD-Core Version:    0.5.3
 */