java

Using Python to access HBase through JPype

First, we need to write a Java function to get data from HBase:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.DiamondAddressHelper;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Row;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Result;
/**
 * Reads a single column for a batch of row keys from one HBase table.
 *
 * <p>Lifecycle: call {@link #init} once, then {@link #fetchFeatures} as often as needed, and
 * finally {@link #shutdown}. Errors are reported on {@code System.err} rather than thrown, so
 * that callers driving this class through a bridge (e.g. JPype) get a best-effort result.
 */
public class HBaseReader {
  private HTableInterface table_;
  private HTablePool pool_;

  /**
   * Fetches {@code columnFamily:qualifier} for every row key in {@code feature_ids}.
   *
   * @param columnFamily column family to read
   * @param qualifier column qualifier to read
   * @param feature_ids row keys, one Get is issued per entry
   * @return one entry per requested id, in the same order as {@code feature_ids}; an entry is an
   *     empty array when the cell is absent or when the lookup for that row failed
   */
  public ArrayList<byte[]> fetchFeatures(String columnFamily, String qualifier,
      ArrayList<String> feature_ids) {
    // Encode family/qualifier once instead of once per row and once per result.
    final byte[] family = toBytes(columnFamily);
    final byte[] column = toBytes(qualifier);

    List<Row> batch = new ArrayList<Row>(feature_ids.size());
    for (String id : feature_ids) {
      Get get = new Get(toBytes(id));
      get.addColumn(family, column);
      batch.add(get);
    }

    Object[] results = new Object[batch.size()];
    try {
      table_.batch(batch, results);
    } catch (Exception e) {
      if (e instanceof InterruptedException) {
        // Keep the interrupt visible to whoever owns this thread.
        Thread.currentThread().interrupt();
      }
      System.err.println("Error: " + e);
    }

    ArrayList<byte[]> list = new ArrayList<byte[]>(results.length);
    for (Object obj : results) {
      byte[] value = null;
      if (obj instanceof Result) {
        value = ((Result) obj).getValue(family, column);
      }
      // Always add exactly one entry per requested id; dropping failed rows would shift every
      // later value to the wrong index on the caller's side.
      list.add(value == null ? new byte[0] : value);
    }
    return list;
  }

  /**
   * Creates the table pool and opens {@code tableName}.
   *
   * @param dataid value stored under {@code DiamondAddressHelper.DIAMOND_HBASE_KEY_NEW}
   * @param groupid value stored under {@code DiamondAddressHelper.DIAMOND_HBASE_GROUP}
   * @param tableName name of the HBase table to read from
   */
  public void init(String dataid, String groupid, String tableName) {
    Configuration conf = HBaseConfiguration.create();
    conf.setBoolean(DiamondAddressHelper.DIMAOND_HBASE_UNITIZED, true);
    conf.set(DiamondAddressHelper.DIAMOND_HBASE_KEY_NEW, dataid);
    conf.set(DiamondAddressHelper.DIAMOND_HBASE_GROUP, groupid);
    try {
      pool_ = new HTablePool(conf, 100);
      table_ = pool_.getTable(tableName);
    } catch (Exception e) {
      System.err.println("Error: " + e);
    }
  }

  /**
   * Closes the table and the pool. Each resource is closed independently, so a failure (or a
   * missing table after a failed {@link #init}) does not prevent the pool from being closed.
   */
  public void shutdown() {
    if (table_ != null) {
      try {
        table_.close();
      } catch (Exception e) {
        System.err.println("Error: " + e);
      }
      table_ = null;
    }
    if (pool_ != null) {
      try {
        pool_.close();
      } catch (Exception e) {
        System.err.println("Error: " + e);
      }
      pool_ = null;
    }
  }

  /** Encodes a string as UTF-8 so row keys do not depend on the JVM's default charset. */
  private static byte[] toBytes(String s) {
    return s.getBytes(java.nio.charset.StandardCharsets.UTF_8);
  }
}

Then use Maven to build it into a single jar file that bundles all dependent libraries:


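<project>
  ....
  ....
  <artifactId>hbasereader</artifactId>
  <packaging>jar</packaging>
  <version>1.0.0</version>
  <name>Reader for HBase</name>
  <url>http://maven.apache.org</url>
  <dependencies>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase</artifactId>
      ....
      <exclusions>
        <exclusion>
          <groupId>jdk.tools</groupId>
          <artifactId>jdk.tools</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-core</artifactId>
      ....
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop.thirdparty.guava</groupId>
      <artifactId>guava</artifactId>
      ....
    </dependency>
....
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.6</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass>com.taobao.ad.HBaseReader</mainClass>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>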

Now we can use Python to call this Java class through JPype:

import os
import time
import jpype
import numpy
# Benchmark: start a JVM, fetch the same five rows ITERATIONS times through
# HBaseReader, then print the elapsed seconds, the calls per second and the
# bytes of the fifth row.
ITERATIONS = 10000

jpype.startJVM(jpype.getDefaultJVMPath(), '-ea', '-Djava.class.path=./target/hbasereader-1.0.0-jar-with-dependencies.jar')
try:
  Reader = jpype.JClass("com.taobao.ad.HBaseReader")
  reader = Reader()
  ArrayList = jpype.JClass("java.util.ArrayList")
  # Named feature_ids (not `list`) so the Python builtin is not shadowed.
  feature_ids = ArrayList()
  for feature_id in ('1', '2', '3', '4', '5'):
    feature_ids.add(feature_id)
  reader.init('hbase.diamond.dataid.test.hbase', 'hbase-diamond-group-name-test', 'alimama_training_image_table')
  try:
    begin = time.time()
    for _ in range(ITERATIONS):
      res = reader.fetchFeatures('ct', 'image', feature_ids)
    period = time.time() - begin
    print(period)
    print(ITERATIONS/period)
    data = numpy.asarray(res[4], dtype=numpy.uint8)
    print(data)
  finally:
    # Release the table and pool even when a fetch or the conversion fails.
    reader.shutdown()
finally:
  # Always stop the JVM, including when class loading or init() raised.
  jpype.shutdownJVM()

This Python example runs correctly. But if we use it inside tf.py_func(), it core dumps in libjvm.so, which is difficult to debug. So in the end we chose to write the operation in C++ and access HBase through the Thrift server, which is better for both stability and architectural cleanliness.

Use MapReduce to join two datasets

The two datasets are:

#users.txt (student id, name)
1,Robin Dong
2,Timi Yang
3,Olive Xu
4,Jenny Xu
5,Elsa Dong
6,Coly Wang
7,Hulk Li
8,Judy Lao
9,Kevin Liu
10,House Zhang

#scores.txt (student id, course, score)
1,Math,90
1,Physics,80
3,Music,70
5,Math,80
7,Geography,70
1,Geography,60
2,Physics,70
6,Math,70
4,Music,90
6,Geography,75
9,Geography,85
10,Music,95
2,Physics,78
2,Music,73
2,Math,84
4,Math,61
4,Physics,65
5,Music,66
5,Math,90

To join the two tables above by “student id”, we need to use MultipleInputs. The code is:

import java.io.IOException;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
/**
 * Reduce-side inner join of two comma-separated inputs on their first field (the student id).
 *
 * <p>Input 1 ("users") has lines of the form {@code id,name}; input 2 ("scores") has lines of the
 * form {@code id,course,score}. For every student that appears in both inputs the job writes one
 * {@code name <TAB> course,score} record per score line.
 */
public class School {
  /** Field separator of the input lines and of the score records handed to the reducer. */
  private static final String SEPARATOR = ",";

  /** Emits {@code (id, name)} for every well-formed line of the users input. */
  public static class UserMapper
      extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] fields = value.toString().split(SEPARATOR);
      if (fields.length < 2) {
        // Blank or truncated line: skip it instead of failing the task with an
        // ArrayIndexOutOfBoundsException.
        return;
      }
      String uid = fields[0].trim();
      String name = fields[1].trim();
      if (name.isEmpty()) {
        // An empty name can never produce join output and could overwrite a real name
        // for the same id in the reducer.
        return;
      }
      context.write(new Text(uid), new Text(name));
    }
  }

  /** Emits {@code (id, "course,score")} for every well-formed line of the scores input. */
  public static class ScoreMapper
      extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] fields = value.toString().split(SEPARATOR);
      if (fields.length < 3) {
        // Blank or truncated line: nothing to join.
        return;
      }
      String uid = fields[0].trim();
      String course = fields[1].trim();
      String score = fields[2].trim();
      context.write(new Text(uid), new Text(course + SEPARATOR + score));
    }
  }

  /**
   * Joins the values of one student id. A value containing the separator is a score record
   * ({@code course,score}); a value without it is the student's name, because names come from a
   * field of a comma-split line and therefore cannot contain a comma.
   */
  public static class InnerJoinReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      String name = "";
      List<String> courseScores = new ArrayList<String>();
      for (Text value : values) {
        // toString() copies the content, which matters because Hadoop reuses the Text instance.
        String cur = value.toString();
        if (cur.contains(SEPARATOR)) {
          courseScores.add(cur);
        } else {
          name = cur;
        }
      }
      if (name.isEmpty()) {
        // Inner join: scores without a matching user produce no output.
        return;
      }
      Text outKey = new Text(name);
      for (String courseScore : courseScores) {
        context.write(outKey, new Text(courseScore));
      }
    }
  }

  /**
   * Configures and runs the join.
   *
   * @param args {@code args[0]} users input path, {@code args[1]} scores input path,
   *     {@code args[2]} output path
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 3) {
      System.err.println("Usage: School <users path> <scores path> <output path>");
      System.exit(2);
    }
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "School");
    job.setJarByClass(School.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // One mapper class per input; both emit (student id, payload) so the reducer can join them.
    MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, UserMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, ScoreMapper.class);
    job.setReducerClass(InnerJoinReducer.class);
    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Compile and run it:

# Run all three commands from the directory that contains School.java.
# Compile with the Hadoop classpath, package the classes, then submit the job
# using the jar that was just created in the current directory.
~/hadoop-2.7.2/bin/hadoop com.sun.tools.javac.Main School.java -Xlint:unchecked
jar cf school.jar School*.class
~/hadoop-2.7.2/bin/hadoop jar school.jar School /users.txt /scores.txt /my

And the result in /my is:

Robin Dong      Geography,60
Robin Dong      Physics,80
Robin Dong      Math,90
House Zhang     Music,95
Timi Yang       Physics,70
Timi Yang       Math,84
Timi Yang       Music,73
Timi Yang       Physics,78
Olive Xu        Music,70
Jenny Xu        Physics,65
Jenny Xu        Math,61
Jenny Xu        Music,90
Elsa Dong       Math,90
Elsa Dong       Music,66
Elsa Dong       Math,80
Coly Wang       Geography,75
Coly Wang       Math,70
Hulk Li Geography,70
Kevin Liu       Geography,85

Use MapReduce to find prime numbers

I just wanted to write a small Hadoop MapReduce example that finds prime numbers. The first question is: how can I generate the numbers from 1 to 1000000 in my own application instead of reading them from a file on HDFS? The answer is: implement your own InputSplit, RecordReader, and InputFormat, just like the teragen program does.
Then comes the second question: can I use only a mapper, without a reducer stage? The answer is yes: simply call job.setNumReduceTasks(0) to disable the reducer stage.
The complete code is here (I know the algorithm for checking whether a number is prime is naive, but it works):

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class CalcPrime {
  public static final String SPLITS_NUM = "calcprime.splits.num";
  public static final String MAX_RANGE = "calcprime.range.max";
  public static final long DEFAULT_SLITS = 200;
  public static final long DEFAULT_MAX = 10000;
  public static class NumberInputFormat
      extends InputFormat {
      static class NumberInputSplit extends InputSplit implements Writable {
        long first;
        long count;
        public NumberInputSplit() {}
        public NumberInputSplit(long offset, long length) {
          first = offset;
          count = length;
        }
        public long getLength() throws IOException {
          return 0;
        }
        public String[] getLocations() throws IOException {
          return new String[]{};
        }
        public void readFields(DataInput in) throws IOException {
          first = WritableUtils.readVLong(in);
          count = WritableUtils.readVLong(in);
        }
        public void write(DataOutput out) throws IOException {
          WritableUtils.writeVLong(out, first);
          WritableUtils.writeVLong(out, count);
        }
      }
      static class NumberRecordReader
          extends RecordReader {
          long first;
          long count;
          long current;
          public NumberRecordReader() {}
          public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
            first = ((NumberInputSplit)split).first;
            count = ((NumberInputSplit)split).count;
            current = first;
          }
          public void close() throws IOException {}
          public LongWritable getCurrentKey() {
            return new LongWritable(current);
          }
          public NullWritable getCurrentValue() {
            return NullWritable.get();
          }
          public float getProgress() throws IOException {
            return current / (float) count;
          }
          public boolean nextKeyValue() {
            if (current >= (count + first)) {
              return false;
            }
            current++;
            return true;
          }
      }
      public RecordReader
        createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException {
          return new NumberRecordReader();
        }
      public List getSplits(JobContext job) {
        List splits = new ArrayList();
        long splitsNum = getSplitsNum(job);
        long maxRange = getMaxRange(job);
        for (int start = 0; start < splitsNum; ++start) {
          splits.add(new NumberInputSplit(start * maxRange, maxRange));
        }
        return splits;
      }
      public long getSplitsNum(JobContext job) {
        return job.getConfiguration().getLong(SPLITS_NUM, DEFAULT_SLITS);
      }
      public long getMaxRange(JobContext job) {
        return job.getConfiguration().getLong(MAX_RANGE, DEFAULT_MAX);
      }
  }
  public static class NumberMapper
    extends Mapper {
    public void map(LongWritable key, NullWritable value, Context context)
            throws IOException, InterruptedException {
            long lkey = key.get();
            if (lkey == 1) {
              return;
            }
            if (lkey == 2 || lkey == 3) {
              context.write(key, value);
              return;
            }
            long end = lkey / 2;
            for (int i = 2; i <= end; i++) {
              if (lkey % i == 0) {
                return;
              }
            }
            context.write(key, value);
    }
  }
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "Calc Prime");
    long splitsNum = DEFAULT_SLITS;
    long maxRange = DEFAULT_MAX;
    if (args.length > 1) {
      splitsNum = Long.parseLong(args[1]);
    }
    if (args.length > 2) {
      maxRange = Long.parseLong(args[2]);
    }
    job.getConfiguration().setLong(SPLITS_NUM, splitsNum);
    job.getConfiguration().setLong(MAX_RANGE, maxRange);
    FileOutputFormat.setOutputPath(job, new Path(args[0]));
    job.setJarByClass(CalcPrime.class);
    job.setMapperClass(NumberMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setInputFormatClass(NumberInputFormat.class);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Copy the code to file CalcPrime.java, compile and run it:

# Run all commands from the directory that contains CalcPrime.java.
/usr/local/hadoop-2.7.2/bin/hadoop com.sun.tools.javac.Main CalcPrime.java
jar cf prime.jar CalcPrime*.class
# Arguments: output path, number of mapper tasks (400), numbers processed by each mapper (1000000).
# The jar is the one just created in the current directory.
/usr/local/hadoop-2.7.2/bin/hadoop jar prime.jar CalcPrime /result 400 1000000

Some problems and solutions when deploying and running Hadoop-2.7.2

1. If we see this error report:

Container XXX is running beyond virtual memory limits

The solution is that the Java heap size should not be bigger than the map/reduce memory. Cloudera recommends setting the heap size to about 0.8 of the map/reduce memory, such as:


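<property>
    <name>mapreduce.map.memory.mb</name>
    <value>4096</value>
</property>
<property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>8192</value>
</property>
<property>
    <name>mapreduce.map.java.opts</name>
    <value>-Xmx3276m</value>
</property>
<property>
    <name>mapreduce.reduce.java.opts</name>
    <value>-Xmx6553m</value>
</property>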

2. The directory “/tmp/” becomes full.
This is usually caused by spilled data from map output. This article gives an overview of the whole Map/Reduce algorithm in Hadoop with a detailed and clear picture.
As a result, my solution is to add this configuration:


<property>
    <name>hadoop.tmp.dir</name>
    <value>/data1/tmp/,/data2/tmp/,/data3/tmp/</value>
</property>

to core-site.xml, so the inevitable spill data will be written to different disks for load balancing.
3. Don’t assign more than 0.8 of the physical memory to “yarn.nodemanager.resource.memory-mb”, or jobs will fail unexpectedly.
4. If we launch more map or reduce tasks than the servers have physical cores, it may lead to a tremendous number of timeouts for these tasks. Therefore, adjust “mapreduce.map.memory.mb” and “mapreduce.reduce.memory.mb” carefully to limit the number of concurrent map/reduce tasks.
5. If you notice that all the CPU cores in the Hadoop cluster are fully used, that does not mean no further optimization is possible. By using perf, I found that the system wasted too much time launching and stopping Java tasks (or containers):

So I changed the value of “mapreduce.input.fileinputformat.split.minsize” to 8GB to reduce the number of mappers. After decreasing the number of mappers from thousands to hundreds, the running time of the Terasort program dropped by more than 50% (the system's context switches also fell from tens of thousands per second to thousands). Therefore, keeping the number of Java tasks close to the number of physical CPU cores is a better solution.

From scala Array[String] / Seq[String] to java varargs

While testing the performance of Redis these days, I needed to use the mset() interface of Jedis (a Java Redis client). But the prototype of mset() in Jedis is:

@Override
  public String mset(final String... keysvalues) {

At first I wrote my Scala code like this:

// Build a flat array of alternating keys and values: key1, value1, key2, value2.
var array = Array[String]()
array = array:+key1:+value1
array = array:+key2:+value2
// Passing the Array[String] itself is rejected by the compiler: mset is overloaded
// for String* and Array[Byte]*, and neither alternative accepts an Array[String].
jedis.mset(array)

But it reported compile errors:

[error] xxx: overloaded method value mset with alternatives:
[error]   (x$1: String*)String 
[error]   (x$1: Array[Byte]*)String
[error]  cannot be applied to (Array[String])
[error]                                 jedis.mset(array)
[error]                                       ^
[error] one error found
[error] (compile:compileIncremental) Compilation failed
[error] Total time: 4 s, completed Jan 8, 2016 11:32:47 AM

After searching many documents about Scala/Java on Google, I finally found the answer: http://docs.scala-lang.org/style/types.html. So, let’s write the code this way:

// The `: _*` type ascription passes the array's elements as the varargs arguments.
jedis.mset(array:_*)

Now the Scala Array[String] is passed to Java as varargs. This also works for Seq[String].

Robin on Linux

java