import java.io.IOException;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
/**
* RCFileInputFormat.
*
* @param <K>
* @param <V>
*/
public class RCFileInputFormat<K extends LongWritable, V extends BytesRefArrayWritable>
    extends FileInputFormat<K, V> {

  /** Creates the input format; no state is set up here. */
  public RCFileInputFormat() {
  }

  /**
   * Supplies a fresh {@code RCFileRecordReader}.
   *
   * <p>Neither argument is inspected by this method; the reader is constructed
   * with its no-argument constructor.
   *
   * @param split the input split the reader is requested for (unused here)
   * @param context the task attempt context (unused here)
   * @return a newly constructed {@code RCFileRecordReader}
   * @throws IOException declared by the overridden method
   * @throws InterruptedException declared by the overridden method
   */
  @SuppressWarnings("unchecked")
  @Override
  public org.apache.hadoop.mapreduce.RecordReader<K, V> createRecordReader(
      org.apache.hadoop.mapreduce.InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    // The reader is built as a raw type, so narrowing it to RecordReader<K, V>
    // is an unchecked conversion; that is what the suppression above covers.
    final RCFileRecordReader reader = new RCFileRecordReader();
    return reader;
  }
}

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.RCFile.Reader;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;/**
* RCFileRecordReader.
*
* @param <K>
* @param <V>
*/
public class RCFileRecordReader<K extends LongWritable, V extends BytesRefArrayWritable>
    extends RecordReader<LongWritable, BytesRefArrayWritable> {

  /** Underlying RCFile reader; {@code null} until {@link #initialize} has opened it. */
  private Reader in;

  /** Reader position recorded at the end of {@link #initialize}. */
  private long start;

  /** Split start plus split length, as computed in {@link #initialize}. */
  private long end;

  /** Whether another row may still be available. */
  private boolean more = true;

  /** Key object, created lazily and reused across calls to {@link #nextKeyValue}. */
  private LongWritable key = null;

  /** Value object, created lazily and reused across calls to {@link #nextKeyValue}. */
  private BytesRefArrayWritable value = null;

  /** Configuration taken from the task attempt context in {@link #initialize}. */
  protected Configuration conf;

  /**
   * Return the progress within the input split.
   *
   * <p>Returns 0 when the split is empty ({@code end == start}) or when the
   * underlying reader has not been opened yet, instead of failing with a
   * {@code NullPointerException}.
   *
   * @return 0.0 to 1.0 of the input byte range
   * @throws IOException if the reader position cannot be obtained
   */
  @Override
  public float getProgress() throws IOException {
    if (in == null || end == start) {
      return 0.0f;
    }
    return Math.min(1.0f, (in.getPosition() - start) / (float) (end - start));
  }

  /**
   * Closes the underlying reader, if one was opened.
   *
   * <p>Safe to call when {@link #initialize} was never invoked or failed before
   * the reader was assigned.
   *
   * @throws IOException if closing the reader fails
   */
  @Override
  public void close() throws IOException {
    if (in != null) {
      in.close();
    }
  }

  /**
   * @return the reusable key object, or {@code null} before the first call to
   *     {@link #nextKeyValue}
   */
  @Override
  public LongWritable getCurrentKey() throws IOException,
      InterruptedException {
    return key;
  }

  /**
   * @return the reusable value object, or {@code null} before the first call to
   *     {@link #nextKeyValue}
   */
  @Override
  public BytesRefArrayWritable getCurrentValue() throws IOException,
      InterruptedException {
    return value;
  }

  /**
   * Opens an {@code RCFile.Reader} on the split's file and records the byte
   * range this reader is responsible for.
   *
   * @param split the split to read; must be a {@link FileSplit}
   * @param context supplies the {@link Configuration} used to resolve the file system
   * @throws IOException if the file system or the reader cannot be opened
   */
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    conf = context.getConfiguration();
    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new RCFile.Reader(fs, path, conf);
    this.end = fileSplit.getStart() + fileSplit.getLength();
    if (fileSplit.getStart() > in.getPosition()) {
      in.sync(fileSplit.getStart()); // sync to start
    }
    // Progress and the "anything left?" check are measured from wherever the
    // reader actually ended up, not from the nominal split start.
    this.start = in.getPosition();
    more = start < end;
  }

  /**
   * Advances to the next row.
   *
   * <p>Stops, returning {@code false}, when the reader has no further key or
   * when the reader's last seen sync position is at or past {@code end}
   * (presumably meaning the row belongs to a following split; confirm against
   * {@code RCFile.Reader}).
   *
   * @return {@code true} if a row was loaded into the current key and value
   * @throws IOException if reading from the underlying file fails
   */
  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!more) {
      return false;
    }
    if (key == null) {
      key = new LongWritable();
    }
    if (value == null) {
      value = new BytesRefArrayWritable();
    }
    more = in.next(key);
    if (!more) {
      return false;
    }
    long lastSeenSyncPos = in.lastSeenSyncPos();
    if (lastSeenSyncPos >= end) {
      more = false;
      return false;
    }
    in.getCurrentRow(value);
    return true;
  }
}

应用方式：

job.setInputFormatClass(RCFileInputFormat.class);

public static class Map extends Mapper<LongWritable, BytesRefArrayWritable, Text, NullWritable> {

@Override

protected void map(LongWritable key, BytesRefArrayWritable value, Context context) throws IOException, InterruptedException {

String top = new String(value.get(32).getBytesCopy());

byte[] channel = value.get(12).getBytesCopy();

知识点

相关文章

最近更新

Mapreduce中的RCFile输入RCFileInputFormat实现及其应用

相关问答

Hadoop MapReduce通过多个输入(Hadoop MapReduce over multiple inputs)[2023-05-29]

map中的mapreduce - gzip输入文件(mapreduce in java - gzip input files)[2023-08-01]

PyMongo中的MapReduce(MapReduce in PyMongo)[2023-05-31]

如何检测mapreduce中的错误(How to detect an error in mapreduce)[2023-07-01]

如何阅读RCFile(How to read in a RCFile)[2021-12-02]

mongodb mapReduce中的作用域和查询有什么区别？(What is the difference between scope and query in mongodb mapReduce?)[2023-11-27]

如何在Hadoop中迭代MapReduce？(How to iterate MapReduce in Hadoop? (lang: python))[2022-04-06]

防止MapReduce程序中的输入拆分(Prevent Input splitting in MapReduce program)[2023-11-16]

在mapreduce中处理的文件(Files processed in mapreduce)[2023-06-18]

Spark中的mapreduce参数(mapreduce parameters in Spark)[2024-02-12]