知识点
相关文章
更多最近更新
更多Hadoop实现共同出现的单词(Word co-occurrence)
2019-03-28 12:57|来源: 网络
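共同出现的单词(Word co-occurrence)是指在一个句子中相邻的两个单词,每一对相邻的单词就是一个 Co-Occurrence 对。本文用 Hadoop 统计每个 Co-Occurrence 对出现的次数。单词对不区分大小写,也不区分先后顺序:单词统一转为小写,并按字典序把较小的单词放在前面(例如 "new broil" 记为 broil:new)。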
Sample Input:
a b cc, c d d c
I Love U.
dd ee f g s sa dew ad da
So shaken as we are, so wan with care.
Find we a time for frighted peace to pant.
And breathe short-winded accents of new broil.
To be commenced in strands afar remote.
I Love U U love i.
i i i i
Sample Output:
a:b 1
a:time 1
a:we 1
accents:of 1
accents:short-winded 1
ad:da 1
ad:dew 1
afar:remote 1
afar:strands 1
and:breathe 1
are:so 1
are:we 1
as:shaken 1
as:we 1
b:cc 1
be:commenced 1
be:to 1
breathe:short-winded 1
broil:new 1
c:cc 1
c:d 2
care:with 1
commenced:in 1
d:d 1
dd:ee 1
dew:sa 1
ee:f 1
f:g 1
find:we 1
for:frighted 1
for:time 1
frighted:peace 1
g:s 1
i:i 3
i:love 3
in:strands 1
love:u 3
new:of 1
pant:to 1
peace:to 1
s:sa 1
shaken:so 1
so:wan 1
u:u 1
wan:with 1
Code:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;
public class CoOccurrence {
/**
 * An unordered pair of words, used as the MapReduce key.
 *
 * <p>The two words are always stored so that {@code first <= second} (by
 * {@link String#compareTo}), which makes {@code (a, b)} and {@code (b, a)} the same key.
 * The pair is serialized as the first {@link Text} followed by the second.
 */
public static class TextPair implements WritableComparable<TextPair> {
    private Text first;
    private Text second;

    /** Creates a pair of two empty words; the contents are then filled in by {@link #readFields}. */
    public TextPair() {
        set(new Text(), new Text());
    }

    /**
     * Creates a pair from two strings.
     *
     * @param left  one word of the pair
     * @param right the other word of the pair
     */
    public TextPair(String left, String right) {
        set(new Text(left), new Text(right));
    }

    /**
     * Creates a pair from two {@link Text} objects. The objects are stored by reference,
     * not copied; see {@link #set(Text, Text)}.
     *
     * @param left  one word of the pair
     * @param right the other word of the pair
     */
    public TextPair(Text left, Text right) {
        set(left, right);
    }

    /**
     * Stores the two words in normalized order (smaller word first).
     *
     * <p>The arguments are kept by reference. If a caller later mutates them, this pair
     * changes too and may no longer be in normalized order, so serialize the pair before
     * reusing the {@link Text} buffers.
     *
     * @param left  one word of the pair
     * @param right the other word of the pair
     */
    public void set(Text left, Text right) {
        // Normalize the order so that (a, b) and (b, a) collapse into one key.
        if (left.toString().compareTo(right.toString()) <= 0) {
            this.first = left;
            this.second = right;
        } else {
            this.first = right;
            this.second = left;
        }
    }

    /** @return the lexicographically smaller word of the pair */
    public Text getFirst() {
        return first;
    }

    /** @return the lexicographically larger word of the pair */
    public Text getSecond() {
        return second;
    }

    /**
     * Reads both words, first then second, in the layout produced by {@link #write}.
     *
     * @param in the stream to read from
     * @throws IOException if reading either word fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
    }

    /**
     * Writes the first word followed by the second word.
     *
     * @param out the stream to write to
     * @throws IOException if writing either word fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
    }

    /** Combines the hashes of both words; consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        // 163 is just an odd multiplier that keeps (a, b) and (b, a)-style inputs apart.
        return first.hashCode() * 163 + second.hashCode();
    }

    /** Two pairs are equal when both their first and their second words are equal. */
    @Override
    public boolean equals(Object o) {
        if (o instanceof TextPair) {
            TextPair tp = (TextPair) o;
            return first.equals(tp.first) && second.equals(tp.second);
        }
        return false;
    }

    /** @return the pair formatted as {@code first:second} */
    @Override
    public String toString() {
        return first + ":" + second;
    }

    /** Orders pairs by the first word, then by the second word. */
    @Override
    public int compareTo(TextPair tp) {
        int cmp = first.compareTo(tp.first);
        if (cmp != 0) {
            return cmp;
        }
        return second.compareTo(tp.second);
    }

    /**
     * A comparator that compares serialized {@code TextPair} records without
     * deserializing them.
     */
    public static class Comparator extends WritableComparator {
        private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

        public Comparator() {
            super(TextPair.class);
        }

        /**
         * Compares two serialized pairs: first word, then second word.
         *
         * @param b1 buffer holding the first record
         * @param s1 offset of the first record in {@code b1}
         * @param l1 length in bytes of the first record
         * @param b2 buffer holding the second record
         * @param s2 offset of the second record in {@code b2}
         * @param l2 length in bytes of the second record
         * @return negative, zero or positive as the first record sorts before, equal to
         *         or after the second
         * @throws IllegalArgumentException if a length prefix cannot be decoded
         */
        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            try {
                // Serialized size of the first word = size of its vint length prefix
                // plus the length value stored in that prefix.
                int firstl1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
                int firstl2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
                int cmp = TEXT_COMPARATOR.compare(b1, s1, firstl1, b2, s2, firstl2);
                if (cmp != 0) {
                    return cmp;
                }
                // The remainder of each record is its second word. The second record's
                // remainder must be measured against its own total length l2.
                return TEXT_COMPARATOR.compare(b1, s1 + firstl1, l1 - firstl1,
                                               b2, s2 + firstl2, l2 - firstl2);
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }
    }//End of Comparator

    static { // register this comparator as the default for TextPair
        WritableComparator.define(TextPair.class, new Comparator());
    }

    /**
     * Compares only the first word of two serialized pairs, so that pairs sharing a
     * first word compare as equal (useful as a grouping comparator).
     */
    public static class FirstComparator extends WritableComparator {
        private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

        public FirstComparator() {
            super(TextPair.class);
        }

        /**
         * Compares the first words of two serialized pairs; the second words are ignored.
         *
         * @throws IllegalArgumentException if a length prefix cannot be decoded
         */
        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            try {
                int firstl1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
                int firstl2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
                return TEXT_COMPARATOR.compare(b1, s1, firstl1, b2, s2, firstl2);
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }
    }//End of FirstComparator
}//End of TextPair
/**
 * Partitions keys by the first word of the pair, so that all pairs sharing a first
 * word are sent to the same partition.
 */
public static class FirstPartitioner extends Partitioner<TextPair,IntWritable>{
    /**
     * @param key           the pair being partitioned; only its first word is used
     * @param value         the count associated with the pair (ignored)
     * @param numPartitions the number of partitions
     * @return a partition index in {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(TextPair key, IntWritable value, int numPartitions) {
        // Mask off the sign bit instead of using Math.abs, which stays negative for
        // Integer.MIN_VALUE.
        return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}//End of FirstPartitioner
/**
 * Emits {@code (TextPair, 1)} for every pair of adjacent words on an input line.
 *
 * <p>Every character other than ASCII letters, digits, {@code -} and {@code '} is
 * treated as a word separator, and words are lower-cased before pairing.
 */
public static class MyMapper extends Mapper<LongWritable, Text, TextPair, IntWritable> {
    private static final IntWritable one = new IntWritable(1);
    // Per-instance reusable buffers; each pair is written out before they are reused.
    private final Text word0 = new Text();
    private final Text word1 = new Text();
    private String pattern = "[^a-zA-Z0-9-']";

    /**
     * @param inKey   the input key for the line (unused)
     * @param inValue one line of input text
     * @param context receives one {@code (pair, 1)} record per adjacent word pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void map(LongWritable inKey, Text inValue, Context context) throws IOException, InterruptedException {
        String line = inValue.toString().replaceAll(pattern, " ");
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. "I" must become "i" even under a Turkish locale).
        line = line.toLowerCase(java.util.Locale.ROOT);

        // Tokenizing skips leading, trailing and repeated spaces, so no empty
        // "word" is ever paired with a real one.
        StringTokenizer tokens = new StringTokenizer(line, " ");
        if (!tokens.hasMoreTokens()) {
            return;
        }
        String previous = tokens.nextToken();
        while (tokens.hasMoreTokens()) {
            String current = tokens.nextToken();
            word0.set(previous);
            word1.set(current);
            context.write(new TextPair(word0, word1), one);
            previous = current;
        }
    }
}//End of MyMapper
/**
 * Adds up all counts received for a word pair and emits the pair with its total.
 * Input and output types are identical, so the class can also act as a combiner.
 */
public static class MyReducer extends Reducer<TextPair, IntWritable, TextPair, IntWritable> {
    private IntWritable total = new IntWritable();

    /**
     * @param inKey    the word pair being reduced
     * @param inValues the partial counts collected for that pair
     * @param context  receives a single {@code (pair, total)} record
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void reduce(TextPair inKey, Iterable<IntWritable> inValues, Context context) throws IOException, InterruptedException {
        total.set(sumOf(inValues));
        context.write(inKey, total);
    }

    /** Returns the sum of all the given counts. */
    private static int sumOf(Iterable<IntWritable> counts) {
        int acc = 0;
        for (IntWritable count : counts) {
            acc = acc + count.get();
        }
        return acc;
    }
}//End of MyReducer
/**
 * Configures and runs the co-occurrence counting job.
 *
 * <p>Generic Hadoop options (for example {@code -D key=value}) are consumed first; the
 * two remaining arguments are the input path and the output path. The process exits
 * with 0 when the job succeeds, 1 when it fails, and 2 on a usage error.
 *
 * @param args generic Hadoop options followed by {@code <in> <out>}
 * @throws Exception if the job cannot be set up or run
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Strip generic Hadoop options; what is left must be exactly <in> and <out>.
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: CoOccurrence <in> <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "Co-Occurrence");
    job.setJarByClass(CoOccurrence.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(TextPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Summing is associative and the reducer's input/output types match,
    // so the reducer doubles as the combiner.
    job.setCombinerClass(MyReducer.class);

    // Optional: partition and group by the first word of the pair.
    //job.setPartitionerClass(FirstPartitioner.class);
    //job.setGroupingComparatorClass(TextPair.FirstComparator.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(IntWritable.class);

    // Use the parsed arguments, not raw args, so generic options are never
    // mistaken for paths.
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}//End of main
}//End of CoOccurrence
更多Hadoop相关信息见Hadoop 专题页面 http://www.linuxidc.com/topicnews.aspx?tid=13
相关问答
更多-
以下是如何使用SciPy的COO格式从一组文档构建文档术语矩阵A ,这是易用性和效率之间的良好折衷(*): vocabulary = {} # map terms to column indices data = [] # values (maybe weights) row = [] # row (document) indices col = [] # column (term) indices for i, doc in enumerate(docum ...
-
我不知道是否有一种很好的方法可以在不迭代列的情况下执行此操作。 我想我会做一些直截了当的事情: np.random.seed(13) df=pd.DataFrame(np.random.choice([np.nan,1,2],9).reshape([3,3]), columns=list('abc')) a b c 0 2.0 NaN 2.0 1 NaN 2.0 2.0 2 NaN 1.0 NaN cols = df.columns for i in cols: ...
-
你应该给jar jar路径,包括Cooccurrence类。 Jar位于目标文件夹(cooc-1.0-SNAPSHOT.jar)中。 您不需要指定在清单文件中设置的类名 I actually managed to run the programm. My approach wasn't that wrong, as tokiloutok mentioned I had to include the right jar file. Before I could execute the command I ha ...
-
原因是您实际上没有从您的map返回任何记录。 使用yield返回for的记录,如下所示: val coTerm = words.map{ line => for{ i <-0 until line.length j <- (i+1) until line.length } yield { ((line(i), line(j)), 1) }} The cause is you are not actually returning any records from you map ...
-
显然这可以为你的目的而扩展,但它执行一般操作: import math for a in 'ABCD': for b in 'ABCD': count = 0 for x in document: if a != b: if a in x and b in x: count += 1 else: n = x.co ...
-
首先,您的输入csv文件实际上不是csv。 它更像是一个可以使用str.split解析的文件。 好。 现在,我将获得令牌并使用itertools.groupby使用第一列作为关键字来对具有相同第一列的项目进行分组。 完成后,使用一个项目过滤掉列表,然后对其余项目应用组合。 写为正确的csv文件: import csv, itertools with open("test.csv") as f: with open("output.csv","w",newline="") as f2 ...
-
select t1.col2 as item_A ,t2.col2 as item_B ,count(*) as cnt from mytable t1 join mytable t2 on t1.col1 = t2.col1 where t1.col2 < t2.col2 group by t1.col2 ,t2.col2 +-- ...
-
它给出了正确的结果,因为你在L2第一项中有car和dog是0索引。 这是一个更加pythonic的方法,根据L2第一次出现的对来获取索引: In [158]: L2 = ['cat car dog', 'cat house dog', 'cat car', 'cat dog'] In [159]: L2 = [s.split() for s in L2] In [160]: combinations = np.column_stack((np.repeat(L1, 5), np.tile(L1, 5)) ...
-
这是一个使用itertools.product的解决方案。 这应该比接受的解决方案明显更好(如果这是一个问题)。 from itertools import product from operator import mul L1 = ['b', 'c', 'd', 'e', 't', 'w', 'x', 'y', 'z'] L2 = ['the onion', 'be your self', 'great zoo', 'x men', 'corn day'] phrase_map = {} for p ...
-
计算向量的共现(Calculate the co-occurrence of a vector)[2023-01-14]
让我们看看这是否是你需要的: a=unique(x); Coocurrence=zeros(length(a)); for ii=1:length(a) Coocurrence(ii)=sum(x==a(ii)); end 或矢量化解决方案 a=unique(x); Coocurrence=sum(bsxfun(@eq,x,a'),2); Let's see if this is what you need: a=unique(x); Coocurrence=zeros(length(a) ...