同一种搜索词下，哪个网站被用户访问得最多
作者:互联网
同一种搜索词下，哪个网站被用户访问得最多
FourMapper.java
package com.hniu.bigdata.hadoop.Four;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Mapper of the "most visited site per search term" job.
 *
 * <p>Every input line is split on single whitespace characters. Field 0 is used as the
 * search term (and is also stored as the bean's user id), field 4 is the visited URL and
 * the last field is parsed as a timestamp in epoch milliseconds. For each record dated in
 * 2019 (in the JVM's default time zone) the mapper emits the key
 * {@code <field0>_<domain>} together with a {@link FourSortBean} carrying a click count of 1.
 *
 * <p>Malformed records are skipped and counted in the {@code FourMapper} counter group
 * instead of failing the whole task.
 */
public class FourMapper extends Mapper<LongWritable, Text, Text, FourSortBean> {

    /** Position of the URL inside a split record. */
    private static final int URL_FIELD = 4;

    /** Prefix of a {@code yyyy-MM-dd} date that belongs to the year being analysed. */
    private static final String YEAR_PREFIX = "2019-";

    /** Counter group used to report skipped records. */
    private static final String COUNTER_GROUP = "FourMapper";

    /**
     * Day formatter, built once per mapper instance rather than once per record.
     * Kept as an instance field (not static) because SimpleDateFormat is not thread-safe.
     */
    private final SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");

    /**
     * Turns one log line into a {@code (term_domain, bean)} pair.
     *
     * @param key     byte offset of the line in the input split (unused)
     * @param value   the raw log line
     * @param context Hadoop context receiving the output pair and the skip counters
     * @throws IOException          if the framework fails to write the output pair
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\\s");
        if (fields.length <= URL_FIELD) {
            // Blank or truncated line: there is no URL field to read.
            context.getCounter(COUNTER_GROUP, "TOO_FEW_FIELDS").increment(1);
            return;
        }

        // Date handling: the last field is a millisecond timestamp.
        long timestamp;
        try {
            timestamp = Long.parseLong(fields[fields.length - 1]);
        } catch (NumberFormatException e) {
            context.getCounter(COUNTER_GROUP, "BAD_TIMESTAMP").increment(1);
            return;
        }
        String day = dayFormat.format(new Date(timestamp));
        if (!day.startsWith(YEAR_PREFIX)) {
            return;
        }

        // Domain extraction: keep everything before the first '/'.
        String[] urlParts = fields[URL_FIELD].split("/");
        if (urlParts.length == 0) {
            // A URL made only of slashes splits into an empty array.
            context.getCounter(COUNTER_GROUP, "EMPTY_URL").increment(1);
            return;
        }
        String domain = urlParts[0];
        String keyword = fields[0];

        FourSortBean data = new FourSortBean();
        data.setUser_id(keyword);
        data.setDomain(domain);
        data.setTotal_click(1);
        context.write(new Text(keyword + "_" + domain), data);
    }
}
FourReducer.java
package com.hniu.bigdata.hadoop.Four;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
 * Reducer of the "most visited site per search term" job.
 *
 * <p>Each call to {@link #reduce} receives all beans of one {@code term_domain} key and sums
 * their click counts. For every search term only the domain with the highest total is kept,
 * and at most {@value #TOP_N} terms are retained in memory. The retained entries are written
 * in {@link #cleanup}, ordered by click count descending (ties by term ascending).
 *
 * <p>The ranking is local to one reducer instance; a global top list therefore requires the
 * job to run with a single reduce task.
 */
public class FourReducer extends Reducer<Text, FourSortBean, Text, FourSortBean> {

    /** Maximum number of search terms written by this reducer. */
    private static final int TOP_N = 10;

    /**
     * Best (most clicked) domain seen so far for each retained search term.
     * Keyed by the term itself so that two terms with equal click counts never collide.
     */
    private final TreeMap<String, FourSortBean> bestByKeyword = new TreeMap<String, FourSortBean>();

    /**
     * Aggregates the clicks of one {@code term_domain} group and updates the running top list.
     *
     * @param key     the {@code term_domain} key produced by the mapper
     * @param values  the beans emitted for that key
     * @param context Hadoop context (nothing is written here; output happens in cleanup)
     * @throws IOException          declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void reduce(Text key, Iterable<FourSortBean> values, Context context) throws IOException, InterruptedException {
        FourSortBean result = new FourSortBean();
        int totalClick = 0;
        for (FourSortBean data : values) {
            totalClick += data.getTotal_click();
            result.setUser_id(data.getUser_id());
            result.setDomain(data.getDomain());
        }
        result.setTotal_click(totalClick);

        // The mapper stores the search term in the bean's user id; reading it from there
        // avoids re-parsing the key, which breaks when the term itself contains '_'.
        String keyword = result.getUser_id();
        if (keyword == null) {
            // No value carried a term: nothing to rank.
            return;
        }

        FourSortBean best = bestByKeyword.get(keyword);
        if (best == null || totalClick > best.getTotal_click()) {
            bestByKeyword.put(keyword, result);
        }
        if (bestByKeyword.size() > TOP_N) {
            bestByKeyword.remove(findWeakestKeyword());
        }
    }

    /**
     * Writes the retained search terms, most clicked first.
     *
     * @param context Hadoop context receiving the {@code (term, bean)} pairs
     * @throws IOException          if the framework fails to write an output pair
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        List<Map.Entry<String, FourSortBean>> ranked =
                new ArrayList<Map.Entry<String, FourSortBean>>(bestByKeyword.entrySet());
        // Collections.sort is stable, so equal counts stay in ascending term order.
        Collections.sort(ranked, new Comparator<Map.Entry<String, FourSortBean>>() {
            @Override
            public int compare(Map.Entry<String, FourSortBean> left, Map.Entry<String, FourSortBean> right) {
                return Integer.compare(right.getValue().getTotal_click(), left.getValue().getTotal_click());
            }
        });
        for (Map.Entry<String, FourSortBean> entry : ranked) {
            context.write(new Text(entry.getKey()), entry.getValue());
        }
    }

    /**
     * Finds the retained term with the lowest click count; among equal counts the
     * lexicographically last term is chosen, mirroring the output order of cleanup.
     */
    private String findWeakestKeyword() {
        String weakest = null;
        int weakestClicks = Integer.MAX_VALUE;
        for (Map.Entry<String, FourSortBean> entry : bestByKeyword.entrySet()) {
            int clicks = entry.getValue().getTotal_click();
            if (weakest == null || clicks <= weakestClicks) {
                weakest = entry.getKey();
                weakestClicks = clicks;
            }
        }
        return weakest;
    }
}
FourSortBean.java
package com.hniu.bigdata.hadoop.Four;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class FourSortBean implements WritableComparable<FourSortBean> {
private String User_id;
private String domain;
private int total_click;
public FourSortBean(){}
public FourSortBean(String User_id, String domain, int total_click) {
this.User_id = User_id;
this.domain = domain;
this.total_click = total_click;
}
public String getUser_id() {
return User_id;
}
public void setUser_id(String user_id) {
this.User_id = user_id;
}
public String getDomain() {
return domain;
}
public void setDomain(String domain) {
this.domain = domain;
}
public int getTotal_click() {
return total_click;
}
public void setTotal_click(int total_click) {
this.total_click = total_click;
}
public int compareTo(FourSortBean o) {
return total_click > o.getTotal_click() ? -1 :(total_click == o.getTotal_click() ? 0 :1);
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(User_id);
dataOutput.writeUTF(domain);
dataOutput.writeInt(total_click);
}
public void readFields(DataInput dataInput) throws IOException {
User_id = dataInput.readUTF();
domain = dataInput.readUTF();
total_click = dataInput.readInt();
}
@Override
public String toString() {
return "{" +
"User_id='" + User_id + '\'' +
", domain='" + domain + '\'' +
", total_click=" + total_click +
'}';
}
}
FourDriver.java
package com.hniu.bigdata.hadoop.Four;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Entry point that configures and submits the "most visited site per search term" job.
 *
 * <p>Usage: {@code FourDriver [inputPath [outputPath]]}. When an argument is omitted the
 * original defaults ({@code /xyz} and {@code /Four_Data}) are used. The process exit code
 * is 0 when the job succeeds and 1 when it fails.
 */
public class FourDriver {

    /** Input path used when no first argument is given. */
    private static final String DEFAULT_INPUT = "/xyz";

    /** Output path used when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "/Four_Data";

    /**
     * Builds the job, waits for it to finish and exits with a status reflecting the outcome.
     *
     * @param args optional input path and output path, in that order
     * @throws Exception if the job cannot be created, submitted or monitored
     */
    public static void main(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS","hdfs://192.168.179.46:8020");

        // NOTE(review): the job name looks like a leftover from a word-count template;
        // it is kept unchanged in case external tooling matches on it.
        Job job = Job.getInstance(configuration, "word count");
        job.setJarByClass(FourDriver.class);

        job.setMapperClass(FourMapper.class);
        job.setReducerClass(FourReducer.class);
        // FourReducer keeps its top list in memory per reducer instance, so the ranking
        // is only global when exactly one reduce task runs.
        job.setNumReduceTasks(1);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FourSortBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FourSortBean.class);

        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Propagate job failure to the caller instead of always exiting with 0.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
标签:String,搜索词,用户,public,访问,id,import,total,click 来源: https://blog.csdn.net/study_46/article/details/112384091