其他分享
首页 > 其他分享> > 同一种搜索词,哪个网站被用户访问的最多

同一种搜索词,哪个网站被用户访问的最多

作者:互联网

同一种搜索词,哪个网站被用户访问的最多

FourMapper.java

package com.hniu.bigdata.hadoop.Four;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class FourMapper extends Mapper<LongWritable, Text,Text, FourSortBean> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String line = value.toString();
        // 日期数据处理
        String[] values = line.split("\\s");
        String time = values[values.length - 1];
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        String dateTime = format.format(new Date(Long.parseLong(time)));
        // 域名切分
        String url = values[4];
        values[values.length - 1] = dateTime;
        String domain = url.split("/")[0];
        values[4] = domain;
        line = StringUtils.join(" ",values);
        String keyStr =values[0] +"_"+domain;
        if(dateTime.contains("2019")) {
            FourSortBean data = new FourSortBean();
            data.setUser_id(values[0]);
            data.setDomain(domain);
            data.setTotal_click(1);
            context.write(new Text(keyStr), data);
        }
    }
}

FourReducer.java

package com.hniu.bigdata.hadoop.Four;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.TreeMap;

public class FourReducer extends Reducer<Text, FourSortBean, Text, FourSortBean> {

    private TreeMap<FourSortBean,String> topTen = new TreeMap<FourSortBean, String>();
    @Override
    protected void reduce(Text key, Iterable<FourSortBean> values, Context context) throws IOException, InterruptedException {

        FourSortBean result = new FourSortBean();
        int total_click = 0;
        for (FourSortBean data : values){
            total_click += data.getTotal_click();
            result.setUser_id(data.getUser_id());
            result.setDomain(data.getDomain());
        }
        result.setTotal_click(total_click);
        // context.write(key,new IntWritable(total_click));
        String keywords = key.toString().split("_")[0];
        if (topTen.values().contains(keywords)){
            int index=new ArrayList<String>(topTen.values()).indexOf(keywords);
            FourSortBean tmpData= (FourSortBean) topTen.keySet().toArray()[index];
            if(total_click > tmpData.getTotal_click()){
                topTen.remove(tmpData);
                topTen.put(result,keywords);
            }
        }else {
            topTen.put(result,keywords);
        }
        if (topTen.size()>10){
            topTen.remove(topTen.lastKey());
        }
        //context.write(key,new IntWritable(total_click));
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (Map.Entry<FourSortBean,String> entry:topTen.entrySet()){
            context.write(new Text(entry.getValue()),entry.getKey());
        }
    }
}

FourSortBean.java

package com.hniu.bigdata.hadoop.Four;


import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class FourSortBean implements WritableComparable<FourSortBean> {
    private String User_id;
    private String  domain;
    private int  total_click;


    public FourSortBean(){}

    public FourSortBean(String User_id, String domain, int total_click) {
        this.User_id = User_id;
        this.domain = domain;
        this.total_click = total_click;
    }

    public String getUser_id() {
        return User_id;
    }

    public void setUser_id(String user_id) {
        this.User_id = user_id;
    }

    public String getDomain() {
        return domain;
    }

    public void setDomain(String domain) {
        this.domain = domain;
    }

    public int getTotal_click() {
        return total_click;
    }

    public void setTotal_click(int total_click) {
        this.total_click = total_click;
    }

    public int compareTo(FourSortBean o) {
        return  total_click > o.getTotal_click() ? -1 :(total_click == o.getTotal_click() ? 0 :1);
    }

    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(User_id);
        dataOutput.writeUTF(domain);
        dataOutput.writeInt(total_click);
    }

    public void readFields(DataInput dataInput) throws IOException {
        User_id = dataInput.readUTF();
        domain = dataInput.readUTF();
        total_click = dataInput.readInt();
    }

    @Override
    public String toString() {
        return "{" +
                "User_id='" + User_id + '\'' +
                ", domain='" + domain + '\'' +
                ", total_click=" + total_click +
                '}';
    }
}

FourDriver.java

package com.hniu.bigdata.hadoop.Four;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FourDriver {
    public static void main(String[] args) throws Exception{
        Configuration configuration = new Configuration();

        configuration.set("fs.defaultFS","hdfs://192.168.179.46:8020");

        Job job = Job.getInstance(configuration, "word count");

        job.setJarByClass(FourDriver.class);
        job.setMapperClass(FourMapper.class);
        job.setReducerClass(FourReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FourSortBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FourSortBean.class);

        FileInputFormat.addInputPath(job, new Path("/xyz"));
        FileOutputFormat.setOutputPath(job, new Path("/Four_Data"));
        job.waitForCompletion(true);

    }

}

标签:String,搜索词,用户,public,访问,id,import,total,click
来源: https://blog.csdn.net/study_46/article/details/112384091