- 05-01-MapReduce回顾
- 05-02-分析WordCount数据处理的过程
- 05-03-开发WC的Map和Reducer
- 05-04-开发WC的主程序
- 05-05-Yarn调度MapReduce任务的过程
k1,v1 代表Map的输入
k1 该行数据的偏移量,字节计算 :LongWritable
v1 读入的数据 :Text
k2,v2 代表Map的输出
k2 单词 :Text
v2 每个单词记一次 :IntWritable
Map 的输出(k2,v2)在进入 Reduce 之前,框架会按 k2 做两步处理:
1.group by 分组:相同 k2 的所有 v2 被归到同一组
2.distinct 去重:每个不同的 k2 只保留一份,作为 k3
k3,v3 代表Reduce的输入
k3 :Text
v3 同一个 k3 对应的所有 v2 组成的集合 :Iterable<IntWritable>(元素类型为 IntWritable)
k4,v4 代表Reduce的输出
k4 :Text
v4 :IntWritable
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// k1 v1 k2 v2
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
protected void map(LongWritable key1, Text value1, Context context)
throws IOException, InterruptedException {
* context: map的上下文
* 上文:HDFS
* 下文:Reducer
//得到数据 I love Beijing
String data = value1.toString();
String[] words = data.split(" ");
//输出 k2 v2
for(String w:words){
// k2 v2
context.write(new Text(w), new IntWritable(1));
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// k3 v3 k4 v4
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
protected void reduce(Text k3, Iterable<IntWritable> v3,Context context) throws IOException, InterruptedException {
* context是Reducer的上下文
* 上文:Map
* 下文:HDFS
int total = 0;
for(IntWritable v:v3){
total = total + v.get();
//输出 k4 v4
context.write(k3, new IntWritable(total));
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountMain {
public static void main(String[] args) throws Exception {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(WordCountMain.class); //任务的入口
job.setMapOutputKeyClass(Text.class); //k2的数据类型
job.setMapOutputValueClass(IntWritable.class); //v2的类型
job.setOutputKeyClass(Text.class); //k4的类型
job.setOutputValueClass(IntWritable.class); //v4的类型
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
root@bigdata00:~# cd /root/temp/input
root@bigdata00:~/temp/input# ls
data.txt wc.jar
root@bigdata00:~/temp/input# hadoop jar /input/data.txt /output/mwc
Not a valid JAR: /input/data.txt
root@bigdata00:~/temp/input# start-all.sh
This script is Deprecated. Instead use start-dfs.sh and start-yarn.sh
Starting namenodes on [] starting namenode, logging to /root/training/hadoop-2.7.3/logs/hadoop-root-namenode-bigdata00.out
localhost: starting datanode, logging to /root/training/hadoop-2.7.3/logs/hadoop-root-datanode-bigdata00.out
Starting secondary namenodes [] starting secondarynamenode, logging to /root/training/hadoop-2.7.3/logs/hadoop-root-secondarynamenode-bigdata00.out
starting yarn daemons
starting resourcemanager, logging to /root/training/hadoop-2.7.3/logs/yarn-root-resourcemanager-bigdata00.out
localhost: starting nodemanager, logging to /root/training/hadoop-2.7.3/logs/yarn-root-nodemanager-bigdata00.out
root@bigdata00:~/temp/input# jps
2992 SecondaryNameNode
2662 DataNode
3384 NodeManager
3706 Jps
2429 NameNode
3149 ResourceManager
root@bigdata00:~/temp/input# hadoop jar wc.jar /input/data.txt /output/wc
20/10/18 07:01:19 INFO client.RMProxy: Connecting to ResourceManager at /
20/10/18 07:01:20 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
20/10/18 07:01:21 INFO input.FileInputFormat: Total input paths to process : 1
20/10/18 07:01:22 INFO mapreduce.JobSubmitter: number of splits:1
20/10/18 07:01:23 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1602975652710_0001
20/10/18 07:01:24 INFO impl.YarnClientImpl: Submitted application application_1602975652710_0001
20/10/18 07:01:24 INFO mapreduce.Job: The url to track the job:
20/10/18 07:01:24 INFO mapreduce.Job: Running job: job_1602975652710_0001
20/10/18 07:01:48 INFO mapreduce.Job: Job job_1602975652710_0001 running in uber mode : false
20/10/18 07:01:48 INFO mapreduce.Job: map 0% reduce 0%
20/10/18 07:02:01 INFO mapreduce.Job: map 100% reduce 0%
20/10/18 07:02:15 INFO mapreduce.Job: map 100% reduce 100%
20/10/18 07:02:17 INFO mapreduce.Job: Job job_1602975652710_0001 completed successfully
20/10/18 07:02:17 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=138
FILE: Number of bytes written=237505
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=166
HDFS: Number of bytes written=55
HDFS: Number of read operations=6
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=10530
Total time spent by all reduces in occupied slots (ms)=10790
Total time spent by all map tasks (ms)=10530
Total time spent by all reduce tasks (ms)=10790
Total vcore-milliseconds taken by all map tasks=10530
Total vcore-milliseconds taken by all reduce tasks=10790
Total megabyte-milliseconds taken by all map tasks=10782720
Total megabyte-milliseconds taken by all reduce tasks=11048960
Map-Reduce Framework
Map input records=3
Map output records=12
Map output bytes=108
Map output materialized bytes=138
Input split bytes=106
Combine input records=0
Combine output records=0
Reduce input groups=8
Reduce shuffle bytes=138
Reduce input records=12
Reduce output records=8
Spilled Records=24
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=350
CPU time spent (ms)=4560
Physical memory (bytes) snapshot=287481856
Virtual memory (bytes) snapshot=4438601728
Total committed heap usage (bytes)=138268672
Shuffle Errors
File Input Format Counters
Bytes Read=60
File Output Format Counters
Bytes Written=55
root@bigdata00:~/temp/input# hdfs dfs -cat /output/wc
cat: `/output/wc': Is a directory
root@bigdata00:~/temp/input# hdfs dfs -cat /output/wc/part-r-00000
Beijing 2
China 2
I 2
capital 1
is 1
love 2
of 1
the 1
