Getting Started with MapReduce: Hello Word Count

 

Contents

1. The word-count Mapper class

2. The word-count Reducer class

3. The word-count main (driver) class

4. Running the program


 

------------ Notes compiled from 《Hadoop海量数据处理:技术详解与项目实战》 (Hadoop Massive Data Processing: Technical Details and Project Practice) by Fan Donglai (范东来)

Note: the input file used here is tiny (only four lines of English text). The job log confirms this: the file yields 1 input split, 1 map task, and 1 reduce task.

Required JAR dependencies: hadoop-common-2.9.2.jar and hadoop-mapreduce-client-core-2.9.2.jar
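
If you manage dependencies with Maven rather than adding the JARs by hand (an assumption about your build setup, not from the book), the corresponding artifacts on Maven Central are org.apache.hadoop:hadoop-common:2.9.2 and org.apache.hadoop:hadoop-mapreduce-client-core:2.9.2.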

1. The word-count Mapper class

package com.hadoop.hello;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/*
 * Word-count Mapper class
 * LongWritable: key type of the map input key-value pair
 * Text: value type of the map input key-value pair
 * Text: key type of the map output key-value pair
 * IntWritable: value type of the map output key-value pair
 */
public class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable>{

	//Static constant 1: each occurrence of a word counts as 1
	private static final IntWritable one = new IntWritable(1);
	//The current word
	private Text word = new Text();
	
	/*
	 * @param key: byte offset of the current line within the input split (not a line number)
	 * @param value: the content of the current line of the input split
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		
		//Split the line into whitespace-delimited tokens
		StringTokenizer itr = new StringTokenizer(value.toString());
		while (itr.hasMoreTokens()) {
			word.set(itr.nextToken());
			//Emit: (word, 1)
			context.write(word, one);
		}
	}
}
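
A quick way to sanity-check the Mapper in isolation is a unit test. The sketch below uses the MRUnit library (an assumed extra test dependency, org.apache.mrunit:mrunit, plus JUnit; neither is mentioned in the book): it feeds map() a single line and asserts the expected (word, 1) pairs.

package com.hadoop.hello;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class TokenizerMapperTest {

	@Test
	public void testMap() throws IOException {
		MapDriver<LongWritable, Text, Text, IntWritable> driver =
				MapDriver.newMapDriver(new TokenizerMapper());
		//One input line should yield one (word, 1) pair per token, in order
		driver.withInput(new LongWritable(0), new Text("good better best"))
				.withOutput(new Text("good"), new IntWritable(1))
				.withOutput(new Text("better"), new IntWritable(1))
				.withOutput(new Text("best"), new IntWritable(1))
				.runTest();
	}
}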

2. The word-count Reducer class

package com.hadoop.hello;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/*
 * Word-count Reducer class
 * Text: key type of the reduce input key-value pair
 * IntWritable: value type of the reduce input key-value pair
 * Text: key type of the reduce output key-value pair
 * IntWritable: value type of the reduce output key-value pair
 * 
 * This class can also be set as the Combiner: it pre-aggregates the map output
 * locally before it is shuffled to the reducers, reducing the volume of
 * intermediate data. Enable it with: job.setCombinerClass(IntSumReducer.class);
 */
public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

	//Accumulated count for the current word (non-static: one instance per Reducer object)
	private IntWritable result = new IntWritable();
	
	/*
	 * @param key: the word
	 * @param values: the counts collected for that word, one entry per map-side emit
	 */
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context) 
			throws IOException, InterruptedException {
		
		int sum = 0;
		Iterator<IntWritable> itr = values.iterator();
		while (itr.hasNext()) {
			sum += itr.next().get();
		}
		result.set(sum);
		//Emit: (word, total count)
		context.write(key, result);
	}
	
}
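
To see why this class can safely double as a Combiner, consider a hypothetical map output for one split. Addition is associative and commutative, so pre-summing on the map side changes only the amount of shuffled data, never the final result:

  map output:               (good,1) (better,1) (good,1) (best,1)
  after the combine step:   (good,2) (better,1) (best,1)
  reduce input for "good":  [2] instead of [1,1]  -->  final sum is still 2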

3. The word-count main (driver) class

package com.hadoop.hello;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/*
 * Word-count driver: configures and submits the job
 */
public class WordCount {
	
	public static void main(String[] args) throws IOException, 
		ClassNotFoundException, InterruptedException {
		
		//Load the Hadoop configuration (reads the static resource files core-default.xml and core-site.xml from the classpath)
		Configuration conf = new Configuration();
		
		if (args.length != 2) {
			System.err.println("Usage: wordcount <in> <out>");
			System.exit(2);
		}
		
		//Create the job object
		//The Job constructor is deprecated:
		//Job job = new Job(conf, "wordcount");
		Job job = Job.getInstance(conf, "wordcount");
		job.setJarByClass(WordCount.class);
		//Specify the Mapper and Reducer classes
		job.setMapperClass(TokenizerMapper.class);
		job.setReducerClass(IntSumReducer.class);
		//Set the output key/value types of the reduce function
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		//Set the input and output paths (directories)
		FileInputFormat.addInputPath(job, new Path(args[0]));//the input path must already exist
		FileOutputFormat.setOutputPath(job, new Path(args[1]));//the output path must NOT exist; the job creates it
		//Submit the job and wait for it to complete
		//Passing true to waitForCompletion prints the job's progress to the console
		//System.exit(n): n == 0 means normal termination, any other n means abnormal termination
		System.exit(job.waitForCompletion(true) ? 0 : 1);
		
	}
}
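
As an aside, real-world drivers are usually written against the Tool interface and launched through ToolRunner, which parses generic Hadoop options (-D, -files, etc.) before your code runs. Below is a minimal sketch of the same driver in that style; this variant is not from the book, and the class name WordCountTool is made up here.

package com.hadoop.hello;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountTool extends Configured implements Tool {

	@Override
	public int run(String[] args) throws Exception {
		//getConf() returns the Configuration that ToolRunner has already
		//populated with any -D key=value options from the command line
		Job job = Job.getInstance(getConf(), "wordcount");
		job.setJarByClass(WordCountTool.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setCombinerClass(IntSumReducer.class);//enable map-side pre-aggregation
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		return job.waitForCompletion(true) ? 0 : 1;
	}

	public static void main(String[] args) throws Exception {
		System.exit(ToolRunner.run(new Configuration(), new WordCountTool(), args));
	}
}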

4. Running the program

1. Right-click the com.hadoop.hello package and export it as a JAR file named "WordCount.jar";
2. Upload the JAR file to the Linux server using Windows cmd or PowerShell (recommended)
  Run the following command from the directory containing the JAR:
  > scp WordCount.jar hadoop@remoteIP:~/myJars/mapreduce/
 (where remoteIP is the IP address of the remote server)
3. Start Hadoop and create the word input file
  > cd ~/myJars/mapreduce/
  > touch words
  > vi words
  Press "i" to enter insert mode, then type the following into the words file:
  good better best
  never it rest
  till good is better
  and better is best
  Press "ESC", then "Shift+Q", type "wq!" and hit Enter to save
  --View the word file
  > cat words 
  --Create the input directory in HDFS
  > hadoop fs -mkdir /user/hadoop/wordcountinput
  --List the input directory in HDFS
  > hadoop fs -ls /user/hadoop/wordcountinput
  --Copy the local words file into the HDFS input directory (run under "~/myJars/mapreduce/")
  > hadoop fs -copyFromLocal words /user/hadoop/wordcountinput/
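  --Optionally, read the file back through the HDFS shell to confirm the upload (an extra step, not in the original walkthrough)
  > hadoop fs -cat /user/hadoop/wordcountinput/words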
4. Run the JAR
  From the JAR directory "~/myJars/mapreduce/", execute:
  > hadoop jar WordCount.jar com.hadoop.hello.WordCount /user/hadoop/wordcountinput /user/hadoop/wordcountoutput
  The job prints its progress to the screen as it runs, until it completes
5. View the word-count results
  After the job finishes successfully, the directory "/user/hadoop/wordcountoutput/" contains two files:
  /user/hadoop/wordcountoutput/_SUCCESS    --empty marker file indicating success
  /user/hadoop/wordcountoutput/part-r-00000 --the job's output file
  --View the output file
  > hadoop fs -cat /user/hadoop/wordcountoutput/part-r-00000
  and      1
  best     2
  better   3
  good     2
  is       2
  it       1
  never    1
  rest     1
  till     1
  <This is the word-count result>
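
Note that the words come back in ascending lexicographic order: the MapReduce framework sorts the intermediate keys during the shuffle, so the reducer receives, and therefore emits, keys already sorted.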