mapreduce小文件合并&文件过滤器的使用

时间 2019-11-10

原文原文链接

HDFS中的PathFilter类

在单个操作中处理一批文件，这是很常见的需求。比如说，处理日志的 MapReduce 作业可能需要分析一个月内包含在大量目录中的日志文件。在一个表达式中使用通配符来匹配多个文件是比较方便的，无需列举每一个文件和目录来指定输入。Hadoop 为执行通配提供了两个 FileSystem 方法：

1 public FileStatus[] globStatus(Path pathPattern) throws IOException

2 public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException

globStatus() 方法返回与路径相匹配的所有文件的 FileStatus 对象数组，并按路径排序。Hadoop 所支持的通配符与 Unix bash 相同。

第二个方法传入一个 PathFilter 对象作为参数，PathFilter 可以进一步对匹配进行限制。PathFilter 是一个接口，里面只有一个方法 accept(Path path)。具体使用参考下面的代码：

package com.tv;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IOUtils;

public class MergeSmallFilesToHDFS {
	private static FileSystem fs = null;
	private static FileSystem local = null;
	public static class RegexExcludePathFilter implements PathFilter{
		private final String regex;
		public RegexExcludePathFilter(String regex) {
	        this.regex = regex;
	    }
		public boolean accept(Path path) {
			// TODO Auto-generated method stub
			boolean flag = path.toString().matches(regex);
			//过滤 regex 格式的文件，只需 return ！flag
	        return !flag;
		}
	}
	public static class RegexAcceptPathFilter implements PathFilter {
        private final String regex;
        
        public RegexAcceptPathFilter(String regex) {
            this.regex = regex;
        }
		public boolean accept(Path path) {
			// TODO Auto-generated method stub
			boolean flag = path.toString().matches(regex);
			//接受 regex 格式的文件，只需 return flag
            return flag;
		}
	}
	public static void list() throws IOException, URISyntaxException {
		//读取配置文件
        Configuration conf = new Configuration();
        URI uri = new URI("hdfs://zbc:9000");
        // FileSystem是用户操做HDFS的核心类，它得到URI对应的HDFS文件系统
        fs = FileSystem.get(uri, conf);
        // 得到本地文件系统
        local = FileSystem.getLocal(conf);
        //获取该目录下的全部子目录(日期名称)
        FileStatus[] dirstatus = local.globStatus(new Path("C:/Users/zaish/Documents/学习/hadooop分析数据/tvdata/*"),new RegexExcludePathFilter("^.*svn$"));
        Path[] dirs = FileUtil.stat2Paths(dirstatus);
        FSDataOutputStream out = null;
        FSDataInputStream in = null;
        for (Path dir : dirs) {
            String fileName = dir.getName().replace("-", "");//文件名称
            //只接受日期目录下的.txt文件
            FileStatus[] localStatus = local.globStatus(new Path(dir+"/*"),new RegexAcceptPathFilter("^.*txt$"));
            // 得到日期目录下的全部文件
            Path[] listedPaths = FileUtil.stat2Paths(localStatus);
            //输出路径
            Path block = new Path("hdfs://zbc:9000/middle/tv/"+ fileName + ".txt");
            // 打开输出流
            out = fs.create(block);            
            for (Path p : listedPaths) {
                in = local.open(p);// 打开输入流
                IOUtils.copyBytes(in, out, 4096, false); // 复制数据
                // 关闭输入流
                in.close();
            }
            if (out != null) {
            	// 关闭输出流
                out.close();
            }
        }        
	}
	public static void main(String[] args) throws Exception {
		list();
	}
}