hadoop中MapReduce多种join实现实例分析

时间 2020-02-17

标签 hadoop mapreduce 多种 join 实现实例分析栏目 Hadoop 繁體版

原文原文链接

转载自：http://zengzhaozheng.blog.51cto.com/8219051/1392961

一、在Reduce端进行连接。

在Reduce端进行连接是MapReduce框架进行表之间join操作最为常见的模式，其具体的实现原理如下：

Map端的主要工作：为来自不同表（文件）的key/value对打标签以区别不同来源的记录。然后用连接字段作为key，其余部分和新加的标志作为value，最后进行输出。

reduce端的主要工作：在reduce端以连接字段作为key的分组已经完成，我们只需要在每一个分组当中将那些来源于不同文件的记录（在map阶段已经打标志）分开，最后进行笛卡尔积就ok了。原理非常简单，下面来看一个实例：

(1)自定义一个value返回类型:

package com.mr.reduceSizeJoin;   
import java.io.DataInput;   
import java.io.DataOutput;   
import java.io.IOException;   
import org.apache.hadoop.io.Text;   
import org.apache.hadoop.io.WritableComparable;   
/**
 * Composite value emitted by the join mappers: the join key, a tag naming the
 * source file of the record, and the record's remaining columns.
 *
 * <p>Fields are serialized in the order joinKey, flag, secondPart, and
 * {@link #readFields(DataInput)} reads them back in that same order.
 *
 * <p>{@link #compareTo(CombineValues)} orders instances by join key only, so
 * the ordering is not consistent with {@code equals} (which is not overridden).
 */
public class CombineValues implements WritableComparable<CombineValues> {
    /** Value of the column the two tables are joined on. */
    private Text joinKey;
    /** Tag identifying which input file the record came from. */
    private Text flag;
    /** All columns of the record except the join key. */
    private Text secondPart;

    /** Creates an instance whose three fields are empty, non-null {@link Text}s. */
    public CombineValues() {
        this.joinKey = new Text();
        this.flag = new Text();
        this.secondPart = new Text();
    }

    public Text getJoinKey() {
        return joinKey;
    }

    /** Stores the given reference (no copy is made). */
    public void setJoinKey(Text joinKey) {
        this.joinKey = joinKey;
    }

    public Text getFlag() {
        return flag;
    }

    /** Stores the given reference (no copy is made). */
    public void setFlag(Text flag) {
        this.flag = flag;
    }

    public Text getSecondPart() {
        return secondPart;
    }

    /** Stores the given reference (no copy is made). */
    public void setSecondPart(Text secondPart) {
        this.secondPart = secondPart;
    }

    /**
     * Writes joinKey, flag and secondPart to {@code out}, in that order.
     *
     * @throws IOException if writing any field fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        this.joinKey.write(out);
        this.flag.write(out);
        this.secondPart.write(out);
    }

    /**
     * Reads joinKey, flag and secondPart from {@code in}, in the order used by
     * {@link #write(DataOutput)}, overwriting the current field contents.
     *
     * @throws IOException if reading any field fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.joinKey.readFields(in);
        this.flag.readFields(in);
        this.secondPart.readFields(in);
    }

    /**
     * Compares by join key only; flag and secondPart are ignored.
     *
     * @param o the instance to compare against
     * @return the result of comparing the two join keys
     */
    @Override
    public int compareTo(CombineValues o) {
        return this.joinKey.compareTo(o.getJoinKey());
    }

    @Override
    public String toString() {
        return "[flag=" + this.flag.toString()
                + ",joinKey=" + this.joinKey.toString()
                + ",secondPart=" + this.secondPart.toString() + "]";
    }
}

View Code

(2)map、reduce主体代码：

package com.mr.reduceSizeJoin;   
import java.io.IOException;   
import java.util.ArrayList;   
import org.apache.hadoop.conf.Configuration;   
import org.apache.hadoop.conf.Configured;   
import org.apache.hadoop.fs.Path;   
import org.apache.hadoop.io.Text;   
import org.apache.hadoop.mapreduce.Job;   
import org.apache.hadoop.mapreduce.Mapper;   
import org.apache.hadoop.mapreduce.Reducer;   
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;   
import org.apache.hadoop.mapreduce.lib.input.FileSplit;   
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;   
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;   
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;   
import org.apache.hadoop.util.Tool;   
import org.apache.hadoop.util.ToolRunner;   
import org.slf4j.Logger;   
import org.slf4j.LoggerFactory;   
/**
 * Reduce-side join of two pipe-delimited ("|") files on a city id.
 *
 * <p>The mapper tags every record with the file it came from and emits it under
 * the join key; the reducer separates each key group by tag and writes the
 * cross product of the two sides.
 *
 * <p>NOTE: despite the class name, the reducer only emits rows for keys that
 * occur on <em>both</em> sides, i.e. the result is an inner join. Left-table
 * rows without a matching right-table row produce no output (see the sample
 * result below, where cities 0 and 5-9 are absent).
 *
 * <p>Left table, file {@code tb_dim_city.dat}, 5 fields
 * (id, name, orderid, city_code, is_show); the join key is {@code id}:
 * <pre>
 * id     name  orderid  city_code  is_show
 * 0       其余        9999     9999         0
 * 1       长春        1        901          1
 * 2       吉林        2        902          1
 * 3       四平        3        903          1
 * 4       松原        4        904          1
 * 5       通化        5        905          1
 * 6       辽源        6        906          1
 * 7       白城        7        907          1
 * 8       白山        8        908          1
 * 9       延吉        9        909          1
 * </pre>
 *
 * <p>Right table, file {@code tb_user_profiles.dat}, 4 fields
 * (userID, network, flow, cityID); the join key is {@code cityID}:
 * <pre>
 * userID   network     flow    cityID
 * 1           2G       123      1
 * 2           3G       333      2
 * 3           3G       555      1
 * 4           2G       777      3
 * 5           3G       666      4
 * </pre>
 *
 * <p>Result (tab-separated: key, left columns, right columns):
 * <pre>
 *  1   长春  1   901 1   1   2G  123
 *  1   长春  1   901 1   3   3G  555
 *  2   吉林  2   902 1   2   3G  333
 *  3   四平  3   903 1   4   2G  777
 *  4   松原  4   904 1   5   3G  666
 * </pre>
 *
 * @author zengzhaozheng
 */
public class ReduceSideJoin_LeftOuterJoin extends Configured implements Tool {
    private static final Logger logger = LoggerFactory.getLogger(ReduceSideJoin_LeftOuterJoin.class);

    /**
     * Tags each input line with its source file ("0" = tb_dim_city.dat,
     * "1" = tb_user_profiles.dat) and emits it keyed by the join column.
     * Lines from any other file, and lines with an unexpected field count,
     * are dropped silently.
     */
    public static class LeftOutJoinMapper extends Mapper<Object, Text, Text, CombineValues> {
        // Reused across map() calls to avoid allocating per record.
        private final CombineValues combineValues = new CombineValues();
        private final Text flag = new Text();
        private final Text joinKey = new Text();
        private final Text secondPart = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // The source table is recognised by the name of the file behind the current split.
            String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();

            if (pathName.endsWith("tb_dim_city.dat")) {
                String[] valueItems = value.toString().split("\\|");
                // Skip malformed records.
                if (valueItems.length != 5) {
                    return;
                }
                emit(context, "0", valueItems[0],
                        valueItems[1] + "\t" + valueItems[2] + "\t" + valueItems[3] + "\t" + valueItems[4]);
            } else if (pathName.endsWith("tb_user_profiles.dat")) {
                String[] valueItems = value.toString().split("\\|");
                // Skip malformed records.
                if (valueItems.length != 4) {
                    return;
                }
                emit(context, "1", valueItems[3],
                        valueItems[0] + "\t" + valueItems[1] + "\t" + valueItems[2]);
            }
        }

        /** Fills the reusable value object and writes it under the join key. */
        private void emit(Context context, String sourceFlag, String key, String rest)
                throws IOException, InterruptedException {
            flag.set(sourceFlag);
            joinKey.set(key);
            secondPart.set(rest);
            combineValues.setFlag(flag);
            combineValues.setJoinKey(joinKey);
            combineValues.setSecondPart(secondPart);
            context.write(combineValues.getJoinKey(), combineValues);
        }
    }

    /**
     * For each join key, splits the group into left and right records by tag
     * and writes every left/right combination.
     */
    public static class LeftOutJoinReducer extends Reducer<Text, CombineValues, Text, Text> {
        /** Left-table (tb_dim_city) records of the current group. */
        private final ArrayList<Text> leftTable = new ArrayList<Text>();
        /** Right-table (tb_user_profiles) records of the current group. */
        private final ArrayList<Text> rightTable = new ArrayList<Text>();
        private final Text output = new Text();

        /**
         * Called once per join key.
         *
         * <p>Both sides of the group are buffered in memory before the cross
         * product is written, so a very large group (skewed key) can exhaust
         * the reducer's heap. Check the key distribution of the data before
         * relying on this approach.
         */
        @Override
        protected void reduce(Text key, Iterable<CombineValues> value, Context context)
                throws IOException, InterruptedException {
            leftTable.clear();
            rightTable.clear();

            for (CombineValues cv : value) {
                // Copy the payload: Hadoop reuses the value object between iterations.
                Text secondPar = new Text(cv.getSecondPart().toString());
                String sourceFlag = cv.getFlag().toString().trim();
                if ("0".equals(sourceFlag)) {
                    leftTable.add(secondPar);
                } else if ("1".equals(sourceFlag)) {
                    rightTable.add(secondPar);
                }
            }
            logger.info("tb_dim_city:{}", leftTable);
            logger.info("tb_user_profiles:{}", rightTable);

            // Emits nothing when either side is empty (inner-join semantics).
            for (Text leftPart : leftTable) {
                for (Text rightPart : rightTable) {
                    output.set(leftPart + "\t" + rightPart);
                    context.write(key, output);
                }
            }
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} input path (containing both data files),
     *             {@code args[1]} output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if arguments are missing
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            logger.error("Usage: ReduceSideJoin_LeftOuterJoin <input path> <output path>");
            return 2;
        }
        Configuration conf = getConf();
        Job job = new Job(conf, "LeftOutJoinMR");
        job.setJarByClass(ReduceSideJoin_LeftOuterJoin.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(LeftOutJoinMapper.class);
        job.setReducerClass(LeftOutJoinReducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Map output types differ from the final output types, so both are set.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CombineValues.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; exits with the code returned by
     * {@link #run(String[])}, or 1 if running the tool throws.
     */
    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        int returnCode;
        try {
            returnCode = ToolRunner.run(new ReduceSideJoin_LeftOuterJoin(), args);
        } catch (Exception e) {
            // Keep the stack trace and make the failure visible to the calling shell.
            logger.error("Job failed: " + e.getMessage(), e);
            returnCode = 1;
        }
        System.exit(returnCode);
    }
}

View Code

其中具体的分析以及数据的输入输出请看代码中的注释，已经写得比较清楚了，这里主要分析一下reduce join的一些不足。之所以会存在reduce join这种方式，我们可以很明显地看出原因：因为整体数据被分割了，每个map task只处理一部分数据而不能够获取到所有需要的join字段，因此我们需要再将join key作为reduce端的分组，将所有join key相同的记录集中起来进行处理，所以reduce join这种方式就出现了。这种方式的缺点很明显，就是会造成map和reduce端也就是shuffle阶段出现大量的数据传输，效率很低。

二、在Map端进行连接。

使用场景：一张表十分小、一张表很大。

用法：在提交作业的时候先将小表文件放到该作业的DistributedCache中，然后从DistributedCache中取出该小表，按join key / value解析分割后放到内存中（可以放到HashMap等容器中）。然后扫描大表，看大表中每条记录的join key值是否能够在内存中找到相同join key的记录，如果有则直接输出结果。

直接上代码，比较简单：

package com.mr.mapSideJoin;   
import java.io.BufferedReader;   
import java.io.FileReader;   
import java.io.IOException;   
import java.util.HashMap;   
import org.apache.hadoop.conf.Configuration;   
import org.apache.hadoop.conf.Configured;   
import org.apache.hadoop.filecache.DistributedCache;   
import org.apache.hadoop.fs.Path;   
import org.apache.hadoop.io.Text;   
import org.apache.hadoop.mapreduce.Job;   
import org.apache.hadoop.mapreduce.Mapper;   
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;   
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;   
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;   
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;   
import org.apache.hadoop.util.Tool;   
import org.apache.hadoop.util.ToolRunner;   
import org.slf4j.Logger;   
import org.slf4j.LoggerFactory;   
/**
 * Map-side join of a small dimension table against a large fact table.
 *
 * <p>The small table ({@code tb_dim_city.dat}) is shipped through the
 * DistributedCache and loaded into an in-memory map in each task's
 * {@code setup}; every record of the large table
 * ({@code tb_user_profiles.dat}) is then looked up in that map. The job runs
 * with zero reducers.
 *
 * <p>NOTE: only records whose cityID is found in the city table are written,
 * so the result is an inner join.
 *
 * <p>Small table, file {@code tb_dim_city.dat}, "|"-delimited, 5 fields
 * (id, name, orderid, city_code, is_show); the join key is {@code id}:
 * <pre>
 * id     name  orderid  city_code  is_show
 * 0       其余        9999     9999         0
 * 1       长春        1        901          1
 * 2       吉林        2        902          1
 * 3       四平        3        903          1
 * 4       松原        4        904          1
 * 5       通化        5        905          1
 * 6       辽源        6        906          1
 * 7       白城        7        907          1
 * 8       白山        8        908          1
 * 9       延吉        9        909          1
 * </pre>
 *
 * <p>Large table, file {@code tb_user_profiles.dat}, "|"-delimited, 4 fields
 * (userID, network, flow, cityID); the join key is {@code cityID}:
 * <pre>
 * userID   network     flow    cityID
 * 1           2G       123      1
 * 2           3G       333      2
 * 3           3G       555      1
 * 4           2G       777      3
 * 5           3G       666      4
 * </pre>
 *
 * <p>Result (tab-separated: key, city columns, user columns):
 * <pre>
 *  1   长春  1   901 1   1   2G  123
 *  1   长春  1   901 1   3   3G  555
 *  2   吉林  2   902 1   2   3G  333
 *  3   四平  3   903 1   4   2G  777
 *  4   松原  4   904 1   5   3G  666
 * </pre>
 *
 * @author zengzhaozheng
 */
public class MapSideJoinMain extends Configured implements Tool {
    private static final Logger logger = LoggerFactory.getLogger(MapSideJoinMain.class);

    /** Joins each user-profile line against the cached city table. */
    public static class LeftOutJoinMapper extends Mapper<Object, Text, Text, Text> {

        /** City id -> remaining city columns, tab-joined. */
        private final HashMap<String, String> city_info = new HashMap<String, String>();
        private final Text outPutKey = new Text();
        private final Text outPutValue = new Text();

        /**
         * Runs once per task before any {@code map} call: loads every cached
         * file whose name ends with {@code tb_dim_city.dat} into memory.
         *
         * @throws IOException if the job has no DistributedCache files at all,
         *                     or a cached file cannot be read
         */
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            Path[] distributePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (distributePaths == null) {
                // Fail with a clear message instead of a NullPointerException.
                throw new IOException(
                        "No DistributedCache files are available; tb_dim_city.dat must be added to the cache");
            }
            for (Path p : distributePaths) {
                if (p.toString().endsWith("tb_dim_city.dat")) {
                    loadCityInfo(p.toString());
                }
            }
        }

        /** Parses one local copy of the city file into {@link #city_info}. */
        private void loadCityInfo(String localFile) throws IOException {
            // Decode as UTF-8 rather than the platform default: the joined
            // output goes through Text, which is UTF-8, and the file holds
            // non-ASCII city names. Assumes the data file itself is UTF-8.
            BufferedReader br = new BufferedReader(new java.io.InputStreamReader(
                    new java.io.FileInputStream(localFile), "UTF-8"));
            try {
                String cityInfo;
                while ((cityInfo = br.readLine()) != null) {
                    String[] cityPart = cityInfo.split("\\|", 5);
                    // Lines with fewer than 5 fields are ignored.
                    if (cityPart.length == 5) {
                        city_info.put(cityPart[0],
                                cityPart[1] + "\t" + cityPart[2] + "\t" + cityPart[3] + "\t" + cityPart[4]);
                    }
                }
            } finally {
                br.close();
            }
        }

        /**
         * Looks up the record's cityID (4th field) in the cached city table
         * and, when found, writes cityID -> city columns + user columns.
         * Empty lines, lines with fewer than 4 fields and unmatched records
         * produce no output.
         */
        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Skip empty lines.
            if (value == null || value.toString().equals("")) {
                return;
            }
            String[] fields = value.toString().split("\\|", 4);
            // Skip malformed records.
            if (fields.length != 4) {
                return;
            }
            String cityPart = city_info.get(fields[3]);
            if (cityPart != null) {
                this.outPutKey.set(fields[3]);
                this.outPutValue.set(cityPart + "\t" + fields[0] + "\t" + fields[1] + "\t" + fields[2]);
                context.write(outPutKey, outPutValue);
            }
        }
    }

    /**
     * Configures and runs the map-only job.
     *
     * @param args {@code args[0]} input path of the large table,
     *             {@code args[1]} path of the small table to cache,
     *             {@code args[2]} output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if arguments are missing
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 3) {
            logger.error("Usage: MapSideJoinMain <input path> <cache file> <output path>");
            return 2;
        }
        Configuration conf = getConf();
        // Register the cache file before the Job is created: the Job works on
        // its own copy of the Configuration.
        DistributedCache.addCacheFile(new Path(args[1]).toUri(), conf);
        Job job = new Job(conf, "MapJoinMR");
        // Map-only job: the join is completed in the mapper.
        job.setNumReduceTasks(0);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        job.setJarByClass(MapSideJoinMain.class);
        job.setMapperClass(LeftOutJoinMapper.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; exits with the code returned by
     * {@link #run(String[])}, or 1 if running the tool throws.
     */
    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        int returnCode;
        try {
            returnCode = ToolRunner.run(new MapSideJoinMain(), args);
        } catch (Exception e) {
            // Keep the stack trace and make the failure visible to the calling shell.
            logger.error("Job failed: " + e.getMessage(), e);
            returnCode = 1;
        }
        System.exit(returnCode);
    }
}

View Code

这里说说DistributedCache。DistributedCache是分布式缓存的一种实现，它在整个MapReduce框架中起着相当重要的作用，它可以支撑我们写一些相当复杂高效的分布式程序。说回到这里，JobTracker在作业启动之前会获取到DistributedCache的资源uri列表，并将对应的文件分发到各个涉及到该作业的任务的TaskTracker上。另外，关于DistributedCache和作业的关系，比如权限、存储路径区分、public和private等属性，接下来有空再整理研究一下写一篇blog，这里就不详细说了。

另外还有一种比较变态的Map Join方式，就是结合HBase来做Map Join操作。这种方式完全可以突破内存的限制，使你毫无顾忌地使用Map Join，而且效率也非常不错。

三、SemiJoin。

SemiJoin就是所谓的半连接，其实仔细一看就是reduce join的一个变种，就是在map端过滤掉一些数据，在网络中只传输参与连接的数据，不参与连接的数据不必在网络中进行传输，从而减少了shuffle的网络传输量，使整体效率得到提高，其他思想和reduce join是一模一样的。说得更加接地气一点，就是将小表中参与join的key单独抽出来，通过DistributedCache分发到相关节点，然后将其取出放到内存中（可以放到HashSet中），在map阶段扫描连接表，将join key不在内存HashSet中的记录过滤掉，让那些参与join的记录通过shuffle传输到reduce端进行join操作，其他的和reduce join都是一样的。看代码：

package com.mr.SemiJoin;   
import java.io.BufferedReader;   
import java.io.FileReader;   
import java.io.IOException;   
import java.util.ArrayList;   
import java.util.HashSet;   
import org.apache.hadoop.conf.Configuration;   
import org.apache.hadoop.conf.Configured;   
import org.apache.hadoop.filecache.DistributedCache;   
import org.apache.hadoop.fs.Path;   
import org.apache.hadoop.io.Text;   
import org.apache.hadoop.mapreduce.Job;   
import org.apache.hadoop.mapreduce.Mapper;   
import org.apache.hadoop.mapreduce.Reducer;   
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;   
import org.apache.hadoop.mapreduce.lib.input.FileSplit;   
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;   
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;   
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;   
import org.apache.hadoop.util.Tool;   
import org.apache.hadoop.util.ToolRunner;   
import org.slf4j.Logger;   
import org.slf4j.LoggerFactory;   
/**
 * Semi join: a reduce-side join whose mapper first discards every record whose
 * join key is not in a pre-extracted key list, so only records that can take
 * part in the join are shuffled to the reducers.
 *
 * <p>The key list ({@code joinKey.dat}) is shipped through the
 * DistributedCache and loaded into an in-memory set in each task's
 * {@code setup}. Everything after the filter matches the plain reduce-side
 * join: records are tagged by source file, grouped by join key, and the
 * reducer writes the cross product of the two sides of each group (rows are
 * produced only for keys present on both sides).
 *
 * <p>NOTE(review): {@code CombineValues} is referenced without an import; the
 * listing that declares it uses package {@code com.mr.reduceSizeJoin} while
 * this class is in {@code com.mr.SemiJoin}. Confirm the class is available in
 * this package or add the import.
 *
 * <p>Left table, file {@code tb_dim_city.dat}, "|"-delimited, 5 fields
 * (id, name, orderid, city_code, is_show); the join key is {@code id}:
 * <pre>
 * id     name  orderid  city_code  is_show
 * 0       其余        9999     9999         0
 * 1       长春        1        901          1
 * 2       吉林        2        902          1
 * 3       四平        3        903          1
 * 4       松原        4        904          1
 * 5       通化        5        905          1
 * 6       辽源        6        906          1
 * 7       白城        7        907          1
 * 8       白山        8        908          1
 * 9       延吉        9        909          1
 * </pre>
 *
 * <p>Right table, file {@code tb_user_profiles.dat}, "|"-delimited, 4 fields
 * (userID, network, flow, cityID); the join key is {@code cityID}:
 * <pre>
 * userID   network     flow    cityID
 * 1           2G       123      1
 * 2           3G       333      2
 * 3           3G       555      1
 * 4           2G       777      3
 * 5           3G       666      4
 * </pre>
 *
 * <p>Key list, file {@code joinKey.dat}, one key per line (every line,
 * including the header, is loaded as a key):
 * <pre>
 * city_code
 * 1
 * 2
 * 3
 * 4
 * </pre>
 *
 * <p>Result (tab-separated: key, left columns, right columns):
 * <pre>
 *  1   长春  1   901 1   1   2G  123
 *  1   长春  1   901 1   3   3G  555
 *  2   吉林  2   902 1   2   3G  333
 *  3   四平  3   903 1   4   2G  777
 *  4   松原  4   904 1   5   3G  666
 * </pre>
 *
 * @author zengzhaozheng
 */
public class SemiJoin extends Configured implements Tool {
    private static final Logger logger = LoggerFactory.getLogger(SemiJoin.class);

    /**
     * Tags each input line with its source file ("0" = tb_dim_city.dat,
     * "1" = tb_user_profiles.dat) and emits it keyed by the join column, but
     * only when that key is contained in the cached key list.
     */
    public static class SemiJoinMapper extends Mapper<Object, Text, Text, CombineValues> {
        // Reused across map() calls to avoid allocating per record.
        private final CombineValues combineValues = new CombineValues();
        /** Keys allowed to take part in the join, loaded from joinKey.dat. */
        private final HashSet<String> joinKeySet = new HashSet<String>();
        private final Text flag = new Text();
        private final Text joinKey = new Text();
        private final Text secondPart = new Text();

        /**
         * Runs once per task before any {@code map} call: loads every cached
         * file whose name ends with {@code joinKey.dat} into the key set.
         *
         * @throws IOException if the job has no DistributedCache files at all,
         *                     or a cached file cannot be read
         */
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            Path[] distributePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (distributePaths == null) {
                // Fail with a clear message instead of a NullPointerException.
                throw new IOException(
                        "No DistributedCache files are available; joinKey.dat must be added to the cache");
            }
            for (Path p : distributePaths) {
                if (p.toString().endsWith("joinKey.dat")) {
                    loadJoinKeys(p.toString());
                }
            }
        }

        /** Adds every line of one local copy of the key file to {@link #joinKeySet}. */
        private void loadJoinKeys(String localFile) throws IOException {
            // Decode as UTF-8 (the encoding Text uses) rather than the platform
            // default. Assumes the key file itself is UTF-8.
            BufferedReader br = new BufferedReader(new java.io.InputStreamReader(
                    new java.io.FileInputStream(localFile), "UTF-8"));
            try {
                String joinKeyStr;
                while ((joinKeyStr = br.readLine()) != null) {
                    joinKeySet.add(joinKeyStr);
                }
            } finally {
                br.close();
            }
        }

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // The source table is recognised by the name of the file behind the current split.
            String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();

            if (pathName.endsWith("tb_dim_city.dat")) {
                String[] valueItems = value.toString().split("\\|");
                // Skip malformed records.
                if (valueItems.length != 5) {
                    return;
                }
                // Drop records that cannot take part in the join.
                if (joinKeySet.contains(valueItems[0])) {
                    emit(context, "0", valueItems[0],
                            valueItems[1] + "\t" + valueItems[2] + "\t" + valueItems[3] + "\t" + valueItems[4]);
                }
            } else if (pathName.endsWith("tb_user_profiles.dat")) {
                String[] valueItems = value.toString().split("\\|");
                // Skip malformed records.
                if (valueItems.length != 4) {
                    return;
                }
                // Drop records that cannot take part in the join.
                if (joinKeySet.contains(valueItems[3])) {
                    emit(context, "1", valueItems[3],
                            valueItems[0] + "\t" + valueItems[1] + "\t" + valueItems[2]);
                }
            }
        }

        /** Fills the reusable value object and writes it under the join key. */
        private void emit(Context context, String sourceFlag, String key, String rest)
                throws IOException, InterruptedException {
            flag.set(sourceFlag);
            joinKey.set(key);
            secondPart.set(rest);
            combineValues.setFlag(flag);
            combineValues.setJoinKey(joinKey);
            combineValues.setSecondPart(secondPart);
            context.write(combineValues.getJoinKey(), combineValues);
        }
    }

    /**
     * For each join key, splits the group into left and right records by tag
     * and writes every left/right combination.
     */
    public static class SemiJoinReducer extends Reducer<Text, CombineValues, Text, Text> {
        /** Left-table (tb_dim_city) records of the current group. */
        private final ArrayList<Text> leftTable = new ArrayList<Text>();
        /** Right-table (tb_user_profiles) records of the current group. */
        private final ArrayList<Text> rightTable = new ArrayList<Text>();
        private final Text output = new Text();

        /**
         * Called once per join key.
         *
         * <p>Both sides of the group are buffered in memory before the cross
         * product is written, so a very large group (skewed key) can exhaust
         * the reducer's heap. Check the key distribution of the data before
         * relying on this approach.
         */
        @Override
        protected void reduce(Text key, Iterable<CombineValues> value, Context context)
                throws IOException, InterruptedException {
            leftTable.clear();
            rightTable.clear();

            for (CombineValues cv : value) {
                // Copy the payload: Hadoop reuses the value object between iterations.
                Text secondPar = new Text(cv.getSecondPart().toString());
                String sourceFlag = cv.getFlag().toString().trim();
                if ("0".equals(sourceFlag)) {
                    leftTable.add(secondPar);
                } else if ("1".equals(sourceFlag)) {
                    rightTable.add(secondPar);
                }
            }
            logger.info("tb_dim_city:{}", leftTable);
            logger.info("tb_user_profiles:{}", rightTable);

            // Emits nothing when either side is empty.
            for (Text leftPart : leftTable) {
                for (Text rightPart : rightTable) {
                    output.set(leftPart + "\t" + rightPart);
                    context.write(key, output);
                }
            }
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} input path (containing both data files),
     *             {@code args[1]} output path,
     *             {@code args[2]} path of the join-key file to cache
     * @return 0 if the job succeeded, 1 if it failed, 2 if arguments are missing
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 3) {
            logger.error("Usage: SemiJoin <input path> <output path> <join key file>");
            return 2;
        }
        Configuration conf = getConf();
        // Register the cache file before the Job is created: the Job works on
        // its own copy of the Configuration.
        DistributedCache.addCacheFile(new Path(args[2]).toUri(), conf);
        Job job = new Job(conf, "LeftOutJoinMR");
        job.setJarByClass(SemiJoin.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(SemiJoinMapper.class);
        job.setReducerClass(SemiJoinReducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Map output types differ from the final output types, so both are set.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CombineValues.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; exits with the code returned by
     * {@link #run(String[])}, or 1 if running the tool throws.
     */
    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        int returnCode;
        try {
            returnCode = ToolRunner.run(new SemiJoin(), args);
        } catch (Exception e) {
            // Keep the stack trace and make the failure visible to the calling shell.
            logger.error("Job failed: " + e.getMessage(), e);
            returnCode = 1;
        }
        System.exit(returnCode);
    }
}

View Code

这里还要说说，SemiJoin也是有一定的适用范围的，其抽取出来进行join的key是要放到内存中的，所以不能够太大，否则容易在Map端造成OOM。

总结

blog介绍了三种join方式。这三种join方式适用于不同的场景，其处理效率上的相差还是蛮大的，其中主要导致因素是网络传输。Map join效率最高，其次是SemiJoin，最低的是reduce join。另外，写分布式大数据处理程序的时候，最好要对整体要处理的数据分布情况作一个了解，这可以提高我们代码的效率，使数据的倾斜度降到最低，使我们的代码倾向性更好。