Characteristics of the Patent Data
First, obtain the patent data from http://data.nber.org/patents/
This article uses cite75_99.txt, which records citations made to U.S. patents between 1975 and 1999 and contains more than 16 million records. The first few lines look like this:
The first column is the citing patent number; the second column is the patent number cited by the first.
| CITING  | CITED  |
|---------|--------|
| 3858241 | 956203 |
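A quick look at the raw file helps when reading the code below: it is plain comma-separated text with a quoted header line. The rows after the first data pair here are illustrative, not copied from the file:

```
"CITING","CITED"
3858241,956203
3858241,1324234
...
```

Note that the header line is a record like any other, so the first job below will dutifully emit a ("CITED", "CITING") pair for it; for a quick experiment this is usually just ignored.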
```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Reads the patent citation data and, for each patent, finds the patents
// that cite it, merging them into one comma-separated list.
public class FindCitedPatentsAndOrder extends Configured implements Tool {
    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text key, Text value,
                        OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            output.collect(value, key);  // the key point: invert (citing, cited) to (cited, citing)
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterator<Text> values,
                           OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            String csv = "";
            while (values.hasNext()) {
                if (csv.length() > 0) csv += ",";
                csv += values.next().toString();
            }
            output.collect(key, new Text(csv));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("FindCitedPatentsAndOrder");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.set("key.value.separator.in.input.line", ",");  // input is comma-separated

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new FindCitedPatentsAndOrder(), args);
        System.exit(exitCode);
    }
}
```
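To run it, package the class into a jar (the jar name here is hypothetical) and pass the input and output paths on the command line:

```
hadoop jar patent-jobs.jar FindCitedPatentsAndOrder cite75_99.txt cited-by
```

Each line of the output has the form `cited<TAB>citing1,citing2,...`, i.e. one cited patent followed by the comma-separated list of patents that cite it.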
```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Counts how many times each patent is cited.
public class CitedPatentsNumberCounter extends Configured implements Tool {
    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text key, Text value,
                        OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            output.collect(value, key);  // the key point: invert (citing, cited) to (cited, citing)
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<Text, Text, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterator<Text> values,
                           OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            int count = 0;
            while (values.hasNext()) {
                values.next();
                count++;
            }
            output.collect(key, new IntWritable(count));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("CitedPatentsNumberCounter");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapOutputValueClass(Text.class);      // V2 is Text while V3 is IntWritable, so set it explicitly
        job.setOutputKeyClass(Text.class);           // sets K3 (and K2, since no map output key class is set)
        job.setOutputValueClass(IntWritable.class);  // V3
        job.set("key.value.separator.in.input.line", ",");  // input is comma-separated, as above

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CitedPatentsNumberCounter(), args);
        System.exit(exitCode);
    }
}
```
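One refinement worth sketching, which is not part of the original code: every (cited, citing) pair above crosses the network to a reducer only to be counted. If the map instead emits an `IntWritable` 1 per citation, a summing reducer can double as a combiner and pre-aggregate on the map side, shrinking the shuffle. A minimal sketch under that assumption; the names `CitedCountWithCombiner` and `SumReducer` are mine:

```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Sketch (hypothetical class, not from the original article): the same
// citation count, but with a combiner to cut shuffle traffic.
public class CitedCountWithCombiner extends Configured implements Tool {
    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);

        @Override
        public void map(Text citing, Text cited,
                        OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            output.collect(cited, ONE);  // one "vote" per citation
        }
    }

    // Summing is associative and commutative, so the same class can
    // safely serve as both the combiner and the reducer.
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterator<IntWritable> values,
                           OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            int sum = 0;
            while (values.hasNext()) sum += values.next().get();
            output.collect(key, new IntWritable(sum));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setJobName("CitedCountWithCombiner");
        job.setMapperClass(MapClass.class);
        job.setCombinerClass(SumReducer.class);  // pre-aggregate partial counts map-side
        job.setReducerClass(SumReducer.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);           // K2 = K3 = Text
        job.setOutputValueClass(IntWritable.class);  // V2 = V3 = IntWritable
        job.set("key.value.separator.in.input.line", ",");

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new CitedCountWithCombiner(), args));
    }
}
```

Because map output and reduce output now share the same types, no separate `setMapOutputValueClass()` call is needed.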
```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Counts, for each citation count, how many patents were cited that many
// times. Input is the output of CitedPatentsNumberCounter: "patent \t count".
public class CitationFrequencyStatistics extends Configured implements Tool {
    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, IntWritable, IntWritable> {
        private final static IntWritable UNO = new IntWritable(1);  // the constant 1
        private IntWritable citationCount = new IntWritable();

        @Override
        public void map(Text key, Text value,
                        OutputCollector<IntWritable, IntWritable> output, Reporter reporter)
                throws IOException {
            citationCount.set(Integer.parseInt(value.toString()));
            output.collect(citationCount, UNO);  // the key point: group patents by citation count
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        @Override
        public void reduce(IntWritable key, Iterator<IntWritable> values,
                           OutputCollector<IntWritable, IntWritable> output, Reporter reporter)
                throws IOException {
            int count = 0;
            while (values.hasNext()) {
                values.next();
                count++;
            }
            output.collect(key, new IntWritable(count));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("CitationFrequencyStatistics");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);  // default separator is tab, matching the previous job's output
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);  // sets the types of both K2,V2 and K3,V3

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CitationFrequencyStatistics(), args);
        System.exit(exitCode);
    }
}
```
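These two jobs are meant to run back to back: CitedPatentsNumberCounter reads the raw comma-separated citation file, and CitationFrequencyStatistics reads its tab-separated output, which is why the second job can rely on KeyValueTextInputFormat's default tab separator. A hypothetical pair of invocations (jar and path names are mine):

```
hadoop jar patent-jobs.jar CitedPatentsNumberCounter cite75_99.txt citation-counts
hadoop jar patent-jobs.jar CitationFrequencyStatistics citation-counts citation-freq
```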
The latest Hadoop release, MapReduce release 0.20.0, includes a brand-new MapReduce Java API, sometimes referred to as the "context objects" API.
The new API is not type-compatible with the old one, so existing applications must be rewritten to take advantage of it.
There are several notable differences between the new and old APIs:

- The new classes live in `org.apache.hadoop.mapreduce` rather than `org.apache.hadoop.mapred`.
- `Mapper` and `Reducer` are now abstract classes instead of interfaces, which lets them grow new methods without breaking existing code.
- A `Context` object replaces `OutputCollector` and `Reporter`; results are emitted with `context.write()`.
- Jobs are configured and submitted through `Configuration` and `Job` instead of `JobConf` and `JobClient`.
- `reduce()` receives an `Iterable` (usable in a for-each loop) rather than an `Iterator`, and both `map()` and `reduce()` may throw `InterruptedException`.

Here is FindCitedPatentsAndOrder rewritten against the new API:
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

// FindCitedPatentsAndOrder rewritten against the new (org.apache.hadoop.mapreduce) API.
public class MyJob extends Configured implements Tool {
    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // TextInputFormat hands us whole lines, so split "citing,cited" ourselves.
            String[] citation = value.toString().split(",");
            context.write(new Text(citation[1]), new Text(citation[0]));
        }
    }

    public static class ReduceClass extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String csv = "";
            for (Text val : values) {
                if (csv.length() > 0) csv += ",";
                csv += val.toString();
            }
            context.write(key, new Text(csv));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf, "Myjob");
        job.setJarByClass(MyJob.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Submit the job and wait for it to finish.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MyJob(), args);
        System.exit(exitCode);
    }
}
```
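Two details of this rewrite are worth noting. First, the mapper splits each input line on "," itself: as of release 0.20.0 the new API did not yet ship its own KeyValueTextInputFormat (it reappeared in later releases), so TextInputFormat plus a manual split is the usual workaround. Second, `run()` now adds the input and output paths and submits the job with `job.waitForCompletion(true)`, which plays the role `JobClient.runJob()` played in the old API.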