Characteristics of the Patent Data
First, obtain the patent data from http://data.nber.org/patents/
This article uses cite75_99.txt, which records citations made to U.S. patents between 1975 and 1999 and contains more than 16 million records. The first few lines look like this:
The first column is the citing patent number; the second column is the patent number cited by the first.
| CITING  | CITED  |
|---------|--------|
| 3858241 | 956203 |
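A quick look at the raw file helps when reading the code below: it is plain comma-separated text with a quoted header line. The rows after the first data pair here are illustrative, not copied from the file:

```
"CITING","CITED"
3858241,956203
3858241,1324234
...
```

Note that the header line is a record like any other, so the first job below will dutifully emit a ("CITED", "CITING") pair for it; for a quick experiment this is usually just ignored.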
```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Reads the patent citation data and, for each patent, finds the patents
// that cite it, merging them into one comma-separated list.
public class FindCitedPatentsAndOrder extends Configured implements Tool {
    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text key, Text value,
                        OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            output.collect(value, key);  // the key point: invert (citing, cited) to (cited, citing)
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterator<Text> values,
                           OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            String csv = "";
            while (values.hasNext()) {
                if (csv.length() > 0) csv += ",";
                csv += values.next().toString();
            }
            output.collect(key, new Text(csv));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("FindCitedPatentsAndOrder");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.set("key.value.separator.in.input.line", ",");  // input is comma-separated

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new FindCitedPatentsAndOrder(), args);
        System.exit(exitCode);
    }
}
```
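To run it, package the class into a jar (the jar name here is hypothetical) and pass the input and output paths on the command line:

```
hadoop jar patent-jobs.jar FindCitedPatentsAndOrder cite75_99.txt cited-by
```

Each line of the output has the form `cited<TAB>citing1,citing2,...`, i.e. one cited patent followed by the comma-separated list of patents that cite it.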
```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Counts how many times each patent is cited.
public class CitedPatentsNumberCounter extends Configured implements Tool {
    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text key, Text value,
                        OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            output.collect(value, key);  // the key point: invert (citing, cited) to (cited, citing)
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<Text, Text, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterator<Text> values,
                           OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            int count = 0;
            while (values.hasNext()) {
                values.next();
                count++;
            }
            output.collect(key, new IntWritable(count));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("CitedPatentsNumberCounter");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapOutputValueClass(Text.class);      // V2 is Text while V3 is IntWritable, so set it explicitly
        job.setOutputKeyClass(Text.class);           // sets K3 (and K2, since no map output key class is set)
        job.setOutputValueClass(IntWritable.class);  // V3
        job.set("key.value.separator.in.input.line", ",");  // input is comma-separated, as above

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CitedPatentsNumberCounter(), args);
        System.exit(exitCode);
    }
}
```
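One refinement worth sketching, which is not part of the original code: every (cited, citing) pair above crosses the network to a reducer only to be counted. If the map instead emits an `IntWritable` 1 per citation, a summing reducer can double as a combiner and pre-aggregate on the map side, shrinking the shuffle. A minimal sketch under that assumption; the names `CitedCountWithCombiner` and `SumReducer` are mine:

```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Sketch (hypothetical class, not from the original article): the same
// citation count, but with a combiner to cut shuffle traffic.
public class CitedCountWithCombiner extends Configured implements Tool {
    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);

        @Override
        public void map(Text citing, Text cited,
                        OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            output.collect(cited, ONE);  // one "vote" per citation
        }
    }

    // Summing is associative and commutative, so the same class can
    // safely serve as both the combiner and the reducer.
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterator<IntWritable> values,
                           OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            int sum = 0;
            while (values.hasNext()) sum += values.next().get();
            output.collect(key, new IntWritable(sum));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setJobName("CitedCountWithCombiner");
        job.setMapperClass(MapClass.class);
        job.setCombinerClass(SumReducer.class);  // pre-aggregate partial counts map-side
        job.setReducerClass(SumReducer.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);           // K2 = K3 = Text
        job.setOutputValueClass(IntWritable.class);  // V2 = V3 = IntWritable
        job.set("key.value.separator.in.input.line", ",");

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new CitedCountWithCombiner(), args));
    }
}
```

Because map output and reduce output now share the same types, no separate `setMapOutputValueClass()` call is needed.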
```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Counts, for each citation count, how many patents were cited that many
// times. Input is the output of CitedPatentsNumberCounter: "patent \t count".
public class CitationFrequencyStatistics extends Configured implements Tool {
    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, IntWritable, IntWritable> {
        private final static IntWritable UNO = new IntWritable(1);  // the constant 1
        private IntWritable citationCount = new IntWritable();

        @Override
        public void map(Text key, Text value,
                        OutputCollector<IntWritable, IntWritable> output, Reporter reporter)
                throws IOException {
            citationCount.set(Integer.parseInt(value.toString()));
            output.collect(citationCount, UNO);  // the key point: group patents by citation count
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        @Override
        public void reduce(IntWritable key, Iterator<IntWritable> values,
                           OutputCollector<IntWritable, IntWritable> output, Reporter reporter)
                throws IOException {
            int count = 0;
            while (values.hasNext()) {
                values.next();
                count++;
            }
            output.collect(key, new IntWritable(count));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("CitationFrequencyStatistics");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);  // default separator is tab, matching the previous job's output
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);  // sets the types of both K2,V2 and K3,V3

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CitationFrequencyStatistics(), args);
        System.exit(exitCode);
    }
}
```
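These two jobs are meant to run back to back: CitedPatentsNumberCounter reads the raw comma-separated citation file, and CitationFrequencyStatistics reads its tab-separated output, which is why the second job can rely on KeyValueTextInputFormat's default tab separator. A hypothetical pair of invocations (jar and path names are mine):

```
hadoop jar patent-jobs.jar CitedPatentsNumberCounter cite75_99.txt citation-counts
hadoop jar patent-jobs.jar CitationFrequencyStatistics citation-counts citation-freq
```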
The latest Hadoop release, MapReduce release 0.20.0, includes a brand-new MapReduce Java API, sometimes referred to as the "context objects" API.
The new API is not type-compatible with the old one, so existing applications must be rewritten to take advantage of it.
There are several notable differences between the new and old APIs:

- The new classes live in `org.apache.hadoop.mapreduce` rather than `org.apache.hadoop.mapred`.
- `Mapper` and `Reducer` are now abstract classes instead of interfaces, which lets them grow new methods without breaking existing code.
- A `Context` object replaces `OutputCollector` and `Reporter`; results are emitted with `context.write()`.
- Jobs are configured and submitted through `Configuration` and `Job` instead of `JobConf` and `JobClient`.
- `reduce()` receives an `Iterable` (usable in a for-each loop) rather than an `Iterator`, and both `map()` and `reduce()` may throw `InterruptedException`.

Here is FindCitedPatentsAndOrder rewritten against the new API:
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

// FindCitedPatentsAndOrder rewritten against the new (org.apache.hadoop.mapreduce) API.
public class MyJob extends Configured implements Tool {
    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // TextInputFormat hands us whole lines, so split "citing,cited" ourselves.
            String[] citation = value.toString().split(",");
            context.write(new Text(citation[1]), new Text(citation[0]));
        }
    }

    public static class ReduceClass extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String csv = "";
            for (Text val : values) {
                if (csv.length() > 0) csv += ",";
                csv += val.toString();
            }
            context.write(key, new Text(csv));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf, "Myjob");
        job.setJarByClass(MyJob.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Submit the job and wait for it to finish.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MyJob(), args);
        System.exit(exitCode);
    }
}
```
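Two details of this rewrite are worth noting. First, the mapper splits each input line on "," itself: as of release 0.20.0 the new API did not yet ship its own KeyValueTextInputFormat (it reappeared in later releases), so TextInputFormat plus a manual split is the usual workaround. Second, `run()` now adds the input and output paths and submits the job with `job.waitForCompletion(true)`, which plays the role `JobClient.runJob()` played in the old API.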