<configuration>
  <!-- Address of the NameNode -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
    <description>HDFS URI: filesystem://namenode-host:port</description>
  </property>
  <!-- Directory for files Hadoop generates at runtime -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/D:/bigdata/hadoop-2.7.4/workplace/tmp</value>
    <description>Local Hadoop temp directory on the NameNode</description>
  </property>
</configuration>
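As a quick sanity check of this setting, you can probe the NameNode RPC port from Python. This is only a sketch, assuming the NameNode really listens on localhost:9000 as configured above; the function name is mine, not part of any Hadoop API:

#!/usr/bin/env python
# Probe the NameNode RPC port from core-site.xml (illustrative sketch)
import socket

def namenode_reachable(host="localhost", port=9000, timeout=3.0):
    # True if something accepts a TCP connection on the NameNode port
    try:
        sock = socket.create_connection((host, port), timeout)
        sock.close()
        return True
    except socket.error:
        return False

if __name__ == "__main__":
    print("NameNode reachable: %s" % namenode_reachable())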
<configuration>
  <!-- Number of replicas HDFS keeps for each block -->
  <property>
    <name>dfs.replication</name>
    <value>1</value>
    <description>Replica count; the default is 3, and it should not exceed the number of DataNodes</description>
  </property>
  <property>
    <name>dfs.name.dir</name>
    <value>/D:/bigdata/hadoop-2.7.4/workplace/name</value>
    <description>Where the NameNode stores the HDFS namespace metadata</description>
  </property>
  <property>
    <name>dfs.data.dir</name>
    <value>/D:/bigdata/hadoop-2.7.4/workplace/data</value>
    <description>Physical storage location of data blocks on the DataNode</description>
  </property>
  <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
    <description>Enable the WebHDFS REST interface</description>
  </property>
  <property>
    <name>dfs.permissions</name>
    <value>false</value>
    <description>Disable HDFS permission checking</description>
  </property>
</configuration>
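Because dfs.webhdfs.enabled is true above, HDFS can also be browsed over plain HTTP. A minimal sketch, assuming the NameNode web UI sits on its default port 50070; LISTSTATUS is a documented WebHDFS operation:

# List the HDFS root directory via WebHDFS
import json
try:
    from urllib.request import urlopen   # Python 3
except ImportError:
    from urllib2 import urlopen          # Python 2

resp = urlopen("http://localhost:50070/webhdfs/v1/?op=LISTSTATUS")
for status in json.loads(resp.read().decode("utf-8"))["FileStatuses"]["FileStatus"]:
    print("%s\t%s" % (status["type"], status["pathSuffix"]))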
<configuration>
  <!-- The NodeManager serves map output to reducers via mapreduce_shuffle -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <!-- My machine has 8 GB of RAM; around 6 GB would normally be safer,
       but for convenience I simply used the full physical 8 GB -->
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>8192</value>
  </property>
  <!-- Minimum memory allocated per YARN container -->
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>1536</value>
  </property>
  <!-- Maximum memory allocated per YARN container -->
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>4096</value>
  </property>
  <!-- Number of CPU vcores the NodeManager offers to containers -->
  <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>2</value>
  </property>
  <!-- Property name must match the aux-service name (mapreduce_shuffle) -->
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
</configuration>
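To see how these numbers interact, here is a back-of-the-envelope sketch (pure arithmetic, not a YARN API call). With 8192 MB per NodeManager and 2048 MB requested per map/reduce task (configured in mapred-site.xml below), about four such containers can run at once; note that the default CapacityScheduler may additionally round each request up to a multiple of the minimum allocation, so treat this as an approximation:

node_mb = 8192    # yarn.nodemanager.resource.memory-mb
min_alloc = 1536  # yarn.scheduler.minimum-allocation-mb
max_alloc = 4096  # yarn.scheduler.maximum-allocation-mb
task_mb = 2048    # mapreduce.map.memory.mb / mapreduce.reduce.memory.mb

# The scheduler clamps each request into [min_alloc, max_alloc]
granted = min(max(task_mb, min_alloc), max_alloc)
print("granted per container: %d MB" % granted)            # 2048
print("concurrent containers: %d" % (node_mb // granted))  # 4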
<configuration>
  <!-- Tell Hadoop to run MapReduce on YARN -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.map.memory.mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>2048</value>
  </property>
  <!-- Legacy MRv1 JobTracker setting; ignored when running on YARN -->
  <property>
    <name>mapreduce.jobtracker.http.address</name>
    <value>localhost:50030</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>localhost:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>localhost:19888</value>
  </property>
  <!-- Legacy MRv1 JobTracker setting; ignored when running on YARN -->
  <property>
    <name>mapred.job.tracker</name>
    <value>http://localhost:9001</value>
  </property>
</configuration>
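With mapreduce.jobhistory.webapp.address set as above, the history server exposes a REST API. A hedged sketch: /ws/v1/history/info is a documented JobHistory endpoint, but the history server is started separately from dfs/yarn, so it must actually be running for this to answer:

# Ask the JobHistory server for its build/start info
import json
try:
    from urllib.request import urlopen   # Python 3
except ImportError:
    from urllib2 import urlopen          # Python 2

info = json.loads(urlopen("http://localhost:19888/ws/v1/history/info").read().decode("utf-8"))
print(info)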
# Directory that holds the four config files edited above
D:\bigdata\hadoop-2.7.4\etc\hadoop\
# Hadoop install root (set this as HADOOP_HOME)
D:\bigdata\hadoop-2.7.4
# Add this directory to PATH
D:\bigdata\hadoop-2.7.4\bin
# "hadoop namenode -format" still works but is deprecated; "hdfs namenode -format" is the current form
hadoop namenode -format
cd /d D:\bigdata\hadoop-2.7.4\sbin
dir
## Recommended startup sequence
start-dfs.cmd
start-yarn.cmd
## The blunt way: start everything at once (deprecated)
start-all.cmd
jps
# Check that each component started successfully and holds a process ID
# Passing an unrecognized argument such as -info makes the driver print the list of available example programs
hadoop jar D:\bigdata\hadoop-2.7.4\share\hadoop\mapreduce\hadoop-mapreduce-examples-2.7.4.jar -info
# Run the classic pi test (3 map tasks, 100 samples each; multiply the two for the total sample count).
# Note the program name is lowercase "pi"; the example names are case-sensitive.
hadoop jar D:\bigdata\hadoop-2.7.4\share\hadoop\mapreduce\hadoop-mapreduce-examples-2.7.4.jar pi 3 100
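For intuition, the pi example estimates π by sampling points in the unit square and counting how many land inside the quarter circle, giving pi ≈ 4 * inside / total. The sketch below uses plain random sampling to illustrate the idea; the actual Hadoop example uses a Halton quasi-random sequence, so its output is deterministic while this one is not:

# Pure-Python illustration of the idea behind `pi 3 100`
import random

def estimate_pi(maps=3, samples=100):
    total = maps * samples  # 3 tasks x 100 samples = 300 points
    inside = sum(1 for _ in range(total)
                 if random.random() ** 2 + random.random() ** 2 <= 1.0)
    return 4.0 * inside / total

print(estimate_pi())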
Usage: $HADOOP_PREFIX/bin/hadoop jar hadoop-streaming.jar [options]
Options:
-input <path> DFS input file(s) for the Map step.
-output <path> DFS output directory for the Reduce step.
-mapper <cmd|JavaClassName> Optional. Command to be run as mapper.
-combiner <cmd|JavaClassName> Optional. Command to be run as combiner.
-reducer <cmd|JavaClassName> Optional. Command to be run as reducer.
-file <file> Optional. File/dir to be shipped in the Job jar file.
Deprecated. Use generic option "-files" instead.
-inputformat <TextInputFormat(default)|SequenceFileAsTextInputFormat|JavaClassName>
Optional. The input format class.
-outputformat <TextOutputFormat(default)|JavaClassName>
Optional. The output format class.
-partitioner <JavaClassName> Optional. The partitioner class.
-numReduceTasks <num> Optional. Number of reduce tasks.
-inputreader <spec> Optional. Input recordreader spec.
-cmdenv <n>=<v> Optional. Pass env.var to streaming commands.
-mapdebug <cmd> Optional. To run this script when a map task fails.
-reducedebug <cmd> Optional. To run this script when a reduce task fails.
-io <identifier> Optional. Format to use for input to and output
from mapper/reducer commands
-lazyOutput Optional. Lazily create Output.
-background Optional. Submit the job and don't wait till it completes.
-verbose Optional. Print verbose output.
-info Optional. Print detailed usage.
-help Optional. Print help message.
Generic options supported are
-conf <configuration file> specify an application configuration file
-D <property=value> use value for given property
-fs <local|namenode:port> specify a namenode
-jt <local|resourcemanager:port> specify a ResourceManager
-files <comma separated list of files> specify comma separated files to be copied to the map reduce cluster
-libjars <comma separated list of jars> specify comma separated jar files to include in the classpath.
-archives <comma separated list of archives> specify comma separated archives to be unarchived on the compute machines.
The general command line syntax is
bin/hadoop command [genericOptions] [commandOptions]
Usage tips:
In -input: globbing on <path> is supported and can have multiple -input
Default Map input format: a line is a record in UTF-8 the key part ends at first
TAB, the rest of the line is the value
To pass a Custom input format:
-inputformat package.MyInputFormat
Similarly, to pass a custom output format:
-outputformat package.MyOutputFormat
The files with extensions .class and .jar/.zip, specified for the -file
argument[s], end up in "classes" and "lib" directories respectively inside
the working directory when the mapper and reducer are run. All other files
specified for the -file argument[s] end up in the working directory when the
mapper and reducer are run. The location of this working directory is
unspecified.
To set the number of reduce tasks (num. of output files) as, say 10:
Use -numReduceTasks 10
To skip the sort/combine/shuffle/sort/reduce step:
Use -numReduceTasks 0
Map output then becomes a 'side-effect output' rather than a reduce input.
This speeds up processing. This also feels more like "in-place" processing
because the input filename and the map input order are preserved.
This is equivalent to -reducer NONE
To speed up the last maps:
-D mapreduce.map.speculative=true
To speed up the last reduces:
-D mapreduce.reduce.speculative=true
To name the job (appears in the JobTracker Web UI):
-D mapreduce.job.name='My Job'
To change the local temp directory:
-D dfs.data.dir=/tmp/dfs
-D stream.tmpdir=/tmp/streaming
Additional local temp directories with -jt local:
-D mapreduce.cluster.local.dir=/tmp/local
-D mapreduce.jobtracker.system.dir=/tmp/system
-D mapreduce.cluster.temp.dir=/tmp/temp
To treat tasks with non-zero exit status as SUCCEDED:
-D stream.non.zero.exit.is.failure=false
Use a custom hadoop streaming build along with standard hadoop install:
$HADOOP_PREFIX/bin/hadoop jar /path/my-hadoop-streaming.jar [...]\
[...] -D stream.shipped.hadoopstreaming=/path/my-hadoop-streaming.jar
For more details about jobconf parameters see:
http://wiki.apache.org/hadoop/JobConfFile
To set an environement variable in a streaming command:
-cmdenv EXAMPLE_DIR=/home/example/dictionaries/
Shortcut:
setenv HSTREAMING "$HADOOP_PREFIX/bin/hadoop jar hadoop-streaming.jar"
Example: $HSTREAMING -mapper "/usr/local/bin/perl5 filter.pl"
-file /local/filter.pl -input "/logs/0604*/*" [...]
Ships a script, invokes the non-shipped perl interpreter. Shipped files go to
the working directory so filter.pl is found by perl. Input files are all the
daily logs for days in month 2006-04
["milton-paradise.txt", "[ Paradise Lost by John Milton 1667 ] Book I Of Man ' s first disobedience , and the fruit Of that forbidden tree whose mortal taste Brought death into the World , and all our woe , With loss of Eden , till one greater Man Restore us , and regain the blissful seat , Sing , Heavenly Muse , that , on the secret top Of Oreb , or of Sinai , didst inspire That shepherd who first taught the chosen seed In the beginning how the heavens and earth Rose out of Chaos : or , if Sion hill Delight thee more , and Siloa ' s brook that flowed Fast by the oracle of God , I thence Invoke thy aid to my adventurous song , That with no middle flight intends to soar Above th ' Aonian mount , while it pursues Things unattempted yet in prose or rhyme ."]
["edgeworth-parents.txt", "[ The Parent ' s Assistant , by Maria Edgeworth ] THE ORPHANS . Near the ruins of the castle of Rossmore , in Ireland , is a small cabin , in which there once lived a widow and her four children . As long as she was able to work , she was very industrious , and was accounted the best spinner in the parish ; but she overworked herself at last , and fell ill , so that she could not sit to her wheel as she used to do , and was obliged to give it up to her eldest daughter , Mary ."]
["austen-emma.txt", "[ Emma by Jane Austen 1816 ] VOLUME I CHAPTER I Emma Woodhouse , handsome , clever , and rich , with a comfortable home and happy disposition , seemed to unite some of the best blessings of existence ; and had lived nearly twenty - one years in the world with very little to distress or vex her . She was the youngest of the two daughters of a most affectionate , indulgent father ; and had , in consequence of her sister ' s marriage , been mistress of his house from a very early period . Her mother had died too long ago for her to have more than an indistinct remembrance of her caresses ; and her place had been supplied by an excellent woman as governess , who had fallen little short of a mother in affection ."]
["chesterton-ball.txt", "[ The Ball and The Cross by G . K . Chesterton 1909 ] I . A DISCUSSION SOMEWHAT IN THE AIR The flying ship of Professor Lucifer sang through the skies like a silver arrow ; the bleak white steel of it , gleaming in the bleak blue emptiness of the evening . That it was far above the earth was no expression for it ; to the two men in it , it seemed to be far above the stars . The professor had himself invented the flying machine , and had also invented nearly everything in it ."]
["bible-kjv.txt", "[ The King James Bible ] The Old Testament of the King James Bible The First Book of Moses : Called Genesis 1 : 1 In the beginning God created the heaven and the earth . 1 : 2 And the earth was without form , and void ; and darkness was upon the face of the deep . And the Spirit of God moved upon the face of the waters . 1 : 3 And God said , Let there be light : and there was light . 1 : 4 And God saw the light , that it was good : and God divided the light from the darkness . 1 : 5 And God called the light Day , and the darkness he called Night . And the evening and the morning were the first day ."]
["chesterton-thursday.txt", "[ The Man Who Was Thursday by G . K . Chesterton 1908 ] To Edmund Clerihew Bentley A cloud was on the mind of men , and wailing went the weather , Yea , a sick cloud upon the soul when we were boys together . Science announced nonentity and art admired decay ; The world was old and ended : but you and I were gay ; Round us in antic order their crippled vices came -- Lust that had lost its laughter , fear that had lost its shame . Like the white lock of Whistler , that lit our aimless gloom , Men showed their own white feather as proudly as a plume . Life was a fly that faded , and death a drone that stung ; The world was very old indeed when you and I were young ."]
["blake-poems.txt", "[ Poems by William Blake 1789 ] SONGS OF INNOCENCE AND OF EXPERIENCE and THE BOOK of THEL SONGS OF INNOCENCE INTRODUCTION Piping down the valleys wild , Piping songs of pleasant glee , On a cloud I saw a child , And he laughing said to me : \" Pipe a song about a Lamb !\" So I piped with merry cheer . \" Piper , pipe that song again ;\" So I piped : he wept to hear . \" Drop thy pipe , thy happy pipe ; Sing thy songs of happy cheer :!\" So I sang the same again , While he wept with joy to hear . \" Piper , sit thee down and write In a book , that all may read .\" So he vanish ' d from my sight ; And I pluck ' d a hollow reed , And I made a rural pen , And I stain ' d the water clear , And I wrote my happy songs Every child may joy to hear ."]
["shakespeare-caesar.txt", "[ The Tragedie of Julius Caesar by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Flauius , Murellus , and certaine Commoners ouer the Stage . Flauius . Hence : home you idle Creatures , get you home : Is this a Holiday ? What , know you not ( Being Mechanicall ) you ought not walke Vpon a labouring day , without the signe Of your Profession ? Speake , what Trade art thou ? Car . Why Sir , a Carpenter Mur . Where is thy Leather Apron , and thy Rule ? What dost thou with thy best Apparrell on ? You sir , what Trade are you ? Cobl . Truely Sir , in respect of a fine Workman , I am but as you would say , a Cobler Mur . But what Trade art thou ? Answer me directly Cob . A Trade Sir , that I hope I may vse , with a safe Conscience , which is indeed Sir , a Mender of bad soules Fla ."]
["whitman-leaves.txt", "[ Leaves of Grass by Walt Whitman 1855 ] Come , said my soul , Such verses for my Body let us write , ( for we are one ,) That should I after return , Or , long , long hence , in other spheres , There to some group of mates the chants resuming , ( Tallying Earth ' s soil , trees , winds , tumultuous waves ,) Ever with pleas ' d smile I may keep on , Ever and ever yet the verses owning -- as , first , I here and now Signing for Soul and Body , set to them my name , Walt Whitman [ BOOK I . INSCRIPTIONS ] } One ' s - Self I Sing One ' s - self I sing , a simple separate person , Yet utter the word Democratic , the word En - Masse ."]
["melville-moby_dick.txt", "[ Moby Dick by Herman Melville 1851 ] ETYMOLOGY . ( Supplied by a Late Consumptive Usher to a Grammar School ) The pale Usher -- threadbare in coat , heart , body , and brain ; I see him now . He was ever dusting his old lexicons and grammars , with a queer handkerchief , mockingly embellished with all the gay flags of all the known nations of the world . He loved to dust his old grammars ; it somehow mildly reminded him of his mortality ."]
Task requirement: build an inverted index mapping each word to the list of files it appears in.
Data:
["test_1.txt", "[ apple pipe ]"]
["test_2.txt", "[ apple company ]"]
Result:
apple ["test_1.txt", "test_2.txt"]
pipe ["test_1.txt"]
company ["test_2.txt"]
mapper.py:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
""" Created on 2017-10-30 @author: Leo """
# Standard library only
import sys
import json

# Each input line is a JSON array: [file_name, text]
for line in sys.stdin:
    line = line.strip()
    record = json.loads(line)
    file_name = record[0]
    value = record[1]
    words = value.split()
    for word in words:
        # Emit one (word, file_name) pair per token
        print("%s\t%s" % (word, file_name))
reducer.py:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
""" Created on 2017-10-30 @author: Leo """
# Standard library only
import sys

media = {}
word_in_media = {}
# Collect, for each word, the list of files it appeared in
for line in sys.stdin:
    (word, file_name) = line.strip().split('\t', 1)
    media.setdefault(word, [])
    media[word].append(file_name)
# De-duplicate each word's file list
for word in media:
    word_in_media.setdefault(word, list(set(media[word])))
for word in word_in_media:
    print("%s\t%s" % (word, word_in_media[word]))
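Before submitting anything to the cluster, the two scripts can be smoke-tested locally by emulating the shuffle with a sort. A sketch, assuming mapper.py, reducer.py, and books.json sit in the current directory and `python` is on PATH; the OS sort only needs to group identical keys together here, so Windows' sort.exe works as well as the Unix one:

# Emulate: cat books.json | mapper.py | sort | reducer.py
import subprocess

with open("books.json") as src:
    mapper = subprocess.Popen(["python", "mapper.py"],
                              stdin=src, stdout=subprocess.PIPE)
    sorter = subprocess.Popen(["sort"],
                              stdin=mapper.stdout, stdout=subprocess.PIPE)
    reducer = subprocess.Popen(["python", "reducer.py"],
                               stdin=sorter.stdout, stdout=subprocess.PIPE)
    mapper.stdout.close()   # let upstream processes see downstream exit
    sorter.stdout.close()
    print(reducer.communicate()[0].decode("utf-8"))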
Upload books.json to HDFS
# If you have not yet created a directory on HDFS, create one first
# This is how I created mine; adjust the path as you like
hdfs dfs -mkdir -p /user/Leo/input
# Upload from the local filesystem to HDFS
hdfs dfs -copyFromLocal <absolute path>\books.json /user/Leo/input/
# Open the HDFS web UI at localhost:50070 and verify that the file exists
# If the MapReduce job fails partway, remember to delete the output
# directory after fixing the problem, or the re-run will refuse to start
hdfs dfs -rm -r /user/Leo/output
# If an abnormal operation left HDFS stuck in safe mode, leave it manually
hdfs dfsadmin -safemode leave
# The command is long, so it is shown with Linux-style line continuations
hadoop jar D:/bigdata/hadoop-2.7.4/share/hadoop/tools/lib/hadoop-streaming-2.7.4.jar \
-D stream.non.zero.exit.is.failure=false \
-input /user/Leo/input/books.json \
-output /user/Leo/output \
-mapper "python mapper.py" \
-reducer "python reducer.py" \
-file C:/Users/Administrator/Desktop/MingDong_Work/Work_2/mapper.py \
-file C:/Users/Administrator/Desktop/MingDong_Work/Work_2/reducer.py
# Explanation:
1. The path after `jar` is the streaming jar. The official docs suggest an environment variable plus a relative path; an absolute path is used here for clarity.
2. `-D stream.non.zero.exit.is.failure=false` tells Streaming not to treat a mapper or reducer that exits with a non-zero status (i.e. one that does not return 0) as a failed task; the exit-status check is skipped.
3. `-input`: the input file on HDFS.
4. `-output`: the HDFS directory where the job's results are stored.
5. `-mapper`: the script or command to execute as the mapper.
6. `-reducer`: the script or command to execute as the reducer.
7. `-file`: ships the local script into the job (use `-files` for multiple files; the older `-file` form is used here because it reads more clearly).
hdfs dfs -get <HDFS file path> <local path>
# e.g. hdfs dfs -get /user/Leo/output/part-00000 .
Advantages: convenient and fast to get running.
Disadvantages: mostly performance problems.