Unless a step explicitly says to log in to another machine, every command below is run on HD-2-101 of the cluster.
All installation packages: Baidu Netdisk, extraction code: 24oy
For the VM installation and static IP configuration, see: Linux传送门汇总
systemctl restart network.service
Note: three machines are configured here (192.168.2.101, 192.168.2.102, 192.168.2.103).
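For reference, here is a minimal sketch of the static-IP interface file that the linked post sets up, written as a shell snippet. The interface name ens33 and the gateway/DNS values are assumptions; IPADDR changes per machine (.101/.102/.103), after which the network service is restarted as above.
# Sketch only: adjust DEVICE/IPADDR per machine before restarting the network
cat > /etc/sysconfig/network-scripts/ifcfg-ens33 <<'EOF'
TYPE=Ethernet
BOOTPROTO=static
NAME=ens33
DEVICE=ens33
ONBOOT=yes
IPADDR=192.168.2.101
NETMASK=255.255.255.0
GATEWAY=192.168.2.1
DNS1=192.168.2.1
EOF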
# Install this machine's dependencies first; anything missing elsewhere gets installed later. These are the only ones my machine needs right now o_0
yum install tcl-devel.x86_64 rsync.x86_64 ntp.x86_64 -y
Shell script contents:
Execution steps:
tar -zxvf autoconfig.tar.gz -C /home
4. Change to the /home/autoconfig/bin directory and run: sh autoconfig.sh all
5. Distribute to all machines and execute (a minimal sketch of the xsync helper follows these commands):
cd /home/autoconfig/bin; sh xsync "/home/autoconfig" "/home"; sh doCommand other "cd /home/autoconfig/bin/; sh autoconfig.sh trust";
sh doCommand other "init 0"; init 0;
cd /home/autoconfig/bin; sh doCommand all "yum install tcl-devel.x86_64 rsync.x86_64 ntp.x86_64 -y"
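The xsync and doCommand helpers ship inside autoconfig.tar.gz and are not reproduced here. For readers following along without the archive, a minimal sketch of what an rsync-based xsync could look like (the hard-coded host list is an assumption; doCommand loops over hosts with ssh in the same way):
#!/bin/bash
# Hypothetical xsync sketch: copy a path to the same parent directory on the other nodes.
# Usage: sh xsync <source-path> <target-parent-dir>
src=$1
dest=$2
for host in HD-2-102 HD-2-103; do
  echo "==== syncing ${src} to ${host}:${dest} ===="
  rsync -az "${src}" "${host}:${dest}"
done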
First, check whether Java is already installed on any machine, and uninstall it if so.
Check: sh doCommand all "rpm -qa | grep java";
Uninstall with: rpm -e --nodeps <package-to-remove>
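If the check does report packages, a one-liner to strip them on every machine might look like this (assuming nothing else on the boxes depends on the bundled OpenJDK):
sh doCommand all "rpm -qa | grep java | xargs -r rpm -e --nodeps"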
sh doCommand all "mkdir -p /opt/cluster";
# The Java version is jdk1.8.0_144
tar -zxvf /opt/cluster/jdk-8u144-linux-x64.tar.gz -C /opt/cluster; sh xsync "/opt/cluster/jdk1.8.0_144" "/opt/cluster";
sh doCommand all "ln -s /opt/cluster/jdk1.8.0_144 /opt/cluster/java";
#JAVA_HOME
export JAVA_HOME=/opt/cluster/java
export PATH=$PATH:$JAVA_HOME/bin
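Assuming the two export lines above are appended to /etc/profile on every node (the startup commands later source /etc/profile), a quick check that the JDK is visible everywhere:
sh doCommand all "source /etc/profile; java -version"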
Note: HD-2-101 is chosen as the ntpd time server here. Edit /etc/ntp.conf on HD-2-101:
restrict 192.168.2.0 mask 255.255.255.0 nomodify notrap
restrict 127.0.0.1
# Comment out the following; the public pool servers are unreachable from the internal network
#server 0.centos.pool.ntp.org iburst
#server 1.centos.pool.ntp.org iburst
#server 2.centos.pool.ntp.org iburst
#server 3.centos.pool.ntp.org iburst
# If this node loses its network connection, it can still use its local clock to serve time to the other nodes in the cluster
server 127.127.1.0
fudge 127.127.1.0 stratum 5
# In /etc/sysconfig/ntpd, add the following (keeps the hardware clock in sync with the system clock)
SYNC_HWCLOCK="yes"
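The section does not show it explicitly, but after editing /etc/ntp.conf and /etc/sysconfig/ntpd the ntpd service on HD-2-101 presumably has to be restarted and enabled; a sketch:
systemctl restart ntpd.service
systemctl enable ntpd.service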
Create /etc/cron.d/ntp_crond with the following entry so every machine re-syncs from HD-2-101 every 10 minutes:
*/10 * * * * root /usr/sbin/ntpdate HD-2-101
sh xsync "/etc/cron.d/ntp_crond" "/etc/cron.d";
# Restart crond on all machines
sh doCommand all "systemctl restart crond.service"
# reach is the number of times this server has requested updates from its upstream NTP source, shown in octal; it changes once every poll interval (in seconds). Once reach is 17 or greater, other servers can synchronize against this one.
watch ntpq -p
# Make sure ntpd is not running on the other machines
sh doCommand other "systemctl stop ntpd.service;/usr/sbin/ntpdate HD-2-101;"
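To eyeball whether the clients actually picked up the time, any command that prints the clock on every machine will do, for example:
sh doCommand all "date '+%F %T'"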
Installation package download: zookeeper-3.4.14.tar.gz
tar -zxvf /opt/cluster/zookeeper-3.4.14.tar.gz -C /opt/cluster;
sh doCommand all "mkdir -p /hdata/zookeeper;";
Configure conf/zoo.cfg:
dataDir=/hdata/zookeeper
server.1=HD-2-101:2888:3888
server.2=HD-2-102:2888:3888
server.3=HD-2-103:2888:3888
#server.A=B:C:D
#A is a number indicating which server this is;
#B is the server's IP address (or hostname);
#C is the port this server uses to exchange information with the cluster Leader;
#D is the election port: if the Leader goes down, the servers use this port to talk to each other and elect a new Leader.
#In cluster mode a file named myid is placed in the dataDir directory; it contains nothing but the value of A. When ZooKeeper starts it reads this file and compares the value with the configuration in zoo.cfg to work out which server it is.
sh xsync "/opt/cluster/zookeeper-3.4.14" "/opt/cluster";
# This host is server.1, so write 1 (as in server.1=B:C:D)
echo "1" > /hdata/zookeeper/myid;
sh doCommand all "ln -s /opt/cluster/zookeeper-3.4.14 /opt/cluster/zookeeper";
sh doCommand all "/opt/cluster/zookeeper/bin/zkServer.sh start";
sh doCommand all "/opt/cluster/zookeeper/bin/zkServer.sh status";
Upload the hadoop_template.tar.gz template archive to the /home directory and extract it: tar -zxvf /home/hadoop_template.tar.gz -C /home
Fill in the values of the variables exported in env.sh according to your machines.
Run the script with sh /home/hadoop_template/ha/env.sh to complete the configuration automatically.
The HA template path is /home/hadoop_template/ha; all of the template files are listed below:
core-site.xml.template:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- Combine the addresses of the two NameNodes into a single nameservice -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://${HADOOP_CLUSTER_NAME}</value>
</property>
<!-- Storage directory for files Hadoop generates at runtime -->
<property>
<name>hadoop.tmp.dir</name>
<value>${HADOOP_TMP_DIR}</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>${HADOOP_ZOOKEEPERS}</value>
</property>
<!-- Prevents the NameNode from failing to start under start-dfs.sh when the JournalNodes are not up yet and cannot be reached -->
<property>
<name>ipc.client.connect.max.retries</name>
<value>100</value>
<description>Indicates the number of retries a client will make to establish a server connection.
</description>
</property>
<property>
<name>ipc.client.connect.retry.interval</name>
<value>10000</value>
<description>Indicates the number of milliseconds a client will wait for before retrying to establish a server connection.
</description>
</property>
</configuration>
hdfs-site.xml.template:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- Fully distributed cluster (nameservice) name -->
<property>
<name>dfs.nameservices</name>
<value>${HADOOP_CLUSTER_NAME}</value>
</property>
<!-- The NameNodes that make up the cluster -->
<property>
<name>dfs.ha.namenodes.${HADOOP_CLUSTER_NAME}</name>
<value>${HADOOP_NAME_NODES}</value>
</property>
<!-- RPC address of nn1 -->
<property>
<name>dfs.namenode.rpc-address.${HADOOP_CLUSTER_NAME}.nn1</name>
<value>${HADOOP_NN1}:9000</value>
</property>
<!-- RPC address of nn2 -->
<property>
<name>dfs.namenode.rpc-address.${HADOOP_CLUSTER_NAME}.nn2</name>
<value>${HADOOP_NN2}:9000</value>
</property>
<!-- HTTP address of nn1 -->
<property>
<name>dfs.namenode.http-address.${HADOOP_CLUSTER_NAME}.nn1</name>
<value>${HADOOP_NN1}:50070</value>
</property>
<!-- HTTP address of nn2 -->
<property>
<name>dfs.namenode.http-address.${HADOOP_CLUSTER_NAME}.nn2</name>
<value>${HADOOP_NN2}:50070</value>
</property>
<!-- Where the NameNode metadata (shared edits) is stored on the JournalNodes -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>${HADOOP_JN}</value>
</property>
<!-- Fencing method: ensures only one NameNode serves requests at any one time -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- sshfence requires passwordless SSH login -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>${HADOOP_ISA_PATH}</value>
</property>
<!-- Storage directory on the JournalNode servers -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>${HADOOP_JN_DATA_DIR}</value>
</property>
<!-- Disable permission checking -->
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- Client failover proxy provider: how clients find the active NameNode and fail over automatically -->
<property>
<name>dfs.client.failover.proxy.provider.${HADOOP_CLUSTER_NAME}</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
yarn-site.xml.template:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Enable ResourceManager HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- Declare the two ResourceManagers -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>${HADOOP_YARN_ID}</value>
</property>
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>${HADOOP_YARN_RMS}</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>${HADOOP_YARN_RM1}</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>${HADOOP_YARN_RM2}</value>
</property>
<!-- ZooKeeper quorum addresses -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>${HADOOP_ZOOKEEPERS}</value>
</property>
<!-- Enable automatic recovery -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- Store ResourceManager state in the ZooKeeper cluster -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
</configuration>
mapred-site.xml.template:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
env.sh:
#!/bin/bash
# Hadoop installation directory
export HADOOP_HOME="/opt/cluster/hadoop-2.7.2"
#
# Hadoop cluster (nameservice) name
export HADOOP_CLUSTER_NAME="myhadoop"
# Storage directory for files Hadoop generates at runtime
export HADOOP_TMP_DIR="/hdata/hadoop"
#
# All NameNode nodes in the cluster
export HADOOP_NAME_NODES="nn1,nn2"
# Address of each NameNode listed above; add variables HADOOP_NN1, HADOOP_NN2, ... in order
export HADOOP_NN1="HD-2-101"
export HADOOP_NN2="HD-2-102"
# Where the NameNode metadata (shared edits) is stored on the JournalNodes
export HADOOP_JN="qjournal://HD-2-101:8485;HD-2-102:8485;HD-2-103:8485/myhadoop"
# Path to the id_rsa private key (used by sshfence)
export HADOOP_ISA_PATH="~/.ssh/id_rsa"
# JournalNode storage directory
export HADOOP_JN_DATA_DIR="/hdata/hadoop/journal"
# ZooKeeper server list
export HADOOP_ZOOKEEPERS="HD-2-101:2181,HD-2-102:2181,HD-2-103:2181"
# YARN cluster id
export HADOOP_YARN_ID="yarn-ha"
# All ResourceManagers in the cluster
export HADOOP_YARN_RMS="rm1,rm2"
# Address of each ResourceManager listed above; add variables HADOOP_YARN_RM1, HADOOP_YARN_RM2, ... in order
export HADOOP_YARN_RM1="HD-2-101"
export HADOOP_YARN_RM2="HD-2-102"
# Substitute the exported variables into every *.template file in this directory
# and write the result into $HADOOP_HOME/etc/hadoop (core-site.xml.template -> core-site.xml, etc.)
baseDir=$(cd `dirname $0`; pwd)
for template in `cd ${baseDir}; ls *template`
do
    siteFile=`echo ${template} | gawk -F"." '{print $1"."$2}'`
    envsubst < ${baseDir}/${template} > ${HADOOP_HOME}/etc/hadoop/${siteFile}
    echo -e "#### set ${siteFile} succeed"
done
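To confirm the substitution worked, one option is to spot-check a generated file, for example:
grep -A1 "fs.defaultFS" /opt/cluster/hadoop-2.7.2/etc/hadoop/core-site.xml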
# Distribute to all machines
sh xsync "/opt/cluster/hadoop-2.7.2" "/opt/cluster";
# Create the symlink
sh doCommand all "ln -s /opt/cluster/hadoop-2.7.2 /opt/cluster/hadoop;";
# Start the ZooKeeper cluster
sh doCommand all "source /etc/profile; /opt/cluster/zookeeper/bin/zkServer.sh start";
# Initialize the HA state in ZooKeeper
sh /opt/cluster/hadoop/bin/hdfs zkfc -formatZK
# Start the JournalNodes on all machines
sh doCommand all "sh /opt/cluster/hadoop/sbin/hadoop-daemon.sh start journalnode";
# Format the NameNode (on HD-2-101)
sh /opt/cluster/hadoop/bin/hdfs namenode -format;
# Start NameNode nn1 (on HD-2-101)
sh /opt/cluster/hadoop/sbin/hadoop-daemon.sh start namenode;
# Sync the metadata to the standby (run on HD-2-102)
sh /opt/cluster/hadoop/bin/hdfs namenode -bootstrapStandby;
# Start NameNode nn2 (run on HD-2-102)
sh /opt/cluster/hadoop/sbin/hadoop-daemon.sh start namenode;
# Restart HDFS so the remaining daemons (DataNodes, ZKFC) come up
sh /opt/cluster/hadoop/sbin/stop-dfs.sh; sh /opt/cluster/hadoop/sbin/start-dfs.sh
# Check the NameNode HA states
sh /opt/cluster/hadoop/bin/hdfs haadmin -getServiceState nn1;
sh /opt/cluster/hadoop/bin/hdfs haadmin -getServiceState nn2;
# Start YARN (on rm1, HD-2-101)
sh /opt/cluster/hadoop/sbin/start-yarn.sh;
# Start the second ResourceManager (run on rm2, HD-2-102)
sh /opt/cluster/hadoop/sbin/yarn-daemon.sh start resourcemanager;
# Check the ResourceManager HA states
sh /opt/cluster/hadoop/bin/yarn rmadmin -getServiceState rm1; sh /opt/cluster/hadoop/bin/yarn rmadmin -getServiceState rm2;
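A quick way to see which daemons ended up on which machine (assuming JAVA_HOME in /etc/profile puts jps on the PATH):
sh doCommand all "source /etc/profile; jps"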
Contents of the word.txt test file (arbitrary words for the wordcount run):
export HADOOP_CLUSTER_NAME myhadoop export HADOOP_TMP_DIR hdata hadoop hdata export HADOOP_TMP_DIR myhadoop export
# Create the input directory
/opt/cluster/hadoop/bin/hadoop fs -mkdir -p /mapreduce/test/input/20180702;
# Upload the input file
/opt/cluster/hadoop/bin/hadoop fs -put ./word.txt /mapreduce/test/input/20180702;
cd /opt/cluster/hadoop; bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.2.jar wordcount /mapreduce/test/input/20180702 /mapreduce/test/output/20180702;
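When the job finishes, the result can be read straight from HDFS; wordcount writes its output to a part-r-00000 file under the output directory:
/opt/cluster/hadoop/bin/hadoop fs -cat /mapreduce/test/output/20180702/part-r-00000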