Step 3: Hadoop HDFS & YARN HA

Cluster plan

OS version   Hostname        IP               Roles
CentOS-7.7   hadoop-test-1   192.168.233.65   NameNode, DataNode, DFSZKFailoverController, Hive, HMaster, ResourceManager, NodeManager
CentOS-7.7   hadoop-test-2   192.168.233.94   NameNode, DataNode, DFSZKFailoverController, HMaster, ResourceManager, NodeManager
CentOS-7.7   hadoop-test-3   192.168.233.17   DataNode, JournalNode, ZooKeeper, NodeManager
CentOS-7.7   hadoop-test-4   192.168.233.238  DataNode, JournalNode, ZooKeeper, NodeManager
CentOS-7.7   hadoop-test-5   192.168.233.157  DataNode, JournalNode, ZooKeeper, NodeManager
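
Every node must be able to resolve the other hostnames. Assuming resolution is done through /etc/hosts rather than DNS, each node would carry entries like the following (a sketch built from the table above; run as root):

$ cat >> /etc/hosts <<'EOF'
192.168.233.65   hadoop-test-1
192.168.233.94   hadoop-test-2
192.168.233.17   hadoop-test-3
192.168.233.238  hadoop-test-4
192.168.233.157  hadoop-test-5
EOF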

Install Hadoop

$ wget https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-2.10.0/hadoop-2.10.0.tar.gz
$ tar -zxvf hadoop-2.10.0.tar.gz -C /data
$ mv /data/hadoop-2.10.0 /data/hadoop    # the tarball unpacks to hadoop-2.10.0; the rest of this guide assumes /data/hadoop
$ for i in {1..5};do ssh hadoop-test-$i "mkdir -p /data/hdfs";done
$ for i in {1..5};do ssh hadoop-test-$i "chown -R hadoop.hadoop /data/hdfs";done
$ su hadoop
$ cd /data/hadoop/etc/hadoop/
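
The sshfence fencing method configured below relies on passwordless SSH for the hadoop user, so it is worth confirming that key-based login works across the whole cluster before going further (a quick check; assumes the hadoop user's key has already been distributed):

$ for i in {1..5}; do ssh hadoop-test-$i hostname; done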

hdfs-site.xml

$ vim hdfs-site.xml 
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <!-- HDFS block size: 64 MB -->
    <property>
        <name>dfs.block.size</name>
        <value>67108864</value>
    </property>
    <property>
      <name>dfs.replication</name>
      <value>3</value>
    </property>
    <!-- Nameservice ID -->
    <property>
        <name>dfs.nameservices</name>
        <value>hadoop-test-cluster</value>
    </property>
    <!-- Local directory where the NameNode stores its metadata -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/data/hdfs/nn</value>
    </property>
    <!-- Local directory where the DataNode stores its blocks -->
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/data/hdfs/dn</value>
    </property> 
    <!-- The two NameNode IDs under the nameservice -->
    <property>
      <name>dfs.ha.namenodes.hadoop-test-cluster</name>
      <value>nn1,nn2</value>
    </property>
    <!-- RPC address of each NameNode -->
    <property>
        <name>dfs.namenode.rpc-address.hadoop-test-cluster.nn1</name>
        <value>192.168.233.65:8020</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.hadoop-test-cluster.nn2</name>
        <value>192.168.233.94:8020</value>
    </property>
    <!-- Enable WebHDFS (web UI) -->
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>

    <property>
        <name>dfs.journalnode.http-address</name>
        <value>0.0.0.0:8480</value>
    </property>
    <property>
        <name>dfs.journalnode.rpc-address</name>
        <value>0.0.0.0:8481</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.hadoop-test-cluster.nn1</name>
        <value>192.168.233.65:50070</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.hadoop-test-cluster.nn2</name>
        <value>192.168.233.94:50070</value>
    </property>
    <!-- Shared edits directory for the NameNodes: the three JournalNode hosts -->
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://192.168.233.17:8481;192.168.233.238:8481;192.168.233.157:8481/hadoop-test-cluster</value>
    </property>
    <!-- Failover proxy provider class (this value is fixed); clients use it to determine which NameNode is active -->
    <property>
        <name>dfs.client.failover.proxy.provider.hadoop-test-cluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    <!-- Enable automatic NameNode failover -->
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
    <!-- Fencing method (a list of scripts or a Java class) used to protect the active NameNode during failover -->
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
    </property>
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/hadoop/.ssh/id_rsa</value>
    </property>
    <!-- Local path where the JournalNodes store edits -->
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/data/hdfs/journal</value>
    </property>
    <!-- ZooKeeper quorum; timeouts and other options can also be set here -->
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>192.168.233.17:2181,192.168.233.238:2181,192.168.233.157:2181</value>
    </property>
    <property>
        <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.block.local-path-access.user</name>
        <value>impala</value>
    </property>
    <property>
        <name>dfs.client.file-block-storage-locations.timeout.millis</name>
        <value>60000</value>
    </property>
</configuration>
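
Once this file has been distributed to all nodes, the HA settings can be sanity-checked from any of them with hdfs getconf (a quick check, not part of the original procedure):

$ hdfs getconf -confKey dfs.nameservices    # should print hadoop-test-cluster
$ hdfs getconf -namenodes                   # should list both NameNode hosts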

core-site.xml

$ vim core-site.xml 
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <!-- Default filesystem: the HDFS nameservice -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://hadoop-test-cluster</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/data/hdfs/tmp</value>
    </property>
    <property>
        <name>hadoop.logfile.size</name>
        <value>10000000</value>
        <description>Maximum size of each log file, in bytes</description>
    </property>
    <property>
        <name>hadoop.logfile.count</name>
        <value>10</value>
        <description>Maximum number of log files to keep</description>
    </property>
    <!-- ZooKeeper quorum; timeouts and other options can also be set here -->
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>192.168.233.17:2181,192.168.233.238:2181,192.168.233.157:2181</value>
    </property>
    <property>
         <name>dfs.client.read.shortcircuit</name>
         <value>true</value>
    </property>
    <property>
         <name>dfs.client.read.shortcircuit.skip.checksum</name>
         <value>false</value>
    </property>
    <property>
         <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
         <value>true</value>
    </property>
</configuration>
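
Note that dfs.client.read.shortcircuit only takes effect when a DataNode domain socket path is also configured and the native hadoop library is available. A hedged sketch of the extra property (usually placed in hdfs-site.xml; the path below is the common convention and the directory must exist on every DataNode):

<property>
    <name>dfs.domain.socket.path</name>
    <value>/var/run/hadoop-hdfs/dn._PORT</value>
</property>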

mapred-site.xml

$ vim mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
   <property>
       <name>mapreduce.framework.name</name>
       <value>yarn</value>
   </property>
    <!-- MapReduce JobHistory server address -->
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>192.168.233.65:10020</value>
    </property>

    <!-- JobHistory server web UI address -->
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>192.168.233.65:19888</value>
    </property>
</configuration>
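
The JobHistory addresses above point at hadoop-test-1, so the history server also has to be started there once the cluster is up (this step is not listed later; a sketch using the stock 2.x script):

$ mr-jobhistory-daemon.sh start historyserver
$ jps | grep JobHistoryServer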

yarn-site.xml

$ vim /data/hadoop/etc/hadoop/yarn-site.xml
<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>
        <property>
            <name>yarn.acl.enable</name>
            <value>true</value>
        </property>
        <property>
            <name>yarn.admin.acl</name>
            <value>*</value>
        </property>

       <!-- Enable log aggregation -->
        <property>
            <name>yarn.log-aggregation-enable</name>
            <value>true</value>
        </property>
       <!-- How long aggregated logs are kept on HDFS, in seconds (3 days) -->
        <property>
            <name>yarn.log-aggregation.retain-seconds</name>
            <value>259200</value>
        </property>
        <property>
            <name>yarn.resourcemanager.cluster-id</name>
            <value>hadoop-test</value>
        </property>
       <!-- Retry interval for reconnecting after losing contact with the RM -->
        <property> 
            <name>yarn.resourcemanager.connect.retry-interval.ms</name> 
            <value>2000</value> 
        </property>
        <!-- To run MapReduce jobs, every NodeManager must load the shuffle service at startup. The shuffle service is a Jetty/Netty server from which reduce tasks remotely copy the intermediate output produced by map tasks on each NodeManager. The two properties below configure it. -->
        <property>  
            <name>yarn.nodemanager.aux-services</name>  
            <value>mapreduce_shuffle</value>  
        </property>
        <!-- Enable ResourceManager HA (disabled by default) -->
        <property>  
           <name>yarn.resourcemanager.ha.enabled</name>  
           <value>true</value>  
        </property>  
        <property>
           <name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
           <value>true</value>
        </property>
        <!-- Logical IDs of the two ResourceManagers -->
        <property>  
           <name>yarn.resourcemanager.ha.rm-ids</name>  
           <value>rm1,rm2</value>  
        </property>
        <!-- HA needs a comma-separated list of ZooKeeper addresses, used for automatic failover -->
        <property>
          <name>ha.zookeeper.quorum</name>
          <value>192.168.233.17:2181,192.168.233.238:2181,192.168.233.157:2181</value> 
        </property>
        <!-- Enable automatic failover -->
         <property> 
            <name>yarn.resourcemanager.ha.automatic-failover.enabled</name> 
            <value>true</value> 
         </property> 
        <!-- Hostnames of the ResourceManagers -->
        <property>  
           <name>yarn.resourcemanager.hostname.rm1</name>  
           <value>192.168.233.65</value>  
        </property>  
        <property>  
           <name>yarn.resourcemanager.hostname.rm2</name>  
           <value>192.168.233.94</value>  
        </property>  
        <!-- Use the ZooKeeper ensemble to store RM state -->
        <property>  
           <name>yarn.resourcemanager.zk-address</name>  
            <value>192.168.233.17:2181,192.168.233.238:2181,192.168.233.157:2181</value>  
        </property>  
        <!-- Enable recovery so that running applications survive an RM failure (default false) -->
        <property>  
           <name>yarn.resourcemanager.recovery.enabled</name>  
           <value>true</value>  
        </property>  
       <!-- host:port list of the ZooKeeper servers used by the RM state store, comma-separated -->
        <property> 
          <name>yarn.resourcemanager.zk-state-store.address</name> 
          <value>192.168.233.17:2181,192.168.233.238:2181,192.168.233.157:2181</value>
        </property>  
        <!-- Store ResourceManager state in ZooKeeper (the default store is FileSystem-based) -->
        <property>  
           <name>yarn.resourcemanager.store.class</name>  
           <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>  
        </property> 
        <!-- How long the ApplicationMaster waits before reconnecting to the scheduler -->
        <property> 
           <name>yarn.app.mapreduce.am.scheduler.connection.wait.interval-ms</name> 
           <value>5000</value> 
        </property> 
        <!-- rm1 addresses -->
         <!-- Address clients use to submit applications to the RM -->
         <property> 
            <name>yarn.resourcemanager.address.rm1</name> 
            <value>192.168.233.65:8132</value> 
         </property> 
         <!-- Address the RM exposes to ApplicationMasters, used to request and release resources -->
         <property> 
            <name>yarn.resourcemanager.scheduler.address.rm1</name> 
            <value>192.168.233.65:8130</value> 
         </property> 
         <!-- RM web UI address, for viewing cluster information -->
         <property> 
            <name>yarn.resourcemanager.webapp.address.rm1</name> 
            <value>192.168.233.65:8188</value> 
         </property> 
         <!-- Address NodeManagers use to exchange information with the RM -->
         <property>
            <name>yarn.resourcemanager.resource-tracker.address.rm1</name> 
            <value>192.168.233.65:8131</value> 
         </property> 
         <!-- Address administrators use to send admin commands to the RM -->
         <property> 
            <name>yarn.resourcemanager.admin.address.rm1</name> 
            <value>192.168.233.65:8033</value> 
         </property> 
         <property> 
            <name>yarn.resourcemanager.ha.admin.address.rm1</name> 
            <value>192.168.233.65:23142</value> 
         </property> 
         
        <!-- rm2 addresses -->
         <property> 
            <name>yarn.resourcemanager.address.rm2</name> 
            <value>192.168.233.94:8132</value> 
         </property> 
         <property> 
            <name>yarn.resourcemanager.scheduler.address.rm2</name> 
            <value>192.168.233.94:8130</value> 
         </property> 
         <property> 
            <name>yarn.resourcemanager.webapp.address.rm2</name> 
            <value>192.168.233.94:8188</value> 
         </property> 
         <property>
            <name>yarn.resourcemanager.resource-tracker.address.rm2</name> 
            <value>192.168.233.94:8131</value> 
         </property> 
         <property> 
            <name>yarn.resourcemanager.admin.address.rm2</name> 
            <value>192.168.233.94:8033</value> 
         </property> 
         <property> 
            <name>yarn.resourcemanager.ha.admin.address.rm2</name> 
            <value>192.168.233.94:23142</value> 
         </property> 

         <!-- Resource settings -->
         <property>
             <name>yarn.scheduler.fair.preemption</name>
             <value>true</value>
             <description>Enable preemption (the default is false)</description>
         </property>
         <!-- Whether to use the user name as the queue name when an application does not specify a queue. If set to false, applications targeting unknown queues are submitted to the default queue. Default: true. -->
         <property>
            <name>yarn.scheduler.fair.user-as-default-queue</name>
            <value>true</value>
            <description>default is True</description>
         </property>
         <!-- Whether undeclared pools may be created. If true, YARN automatically creates any pool named by a job that has not been defined; if false, jobs naming undefined pools are placed in the default pool. Default: true. -->
         <property>
            <name>yarn.scheduler.fair.allow-undeclared-pools</name>
            <value>false</value>
         </property>
         <!-- Minimum physical memory a single container may request, default 1024 MB; smaller requests are rounded up to this value -->
         <property>
            <name>yarn.scheduler.minimum-allocation-mb</name>
            <value>512</value>
         </property>
         <!-- Maximum physical memory a single container may request, default 8192 MB. By default YARN uses thread-based monitoring to decide whether a task exceeds its memory limit and kills it if it does. Cgroup memory limits are inflexible (a task may never exceed the limit, even momentarily), while a Java process can briefly double its memory when forking before dropping back to normal; thread-based monitoring tolerates such spikes, which is why YARN does not provide cgroup-based memory isolation. -->
         <property>
            <name>yarn.scheduler.maximum-allocation-mb</name>
            <value>4096</value>
         </property>
         <property>
            <name>yarn.scheduler.minimum-allocation-vcores</name>
         <value>1</value>
         </property>
         <property>
            <name>yarn.scheduler.maximum-allocation-vcores</name>
            <value>4</value>
         </property>
         <property>
            <name>yarn.scheduler.increment-allocation-vcores</name>
            <value>1</value>
         </property>
         <property>
            <name>yarn.scheduler.increment-allocation-mb</name>
            <value>512</value>
         </property>
         <!-- Maximum number of ApplicationMaster attempts per submitted application -->
         <property>
            <name>yarn.resourcemanager.am.max-attempts</name>
            <value>2</value>
         </property>
         <property>
            <name>yarn.resourcemanager.container.liveness-monitor.interval-ms</name>
            <value>600000</value>
         </property>
         <property>
            <name>yarn.resourcemanager.nm.liveness-monitor.interval-ms</name>
            <value>1000</value>
         </property>
         <property>
            <name>yarn.nm.liveness-monitor.expiry-interval-ms</name>
            <value>600000</value>
         </property>
         <property>
            <name>yarn.resourcemanager.resource-tracker.client.thread-count</name>
            <value>50</value>
         </property>
         <!-- Total physical memory YARN may use on this node, default 8192 MB; lower it if the node has less than 8 GB -->
         <property>
           <name>yarn.nodemanager.resource.memory-mb</name>
           <value>6000</value>
           <description>Memory available to YARN on each node, in MB</description>
         </property>
         <!-- Number of virtual CPU cores YARN may use on this node, default 8; it is recommended to match the number of physical cores, so lower it on smaller nodes -->
         <property>
           <name>yarn.nodemanager.resource.cpu-vcores</name>
           <value>2</value>
         </property>
         <property>
            <name>yarn.nodemanager.pmem-check-enabled</name>
            <value>false</value>
         </property>
         <property>
            <name>yarn.nodemanager.vmem-check-enabled</name>
            <value>false</value>
         </property>
         <property>
            <name>yarn.resourcemanager.scheduler.class</name>
            <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
         </property>
         <!-- Maximum number of completed applications the RM keeps -->
         <property>
           <name>yarn.resourcemanager.max-completed-applications</name>
           <value>10000</value>
         </property>
         <!-- Failover proxy provider class used to locate the active RM by polling -->
         <property> 
            <name>yarn.client.failover-proxy-provider</name> 
            <value>org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider</value>
         </property> 
         
         <property>
            <name>yarn.resourcemanager.ha.automatic-failover.zk-base-path</name>
            <value>/yarn-leader-election</value>
        </property>
</configuration>
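
Once both ResourceManagers are running (see the start-up steps further down), the HA pair can be checked with yarn rmadmin (a quick check):

$ yarn rmadmin -getServiceState rm1    # one of the two should report active
$ yarn rmadmin -getServiceState rm2    # the other standby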

fair-scheduler.xml (resource pool configuration)

$ vim /data/hadoop/etc/hadoop/fair-scheduler.xml
<?xml version="1.0"?>
<allocations>
<userMaxAppsDefault>30</userMaxAppsDefault>
<queue name="root">
<minResources>5120mb,5vcores</minResources>
<maxResources>29000mb,10vcores</maxResources>
<maxRunningApps>100</maxRunningApps>
<weight>1.0</weight>
<schedulingMode>DRF</schedulingMode>
<aclSubmitApps> </aclSubmitApps>
<aclAdministerApps> </aclAdministerApps>
  <queue name="users" type="parent">
    <minResources>10000mb,2vcores</minResources>
    <maxResources>15000mb,6vcores</maxResources>
    <maxRunningApps>50</maxRunningApps>
    <weight>3</weight>
    <schedulingPolicy>fair</schedulingPolicy>
    <aclSubmitApps>hadoop,hdfs</aclSubmitApps>
    <aclAdministerApps>hadoop</aclAdministerApps>
  </queue>
  <queue name="default" type="parent">
    <minResources>1000mb,1vcores</minResources>
    <maxResources>2000mb,2vcores</maxResources>
    <maxRunningApps>50</maxRunningApps>
    <weight>3</weight>
    <schedulingPolicy>fair</schedulingPolicy>
    <aclSubmitApps>hadoop</aclSubmitApps>
    <aclAdministerApps>hadoop</aclAdministerApps>
  </queue>
  <queue name="prod">
    <minResources>1000mb,1vcores</minResources>
    <maxResources>10000mb,4vcores</maxResources>
    <maxRunningApps>50</maxRunningApps>
    <weight>3</weight>
    <schedulingPolicy>fair</schedulingPolicy>
    <aclSubmitApps>hadoop,hdfs</aclSubmitApps>
    <aclAdministerApps>hadoop</aclAdministerApps>
  </queue>

  <queueMaxResourcesDefault>20000mb,16vcores</queueMaxResourcesDefault>
</queue>

  <queuePlacementPolicy>
    <rule name="specified" />
    <rule name="primaryGroup" create="false" />
    <rule name="nestedUserQueue">
        <rule name="secondaryGroupExistingQueue" create="false" />
    </rule>
    <rule name="default"   queue="users"/>

Configure the DataNodes: slaves

$ vim slaves 
hadoop-test-1
hadoop-test-2
hadoop-test-3
hadoop-test-4
hadoop-test-5

Edit hadoop-env.sh

$ vim hadoop-env.sh
export JAVA_HOME=/usr/local/jdk1.8.0_231
export HADOOP_LOG_DIR=/data/hdfs/logs
export HADOOP_SECURE_DN_LOG_DIR=/data/hdfs/logs
export HADOOP_PRIVILEGED_NFS_LOG_DIR=/data/hdfs/logs
export HADOOP_MAPRED_LOG_DIR=/data/hdfs/logs
export YARN_LOG_DIR=/data/hdfs/logs

Copy /data/hadoop to /data/ on every node (a sketch follows).
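
A minimal sketch for the copy, run as the hadoop user on hadoop-test-1; rsync is assumed to be available (scp -r works just as well):

$ for i in {2..5}; do rsync -a /data/hadoop/ hadoop-test-$i:/data/hadoop/; done
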
Configure YARN on hadoop-test-2

$ vim yarn-site.xml 
        <property> 
           <name>yarn.resourcemanager.ha.id</name> 
           <value>rm2</value> 
        <description>Each ResourceManager node must be told its own ID: rm2 on hadoop-test-2, rm1 on hadoop-test-1.</description> 
        </property>

Configure .bashrc on every node

$ vim ~/.bashrc
#hadoop
export HADOOP_HOME=/data/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
$ source ~/.bashrc

Initialization (format the ZKFC state in ZooKeeper)

$ hdfs zkfc -formatZK
20/06/18 11:29:37 INFO tools.DFSZKFailoverController: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting DFSZKFailoverController
...
...
...
20/06/18 11:29:38 INFO ha.ActiveStandbyElector: Session connected.
20/06/18 11:29:38 INFO ha.ActiveStandbyElector: Successfully created /hadoop-ha/hadoop-test-cluster in ZK.
20/06/18 11:29:38 INFO zookeeper.ZooKeeper: Session: 0x300002f96aa0000 closed
20/06/18 11:29:38 INFO zookeeper.ClientCnxn: EventThread shut down for session: 0x300002f96aa0000
20/06/18 11:29:38 INFO tools.DFSZKFailoverController: SHUTDOWN_MSG: 
/************************************************************
SHUTDOWN_MSG: Shutting down DFSZKFailoverController at hadoop-test-1/192.168.233.65
************************************************************

Start the JournalNodes (on hadoop-test-3/4/5)

$ hadoop-daemon.sh start journalnode
starting journalnode, logging to /data/hdfs/logs/hadoop-hadoop-journalnode-hadoop-test-3.out
$ jps
9140 Jps
9078 JournalNode
4830 QuorumPeerMain
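
The same command has to be run on all three JournalNode hosts. From hadoop-test-1 this can be scripted roughly as follows (a sketch that uses the absolute script path so no remote environment is assumed):

$ for i in {3..5}; do ssh hadoop-test-$i "/data/hadoop/sbin/hadoop-daemon.sh start journalnode"; done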

Format HDFS on the primary NameNode (hadoop-test-1)

$ hadoop namenode -format
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

20/06/18 11:32:42 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
...
...
...
20/06/18 11:32:44 INFO common.Storage: Storage directory /data/hdfs/nn has been successfully formatted.
20/06/18 11:32:44 INFO namenode.FSImageFormatProtobuf: Saving image file /data/hdfs/nn/current/fsimage.ckpt_0000000000000000000 using no compression
20/06/18 11:32:44 INFO namenode.FSImageFormatProtobuf: Image file /data/hdfs/nn/current/fsimage.ckpt_0000000000000000000 of size 325 bytes saved in 0 seconds .
20/06/18 11:32:44 INFO namenode.NNStorageRetentionManager: Going to retain 1 images with txid >= 0
20/06/18 11:32:44 INFO namenode.FSImage: FSImageSaver clean checkpoint: txid = 0 when meet shutdown.
20/06/18 11:32:44 INFO namenode.NameNode: SHUTDOWN_MSG: 
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at hadoop-test-1/192.168.233.65
************************************************************/

Check ZooKeeper

$ zkCli.sh -server hadoop-test-4:2181
...
...
...
2020-06-18 11:34:25,075 [myid:hadoop-test-4:2181] - INFO  [main-SendThread(hadoop-test-4:2181):ClientCnxn$SendThread@959] - Socket connection established, initiating session, client: /192.168.233.157:52598, server: hadoop-test-4/192.168.233.238:2181
2020-06-18 11:34:25,094 [myid:hadoop-test-4:2181] - INFO  [main-SendThread(hadoop-test-4:2181):ClientCnxn$SendThread@1394] - Session establishment complete on server hadoop-test-4/192.168.233.238:2181, sessionid = 0x200002f8ee50001, negotiated timeout = 30000

WATCHER::

WatchedEvent state:SyncConnected type:None path:null
[zk: hadoop-test-4:2181(CONNECTED) 0] ls /
[hadoop-ha, zookeeper]
[zk: hadoop-test-4:2181(CONNECTED) 1] ls /hadoop-ha

Start the primary NameNode (on hadoop-test-1)

$ hadoop-daemon.sh start namenode
starting namenode, logging to /data/hdfs/logs/hadoop-hadoop-namenode-hadoop-test-1.out
$ jps
10864 NameNode
10951 Jps

On the standby NameNode, copy the primary NameNode's metadata and start it

$ hdfs namenode -bootstrapStandby

20/06/18 11:37:08 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
...
...
...
20/06/18 11:37:09 INFO namenode.FSEditLog: Edit logging is async:true
20/06/18 11:37:09 INFO namenode.TransferFsImage: Opening connection to http://192.168.233.65:50070/imagetransfer?getimage=1&txid=0&storageInfo=-63:2055238485:1592451163995:CID-e2c9292c-6fca-46eb-aef3-c96149a72ade&bootstrapstandby=true
20/06/18 11:37:09 INFO common.Util: Combined time for fsimage download and fsync to all disks took 0.00s. The fsimage download took 0.00s at 0.00 KB/s. Synchronous (fsync) write to disk of /data/hdfs/nn/current/fsimage.ckpt_0000000000000000000 took 0.00s.
20/06/18 11:37:09 INFO namenode.TransferFsImage: Downloaded file fsimage.ckpt_0000000000000000000 size 325 bytes.
20/06/18 11:37:09 INFO namenode.NameNode: SHUTDOWN_MSG: 
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at hadoop-test-2/192.168.233.94
************************************************************/
$ hadoop-daemon.sh start namenode
starting namenode, logging to /data/hdfs/logs/hadoop-hadoop-namenode-hadoop-test-2.out

Run on both the primary and standby NameNodes

$ hadoop-daemon.sh start zkfc
starting zkfc, logging to /data/hdfs/logs/hadoop-hadoop-zkfc-hadoop-test-1.out
$ hadoop-daemon.sh start zkfc
starting zkfc, logging to /data/hdfs/logs/hadoop-hadoop-zkfc-hadoop-test-2.out
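
With both ZKFCs running, one of the NameNodes should have been elected active; this can be verified with hdfs haadmin (a quick check):

$ hdfs haadmin -getServiceState nn1    # one of the two should report active
$ hdfs haadmin -getServiceState nn2    # the other standby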

Start the DataNodes (run on the primary NameNode)

$ hadoop-daemons.sh start datanode
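
A quick way to confirm that all five DataNodes have registered (not part of the original steps):

$ hdfs dfsadmin -report | grep -E 'Live datanodes|Name:'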

Start YARN on the primary ResourceManager (hadoop-test-1)

$ start-yarn.sh 
starting yarn daemons
starting resourcemanager, logging to /data/hdfs/logs/yarn-hadoop-resourcemanager-hadoop-test-1.out
hadoop-test-1: starting nodemanager, logging to /data/hdfs/logs/yarn-hadoop-nodemanager-hadoop-test-1.out
hadoop-test-4: starting nodemanager, logging to /data/hdfs/logs/yarn-hadoop-nodemanager-hadoop-test-4.out
hadoop-test-2: starting nodemanager, logging to /data/hdfs/logs/yarn-hadoop-nodemanager-hadoop-test-2.out
hadoop-test-3: starting nodemanager, logging to /data/hdfs/logs/yarn-hadoop-nodemanager-hadoop-test-3.out
hadoop-test-5: starting nodemanager, logging to /data/hdfs/logs/yarn-hadoop-nodemanager-hadoop-test-5.out

Start the standby ResourceManager (on hadoop-test-2)

$ start-yarn.sh
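
start-yarn.sh on hadoop-test-2 will also try to start the NodeManagers again (they are already running, so those attempts are harmless). An alternative is to start only the local ResourceManager and then check the HA state (a sketch):

$ yarn-daemon.sh start resourcemanager
$ yarn rmadmin -getServiceState rm1
$ yarn rmadmin -getServiceState rm2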

Check the processes on all nodes

$  for i in {1..5};do ssh hadoop-test-$i "jps" && echo ---;done
10864 NameNode
12753 NodeManager
17058 Jps
12628 ResourceManager
11381 DataNode
11146 DFSZKFailoverController
---
9201 NameNode
14997 ResourceManager
15701 Jps
9431 DFSZKFailoverController
9623 DataNode
10701 NodeManager
---
14353 Jps
9078 JournalNode
9639 DataNode
4830 QuorumPeerMain
10574 NodeManager
---
9616 DataNode
10547 NodeManager
14343 Jps
4808 QuorumPeerMain
9115 JournalNode
---
9826 DataNode
10758 NodeManager
4807 QuorumPeerMain
9255 JournalNode
14538 Jps

Open the web UI of the HDFS NameNodes (http://192.168.233.65:50070 and http://192.168.233.94:50070):
[screenshots omitted]
Open the YARN ResourceManager web UI (http://192.168.233.65:8188):
[screenshot omitted]

WordCount demo

$ cd /data/hadoop/
$ ll LICENSE.txt
-rw-r--r-- 1 hadoop hadoop 106210 6月  18 09:26 LICENSE.txt
$ hadoop fs -mkdir /input
$ hadoop fs -put LICENSE.txt /input 
$ hadoop jar share/hadoop/
common/    hdfs/      httpfs/    kms/       mapreduce/ tools/     yarn/
$ hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.10.0.jar wordcount /input /output
20/06/19 13:55:59 INFO client.ConfiguredRMFailoverProxyProvider: Failing over to rm2
20/06/19 13:56:00 INFO input.FileInputFormat: Total input files to process : 1
20/06/19 13:56:00 INFO mapreduce.JobSubmitter: number of splits:1
20/06/19 13:56:00 INFO Configuration.deprecation: yarn.resourcemanager.zk-address is deprecated. Instead, use hadoop.zk.address
20/06/19 13:56:00 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
20/06/19 13:56:00 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1592470438667_0007
20/06/19 13:56:00 INFO conf.Configuration: resource-types.xml not found
20/06/19 13:56:00 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
20/06/19 13:56:00 INFO resource.ResourceUtils: Adding resource type - name = memory-mb, units = Mi, type = COUNTABLE
20/06/19 13:56:00 INFO resource.ResourceUtils: Adding resource type - name = vcores, units = , type = COUNTABLE
20/06/19 13:56:00 INFO impl.YarnClientImpl: Submitted application application_1592470438667_0007
20/06/19 13:56:01 INFO mapreduce.Job: The url to track the job: http://hadoop-test-2:8188/proxy/application_1592470438667_0007/
20/06/19 13:56:01 INFO mapreduce.Job: Running job: job_1592470438667_0007
20/06/19 13:56:08 INFO mapreduce.Job: Job job_1592470438667_0007 running in uber mode : false
20/06/19 13:56:08 INFO mapreduce.Job:  map 0% reduce 0%
20/06/19 13:56:12 INFO mapreduce.Job:  map 100% reduce 0%
20/06/19 13:56:17 INFO mapreduce.Job:  map 100% reduce 100%
20/06/19 13:56:18 INFO mapreduce.Job: Job job_1592470438667_0007 completed successfully
20/06/19 13:56:18 INFO mapreduce.Job: Counters: 49
    File System Counters
        FILE: Number of bytes read=36735
        FILE: Number of bytes written=496235
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=106319
        HDFS: Number of bytes written=27714
        HDFS: Number of read operations=6
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters 
        Launched map tasks=1
        Launched reduce tasks=1
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=2442
        Total time spent by all reduces in occupied slots (ms)=2678
        Total time spent by all map tasks (ms)=2442
        Total time spent by all reduce tasks (ms)=2678
        Total vcore-milliseconds taken by all map tasks=2442
        Total vcore-milliseconds taken by all reduce tasks=2678
        Total megabyte-milliseconds taken by all map tasks=2500608
        Total megabyte-milliseconds taken by all reduce tasks=2742272
    Map-Reduce Framework
        Map input records=1975
        Map output records=15433
        Map output bytes=166257
        Map output materialized bytes=36735
        Input split bytes=109
        Combine input records=15433
        Combine output records=2332
        Reduce input groups=2332
        Reduce shuffle bytes=36735
        Reduce input records=2332
        Reduce output records=2332
        Spilled Records=4664
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=140
        CPU time spent (ms)=1740
        Physical memory (bytes) snapshot=510885888
        Virtual memory (bytes) snapshot=4263968768
        Total committed heap usage (bytes)=330301440
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters 
        Bytes Read=106210
    File Output Format Counters 
        Bytes Written=27714


$ hdfs dfs -ls /input
Found 1 items
-rw-r--r--   3 hadoop supergroup     106210 2020-06-19 13:55 /input/LICENSE.txt
$ hdfs dfs -ls /output
Found 2 items
-rw-r--r--   3 hadoop supergroup          0 2020-06-19 13:56 /output/_SUCCESS
-rw-r--r--   3 hadoop supergroup      27714 2020-06-19 13:56 /output/part-r-00000
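
The word counts themselves can be read straight from HDFS, for example:

$ hdfs dfs -cat /output/part-r-00000 | head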