RabbitMQ does not offer strong partition tolerance, so if you need to span a WAN, federation or the shovel plugin is recommended instead of clustering. Even when a RabbitMQ cluster runs entirely within a LAN, network partitions cannot be completely ruled out: a failing router or switch, or a network interface going down, can all trigger a partition.
So what does a network partition do to a RabbitMQ cluster? When a partition occurs, the nodes in each partition consider the nodes on the other side to be down; operations on exchanges, queues, and bindings only take effect within the local partition; the metadata stored in Mnesia (exchange attributes, queue attributes, and so on) is no longer synchronized across the cluster; and for mirrored queues, each partition ends up with its own master process handling the queue's operations. More importantly, these symptoms do not go away by themselves once the network partition heals!
Starting with version 3.1.0, RabbitMQ ships with built-in handling for network partitions. It is configured in rabbitmq.config:
[
 {rabbit, [{tcp_listeners, [5672]},
           {cluster_partition_handling, ignore}]}
].
RabbitMQ supports three handling strategies: ignore, autoheal, and pause_minority. The default is ignore, which does nothing at all.
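Switching the strategy only requires changing the value of cluster_partition_handling. For example, the same snippet with autoheal enabled (the listener port is simply carried over from the example above) would be:

[
 {rabbit, [{tcp_listeners, [5672]},
           {cluster_partition_handling, autoheal}]}
].

The key likewise accepts pause_minority; in either case the node has to be restarted for the new value to take effect.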
The autoheal strategy: in short, once the network partition heals, the partitions negotiate with each other; the partition with the most client connections wins, and every node in the other partitions is restarted, which brings the cluster back into a synchronized state.
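The "partition with the most client connections wins" rule can be illustrated with a small self-contained sketch. This is not the actual rabbit_autoheal code; it assumes the per-partition connection counts have already been collected, and breaking ties by partition size is an assumption made here for illustration:

%% Sketch: each partition is {Nodes, TotalClientConnections}.
%% The partition with the most connections wins (ties broken by node
%% count); the function returns the first node of the winning partition
%% and a flat list of all loser nodes, which would have to restart.
-module(autoheal_sketch).
-export([decide/1]).

decide(Partitions) ->
    [{[Winner | _], _} | Rest] =
        lists:sort(fun({NodesA, ConnsA}, {NodesB, ConnsB}) ->
                           {ConnsA, length(NodesA)} >= {ConnsB, length(NodesB)}
                   end, Partitions),
    Losers = lists:append([Nodes || {Nodes, _} <- Rest]),
    {Winner, Losers}.

For instance, decide([{[a, b], 3}, {[c, d, e], 10}]) returns {c, [a, b]}: the partition holding c, d, and e has the most client connections, so a and b would be restarted.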
Roughly, the internals work as follows:
(1) When RabbitMQ starts, it creates and registers a process named rabbit_node_monitor. On startup this process subscribes to node up/down notifications and to Mnesia system events.
init([]) ->
    process_flag(trap_exit, true),
    net_kernel:monitor_nodes(true),
    {ok, _} = mnesia:subscribe(system),
    {ok, #state{monitors    = pmon:new(),
                subscribers = pmon:new(),
                partitions  = [],
                autoheal    = rabbit_autoheal:init()}}.

(2) When this process receives an {inconsistent_database, running_partitioned_network, Node} message, it picks one node from the cluster and sends that node's rabbit_node_monitor process an {autoheal_msg, {request_start, node()}} message.
rabbit_node_monitor.erl

handle_info({mnesia_system_event,
             {inconsistent_database, running_partitioned_network, Node}},
            State = #state{partitions = Partitions,
                           monitors   = Monitors,
                           autoheal   = AState}) ->
    State1 = case pmon:is_monitored({rabbit, Node}, Monitors) of
                 true  -> State;
                 false -> State#state{
                            monitors = pmon:monitor({rabbit, Node}, Monitors)}
             end,
    ok = handle_live_rabbit(Node),
    Partitions1 = ordsets:to_list(
                    ordsets:add_element(Node, ordsets:from_list(Partitions))),
    {noreply, State1#state{partitions = Partitions1,
                           autoheal   = rabbit_autoheal:maybe_start(AState)}};

rabbit_autoheal.erl

maybe_start(not_healing) ->
    case enabled() of
        true  -> [Leader | _] = lists:usort(rabbit_mnesia:cluster_nodes(all)),
                 send(Leader, {request_start, node()}),
                 rabbit_log:info("Autoheal request sent to ~p~n", [Leader]),
                 not_healing;
        false -> not_healing
    end;

(3) When the rabbit_node_monitor process receives an {autoheal_msg, {request_start, Node}} message, it works out which partition has the most client connections, tells the first node of that partition that it is the winner (together with the list of nodes that have to restart), and tells the nodes of the losing partitions to restart.
rabbit_node_monitor.erl

handle_info({autoheal_msg, Msg},
            State = #state{autoheal = AState, partitions = Partitions}) ->
    AState1 = rabbit_autoheal:handle_msg(Msg, AState, Partitions),
    {noreply, State#state{autoheal = AState1}};

rabbit_autoheal.erl

handle_msg({request_start, Node}, not_healing, Partitions) ->
    rabbit_log:info("Autoheal request received from ~p~n", [Node]),
    case rabbit_node_monitor:all_rabbit_nodes_up() of
        false -> not_healing;
        true  -> AllPartitions = all_partitions(Partitions),
                 {Winner, Losers} = make_decision(AllPartitions),
                 rabbit_log:info("Autoheal decision~n"
                                 " * Partitions: ~p~n"
                                 " * Winner: ~p~n"
                                 " * Losers: ~p~n",
                                 [AllPartitions, Winner, Losers]),
                 send(Winner, {become_winner, Losers}),
                 [send(L, {winner_is, Winner}) || L <- Losers],
                 not_healing
    end;

(4) When a node learns that it is the winner, it waits for all nodes in the losing partitions to stop the rabbit application (and the applications rabbit depends on); once every losing node has stopped rabbit, it tells them to start the rabbit application again.
rabbit_autoheal.erl

handle_msg({become_winner, Losers}, not_healing, _Partitions) ->
    rabbit_log:info("Autoheal: I am the winner, waiting for ~p to stop~n",
                    [Losers]),
    {winner_waiting, Losers, Losers};

handle_msg({winner_is, Winner}, not_healing, _Partitions) ->
    rabbit_log:warning(
      "Autoheal: we were selected to restart; winner is ~p~n", [Winner]),
    rabbit_node_monitor:run_outside_applications(
      fun () ->
              MRef = erlang:monitor(process, {?SERVER, Winner}),
              rabbit:stop(),
              send(Winner, {node_stopped, node()}),
              receive
                  {'DOWN', MRef, process, {?SERVER, Winner}, _Reason} -> ok;
                  autoheal_safe_to_start                              -> ok
              end,
              erlang:demonitor(MRef, [flush]),
              rabbit:start()
      end),
    restarting;

handle_msg({node_stopped, Node},
           {winner_waiting, [Node], Notify}, _Partitions) ->
    rabbit_log:info("Autoheal: final node has stopped, starting...~n", []),
    [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify],
    not_healing;

handle_msg({node_stopped, Node},
           {winner_waiting, WaitFor, Notify}, _Partitions) ->
    {winner_waiting, WaitFor -- [Node], Notify};

At this point RabbitMQ has finished handling the network partition. Note that this strategy can lose data: in CAP terms it favors availability over consistency (AP).
The pause_minority strategy: when a RabbitMQ node notices that other cluster nodes have gone down, it determines whether it belongs to the majority or the minority, i.e. whether the nodes it can still form a cluster with make up more than half of the whole cluster. If it is in the majority it keeps working normally; if it is in the minority it stops the rabbit application and keeps checking until it is part of a majority again, at which point it restarts the rabbit application. Note that with this strategy the cluster should normally have an odd number of nodes. In CAP terms it favors consistency over availability (CP).
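The majority test itself is straightforward. A minimal sketch, assuming the node already knows which cluster members it can currently reach (this is not the actual rabbit_node_monitor code, which also has to cope with nodes coming and going while it checks):

%% Sketch: a node is in the majority when the reachable nodes
%% (including itself) account for strictly more than half of the
%% configured cluster.
-module(pause_minority_sketch).
-export([in_majority/2]).

in_majority(AliveNodes, AllClusterNodes) ->
    length(AliveNodes) * 2 > length(AllClusterNodes).

With a three-node cluster split 2/1, the two-node side keeps running while the single node pauses; with a four-node cluster split 2/2, neither side has a strict majority and both pause, which is why an odd number of nodes is recommended.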