x_service写文件是单点存在瓶颈, 使用进程池来消除瓶颈.
分配进程时使用的随机函数(Enum.random), 会调用Enumerable.count来得到List长度.
使用的Elixir发布工具exrm有bug, 没把调用固化(consolidated). 致使Enumerable.count每次调用时, 都须要call code_server来确认List模块是否存在, 在访问发生抖动时, 若分配主进程忽然积累了大量消息, call code_server 会变得更慢, 致使处理速度变慢, 消息越积越多, 性能愈来愈差, 并致使内存暴涨.html
Enum.map(:erlang.processes(), fn proc -> {:erlang.process\_info(proc, :memory), proc} end) |> Enum.sort(fn({{\_, a}, \_}, {{\_, b}, \_}) -> a > b end) |> List.first
对该进程process_info, 发现该进程有极高的reductions, message_queue_len, memorymongodb
iex(xxxx@xxxx)5> :erlang.process_info(pid) [registered_name: xxx, current_function: {:code_server, :call, 1}, initial_call: {:proc_lib, :init_p, 5}, status: :running, message_queue_len: 1426935, messages: [notify: {:log, :"xxx.xxxxx.xxxxxxx", <<107, 0, 0, 0, 3, 100, 97, 116, 97, 0, 82, 0, 0, 0, 2, 112, 108, 97, 121, 101, 114, 95, 115, 101, 115, 115, 105, 111, 110, 0, 11, 0, 0, 0, 52, 74, 85, 95, 89, 95, 55, ...>>}, ............ ...], links: [#PID<0.1749.0>], dictionary: [rand_seed: {%{bits: 58, jump: #Function<8.15449617/1 in :rand.mk_alg/1>, next: #Function<5.15449617/1 in :rand.mk_alg/1>, type: :exrop, uniform: #Function<6.15449617/1 in :rand.mk_alg/1>, uniform_n: #Function<7.15449617/2 in :rand.mk_alg/1>, weak_low_bits: 1}, [92689310913950825 | 4906624003405292]}, "$initial_call": {GenEvent, :init_it, 6}, "$ancestors": [XXX.Route.Supervisor, XXX.Supervisor, #PID<0.1747.0>]], trap_exit: false, error_handler: :error_handler, priority: :normal, group_leader: #PID<0.1746.0>, total_heap_size: 28600107, heap_size: 18481566, stack_size: 45, reductions: 7024105801, garbage_collection: [max_heap_size: %{error_logger: true, kill: true, size: 0}, min_bin_vheap_size: 46422, min_heap_size: 233, fullsweep_after: 65535, minor_gcs: 2], suspending: []]
该进程是个GenEvent进程有三个handle_event({:log, name, bin}, state)函数, 只有写文件的handle_event有动过. 怀疑Enum.random的性能太差. benchmark发现平均耗时在0.7μs. 不会成为瓶颈.app
压测环境没法重现, 只好将代码回滚.
回滚后, 线上再也不出现内存暴涨.dom
线上elixir版本:async
Erlang/OTP 20 [erts-9.0] [source] [64-bit] [smp:16:16] [ds:16:16:10] [async-threads:10] [kernel-poll:false] Interactive Elixir (1.5.1) - press Ctrl+C to exit (type h() ENTER for help)
压测环境elixir版本:ide
Erlang/OTP 20 [erts-9.3] [source] [64-bit] [smp:16:16] [ds:16:16:10] [async-threads:10] [hipe] [kernel-poll:false] Interactive Elixir (1.6.4) - press Ctrl+C to exit (type h() ENTER for help)
怀疑是elixir版本不同致使.
将压测环境的elixir版本调整至和线上一致后, 重现过内存暴涨问题. 但没法稳定重现.
发现 ctrl + c 后再 continue, 制造抖动能够稳定重现内存暴涨问题.
去掉def handle_event({:log, _name, _bin}中的Enum.random后(用取余轮转代替random), 问题再也不出现.函数
由于去掉Enum.random能够解决问题, 且只有特定elixir版本可重现问题, 因而比较elixir Enum.random在不一样版本下的实现.工具
Enum.random最终会调用, random_integer(0, count_of_enumerable)
enum.ex:2763性能
defp random_integer(lower_limit, upper_limit) do lower_limit + :rand.uniform(upper_limit - lower_limit + 1) - 1 end
rand.erl:281测试
uniform/1 -spec uniform(N :: pos_integer()) -> X :: pos_integer(). uniform(N) -> {X, Seed} = uniform_s(N, seed_get()), _ = seed_put(Seed), X.
rand.erl:403
seed_get() -> case get(?SEED_DICT) of undefined -> seed(?DEFAULT_ALG_HANDLER); Old -> Old % no type checking here end.
存储/得到seed用的进程字典, 效率不用担忧.
由于elixir1.6.0版本就没法重现问题(v1.5.1->v1.5.3均可重现). 对比了elixir v1.5.1/v1.6.3, 未发现可疑之处.
经过:observer.start能够看到MainChannel的调用栈, 并trace进程的消息, 每一次Enum.random调用, 都会有一次code_server:call. 以前线上的调用栈, 也是call code_server, 这必定不是巧合.
https://elixir-lang.org/getti...
When working with Elixir projects, using the Mix build tool, you may see the output as follows:
Consolidated String.Chars Consolidated Collectable Consolidated List.Chars Consolidated IEx.Info Consolidated Enumerable Consolidated Inspect
Those are all protocols that ship with Elixir and they are being consolidated. Because a protocol can dispatch to any data type, the protocol must check on every call if an implementation for the given type exists. This may be expensive.
However, after our project is compiled using a tool like Mix, we know all modules that have been defined, including protocols and their implementations. This way, the protocol can be consolidated into a very simple and fast dispatch module.
From Elixir v1.2, protocol consolidation happens automatically for all projects. We will build our own project in the Mix and OTP guide.
反编译v1.5.1的Enumerable, 发现确实会调用ensure_compiled.
dc = fn x -> p = :code.which(x) {:ok,{_,[{:abstract_code,{_, ac}}]}} = :beam_lib.chunks(p, [:abstract_code]) IO.write(:erl_prettypr.format(:erl_syntax.form_list(ac))) end
dc.(Enumerable) count(_@1) -> ('impl_for!'(_@1)):count(_@1). impl_for(_@1) when erlang:is_list(_@1) -> case 'impl_for?'('Elixir.Enumerable.List') of true -> 'Elixir.Enumerable.List':'__impl__'(target); false -> any_impl_for() end; 'impl_for?'(_@1) -> case 'Elixir.Code':'ensure_compiled?'(_@1) of true -> 'Elixir.Kernel':'function_exported?'(_@1, '__impl__', 1); false -> false; _@2 -> erlang:error({badbool, 'and', _@2}) end.
code.ex:575
@spec ensure_compiled(module) :: {:module, module} | {:error, :embedded | :badfile | :nofile | :on_load_failure} def ensure_compiled(module) when is_atom(module) do case :code.ensure_loaded(module) do {:error, :nofile} = error -> if is_pid(:erlang.get(:elixir_compiler_pid)) and Kernel.ErrorHandler.ensure_compiled(module, :module) do {:module, module} else error end other -> other end end
code.erl:149
-spec ensure_loaded(Module) -> {module, Module} | {error, What} when Module :: module(), What :: embedded | badfile | nofile | on_load_failure. ensure_loaded(Mod) when is_atom(Mod) -> call({ensure_loaded,Mod}).
code.erl:627
call(Req) -> code_server:call(Req).
code_server.erl:136
-spec call(term()) -> term(). call(Req) -> Ref = erlang:monitor(process, ?MODULE), ?MODULE ! {code_call, self(), Req}, receive {?MODULE, Reply} -> erlang:demonitor(Ref,[flush]), Reply; {'DOWN',Ref,process,_,_} -> exit({'DOWN',code_server,Req}) end.
~/ejoy/fuck/deps/exrm/lib/exrm/plugins » diff -u consolidation.ex ~/consolidation.ex --- consolidation.ex 2018-07-26 20:00:38.000000000 +0800 +++ /home/enjolras/consolidation.ex 2018-07-26 19:59:53.324240446 +0800 @@ -27,11 +27,13 @@ debug "Packaging consolidated protocols..." # Add overlay to relx.config which copies consolidated dir to release - consolidated_path = Path.join([Mix.Project.build_path, "consolidated"]) + consolidated_path = Path.join([Mix.Project.build_path, "lib", config.name, "consolidated"]) case File.ls(consolidated_path) do {:error, _} -> + IO.puts("consolidated notfound #{inspect consolidated_path}") config {:ok, filenames} -> + IO.puts("consolidated #{inspect filenames}") dest_path = "lib/#{config.name}-#{config.version}/consolidated" overlays = [overlay: Enum.map(filenames, fn name -> {:copy, '#{consolidated_path}/#{name}', '#{Path.join([dest_path, name])}'}
fix后, 从新打包. 如同上面同样反编译.
list的impl_for再也不调用'impl_for?', 也没法再重现消息堆积问题.
impl_for(_@1) when erlang:is_list(_@1) -> case 'impl_for?'('Elixir.Enumerable.List') of true -> 'Elixir.Enumerable.List':'__impl__'(target); false -> any_impl_for() end;
而是直接返回.
impl_for(x) when erlang:is_list(x) -> 'Elixir.Enumerable.List';
enum.ex:1708
def random(enumerable) do case Enumerable.count(enumerable) do {:ok, 0} -> raise Enum.EmptyError {:ok, count} -> at(enumerable, random_integer(0, count - 1)) {:error, _} -> case take_random(enumerable, 1) do [] -> raise Enum.EmptyError [elem] -> elem end end end
enum.ex:1790
def random(enumerable) do result = case backwards_compatible_slice(enumerable) do {:ok, 0, _} -> [] {:ok, count, fun} when is_function(fun) -> fun.(random_integer(0, count - 1), 1) {:error, _} -> take_random(enumerable, 1) end case result do [] -> raise Enum.EmptyError [elem] -> elem end end # TODO: Remove me on Elixir v1.8 defp backwards_compatible_slice(args) do try do Enumerable.slice(args) catch :error, :undef -> case System.stacktrace() do [{module, :slice, [^args], _} | _] -> {:error, module} stack -> :erlang.raise(:error, :undef, stack) end end end
一样反编译, Enumerable.slice 同样会调用impl_for, 也会call code_server, 没法解释为何v1.6.3没法重现.
slice(_@1) -> ('impl_for!'(_@1)):slice(_@1).