diff --git a/.gitignore b/.gitignore index 8d0dc9b95..d1039b5c4 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ rebar3.crashdump core_vnode_eqc.log .idea *.iml +**/*.coverdata diff --git a/.travis.yml b/.travis.yml index 1eaa4fa65..fec478481 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,20 @@ language: erlang otp_release: - - 21.2 + - 21.3 + - 22.3 + - 23.0 install: - make - ./rebar3 update script: + - make format - make test + - make proper + - make coverage + - ./rebar3 as test coveralls send + - make lint + - make xref - make dialyzer + - make docs sudo: required dist: trusty diff --git a/Makefile b/Makefile index 90f2fb34c..3d9f05491 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,16 @@ PULSE_TESTS = worker_pool_pulse +COVERPATH = ./_build/test/cover +REBAR ?= ./rebar3 -.PHONY: deps test +.PHONY: deps test docs xref dialyzer format all: compile compile: deps - ./rebar3 compile + ${REBAR} compile clean: clean-test - ./rebar3 clean - + ${REBAR} clean distclean: clean @@ -28,7 +29,33 @@ clean-test: # You should 'clean' before your first run of this target # so that deps get built with PULSE where needed. pulse: - ./rebar3 compile -D PULSE - ./rebar3 eunit -D PULSE skip_deps=true suite=$(PULSE_TESTS) + ${REBAR} compile -D PULSE + ${REBAR} eunit -D PULSE skip_deps=true suite=$(PULSE_TESTS) + +proper: + ${REBAR} as proper do eunit + +epc: + ${REBAR} as epc eunit + +format: + ${REBAR} format + +test: compile + ${REBAR} eunit + +coverage: compile + cp _build/proper+test/cover/eunit.coverdata ${COVERPATH}/proper.coverdata ;\ + ${REBAR} cover --verbose + +docs: + ${REBAR} edoc + +xref: compile + ${REBAR} xref + +dialyzer: + ${REBAR} dialyzer -include tools.mk +lint: + ${REBAR} lint diff --git a/README.md b/README.md index 2b8ed89b8..dabdab390 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,32 @@ -# What is riak_core_antidote? +# What is riak_core_lite? -The 'antidote' version of riak core is a minimal and up-to-date version of riak core. 
+Riak Core Lite is a framework that simplifies the development of dynamo-style architectures, such as highly-available key-value stores and messaging systems. -# Riak Core +Build upon the essence of Riak KV's core with an up-to-date, modular and extensible foundation for elastic distributed services. + +# riak_core_lite ![Language](https://img.shields.io/badge/language-erlang-blue.svg) -![Release](https://img.shields.io/badge/release-R21-9cf.svg) -![Build](https://img.shields.io/badge/build-rebar3%203.9.0-brightgreen.svg) +![Release](https://img.shields.io/badge/release-R21+-9cf.svg) +![Release](https://img.shields.io/badge/formatter-erlang_otp-33d.svg) +![Build](https://img.shields.io/badge/build-rebar3%203.13.0-brightgreen.svg) -[![Build Status](https://travis-ci.com/albsch/riak_core.svg?branch=master)](https://travis-ci.com/albsch/riak_core.svg?branch=master) +[![Hex pm](https://img.shields.io/hexpm/v/riak_core_lite.svg)](https://hex.pm/packages/riak_core_lite) +[![Build Status](https://api.travis-ci.org/riak-core-lite/riak_core_lite.svg?branch=master)](https://api.travis-ci.org/riak-core-lite/riak_core_lite.svg?branch=master) +[![Coverage Status](https://coveralls.io/repos/github/riak-core-lite/riak_core_lite/badge.svg?branch=master)](https://coveralls.io/github/riak-core-lite/riak_core_lite?branch=master) -Riak Core is the distributed systems framework that forms the basis of -how [Riak](http://github.com/basho/riak) distributes data and scales. -More generally, it can be thought of as a toolkit for building -distributed, scalable, fault-tolerant applications. +To get started with riak_core_lite you can follow Mariano Guerra's tutorials. +They are based on the full riak_core, but are still applicable to riak_core_lite. -For some introductory reading on Riak Core (that’s not pure code), -there’s an old but still valuable -[blog post on the Basho Blog](http://basho.com/where-to-start-with-riak-core/) -that’s well worth your time. +1. 
[Setup](http://marianoguerra.org/posts/riak-core-tutorial-part-1-setup.html) +2. [Starting](http://marianoguerra.org/posts/riak-core-tutorial-part-2-starting.html) +3. [Ping Command](http://marianoguerra.org/posts/riak-core-tutorial-part-3-ping-command.html) +4. [First Commands](http://marianoguerra.org/posts/riak-core-tutorial-part-4-first-commands.html) +5. [Quorum Requests](http://marianoguerra.org/posts/riak-core-tutorial-part-5-quorum-requests.html) +6. [Handoff](http://marianoguerra.org/posts/riak-core-tutorial-part-6-handoff.html) +7. [HTTP API](http://marianoguerra.org/posts/riak-core-tutorial-part-8-http-api.html) +9. [Persistent KV with leveled backend](http://marianoguerra.org/posts/riak-core-tutorial-part-9-persistent-kv-with-leveled-backend.html) ## Contributing @@ -35,7 +42,7 @@ other code. To get started: message, spanning multiple lines if detailed explanation is needed. 5. Push to your fork of the repository and then send a pull request. -6. A Riak committer will review your patch and merge it into the main +6. A committer will review your patch and merge it into the main repository or send you feedback. ## Issues, Questions, and Bugs @@ -43,14 +50,14 @@ other code. To get started: There are numerous ways to file issues or start conversations around something Core related -* The - [Riak Users List](http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com) - is the main place for all discussion around Riak. -* There is a - [Riak Core-specific mailing list](http://lists.basho.com/mailman/listinfo/riak-core_lists.basho.com) - for issues and questions that pertain to Core but not Riak. -* #riak on Freenode is a very active channel and can get you some - real-time help if a lot of instances. 
-* If you've found a bug in Riak Core, - [file](https://github.com/basho/riak_core/issues) a clear, concise, +* If you've found a bug in riak_core_lite, + [file](https://github.com/riak-core-lite/riak_core_lite/issues) a clear, concise, explanatory issue against this repo. + +## Reference Implementation + +For some reference on how `riak_core_lite` can be used, you can read about projects which are using `riak_core_lite` as a library: + +- [rcl_memkv](https://github.com/albsch/rcl_memkv): A minimalistic in-memory key-value store to understand how to implement the handoff behavior properly +- [rclref](https://github.com/wattlebirdaz/rclref): A reference implementation of a distributed key-value store using riak_core_lite featuring quorum reads and writes. +- [AntidoteDB](https://github.com/AntidoteDB/antidote): A highly available geo-replicated key-value database which uses riak_core_lite for sharding of data centers. diff --git a/elvis.config b/elvis.config new file mode 100644 index 000000000..7aae67e46 --- /dev/null +++ b/elvis.config @@ -0,0 +1,51 @@ +%linting and style rules +[{elvis, + [{config, + [#{dirs => ["apps/*/src", "src"], + filter => "*.erl", + rules => [{elvis_style, line_length, + #{ignore => [], + limit => 120, + skip_comments => false}}, + %{elvis_style, no_tabs}, + {elvis_style, no_trailing_whitespace}, + {elvis_style, macro_names, #{ignore => []}}, + {elvis_style, macro_module_names}, + {elvis_style, operator_spaces, #{rules => [{right, ","}, + {right, "++"}, + {left, "++"}, + {right, "--"}, + {left, "--"}]}}, + %{elvis_style, god_modules, + %#{limit => 40, + % ignore => []}}, + {elvis_style, used_ignored_variable}, + {elvis_style, no_behavior_info}, + { + elvis_style, + module_naming_convention, + #{regex => "^[a-z]([a-z0-9]*_?)*(_SUITE)?$", + ignore => []} + }, + { + elvis_style, + function_naming_convention, + #{regex => "^[a-z]([a-z0-9]*_?)*$"} %base: ^([a-z][a-z0-9]*_?)*$ + }, + {elvis_style, state_record_and_type}, + {elvis_style, 
no_spec_with_records} + ] + }, + #{dirs => ["."], + filter => "Makefile", + rules => [{elvis_project, no_deps_master_erlang_mk, #{ignore => []}}, + {elvis_project, protocol_for_deps_erlang_mk, #{ignore => []}}] + }, + #{dirs => ["."], + filter => "rebar.config", + rules => [{elvis_project, no_deps_master_rebar, #{ignore => []}}, + {elvis_project, protocol_for_deps_rebar, #{ignore => []}}] + } + ] + }] +}]. diff --git a/eqc/bucket_eqc_utils.erl b/eqc/bucket_eqc_utils.erl deleted file mode 100644 index d5959fe60..000000000 --- a/eqc/bucket_eqc_utils.erl +++ /dev/null @@ -1,47 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% Copyright (c) 2007-2016 Basho Technologies, Inc. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- - --module(bucket_eqc_utils). - -%% API --export([per_test_setup/2]). 
- - -per_test_setup(DefaultBucketProps, TestFun) -> - try - riak_core_test_util:stop_pid(whereis(riak_core_ring_events)), - riak_core_test_util:stop_pid(whereis(riak_core_ring_manager)), - application:set_env(riak_core, claimant_tick, 4294967295), - application:set_env(riak_core, cluster_name, "eqc_test"), - application:set_env(riak_core, default_bucket_props, DefaultBucketProps), - {ok, RingEvents} = riak_core_ring_events:start_link(), - {ok, RingMgr} = riak_core_ring_manager:start_link(test), - {ok, Claimant} = riak_core_claimant:start_link(), - - Results = TestFun(), - - riak_core_test_util:stop_pid(Claimant), - unlink(RingMgr), - riak_core_ring_manager:stop(), - riak_core_test_util:stop_pid(RingEvents), - Results - after - meck:unload() - end. diff --git a/eqc/mock_vnode.erl b/eqc/mock_vnode.erl deleted file mode 100644 index 44dbe6a87..000000000 --- a/eqc/mock_vnode.erl +++ /dev/null @@ -1,196 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% mock_vnode: mock vnode for unit testing -%% -%% Copyright (c) 2007-2010 Basho Technologies, Inc. All Rights Reserved. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- - -%% @doc mock vnode for unit testing - --module(mock_vnode). -%TODO: Work out why this gives a warning -%-behavior(riak_core_vnode). 
--export([start_vnode/1, - get_index/1, - get_counter/1, - neverreply/1, - returnreply/1, - latereply/1, - asyncnoreply/2, - asyncreply/2, - asynccrash/2, - crash/1, - spawn_error/2, - sync_error/2, - get_crash_reason/1, - stop/1]). --export([init/1, - handle_command/3, - terminate/2, - handle_exit/3]). --export([init_worker/3, - handle_work/3]). --behavior(riak_core_vnode_worker). - --record(state, {index, counter, crash_reason}). --record(wstate, {index, args, props, counter, crash_reason}). - --define(MASTER, mock_vnode_master). - -%% API -start_vnode(I) -> - riak_core_vnode_master:start_vnode(I, ?MODULE). - -get_index(Preflist) -> - riak_core_vnode_master:sync_command(Preflist, get_index, ?MASTER). - -get_counter(Preflist) -> - riak_core_vnode_master:sync_command(Preflist, get_counter, ?MASTER). - -neverreply(Preflist) -> - riak_core_vnode_master:command(Preflist, neverreply, ?MASTER). - -returnreply(Preflist) -> - Ref = {neverreply, make_ref()}, - riak_core_vnode_master:command(Preflist, returnreply, {raw, Ref, self()}, ?MASTER), - {ok, Ref}. - -latereply(Preflist) -> - Ref = {latereply, make_ref()}, - riak_core_vnode_master:command(Preflist, latereply, {raw, Ref, self()}, ?MASTER), - {ok, Ref}. - -asyncnoreply(Preflist, AsyncDonePid) -> - Ref = {asyncnoreply, make_ref()}, - riak_core_vnode_master:command(Preflist, {asyncnoreply, AsyncDonePid}, - {raw, Ref, AsyncDonePid}, ?MASTER), - {ok, Ref}. - -asyncreply(Preflist, AsyncDonePid) -> - Ref = {asyncreply, make_ref()}, - riak_core_vnode_master:command(Preflist, {asyncreply, AsyncDonePid}, - {raw, Ref, AsyncDonePid}, ?MASTER), - {ok, Ref}. - -asynccrash(Preflist, AsyncDonePid) -> - Ref = {asynccrash, make_ref()}, - riak_core_vnode_master:command(Preflist, {asynccrash, AsyncDonePid}, - {raw, Ref, AsyncDonePid}, ?MASTER), - {ok, Ref}. - -crash(Preflist) -> - riak_core_vnode_master:sync_command(Preflist, crash, ?MASTER). 
- -spawn_error(Preflist, Cmd) -> - riak_core_vnode_master:sync_spawn_command(Preflist, {sync_error, Cmd}, ?MASTER). - -sync_error(Preflist, Cmd) -> - riak_core_vnode_master:sync_command(Preflist, {sync_error, Cmd}, ?MASTER). - -get_crash_reason(Preflist) -> - riak_core_vnode_master:sync_command(Preflist, get_crash_reason, ?MASTER). - -stop(Preflist) -> - riak_core_vnode_master:sync_command(Preflist, stop, ?MASTER). - - -%% Callbacks - -init([Index]) -> - S = #state{index=Index,counter=0}, - {ok, PoolSize} = application:get_env(riak_core, core_vnode_eqc_pool_size), - case PoolSize of - PoolSize when PoolSize > 0 -> - {ok, S, [{pool, ?MODULE, PoolSize, myargs}]}; - _ -> - {ok, S} - end. - -handle_command(get_index, _Sender, State) -> - {reply, {ok, State#state.index}, State}; -handle_command(get_counter, _Sender, State) -> - {reply, {ok, State#state.counter}, State}; -handle_command(get_crash_reason, _Sender, State) -> - {reply, {ok, State#state.crash_reason}, State}; -handle_command({sync_error, error}, _Sender, State) -> - erlang:error(core_breach), - {reply, ok, State}; -handle_command({sync_error, exit}, _Sender, State) -> - erlang:exit(core_breach), - {reply, ok, State}; -handle_command({sync_error, badthrow}, _Sender, State) -> - erlang:throw({reply, {error, terrible}, State}); %% emulate gen_server -handle_command({sync_error, goodthrow}, _Sender, State) -> - erlang:throw({reply, ok, State}); %% emulate gen_server - -handle_command(crash, _Sender, State) -> - spawn_link(fun() -> exit(State#state.index) end), - {reply, ok, State}; -handle_command(stop, Sender, State = #state{counter=Counter}) -> - %% Send reply here as vnode_master:sync_command does a call - %% which is cast on to the vnode process. Returning {stop,...} - %% does not provide for returning a response. 
- riak_core_vnode:reply(Sender, stopped), - {stop, normal, State#state{counter = Counter + 1}}; -handle_command(neverreply, _Sender, State = #state{counter=Counter}) -> - {noreply, State#state{counter = Counter + 1}}; -handle_command(returnreply, _Sender, State = #state{counter=Counter}) -> - {reply, returnreply, State#state{counter = Counter + 1}}; -handle_command(latereply, Sender, State = #state{counter=Counter}) -> - spawn(fun() -> - timer:sleep(100), - riak_core_vnode:reply(Sender, latereply) - end), - {noreply, State#state{counter = Counter + 1}}; -handle_command({asyncnoreply, DonePid}, Sender, State = #state{counter=Counter}) -> - {async, {noreply, DonePid}, Sender, State#state{counter = Counter + 1}}; -handle_command({asyncreply, DonePid}, Sender, State = #state{counter=Counter}) -> - {async, {reply, DonePid}, Sender, State#state{counter = Counter + 1}}; -handle_command({asynccrash, DonePid}, Sender, State = #state{counter=Counter}) -> - {async, {crash, DonePid}, Sender, State#state{counter = Counter + 1}}. - -handle_exit(_Pid, Reason, State) -> - {noreply, State#state{crash_reason=Reason}}. - -terminate(_Reason, _State) -> - ok. - - - -%% -%% Vnode worker callbacks -%% -init_worker(Index, Args, Props) -> - {ok, #wstate{index=Index, args=Args, props=Props}}. - -handle_work({noreply, DonePid}, {raw, Ref, _EqcPid} = _Sender, State = #wstate{index=I}) -> - timer:sleep(100), % slow things down enough to cause issue on stops - DonePid ! {I, {ok, Ref}}, - {noreply, State}; -handle_work({reply, DonePid}, {raw, Ref, _EqcPid} = _Sender, State = #wstate{index=I}) -> - timer:sleep(100), % slow things down enough to cause issue on stops - DonePid ! 
{I, {ok, Ref}}, - {reply, asyncreply, State}; -handle_work({crash, DonePid}, - {raw, Ref, _EqcPid} = _Sender, _State = #wstate{index=I}) -> - timer:sleep(100), % slow things down enough to cause issue on stops - %% This msg needs to get sent, since it's counted in core_vnode_eqc work tracking - %% in next_state_data/5 - DonePid ! {I, {ok, Ref}}, - throw(deliberate_async_crash). - diff --git a/include/riak_core_bucket_type.hrl b/include/riak_core_bucket_type.hrl deleted file mode 100644 index 298966604..000000000 --- a/include/riak_core_bucket_type.hrl +++ /dev/null @@ -1,3 +0,0 @@ --define(BUCKET_TYPE_PREFIX, {core, bucket_types}). --define(DEFAULT_TYPE, <<"default">>). - diff --git a/include/riak_core_vnode.hrl b/include/riak_core_vnode.hrl index 9ff955a66..9caf35944 100644 --- a/include/riak_core_vnode.hrl +++ b/include/riak_core_vnode.hrl @@ -1,9 +1,10 @@ -type sender_type() :: fsm | server | raw. -type sender() :: {sender_type(), reference() | tuple(), pid()} | %% TODO: Double-check that these special cases are kosher - {server, undefined, undefined} | % special case in - % riak_core_vnode_master.erl + {fsm, undefined, pid()} | % what are these special cases and what is the reference used for?? + {server, undefined, undefined} | % special case in riak_core_vnode_master.erl ignore. + -type partition() :: chash:index_as_int(). -type vnode_req() :: term(). -type keyspaces() :: [{partition(), [partition()]}]. @@ -28,9 +29,6 @@ forwardable :: boolean(), opts = [] :: list()}). --define(VNODE_REQ, #riak_vnode_req_v1). --define(COVERAGE_REQ, #riak_coverage_req_v1). --define(FOLD_REQ, #riak_core_fold_req_v2). -define(KV_VNODE_LOCK(Idx), {vnode_lock, Idx}). -type handoff_dest() :: {riak_core_handoff_manager:ho_type(), {partition(), node()}}. diff --git a/rebar.config b/rebar.config index c57e19120..796022367 100644 --- a/rebar.config +++ b/rebar.config @@ -1,22 +1,15 @@ -%{cover_enabled, true}. -{erl_opts, [warnings_as_errors, - debug_info]}. 
-{edoc_opts, [{preprocess, true}]}. -{xref_checks, []}. -{xref_queries, [{"(XC - UC) || (XU - X - B - \"(cluster_info|dtrace)\" : Mod)", []}]}. - +%% Add debug information in compiled code for other tools like debuggers, code coverage tools, or analysis tools +{erl_opts, [debug_info]}. {deps, [ % worker pool library - {poolboy, "~>1.5.2"}, - % metric collection - {basho_stats, "~>1.0.3"}, - % bloom filter - {blume, "~>0.1.0"}, - % consistent hashing function - {chash, "~>0.1.2"}, - % legacy gen_fsm behaviour - gen_fsm_compat + {poolboy, "~>1.5.2"} ]}. +%% Defensive xref configuration +{xref_checks, [ undefined_function_calls, locals_not_used, deprecated_function_calls ]}. + +%% Code formatter +{project_plugins, [rebar3_format, rebar3_lint, rebar3_proper]}. +{format, [ {formatter, otp_formatter} ]}. %%------------------------------------------------------------------- %% Profiles @@ -25,18 +18,18 @@ {profiles, [ {test, [ {erl_opts, [nowarn_export_all]}, - {deps, - [ - {mustache, ".*", {git, "https://github.com/mojombo/mustache.erl.git", {tag, "v0.1.1"}}}, - % event framework for test cases - {goldrush, "~>0.1.8"} - ]}]}, - {docs, [{deps, [{edown, "0.7.0"}]}]}, - {eqc, [{erl_opts, [nowarn_export_all, {d, 'EQC'}, {d, 'TEST'}]}, {deps, [meck]}, {plugins, [{rebar_eqc, "~>0.1.0"}]}]}, - {prod, [{relx, [{dev_mode, false}]}]}, - {lint, [ - {plugins, [ - {rebar3_lint, {git, "https://github.com/project-fifo/rebar3_lint.git", {tag, "0.1.4"}}} - ]} - ]} - ]}. + {plugins, [{coveralls, {git, "https://github.com/markusn/coveralls-erl", {branch, "master"}}}]}, + {deps, [meck]} + ]}, + {proper, [ + {erl_opts, [nowarn_export_all, {d, 'PROPER'}, {d, 'TEST'}]}, + {plugins, [{coveralls, {git, "https://github.com/markusn/coveralls-erl", {branch, "master"}}}]}, + {deps, [meck, {proper, "1.3.0"}, recon]} + ]}, + {docs, [{deps, [{edown, "0.7.0"}]}]} +]}. + +{cover_enabled, true}. +{cover_export_enabled, true}. +{coveralls_coverdata, "_build/test/cover/*.coverdata"}. 
+{coveralls_service_name, "travis-ci"}. diff --git a/rebar.config.script b/rebar.config.script new file mode 100644 index 000000000..6c1419104 --- /dev/null +++ b/rebar.config.script @@ -0,0 +1,6 @@ +case os:getenv("TRAVIS") of + "true" -> + lists:keystore(coveralls_service_job_id, 1, CONFIG, {coveralls_service_job_id, os:getenv("TRAVIS_JOB_ID")}); + _ -> + CONFIG +end. \ No newline at end of file diff --git a/rebar.lock b/rebar.lock index 1222fd60f..6dabcf610 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,14 +1,6 @@ {"1.1.0", -[{<<"basho_stats">>,{pkg,<<"basho_stats">>,<<"1.0.3">>},0}, - {<<"blume">>,{pkg,<<"blume">>,<<"0.1.1">>},0}, - {<<"chash">>,{pkg,<<"chash">>,<<"0.1.2">>},0}, - {<<"gen_fsm_compat">>,{pkg,<<"gen_fsm_compat">>,<<"0.3.0">>},0}, - {<<"poolboy">>,{pkg,<<"poolboy">>,<<"1.5.2">>},0}]}. +[{<<"poolboy">>,{pkg,<<"poolboy">>,<<"1.5.2">>},0}]}. [ {pkg_hash,[ - {<<"basho_stats">>, <<"7E1174151509C64FCC1934120ED32295E14F84DAAE7F84926BA2C8D3700D146C">>}, - {<<"blume">>, <<"CFB4F43688690BA81C6A79F54E4678CFD5FDEDAB692F277AE740AE4A3897360D">>}, - {<<"chash">>, <<"AF02484F2640C653C4B9A8557A14CA0704989DBEDB27E7CCBC442F1903A3BCA7">>}, - {<<"gen_fsm_compat">>, <<"5903549F67D595F58A7101154CBE0FDD46955FBFBE40813F1E53C23A970FF5F4">>}, {<<"poolboy">>, <<"392B007A1693A64540CEAD79830443ABF5762F5D30CF50BC95CB2C1AAAFA006B">>}]} ]. diff --git a/rebar3 b/rebar3 index b40d756a3..e550663ab 100755 Binary files a/rebar3 and b/rebar3 differ diff --git a/src/chash.erl b/src/chash.erl new file mode 100644 index 000000000..ec42297f2 --- /dev/null +++ b/src/chash.erl @@ -0,0 +1,308 @@ +%% ------------------------------------------------------------------- +%% +%% taken from: https://github.com/basho/riak_core/blob/develop/src/chash.erl +%% +%% chash: basic consistent hashing +%% +%% Copyright (c) 2007-2011 Basho Technologies, Inc. All Rights Reserved. 
+%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- + +%% @doc A consistent hashing implementation. The space described by the ring +%% coincides with SHA-1 hashes, and so any two keys producing the same +%% SHA-1 hash are considered identical within the ring. +%% +%% Warning: It is not recommended that code outside this module make use +%% of the structure of a chash. +%% +%% @reference Karger, D.; Lehman, E.; Leighton, T.; Panigrahy, R.; Levine, M.; +%% Lewin, D. (1997). "Consistent hashing and random trees". Proceedings of the +%% twenty-ninth annual ACM symposium on Theory of computing: 654~663. ACM Press +%% New York, NY, USA + +-module(chash). + +-export([contains_name/2, fresh/2, lookup/2, key_of/1, + members/1, merge_rings/2, next_index/2, nodes/1, + predecessors/2, predecessors/3, ring_increment/1, + size/1, successors/2, successors/3, update/3]). + +-export_type([chash/0, index/0, index_as_int/0]). + +-define(RINGTOP, + trunc(math:pow(2, 160) - 1)). % SHA-1 space + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). + +-endif. + +-type chash() :: {num_partitions(), [node_entry()]}. + +%% A Node is the unique identifier for the owner of a given partition. +%% An Erlang Pid works well here, but the chash module allows it to +%% be any term. +-type chash_node() :: term(). 
+ +%% Indices into the ring, used as keys for object location, are binary +%% representations of 160-bit integers. +-type index() :: <<_:160>>. + +-type index_as_int() :: integer(). + +-type node_entry() :: {index_as_int(), chash_node()}. + +-type num_partitions() :: pos_integer(). + +%% =================================================================== +%% Public API +%% =================================================================== + +%% @doc Return true if named Node owns any partitions in the ring, else false. +-spec contains_name(Name :: chash_node(), + CHash :: chash()) -> boolean(). + +contains_name(Name, CHash) -> + {_NumPartitions, Nodes} = CHash, + [X || {_, X} <- Nodes, X == Name] =/= []. + +%% @doc Create a brand new ring. The size and seednode are specified; +%% initially all partitions are owned by the seednode. If NumPartitions +%% is not much larger than the intended eventual number of +%% participating nodes, then performance will suffer. +-spec fresh(NumPartitions :: num_partitions(), + SeedNode :: chash_node()) -> chash(). + +fresh(NumPartitions, SeedNode) -> + Inc = ring_increment(NumPartitions), + {NumPartitions, + [{IndexAsInt, SeedNode} + || IndexAsInt <- lists:seq(0, (?RINGTOP) - 1, Inc)]}. + +%% @doc Find the Node that owns the partition identified by IndexAsInt. +-spec lookup(IndexAsInt :: index_as_int(), + CHash :: chash()) -> chash_node(). + +lookup(IndexAsInt, CHash) -> + {_NumPartitions, Nodes} = CHash, + {IndexAsInt, X} = proplists:lookup(IndexAsInt, Nodes), + X. + +sha(Bin) -> crypto:hash(sha, Bin). + +%% @doc Given any term used to name an object, produce that object's key +%% into the ring. Two names with the same SHA-1 hash value are +%% considered the same name. +-spec key_of(ObjectName :: term()) -> index(). + +key_of(ObjectName) -> sha(term_to_binary(ObjectName)). + +%% @doc Return all Nodes that own any partitions in the ring. +-spec members(CHash :: chash()) -> [chash_node()]. 
+ +members(CHash) -> + {_NumPartitions, Nodes} = CHash, + lists:usort([X || {_Idx, X} <- Nodes]). + +%% @doc Return a randomized merge of two rings. +%% If multiple nodes are actively claiming nodes in the same +%% time period, churn will occur. Be prepared to live with it. +-spec merge_rings(CHashA :: chash(), + CHashB :: chash()) -> chash(). + +merge_rings(CHashA, CHashB) -> + {NumPartitions, NodesA} = CHashA, + {NumPartitions, NodesB} = CHashB, + {NumPartitions, + [{I, random_node(A, B)} + || {{I, A}, {I, B}} <- lists:zip(NodesA, NodesB)]}. + +%% @doc Given the integer representation of a chash key, +%% return the next ring index integer value. +-spec next_index(IntegerKey :: integer(), + CHash :: chash()) -> index_as_int(). + +next_index(IntegerKey, {NumPartitions, _}) -> + Inc = ring_increment(NumPartitions), + (IntegerKey div Inc + 1) rem NumPartitions * Inc. + +%% @doc Return the entire set of NodeEntries in the ring. +-spec nodes(CHash :: chash()) -> [node_entry()]. + +nodes(CHash) -> {_NumPartitions, Nodes} = CHash, Nodes. + +%% @doc Given an object key, return all NodeEntries in order starting at Index. +-spec ordered_from(Index :: index(), + CHash :: chash()) -> [node_entry()]. + +ordered_from(Index, {NumPartitions, Nodes}) -> + <<IndexAsInt:160/integer>> = Index, + Inc = ring_increment(NumPartitions), + {A, B} = lists:split(IndexAsInt div Inc + 1, Nodes), + B ++ A. + +%% @doc Given an object key, return all NodeEntries in reverse order +%% starting at Index. +-spec predecessors(Index :: index() | index_as_int(), + CHash :: chash()) -> [node_entry()]. + +predecessors(Index, CHash) -> + {NumPartitions, _Nodes} = CHash, + predecessors(Index, CHash, NumPartitions). + +%% @doc Given an object key, return the next N NodeEntries in reverse order +%% starting at Index. +-spec predecessors(Index :: index() | index_as_int(), + CHash :: chash(), N :: integer()) -> [node_entry()]. 
+ +predecessors(Index, CHash, N) when is_integer(Index) -> + predecessors(<<Index:160/integer>>, CHash, N); +predecessors(Index, CHash, N) -> + Num = max_n(N, CHash), + {Res, _} = lists:split(Num, + lists:reverse(ordered_from(Index, CHash))), + Res. + +%% @doc Return increment between ring indexes given +%% the number of ring partitions. +-spec ring_increment(NumPartitions :: + pos_integer()) -> pos_integer(). + +ring_increment(NumPartitions) -> + (?RINGTOP) div NumPartitions. + +%% @doc Return the number of partitions in the ring. +-spec size(CHash :: chash()) -> integer(). + +size(CHash) -> + {_NumPartitions, Nodes} = CHash, length(Nodes). + +%% @doc Given an object key, return all NodeEntries in order starting at Index. +-spec successors(Index :: index(), + CHash :: chash()) -> [node_entry()]. + +successors(Index, CHash) -> + {NumPartitions, _Nodes} = CHash, + successors(Index, CHash, NumPartitions). + +%% @doc Given an object key, return the next N NodeEntries in order +%% starting at Index. +-spec successors(Index :: index(), CHash :: chash(), + N :: integer()) -> [node_entry()]. + +successors(Index, CHash, N) -> + Num = max_n(N, CHash), + Ordered = ordered_from(Index, CHash), + {NumPartitions, _Nodes} = CHash, + if Num =:= NumPartitions -> Ordered; + true -> {Res, _} = lists:split(Num, Ordered), Res + end. + +%% @doc Make the partition beginning at IndexAsInt owned by Name'd node. +-spec update(IndexAsInt :: index_as_int(), + Name :: chash_node(), CHash :: chash()) -> chash(). + +update(IndexAsInt, Name, CHash) -> + {NumPartitions, Nodes} = CHash, + NewNodes = lists:keyreplace(IndexAsInt, 1, Nodes, + {IndexAsInt, Name}), + {NumPartitions, NewNodes}. + +%% ==================================================================== +%% Internal functions +%% ==================================================================== + +%% @private +%% @doc Return either N or the number of partitions in the ring, whichever +%% is lesser. 
+-spec max_n(N :: integer(), + CHash :: chash()) -> integer(). + +max_n(N, {NumPartitions, _Nodes}) -> + erlang:min(N, NumPartitions). + +%% @private +-spec random_node(NodeA :: chash_node(), + NodeB :: chash_node()) -> chash_node(). + +random_node(NodeA, NodeA) -> NodeA; +random_node(NodeA, NodeB) -> + lists:nth(rand:uniform(2), [NodeA, NodeB]). + +%% =================================================================== +%% EUnit tests +%% =================================================================== +-ifdef(TEST). + +update_test() -> + Node = old@host, + NewNode = new@host, + % Create a fresh ring... + CHash = chash:fresh(5, Node), + GetNthIndex = fun (N, {_, Nodes}) -> + {Index, _} = lists:nth(N, Nodes), Index + end, + % Test update... + FirstIndex = GetNthIndex(1, CHash), + ThirdIndex = GetNthIndex(3, CHash), + {5, + [{_, NewNode}, {_, Node}, {_, Node}, {_, Node}, + {_, Node}, {_, Node}]} = + update(FirstIndex, NewNode, CHash), + {5, + [{_, Node}, {_, Node}, {_, NewNode}, {_, Node}, + {_, Node}, {_, Node}]} = + update(ThirdIndex, NewNode, CHash). + +contains_test() -> + CHash = chash:fresh(8, the_node), + ?assertEqual(true, (contains_name(the_node, CHash))), + ?assertEqual(false, + (contains_name(some_other_node, CHash))). + +max_n_test() -> + CHash = chash:fresh(8, the_node), + ?assertEqual(1, (max_n(1, CHash))), + ?assertEqual(8, (max_n(11, CHash))). + +simple_size_test() -> + ?assertEqual(8, + (length(chash:nodes(chash:fresh(8, the_node))))). + +successors_length_test() -> + ?assertEqual(8, + (length(chash:successors(chash:key_of(0), + chash:fresh(8, the_node))))). + +inverse_pred_test() -> + CHash = chash:fresh(8, the_node), + S = [I + || {I, _} <- chash:successors(chash:key_of(4), CHash)], + P = [I + || {I, _} + <- chash:predecessors(chash:key_of(4), CHash)], + ?assertEqual(S, (lists:reverse(P))). 
+ +merge_test() -> + CHashA = chash:fresh(8, node_one), + CHashB = chash:update(0, node_one, + chash:fresh(8, node_two)), + CHash = chash:merge_rings(CHashA, CHashB), + ?assertEqual(node_one, (chash:lookup(0, CHash))). + +-endif. diff --git a/src/chashbin.erl b/src/chashbin.erl new file mode 100644 index 000000000..6404ae0fd --- /dev/null +++ b/src/chashbin.erl @@ -0,0 +1,255 @@ +%% ------------------------------------------------------------------- +%% +%% taken from: https://github.com/basho/riak_core/blob/develop/src/chashbin.erl +%% +%% riak_core: Core Riak Application +%% +%% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- +-module(chashbin). + +-export([create/1, to_chash/1, to_list/1, + to_list_filter/2, responsible_index/2, + responsible_position/2, index_owner/2, + num_partitions/1]). + +-export([iterator/2, exact_iterator/2, itr_value/1, + itr_pop/2, itr_next/1, itr_next_while/2]). + +-export_type([chashbin/0]). + +%% 160 bits for hash, 16 bits for node id +%% these macros break edoc +%% also these macros are not used consistently, commenting out for now +%%-define(UNIT, 176). +%%-define(ENTRY, binary-unit:?UNIT). + +-type owners_bin() :: <<_:_*176>>. + +-type index() :: chash:index_as_int(). + +-type pred_fun() :: fun(({index(), + node()}) -> boolean()). 
-type chash_key() :: index() | chash:index().

%% Both branches of the former -ifndef(namespaced_types) conditional
%% declared byte-identical records, so a single definition suffices.
%% `nodes' holds node() entries; Erlang has no parameterized tuple
%% type (`erlang:tuple(node())' is not a defined type), so it is
%% declared as a plain tuple().
-record(chashbin,
        {size :: pos_integer(),
         owners :: owners_bin(),
         nodes :: tuple()}).

-type chashbin() :: #chashbin{}.

-record(iterator,
        {pos :: non_neg_integer(),
         start :: non_neg_integer(),
         chbin :: chashbin()}).

-type iterator() :: #iterator{}.

%% ===================================================================
%% Public API
%% ===================================================================

%% @doc Create a `chashbin' from the provided `chash'.
%% Owners are packed as repeated 176-bit segments:
%% 160-bit ring index followed by a 16-bit node id.
-spec create(chash:chash()) -> chashbin().
create({Size, Owners}) ->
    Nodes1 = [Node || {_, Node} <- Owners],
    Nodes2 = lists:usort(Nodes1),
    Nodes3 = lists:zip(Nodes2, lists:seq(1, length(Nodes2))),
    Bin = create_bin(Owners, Nodes3, <<>>),
    #chashbin{size = Size, owners = Bin,
              nodes = list_to_tuple(Nodes2)}.

%% @doc Convert a `chashbin' back to a `chash'.
-spec to_chash(chashbin()) -> chash:chash().
to_chash(CHBin = #chashbin{size = Size}) ->
    L = to_list(CHBin),
    {Size, L}.

%% @doc Convert a `chashbin' to a list of `{Index, Owner}' pairs.
%% NOTE(review): the binary-comprehension pattern was lost in transit;
%% restored to the 160-bit index / 16-bit id layout used by itr_value/1.
-spec to_list(chashbin()) -> [{index(), node()}].
to_list(#chashbin{owners = Bin, nodes = Nodes}) ->
    [{Idx, element(Id, Nodes)}
     || <<Idx:160/integer, Id:16/integer>> <= Bin].

%% @doc
%% Convert a `chashbin' to a list of `{Index, Owner}' pairs for
%% which `Pred({Index, Owner})' returns `true'.
-spec to_list_filter(pred_fun(), chashbin()) -> [{index(), node()}].
to_list_filter(Pred, #chashbin{owners = Bin, nodes = Nodes}) ->
    [{Idx, element(Id, Nodes)}
     || <<Idx:160/integer, Id:16/integer>> <= Bin,
        Pred({Idx, element(Id, Nodes)})].

%% @doc Determine the ring index responsible for a given chash key.
-spec responsible_index(chash_key(), chashbin()) -> index().
%% NOTE(review): the 160-bit binary-key clause heads below were lost in
%% transit (`<>'); restored from the intact itr_value/1 pattern.
responsible_index(<<HashKey:160/integer>>, CHBin) ->
    responsible_index(HashKey, CHBin);
responsible_index(HashKey, #chashbin{size = Size}) ->
    Inc = chash:ring_increment(Size),
    ((HashKey div Inc) + 1) rem Size * Inc.

%% @doc Determine the ring position responsible for a given chash key.
-spec responsible_position(chash_key(), chashbin()) -> non_neg_integer().
responsible_position(<<HashKey:160/integer>>, CHBin) ->
    responsible_position(HashKey, CHBin);
responsible_position(HashKey, #chashbin{size = Size}) ->
    Inc = chash:ring_increment(Size),
    ((HashKey div Inc) + 1) rem Size.

%% @doc Return the node that owns the given index.
-spec index_owner(index(), chashbin()) -> node().
index_owner(Idx, CHBin) ->
    case itr_value(exact_iterator(Idx, CHBin)) of
        {Idx, Owner} -> Owner;
        _ ->
            %% Match the behavior for riak_core_ring:index_owner/2
            exit({badmatch, false})
    end.

%% @doc Return the number of partitions in a given `chashbin'.
-spec num_partitions(chashbin()) -> pos_integer().
num_partitions(#chashbin{size = Size}) -> Size.

%% ===================================================================
%% Public Iterator API
%% ===================================================================

%% @doc
%% Return an iterator pointing to the index responsible for the given
%% chash key.
-spec iterator(first | chash_key(), chashbin()) -> iterator().
iterator(first, CHBin) ->
    #iterator{pos = 0, start = 0, chbin = CHBin};
iterator(<<HashKey:160/integer>>, CHBin) ->
    iterator(HashKey, CHBin);
iterator(HashKey, CHBin) ->
    Pos = responsible_position(HashKey, CHBin),
    #iterator{pos = Pos, start = Pos, chbin = CHBin}.

%% @doc Return the `{Index, Owner}' pair pointed to by the iterator.
-spec itr_value(iterator()) -> {index(), node()}.
itr_value(#iterator{pos = Pos,
                    chbin = #chashbin{owners = Bin, nodes = Nodes}}) ->
    %% Skip Pos 176-bit entries, then read one index/id pair.
    <<_:Pos/binary-unit:176, Idx:160/integer, Id:16/integer,
      _/binary>> =
        Bin,
    Owner = element(Id, Nodes),
    {Idx, Owner}.
%% @doc Advance the iterator by one ring position.
-spec itr_next(iterator()) -> iterator() | done.
itr_next(Itr = #iterator{pos = Pos, start = Start, chbin = CHBin}) ->
    Pos2 = (Pos + 1) rem CHBin#chashbin.size,
    case Pos2 of
        Start -> done;            % wrapped all the way around
        _ -> Itr#iterator{pos = Pos2}
    end.

%% @doc
%% Advance the iterator `N' times, returning a list of the traversed
%% `{Index, Owner}' pairs as well as the new iterator state.
%% NOTE(review): the wraparound binary bind and the comprehension
%% patterns below were lost in transit; restored from the module's
%% 176-bit (160-bit index + 16-bit id) entry layout.
-spec itr_pop(pos_integer(), iterator()) -> {[{index(), node()}],
                                             iterator()}.
itr_pop(N, Itr = #iterator{pos = Pos, chbin = CHBin}) ->
    #chashbin{size = Size, owners = Bin, nodes = Nodes} = CHBin,
    L = case Bin of
            %% Common case: N entries fit without wrapping past the end.
            <<_:Pos/binary-unit:176, Bin2:N/binary-unit:176, _/binary>> ->
                [{Idx, element(Id, Nodes)}
                 || <<Idx:160/integer, Id:16/integer>> <= Bin2];
            _ ->
                %% Wraparound: take the tail from Pos, then `Left'
                %% entries from the front of the ring.
                Left = N + Pos - Size,
                Skip = Pos - Left,
                <<Bin3:Left/binary-unit:176, _:Skip/binary-unit:176,
                  Bin2/binary>> =
                    Bin,
                L1 = [{Idx, element(Id, Nodes)}
                      || <<Idx:160/integer, Id:16/integer>> <= Bin2],
                L2 = [{Idx, element(Id, Nodes)}
                      || <<Idx:160/integer, Id:16/integer>> <= Bin3],
                L1 ++ L2
        end,
    Pos2 = (Pos + N) rem Size,
    Itr2 = Itr#iterator{pos = Pos2},
    {L, Itr2}.

%% @doc Advance the iterator while `Pred({Index, Owner})' returns `true'.
-spec itr_next_while(pred_fun(), iterator()) -> iterator().
itr_next_while(Pred, Itr) ->
    case Pred(itr_value(Itr)) of
        false -> Itr;
        true -> itr_next_while(Pred, itr_next(Itr))
    end.

%% ===================================================================
%% Internal functions
%% ===================================================================

%% Convert list of {Index, Owner} pairs into the `chashbin' binary
%% representation: one 176-bit segment per partition.
-spec create_bin([{index(), node()}],
                 [{node(), pos_integer()}], binary()) -> owners_bin().
create_bin([], _, Bin) -> Bin;
create_bin([{Idx, Owner} | Owners], Nodes, Bin) ->
    {Owner, Id} = lists:keyfind(Owner, 1, Nodes),
    Bin2 = <<Bin/binary, Idx:160/integer, Id:16/integer>>,
    create_bin(Owners, Nodes, Bin2).
+ +%% Convert ring index into ring position +index_position(<>, CHBin) -> + index_position(Idx, CHBin); +index_position(Idx, #chashbin{size = Size}) -> + Inc = chash:ring_increment(Size), Idx div Inc rem Size. + +%% Return iterator pointing to the given index +exact_iterator(<>, CHBin) -> + exact_iterator(Idx, CHBin); +exact_iterator(Idx, CHBin) -> + Pos = index_position(Idx, CHBin), + #iterator{pos = Pos, start = Pos, chbin = CHBin}. diff --git a/src/gen_fsm_compat.erl b/src/gen_fsm_compat.erl new file mode 100644 index 000000000..f137bdb16 --- /dev/null +++ b/src/gen_fsm_compat.erl @@ -0,0 +1,795 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2018. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +-module(gen_fsm_compat). + +%%%----------------------------------------------------------------- +%%% +%%% This state machine is somewhat more pure than state_lib. It is +%%% still based on State dispatching (one function per state), but +%%% allows a function handle_event to take care of events in all states. +%%% It's not that pure anymore :( We also allow synchronized event sending. +%%% +%%% If the Parent process terminates the Module:terminate/2 +%%% function is called. 
+%%% +%%% The user module should export: +%%% +%%% init(Args) +%%% ==> {ok, StateName, StateData} +%%% {ok, StateName, StateData, Timeout} +%%% ignore +%%% {stop, Reason} +%%% +%%% StateName(Msg, StateData) +%%% +%%% ==> {next_state, NewStateName, NewStateData} +%%% {next_state, NewStateName, NewStateData, Timeout} +%%% {stop, Reason, NewStateData} +%%% Reason = normal | shutdown | Term terminate(State) is called +%%% +%%% StateName(Msg, From, StateData) +%%% +%%% ==> {next_state, NewStateName, NewStateData} +%%% {next_state, NewStateName, NewStateData, Timeout} +%%% {reply, Reply, NewStateName, NewStateData} +%%% {reply, Reply, NewStateName, NewStateData, Timeout} +%%% {stop, Reason, NewStateData} +%%% Reason = normal | shutdown | Term terminate(State) is called +%%% +%%% handle_event(Msg, StateName, StateData) +%%% +%%% ==> {next_state, NewStateName, NewStateData} +%%% {next_state, NewStateName, NewStateData, Timeout} +%%% {stop, Reason, Reply, NewStateData} +%%% {stop, Reason, NewStateData} +%%% Reason = normal | shutdown | Term terminate(State) is called +%%% +%%% handle_sync_event(Msg, From, StateName, StateData) +%%% +%%% ==> {next_state, NewStateName, NewStateData} +%%% {next_state, NewStateName, NewStateData, Timeout} +%%% {reply, Reply, NewStateName, NewStateData} +%%% {reply, Reply, NewStateName, NewStateData, Timeout} +%%% {stop, Reason, Reply, NewStateData} +%%% {stop, Reason, NewStateData} +%%% Reason = normal | shutdown | Term terminate(State) is called +%%% +%%% handle_info(Info, StateName) (e.g. {'EXIT', P, R}, {nodedown, N}, ... 
+%%% +%%% ==> {next_state, NewStateName, NewStateData} +%%% {next_state, NewStateName, NewStateData, Timeout} +%%% {stop, Reason, NewStateData} +%%% Reason = normal | shutdown | Term terminate(State) is called +%%% +%%% terminate(Reason, StateName, StateData) Let the user module clean up +%%% always called when server terminates +%%% +%%% ==> the return value is ignored +%%% +%%% +%%% The work flow (of the fsm) can be described as follows: +%%% +%%% User module fsm +%%% ----------- ------- +%%% start -----> start +%%% init <----- . +%%% +%%% loop +%%% StateName <----- . +%%% +%%% handle_event <----- . +%%% +%%% handle__sunc_event <----- . +%%% +%%% handle_info <----- . +%%% +%%% terminate <----- . +%%% +%%% +%%% --------------------------------------------------- + +-export([start/3, start/4, start_link/3, start_link/4, + stop/1, stop/3, send_event/2, sync_send_event/2, + sync_send_event/3, send_all_state_event/2, + sync_send_all_state_event/2, + sync_send_all_state_event/3, reply/2, start_timer/2, + send_event_after/2, cancel_timer/1, enter_loop/4, + enter_loop/5, enter_loop/6, wake_hib/7]). + +%% Internal exports +-export([init_it/6, system_continue/3, + system_terminate/4, system_code_change/4, + system_get_state/1, system_replace_state/2, + format_status/2]). + +-import(error_logger, [format/2]). + +%%% --------------------------------------------------- +%%% Interface functions. +%%% --------------------------------------------------- + +-callback init(Args :: term()) -> {ok, + StateName :: atom(), StateData :: term()} | + {ok, StateName :: atom(), StateData :: term(), + timeout() | hibernate} | + {stop, Reason :: term()} | ignore. + +-callback handle_event(Event :: term(), + StateName :: atom(), + StateData :: term()) -> {next_state, + NextStateName :: atom(), + NewStateData :: term()} | + {next_state, + NextStateName :: atom(), + NewStateData :: term(), + timeout() | hibernate} | + {stop, Reason :: term(), + NewStateData :: term()}. 
%% Callback contract for events sent with sync_send_event/2,3 routed
%% through Module:handle_sync_event/4.
-callback handle_sync_event(Event :: term(),
                            From :: {pid(), Tag :: term()},
                            StateName :: atom(),
                            StateData :: term()) ->
    {reply, Reply :: term(), NextStateName :: atom(),
     NewStateData :: term()} |
    {reply, Reply :: term(), NextStateName :: atom(),
     NewStateData :: term(), timeout() | hibernate} |
    {next_state, NextStateName :: atom(), NewStateData :: term()} |
    {next_state, NextStateName :: atom(), NewStateData :: term(),
     timeout() | hibernate} |
    {stop, Reason :: term(), Reply :: term(), NewStateData :: term()} |
    {stop, Reason :: term(), NewStateData :: term()}.

%% Callback contract for raw messages ({'EXIT',..}, {nodedown,..}, ...).
-callback handle_info(Info :: term(),
                      StateName :: atom(),
                      StateData :: term()) ->
    {next_state, NextStateName :: atom(), NewStateData :: term()} |
    {next_state, NextStateName :: atom(), NewStateData :: term(),
     timeout() | hibernate} |
    {stop, Reason :: normal | term(), NewStateData :: term()}.

%% Called on shutdown; the return value is ignored.
-callback terminate(Reason :: normal | shutdown |
                              {shutdown, term()} | term(),
                    StateName :: atom(),
                    StateData :: term()) -> term().

%% Hot-upgrade hook.
-callback code_change(OldVsn :: term() | {down, term()},
                      StateName :: atom(),
                      StateData :: term(),
                      Extra :: term()) ->
    {ok, NextStateName :: atom(), NewStateData :: term()}.

%% Optional pretty-printing of status for sys:get_status/1 and crash logs.
-callback format_status(Opt, StatusData) -> Status
    when Opt :: normal | terminate,
         StatusData :: [PDict | State],
         PDict :: [{Key :: term(), Value :: term()}],
         State :: term(),
         Status :: term().

-optional_callbacks([handle_info/3, terminate/3,
                     code_change/4, format_status/2]).

%%% ---------------------------------------------------
%%% Starts a generic state machine.
%%% start(Mod, Args, Options)
%%% start(Name, Mod, Args, Options)
%%% start_link(Mod, Args, Options)
%%% start_link(Name, Mod, Args, Options) where:
%%%    Name ::= {local, atom()} | {global, term()} | {via, atom(), term()}
%%%    Mod  ::= atom(), callback module implementing the 'real' fsm
%%%    Args ::= term(), init arguments (to Module:init/1)
%%%    Options ::= [{debug, [Flag]}]
%%%      Flag ::= trace | log | {logfile, File} | statistics | debug
%%%          (debug == log && statistics)
%%% Returns: {ok, Pid} |
%%%          {error, {already_started, Pid}} |
%%%          {error, Reason}
%%% ---------------------------------------------------

%% Unlinked starts.
start(Mod, Args, Options) ->
    gen:start(?MODULE, nolink, Mod, Args, Options).

start(Name, Mod, Args, Options) ->
    gen:start(?MODULE, nolink, Name, Mod, Args, Options).

%% Linked starts.
start_link(Mod, Args, Options) ->
    gen:start(?MODULE, link, Mod, Args, Options).

start_link(Name, Mod, Args, Options) ->
    gen:start(?MODULE, link, Name, Mod, Args, Options).

stop(Name) -> gen:stop(Name).

stop(Name, Reason, Timeout) ->
    gen:stop(Name, Reason, Timeout).

%% Fire-and-forget event delivery; always returns ok, even if the
%% destination does not exist (errors from global/via sends are caught).
send_event({global, Name}, Event) ->
    catch global:send(Name, {'$gen_event', Event}),
    ok;
send_event({via, Module, Name}, Event) ->
    catch Module:send(Name, {'$gen_event', Event}),
    ok;
send_event(Name, Event) ->
    Name ! {'$gen_event', Event},
    ok.

%% Synchronous event; exits the caller if the call fails.
sync_send_event(Name, Event) ->
    case catch gen:call(Name, '$gen_sync_event', Event) of
        {ok, Res} ->
            Res;
        {'EXIT', Reason} ->
            exit({Reason, {?MODULE, sync_send_event, [Name, Event]}})
    end.

sync_send_event(Name, Event, Timeout) ->
    case catch gen:call(Name, '$gen_sync_event', Event, Timeout) of
        {ok, Res} ->
            Res;
        {'EXIT', Reason} ->
            exit({Reason,
                  {?MODULE, sync_send_event, [Name, Event, Timeout]}})
    end.
%% Fire-and-forget delivery of an all-states event; always returns ok.
send_all_state_event({global, Name}, Event) ->
    catch global:send(Name, {'$gen_all_state_event', Event}),
    ok;
send_all_state_event({via, Module, Name}, Event) ->
    catch Module:send(Name, {'$gen_all_state_event', Event}),
    ok;
send_all_state_event(Name, Event) ->
    Name ! {'$gen_all_state_event', Event},
    ok.

%% Synchronous all-states event; exits the caller if the call fails.
sync_send_all_state_event(Name, Event) ->
    case catch gen:call(Name, '$gen_sync_all_state_event', Event) of
        {ok, Res} ->
            Res;
        {'EXIT', Reason} ->
            exit({Reason,
                  {?MODULE, sync_send_all_state_event, [Name, Event]}})
    end.

sync_send_all_state_event(Name, Event, Timeout) ->
    case catch gen:call(Name, '$gen_sync_all_state_event', Event,
                        Timeout)
        of
        {ok, Res} ->
            Res;
        {'EXIT', Reason} ->
            exit({Reason,
                  {?MODULE, sync_send_all_state_event,
                   [Name, Event, Timeout]}})
    end.

%% Designed to be only callable within one of the callbacks
%% hence using the self() of this instance of the process.
%% This is to ensure that timers don't go astray in global
%% e.g. when straddling a failover, or turn up in a restarted
%% instance of the process.

%% Returns Ref, sends event {timeout,Ref,Msg} after Time
%% to the (then) current state.
start_timer(Time, Msg) ->
    erlang:start_timer(Time, self(), {'$gen_timer', Msg}).

%% Returns Ref, sends Event after Time to the (then) current state.
send_event_after(Time, Event) ->
    erlang:start_timer(Time, self(), {'$gen_event', Event}).

%% Returns the remaining time for the timer if Ref referred to
%% an active timer/send_event_after, false otherwise.
%% A timer that already fired has its message flushed from the mailbox
%% and 0 is returned.
cancel_timer(Ref) ->
    case erlang:cancel_timer(Ref) of
        false ->
            receive
                {timeout, Ref, _} -> 0
            after 0 -> false
            end;
        RemainingTime ->
            RemainingTime
    end.

%% enter_loop/4,5,6
%% Makes an existing process into a gen_fsm.
%% The calling process will enter the gen_fsm receive loop and become a
%% gen_fsm process.
%% The process *must* have been started using one of the start functions
%% in proc_lib, see proc_lib(3).
%% The user is responsible for any initialization of the process,
%% including registering a name for it.

%% Default name (self) and timeout (infinity).
enter_loop(Mod, Options, StateName, StateData) ->
    enter_loop(Mod, Options, StateName, StateData, self(), infinity).

%% Fifth argument is either a registered server name or a timeout.
enter_loop(Mod, Options, StateName, StateData,
           {Scope, _} = ServerName)
    when Scope == local; Scope == global ->
    enter_loop(Mod, Options, StateName, StateData, ServerName,
               infinity);
enter_loop(Mod, Options, StateName, StateData,
           {via, _, _} = ServerName) ->
    enter_loop(Mod, Options, StateName, StateData, ServerName,
               infinity);
enter_loop(Mod, Options, StateName, StateData, Timeout) ->
    enter_loop(Mod, Options, StateName, StateData, self(), Timeout).

%% Full form: resolve the process name/parent/debug options via the
%% gen module, then drop into the main loop.
enter_loop(Mod, Options, StateName, StateData, ServerName,
           Timeout) ->
    Name = gen:get_proc_name(ServerName),
    Parent = gen:get_parent(),
    Debug = gen:debug_options(Name, Options),
    HibernateAfterTimeout = gen:hibernate_after(Options),
    loop(Parent, Name, StateName, StateData, Mod, Timeout,
         HibernateAfterTimeout, Debug).

%%% ---------------------------------------------------
%%% Initiate the new process.
%%% Register the name using the Rfunc function
%%% Calls the Module:init/Args function.
%%% Finally an acknowledge is sent to Parent and the main
%%% loop is entered.
%%% ---------------------------------------------------
init_it(Starter, self, Name, Mod, Args, Options) ->
    init_it(Starter, self(), Name, Mod, Args, Options);
init_it(Starter, Parent, Name0, Module, Args, Options) ->
    Name = gen:name(Name0),
    Debug = gen:debug_options(Name, Options),
    HibernateAfterTimeout = gen:hibernate_after(Options),
    %% Run the user's init/1; any of the documented returns selects a
    %% loop entry, everything else is reported as a bad return value.
    case catch Module:init(Args) of
        {ok, StateName, StateData} ->
            proc_lib:init_ack(Starter, {ok, self()}),
            loop(Parent, Name, StateName, StateData, Module, infinity,
                 HibernateAfterTimeout, Debug);
        {ok, StateName, StateData, Timeout} ->
            proc_lib:init_ack(Starter, {ok, self()}),
            loop(Parent, Name, StateName, StateData, Module, Timeout,
                 HibernateAfterTimeout, Debug);
        {stop, Reason} ->
            gen:unregister_name(Name0),
            proc_lib:init_ack(Starter, {error, Reason}),
            exit(Reason);
        ignore ->
            gen:unregister_name(Name0),
            proc_lib:init_ack(Starter, ignore),
            exit(normal);
        {'EXIT', Reason} ->
            gen:unregister_name(Name0),
            proc_lib:init_ack(Starter, {error, Reason}),
            exit(Reason);
        Else ->
            Error = {bad_return_value, Else},
            proc_lib:init_ack(Starter, {error, Error}),
            exit(Error)
    end.
%%-----------------------------------------------------------------
%% The MAIN loop
%%-----------------------------------------------------------------
%% A 'hibernate' timeout puts the process into proc_lib hibernation,
%% resuming in wake_hib/7. An 'infinity' timeout still hibernates once
%% HibernateAfterTimeout elapses with no traffic.
loop(Parent, Name, StateName, StateData, Mod, hibernate,
     HibernateAfterTimeout, Debug) ->
    proc_lib:hibernate(?MODULE, wake_hib,
                       [Parent, Name, StateName, StateData, Mod,
                        HibernateAfterTimeout, Debug]);
loop(Parent, Name, StateName, StateData, Mod, infinity,
     HibernateAfterTimeout, Debug) ->
    receive
        Msg ->
            decode_msg(Msg, Parent, Name, StateName, StateData, Mod,
                       infinity, HibernateAfterTimeout, Debug, false)
    after HibernateAfterTimeout ->
            loop(Parent, Name, StateName, StateData, Mod, hibernate,
                 HibernateAfterTimeout, Debug)
    end;
loop(Parent, Name, StateName, StateData, Mod, Time,
     HibernateAfterTimeout, Debug) ->
    %% A finite state timeout becomes a '$gen_event' timeout message.
    Msg = receive
              Input -> Input
          after Time -> {'$gen_event', timeout}
          end,
    decode_msg(Msg, Parent, Name, StateName, StateData, Mod, Time,
               HibernateAfterTimeout, Debug, false).

%% Resume point after hibernation; blocks until the next message.
wake_hib(Parent, Name, StateName, StateData, Mod,
         HibernateAfterTimeout, Debug) ->
    Msg = receive Input -> Input end,
    decode_msg(Msg, Parent, Name, StateName, StateData, Mod, hibernate,
               HibernateAfterTimeout, Debug, true).

%% Route system messages to sys, parent exits to terminate, and
%% everything else to handle_msg (with or without debug tracing).
decode_msg(Msg, Parent, Name, StateName, StateData, Mod, Time,
           HibernateAfterTimeout, Debug, Hib) ->
    case Msg of
        {system, From, Req} ->
            sys:handle_system_msg(Req, From, Parent, ?MODULE, Debug,
                                  [Name, StateName, StateData, Mod,
                                   Time, HibernateAfterTimeout],
                                  Hib);
        {'EXIT', Parent, Reason} ->
            terminate(Reason, Name, Msg, Mod, StateName, StateData,
                      Debug);
        _Msg when Debug =:= [] ->
            handle_msg(Msg, Parent, Name, StateName, StateData, Mod,
                       Time, HibernateAfterTimeout);
        _Msg ->
            Debug1 = sys:handle_debug(Debug, fun print_event/3,
                                      {Name, StateName}, {in, Msg}),
            handle_msg(Msg, Parent, Name, StateName, StateData, Mod,
                       Time, HibernateAfterTimeout, Debug1)
    end.
%%-----------------------------------------------------------------
%% Callback functions for system messages handling.
%%-----------------------------------------------------------------
system_continue(Parent, Debug,
                [Name, StateName, StateData, Mod, Time,
                 HibernateAfterTimeout]) ->
    loop(Parent, Name, StateName, StateData, Mod, Time,
         HibernateAfterTimeout, Debug).

-spec system_terminate(term(), _, _, [term(), ...]) -> no_return().
system_terminate(Reason, _Parent, Debug,
                 [Name, StateName, StateData, Mod, _Time,
                  _HibernateAfterTimeout]) ->
    terminate(Reason, Name, [], Mod, StateName, StateData, Debug).

%% Delegate the upgrade to the callback module; pass through any
%% non-{ok,_,_} result unchanged.
system_code_change([Name, StateName, StateData, Module, Time,
                    HibernateAfterTimeout],
                   _Module, OldVsn, Extra) ->
    case catch Module:code_change(OldVsn, StateName, StateData, Extra)
        of
        {ok, NewStateName, NewStateData} ->
            {ok,
             [Name, NewStateName, NewStateData, Module, Time,
              HibernateAfterTimeout]};
        Else -> Else
    end.

system_get_state([_Name, StateName, StateData, _Mod, _Time,
                  _HibernateAfterTimeout]) ->
    {ok, {StateName, StateData}}.

system_replace_state(StateFun,
                     [Name, StateName, StateData, Mod, Time,
                      HibernateAfterTimeout]) ->
    Result = {NStateName, NStateData} =
                 StateFun({StateName, StateData}),
    {ok, Result,
     [Name, NStateName, NStateData, Mod, Time,
      HibernateAfterTimeout]}.

%%-----------------------------------------------------------------
%% Format debug messages.  Print them as the call-back module sees
%% them, not as the real erlang messages.  Use trace for that.
%%-----------------------------------------------------------------
%% Render one debug event for sys:handle_debug/4.
print_event(Dev, {in, Msg}, {Name, StateName}) ->
    case Msg of
        {'$gen_event', Event} ->
            io:format(Dev, "*DBG* ~tp got event ~tp in state ~tw~n",
                      [Name, Event, StateName]);
        {'$gen_all_state_event', Event} ->
            io:format(Dev,
                      "*DBG* ~tp got all_state_event ~tp in state ~tw~n",
                      [Name, Event, StateName]);
        {timeout, Ref, {'$gen_timer', Message}} ->
            io:format(Dev, "*DBG* ~tp got timer ~tp in state ~tw~n",
                      [Name, {timeout, Ref, Message}, StateName]);
        {timeout, _Ref, {'$gen_event', Event}} ->
            io:format(Dev, "*DBG* ~tp got timer ~tp in state ~tw~n",
                      [Name, Event, StateName]);
        _ ->
            io:format(Dev, "*DBG* ~tp got ~tp in state ~tw~n",
                      [Name, Msg, StateName])
    end;
print_event(Dev, {out, Msg, To, StateName}, Name) ->
    io:format(Dev,
              "*DBG* ~tp sent ~tp to ~tw~n and switched to state ~tw~n",
              [Name, Msg, To, StateName]);
print_event(Dev, return, {Name, StateName}) ->
    io:format(Dev, "*DBG* ~tp switched to state ~tw~n",
              [Name, StateName]).
%% Dispatch one message when no debug tracing is active (Debug =:= []).
handle_msg(Msg, Parent, Name, StateName, StateData, Mod, _Time,
           HibernateAfterTimeout) ->
    From = from(Msg),
    case catch dispatch(Msg, Mod, StateName, StateData) of
        {next_state, NStateName, NStateData} ->
            loop(Parent, Name, NStateName, NStateData, Mod, infinity,
                 HibernateAfterTimeout, []);
        {next_state, NStateName, NStateData, Time1} ->
            loop(Parent, Name, NStateName, NStateData, Mod, Time1,
                 HibernateAfterTimeout, []);
        {reply, Reply, NStateName, NStateData}
            when From =/= undefined ->
            reply(From, Reply),
            loop(Parent, Name, NStateName, NStateData, Mod, infinity,
                 HibernateAfterTimeout, []);
        {reply, Reply, NStateName, NStateData, Time1}
            when From =/= undefined ->
            reply(From, Reply),
            loop(Parent, Name, NStateName, NStateData, Mod, Time1,
                 HibernateAfterTimeout, []);
        {stop, Reason, NStateData} ->
            terminate(Reason, Name, Msg, Mod, StateName, NStateData,
                      []);
        {stop, Reason, Reply, NStateData}
            when From =/= undefined ->
            %% Terminate first, then reply, so the caller only sees the
            %% reply once cleanup has run.
            {'EXIT', R} = (catch terminate(Reason, Name, Msg, Mod,
                                           StateName, NStateData, [])),
            reply(From, Reply),
            exit(R);
        {'EXIT',
         {undef, [{Mod, handle_info, [_, _, _], _} | _]}} ->
            %% Optional callback missing: warn and keep running.
            error_logger:warning_msg("** Undefined handle_info in ~p~n** Unhandled message: ~tp~n",
                                     [Mod, Msg]),
            loop(Parent, Name, StateName, StateData, Mod, infinity,
                 HibernateAfterTimeout, []);
        {'EXIT', What} ->
            terminate(What, Name, Msg, Mod, StateName, StateData, []);
        Reply ->
            terminate({bad_return_value, Reply}, Name, Msg, Mod,
                      StateName, StateData, [])
    end.
%% Dispatch one message with debug tracing; mirrors handle_msg/8 but
%% records state transitions and replies via sys:handle_debug/4.
handle_msg(Msg, Parent, Name, StateName, StateData, Mod, _Time,
           HibernateAfterTimeout, Debug) ->
    From = from(Msg),
    case catch dispatch(Msg, Mod, StateName, StateData) of
        {next_state, NStateName, NStateData} ->
            Debug1 = sys:handle_debug(Debug, fun print_event/3,
                                      {Name, NStateName}, return),
            loop(Parent, Name, NStateName, NStateData, Mod, infinity,
                 HibernateAfterTimeout, Debug1);
        {next_state, NStateName, NStateData, Time1} ->
            Debug1 = sys:handle_debug(Debug, fun print_event/3,
                                      {Name, NStateName}, return),
            loop(Parent, Name, NStateName, NStateData, Mod, Time1,
                 HibernateAfterTimeout, Debug1);
        {reply, Reply, NStateName, NStateData}
            when From =/= undefined ->
            Debug1 = reply(Name, From, Reply, Debug, NStateName),
            loop(Parent, Name, NStateName, NStateData, Mod, infinity,
                 HibernateAfterTimeout, Debug1);
        {reply, Reply, NStateName, NStateData, Time1}
            when From =/= undefined ->
            Debug1 = reply(Name, From, Reply, Debug, NStateName),
            loop(Parent, Name, NStateName, NStateData, Mod, Time1,
                 HibernateAfterTimeout, Debug1);
        {stop, Reason, NStateData} ->
            terminate(Reason, Name, Msg, Mod, StateName, NStateData,
                      Debug);
        {stop, Reason, Reply, NStateData}
            when From =/= undefined ->
            {'EXIT', R} = (catch terminate(Reason, Name, Msg, Mod,
                                           StateName, NStateData,
                                           Debug)),
            _ = reply(Name, From, Reply, Debug, StateName),
            exit(R);
        {'EXIT', What} ->
            terminate(What, Name, Msg, Mod, StateName, StateData,
                      Debug);
        Reply ->
            terminate({bad_return_value, Reply}, Name, Msg, Mod,
                      StateName, StateData, Debug)
    end.
%% Route a decoded message to the proper callback of Module.
dispatch({'$gen_event', Event}, Module, StateName, StateData) ->
    Module:StateName(Event, StateData);
dispatch({'$gen_all_state_event', Event}, Module, StateName,
         StateData) ->
    Module:handle_event(Event, StateName, StateData);
dispatch({'$gen_sync_event', From, Event}, Module, StateName,
         StateData) ->
    Module:StateName(Event, From, StateData);
dispatch({'$gen_sync_all_state_event', From, Event}, Module,
         StateName, StateData) ->
    Module:handle_sync_event(Event, From, StateName, StateData);
dispatch({timeout, Ref, {'$gen_timer', Msg}}, Module, StateName,
         StateData) ->
    Module:StateName({timeout, Ref, Msg}, StateData);
dispatch({timeout, _Ref, {'$gen_event', Event}}, Module, StateName,
         StateData) ->
    Module:StateName(Event, StateData);
dispatch(Info, Module, StateName, StateData) ->
    Module:handle_info(Info, StateName, StateData).

%% Extract the caller for synchronous requests; undefined otherwise.
from({'$gen_sync_event', From, _Event}) -> From;
from({'$gen_sync_all_state_event', From, _Event}) -> From;
from(_) -> undefined.

%% Send a reply to the client.
reply({To, Tag}, Reply) -> catch To ! {Tag, Reply}.

%% Reply and record the reply in the debug log.
reply(Name, {To, Tag}, Reply, Debug, StateName) ->
    reply({To, Tag}, Reply),
    sys:handle_debug(Debug, fun print_event/3, Name,
                     {out, Reply, To, StateName}).

%%% ---------------------------------------------------
%%% Terminate the server.
%%% ---------------------------------------------------

-spec terminate(term(), _, _, atom(), _, _, _) -> no_return().
%% Run the (optional) user terminate/3 callback, report abnormal exits,
%% and finally exit with the appropriate reason.  Never returns.
terminate(Reason, Name, Msg, Module, StateName, StateData, Debug) ->
    case erlang:function_exported(Module, terminate, 3) of
        true ->
            case catch Module:terminate(Reason, StateName, StateData)
                of
                {'EXIT', R} ->
                    FmtStateData = format_status(terminate, Module,
                                                 get(), StateData),
                    error_info(R, Name, Msg, StateName, FmtStateData,
                               Debug),
                    exit(R);
                _ -> ok
            end;
        false -> ok
    end,
    case Reason of
        normal -> exit(normal);
        shutdown -> exit(shutdown);
        {shutdown, _} = Shutdown -> exit(Shutdown);
        _ ->
            %% Abnormal termination: log a crash report first.
            FmtStateData1 = format_status(terminate, Module, get(),
                                          StateData),
            error_info(Reason, Name, Msg, StateName, FmtStateData1,
                       Debug),
            exit(Reason)
    end.

%% Emit the crash report via error_logger (format/2 is -imported).
error_info(Reason, Name, Msg, StateName, StateData, Debug) ->
    %% Refine undef reasons so the report says whether the module could
    %% not be loaded or the function simply is not exported.
    Reason1 = case Reason of
                  {undef, [{M, F, A, L} | MFAs]} ->
                      case code:is_loaded(M) of
                          false ->
                              {'module could not be loaded',
                               [{M, F, A, L} | MFAs]};
                          _ ->
                              case erlang:function_exported(M, F,
                                                            length(A))
                                  of
                                  true -> Reason;
                                  false ->
                                      {'function not exported',
                                       [{M, F, A, L} | MFAs]}
                              end
                      end;
                  _ -> Reason
              end,
    Str = "** State machine ~tp terminating \n" ++
              get_msg_str(Msg) ++
              "** When State == ~tp~n** Data == ~tp~n** Reason for termination = ~n** ~tp~n",
    format(Str,
           [Name, get_msg(Msg), StateName, StateData, Reason1]),
    sys:print_log(Debug),
    ok.

%% Per-message-kind line for the crash report.
get_msg_str({'$gen_event', _Event}) ->
    "** Last event in was ~tp~n";
get_msg_str({'$gen_sync_event', _Event}) ->
    "** Last sync event in was ~tp~n";
get_msg_str({'$gen_all_state_event', _Event}) ->
    "** Last event in was ~tp (for all states)~n";
get_msg_str({'$gen_sync_all_state_event', _Event}) ->
    "** Last sync event in was ~tp (for all states)~n";
get_msg_str({timeout, _Ref, {'$gen_timer', _Msg}}) ->
    "** Last timer event in was ~tp~n";
get_msg_str({timeout, _Ref, {'$gen_event', _Msg}}) ->
    "** Last timer event in was ~tp~n";
get_msg_str(_Msg) -> "** Last message in was ~tp~n".
%% Strip the protocol wrapper off a message for crash reports.
get_msg({'$gen_event', Event}) -> Event;
get_msg({'$gen_sync_event', Event}) -> Event;
get_msg({'$gen_all_state_event', Event}) -> Event;
get_msg({'$gen_sync_all_state_event', Event}) -> Event;
get_msg({timeout, Ref, {'$gen_timer', Msg}}) ->
    {timeout, Ref, Msg};
get_msg({timeout, _Ref, {'$gen_event', Event}}) -> Event;
get_msg(Msg) -> Msg.

%%-----------------------------------------------------------------
%% Status information
%%-----------------------------------------------------------------

-if((?OTP_RELEASE) >= 22).

%% sys:get_status/1 support.
%% BUG FIX: the module-specific status was previously computed twice and
%% the second result was pattern-matched against the first, which raised
%% a badmatch whenever Mod:format_status/2 returned a non-list (and
%% invoked the callback twice even when it did not).  Compute it once
%% and normalize to a list.
format_status(Opt, StatusData) ->
    [PDict, SysState, Parent, Debug,
     [Name, StateName, StateData, Mod, _Time,
      _HibernateAfterTimeout]] =
        StatusData,
    Header =
        gen:format_status_header("Status for state machine", Name),
    Log = sys:get_log(Debug),
    Specific = case format_status(Opt, Mod, PDict, StateData) of
                   S when is_list(S) -> S;
                   S -> [S]
               end,
    [{header, Header},
     {data,
      [{"Status", SysState}, {"Parent", Parent},
       {"Logged events", Log}, {"StateName", StateName}]}
     | Specific].

-elif((?OTP_RELEASE) >= 21).

%% Same as above, but sys:get_log/1 does not exist before OTP 22.
format_status(Opt, StatusData) ->
    [PDict, SysState, Parent, Debug,
     [Name, StateName, StateData, Mod, _Time,
      _HibernateAfterTimeout]] =
        StatusData,
    Header =
        gen:format_status_header("Status for state machine", Name),
    Log = sys:get_debug(log, Debug, []),
    Specific = case format_status(Opt, Mod, PDict, StateData) of
                   S when is_list(S) -> S;
                   S -> [S]
               end,
    [{header, Header},
     {data,
      [{"Status", SysState}, {"Parent", Parent},
       {"Logged events", Log}, {"StateName", StateName}]}
     | Specific].

-endif.
+ +format_status(Opt, Module, PDict, State) -> + DefStatus = case Opt of + terminate -> State; + _ -> [{data, [{"StateData", State}]}] + end, + case erlang:function_exported(Module, format_status, 2) + of + true -> + case catch Module:format_status(Opt, [PDict, State]) of + {'EXIT', _} -> DefStatus; + Else -> Else + end; + _ -> DefStatus + end. diff --git a/src/gen_nb_server.erl b/src/gen_nb_server.erl index caf855f6e..1e1395fd7 100644 --- a/src/gen_nb_server.erl +++ b/src/gen_nb_server.erl @@ -28,153 +28,200 @@ -export([start_link/4]). %% gen_server callbacks --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). -define(SERVER, ?MODULE). --record(state, {cb, - sock, - server_state}). - --callback init(InitArgs::list()) -> - {ok, State::term()} | - {error, Reason::term()}. - --callback handle_call(Msg::term(), From::{pid(), term()}, State::term()) -> - {reply, Reply::term(), State::term()} | - {reply, Reply::term(), State::term(), number() | hibernate} | - {noreply, State::term()} | - {noreply, State::term(), number() | hibernate} | - {stop, Reason::term(), State::term()}. - --callback handle_cast(Msg::term(), State::term()) -> - {noreply, State::term()} | - {noreply, State::term(), number() | hibernate} | - {stop, Reason::term(), State::term()}. - --callback handle_info(Msg::term(), State::term()) -> - {noreply, State::term()} | - {noreply, State::term(), number() | hibernate} | - {stop, Reason::term(), State::term()}. - --callback terminate(Reason::term(), State::term()) -> - ok. +-record(state, {cb, sock, server_state}). + +-callback init(InitArgs :: list()) -> {ok, + State :: term()} | + {error, Reason :: term()}. 
+ +-callback handle_call(Msg :: term(), + From :: {pid(), term()}, State :: term()) -> {reply, + Reply :: + term(), + State :: + term()} | + {reply, + Reply :: + term(), + State :: + term(), + number() | + hibernate} | + {noreply, + State :: + term()} | + {noreply, + State :: + term(), + number() | + hibernate} | + {stop, + Reason :: + term(), + State :: + term()}. + +-callback handle_cast(Msg :: term(), + State :: term()) -> {noreply, State :: term()} | + {noreply, State :: term(), + number() | hibernate} | + {stop, Reason :: term(), + State :: term()}. + +-callback handle_info(Msg :: term(), + State :: term()) -> {noreply, State :: term()} | + {noreply, State :: term(), + number() | hibernate} | + {stop, Reason :: term(), + State :: term()}. + +-callback terminate(Reason :: term(), + State :: term()) -> ok. -callback sock_opts() -> [gen_tcp:listen_option()]. --callback new_connection(inet:socket(), State::term()) -> - {ok, NewState::term()} | - {stop, Reason::term(), NewState::term()}. +-callback new_connection(inet:socket(), + State :: term()) -> {ok, NewState :: term()} | + {stop, Reason :: term(), + NewState :: term()}. -%% @spec start_link(CallbackModule, IpAddr, Port, InitParams) -> Result -%% CallbackModule = atom() +%% @spec start_link(Module, IpAddr, Port, InitParams) -> Result +%% Module = atom() %% IpAddr = string() %% Port = integer() %% InitParams = [any()] %% Result = {ok, pid()} | {error, any()} %% @doc Start server listening on IpAddr:Port -start_link(CallbackModule, IpAddr, Port, InitParams) -> - gen_server:start_link(?MODULE, [CallbackModule, IpAddr, Port, InitParams], []). +start_link(Module, IpAddr, Port, InitParams) -> + gen_server:start_link(?MODULE, + [Module, IpAddr, Port, InitParams], []). 
%% @hidden -init([CallbackModule, IpAddr, Port, InitParams]) -> - case CallbackModule:init(InitParams) of - {ok, ServerState} -> - case listen_on(CallbackModule, IpAddr, Port) of - {ok, Sock} -> - {ok, #state{cb=CallbackModule, sock=Sock, server_state=ServerState}}; - Error -> - CallbackModule:terminate(Error, ServerState), - Error - end; - Err -> - Err - end. +init([Module, IpAddr, Port, InitParams]) -> + case Module:init(InitParams) of + {ok, ServerState} -> + case listen_on(Module, IpAddr, Port) of + {ok, Sock} -> + {ok, + #state{cb = Module, sock = Sock, + server_state = ServerState}}; + Error -> Module:terminate(Error, ServerState), Error + end; + Err -> Err + end. %% @hidden -handle_call(Request, From, #state{cb=Callback, server_state=ServerState}=State) -> - case Callback:handle_call(Request, From, ServerState) of - {reply, Reply, NewServerState} -> - {reply, Reply, State#state{server_state=NewServerState}}; - {reply, Reply, NewServerState, Arg} when Arg =:= hibernate orelse is_number(Arg) -> - {reply, Reply, State#state{server_state=NewServerState}, Arg}; - {noreply, NewServerState} -> - {noreply, State#state{server_state=NewServerState}}; - {noreply, NewServerState, Arg} when Arg =:= hibernate orelse is_number(Arg) -> - {noreply, State#state{server_state=NewServerState}, Arg}; - {stop, Reason, NewServerState} -> - {stop, Reason, State#state{server_state=NewServerState}}; - {stop, Reason, Reply, NewServerState} -> - {stop, Reason, Reply, State#state{server_state=NewServerState}} - end. 
+handle_call(Request, From, + #state{cb = Module, server_state = ServerState} = + State) -> + case Module:handle_call(Request, From, ServerState) of + {reply, Reply, NewServerState} -> + {reply, Reply, + State#state{server_state = NewServerState}}; + {reply, Reply, NewServerState, Arg} + when Arg =:= hibernate orelse is_number(Arg) -> + {reply, Reply, + State#state{server_state = NewServerState}, Arg}; + {noreply, NewServerState} -> + {noreply, State#state{server_state = NewServerState}}; + {noreply, NewServerState, Arg} + when Arg =:= hibernate orelse is_number(Arg) -> + {noreply, State#state{server_state = NewServerState}, + Arg}; + {stop, Reason, NewServerState} -> + {stop, Reason, + State#state{server_state = NewServerState}}; + {stop, Reason, Reply, NewServerState} -> + {stop, Reason, Reply, + State#state{server_state = NewServerState}} + end. %% @hidden -handle_cast(Msg, #state{cb=Callback, server_state=ServerState}=State) -> - case Callback:handle_cast(Msg, ServerState) of - {noreply, NewServerState} -> - {noreply, State#state{server_state=NewServerState}}; - {noreply, NewServerState, Arg} when Arg =:= hibernate orelse is_number(Arg) -> - {noreply, State#state{server_state=NewServerState}, Arg}; - {stop, Reason, NewServerState} -> - {stop, Reason, State#state{server_state=NewServerState}} - end. +handle_cast(Msg, + #state{cb = Module, server_state = ServerState} = + State) -> + case Module:handle_cast(Msg, ServerState) of + {noreply, NewServerState} -> + {noreply, State#state{server_state = NewServerState}}; + {noreply, NewServerState, Arg} + when Arg =:= hibernate orelse is_number(Arg) -> + {noreply, State#state{server_state = NewServerState}, + Arg}; + {stop, Reason, NewServerState} -> + {stop, Reason, + State#state{server_state = NewServerState}} + end. 
%% @hidden -handle_info({inet_async, ListSock, _Ref, {ok, CliSocket}}, #state{cb=Callback, server_state=ServerState}=State) -> - inet_db:register_socket(CliSocket, inet_tcp), - case Callback:new_connection(CliSocket, ServerState) of - {ok, NewServerState} -> - {ok, _} = prim_inet:async_accept(ListSock, -1), - {noreply, State#state{server_state=NewServerState}}; - {stop, Reason, NewServerState} -> - {stop, Reason, State#state{server_state=NewServerState}} - end; - -handle_info(Info, #state{cb=Callback, server_state=ServerState}=State) -> - case Callback:handle_info(Info, ServerState) of - {noreply, NewServerState} -> - {noreply, State#state{server_state=NewServerState}}; - {noreply, NewServerState, Arg} when Arg =:= hibernate orelse is_number(Arg) -> - {noreply, State#state{server_state=NewServerState}, Arg}; - {stop, Reason, NewServerState} -> - {stop, Reason, State#state{server_state=NewServerState}} - end. +handle_info({inet_async, ListSock, _Ref, + {ok, CliSocket}}, + #state{cb = Module, server_state = ServerState} = + State) -> + inet_db:register_socket(CliSocket, inet_tcp), + case Module:new_connection(CliSocket, ServerState) of + {ok, NewServerState} -> + {ok, _} = prim_inet:async_accept(ListSock, -1), + {noreply, State#state{server_state = NewServerState}}; + {stop, Reason, NewServerState} -> + {stop, Reason, + State#state{server_state = NewServerState}} + end; +handle_info(Info, + #state{cb = Module, server_state = ServerState} = + State) -> + case Module:handle_info(Info, ServerState) of + {noreply, NewServerState} -> + {noreply, State#state{server_state = NewServerState}}; + {noreply, NewServerState, Arg} + when Arg =:= hibernate orelse is_number(Arg) -> + {noreply, State#state{server_state = NewServerState}, + Arg}; + {stop, Reason, NewServerState} -> + {stop, Reason, + State#state{server_state = NewServerState}} + end. 
%% @hidden -terminate(Reason, #state{cb=Callback, sock=Sock, server_state=ServerState}) -> - gen_tcp:close(Sock), - Callback:terminate(Reason, ServerState), - ok. +terminate(Reason, + #state{cb = Module, sock = Sock, + server_state = ServerState}) -> + gen_tcp:close(Sock), + Module:terminate(Reason, ServerState), + ok. %% @hidden -code_change(_OldVsn, State, _Extra) -> - {ok, State}. +code_change(_OldVsn, State, _Extra) -> {ok, State}. %% Internal functions %% @hidden -%% @spec listen_on(CallbackModule, IpAddr, Port) -> Result -%% CallbackModule = atom() +%% @spec listen_on(Module, IpAddr, Port) -> Result +%% Module = atom() %% IpAddr = string() | tuple() %% Port = integer() %% Result = {ok, port()} | {error, any()} -listen_on(CallbackModule, IpAddr, Port) when is_tuple(IpAddr) andalso - (8 =:= size(IpAddr) orelse - 4 =:= size(IpAddr)) -> - SockOpts = [{ip, IpAddr}|CallbackModule:sock_opts()], +listen_on(Module, IpAddr, Port) + when is_tuple(IpAddr) andalso + (8 =:= size(IpAddr) orelse 4 =:= size(IpAddr)) -> + SockOpts = [{ip, IpAddr} | Module:sock_opts()], case gen_tcp:listen(Port, SockOpts) of - {ok, LSock} -> - {ok, _Ref} = prim_inet:async_accept(LSock, -1), - {ok, LSock}; - Err -> - Err + {ok, LSock} -> + {ok, _Ref} = prim_inet:async_accept(LSock, -1), + {ok, LSock}; + Err -> Err end; -listen_on(CallbackModule, IpAddrStr, Port) -> +listen_on(Module, IpAddrStr, Port) -> case inet_parse:address(IpAddrStr) of - {ok, IpAddr} -> - listen_on(CallbackModule, IpAddr, Port); - Err -> - logger:critical("Cannot start listener for ~p on invalid address ~p:~p", [CallbackModule, IpAddrStr, Port]), - Err + {ok, IpAddr} -> listen_on(Module, IpAddr, Port); + Err -> + logger:critical("Cannot start listener for ~p\n " + " on invalid address " + "~p:~p", + [Module, IpAddrStr, Port]), + Err end. 
diff --git a/src/riak_core.app.src b/src/riak_core.app.src index 7df7a4e4f..f5f327bd9 100644 --- a/src/riak_core.app.src +++ b/src/riak_core.app.src @@ -1,40 +1,32 @@ -{application,riak_core, - [{description,"Riak Core"}, - {vsn,"4.0.0"}, - {modules,[]}, - {registered,[]}, - {included_applications,[chash, syntax_tools, compiler]}, +{application, riak_core, + [{description, "Riak Core Lite"}, + {vsn,"0.10.1"}, {applications, - [ - kernel,stdlib,crypto,os_mon, - gen_fsm_compat, - basho_stats,poolboy - ]}, - {mod,{riak_core_app,[]}}, + [kernel, stdlib, crypto, os_mon, poolboy] + }, + {mod, {riak_core_app, []}}, {env, - [{cluster_name,"default"}, - {platform_data_dir,"data"}, - {ring_state_dir,"data/ring"}, - {ring_creation_size,64}, + [{cluster_name, "default"}, + {platform_data_dir, "data"}, + {ring_state_dir, "data/ring"}, + {ring_creation_size, 64}, {gossip_interval, 60000}, - {target_n_val,4}, + {target_n_val, 4}, {vnode_inactivity_timeout, 60000}, {vnode_check_interval, 5000}, {vnode_overload_threshold, 10000}, {vnode_modules, []}, {vnode_routing, proxy}, - {handoff_concurrency,2}, + {handoff_concurrency, 2}, {handoff_receive_timeout, 300000}, % 5 minutes TCP timeout {handoff_receive_vnode_timeout, 60000}, % timeout for vnode to process the hand-off data msg - {default_bucket_props, []}, - {handoff_port,8099}, - {handoff_ip,"0.0.0.0"}, - {bucket_fixups, []}, - {bucket_validators, []}, + {handoff_port, 8099}, + {handoff_ip, "0.0.0.0"}, {stat_mods, []}, {health_checks, []} -]}, -{pkg_name,"riak_core"}, - {maintainers,["AntidoteDB","Albert Schimpf"]}, - {licenses,["Apache"]}, - {links,[{"Github","https://github.com/AntidoteDB/riak_core"}]}]}. + ]}, + {pkg_name, "riak_core_lite"}, + {maintainers, ["riak_core_lite", "Albert Schimpf"]}, + {licenses, ["Apache"]}, + {links, [{"Github", "https://github.com/riak-core-lite/riak_core_lite"}]} + ]}. 
diff --git a/src/riak_core.erl b/src/riak_core.erl index c24e235c3..1ee62dbfb 100644 --- a/src/riak_core.erl +++ b/src/riak_core.erl @@ -20,18 +20,27 @@ %% %% ------------------------------------------------------------------- -module(riak_core). --export([stop/0, stop/1, join/1, join/4, staged_join/1, remove/1, down/1, - leave/0, remove_from_cluster/1]). + +-export([stop/0, stop/1, join/1, join/4, staged_join/1, + remove/1, down/1, leave/0, remove_from_cluster/1]). + -export([vnode_modules/0, health_check/1]). --export([register/1, register/2, bucket_fixups/0, bucket_validators/0]). + +-export([register/1, register/2]). + -export([stat_mods/0, stat_prefix/0]). --export([add_guarded_event_handler/3, add_guarded_event_handler/4]). +-export([add_guarded_event_handler/3, + add_guarded_event_handler/4]). + -export([delete_guarded_event_handler/3]). + -export([wait_for_application/1, wait_for_service/1]). --compile({no_auto_import,[register/2]}). --define(WAIT_PRINT_INTERVAL, (60 * 1000)). +-compile({no_auto_import, [{register, 2}]}). + +-define(WAIT_PRINT_INTERVAL, 60 * 1000). + -define(WAIT_POLL_INTERVAL, 100). %% @spec stop() -> ok @@ -39,63 +48,59 @@ stop() -> stop("riak stop requested"). -ifdef(TEST). + stop(Reason) -> logger:notice("~p", [Reason]), % if we're in test mode, we don't want to halt the node, so instead % we just stop the application. application:stop(riak_core). + -else. + stop(Reason) -> % we never do an application:stop because that makes it very hard % to really halt the runtime, which is what we need here. logger:notice("~p", [Reason]), init:stop(). + -endif. %% %% @doc Join the ring found on the specified remote node %% -join(Node) -> - join(Node, false). +join(Node) -> join(Node, false). %% @doc Join the remote cluster without automatically claiming ring %% ownership. Used to stage a join in the newer plan/commit %% approach to cluster administration. See {@link riak_core_claimant} -staged_join(Node) -> - join(Node, false). 
+staged_join(Node) -> join(Node, false). join(NodeStr, Auto) when is_list(NodeStr) -> join(riak_core_util:str_to_node(NodeStr), Auto); join(Node, Auto) when is_atom(Node) -> join(node(), Node, Auto). -join(Node, Node, _) -> - {error, self_join}; -join(_, Node, Auto) -> - join(node(), Node, false, Auto). +join(Node, Node, _) -> {error, self_join}; +join(_, Node, Auto) -> join(node(), Node, false, Auto). join(_, Node, Rejoin, Auto) -> case net_adm:ping(Node) of - pang -> - {error, not_reachable}; - pong -> - standard_join(Node, Rejoin, Auto) + pang -> {error, not_reachable}; + pong -> standard_join(Node, Rejoin, Auto) end. get_other_ring(Node) -> - riak_core_util:safe_rpc(Node, riak_core_ring_manager, get_raw_ring, []). + riak_core_util:safe_rpc(Node, riak_core_ring_manager, + get_raw_ring, []). standard_join(Node, Rejoin, Auto) when is_atom(Node) -> case net_adm:ping(Node) of - pong -> - case get_other_ring(Node) of - {ok, Ring} -> - standard_join(Node, Ring, Rejoin, Auto); - _ -> - {error, unable_to_get_join_ring} - end; - pang -> - {error, not_reachable} + pong -> + case get_other_ring(Node) of + {ok, Ring} -> standard_join(Node, Ring, Rejoin, Auto); + _ -> {error, unable_to_get_join_ring} + end; + pang -> {error, not_reachable} end. %% `init:get_status/0' will return a 2-tuple reflecting the init @@ -103,254 +108,202 @@ standard_join(Node, Rejoin, Auto) when is_atom(Node) -> %% `started', or `stopping'. We only want to allow join actions if all %% applications have finished starting to avoid ring status race %% conditions. -init_complete({started, _}) -> - true; -init_complete(_) -> - false. +init_complete({started, _}) -> true; +init_complete(_) -> false. 
standard_join(Node, Ring, Rejoin, Auto) -> {ok, MyRing} = riak_core_ring_manager:get_raw_ring(), - InitComplete = init_complete(init:get_status()), - - SameSize = (riak_core_ring:num_partitions(MyRing) =:= - riak_core_ring:num_partitions(Ring)), - Singleton = ([node()] =:= riak_core_ring:all_members(MyRing)), + SameSize = riak_core_ring:num_partitions(MyRing) =:= + riak_core_ring:num_partitions(Ring), + Singleton = [node()] =:= + riak_core_ring:all_members(MyRing), case {InitComplete, Rejoin or Singleton, SameSize} of - {false, _, _} -> - {error, node_still_starting}; - {_, false, _} -> - {error, not_single_node}; - {_, _, false} -> - {error, different_ring_sizes}; - _ -> - Ring2 = riak_core_ring:add_member(node(), Ring, - node()), - Ring3 = riak_core_ring:set_owner(Ring2, node()), - Ring4 = - riak_core_ring:update_member_meta(node(), - Ring3, - node(), - gossip_vsn, - 2), - Ring5 = Ring4, - Ring6 = maybe_auto_join(Auto, node(), Ring5), - riak_core_ring_manager:set_my_ring(Ring6), - riak_core_gossip:send_ring(Node, node()) + {false, _, _} -> {error, node_still_starting}; + {_, false, _} -> {error, not_single_node}; + {_, _, false} -> {error, different_ring_sizes}; + _ -> + Ring2 = riak_core_ring:add_member(node(), Ring, node()), + Ring3 = riak_core_ring:set_owner(Ring2, node()), + Ring4 = riak_core_ring:update_member_meta(node(), Ring3, + node(), gossip_vsn, 2), + Ring5 = Ring4, + Ring6 = maybe_auto_join(Auto, node(), Ring5), + riak_core_ring_manager:set_my_ring(Ring6), + riak_core_gossip:send_ring(Node, node()) end. -maybe_auto_join(false, _Node, Ring) -> - Ring; +maybe_auto_join(false, _Node, Ring) -> Ring; maybe_auto_join(true, Node, Ring) -> - riak_core_ring:update_member_meta(Node, Ring, Node, '$autojoin', true). + riak_core_ring:update_member_meta(Node, Ring, Node, + '$autojoin', true). 
remove(Node) -> {ok, Ring} = riak_core_ring_manager:get_raw_ring(), case {riak_core_ring:all_members(Ring), - riak_core_ring:member_status(Ring, Node)} of - {_, invalid} -> - {error, not_member}; - {[Node], _} -> - {error, only_member}; - _ -> - standard_remove(Node) + riak_core_ring:member_status(Ring, Node)} + of + {_, invalid} -> {error, not_member}; + {[Node], _} -> {error, only_member}; + _ -> standard_remove(Node) end. standard_remove(Node) -> - riak_core_ring_manager:ring_trans( - fun(Ring2, _) -> - Ring3 = riak_core_ring:remove_member(node(), Ring2, Node), - Ring4 = riak_core_ring:ring_changed(node(), Ring3), - {new_ring, Ring4} - end, []), + riak_core_ring_manager:ring_trans(fun (Ring2, _) -> + Ring3 = + riak_core_ring:remove_member(node(), + Ring2, + Node), + Ring4 = + riak_core_ring:ring_changed(node(), + Ring3), + {new_ring, Ring4} + end, + []), ok. down(Node) -> {ok, Ring} = riak_core_ring_manager:get_raw_ring(), case net_adm:ping(Node) of - pong -> - {error, is_up}; - pang -> - case {riak_core_ring:all_members(Ring), - riak_core_ring:member_status(Ring, Node)} of - {_, invalid} -> - {error, not_member}; - {[Node], _} -> - {error, only_member}; - _ -> - riak_core_ring_manager:ring_trans( - fun(Ring2, _) -> - Ring3 = riak_core_ring:down_member(node(), Ring2, Node), - Ring4 = riak_core_ring:ring_changed(node(), Ring3), - {new_ring, Ring4} - end, []), - ok - end + pong -> {error, is_up}; + pang -> + case {riak_core_ring:all_members(Ring), + riak_core_ring:member_status(Ring, Node)} + of + {_, invalid} -> {error, not_member}; + {[Node], _} -> {error, only_member}; + _ -> + riak_core_ring_manager:ring_trans(fun (Ring2, _) -> + Ring3 = + riak_core_ring:down_member(node(), + Ring2, + Node), + Ring4 = + riak_core_ring:ring_changed(node(), + Ring3), + {new_ring, Ring4} + end, + []), + ok + end end. 
leave() -> Node = node(), {ok, Ring} = riak_core_ring_manager:get_raw_ring(), case {riak_core_ring:all_members(Ring), - riak_core_ring:member_status(Ring, Node)} of - {_, invalid} -> - {error, not_member}; - {[Node], _} -> - {error, only_member}; - {_, valid} -> - standard_leave(Node); - {_, _} -> - {error, already_leaving} + riak_core_ring:member_status(Ring, Node)} + of + {_, invalid} -> {error, not_member}; + {[Node], _} -> {error, only_member}; + {_, valid} -> standard_leave(Node); + {_, _} -> {error, already_leaving} end. standard_leave(Node) -> - riak_core_ring_manager:ring_trans( - fun(Ring2, _) -> - Ring3 = riak_core_ring:leave_member(Node, Ring2, Node), - {new_ring, Ring3} - end, []), + riak_core_ring_manager:ring_trans(fun (Ring2, _) -> + Ring3 = + riak_core_ring:leave_member(Node, + Ring2, + Node), + {new_ring, Ring3} + end, + []), ok. %% @spec remove_from_cluster(ExitingNode :: atom()) -> term() %% @doc Cause all partitions owned by ExitingNode to be taken over %% by other nodes. -remove_from_cluster(ExitingNode) when is_atom(ExitingNode) -> +remove_from_cluster(ExitingNode) + when is_atom(ExitingNode) -> remove(ExitingNode). vnode_modules() -> case application:get_env(riak_core, vnode_modules) of - undefined -> []; - {ok, Mods} -> Mods - end. - -bucket_fixups() -> - case application:get_env(riak_core, bucket_fixups) of - undefined -> []; - {ok, Mods} -> Mods - end. - -bucket_validators() -> - case application:get_env(riak_core, bucket_validators) of - undefined -> []; - {ok, Mods} -> Mods + undefined -> []; + {ok, Mods} -> Mods end. stat_mods() -> case application:get_env(riak_core, stat_mods) of - undefined -> []; - {ok, Mods} -> Mods + undefined -> []; + {ok, Mods} -> Mods end. 
health_check(App) -> case application:get_env(riak_core, health_checks) of - undefined -> - undefined; - {ok, Mods} -> - case lists:keyfind(App, 1, Mods) of - false -> - undefined; - {App, MFA} -> - MFA - end + undefined -> undefined; + {ok, Mods} -> + case lists:keyfind(App, 1, Mods) of + false -> undefined; + {App, MFA} -> MFA + end end. %% Get the application name if not supplied, first by get_application %% then by searching by module name get_app(undefined, Module) -> {ok, App} = case application:get_application(self()) of - {ok, AppName} -> {ok, AppName}; - undefined -> app_for_module(Module) + {ok, AppName} -> {ok, AppName}; + undefined -> app_for_module(Module) end, App; -get_app(App, _Module) -> - App. +get_app(App, _Module) -> App. %% @doc Register a riak_core application. -register(Props) -> - register(undefined, Props). +register(Props) -> register(undefined, Props). %% @doc Register a named riak_core application. register(_App, []) -> %% Once the app is registered, do a no-op ring trans %% to ensure the new fixups are run against %% the ring. 
- {ok, _R} = riak_core_ring_manager:ring_trans(fun(R,_A) -> {new_ring, R} end, + {ok, _R} = riak_core_ring_manager:ring_trans(fun (R, + _A) -> + {new_ring, R} + end, undefined), riak_core_ring_events:force_sync_update(), ok; -register(App, [{bucket_fixup, FixupMod}|T]) -> - register_mod(get_app(App, FixupMod), FixupMod, bucket_fixups), - register(App, T); -register(App, [{repl_helper, FixupMod}|T]) -> - register_mod(get_app(App, FixupMod), FixupMod, repl_helper), - register(App, T); -register(App, [{vnode_module, VNodeMod}|T]) -> - register_mod(get_app(App, VNodeMod), VNodeMod, vnode_modules), - register(App, T); -register(App, [{health_check, HealthMFA}|T]) -> - register_metadata(get_app(App, HealthMFA), HealthMFA, health_checks), - register(App, T); -register(App, [{bucket_validator, ValidationMod}|T]) -> - register_mod(get_app(App, ValidationMod), ValidationMod, bucket_validators), - register(App, T); -register(App, [{stat_mod, StatMod}|T]) -> - register_mod(App, StatMod, stat_mods), - register(App, T); -register(App, [{permissions, Permissions}|T]) -> - register_mod(App, Permissions, permissions), +register(App, [{vnode_module, VNodeMod} | T]) -> + register_mod(get_app(App, VNodeMod), VNodeMod, + vnode_modules), register(App, T); -register(App, [{auth_mod, {AuthType, AuthMod}}|T]) -> - register_proplist({AuthType, AuthMod}, auth_mods), +register(App, [{health_check, HealthMFA} | T]) -> + register_metadata(get_app(App, HealthMFA), HealthMFA, + health_checks), register(App, T). 
register_mod(App, Module, Type) when is_atom(Type) -> case Type of - vnode_modules -> - riak_core_vnode_proxy_sup:start_proxies(Module); - stat_mods -> - %% STATS -%% riak_core_stats_sup:start_server(Module); - logger:warning("Metric collection disabled"), - ok; - _ -> - ok + vnode_modules -> + riak_core_vnode_proxy_sup:start_proxies(Module) end, case application:get_env(riak_core, Type) of - undefined -> - application:set_env(riak_core, Type, [{App,Module}]); - {ok, Mods} -> - application:set_env(riak_core, Type, - lists:usort([{App,Module}|Mods])) + undefined -> + application:set_env(riak_core, Type, [{App, Module}]); + {ok, Mods} -> + application:set_env(riak_core, Type, + lists:usort([{App, Module} | Mods])) end. register_metadata(App, Value, Type) -> case application:get_env(riak_core, Type) of - undefined -> - application:set_env(riak_core, Type, [{App,Value}]); - {ok, Values} -> - application:set_env(riak_core, Type, - lists:usort([{App,Value}|Values])) + undefined -> + application:set_env(riak_core, Type, [{App, Value}]); + {ok, Values} -> + application:set_env(riak_core, Type, + lists:usort([{App, Value} | Values])) end. -register_proplist({Key, Value}, Type) -> - case application:get_env(riak_core, Type) of - undefined -> - application:set_env(riak_core, Type, [{Key, Value}]); - {ok, Values} -> - application:set_env(riak_core, Type, lists:keystore(Key, 1, - Values, - {Key, Value})) - end. - - - %% @spec add_guarded_event_handler(HandlerMod, Handler, Args) -> AddResult %% HandlerMod = module() %% Handler = module() | {module(), term()} %% Args = list() %% AddResult = ok | {error, Reason::term()} add_guarded_event_handler(HandlerMod, Handler, Args) -> - add_guarded_event_handler(HandlerMod, Handler, Args, undefined). + add_guarded_event_handler(HandlerMod, Handler, Args, + undefined). 
%% @spec add_guarded_event_handler(HandlerMod, Handler, Args, ExitFun) -> AddResult %% HandlerMod = module() @@ -364,8 +317,10 @@ add_guarded_event_handler(HandlerMod, Handler, Args) -> %% (riak_core_eventhandler_guard) that adds a supervised handler in its %% init() callback and exits when the handler crashes so it can be %% restarted by the supervisor. -add_guarded_event_handler(HandlerMod, Handler, Args, ExitFun) -> - riak_core_eventhandler_sup:start_guarded_handler(HandlerMod, Handler, Args, ExitFun). +add_guarded_event_handler(HandlerMod, Handler, Args, + ExitFun) -> + riak_core_eventhandler_sup:start_guarded_handler(HandlerMod, + Handler, Args, ExitFun). %% @spec delete_guarded_event_handler(HandlerMod, Handler, Args) -> Result %% HandlerMod = module() @@ -383,60 +338,77 @@ add_guarded_event_handler(HandlerMod, Handler, Args, ExitFun) -> %% specified event handler is not installed, the function returns %% {error,module_not_found}. If the callback function fails with Reason, %% the function returns {'EXIT',Reason}. -delete_guarded_event_handler(HandlerMod, Handler, Args) -> - riak_core_eventhandler_sup:stop_guarded_handler(HandlerMod, Handler, Args). +delete_guarded_event_handler(HandlerMod, Handler, + Args) -> + riak_core_eventhandler_sup:stop_guarded_handler(HandlerMod, + Handler, Args). app_for_module(Mod) -> app_for_module(application:which_applications(), Mod). -app_for_module([], _Mod) -> - {ok, undefined}; -app_for_module([{App,_,_}|T], Mod) -> +app_for_module([], _Mod) -> {ok, undefined}; +app_for_module([{App, _, _} | T], Mod) -> {ok, Mods} = application:get_key(App, modules), case lists:member(Mod, Mods) of - true -> {ok, App}; - false -> app_for_module(T, Mod) + true -> {ok, App}; + false -> app_for_module(T, Mod) end. - wait_for_application(App) -> wait_for_application(App, 0). 
+ wait_for_application(App, Elapsed) -> - case lists:keymember(App, 1, application:which_applications()) of - true when Elapsed == 0 -> - ok; - true when Elapsed > 0 -> - logger:info("Wait complete for application ~p (~p seconds)", [App, Elapsed div 1000]), - ok; - false -> - %% Possibly print a notice. - ShouldPrint = Elapsed rem ?WAIT_PRINT_INTERVAL == 0, - case ShouldPrint of - true -> logger:info("Waiting for application ~p to start (~p seconds).", [App, Elapsed div 1000]); - false -> skip - end, - timer:sleep(?WAIT_POLL_INTERVAL), - wait_for_application(App, Elapsed + ?WAIT_POLL_INTERVAL) + case lists:keymember(App, 1, + application:which_applications()) + of + true when Elapsed == 0 -> ok; + true when Elapsed > 0 -> + logger:info("Wait complete for application ~p (~p " + "seconds)", + [App, Elapsed div 1000]), + ok; + false -> + %% Possibly print a notice. + ShouldPrint = Elapsed rem (?WAIT_PRINT_INTERVAL) == 0, + case ShouldPrint of + true -> + logger:info("Waiting for application ~p to start\n " + " " + "(~p seconds).", + [App, Elapsed div 1000]); + false -> skip + end, + timer:sleep(?WAIT_POLL_INTERVAL), + wait_for_application(App, + Elapsed + (?WAIT_POLL_INTERVAL)) end. wait_for_service(Service) -> wait_for_service(Service, 0). + wait_for_service(Service, Elapsed) -> - case lists:member(Service, riak_core_node_watcher:services(node())) of - true when Elapsed == 0 -> - ok; - true when Elapsed > 0 -> - logger:info("Wait complete for service ~p (~p seconds)", [Service, Elapsed div 1000]), - ok; - false -> - %% Possibly print a notice. 
- ShouldPrint = Elapsed rem ?WAIT_PRINT_INTERVAL == 0, - case ShouldPrint of - true -> logger:info("Waiting for service ~p to start (~p seconds)", [Service, Elapsed div 1000]); - false -> skip - end, - timer:sleep(?WAIT_POLL_INTERVAL), - wait_for_service(Service, Elapsed + ?WAIT_POLL_INTERVAL) + case lists:member(Service, + riak_core_node_watcher:services(node())) + of + true when Elapsed == 0 -> ok; + true when Elapsed > 0 -> + logger:info("Wait complete for service ~p (~p seconds)", + [Service, Elapsed div 1000]), + ok; + false -> + %% Possibly print a notice. + ShouldPrint = Elapsed rem (?WAIT_PRINT_INTERVAL) == 0, + case ShouldPrint of + true -> + logger:info("Waiting for service ~p to start\n " + " (~p " + "seconds)", + [Service, Elapsed div 1000]); + false -> skip + end, + timer:sleep(?WAIT_POLL_INTERVAL), + wait_for_service(Service, + Elapsed + (?WAIT_POLL_INTERVAL)) end. stat_prefix() -> diff --git a/src/riak_core_apl.erl b/src/riak_core_apl.erl index b76e40796..df4778746 100644 --- a/src/riak_core_apl.erl +++ b/src/riak_core_apl.erl @@ -23,44 +23,61 @@ %% substituted. %% ------------------------------------------------------------------- -module(riak_core_apl). --export([active_owners/1, active_owners/2, - get_apl/3, get_apl/4, - get_apl_ann/2, get_apl_ann/3, get_apl_ann/4, - get_apl_ann_with_pnum/1, - get_primary_apl/3, get_primary_apl/4, - get_primary_apl_chbin/4, - first_up/2, offline_owners/1, offline_owners/2 - ]). --export_type([preflist/0, preflist_ann/0, preflist_with_pnum_ann/0]). +-export([active_owners/1, active_owners/2, get_apl/3, + get_apl/4, get_apl_ann/2, get_apl_ann/3, get_apl_ann/4, + get_apl_ann_with_pnum/1, get_primary_apl/3, + get_primary_apl/4, get_primary_apl_chbin/4, first_up/2, + offline_owners/1, offline_owners/2]). + +-export_type([preflist/0, preflist_ann/0, + preflist_with_pnum_ann/0]). -ifdef(TEST). + -include_lib("eunit/include/eunit.hrl"). + -endif. +-type bucket() :: binary() | {binary(), binary()}. 
+ -type index() :: chash:index_as_int(). + -type n_val() :: non_neg_integer(). + -type ring() :: riak_core_ring:riak_core_ring(). + -type preflist() :: [{index(), node()}]. --type preflist_ann() :: [{{index(), node()}, primary|fallback}]. + +-type preflist_ann() :: [{{index(), node()}, + primary | fallback}]. + %% @type preflist_with_pnum_ann(). %% Annotated preflist where the partition value is an id/number %% (0 to ring_size-1) instead of a hash. --type preflist_with_pnum_ann() :: [{{riak_core_ring:partition_id(), node()}, - primary|fallback}]. +-type + preflist_with_pnum_ann() :: [{{riak_core_ring:partition_id(), + node()}, + primary | fallback}]. + -type iterator() :: term(). + -type chashbin() :: term(). + -type docidx() :: chash:index(). %% @doc Return preflist of all active primary nodes (with no %% substituion of fallbacks). Used to simulate a %% preflist with N=ring_size. -spec active_owners(atom()) -> preflist_ann(). + active_owners(Service) -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), - active_owners(Ring, riak_core_node_watcher:nodes(Service)). + active_owners(Ring, + riak_core_node_watcher:nodes(Service)). -spec active_owners(ring(), [node()]) -> preflist_ann(). + active_owners(Ring, UpNodes) -> UpNodes1 = UpNodes, Primaries = riak_core_ring:all_owners(Ring), @@ -69,23 +86,31 @@ active_owners(Ring, UpNodes) -> %% @doc Get the active preflist taking account of which nodes are up. -spec get_apl(docidx(), n_val(), atom()) -> preflist(). + get_apl(DocIdx, N, Service) -> {ok, CHBin} = riak_core_ring_manager:get_chash_bin(), - get_apl_chbin(DocIdx, N, CHBin, riak_core_node_watcher:nodes(Service)). + get_apl_chbin(DocIdx, N, CHBin, + riak_core_node_watcher:nodes(Service)). %% @doc Get the active preflist taking account of which nodes are up %% for a given chash/upnodes list. --spec get_apl_chbin(docidx(), n_val(), chashbin:chashbin(), [node()]) -> preflist(). +-spec get_apl_chbin(docidx(), n_val(), + chashbin:chashbin(), [node()]) -> preflist(). 
+ get_apl_chbin(DocIdx, N, CHBin, UpNodes) -> - [{Partition, Node} || {{Partition, Node}, _Type} <- - get_apl_ann_chbin(DocIdx, N, CHBin, UpNodes)]. + [{Partition, Node} + || {{Partition, Node}, _Type} + <- get_apl_ann_chbin(DocIdx, N, CHBin, UpNodes)]. %% @doc Get the active preflist taking account of which nodes are up %% for a given ring/upnodes list. --spec get_apl(docidx(), n_val(), ring(), [node()]) -> preflist(). +-spec get_apl(docidx(), n_val(), ring(), + [node()]) -> preflist(). + get_apl(DocIdx, N, Ring, UpNodes) -> - [{Partition, Node} || {{Partition, Node}, _Type} <- - get_apl_ann(DocIdx, N, Ring, UpNodes)]. + [{Partition, Node} + || {{Partition, Node}, _Type} + <- get_apl_ann(DocIdx, N, Ring, UpNodes)]. %% @doc Get the active preflist taking account of which nodes are up for a given %% chash/upnodes list and annotate each node with type of primary/fallback. @@ -96,7 +121,9 @@ get_apl_ann(DocIdx, N, UpNodes) -> %% @doc Get the active preflist taking account of which nodes are up %% for a given ring/upnodes list and annotate each node with type of %% primary/fallback. --spec get_apl_ann(binary(), n_val(), ring(), [node()]) -> preflist_ann(). +-spec get_apl_ann(binary(), n_val(), ring(), + [node()]) -> preflist_ann(). + get_apl_ann(DocIdx, N, Ring, UpNodes) -> UpNodes1 = UpNodes, Preflist = riak_core_ring:preflist(DocIdx, Ring), @@ -106,17 +133,20 @@ get_apl_ann(DocIdx, N, Ring, UpNodes) -> %% @doc Get the active preflist for a given {bucket, key} and list of nodes %% and annotate each node with type of primary/fallback. --spec get_apl_ann(riak_core_bucket:bucket(), [node()]) -> preflist_ann(). +-spec get_apl_ann(bucket(), [node()]) -> preflist_ann(). + get_apl_ann({Bucket, Key}, UpNodes) -> - BucketProps = riak_core_bucket:get_bucket(Bucket), - NVal = proplists:get_value(n_val, BucketProps), + {ok, NVal} = application:get_env(riak_core, + target_n_val), DocIdx = riak_core_util:chash_key({Bucket, Key}), get_apl_ann(DocIdx, NVal, UpNodes). 
%% @doc Get the active preflist taking account of which nodes are up %% for a given {bucket, key} and annotate each node with type of %% primary/fallback --spec get_apl_ann_with_pnum(riak_core_bucket:bucket()) -> preflist_with_pnum_ann(). +-spec + get_apl_ann_with_pnum(bucket()) -> preflist_with_pnum_ann(). + get_apl_ann_with_pnum(BKey) -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), UpNodes = riak_core_ring:all_members(Ring), @@ -127,7 +157,9 @@ get_apl_ann_with_pnum(BKey) -> %% @doc Get the active preflist taking account of which nodes are up %% for a given chash/upnodes list and annotate each node with type of %% primary/fallback. --spec get_apl_ann_chbin(binary(), n_val(), chashbin(), [node()]) -> preflist_ann(). +-spec get_apl_ann_chbin(binary(), n_val(), chashbin(), + [node()]) -> preflist_ann(). + get_apl_ann_chbin(DocIdx, N, CHBin, UpNodes) -> UpNodes1 = UpNodes, Itr = chashbin:iterator(DocIdx, CHBin), @@ -136,13 +168,18 @@ get_apl_ann_chbin(DocIdx, N, CHBin, UpNodes) -> Up ++ find_fallbacks_chbin(Pangs, Itr2, UpNodes1, []). %% @doc Same as get_apl, but returns only the primaries. --spec get_primary_apl(binary(), n_val(), atom()) -> preflist_ann(). +-spec get_primary_apl(binary(), n_val(), + atom()) -> preflist_ann(). + get_primary_apl(DocIdx, N, Service) -> {ok, CHBin} = riak_core_ring_manager:get_chash_bin(), - get_primary_apl_chbin(DocIdx, N, CHBin, riak_core_node_watcher:nodes(Service)). + get_primary_apl_chbin(DocIdx, N, CHBin, + riak_core_node_watcher:nodes(Service)). %% @doc Same as get_apl, but returns only the primaries. --spec get_primary_apl_chbin(binary(), n_val(), chashbin(), [node()]) -> preflist_ann(). +-spec get_primary_apl_chbin(binary(), n_val(), + chashbin(), [node()]) -> preflist_ann(). + get_primary_apl_chbin(DocIdx, N, CHBin, UpNodes) -> UpNodes1 = UpNodes, Itr = chashbin:iterator(DocIdx, CHBin), @@ -151,7 +188,9 @@ get_primary_apl_chbin(DocIdx, N, CHBin, UpNodes) -> Up. %% @doc Same as get_apl, but returns only the primaries. 
--spec get_primary_apl(binary(), n_val(), ring(), [node()]) -> preflist_ann(). +-spec get_primary_apl(binary(), n_val(), ring(), + [node()]) -> preflist_ann(). + get_primary_apl(DocIdx, N, Ring, UpNodes) -> UpNodes1 = UpNodes, Preflist = riak_core_ring:preflist(DocIdx, Ring), @@ -164,10 +203,12 @@ get_primary_apl(DocIdx, N, Ring, UpNodes) -> first_up(DocIdx, Service) -> {ok, CHBin} = riak_core_ring_manager:get_chash_bin(), Itr = chashbin:iterator(DocIdx, CHBin), - UpSet = ordsets:from_list(riak_core_node_watcher:nodes(Service)), - Itr2 = chashbin:itr_next_while(fun({_P, Node}) -> + UpSet = + ordsets:from_list(riak_core_node_watcher:nodes(Service)), + Itr2 = chashbin:itr_next_while(fun ({_P, Node}) -> not ordsets:is_element(Node, UpSet) - end, Itr), + end, + Itr), chashbin:itr_value(Itr2). offline_owners(Service) -> @@ -175,108 +216,138 @@ offline_owners(Service) -> offline_owners(Service, CHBin). offline_owners(Service, CHBin) when is_atom(Service) -> - UpSet = ordsets:from_list(riak_core_node_watcher:nodes(Service)), + UpSet = + ordsets:from_list(riak_core_node_watcher:nodes(Service)), offline_owners(UpSet, CHBin); offline_owners(UpSet, CHBin) when is_list(UpSet) -> %% UpSet is an ordset of available nodes - DownVNodes = chashbin:to_list_filter(fun({_Index, Node}) -> + DownVNodes = chashbin:to_list_filter(fun ({_Index, + Node}) -> not is_up(Node, UpSet) - end, CHBin), + end, + CHBin), DownVNodes. %% @doc Split a preference list into up and down lists. --spec check_up(preflist(), [node()], preflist_ann(), preflist()) -> {preflist_ann(), preflist()}. +-spec check_up(preflist(), [node()], preflist_ann(), + preflist()) -> {preflist_ann(), preflist()}. 
+ check_up([], _UpNodes, Up, Pangs) -> {lists:reverse(Up), lists:reverse(Pangs)}; -check_up([{Partition,Node}|Rest], UpNodes, Up, Pangs) -> +check_up([{Partition, Node} | Rest], UpNodes, Up, + Pangs) -> case is_up(Node, UpNodes) of - true -> - check_up(Rest, UpNodes, [{{Partition, Node}, primary} | Up], Pangs); - false -> - check_up(Rest, UpNodes, Up, [{Partition, Node} | Pangs]) + true -> + check_up(Rest, UpNodes, + [{{Partition, Node}, primary} | Up], Pangs); + false -> + check_up(Rest, UpNodes, Up, [{Partition, Node} | Pangs]) end. %% @doc Find fallbacks for downed nodes in the preference list. --spec find_fallbacks(preflist(), preflist(), [node()], preflist_ann()) -> preflist_ann(). +-spec find_fallbacks(preflist(), preflist(), [node()], + preflist_ann()) -> preflist_ann(). + find_fallbacks(_Pangs, [], _UpNodes, Secondaries) -> lists:reverse(Secondaries); find_fallbacks([], _Fallbacks, _UpNodes, Secondaries) -> lists:reverse(Secondaries); -find_fallbacks([{Partition, _Node}|Rest]=Pangs, [{_,FN}|Fallbacks], UpNodes, Secondaries) -> +find_fallbacks([{Partition, _Node} | Rest] = Pangs, + [{_, FN} | Fallbacks], UpNodes, Secondaries) -> case is_up(FN, UpNodes) of - true -> - find_fallbacks(Rest, Fallbacks, UpNodes, - [{{Partition, FN}, fallback} | Secondaries]); - false -> - find_fallbacks(Pangs, Fallbacks, UpNodes, Secondaries) + true -> + find_fallbacks(Rest, Fallbacks, UpNodes, + [{{Partition, FN}, fallback} | Secondaries]); + false -> + find_fallbacks(Pangs, Fallbacks, UpNodes, Secondaries) end. %% @doc Find fallbacks for downed nodes in the preference list. --spec find_fallbacks_chbin(preflist(), iterator(),[node()], preflist_ann()) -> preflist_ann(). -find_fallbacks_chbin([], _Fallbacks, _UpNodes, Secondaries) -> +-spec find_fallbacks_chbin(preflist(), iterator(), + [node()], preflist_ann()) -> preflist_ann(). 
+ +find_fallbacks_chbin([], _Fallbacks, _UpNodes, + Secondaries) -> lists:reverse(Secondaries); find_fallbacks_chbin(_, done, _UpNodes, Secondaries) -> lists:reverse(Secondaries); -find_fallbacks_chbin([{Partition, _Node}|Rest]=Pangs, Itr, UpNodes, Secondaries) -> +find_fallbacks_chbin([{Partition, _Node} | Rest] = + Pangs, + Itr, UpNodes, Secondaries) -> {_, FN} = chashbin:itr_value(Itr), Itr2 = chashbin:itr_next(Itr), case is_up(FN, UpNodes) of - true -> - find_fallbacks_chbin(Rest, Itr2, UpNodes, - [{{Partition, FN}, fallback} | Secondaries]); - false -> - find_fallbacks_chbin(Pangs, Itr2, UpNodes, Secondaries) + true -> + find_fallbacks_chbin(Rest, Itr2, UpNodes, + [{{Partition, FN}, fallback} | Secondaries]); + false -> + find_fallbacks_chbin(Pangs, Itr2, UpNodes, Secondaries) end. %% @doc Return true if a node is up. -is_up(Node, UpNodes) -> - lists:member(Node, UpNodes). +is_up(Node, UpNodes) -> lists:member(Node, UpNodes). %% @doc Return annotated preflist with partition ids/nums instead of hashes. --spec apl_with_partition_nums(preflist_ann(), riak_core_ring:ring_size()) -> - preflist_with_pnum_ann(). +-spec apl_with_partition_nums(preflist_ann(), + riak_core_ring:ring_size()) -> preflist_with_pnum_ann(). + apl_with_partition_nums(Apl, Size) -> - [{{riak_core_ring_util:hash_to_partition_id(Hash, Size), Node}, Ann} || - {{Hash, Node}, Ann} <- Apl]. + [{{riak_core_ring_util:hash_to_partition_id(Hash, Size), + Node}, + Ann} + || {{Hash, Node}, Ann} <- Apl]. -ifdef(TEST). smallest_test() -> - Ring = riak_core_ring:fresh(1,node()), - ?assertEqual([{0,node()}], get_apl(last_in_ring(), 1, Ring, [node()])). + Ring = riak_core_ring:fresh(1, node()), + ?assertEqual([{0, node()}], + (get_apl(last_in_ring(), 1, Ring, [node()]))). 
four_node_test() -> Nodes = [nodea, nodeb, nodec, noded], Ring = perfect_ring(8, Nodes), - ?assertEqual([{0,nodea}, - {182687704666362864775460604089535377456991567872,nodeb}, - {365375409332725729550921208179070754913983135744,nodec}], - get_apl(last_in_ring(), 3, Ring, Nodes)), + ?assertEqual([{0, nodea}, + {182687704666362864775460604089535377456991567872, + nodeb}, + {365375409332725729550921208179070754913983135744, + nodec}], + (get_apl(last_in_ring(), 3, Ring, Nodes))), %% With a node down - ?assertEqual([{182687704666362864775460604089535377456991567872,nodeb}, - {365375409332725729550921208179070754913983135744,nodec}, - {0,noded}], - get_apl(last_in_ring(), 3, Ring, [nodeb, nodec, noded])), + ?assertEqual([{182687704666362864775460604089535377456991567872, + nodeb}, + {365375409332725729550921208179070754913983135744, + nodec}, + {0, noded}], + (get_apl(last_in_ring(), 3, Ring, + [nodeb, nodec, noded]))), %% With two nodes down - ?assertEqual([{365375409332725729550921208179070754913983135744,nodec}, - {0,noded}, - {182687704666362864775460604089535377456991567872,nodec}], - get_apl(last_in_ring(), 3, Ring, [nodec, noded])), + ?assertEqual([{365375409332725729550921208179070754913983135744, + nodec}, + {0, noded}, + {182687704666362864775460604089535377456991567872, + nodec}], + (get_apl(last_in_ring(), 3, Ring, [nodec, noded]))), %% With the other two nodes down - ?assertEqual([{0,nodea}, - {182687704666362864775460604089535377456991567872,nodeb}, - {365375409332725729550921208179070754913983135744,nodea}], - get_apl(last_in_ring(), 3, Ring, [nodea, nodeb])). + ?assertEqual([{0, nodea}, + {182687704666362864775460604089535377456991567872, + nodeb}, + {365375409332725729550921208179070754913983135744, + nodea}], + (get_apl(last_in_ring(), 3, Ring, [nodea, nodeb]))). 
%% Create a perfect ring - RingSize must be a multiple of nodes -perfect_ring(RingSize, Nodes) when RingSize rem length(Nodes) =:= 0 -> - Ring = riak_core_ring:fresh(RingSize,node()), +perfect_ring(RingSize, Nodes) + when RingSize rem length(Nodes) =:= 0 -> + Ring = riak_core_ring:fresh(RingSize, node()), Owners = riak_core_ring:all_owners(Ring), - TransferNode = - fun({Idx,_CurOwner}, {Ring0, [NewOwner|Rest]}) -> - {riak_core_ring:transfer_node(Idx, NewOwner, Ring0), Rest ++ [NewOwner]} - end, - {PerfectRing, _} = lists:foldl(TransferNode, {Ring, Nodes}, Owners), + TransferNode = fun ({Idx, _CurOwner}, + {Ring0, [NewOwner | Rest]}) -> + {riak_core_ring:transfer_node(Idx, NewOwner, Ring0), + Rest ++ [NewOwner]} + end, + {PerfectRing, _} = lists:foldl(TransferNode, + {Ring, Nodes}, Owners), PerfectRing. last_in_ring() -> @@ -287,186 +358,186 @@ six_node_test() -> %% earlier {ok, [Ring]} = file:consult("test/my_ring"), %DocIdx = riak_core_util:chash_key({<<"foo">>, <<"bar">>}), - DocIdx = <<73,212,27,234,104,13,150,207,0,82,86,183,125,225,172, - 154,135,46,6,112>>, - - Nodes = ['dev1@127.0.0.1', 'dev2@127.0.0.1', 'dev3@127.0.0.1', - 'dev4@127.0.0.1', 'dev5@127.0.0.1', 'dev6@127.0.0.1'], - + DocIdx = <<73, 212, 27, 234, 104, 13, 150, 207, 0, 82, + 86, 183, 125, 225, 172, 154, 135, 46, 6, 112>>, + Nodes = ['dev1@127.0.0.1', 'dev2@127.0.0.1', + 'dev3@127.0.0.1', 'dev4@127.0.0.1', 'dev5@127.0.0.1', + 'dev6@127.0.0.1'], %% Fallbacks should be selected by finding the next-highest partition after %% the DocIdx of the key, in this case the 433883 partition. The N %% partitions at that point are the primary partitions. If any of the primaries %% are down, the next up node found by walking the preflist is used as the %% fallback for that partition. 
- - ?assertEqual([{433883298582611803841718934712646521460354973696, 'dev2@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev3@127.0.0.1'}, - {479555224749202520035584085735030365824602865664, 'dev4@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes)), - - ?assertEqual([{456719261665907161938651510223838443642478919680, 'dev3@127.0.0.1'}, - {479555224749202520035584085735030365824602865664, 'dev4@127.0.0.1'}, - {433883298582611803841718934712646521460354973696, 'dev5@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev2@127.0.0.1'])), - - ?assertEqual([{479555224749202520035584085735030365824602865664, 'dev4@127.0.0.1'}, - {433883298582611803841718934712646521460354973696, 'dev5@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev6@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev2@127.0.0.1', 'dev3@127.0.0.1'])), - - ?assertEqual([{433883298582611803841718934712646521460354973696, 'dev5@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev6@127.0.0.1'}, - {479555224749202520035584085735030365824602865664, 'dev1@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev2@127.0.0.1', 'dev3@127.0.0.1', - 'dev4@127.0.0.1'])), - - ?assertEqual([{433883298582611803841718934712646521460354973696, 'dev5@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev6@127.0.0.1'}, - {479555224749202520035584085735030365824602865664, 'dev5@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev2@127.0.0.1', 'dev3@127.0.0.1', - 'dev4@127.0.0.1', 'dev1@127.0.0.1'])), - - ?assertEqual([{433883298582611803841718934712646521460354973696, 'dev2@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev3@127.0.0.1'}, - {479555224749202520035584085735030365824602865664, 'dev5@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev4@127.0.0.1'])), - - ?assertEqual([{433883298582611803841718934712646521460354973696, 'dev2@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev5@127.0.0.1'}, - 
{479555224749202520035584085735030365824602865664, 'dev6@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev4@127.0.0.1', 'dev3@127.0.0.1'])), - - ?assertEqual([{433883298582611803841718934712646521460354973696, 'dev2@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev5@127.0.0.1'}, - {479555224749202520035584085735030365824602865664, 'dev1@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev4@127.0.0.1', 'dev3@127.0.0.1', - 'dev6@127.0.0.1'])), - - ?assertEqual([{433883298582611803841718934712646521460354973696, 'dev2@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev5@127.0.0.1'}, - {479555224749202520035584085735030365824602865664, 'dev2@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev4@127.0.0.1', 'dev3@127.0.0.1', - 'dev6@127.0.0.1', 'dev1@127.0.0.1'])), - - ?assertEqual([{433883298582611803841718934712646521460354973696, 'dev2@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev2@127.0.0.1'}, - {479555224749202520035584085735030365824602865664, 'dev2@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev4@127.0.0.1', 'dev3@127.0.0.1', - 'dev6@127.0.0.1', 'dev1@127.0.0.1', 'dev5@127.0.0.1'])), - - ?assertEqual([{433883298582611803841718934712646521460354973696, 'dev2@127.0.0.1'}, - {479555224749202520035584085735030365824602865664, 'dev4@127.0.0.1'}, - {456719261665907161938651510223838443642478919680, 'dev5@127.0.0.1'}], - get_apl(DocIdx, 3, Ring, Nodes -- ['dev3@127.0.0.1'])), - + ?assertEqual([{433883298582611803841718934712646521460354973696, + 'dev2@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev3@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev4@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, Nodes))), + ?assertEqual([{456719261665907161938651510223838443642478919680, + 'dev3@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev4@127.0.0.1'}, + {433883298582611803841718934712646521460354973696, + 'dev5@127.0.0.1'}], + 
(get_apl(DocIdx, 3, Ring, + Nodes -- ['dev2@127.0.0.1']))), + ?assertEqual([{479555224749202520035584085735030365824602865664, + 'dev4@127.0.0.1'}, + {433883298582611803841718934712646521460354973696, + 'dev5@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev6@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, + Nodes -- ['dev2@127.0.0.1', 'dev3@127.0.0.1']))), + ?assertEqual([{433883298582611803841718934712646521460354973696, + 'dev5@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev6@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev1@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, + Nodes -- + ['dev2@127.0.0.1', 'dev3@127.0.0.1', + 'dev4@127.0.0.1']))), + ?assertEqual([{433883298582611803841718934712646521460354973696, + 'dev5@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev6@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev5@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, + Nodes -- + ['dev2@127.0.0.1', 'dev3@127.0.0.1', + 'dev4@127.0.0.1', 'dev1@127.0.0.1']))), + ?assertEqual([{433883298582611803841718934712646521460354973696, + 'dev2@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev3@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev5@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, + Nodes -- ['dev4@127.0.0.1']))), + ?assertEqual([{433883298582611803841718934712646521460354973696, + 'dev2@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev5@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev6@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, + Nodes -- ['dev4@127.0.0.1', 'dev3@127.0.0.1']))), + ?assertEqual([{433883298582611803841718934712646521460354973696, + 'dev2@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev5@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev1@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, + Nodes -- + ['dev4@127.0.0.1', 
'dev3@127.0.0.1', + 'dev6@127.0.0.1']))), + ?assertEqual([{433883298582611803841718934712646521460354973696, + 'dev2@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev5@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev2@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, + Nodes -- + ['dev4@127.0.0.1', 'dev3@127.0.0.1', + 'dev6@127.0.0.1', 'dev1@127.0.0.1']))), + ?assertEqual([{433883298582611803841718934712646521460354973696, + 'dev2@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev2@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev2@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, + Nodes -- + ['dev4@127.0.0.1', 'dev3@127.0.0.1', + 'dev6@127.0.0.1', 'dev1@127.0.0.1', + 'dev5@127.0.0.1']))), + ?assertEqual([{433883298582611803841718934712646521460354973696, + 'dev2@127.0.0.1'}, + {479555224749202520035584085735030365824602865664, + 'dev4@127.0.0.1'}, + {456719261665907161938651510223838443642478919680, + 'dev5@127.0.0.1'}], + (get_apl(DocIdx, 3, Ring, + Nodes -- ['dev3@127.0.0.1']))), ok. 
six_node_bucket_key_ann_test() -> {ok, [Ring]} = file:consult("test/my_ring"), - Nodes = ['dev1@127.0.0.1', 'dev2@127.0.0.1', 'dev3@127.0.0.1', - 'dev4@127.0.0.1', 'dev5@127.0.0.1', 'dev6@127.0.0.1'], + Nodes = ['dev1@127.0.0.1', 'dev2@127.0.0.1', + 'dev3@127.0.0.1', 'dev4@127.0.0.1', 'dev5@127.0.0.1', + 'dev6@127.0.0.1'], Bucket = <<"favorite">>, Key = <<"jethrotull">>, - application:set_env(riak_core, default_bucket_props, - [{n_val, 3}, - {chash_keyfun,{riak_core_util,chash_std_keyfun}}]), + application:set_env(riak_core, target_n_val, 3), riak_core_ring_manager:setup_ets(test), riak_core_ring_manager:set_ring_global(Ring), Size = riak_core_ring:num_partitions(Ring), - ?assertEqual([{{34, - 'dev5@127.0.0.1'}, - primary}, - {{35, - 'dev6@127.0.0.1'}, - primary}, - {{36, - 'dev1@127.0.0.1'}, - primary}], - apl_with_partition_nums( - get_apl_ann({Bucket, Key}, Nodes), Size)), - ?assertEqual([{{35, - 'dev6@127.0.0.1'}, - primary}, - {{36, - 'dev1@127.0.0.1'}, - primary}, - {{34, - 'dev2@127.0.0.1'}, - fallback}], - apl_with_partition_nums( - get_apl_ann({Bucket, Key}, Nodes -- - ['dev5@127.0.0.1']), Size)), - ?assertEqual([{{36, - 'dev1@127.0.0.1'}, - primary}, - {{34, - 'dev2@127.0.0.1'}, - fallback}, - {{35, - 'dev3@127.0.0.1'}, - fallback}], - apl_with_partition_nums( - get_apl_ann({Bucket, Key}, Nodes -- - ['dev5@127.0.0.1', - 'dev6@127.0.0.1']), Size)), - ?assertEqual([{{34, - 'dev2@127.0.0.1'}, - fallback}, - {{35, - 'dev3@127.0.0.1'}, - fallback}, - {{36, - 'dev4@127.0.0.1'}, - fallback}], - apl_with_partition_nums( - get_apl_ann({Bucket, Key}, Nodes -- - ['dev5@127.0.0.1', - 'dev6@127.0.0.1', - 'dev1@127.0.0.1']), Size)), - ?assertEqual([{{34, - 'dev3@127.0.0.1'}, - fallback}, - {{35, - 'dev4@127.0.0.1'}, - fallback}, - {{36, - 'dev3@127.0.0.1'}, - fallback}], - apl_with_partition_nums( - get_apl_ann({Bucket, Key}, Nodes -- - ['dev5@127.0.0.1', - 'dev6@127.0.0.1', - 'dev1@127.0.0.1', - 'dev2@127.0.0.1']), Size)), - ?assertEqual([{{34, - 'dev4@127.0.0.1'}, - 
fallback}, - {{35, - 'dev4@127.0.0.1'}, - fallback}, - {{36, - 'dev4@127.0.0.1'}, - fallback}], - apl_with_partition_nums( - get_apl_ann({Bucket, Key}, Nodes -- - ['dev5@127.0.0.1', - 'dev6@127.0.0.1', - 'dev1@127.0.0.1', - 'dev2@127.0.0.1', - 'dev3@127.0.0.1']), Size)), - ?assertEqual([{{34, - 'dev5@127.0.0.1'}, - primary}, - {{35, - 'dev6@127.0.0.1'}, - primary}, - {{36, - 'dev3@127.0.0.1'}, - fallback}], - apl_with_partition_nums( - get_apl_ann({Bucket, Key}, Nodes -- - ['dev1@127.0.0.1', - 'dev2@127.0.0.1']), Size)), + ?assertEqual([{{34, 'dev5@127.0.0.1'}, primary}, + {{35, 'dev6@127.0.0.1'}, primary}, + {{36, 'dev1@127.0.0.1'}, primary}], + (apl_with_partition_nums(get_apl_ann({Bucket, Key}, + Nodes), + Size))), + ?assertEqual([{{35, 'dev6@127.0.0.1'}, primary}, + {{36, 'dev1@127.0.0.1'}, primary}, + {{34, 'dev2@127.0.0.1'}, fallback}], + (apl_with_partition_nums(get_apl_ann({Bucket, Key}, + Nodes -- + ['dev5@127.0.0.1']), + Size))), + ?assertEqual([{{36, 'dev1@127.0.0.1'}, primary}, + {{34, 'dev2@127.0.0.1'}, fallback}, + {{35, 'dev3@127.0.0.1'}, fallback}], + (apl_with_partition_nums(get_apl_ann({Bucket, Key}, + Nodes -- + ['dev5@127.0.0.1', + 'dev6@127.0.0.1']), + Size))), + ?assertEqual([{{34, 'dev2@127.0.0.1'}, fallback}, + {{35, 'dev3@127.0.0.1'}, fallback}, + {{36, 'dev4@127.0.0.1'}, fallback}], + (apl_with_partition_nums(get_apl_ann({Bucket, Key}, + Nodes -- + ['dev5@127.0.0.1', + 'dev6@127.0.0.1', + 'dev1@127.0.0.1']), + Size))), + ?assertEqual([{{34, 'dev3@127.0.0.1'}, fallback}, + {{35, 'dev4@127.0.0.1'}, fallback}, + {{36, 'dev3@127.0.0.1'}, fallback}], + (apl_with_partition_nums(get_apl_ann({Bucket, Key}, + Nodes -- + ['dev5@127.0.0.1', + 'dev6@127.0.0.1', + 'dev1@127.0.0.1', + 'dev2@127.0.0.1']), + Size))), + ?assertEqual([{{34, 'dev4@127.0.0.1'}, fallback}, + {{35, 'dev4@127.0.0.1'}, fallback}, + {{36, 'dev4@127.0.0.1'}, fallback}], + (apl_with_partition_nums(get_apl_ann({Bucket, Key}, + Nodes -- + ['dev5@127.0.0.1', + 'dev6@127.0.0.1', + 
'dev1@127.0.0.1', + 'dev2@127.0.0.1', + 'dev3@127.0.0.1']), + Size))), + ?assertEqual([{{34, 'dev5@127.0.0.1'}, primary}, + {{35, 'dev6@127.0.0.1'}, primary}, + {{36, 'dev3@127.0.0.1'}, fallback}], + (apl_with_partition_nums(get_apl_ann({Bucket, Key}, + Nodes -- + ['dev1@127.0.0.1', + 'dev2@127.0.0.1']), + Size))), riak_core_ring_manager:cleanup_ets(test), ok. @@ -475,7 +546,7 @@ chbin_test_() -> chbin_test_scenario() -> [chbin_test_scenario(Size, NumNodes) - || Size <- [32, 64, 128], + || Size <- [32, 64, 128], NumNodes <- [1, 2, 3, 4, 5, 8, Size div 4]], ok. @@ -486,20 +557,22 @@ chbin_test_scenario(Size, NumNodes) -> CHash = riak_core_ring:chash(Ring), CHBin = chashbin:create(CHash), Inc = chash:ring_increment(Size), - HashKeys = [<> || X <- lists:seq(0, RingTop, Inc div 2)], + HashKeys = [<> + || X <- lists:seq(0, RingTop, Inc div 2)], Shuffled = riak_core_util:shuffle(Nodes), _ = CHBin, [begin - Up = max(0, NumNodes - Down), - UpNodes = lists:sublist(Shuffled, Up), - ?assertEqual(get_apl(HashKey, N, Ring, UpNodes), - get_apl_chbin(HashKey, N, CHBin, UpNodes)), - ?assertEqual(get_primary_apl(HashKey, N, Ring, UpNodes), - get_primary_apl_chbin(HashKey, N, CHBin, UpNodes)), - ok - end || HashKey <- HashKeys, - N <- [1, 2, 3, 4], - Down <- [0, 1, 2, Size div 2, Size-1, Size]], + Up = max(0, NumNodes - Down), + UpNodes = lists:sublist(Shuffled, Up), + ?assertEqual((get_apl(HashKey, N, Ring, UpNodes)), + (get_apl_chbin(HashKey, N, CHBin, UpNodes))), + ?assertEqual((get_primary_apl(HashKey, N, Ring, + UpNodes)), + (get_primary_apl_chbin(HashKey, N, CHBin, UpNodes))), + ok + end + || HashKey <- HashKeys, N <- [1, 2, 3, 4], + Down <- [0, 1, 2, Size div 2, Size - 1, Size]], ok. -endif. diff --git a/src/riak_core_app.erl b/src/riak_core_app.erl index 9d2ce17ab..646887bbb 100644 --- a/src/riak_core_app.erl +++ b/src/riak_core_app.erl @@ -33,42 +33,40 @@ start(_StartType, _StartArgs) -> ok = validate_ring_state_directory_exists(), - start_riak_core_sup(). 
stop(_State) -> - logger:info("Stopped application riak_core", []), - ok. + logger:info("Stopped application riak_core", []), ok. validate_ring_state_directory_exists() -> riak_core_util:start_app_deps(riak_core), - {ok, RingStateDir} = application:get_env(riak_core, ring_state_dir), - case filelib:ensure_dir(filename:join(RingStateDir, "dummy")) of - ok -> - ok; - {error, RingReason} -> - logger:critical( - "Ring state directory ~p does not exist, " "and could not be created: ~p", - [RingStateDir, riak_core_util:posix_error(RingReason)]), - throw({error, invalid_ring_state_dir}) + {ok, RingStateDir} = application:get_env(riak_core, + ring_state_dir), + case filelib:ensure_dir(filename:join(RingStateDir, + "dummy")) + of + ok -> ok; + {error, RingReason} -> + logger:critical("Ring state directory ~p does not exist, " + "and could not be created: ~p", + [RingStateDir, + riak_core_util:posix_error(RingReason)]), + throw({error, invalid_ring_state_dir}) end. - start_riak_core_sup() -> %% Spin up the supervisor; prune ring files as necessary case riak_core_sup:start_link() of - {ok, Pid} -> - ok = register_applications(), - ok = add_ring_event_handler(), - - {ok, Pid}; - {error, Reason} -> - {error, Reason} + {ok, Pid} -> + ok = register_applications(), + ok = add_ring_event_handler(), + {ok, Pid}; + {error, Reason} -> {error, Reason} end. -register_applications() -> - ok. +register_applications() -> ok. add_ring_event_handler() -> - ok = riak_core_ring_events:add_guarded_handler(riak_core_ring_handler, []). - + ok = + riak_core_ring_events:add_guarded_handler(riak_core_ring_handler, + []). diff --git a/src/riak_core_base64url.erl b/src/riak_core_base64url.erl deleted file mode 100644 index a3e5a4390..000000000 --- a/src/riak_core_base64url.erl +++ /dev/null @@ -1,95 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% Copyright (c) 2009-2010 Basho Technologies, Inc. All Rights Reserved. 
-%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- - -%% @doc base64url is a wrapper around the base64 module to produce -%% base64-compatible encodings that are URL safe. -%% The / character in normal base64 encoding is replaced with -%% the _ character, and + is replaced with -. -%% This replacement scheme is named "base64url" by -%% http://en.wikipedia.org/wiki/Base64 - --module(riak_core_base64url). - --export([decode/1, - decode_to_string/1, - encode/1, - encode_to_string/1, - mime_decode/1, - mime_decode_to_string/1]). - --spec decode(iodata()) -> binary(). -decode(Base64url) -> - base64:decode(urldecode(Base64url)). - --spec decode_to_string(iodata()) -> string(). -decode_to_string(Base64url) -> - base64:decode_to_string(urldecode(Base64url)). - --spec mime_decode(iodata()) -> binary(). -mime_decode(Base64url) -> - base64:mime_decode(urldecode(Base64url)). - --spec mime_decode_to_string(iodata()) -> string(). -mime_decode_to_string(Base64url) -> - base64:mime_decode_to_string(urldecode(Base64url)). - --spec encode(iodata()) -> binary(). -encode(Data) -> - urlencode(base64:encode(Data)). - --spec encode_to_string(iodata()) -> string(). -encode_to_string(Data) -> - urlencode(base64:encode_to_string(Data)). 
- -urlencode(Base64) when is_list(Base64) -> - Padded = [urlencode_digit(D) || D <- Base64], - string:strip(Padded, both, $=); -urlencode(Base64) when is_binary(Base64) -> - Padded = << << (urlencode_digit(D)) >> || <> <= Base64 >>, - binary:replace(Padded, <<"=">>, <<"">>, [global]). - -urldecode(Base64url) when is_list(Base64url) -> - Prepad = [urldecode_digit(D) || D <- Base64url ], - Padding = padding(Prepad), - Prepad ++ Padding; -urldecode(Base64url) when is_binary(Base64url) -> - Prepad = << << (urldecode_digit(D)) >> || <> <= Base64url >>, - Padding = padding(Prepad), - <>. - -padding(Base64) when is_binary(Base64) -> - case byte_size(Base64) rem 4 of - 2 -> - <<"==">>; - 3 -> - <<"=">>; - _ -> - <<"">> - end; -padding(Base64) when is_list(Base64) -> - binary_to_list(padding(list_to_binary(Base64))). - -urlencode_digit($/) -> $_; -urlencode_digit($+) -> $-; -urlencode_digit(D) -> D. - -urldecode_digit($_) -> $/; -urldecode_digit($-) -> $+; -urldecode_digit(D) -> D. diff --git a/src/riak_core_bucket.erl b/src/riak_core_bucket.erl deleted file mode 100644 index e37627375..000000000 --- a/src/riak_core_bucket.erl +++ /dev/null @@ -1,202 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% riak_core: Core Riak Application -%% -%% Copyright (c) 2007-2010 Basho Technologies, Inc. All Rights Reserved. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. 
-%% -%% ------------------------------------------------------------------- - -%% @doc Functions for manipulating bucket properties. --module(riak_core_bucket). - --export([append_bucket_defaults/1, - set_bucket/2, - get_bucket/1, - get_bucket/2, - reset_bucket/1, - get_buckets/1, - bucket_nval_map/1, - default_object_nval/0, - merge_props/2, - name/1, - n_val/1, - get_value/2]). - --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). --endif. - --type property() :: {PropName::atom(), PropValue::any()}. --type properties() :: [property()]. - --type riak_core_ring() :: riak_core_ring:riak_core_ring(). --type bucket_type() :: binary(). --type nval_set() :: ordsets:ordset(pos_integer()). --type bucket() :: binary() | {bucket_type(), binary()}. - --export_type([property/0, properties/0, bucket/0, nval_set/0]). - -%% @doc Add a list of defaults to global list of defaults for new -%% buckets. If any item is in Items is already set in the -%% current defaults list, the new setting is omitted, and the old -%% setting is kept. Omitting the new setting is intended -%% behavior, to allow settings from app.config to override any -%% hard-coded values. -append_bucket_defaults(Items) when is_list(Items) -> - riak_core_bucket_props:append_defaults(Items). - -%% @doc Set the given BucketProps in Bucket or {BucketType, Bucket}. If BucketType does not -%% exist, or is not active, {error, no_type} is returned. --spec set_bucket(bucket(), [{atom(), any()}]) -> - ok | {error, no_type | [{atom(), atom()}]}. -set_bucket({<<"default">>, Name}, BucketProps) -> - set_bucket(Name, BucketProps); -set_bucket(Name, BucketProps0) -> - set_bucket(fun set_bucket_in_ring/2, Name, BucketProps0). 
- -set_bucket(StoreFun, Bucket, BucketProps0) -> - OldBucket = get_bucket(Bucket), - case riak_core_bucket_props:validate(update, Bucket, OldBucket, BucketProps0) of - {ok, BucketProps} -> - NewBucket = merge_props(BucketProps, OldBucket), - StoreFun(Bucket, NewBucket); - {error, Details} -> - logger:error("Bucket properties validation failed ~p~n", [Details]), - {error, Details} - end. - -set_bucket_in_ring(Bucket, BucketMeta) -> - F = fun(Ring, _Args) -> - {new_ring, riak_core_ring:update_meta(bucket_key(Bucket), - BucketMeta, - Ring)} - end, - {ok, _NewRing} = riak_core_ring_manager:ring_trans(F, undefined), - ok. - - -%% @spec merge_props(list(), list()) -> list() -%% @doc Merge two sets of bucket props. If duplicates exist, the -%% entries in Overriding are chosen before those in Other. -merge_props(Overriding, Other) -> - riak_core_bucket_props:merge(Overriding, Other). - -%% @spec get_bucket(riak_object:bucket()) -> -%% {ok, BucketProps :: riak_core_bucketprops()} | {error, no_type} -%% @doc Return the complete current list of properties for Bucket. -%% Properties include but are not limited to: -%%
-%% n_val: how many replicas of objects in this bucket (default: 3)
-%% allow_mult: can objects in this bucket have siblings? (default: false)
-%% linkfun: a function returning a m/r FunTerm for link extraction
-%% 
-%% -get_bucket({<<"default">>, Name}) -> - get_bucket(Name); -get_bucket(Name) -> - Meta = riak_core_ring_manager:get_bucket_meta(Name), - get_bucket_props(Name, Meta). - -%% @spec get_bucket(Name, Ring::riak_core_ring()) -> -%% BucketProps :: riak_core_bucketprops() -%% @private -get_bucket({<<"default">>, Name}, Ring) -> - get_bucket(Name, Ring); -get_bucket({_Type, _Name}=Bucket, _Ring) -> - %% non-default type buckets are not stored in the ring, so just ignore it - get_bucket(Bucket). - -get_bucket_props(Name, undefined) -> - [{name, Name} | riak_core_bucket_props:defaults()]; -get_bucket_props(_Name, {ok, Bucket}) -> - Bucket. - -%% @spec reset_bucket(binary()) -> ok -%% @doc Reset the bucket properties for Bucket to the settings -%% inherited from its Bucket Type -reset_bucket({<<"default">>, Name}) -> - reset_bucket(Name); -reset_bucket(Bucket) -> - F = fun(Ring, _Args) -> - {new_ring, riak_core_ring:remove_meta(bucket_key(Bucket), Ring)} - end, - {ok, _NewRing} = riak_core_ring_manager:ring_trans(F, undefined), - ok. - -%% @doc Get bucket properties `Props' for all the buckets in the given -%% `Ring' and stored in metadata --spec get_buckets(riak_core_ring()) -> - Props::list(). -get_buckets(Ring) -> - RingNames = riak_core_ring:get_buckets(Ring), - RingBuckets = [get_bucket(Name, Ring) || Name <- RingNames], - RingBuckets. - -%% @doc returns a proplist containing all buckets and their respective N values --spec bucket_nval_map(riak_core_ring()) -> [{binary(),integer()}]. -bucket_nval_map(Ring) -> - [{riak_core_bucket:name(B), riak_core_bucket:n_val(B)} || - B <- riak_core_bucket:get_buckets(Ring)]. - -%% @doc returns the default n value for buckets that have not explicitly set the property --spec default_object_nval() -> integer(). -default_object_nval() -> - riak_core_bucket:n_val(riak_core_bucket_props:defaults()). - - -name(BProps) -> - get_value(name, BProps). - -n_val(BProps) -> - get_value(n_val, BProps). 
- -% a slighly faster version of proplists:get_value --spec get_value(atom(), properties()) -> any(). -get_value(Key, Proplist) -> - case lists:keyfind(Key, 1, Proplist) of - {Key, Value} -> Value; - _ -> undefined - end. - -bucket_key({<<"default">>, Name}) -> - bucket_key(Name); -bucket_key({_Type, _Name}=Bucket) -> - Bucket; -bucket_key(Name) -> - {bucket, Name}. - -%% =================================================================== -%% EUnit tests -%% =================================================================== --ifdef(TEST). - -simple_set_test() -> - application:load(riak_core), - application:set_env(riak_core, ring_state_dir, "_build/test/tmp"), - %% appending an empty list of defaults makes up for the fact that - %% riak_core_app:start/2 is not called during eunit runs - %% (that's where the usual defaults are set at startup), - %% while also not adding any trash that might affect other tests - append_bucket_defaults([]), - riak_core_ring_events:start_link(), - riak_core_ring_manager:start_link(test), - ok = set_bucket(a_bucket,[{key,value}]), - Bucket = get_bucket(a_bucket), - riak_core_ring_manager:stop(), - ?assertEqual(value, proplists:get_value(key, Bucket)). - --endif. diff --git a/src/riak_core_bucket_props.erl b/src/riak_core_bucket_props.erl deleted file mode 100644 index b35950a3e..000000000 --- a/src/riak_core_bucket_props.erl +++ /dev/null @@ -1,267 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. 
You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- --module(riak_core_bucket_props). - --export([merge/2, - validate/4, - resolve/2, - defaults/0, - append_defaults/1]). - --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). --endif. - --spec merge([{atom(), any()}], [{atom(), any()}]) -> [{atom(), any()}]. -merge(Overriding, Other) -> - lists:ukeymerge(1, lists:ukeysort(1, Overriding), - lists:ukeysort(1, Other)). - - --spec validate(create | update, - {riak_core_bucket:bucket_type(), undefined | binary()} | binary(), - undefined | [{atom(), any()}], - [{atom(), any()}]) -> {ok, [{atom(), any()}]} | {error, [{atom(), atom()}]}. -validate(CreateOrUpdate, Bucket, ExistingProps, BucketProps) -> - ReservedErrors = validate_reserved_names(Bucket), - CoreErrors = validate_core_props(CreateOrUpdate, Bucket, ExistingProps, BucketProps), - validate(CreateOrUpdate, Bucket, ExistingProps, BucketProps, riak_core:bucket_validators(), [ReservedErrors, CoreErrors]). - -validate(_CreateOrUpdate, _Bucket, _ExistingProps, Props, [], ErrorLists) -> - case lists:flatten(ErrorLists) of - [] -> {ok, Props}; - Errors -> {error, Errors} - end; -validate(CreateOrUpdate, Bucket, ExistingProps, BucketProps0, [{_App, Validator}|T], Errors0) -> - {BucketProps, Errors} = Validator:validate(CreateOrUpdate, Bucket, ExistingProps, BucketProps0), - validate(CreateOrUpdate, Bucket, ExistingProps, BucketProps, T, [Errors|Errors0]). 
- -validate_core_props(CreateOrUpdate, Bucket, ExistingProps, BucketProps) -> - lists:foldl(fun(Prop, Errors) -> - case validate_core_prop(CreateOrUpdate, Bucket, ExistingProps, Prop) of - true -> Errors; - Error -> [Error | Errors] - end - end, [], BucketProps). - -validate_core_prop(create, {_Bucket, undefined}, undefined, {claimant, Claimant}) when Claimant =:= node()-> - %% claimant valid on first call to create if claimant is this node - true; -validate_core_prop(create, {_Bucket, undefined}, undefined, {claimant, _BadClaimant}) -> - %% claimant not valid on first call to create if claimant is not this node - {claimant, "Invalid claimant"}; -validate_core_prop(create, {_Bucket, undefined}, Existing, {claimant, Claimant}) -> - %% subsequent creation calls cannot modify claimant and it should exist - case lists:keyfind(claimant, 1, Existing) of - false -> {claimant, "No claimant details found in existing properties"}; - {claimant, Claimant} -> true; - {claimant, _Other} -> {claimant, "Cannot modify claimant property"} - end; -validate_core_prop(update, {_Bucket, _BucketName}, _Existing, {claimant, _Claimant}) -> - %% cannot update claimant - {claimant, "Cannot update claimant property"}; -validate_core_prop(update, {_Bucket, _BucketName}, _Existing, {ddl, _DDL}) -> - %% cannot update time series DDL - {ddl, "Cannot update time series data definition"}; -validate_core_prop(update, {_Bucket, _BucketName}, _Existing, {table_def, _DDL}) -> - %% cannot update time series DDL (or, if it slips past riak_kv_console, - %% the table_def SQL(ish) code that is parsed to make a DDL) - %% - %% Defining the table_def atom here also sidesteps occasional - %% errors from existing_atom functions - {ddl, "Cannot update time series data definition"}; -validate_core_prop(create, {_Bucket, undefined}, undefined, {active, false}) -> - %% first creation call that sets active to false is always valid - true; -validate_core_prop(create, {_Bucket, undefined}, _Existing, {active, false}) 
-> - %% subsequent creation calls that leaves active false is valid - true; -validate_core_prop(update, {_Bucket, _}, _Existing, {active, true}) -> - %% calls to update that do not modify active are valid - true; -validate_core_prop(_, {_Bucket, _}, _Existing, {active, _}) -> - %% subsequent creation calls or update calls cannot modify active (it is modified directly - %% by riak_core_claimant) - {active, "Cannot modify active property"}; -validate_core_prop(_, _, _, _) -> - %% all other properties are valid from the perspective of riak_core - true. - -validate_reserved_names(Bucket) -> - case validate_reserved_name(Bucket) of - ok -> []; - ErrStr -> [{reserved_name, ErrStr}] - end. - -validate_reserved_name({<<"any">>, _}) -> - "The name 'any' may not be used for bucket types"; -validate_reserved_name(_) -> - ok. - --spec defaults() -> [{atom(), any()}]. -defaults() -> - application:get_env(riak_core, default_bucket_props, undefined). - --spec append_defaults([{atom(), any()}]) -> ok. -append_defaults(Items) when is_list(Items) -> - OldDefaults = application:get_env(riak_core, default_bucket_props, []), - NewDefaults = merge(OldDefaults, Items), - FixedDefaults = case riak_core:bucket_fixups() of - [] -> NewDefaults; - Fixups -> - riak_core_ring_manager:run_fixups(Fixups, default, NewDefaults) - end, - application:set_env(riak_core, default_bucket_props, FixedDefaults), - %% do a noop transform on the ring, to make the fixups re-run - catch(riak_core_ring_manager:ring_trans(fun(Ring, _) -> - {new_ring, Ring} - end, undefined)), - ok. - --spec resolve([{atom(), any()}], [{atom(), any()}]) -> [{atom(), any()}]. 
-%%noinspection ErlangUnusedVariable -resolve(PropsA, PropsB) when is_list(PropsA) andalso - is_list(PropsB) -> - PropsASorted = lists:ukeysort(1, PropsA), - PropsBSorted = lists:ukeysort(1, PropsB), - {_, Resolved} = lists:foldl(fun({KeyA, _}=PropA, {[{KeyA, _}=PropB | RestB], Acc}) -> - {RestB, [{KeyA, resolve_prop(PropA, PropB)} | Acc]}; - (PropA, {RestB, Acc}) -> - {RestB, [PropA | Acc]} - end, - {PropsBSorted, []}, - PropsASorted), - Resolved. - -resolve_prop({allow_mult, Mult1}, {allow_mult, Mult2}) -> - Mult1 orelse Mult2; %% assumes allow_mult=true is default -resolve_prop({basic_quorum, Basic1}, {basic_quorum, Basic2}) -> - Basic1 andalso Basic2; -resolve_prop({big_vclock, Big1}, {big_vclock, Big2}) -> - max(Big1, Big2); -resolve_prop({chash_keyfun, KeyFun1}, {chash_keyfun, _KeyFun2}) -> - KeyFun1; %% arbitrary choice -resolve_prop({dw, DW1}, {dw, DW2}) -> - %% 'quorum' wins over set numbers - max(DW1, DW2); -resolve_prop({last_write_wins, LWW1}, {last_write_wins, LWW2}) -> - LWW1 andalso LWW2; -resolve_prop({linkfun, LinkFun1}, {linkfun, _LinkFun2}) -> - LinkFun1; %% arbitrary choice -resolve_prop({n_val, N1}, {n_val, N2}) -> - max(N1, N2); -resolve_prop({notfound_ok, NF1}, {notfound_ok, NF2}) -> - NF1 orelse NF2; -resolve_prop({old_vclock, Old1}, {old_vclock, Old2}) -> - max(Old1, Old2); -resolve_prop({postcommit, PC1}, {postcommit, PC2}) -> - resolve_hooks(PC1, PC2); -resolve_prop({pr, PR1}, {pr, PR2}) -> - max(PR1, PR2); -resolve_prop({precommit, PC1}, {precommit, PC2}) -> - resolve_hooks(PC1, PC2); -resolve_prop({pw, PW1}, {pw, PW2}) -> - max(PW1, PW2); -resolve_prop({r, R1}, {r, R2}) -> - max(R1, R2); -resolve_prop({rw, RW1}, {rw, RW2}) -> - max(RW1, RW2); -resolve_prop({small_vclock, Small1}, {small_vclock, Small2}) -> - max(Small1, Small2); -resolve_prop({w, W1}, {w, W2}) -> - max(W1, W2); -resolve_prop({young_vclock, Young1}, {young_vclock, Young2}) -> - max(Young1, Young2); -resolve_prop({_, V1}, {_, _V2}) -> - V1. 
- -resolve_hooks(Hooks1, Hooks2) -> - lists:usort(Hooks1 ++ Hooks2). - -%% =================================================================== -%% EUnit tests -%% =================================================================== - --ifdef(TEST). - -simple_resolve_test() -> - Props1 = [{name,<<"test">>}, - {allow_mult,false}, - {basic_quorum,false}, - {big_vclock,50}, - {chash_keyfun,{riak_core_util,chash_std_keyfun}}, - {dw,quorum}, - {last_write_wins,false}, - {linkfun,{modfun,riak_kv_wm_link_walker,mapreduce_linkfun}}, - {n_val,3}, - {notfound_ok,true}, - {old_vclock,86400}, - {postcommit,[]}, - {pr,0}, - {precommit,[{a, b}]}, - {pw,0}, - {r,quorum}, - {rw,quorum}, - {small_vclock,50}, - {w,quorum}, - {young_vclock,20}], - Props2 = [{name,<<"test">>}, - {allow_mult, true}, - {basic_quorum, true}, - {big_vclock,60}, - {chash_keyfun,{riak_core_util,chash_std_keyfun}}, - {dw,3}, - {last_write_wins,true}, - {linkfun,{modfun,riak_kv_wm_link_walker,mapreduce_linkfun}}, - {n_val,5}, - {notfound_ok,false}, - {old_vclock,86401}, - {postcommit,[{a, b}]}, - {pr,1}, - {precommit,[{c, d}]}, - {pw,3}, - {r,3}, - {rw,3}, - {w,1}, - {young_vclock,30}], - Expected = [{name,<<"test">>}, - {allow_mult,true}, - {basic_quorum,false}, - {big_vclock,60}, - {chash_keyfun,{riak_core_util,chash_std_keyfun}}, - {dw,quorum}, - {last_write_wins,false}, - {linkfun,{modfun,riak_kv_wm_link_walker,mapreduce_linkfun}}, - {n_val,5}, - {notfound_ok,true}, - {old_vclock,86401}, - {postcommit,[{a, b}]}, - {pr,1}, - {precommit,[{a, b}, {c, d}]}, - {pw,3}, - {r,quorum}, - {rw,quorum}, - {small_vclock,50}, - {w,quorum}, - {young_vclock,30}], - ?assertEqual(lists:ukeysort(1, Expected), lists:ukeysort(1, resolve(Props1, Props2))). - --endif. 
- diff --git a/src/riak_core_claim.erl b/src/riak_core_claim.erl index dc3cf2918..a4f58daeb 100644 --- a/src/riak_core_claim.erl +++ b/src/riak_core_claim.erl @@ -47,32 +47,33 @@ %% In that case, Riak will minimize the cases where the constraint is violated %% and they will all exist near the origin point of the ring. -%% A good way to decide on the setting of target_n_val for your application is -%% to set it to the largest value you expect to use for any bucket's n_val. The -%% default is 4. - -module(riak_core_claim). --export([claim/1, claim/3, claim_until_balanced/2, claim_until_balanced/4]). + +-export([claim/1, claim/3, claim_until_balanced/2, + claim_until_balanced/4]). + -export([default_wants_claim/1, default_wants_claim/2, - default_choose_claim/1, default_choose_claim/2, default_choose_claim/3, - never_wants_claim/1, never_wants_claim/2, - random_choose_claim/1, random_choose_claim/2, random_choose_claim/3]). + default_choose_claim/1, default_choose_claim/2, + default_choose_claim/3, never_wants_claim/1, + never_wants_claim/2, random_choose_claim/1, + random_choose_claim/2, random_choose_claim/3]). + -export([wants_claim_v2/1, wants_claim_v2/2, choose_claim_v2/1, choose_claim_v2/2, choose_claim_v2/3, - claim_rebalance_n/2, claim_diversify/3, claim_diagonal/3, - wants/1, wants_owns_diff/2, meets_target_n/2, diagonal_stripe/2]). - + claim_rebalance_n/2, claim_diversify/3, + claim_diagonal/3, wants/1, wants_owns_diff/2, + meets_target_n/2, diagonal_stripe/2]). -define(DEF_TARGET_N, 4). -claim(Ring) -> - claim(Ring, want, choose). +claim(Ring) -> claim(Ring, want, choose). claim(Ring, _, _) -> Members = riak_core_ring:claiming_members(Ring), - lists:foldl(fun(Node, Ring0) -> + lists:foldl(fun (Node, Ring0) -> claim_until_balanced(Ring0, Node, want, choose) - end, Ring, Members). + end, + Ring, Members). claim_until_balanced(Ring, Node) -> claim_until_balanced(Ring, Node, want, choose). 
@@ -80,14 +81,14 @@ claim_until_balanced(Ring, Node) -> claim_until_balanced(Ring, Node, want, choose) -> NeedsIndexes = wants_claim_v2(Ring, Node), case NeedsIndexes of - no -> Ring; - {yes, _NumToClaim} -> - NewRing = choose_claim_v2(Ring, Node), - claim_until_balanced(NewRing, Node, want, choose) + no -> Ring; + {yes, _NumToClaim} -> + NewRing = choose_claim_v2(Ring, Node), + claim_until_balanced(NewRing, Node, want, choose) end. %% =================================================================== -%% Claim Function Implementations +%% Claim Function Implementations %% =================================================================== %% @spec default_choose_claim(riak_core_ring()) -> riak_core_ring() @@ -109,8 +110,7 @@ default_wants_claim(Ring) -> default_wants_claim(Ring, Node) -> wants_claim_v2(Ring, Node). -wants_claim_v2(Ring) -> - wants_claim_v2(Ring, node()). +wants_claim_v2(Ring) -> wants_claim_v2(Ring, node()). wants_claim_v2(Ring, Node) -> Active = riak_core_ring:claiming_members(Ring), @@ -121,25 +121,23 @@ wants_claim_v2(Ring, Node) -> Avg = RingSize div NodeCount, Count = proplists:get_value(Node, Counts, 0), case Count < Avg of - false -> no; - true -> {yes, Avg - Count} + false -> no; + true -> {yes, Avg - Count} end. %% Provide default choose parameters if none given -default_choose_params() -> - default_choose_params([]). +default_choose_params() -> default_choose_params([]). default_choose_params(Params) -> case proplists:get_value(target_n_val, Params) of - undefined -> - TN = application:get_env(riak_core, target_n_val, ?DEF_TARGET_N), - [{target_n_val, TN} | Params]; - _-> - Params + undefined -> + TN = application:get_env(riak_core, target_n_val, + ?DEF_TARGET_N), + [{target_n_val, TN} | Params]; + _ -> Params end. -choose_claim_v2(Ring) -> - choose_claim_v2(Ring, node()). +choose_claim_v2(Ring) -> choose_claim_v2(Ring, node()). 
choose_claim_v2(Ring, Node) -> Params = default_choose_params(), @@ -157,69 +155,68 @@ choose_claim_v2(RingOrig, Node, Params0) -> RingSize = riak_core_ring:num_partitions(Ring), NodeCount = erlang:length(Active), %% Deltas::[node(), integer()] - Deltas = get_deltas(RingSize, NodeCount, Owners, Counts), + Deltas = get_deltas(RingSize, NodeCount, Owners, + Counts), {_, Want} = lists:keyfind(Node, 1, Deltas), TargetN = proplists:get_value(target_n_val, Params), - AllIndices = lists:zip(lists:seq(0, length(Owners)-1), + AllIndices = lists:zip(lists:seq(0, length(Owners) - 1), [Idx || {Idx, _} <- Owners]), - - EnoughNodes = - (NodeCount > TargetN) - or ((NodeCount == TargetN) and (RingSize rem TargetN =:= 0)), + EnoughNodes = (NodeCount > TargetN) or + (NodeCount == TargetN) and (RingSize rem TargetN =:= 0), case EnoughNodes of - true -> - %% If we have enough nodes to meet target_n, then we prefer to - %% claim indices that are currently causing violations, and then - %% fallback to indices in linear order. The filtering steps below - %% will ensure no new violations are introduced. - Violated = lists:flatten(find_violations(Ring, TargetN)), - Violated2 = [lists:keyfind(Idx, 2, AllIndices) || Idx <- Violated], - Indices = Violated2 ++ (AllIndices -- Violated2); - false -> - %% If we do not have enough nodes to meet target_n, then we prefer - %% claiming the same indices that would occur during a - %% re-diagonalization of the ring with target_n nodes, falling - %% back to linear offsets off these preferred indices when the - %% number of indices desired is less than the computed set. 
- Padding = lists:duplicate(TargetN, undefined), - Expanded = lists:sublist(Active ++ Padding, TargetN), - ExpandedLocation = get_nodes_by_location(Expanded, Ring), - PreferredClaim = riak_core_claim:diagonal_stripe(Ring, ExpandedLocation), - PreferredNth = [begin - {Nth, Idx} = lists:keyfind(Idx, 2, AllIndices), - Nth - end || {Idx,Owner} <- PreferredClaim, - Owner =:= Node], - Offsets = lists:seq(0, RingSize div length(PreferredNth)), - AllNth = lists:sublist([(X+Y) rem RingSize || Y <- Offsets, - X <- PreferredNth], - RingSize), - Indices = [lists:keyfind(Nth, 1, AllIndices) || Nth <- AllNth] + true -> + %% If we have enough nodes to meet target_n, then we prefer to + %% claim indices that are currently causing violations, and then + %% fallback to indices in linear order. The filtering steps below + %% will ensure no new violations are introduced. + Violated = lists:flatten(find_violations(Ring, + TargetN)), + Violated2 = [lists:keyfind(Idx, 2, AllIndices) + || Idx <- Violated], + Indices = Violated2 ++ AllIndices -- Violated2; + false -> + %% If we do not have enough nodes to meet target_n, then we prefer + %% claiming the same indices that would occur during a + %% re-diagonalization of the ring with target_n nodes, falling + %% back to linear offsets off these preferred indices when the + %% number of indices desired is less than the computed set. 
+ Padding = lists:duplicate(TargetN, undefined), + Expanded = lists:sublist(Active ++ Padding, TargetN), + PreferredClaim = riak_core_claim:diagonal_stripe(Ring, + Expanded), + PreferredNth = [begin + {Nth, Idx} = lists:keyfind(Idx, 2, AllIndices), Nth + end + || {Idx, Owner} <- PreferredClaim, Owner =:= Node], + Offsets = lists:seq(0, + RingSize div length(PreferredNth)), + AllNth = lists:sublist([(X + Y) rem RingSize + || Y <- Offsets, X <- PreferredNth], + RingSize), + Indices = [lists:keyfind(Nth, 1, AllIndices) + || Nth <- AllNth] end, - %% Filter out indices that conflict with the node's existing ownership - Indices2 = prefilter_violations(Ring, Node, AllIndices, Indices, - TargetN, RingSize), + Indices2 = prefilter_violations(Ring, Node, AllIndices, + Indices, TargetN, RingSize), %% Claim indices from the remaining candidate set - Claim2 = case select_indices(Owners, Deltas, Indices2, TargetN, RingSize) of - [] -> []; - Claim -> lists:sublist(Claim, Want) - end, - NewRing = lists:foldl(fun(Idx, Ring0) -> + Claim = select_indices(Owners, Deltas, Indices2, + TargetN, RingSize), + Claim2 = lists:sublist(Claim, Want), + NewRing = lists:foldl(fun (Idx, Ring0) -> riak_core_ring:transfer_node(Idx, Node, Ring0) - end, Ring, Claim2), - - RingChanged = ([] /= Claim2), + end, + Ring, Claim2), + RingChanged = [] /= Claim2, RingMeetsTargetN = meets_target_n(NewRing, TargetN), case {RingChanged, EnoughNodes, RingMeetsTargetN} of - {false, _, _} -> - %% Unable to claim, fallback to re-diagonalization - sequential_claim(Ring, Node, TargetN); - {_, true, false} -> - %% Failed to meet target_n, fallback to re-diagonalization - sequential_claim(Ring, Node, TargetN); - _ -> - NewRing + {false, _, _} -> + %% Unable to claim, fallback to re-diagonalization + sequential_claim(Ring, Node, TargetN); + {_, true, false} -> + %% Failed to meet target_n, fallback to re-diagonalization + sequential_claim(Ring, Node, TargetN); + _ -> NewRing end. 
%% @private for each node in owners return a tuple of owner and delta @@ -227,24 +224,29 @@ choose_claim_v2(RingOrig, Node, Params0) -> %% needs it's ownership to change by. A positive means the owner needs %% that many more partitions, a negative means the owner can lose that %% many paritions. --spec get_deltas(RingSize::pos_integer(), - NodeCount::pos_integer(), - Owners::[{Index::non_neg_integer(), node()}], - Counts::[{node(), non_neg_integer()}]) -> - Deltas::[{node(), integer()}]. +-spec get_deltas(RingSize :: pos_integer(), + NodeCount :: pos_integer(), + Owners :: [{Index :: non_neg_integer(), node()}], + Counts :: [{node(), non_neg_integer()}]) -> Deltas :: + [{node(), + integer()}]. + get_deltas(RingSize, NodeCount, Owners, Counts) -> Avg = RingSize / NodeCount, %% the most any node should own Max = ceiling(RingSize / NodeCount), - ActiveDeltas = [{Member, Count, normalise_delta(Avg - Count)} + ActiveDeltas = [{Member, Count, + normalise_delta(Avg - Count)} || {Member, Count} <- Counts], - BalancedDeltas = rebalance_deltas(ActiveDeltas, Max, RingSize), + BalancedDeltas = rebalance_deltas(ActiveDeltas, Max, + RingSize), add_default_deltas(Owners, BalancedDeltas, 0). %% @private a node can only claim whole partitions, but if RingSize %% rem NodeCount /= 0, a delta will be a float. This function decides %% if that float should be floored or ceilinged -spec normalise_delta(float()) -> integer(). + normalise_delta(Delta) when Delta < 0 -> %% if the node has too many (a negative delta) give up the most %% you can (will be rebalanced) @@ -258,90 +260,101 @@ normalise_delta(Delta) -> %% node has more vnodes than it should (e.g. [{n1, 6}, {n2, 6}, {n3, %% 6}, {n4, 8}, {n5,6} we rebalance the deltas so that select_indices %% doesn't leave some node not giving up enough partitions --spec rebalance_deltas([{node(), integer()}], pos_integer(), pos_integer()) -> [{node(), integer()}]. 
-rebalance_deltas(NodeDeltas, Max, RingSize) -> - AppliedDeltas = [Own + Delta || {_, Own, Delta} <- NodeDeltas], +-spec rebalance_deltas([{node(), integer()}], + pos_integer(), pos_integer()) -> [{node(), integer()}]. +rebalance_deltas(NodeDeltas, Max, RingSize) -> + AppliedDeltas = [Own + Delta + || {_, Own, Delta} <- NodeDeltas], case lists:sum(AppliedDeltas) - RingSize of - 0 -> - [{Node, Delta} || {Node, _Cnt, Delta} <- NodeDeltas]; - N when N < 0 -> - increase_keeps(NodeDeltas, N, Max, []) + 0 -> + [{Node, Delta} || {Node, _Cnt, Delta} <- NodeDeltas]; + N when N < 0 -> increase_keeps(NodeDeltas, N, Max, []) end. %% @private increases the delta for (some) nodes giving away %% partitions to the max they can keep --spec increase_keeps(Deltas::[{node(), integer()}], - WantsError::integer(), - Max::pos_integer(), - Acc::[{node(), integer()}]) -> - Rebalanced::[{node(), integer()}]. +-spec increase_keeps(Deltas :: [{node(), integer()}], + WantsError :: integer(), Max :: pos_integer(), + Acc :: [{node(), integer()}]) -> Rebalanced :: [{node(), + integer()}]. + increase_keeps(Rest, 0, _Max, Acc) -> - [{Node, Delta} || {Node, _Own, Delta} <- lists:usort(lists:append(Rest, Acc))]; + [{Node, Delta} + || {Node, _Own, Delta} + <- lists:usort(lists:append(Rest, Acc))]; increase_keeps([], N, Max, Acc) when N < 0 -> increase_takes(lists:reverse(Acc), N, Max, []); -increase_keeps([{Node, Own, Delta} | Rest], N, Max, Acc) when Delta < 0 -> +increase_keeps([{Node, Own, Delta} | Rest], N, Max, Acc) + when Delta < 0 -> WouldOwn = Own + Delta, - Additive = case WouldOwn +1 =< Max of - true -> 1; - false -> 0 + Additive = case WouldOwn + 1 =< Max of + true -> 1; + false -> 0 end, - increase_keeps(Rest, N+Additive, Max, [{Node, Own+Delta+Additive} | Acc]); + increase_keeps(Rest, N + Additive, Max, + [{Node, Own + Delta + Additive} | Acc]); increase_keeps([NodeDelta | Rest], N, Max, Acc) -> increase_keeps(Rest, N, Max, [NodeDelta | Acc]). 
%% @private increases the delta for (some) nodes taking partitions to the max %% they can ask for --spec increase_takes(Deltas::[{node(), integer()}], - WantsError::integer(), - Max::pos_integer(), - Acc::[{node(), integer()}]) -> - Rebalanced::[{node(), integer()}]. +-spec increase_takes(Deltas :: [{node(), integer()}], + WantsError :: integer(), Max :: pos_integer(), + Acc :: [{node(), integer()}]) -> Rebalanced :: [{node(), + integer()}]. + increase_takes(Rest, 0, _Max, Acc) -> - [{Node, Delta} || {Node, _Own, Delta} <- lists:usort(lists:append(Rest, Acc))]; + [{Node, Delta} + || {Node, _Own, Delta} + <- lists:usort(lists:append(Rest, Acc))]; increase_takes([], N, _Max, Acc) when N < 0 -> - [{Node, Delta} || {Node, _Own, Delta} <- lists:usort(Acc)]; -increase_takes([{Node, Own, Delta} | Rest], N, Max, Acc) when Delta > 0 -> + [{Node, Delta} + || {Node, _Own, Delta} <- lists:usort(Acc)]; +increase_takes([{Node, Own, Delta} | Rest], N, Max, Acc) + when Delta > 0 -> WouldOwn = Own + Delta, - Additive = case WouldOwn +1 =< Max of - true -> 1; - false -> 0 + Additive = case WouldOwn + 1 =< Max of + true -> 1; + false -> 0 end, - increase_takes(Rest, N+Additive, Max, [{Node, Own, Delta+Additive} | Acc]); + increase_takes(Rest, N + Additive, Max, + [{Node, Own, Delta + Additive} | Acc]); increase_takes([NodeDelta | Rest], N, Max, Acc) -> increase_takes(Rest, N, Max, [NodeDelta | Acc]). meets_target_n(Ring, TargetN) -> - Owners = lists:keysort(1, riak_core_ring:all_owners(Ring)), + Owners = lists:keysort(1, + riak_core_ring:all_owners(Ring)), meets_target_n(Owners, TargetN, 0, [], []). 
-meets_target_n([{Part,Node}|Rest], TargetN, Index, First, Last) -> + +meets_target_n([{Part, Node} | Rest], TargetN, Index, + First, Last) -> case lists:keytake(Node, 1, Last) of - {value, {Node, LastIndex, _}, NewLast} -> - if Index-LastIndex >= TargetN -> - %% node repeat respects TargetN - meets_target_n(Rest, TargetN, Index+1, First, - [{Node, Index, Part}|NewLast]); - true -> - %% violation of TargetN - false - end; - false -> - %% haven't seen this node yet - meets_target_n(Rest, TargetN, Index+1, - [{Node, Index}|First], [{Node, Index, Part}|Last]) + {value, {Node, LastIndex, _}, NewLast} -> + if Index - LastIndex >= TargetN -> + %% node repeat respects TargetN + meets_target_n(Rest, TargetN, Index + 1, First, + [{Node, Index, Part} | NewLast]); + true -> + %% violation of TargetN + false + end; + false -> + %% haven't seen this node yet + meets_target_n(Rest, TargetN, Index + 1, + [{Node, Index} | First], [{Node, Index, Part} | Last]) end; meets_target_n([], TargetN, Index, First, Last) -> %% start through end guarantees TargetN %% compute violations at wrap around, but don't fail %% because of them: handle during reclaim - Violations = - lists:filter(fun({Node, L, _}) -> - {Node, F} = proplists:lookup(Node, First), - (Index-L)+F < TargetN - end, - Last), - {true, [ Part || {_, _, Part} <- Violations ]}. - + Violations = lists:filter(fun ({Node, L, _}) -> + {Node, F} = proplists:lookup(Node, First), + Index - L + F < TargetN + end, + Last), + {true, [Part || {_, _, Part} <- Violations]}. %% Claim diversify tries to build a perfectly diverse ownership list that meets %% target N. It uses wants to work out which nodes want partitions, but does @@ -349,18 +362,21 @@ meets_target_n([], TargetN, Index, First, Last) -> %% list, updating the adjacency matrix needed to compute the diversity score as each %% node is added and uses it to drive the selection of the next nodes. 
claim_diversify(Wants, Owners, Params) -> - TN = proplists:get_value(target_n_val, Params, ?DEF_TARGET_N), + TN = proplists:get_value(target_n_val, Params, + ?DEF_TARGET_N), Q = length(Owners), - Claiming = [N || {N,W} <- Wants, W > 0], - {ok, NewOwners, _AM} = riak_core_claim_util:construct( - riak_core_claim_util:gen_complete_len(Q), Claiming, TN), + Claiming = [N || {N, W} <- Wants, W > 0], + {ok, NewOwners, _AM} = + riak_core_claim_util:construct(riak_core_claim_util:gen_complete_len(Q), + Claiming, TN), {NewOwners, [diversified]}. %% Claim nodes in seq a,b,c,a,b,c trying to handle the wraparound %% case to meet target N claim_diagonal(Wants, Owners, Params) -> - TN = proplists:get_value(target_n_val, Params, ?DEF_TARGET_N), - Claiming = lists:sort([N || {N,W} <- Wants, W > 0]), + TN = proplists:get_value(target_n_val, Params, + ?DEF_TARGET_N), + Claiming = lists:sort([N || {N, W} <- Wants, W > 0]), S = length(Claiming), Q = length(Owners), Reps = Q div S, @@ -369,12 +385,12 @@ claim_diagonal(Wants, Owners, Params) -> %% are available. Tail = Q - Reps * S, Last = case S >= TN + Tail of - true -> % If number wanted can be filled excluding first TN nodes - lists:sublist(lists:nthtail(TN - Tail, Claiming), Tail); - _ -> - lists:sublist(Claiming, Tail) + true -> % If number wanted can be filled excluding first TN nodes + lists:sublist(lists:nthtail(TN - Tail, Claiming), Tail); + _ -> lists:sublist(Claiming, Tail) end, - {lists:flatten([lists:duplicate(Reps, Claiming), Last]), [diagonalized]}. + {lists:flatten([lists:duplicate(Reps, Claiming), Last]), + [diagonalized]}. 
%% @private fall back to diagonal striping vnodes across nodes in a %% sequential round robin (eg n1 | n2 | n3 | n4 | n5 | n1 | n2 | n3 @@ -382,111 +398,127 @@ claim_diagonal(Wants, Owners, Params) -> %% attempts to eliminate tail violations (for example a ring that %% starts/ends n1 | n2 | ...| n3 | n4 | n1) -spec sequential_claim(riak_core_ring:riak_core_ring(), - node(), - integer()) -> - riak_core_ring:riak_core_ring(). + node(), integer()) -> riak_core_ring:riak_core_ring(). + sequential_claim(Ring, Node, TargetN) -> - Nodes = lists:usort([Node|riak_core_ring:claiming_members(Ring)]), + Nodes = lists:usort([Node + | riak_core_ring:claiming_members(Ring)]), NodeCount = length(Nodes), RingSize = riak_core_ring:num_partitions(Ring), - Overhang = RingSize rem NodeCount, - HasTailViolation = (Overhang > 0 andalso Overhang < TargetN), + HasTailViolation = Overhang > 0 andalso + Overhang < TargetN, Shortfall = TargetN - Overhang, CompleteSequences = RingSize div NodeCount, MaxFetchesPerSeq = NodeCount - TargetN, - MinFetchesPerSeq = ceiling(Shortfall / CompleteSequences), - CanSolveViolation = ((CompleteSequences * MaxFetchesPerSeq) >= Shortfall), - - Zipped = case (HasTailViolation andalso CanSolveViolation) of - true-> - Partitions = lists:sort([ I || {I, _} <- riak_core_ring:all_owners(Ring) ]), - Nodelist = solve_tail_violations(RingSize, Nodes, Shortfall, MinFetchesPerSeq), - lists:zip(Partitions, lists:flatten(Nodelist)); - false -> - diagonal_stripe(Ring, Nodes) - end, - - lists:foldl(fun({P, N}, Acc) -> + MinFetchesPerSeq = ceiling(Shortfall / + CompleteSequences), + CanSolveViolation = CompleteSequences * MaxFetchesPerSeq + >= Shortfall, + Zipped = case HasTailViolation andalso CanSolveViolation + of + true -> + Partitions = lists:sort([I + || {I, _} + <- riak_core_ring:all_owners(Ring)]), + Nodelist = solve_tail_violations(RingSize, Nodes, + Shortfall, + MinFetchesPerSeq), + lists:zip(Partitions, lists:flatten(Nodelist)); + false -> diagonal_stripe(Ring, 
Nodes) + end, + lists:foldl(fun ({P, N}, Acc) -> riak_core_ring:transfer_node(P, N, Acc) end, - Ring, - Zipped). - + Ring, Zipped). %% @private every module has a ceiling function -spec ceiling(float()) -> integer(). + ceiling(F) -> T = trunc(F), case F - T == 0 of - true -> - T; - false -> - T + 1 + true -> T; + false -> T + 1 end. - %% @private rem_fill increase the tail so that there is no wrap around %% preflist violation, by taking a `Shortfall' number nodes from %% earlier in the preflist --spec solve_tail_violations(integer(), [node()], integer(), integer()) -> [node()]. -solve_tail_violations(RingSize, Nodes, Shortfall, MinFetchesPerSeq) -> - StartingNode = (RingSize rem length(Nodes)) + 1, - build_nodelist(RingSize, Nodes, Shortfall, StartingNode, MinFetchesPerSeq, []). +-spec solve_tail_violations(integer(), [node()], + integer(), integer()) -> [node()]. + +solve_tail_violations(RingSize, Nodes, Shortfall, + MinFetchesPerSeq) -> + StartingNode = RingSize rem length(Nodes) + 1, + build_nodelist(RingSize, Nodes, Shortfall, StartingNode, + MinFetchesPerSeq, []). %% @private build the node list by building tail to satisfy TargetN, then removing %% the added nodes from earlier segments --spec build_nodelist(integer(), [node()], integer(), integer(), integer(), [node()]) -> [node()]. -build_nodelist(RingSize, Nodes, _Shortfall=0, _NodeCounter, _MinFetchesPerSeq, Acc) -> +-spec build_nodelist(integer(), [node()], integer(), + integer(), integer(), [node()]) -> [node()]. 
+ +build_nodelist(RingSize, Nodes, _Shortfall = 0, + _NodeCounter, _MinFetchesPerSeq, Acc) -> %% Finished shuffling, backfill if required ShuffledRing = lists:flatten(Acc), backfill_ring(RingSize, Nodes, - (RingSize-length(ShuffledRing)) div (length(Nodes)), Acc); -build_nodelist(RingSize, Nodes, Shortfall, NodeCounter, MinFetchesPerSeq, _Acc=[]) -> + (RingSize - length(ShuffledRing)) div length(Nodes), + Acc); +build_nodelist(RingSize, Nodes, Shortfall, NodeCounter, + MinFetchesPerSeq, _Acc = []) -> %% Build the tail with sufficient nodes to satisfy TargetN NodeCount = length(Nodes), - LastSegLength = (RingSize rem NodeCount) + Shortfall, + LastSegLength = RingSize rem NodeCount + Shortfall, NewSeq = lists:sublist(Nodes, 1, LastSegLength), - build_nodelist(RingSize, Nodes, Shortfall, NodeCounter, MinFetchesPerSeq, NewSeq); -build_nodelist(RingSize, Nodes, Shortfall, NodeCounter, MinFetchesPerSeq, Acc) -> - %% Build rest of list, subtracting minimum of MinFetchesPerSeq, Shortfall + build_nodelist(RingSize, Nodes, Shortfall, NodeCounter, + MinFetchesPerSeq, NewSeq); +build_nodelist(RingSize, Nodes, Shortfall, NodeCounter, + MinFetchesPerSeq, Acc) -> + %% Build rest of list, subtracting minimum of MinFetchesPerSeq, Shortfall %% or (NodeCount - NodeCounter) each time NodeCount = length(Nodes), - NodesToRemove = min(min(MinFetchesPerSeq, Shortfall), NodeCount - NodeCounter), - RemovalList = lists:sublist(Nodes, NodeCounter, NodesToRemove), - NewSeq = lists:subtract(Nodes,RemovalList), + NodesToRemove = min(min(MinFetchesPerSeq, Shortfall), + NodeCount - NodeCounter), + RemovalList = lists:sublist(Nodes, NodeCounter, + NodesToRemove), + NewSeq = lists:subtract(Nodes, RemovalList), NewNodeCounter = NodeCounter + NodesToRemove, - build_nodelist(RingSize, Nodes, Shortfall - NodesToRemove, NewNodeCounter, - MinFetchesPerSeq, [ NewSeq | Acc]). + build_nodelist(RingSize, Nodes, + Shortfall - NodesToRemove, NewNodeCounter, + MinFetchesPerSeq, [NewSeq | Acc]). 
%% @private Backfill the ring with full sequences --spec backfill_ring(integer(), [node()], integer(), [node()]) -> [node()]. -backfill_ring(_RingSize, _Nodes, _Remaining=0, Acc) -> +-spec backfill_ring(integer(), [node()], integer(), + [node()]) -> [node()]. + +backfill_ring(_RingSize, _Nodes, _Remaining = 0, Acc) -> Acc; backfill_ring(RingSize, Nodes, Remaining, Acc) -> - backfill_ring(RingSize, Nodes, Remaining - 1, [Nodes | Acc]). - + backfill_ring(RingSize, Nodes, Remaining - 1, + [Nodes | Acc]). claim_rebalance_n(Ring, Node) -> - Nodes = lists:usort([Node|riak_core_ring:claiming_members(Ring)]), + Nodes = lists:usort([Node + | riak_core_ring:claiming_members(Ring)]), Zipped = diagonal_stripe(Ring, Nodes), - - lists:foldl(fun({P, N}, Acc) -> + lists:foldl(fun ({P, N}, Acc) -> riak_core_ring:transfer_node(P, N, Acc) end, - Ring, - Zipped). + Ring, Zipped). diagonal_stripe(Ring, Nodes) -> %% diagonal stripes guarantee most disperse data - Partitions = lists:sort([ I || {I, _} <- riak_core_ring:all_owners(Ring) ]), + Partitions = lists:sort([I + || {I, _} <- riak_core_ring:all_owners(Ring)]), Zipped = lists:zip(Partitions, - lists:sublist( - lists:flatten( - lists:duplicate( - 1+(length(Partitions) div length(Nodes)), - Nodes)), - 1, length(Partitions))), + lists:sublist(lists:flatten(lists:duplicate(1 + + length(Partitions) + div + length(Nodes), + Nodes)), + 1, length(Partitions))), Zipped. random_choose_claim(Ring) -> @@ -502,7 +534,8 @@ random_choose_claim(Ring, Node, _Params) -> %% @spec never_wants_claim(riak_core_ring()) -> no %% @doc For use by nodes that should not claim any partitions. never_wants_claim(_) -> no. -never_wants_claim(_,_) -> no. + +never_wants_claim(_, _) -> no. %% =================================================================== %% Private @@ -514,35 +547,37 @@ never_wants_claim(_,_) -> no. %% property. 
find_violations(Ring, TargetN) -> Owners = riak_core_ring:all_owners(Ring), - Suffix = lists:sublist(Owners, TargetN-1), + Suffix = lists:sublist(Owners, TargetN - 1), Owners2 = Owners ++ Suffix, %% Use a sliding window to determine violations - {Bad, _} = lists:foldl(fun(P={Idx, Owner}, {Out, Window}) -> - Window2 = lists:sublist([P|Window], TargetN-1), + {Bad, _} = lists:foldl(fun (P = {Idx, Owner}, + {Out, Window}) -> + Window2 = lists:sublist([P | Window], + TargetN - 1), case lists:keyfind(Owner, 2, Window) of - {PrevIdx, Owner} -> - {[[PrevIdx, Idx] | Out], Window2}; - false -> - {Out, Window2} + {PrevIdx, Owner} -> + {[[PrevIdx, Idx] | Out], Window2}; + false -> {Out, Window2} end - end, {[], []}, Owners2), + end, + {[], []}, Owners2), lists:reverse(Bad). %% @private %% %% @doc Counts up the number of partitions owned by each node. --spec get_counts([node()], [{integer(),_}]) -> - [{node(), non_neg_integer()}]. +-spec get_counts([node()], + [{integer(), _}]) -> [{node(), non_neg_integer()}]. + get_counts(Nodes, Ring) -> Empty = [{Node, 0} || Node <- Nodes], - Counts = lists:foldl(fun({_Idx, Node}, Counts) -> + Counts = lists:foldl(fun ({_Idx, Node}, Counts) -> case lists:member(Node, Nodes) of - true -> - dict:update_counter(Node, 1, Counts); - false -> - Counts + true -> dict:update_counter(Node, 1, Counts); + false -> Counts end - end, dict:from_list(Empty), Ring), + end, + dict:from_list(Empty), Ring), dict:to_list(Counts). %% @private @@ -556,13 +591,17 @@ add_default_deltas(IdxOwners, Deltas, Default) -> %% %% @doc Filter out candidate indices that would violate target_n given %% a node's current partition ownership. 
-prefilter_violations(Ring, Node, AllIndices, Indices, TargetN, RingSize) -> +prefilter_violations(Ring, Node, AllIndices, Indices, + TargetN, RingSize) -> CurrentIndices = riak_core_ring:indices(Ring, Node), - CurrentNth = [lists:keyfind(Idx, 2, AllIndices) || Idx <- CurrentIndices], - [{Nth, Idx} || {Nth, Idx} <- Indices, - lists:all(fun({CNth, _}) -> - spaced_by_n(CNth, Nth, TargetN, RingSize) - end, CurrentNth)]. + CurrentNth = [lists:keyfind(Idx, 2, AllIndices) + || Idx <- CurrentIndices], + [{Nth, Idx} + || {Nth, Idx} <- Indices, + lists:all(fun ({CNth, _}) -> + spaced_by_n(CNth, Nth, TargetN, RingSize) + end, + CurrentNth)]. %% @private %% @@ -576,9 +615,11 @@ prefilter_violations(Ring, Node, AllIndices, Indices, TargetN, RingSize) -> %% expected ownership. In other words, if A owns 5 partitions and %% the desired ownership is 3, then we try to claim at most 2 partitions %% from A. -select_indices(_Owners, _Deltas, [], _TargetN, _RingSize) -> +select_indices(_Owners, _Deltas, [], _TargetN, + _RingSize) -> []; -select_indices(Owners, Deltas, Indices, TargetN, RingSize) -> +select_indices(Owners, Deltas, Indices, TargetN, + RingSize) -> OwnerDT = dict:from_list(Owners), {FirstNth, _} = hd(Indices), %% The `First' symbol indicates whether or not this is the first @@ -587,23 +628,29 @@ select_indices(Owners, Deltas, Indices, TargetN, RingSize) -> %% _always_ safe to claim the first partition that another owner %% is willing to part with. It's the subsequent partitions %% claimed by this node that must not break the target_n invariant. 
- {Claim, _, _, _} = - lists:foldl(fun({Nth, Idx}, {Out, LastNth, DeltaDT, First}) -> - Owner = dict:fetch(Idx, OwnerDT), - Delta = dict:fetch(Owner, DeltaDT), - MeetsTN = spaced_by_n(LastNth, Nth, TargetN, - RingSize), - case (Delta < 0) and (First or MeetsTN) of - true -> - NextDeltaDT = - dict:update_counter(Owner, 1, DeltaDT), - {[Idx|Out], Nth, NextDeltaDT, false}; - false -> - {Out, LastNth, DeltaDT, First} - end - end, - {[], FirstNth, dict:from_list(Deltas), true}, - Indices), + {Claim, _, _, _} = lists:foldl(fun ({Nth, Idx}, + {Out, LastNth, DeltaDT, First}) -> + Owner = dict:fetch(Idx, OwnerDT), + Delta = dict:fetch(Owner, DeltaDT), + MeetsTN = spaced_by_n(LastNth, Nth, + TargetN, + RingSize), + case (Delta < 0) and + (First or MeetsTN) + of + true -> + NextDeltaDT = + dict:update_counter(Owner, + 1, + DeltaDT), + {[Idx | Out], Nth, NextDeltaDT, + false}; + false -> + {Out, LastNth, DeltaDT, First} + end + end, + {[], FirstNth, dict:from_list(Deltas), true}, + Indices), lists:reverse(Claim). %% @private @@ -611,75 +658,60 @@ select_indices(Owners, Deltas, Indices, TargetN, RingSize) -> %% @doc Determine if two positions in the ring meet target_n spacing. spaced_by_n(NthA, NthB, TargetN, RingSize) -> case NthA > NthB of - true -> - NFwd = NthA - NthB, - NBack = NthB - NthA + RingSize; - false -> - NFwd = NthA - NthB + RingSize, - NBack = NthB - NthA + true -> + NFwd = NthA - NthB, NBack = NthB - NthA + RingSize; + false -> + NFwd = NthA - NthB + RingSize, NBack = NthB - NthA end, (NFwd >= TargetN) and (NBack >= TargetN). - %% For each node in wants, work out how many more partition each node wants (positive) or is %% overloaded by (negative) compared to what it owns. wants_owns_diff(Wants, Owns) -> - [ case lists:keyfind(N, 1, Owns) of - {N, O} -> - {N, W - O}; - false -> - {N,W} - end || {N, W} <- Wants ]. - + [case lists:keyfind(N, 1, Owns) of + {N, O} -> {N, W - O}; + false -> {N, W} + end + || {N, W} <- Wants]. 
+ %% Given a ring, work out how many partition each wants to be %% considered balanced wants(Ring) -> - Active = lists:sort(riak_core_ring:claiming_members(Ring)), + Active = + lists:sort(riak_core_ring:claiming_members(Ring)), Inactive = riak_core_ring:all_members(Ring) -- Active, Q = riak_core_ring:num_partitions(Ring), - ActiveWants = lists:zip(Active, wants_counts(length(Active), Q)), - InactiveWants = [ {N, 0} || N <- Inactive ], + ActiveWants = lists:zip(Active, + wants_counts(length(Active), Q)), + InactiveWants = [{N, 0} || N <- Inactive], lists:sort(ActiveWants ++ InactiveWants). %% @private -%% Given a number of nodes and ring size, return a list of +%% Given a number of nodes and ring size, return a list of %% desired ownership, S long that add up to Q wants_counts(S, Q) -> Max = roundup(Q / S), - case S * Max - Q of - 0 -> - lists:duplicate(S, Max); - X -> - lists:duplicate(X, Max - 1) ++ lists:duplicate(S - X, Max) + case S * Max - Q of + 0 -> lists:duplicate(S, Max); + X -> + lists:duplicate(X, Max - 1) ++ + lists:duplicate(S - X, Max) end. %% Round up to next whole integer - ceil roundup(I) when I >= 0 -> T = erlang:trunc(I), - case (I - T) of - Neg when Neg < 0 -> T; - Pos when Pos > 0 -> T + 1; - _ -> T + case I - T of + Neg when Neg < 0 -> T; + Pos when Pos > 0 -> T + 1; + _ -> T end. - -%% @private -%% Get active nodes ordered by take location parameters into account --spec get_nodes_by_location([node()|undefined], riak_core_ring:riak_core_ring()) -> - [node()|undefined]. -get_nodes_by_location(Nodes, Ring) -> - NodesLocations = riak_core_ring:get_nodes_locations(Ring), - case riak_core_location:has_location_set_in_cluster(NodesLocations) of - false -> - Nodes; - true -> - riak_core_location:stripe_nodes_by_location(Nodes, NodesLocations) - end. - %% =================================================================== %% Unit tests %% =================================================================== -ifdef(TEST). + -compile(export_all). 
-include_lib("eunit/include/eunit.hrl"). @@ -688,19 +720,21 @@ wants_claim_test() -> riak_core_ring_manager:setup_ets(test), riak_core_test_util:setup_mockring1(), {ok, Ring} = riak_core_ring_manager:get_my_ring(), - ?assertEqual({yes, 1}, default_wants_claim(Ring)), + ?assertEqual({yes, 1}, (default_wants_claim(Ring))), riak_core_ring_manager:cleanup_ets(test), riak_core_ring_manager:stop(). - %% @private console helper function to return node lists for claiming %% partitions --spec gen_diag(pos_integer(), pos_integer()) -> [Node::atom()]. +-spec gen_diag(pos_integer(), pos_integer()) -> [Node :: + atom()]. + gen_diag(RingSize, NodeCount) -> - Nodes = [list_to_atom(lists:concat(["n_", N])) || N <- lists:seq(1, NodeCount)], + Nodes = [list_to_atom(lists:concat(["n_", N])) + || N <- lists:seq(1, NodeCount)], {HeadNode, RestNodes} = {hd(Nodes), tl(Nodes)}, R0 = riak_core_ring:fresh(RingSize, HeadNode), - RAdded = lists:foldl(fun(Node, Racc) -> + RAdded = lists:foldl(fun (Node, Racc) -> riak_core_ring:add_member(HeadNode, Racc, Node) end, R0, RestNodes), @@ -710,388 +744,13 @@ gen_diag(RingSize, NodeCount) -> %% @private call with result of gen_diag/1 only, does the list have %% tail violations, returns true if so, false otherwise. --spec has_violations([Node::atom()]) -> boolean(). +-spec has_violations([Node :: atom()]) -> boolean(). + has_violations(Diag) -> RS = length(Diag), NC = length(lists:usort(Diag)), Overhang = RS rem NC, - (Overhang > 0 andalso Overhang < 4). %% hardcoded target n of 4 - - --ifdef(EQC). - --export([prop_claim_ensures_unique_nodes/1, prop_wants/0, prop_wants_counts/0, eqc_check/2]). --include_lib("eqc/include/eqc.hrl"). --include_lib("eunit/include/eunit.hrl"). - - --define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). - --define(POW_2(N), trunc(math:pow(2, N))). - -eqc_check(File, Prop) -> - {ok, Bytes} = file:read_file(File), - CE = binary_to_term(Bytes), - eqc:check(Prop, CE). 
- -test_nodes(Count) -> - [node() | [list_to_atom(lists:concat(["n_", N])) || N <- lists:seq(1, Count-1)]]. - -test_nodes(Count, StartNode) -> - [list_to_atom(lists:concat(["n_", N])) || N <- lists:seq(StartNode, StartNode + Count)]. - -property_claim_ensures_unique_nodes_v2_test_() -> - Prop = eqc:testing_time(30, ?QC_OUT(prop_claim_ensures_unique_nodes(choose_claim_v2))), - {timeout, 120, fun() -> ?assert(eqc:quickcheck(Prop)) end}. - -property_claim_ensures_unique_nodes_adding_groups_v2_test_() -> - Prop = eqc:testing_time(30, ?QC_OUT(prop_claim_ensures_unique_nodes_adding_groups(choose_claim_v2))), - {timeout, 120, fun() -> ?assert(eqc:quickcheck(Prop)) end}. - -property_claim_ensures_unique_nodes_adding_singly_v2_test_() -> - Prop = eqc:testing_time(30, ?QC_OUT(prop_claim_ensures_unique_nodes_adding_singly(choose_claim_v2))), - {timeout, 120, fun() -> ?assert(eqc:quickcheck(Prop)) end}. - -prop_claim_ensures_unique_nodes(ChooseFun) -> - %% NOTE: We know that this doesn't work for the case of {_, 3}. 
- %% NOTE2: uses undocumented "double_shrink", is expensive, but should get - %% around those case where we shrink to a non-minimal case because - %% some intermediate combinations of ring_size/node have no violations - ?FORALL({PartsPow, NodeCount}, eqc_gen:double_shrink({choose(4, 9), choose(4, 15)}), - begin - Nval = 3, - TNval = Nval + 1, - _Params = [{target_n_val, TNval}], - - Partitions = ?POW_2(PartsPow), - [Node0 | RestNodes] = test_nodes(NodeCount), - - R0 = riak_core_ring:fresh(Partitions, Node0), - RAdded = lists:foldl(fun(Node, Racc) -> - riak_core_ring:add_member(Node0, Racc, Node) - end, R0, RestNodes), - - Rfinal = claim(RAdded, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}), - - Preflists = riak_core_ring:all_preflists(Rfinal, Nval), - ImperfectPLs = orddict:to_list( - lists:foldl(fun(PL,Acc) -> - PLNodes = lists:usort([N || {_,N} <- PL]), - case length(PLNodes) of - Nval -> - Acc; - _ -> - ordsets:add_element(PL, Acc) - end - end, [], Preflists)), - - ?WHENFAIL( - begin - io:format(user, "{Partitions, Nodes} {~p, ~p}~n", - [Partitions, NodeCount]), - io:format(user, "Owners: ~p~n", - [riak_core_ring:all_owners(Rfinal)]) - end, - conjunction([{meets_target_n, - equals({true,[]}, - meets_target_n(Rfinal, TNval))}, - {perfect_preflists, equals([], ImperfectPLs)}, - {balanced_ring, balanced_ring(Partitions, NodeCount, Rfinal)}])) - end). - - -prop_claim_ensures_unique_nodes_adding_groups(ChooseFun) -> - %% NOTE: We know that this doesn't work for the case of {_, 3}. 
- %% NOTE2: uses undocumented "double_shrink", is expensive, but should get - %% around those case where we shrink to a non-minimal case because - %% some intermediate combinations of ring_size/node have no violations - ?FORALL({PartsPow, BaseNodes, AddedNodes}, - eqc_gen:double_shrink({choose(4, 9), choose(2, 10), choose(2, 5)}), - begin - Nval = 3, - TNval = Nval + 1, - _Params = [{target_n_val, TNval}], - - Partitions = ?POW_2(PartsPow), - [Node0 | RestNodes] = test_nodes(BaseNodes), - AddNodes = test_nodes(AddedNodes-1, BaseNodes), - NodeCount = BaseNodes + AddedNodes, - %% io:format("Base: ~p~n",[[Node0 | RestNodes]]), - %% io:format("Added: ~p~n",[AddNodes]), - - R0 = riak_core_ring:fresh(Partitions, Node0), - RBase = lists:foldl(fun(Node, Racc) -> - riak_core_ring:add_member(Node0, Racc, Node) - end, R0, RestNodes), - - Rinterim = claim(RBase, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}), - RAdded = lists:foldl(fun(Node, Racc) -> - riak_core_ring:add_member(Node0, Racc, Node) - end, Rinterim, AddNodes), - - Rfinal = claim(RAdded, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}), - - Preflists = riak_core_ring:all_preflists(Rfinal, Nval), - ImperfectPLs = orddict:to_list( - lists:foldl(fun(PL,Acc) -> - PLNodes = lists:usort([N || {_,N} <- PL]), - case length(PLNodes) of - Nval -> - Acc; - _ -> - ordsets:add_element(PL, Acc) - end - end, [], Preflists)), - - ?WHENFAIL( - begin - io:format(user, "{Partitions, Nodes} {~p, ~p}~n", - [Partitions, NodeCount]), - io:format(user, "Owners: ~p~n", - [riak_core_ring:all_owners(Rfinal)]) - end, - conjunction([{meets_target_n, - equals({true,[]}, - meets_target_n(Rfinal, TNval))}, - {perfect_preflists, equals([], ImperfectPLs)}, - {balanced_ring, balanced_ring(Partitions, NodeCount, Rfinal)}])) - end). - - -prop_claim_ensures_unique_nodes_adding_singly(ChooseFun) -> - %% NOTE: We know that this doesn't work for the case of {_, 3}. 
- %% NOTE2: uses undocumented "double_shrink", is expensive, but should get - %% around those case where we shrink to a non-minimal case because - %% some intermediate combinations of ring_size/node have no violations - ?FORALL({PartsPow, NodeCount}, eqc_gen:double_shrink({choose(4, 9), choose(4, 15)}), - begin - Nval = 3, - TNval = Nval + 1, - Params = [{target_n_val, TNval}], - - Partitions = ?POW_2(PartsPow), - [Node0 | RestNodes] = test_nodes(NodeCount), - - R0 = riak_core_ring:fresh(Partitions, Node0), - Rfinal = lists:foldl(fun(Node, Racc) -> - Racc0 = riak_core_ring:add_member(Node0, Racc, Node), - %% TODO which is it? Claim or ChooseFun?? - %%claim(Racc0, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}) - ?MODULE:ChooseFun(Racc0, Node, Params) - end, R0, RestNodes), - Preflists = riak_core_ring:all_preflists(Rfinal, Nval), - ImperfectPLs = orddict:to_list( - lists:foldl(fun(PL,Acc) -> - PLNodes = lists:usort([N || {_,N} <- PL]), - case length(PLNodes) of - Nval -> - Acc; - _ -> - ordsets:add_element(PL, Acc) - end - end, [], Preflists)), - - ?WHENFAIL( - begin - io:format(user, "{Partitions, Nodes} {~p, ~p}~n", - [Partitions, NodeCount]), - io:format(user, "Owners: ~p~n", - [riak_core_ring:all_owners(Rfinal)]) - end, - conjunction([{meets_target_n, - equals({true,[]}, - meets_target_n(Rfinal, TNval))}, - {perfect_preflists, equals([], ImperfectPLs)}, - {balanced_ring, balanced_ring(Partitions, NodeCount, Rfinal)}])) - end). - - - -%% @private check that no node claims more than it should --spec balanced_ring(RingSize::integer(), NodeCount::integer(), - riak_core_ring:riak_core_ring()) -> - boolean(). 
-balanced_ring(RingSize, NodeCount, Ring) -> - TargetClaim = ceiling(RingSize / NodeCount), - MinClaim = RingSize div NodeCount, - AllOwners0 = riak_core_ring:all_owners(Ring), - AllOwners = lists:keysort(2, AllOwners0), - {BalancedMax, AccFinal} = lists:foldl(fun({_Part, Node}, {_Balanced, [{Node, Cnt} | Acc]}) when Cnt >= TargetClaim -> - {false, [{Node, Cnt+1} | Acc]}; - ({_Part, Node}, {Balanced, [{Node, Cnt} | Acc]}) -> - {Balanced, [{Node, Cnt+1} | Acc]}; - ({_Part, NewNode}, {Balanced, Acc}) -> - {Balanced, [{NewNode, 1} | Acc]} - end, - {true, []}, - AllOwners), - BalancedMin = lists:all(fun({_Node, Cnt}) -> Cnt >= MinClaim end, AccFinal), - case BalancedMax andalso BalancedMin of - true -> - true; - false -> - {TargetClaim, MinClaim, lists:sort(AccFinal)} - end. - + Overhang > 0 andalso + Overhang < 4. %% hardcoded target n of 4 -wants_counts_test() -> - ?assert(eqc:quickcheck(?QC_OUT((prop_wants_counts())))). - -prop_wants_counts() -> - ?FORALL({S, Q}, {large_pos(100), large_pos(100000)}, - begin - Wants = wants_counts(S, Q), - conjunction([{len, equals(S, length(Wants))}, - {sum, equals(Q, lists:sum(Wants))}]) - end). - -wants_test() -> - ?assert(eqc:quickcheck(?QC_OUT((prop_wants())))). 
- -prop_wants() -> - ?FORALL({NodeStatus, Q}, - {?SUCHTHAT(L, non_empty(list(elements([leaving, joining]))), - lists:member(joining, L)), - ?LET(X, choose(1,16), trunc(math:pow(2, X)))}, - begin - R0 = riak_core_ring:fresh(Q, tnode(1)), - {_, R2, Active} = - lists:foldl( - fun(S, {I, R1, A1}) -> - N = tnode(I), - case S of - joining -> - {I+1, riak_core_ring:add_member(N, R1, N), [N|A1]}; - _ -> - {I+1, riak_core_ring:leave_member(N, R1, N), A1} - end - end, {1, R0, []}, NodeStatus), - Wants = wants(R2), - - %% Check any non-claiming nodes are set to 0 - %% Check all nodes are present - {ActiveWants, InactiveWants} = - lists:partition(fun({N,_W}) -> lists:member(N, Active) end, Wants), - - ActiveSum = lists:sum([W || {_,W} <- ActiveWants]), - InactiveSum = lists:sum([W || {_,W} <- InactiveWants]), - ?WHENFAIL( - begin - io:format(user, "NodeStatus: ~p\n", [NodeStatus]), - io:format(user, "Active: ~p\n", [Active]), - io:format(user, "Q: ~p\n", [Q]), - io:format(user, "Wants: ~p\n", [Wants]), - io:format(user, "ActiveWants: ~p\n", [ActiveWants]), - io:format(user, "InactiveWants: ~p\n", [InactiveWants]) - end, - conjunction([{wants, equals(length(Wants), length(NodeStatus))}, - {active, equals(Q, ActiveSum)}, - {inactive, equals(0, InactiveSum)}])) - end). - -%% Large positive integer between 1 and Max -large_pos(Max) -> - ?LET(X, largeint(), 1 + (abs(X) rem Max)). - -take_idxs_test() -> - ?assert(eqc:quickcheck(?QC_OUT((prop_take_idxs())))). 
- -prop_take_idxs() -> - ?FORALL({OwnersSeed, CIdxsSeed, ExchangesSeed, TNSeed}, - {non_empty(list(largeint())), % [OwnerSeed] - non_empty(list(largeint())), % [CIdxSeed] - non_empty(list({int(), int()})), % {GiveSeed, TakeSeed} - int()}, % TNSeed - begin - %% Generate Nis - duplicate owners seed to make sure Q > S - S = length(ExchangesSeed), - Dup = roundup(S / length(OwnersSeed)), - Owners = lists:flatten( - lists:duplicate(Dup, - [tnode(abs(OwnerSeed) rem S) || - OwnerSeed <- OwnersSeed])), - Q = length(Owners), - TN = 1+abs(TNSeed), - - - Ownership0 = orddict:from_list([{tnode(I), []} || I <- lists:seq(0, S -1)]), - Ownership = lists:foldl(fun({I,O},A) -> - orddict:append_list(O, [I], A) - end, - Ownership0, - lists:zip(lists:seq(0, Q-1), Owners)), - NIs = [{Node, undefined, Owned} || {Node, Owned} <- Ownership], - - %% Generate claimable indices - CIdxs = ordsets:from_list([abs(Idx) rem Q || Idx <- CIdxsSeed]), - - %% io:format(user, "ExchangesSeed (~p): ~p\n", [length(ExchangesSeed), - %% ExchangesSeed]), - %% io:format(user, "NIs (~p): ~p\n", [length(NIs), NIs]), - - %% Generate exchanges - Exchanges = [{Node, % node name - abs(GiveSeed) rem (length(OIdxs) + 1), % maximum indices to give - abs(TakeSeed) rem (Q+1), % maximum indices to take - CIdxs} || % indices that can be claimed by node - {{Node, _Want, OIdxs}, {GiveSeed, TakeSeed}} <- - lists:zip(NIs, ExchangesSeed)], - - %% Fire the test - NIs2 = take_idxs(Exchanges, NIs, Q, TN), - - %% Check All nodes are still in NIs - %% Check that no node lost more than it wanted to give - ?WHENFAIL( - begin - io:format(user, "Exchanges:\n~p\n", [Exchanges]), - io:format(user, "NIs:\n~p\n", [NIs]), - io:format(user, "NIs2:\n~p\n", [NIs2]), - io:format(user, "Q: ~p\nTN: ~p\n", [Q, TN]) - end, - check_deltas(Exchanges, NIs, NIs2, Q, TN)) - %% conjunction([{len, equals(length(NIs), length(NIs2))}, - %% {delta, check_deltas(Exchanges, NIs, NIs2, Q, TN)}])) - end). - -tnode(I) -> - list_to_atom("n"++integer_to_list(I)). 
- -%% Check that no node gained more than it wanted to take -%% Check that none of the nodes took more partitions than allowed -%% Check that no nodes violate target N -check_deltas(Exchanges, Before, After, Q, TN) -> - conjunction( - lists:flatten( - [begin - Gave = length(OIdxs1 -- OIdxs2), % in original and not new - Took = length(OIdxs2 -- OIdxs1), - V1 = count_violations(OIdxs1, Q, TN), - V2 = count_violations(OIdxs2, Q, TN), - [{{give, Node, Gave, Give}, Gave =< Give}, - {{take, Node, Took, Take}, Took =< Take}, - {{valid, Node, V1, V2}, - V2 == 0 orelse - V1 > 0 orelse % check no violations if there were not before - OIdxs1 == []}] % or the node held no indices so violation was impossible - end || {{Node, Give, Take, _CIdxs}, {Node, _Want1, OIdxs1}, {Node, _Want2, OIdxs2}} <- - lists:zip3(lists:sort(Exchanges), lists:sort(Before), lists:sort(After))])). - -count_violations([], _Q, _TN) -> - 0; -count_violations(Idxs, Q, TN) -> - SOIdxs = lists:sort(Idxs), - {_, Violations} = lists:foldl( - fun(This,{Last,Vs}) -> - case Last - This >= TN of - true -> - {This, Vs}; - _ -> - {This, Vs + 1} - end - end, {Q + hd(SOIdxs), 0}, lists:reverse(SOIdxs)), - Violations. - --endif. % EQC --endif. % TEST +-endif. diff --git a/src/riak_core_claim_util.erl b/src/riak_core_claim_util.erl index 1453ea7d9..ac7378f04 100644 --- a/src/riak_core_claim_util.erl +++ b/src/riak_core_claim_util.erl @@ -24,94 +24,73 @@ -module(riak_core_claim_util). --export([ring_stats/2, violation_stats/2, balance_stats/1, diversity_stats/2]). --export([print_failure_analysis/3, failure_analysis/3, node_sensitivity/3, node_load/3, - print_analysis/1, print_analysis/2, sort_by_down_fbmax/1]). --export([adjacency_matrix/1, summarize_am/1, adjacency_matrix_from_al/1, - adjacency_list/1, fixup_dam/2, score_am/2, count/2, rms/1]). --export([make_ring/1, gen_complete_diverse/1, gen_complete_len/1, construct/3]). 
--export([num_perms/2, num_combs/2, fac/1, perm_gen/1, down_combos/2, - rotations/1, substitutions/2]). --ifdef(TEST). --ifdef(EQC). --include_lib("eqc/include/eqc.hrl"). --endif. --include_lib("eunit/include/eunit.hrl"). --endif. - --record(load, {node, % Node name - num_pri, % Number of primaries - num_fb, % Number of fallbacks - norm_fb}). % Normalised fallbacks - ratio of how many there are - --record(failure, {down = [], % List of downed nodes - load = [], % List of #load{} records per up node - fbmin, - fbmean, - fbstddev, - fb10, - fb90, - fbmax}). - --define(DICT, dict). % macro for dictionary implementation, simplifies debugging - - +-export([ring_stats/2, violation_stats/2, + balance_stats/1, diversity_stats/2]). + +-export([node_load/3, print_analysis/1, + print_analysis/2, sort_by_down_fbmax/1]). + +-export([adjacency_matrix/1, summarize_am/1, + adjacency_matrix_from_al/1, adjacency_list/1, + fixup_dam/2, score_am/2, count/2, rms/1]). + +-export([make_ring/1, gen_complete_diverse/1, + gen_complete_len/1, construct/3]). + +-export([num_perms/2, num_combs/2, fac/1, perm_gen/1, + down_combos/2, rotations/1, substitutions/2]). + +-record(load, + {node, % Node name + num_pri, % Number of primaries + num_fb, % Number of fallbacks + norm_fb}). % Normalised fallbacks - ratio of how many there are + +-record(failure, + {down = [], % List of downed nodes + load = [], % List of #load{} records per up node + fbmin, fbmean, fbstddev, fb10, fb90, fbmax}). + %% ------------------------------------------------------------------- %% Ring statistics %% ------------------------------------------------------------------- ring_stats(R, TN) -> violation_stats(R, TN) ++ - balance_stats(R) ++ - diversity_stats(R, TN). - + balance_stats(R) ++ diversity_stats(R, TN). + %% TargetN violations violation_stats(R, TN) -> - [{violations, length(riak_core_ring_util:check_ring(R, TN))}]. + [{violations, + length(riak_core_ring_util:check_ring(R, TN))}]. 
balance_stats(R) -> Q = riak_core_ring:num_partitions(R), M = length(riak_core_ring:claiming_members(R)), AllOwners = riak_core_ring:all_owners(R), - Counts = lists:foldl(fun({_,N},A) -> orddict:update_counter(N,1,A) end, [], AllOwners), + Counts = lists:foldl(fun ({_, N}, A) -> + orddict:update_counter(N, 1, A) + end, + [], AllOwners), Avg = Q / M, Balance = lists:sum([begin - Delta = trunc(Avg - Count), - Delta * Delta - end || {_, Count} <- Counts]), - [{balance, Balance}, - {ownership, Counts}]. + Delta = trunc(Avg - Count), Delta * Delta + end + || {_, Count} <- Counts]), + [{balance, Balance}, {ownership, Counts}]. diversity_stats(R, TN) -> {_, Owners} = lists:unzip(riak_core_ring:all_owners(R)), AM = adjacency_matrix(Owners), - try - [{diversity, riak_core_claim_util:score_am(AM, TN)}] + try [{diversity, riak_core_claim_util:score_am(AM, TN)}] catch - _:empty_list -> - [{diversity, undefined}] + _:empty_list -> [{diversity, undefined}] end. - + %% ------------------------------------------------------------------- %% Failure analysis %% ------------------------------------------------------------------- -%% Print failure analysis on standard_io -print_failure_analysis(R, TargetN, NumFailures) -> - print_analysis(failure_analysis(R, TargetN, NumFailures)). - -failure_analysis(R, TargetN, NumFailures) -> - sort_by_down_fbmax(node_sensitivity(R, TargetN, NumFailures)). - -%% Mark each node down in turn and see how the spread of load is. -%% -%% Return a list of failures records, one for each case examined -node_sensitivity(R, NVal, Depth) -> - Members = riak_core_ring:all_members(R), - DownCombos = down_combos(Depth, Members), - LoadCombos = [{Down, node_load(R, NVal, Down)} || Down <- DownCombos], - [analyze_load(Down, Load) || {Down, Load} <- LoadCombos]. - %% For a given ring, with a list of downed nodes, compute %% all preference lists then count up the number that %% each node participates in. 
@@ -120,44 +99,33 @@ node_sensitivity(R, NVal, Depth) -> %% node_load(R, NVal, DownNodes) -> VL = vnode_load(R, NVal, DownNodes), - TotFBs = lists:sum([NumFBs || {_N,_,NumFBs} <- VL]), - [#load{node = N, - num_pri = NumPris, - num_fb = NumFBs, - norm_fb = norm_fb(NumFBs, TotFBs)} || - {N, NumPris, NumFBs} <- VL]. + TotFBs = lists:sum([NumFBs || {_N, _, NumFBs} <- VL]), + [#load{node = N, num_pri = NumPris, num_fb = NumFBs, + norm_fb = norm_fb(NumFBs, TotFBs)} + || {N, NumPris, NumFBs} <- VL]. vnode_load(R, NVal, DownNodes) -> UpNodes = riak_core_ring:all_members(R) -- DownNodes, - Keys = [<<(I+1):160/integer>> || - {I,_Owner} <- riak_core_ring:all_owners(R)], + Keys = [<<(I + 1):160/integer>> + || {I, _Owner} <- riak_core_ring:all_owners(R)], %% NValParts = Nval * riak_core_ring:num_partitions(R), - AllPLs = [riak_core_apl:get_apl_ann(Key, NVal, R, UpNodes) || Key <- Keys], + AllPLs = [riak_core_apl:get_apl_ann(Key, NVal, R, + UpNodes) + || Key <- Keys], FlatPLs = lists:flatten(AllPLs), [begin - Pris = lists:usort([_Idx || {{_Idx, PN}, primary} <- FlatPLs, PN == N]), - FBs = lists:usort([_Idx || {{_Idx, FN}, fallback} <- FlatPLs, FN == N]) -- Pris, - {N, length(Pris), length(FBs)} - end || N <- UpNodes]. + Pris = lists:usort([Idx + || {{Idx, PN}, primary} <- FlatPLs, PN == N]), + FBs = lists:usort([Idx + || {{Idx, FN}, fallback} <- FlatPLs, FN == N]) + -- Pris, + {N, length(Pris), length(FBs)} + end + || N <- UpNodes]. %% @private Normalize fallbacks -norm_fb(_, 0) -> - 0; -norm_fb(Num, Tot) -> - Num / Tot. 
- -%% @private analyze the load on each -analyze_load(Down, Load) -> - FBStats = lists:foldl(fun(#load{num_fb = NumFB}, Acc) -> - basho_stats_histogram:update(NumFB, Acc) - end, - basho_stats_histogram:new(1, 1024, 1024), Load), - {FBMin, FBMean, FBMax, _FBVar, FBStdDev} = basho_stats_histogram:summary_stats(FBStats), - FB10 = basho_stats_histogram:quantile(0.10, FBStats), - FB90 = basho_stats_histogram:quantile(0.90, FBStats), - #failure{down = Down, load = Load, fbmin = FBMin, fbmean = FBMean, fbstddev = FBStdDev, - fb10 = FB10, fb90 = FB90, fbmax = FBMax}. - +norm_fb(_, 0) -> 0; +norm_fb(Num, Tot) -> Num / Tot. %% %% Print the load analysis for each of the combinations of down nodes analyzed @@ -166,56 +134,54 @@ analyze_load(Down, Load) -> %% DownNodes - list of nodes that were down for the calculation %% Worst - list of {node,fallbacks} showing up to the 3 worst affected nodes %% as a result of DownNodes. -%% +%% print_analysis(LoadAnalysis) -> print_analysis(standard_io, LoadAnalysis). - + print_analysis(IoDev, LoadAnalysis) -> - io:format(IoDev, " Min Mean/ SD 10th 90th Max DownNodes/Worst\n", []), + io:format(IoDev, + " Min Mean/ SD 10th 90th Max DownNodes/" + "Worst\n", + []), print_analysis1(IoDev, LoadAnalysis). 
%% @private -print_analysis1(_IoDev, []) -> - ok; -print_analysis1(IoDev, [#failure{down = Down, load = Load, fbmin = FBMin, - fbmean = FBMean, fbstddev = FBStdDev, - fb10 = FB10, fb90 = FB90, fbmax = FBMax} | Rest]) -> +print_analysis1(_IoDev, []) -> ok; +print_analysis1(IoDev, + [#failure{down = Down, load = Load, fbmin = FBMin, + fbmean = FBMean, fbstddev = FBStdDev, fb10 = FB10, + fb90 = FB90, fbmax = FBMax} + | Rest]) -> %% Find the 3 worst FBmax - Worst = - [{N,NumFB} || #load{node = N, num_fb = NumFB} <- - lists:sublist(lists:reverse(lists:keysort(#load.num_fb, Load)), 3)], - - io:format(IoDev, "~4b ~4b/~4b ~4b ~4b ~4b ~w/~w\n", - [FBMin, toint(FBMean), toint(FBStdDev), toint(FB10), toint(FB90), FBMax, Down, Worst]), + Worst = [{N, NumFB} + || #load{node = N, num_fb = NumFB} + <- lists:sublist(lists:reverse(lists:keysort(#load.num_fb, + Load)), + 3)], + io:format(IoDev, "~4b ~4b/~4b ~4b ~4b ~4b ~w/~w\n", + [FBMin, toint(FBMean), toint(FBStdDev), toint(FB10), + toint(FB90), FBMax, Down, Worst]), print_analysis1(IoDev, Rest). %% @private round to nearest int -toint(F) when is_number(F) -> - round(F+0.5); -toint(X) -> - X. +toint(F) when is_number(F) -> round(F + 0.5); +toint(X) -> X. 
%% Order failures by number of nodes down ascending, then fbmax, then down list sort_by_down_fbmax(Failures) -> - Cmp = fun(#failure{down = DownA, fbmax = FBMaxA}, - #failure{down = DownB, fbmax = FBMaxB}) -> - %% length(DownA) =< length(DownB) andalso - %% FBMaxA >= FBMaxB andalso + Cmp = fun (#failure{down = DownA, fbmax = FBMaxA}, + #failure{down = DownB, fbmax = FBMaxB}) -> + %% length(DownA) =< length(DownB) andalso + %% FBMaxA >= FBMaxB andalso %% DownA =< DownB case {length(DownA), length(DownB)} of - {DownALen, DownBLen} when DownALen < DownBLen -> - true; - {DownALen, DownBLen} when DownALen > DownBLen -> - false; - _ -> - if - FBMaxA > FBMaxB -> - true; - FBMaxA < FBMaxB -> - false; - true -> - DownA >= DownB - end + {DownALen, DownBLen} when DownALen < DownBLen -> true; + {DownALen, DownBLen} when DownALen > DownBLen -> false; + _ -> + if FBMaxA > FBMaxB -> true; + FBMaxA < FBMaxB -> false; + true -> DownA >= DownB + end end end, lists:sort(Cmp, Failures). @@ -276,11 +242,11 @@ sort_by_down_fbmax(Failures) -> adjacency_matrix(Owners) -> M = lists:usort(Owners), Tid = ets:new(am, [private, duplicate_bag]), - try - adjacency_matrix_populate(Tid, M, Owners, Owners++Owners), + try adjacency_matrix_populate(Tid, M, Owners, + Owners ++ Owners), adjacency_matrix_result(Tid, ets:first(Tid), []) after - ets:delete(Tid) + ets:delete(Tid) end. %% @private extract the adjacency matrix from the duplicate bag @@ -288,29 +254,35 @@ adjacency_matrix_result(_Tid, '$end_of_table', Acc) -> Acc; adjacency_matrix_result(Tid, NodePair, Acc) -> ALs = ets:lookup(Tid, NodePair), - Ds = [ D || {_, D} <- ALs ], - adjacency_matrix_result(Tid, ets:next(Tid, NodePair), [{NodePair, Ds} | Acc]). + Ds = [D || {_, D} <- ALs], + adjacency_matrix_result(Tid, ets:next(Tid, NodePair), + [{NodePair, Ds} | Acc]). 
adjacency_matrix_populate(_Tid, _M, [], _OwnersCycle) -> ok; -adjacency_matrix_populate(Tid, M, [Node | Owners], [Node | OwnersCycle]) -> - adjacency_matrix_add_dist(Tid, Node, M--[Node], OwnersCycle, 0), +adjacency_matrix_populate(Tid, M, [Node | Owners], + [Node | OwnersCycle]) -> + adjacency_matrix_add_dist(Tid, Node, M -- [Node], + OwnersCycle, 0), adjacency_matrix_populate(Tid, M, Owners, OwnersCycle). %% @private Compute the distance from node to the next of M nodes -adjacency_matrix_add_dist(_Tid, _Node, _M, [], _) -> +adjacency_matrix_add_dist(_Tid, _Node, _M, [], _) -> ok; +adjacency_matrix_add_dist(_Tid, _Node, [], _OwnersCycle, + _) -> ok; -adjacency_matrix_add_dist(_Tid, _Node, [], _OwnersCycle, _) -> - ok; -adjacency_matrix_add_dist(Tid, Node, M, [OtherNode | OwnersCycle], Distance) -> +adjacency_matrix_add_dist(Tid, Node, M, + [OtherNode | OwnersCycle], Distance) -> case lists:member(OtherNode, M) of - true -> % haven't seen this node yet, add distance - ets:insert(Tid, {{Node, OtherNode}, Distance}), - adjacency_matrix_add_dist(Tid, Node, M -- [OtherNode], OwnersCycle, Distance + 1); - _ -> % already passed OtherNode - adjacency_matrix_add_dist(Tid, Node, M, OwnersCycle, Distance + 1) + true -> % haven't seen this node yet, add distance + ets:insert(Tid, {{Node, OtherNode}, Distance}), + adjacency_matrix_add_dist(Tid, Node, M -- [OtherNode], + OwnersCycle, Distance + 1); + _ -> % already passed OtherNode + adjacency_matrix_add_dist(Tid, Node, M, OwnersCycle, + Distance + 1) end. 
- + %% Make adjacency summary by working out counts of each distance %% (zero-padding to make it print nicely) summarize_am(AM) -> @@ -318,73 +290,75 @@ summarize_am(AM) -> %% Take a list of distances: [4, 3, 0, 1, 1, 3, 3] and %% create a list counting distance by position [1, 2, 0, 3, 1] -count_distances([]) -> - []; +count_distances([]) -> []; count_distances(Ds) -> - MaxD = lists:max(Ds), - PosCounts = lists:foldl(fun(D,Acc) -> + MaxD = lists:max(Ds), + PosCounts = lists:foldl(fun (D, Acc) -> orddict:update_counter(D, 1, Acc) - end, - orddict:from_list([{D,0} || D <- lists:seq(0,MaxD)]), + end, + orddict:from_list([{D, 0} + || D <- lists:seq(0, MaxD)]), Ds), %% PosCounts orddict must be initialized to make sure no distances %% are missing in the list comprehension - [Count || {_Pos, Count} <- PosCounts]. + [Count || {_Pos, Count} <- PosCounts]. %% Compute adjacency matrix from an adjacency list adjacency_matrix_from_al(AL) -> %% Make a count by distance of N1,N2 - ?DICT:to_list( - lists:foldl(fun({NPair,D}, Acc) -> - ?DICT:append_list(NPair, [D], Acc) - end, ?DICT:new(), AL)). - + dict:to_list(lists:foldl(fun ({NPair, D}, Acc) -> + dict:append_list(NPair, [D], Acc) + end, + dict:new(), AL)). %% Create a pair of node names and a list of distances adjacency_list(Owners) -> M = lists:usort(Owners), - adjacency_list(M, Owners, Owners++Owners, []). + adjacency_list(M, Owners, Owners ++ Owners, []). -adjacency_list(_M, [], _OwnersCycle, Acc) -> - Acc; -adjacency_list(M, [Node | Owners], [Node | OwnersCycle], Acc) -> - adjacency_list(M, Owners, OwnersCycle, distances(Node, M--[Node], OwnersCycle, 0, Acc)). +adjacency_list(_M, [], _OwnersCycle, Acc) -> Acc; +adjacency_list(M, [Node | Owners], [Node | OwnersCycle], + Acc) -> + adjacency_list(M, Owners, OwnersCycle, + distances(Node, M -- [Node], OwnersCycle, 0, Acc)). 
%% Compute the distance from node to the next of M nodes -distances(_Node, _M, [], _, Distances) -> - Distances; +distances(_Node, _M, [], _, Distances) -> Distances; distances(_Node, [], _OwnersCycle, _, Distances) -> Distances; -distances(Node, M, [OtherNode | OwnersCycle], Distance, Distances) -> +distances(Node, M, [OtherNode | OwnersCycle], Distance, + Distances) -> case lists:member(OtherNode, M) of - true -> % haven't seen this node yet, add distance - distances(Node, M -- [OtherNode], OwnersCycle, Distance + 1, - [{{Node, OtherNode}, Distance} | Distances]); - _ -> % already passed OtherNode - distances(Node, M, OwnersCycle, Distance + 1, Distances) + true -> % haven't seen this node yet, add distance + distances(Node, M -- [OtherNode], OwnersCycle, + Distance + 1, + [{{Node, OtherNode}, Distance} | Distances]); + _ -> % already passed OtherNode + distances(Node, M, OwnersCycle, Distance + 1, Distances) end. %% For each pair, get the count of distances < NVal -score_am([], _NVal) -> - undefined; +score_am([], _NVal) -> undefined; score_am(AM, NVal) -> - Cs = lists:flatten( - [begin - [C || {D,C} <- count(Ds, NVal), D < NVal] - end || {_Pair,Ds} <- AM]), + Cs = lists:flatten([begin + [C || {D, C} <- count(Ds, NVal), D < NVal] + end + || {_Pair, Ds} <- AM]), rms(Cs). count(L, NVal) -> - Acc0 = orddict:from_list([{D, 0} || D <- lists:seq(0, NVal-1)]), - lists:foldl(fun(E,A) -> orddict:update_counter(E, 1, A) end, Acc0, L). - -rms([]) -> - throw(empty_list); + Acc0 = orddict:from_list([{D, 0} + || D <- lists:seq(0, NVal - 1)]), + lists:foldl(fun (E, A) -> + orddict:update_counter(E, 1, A) + end, + Acc0, L). + +rms([]) -> throw(empty_list); rms(L) -> Mean = lists:sum(L) / length(L), lists:sum([(Mean - E) * (Mean - E) || E <- L]). 
- %% ------------------------------------------------------------------- %% Ring construction %% ------------------------------------------------------------------- @@ -392,32 +366,31 @@ rms(L) -> %% Make a ring of size length(Nodes) ordering the nodes as given make_ring(Nodes) -> R0 = riak_core_ring:fresh(length(Nodes), hd(Nodes)), - Idxs = [I || {I,_} <- riak_core_ring:all_owners(R0)], + Idxs = [I || {I, _} <- riak_core_ring:all_owners(R0)], NewOwners = lists:zip(Idxs, Nodes), - R1 = lists:foldl(fun(N,R) -> + R1 = lists:foldl(fun (N, R) -> riak_core_ring:add_member(hd(Nodes), R, N) - end, R0, Nodes), - lists:foldl(fun({I,N}, R) -> + end, + R0, Nodes), + lists:foldl(fun ({I, N}, R) -> riak_core_ring:transfer_node(I, N, R) - end, R1, NewOwners). - - + end, + R1, NewOwners). %% Generate a completion test function that makes sure all required %% distances are created gen_complete_diverse(RequiredDs) -> - fun(Owners, DAM) -> + fun (Owners, DAM) -> OwnersLen = length(Owners), NextPow2 = next_pow2(OwnersLen), {met_required(Owners, DAM, RequiredDs) andalso - OwnersLen == NextPow2, NextPow2} + OwnersLen == NextPow2, + NextPow2} end. %% Generate until a fixed length has been hit gen_complete_len(Len) -> - fun(Owners, _AM) -> - {length(Owners) == Len, Len} - end. + fun (Owners, _AM) -> {length(Owners) == Len, Len} end. %% M = list of node names %% NVal = target nval @@ -427,117 +400,127 @@ construct(Complete, M, NVal) -> %% Make an empty adjacency matrix for all pairs of members empty_adjacency_matrix(M) -> - lists:foldl(fun(Pair,AM0) -> - ?DICT:append_list(Pair, [], AM0) - end, ?DICT:new(), [{F,T} || F <- M, T <- M, F /= T]). + lists:foldl(fun (Pair, AM0) -> + dict:append_list(Pair, [], AM0) + end, + dict:new(), [{F, T} || F <- M, T <- M, F /= T]). 
construct(Complete, M, Owners, DAM, NVal) -> %% Work out which pairs do not have the requiredDs case Complete(Owners, DAM) of - {true, _DesiredLen}-> - {ok, Owners, DAM}; - {false, DesiredLen} -> - %% Easy ones - restrict the eligible list to not include the N-1 - %% previous nodes. If within NVal-1 of possibly closing the ring - %% then restrict in that direction as well. - Eligible0 = M -- lists:sublist(Owners, NVal - 1), - Eligible = case DesiredLen - length(Owners) of - Left when Left >= NVal -> - Eligible0; % At least Nval lest, no restriction - Left -> - Eligible0 -- lists:sublist(lists:reverse(Owners), NVal - Left) - end, - case Eligible of - [] -> - %% No eligible nodes - not enough to meet NVal, use any node - logger:debug("construct -- unable to construct without violating NVal"), - {Owners1, DAM1} = prepend_next_owner(M, M, Owners, DAM, NVal), - construct(Complete, M, Owners1, DAM1, NVal); - _ -> - {Owners1, DAM1} = prepend_next_owner(M, Eligible, Owners, DAM, NVal), - construct(Complete, M, Owners1, DAM1, NVal) - end + {true, _DesiredLen} -> {ok, Owners, DAM}; + {false, DesiredLen} -> + %% Easy ones - restrict the eligible list to not include the N-1 + %% previous nodes. If within NVal-1 of possibly closing the ring + %% then restrict in that direction as well. + Eligible0 = M -- lists:sublist(Owners, NVal - 1), + Eligible = case DesiredLen - length(Owners) of + Left when Left >= NVal -> + Eligible0; % At least Nval lest, no restriction + Left -> + Eligible0 -- + lists:sublist(lists:reverse(Owners), NVal - Left) + end, + case Eligible of + [] -> + %% No eligible nodes - not enough to meet NVal, use any node + logger:debug("construct -- unable to construct without " + "violating NVal"), + {Owners1, DAM1} = prepend_next_owner(M, M, Owners, DAM, + NVal), + construct(Complete, M, Owners1, DAM1, NVal); + _ -> + {Owners1, DAM1} = prepend_next_owner(M, Eligible, + Owners, DAM, NVal), + construct(Complete, M, Owners1, DAM1, NVal) + end end. 
%% Returns true only when we have met all required distances across all %% possible pairs in the adjacency matrix met_required(Owners, DAM, RequiredDs) -> FixupDAM = fixup_dam(Owners, DAM), - case [Pair || {Pair, Ds} <- ?DICT:to_list(FixupDAM), - (RequiredDs -- Ds) /= [] ] of - [] -> - true; - _ -> - false + case [Pair + || {Pair, Ds} <- dict:to_list(FixupDAM), + RequiredDs -- Ds /= []] + of + [] -> true; + _ -> false end. %% Return next greatest power of 2 -next_pow2(X) -> - next_pow2(X, 2). +next_pow2(X) -> next_pow2(X, 2). -next_pow2(X, R) when X =< R -> - R; -next_pow2(X, R) -> - next_pow2(X, R*2). +next_pow2(X, R) when X =< R -> R; +next_pow2(X, R) -> next_pow2(X, R * 2). %% For each eligible, work out which node improves diversity the most %% Take the AM scores and cap by TargetN and find the node that -%% improves the RMS -prepend_next_owner(M, [Node], Owners, DAM, _TN) -> % only one node, not a lot of decisions to make +%% improves the RMS +prepend_next_owner(M, [Node], Owners, DAM, + _TN) -> % only one node, not a lot of decisions to make prepend(M, Node, Owners, DAM); prepend_next_owner(M, Eligible, Owners, DAM, TN) -> - {_BestScore, Owners2, DAM2} = - lists:foldl(fun(Node, {RunningScore, _RunningO, _RunningDAM}=Acc) -> - {Owners1, DAM1} = prepend(M, Node, Owners, DAM), - case score_am(?DICT:to_list(DAM1), TN) of - BetterScore when BetterScore < RunningScore -> - {BetterScore, Owners1, DAM1}; - _ -> - Acc - end - end, {undefined, undefined, undefined}, Eligible), + {_BestScore, Owners2, DAM2} = lists:foldl(fun (Node, + {RunningScore, _RunningO, + _RunningDAM} = + Acc) -> + {Owners1, DAM1} = + prepend(M, Node, + Owners, DAM), + case + score_am(dict:to_list(DAM1), + TN) + of + BetterScore + when BetterScore < + RunningScore -> + {BetterScore, + Owners1, DAM1}; + _ -> Acc + end + end, + {undefined, undefined, undefined}, + Eligible), {Owners2, DAM2}. 
%% Prepend N to the front of Owners, and update AM prepend(M, N, Owners, DAM) -> Ds = distances2(M -- [N], Owners), - DAM2 = lists:foldl(fun({T,D},DAM1) -> - ?DICT:append_list({N,T},[D],DAM1) - end, DAM, Ds), + DAM2 = lists:foldl(fun ({T, D}, DAM1) -> + dict:append_list({N, T}, [D], DAM1) + end, + DAM, Ds), {[N | Owners], DAM2}. %% Calculate the distances to each of the M nodes until %% a distance for each has been found. -distances2(M, Owners) -> - distances2(M, Owners, 0, []). +distances2(M, Owners) -> distances2(M, Owners, 0, []). -distances2([], _Owners, _D, Acc) -> - Acc; -distances2(_M, [], _D, Acc) -> - Acc; +distances2([], _Owners, _D, Acc) -> Acc; +distances2(_M, [], _D, Acc) -> Acc; distances2(M, [T | Owners], D, Acc) -> case lists:member(T, M) of - true -> - distances2(M -- [T], Owners, D + 1, [{T, D} | Acc]); - false -> - distances2(M, Owners, D + 1, Acc) + true -> + distances2(M -- [T], Owners, D + 1, [{T, D} | Acc]); + false -> distances2(M, Owners, D + 1, Acc) end. %% Fix up the dictionary AM adding in entries for the end of the owners list %% wrapping around to the start. fixup_dam(Owners, DAM) -> - fixup_dam(lists:usort(Owners), lists:reverse(Owners), Owners, 0, DAM). + fixup_dam(lists:usort(Owners), lists:reverse(Owners), + Owners, 0, DAM). -fixup_dam([], _ToFix, _Owners, _D, DAM) -> - DAM; -fixup_dam(_M, [], _Owners, _D, DAM) -> - DAM; +fixup_dam([], _ToFix, _Owners, _D, DAM) -> DAM; +fixup_dam(_M, [], _Owners, _D, DAM) -> DAM; fixup_dam(M, [N | ToFix], Owners, D, DAM) -> M2 = M -- [N], Ds = distances2(M2, Owners, D, []), - DAM2 = lists:foldl(fun({T,D0},DAM1) -> - ?DICT:append_list({N,T},[D0],DAM1) - end, DAM, Ds), + DAM2 = lists:foldl(fun ({T, D0}, DAM1) -> + dict:append_list({N, T}, [D0], DAM1) + end, + DAM, Ds), fixup_dam(M2, ToFix, Owners, D + 1, DAM2). 
%% ------------------------------------------------------------------- @@ -545,27 +528,23 @@ fixup_dam(M, [N | ToFix], Owners, D, DAM) -> %% ------------------------------------------------------------------- %% Permutations - number of ways to pick N out of K -num_perms(K, N) when K =< N -> - fac(N) div (fac(N - K)). +num_perms(K, N) when K =< N -> fac(N) div fac(N - K). %% Combinations - number of ways to combine N elements out of K num_combs(K, N) when K =< N -> fac(N) div (K * fac(N - K)). %% Factorials -fac(0) -> - 1; -fac(N) when N > 0 -> - N * fac(N-1). +fac(0) -> 1; +fac(N) when N > 0 -> N * fac(N - 1). %% Generate all permutations of list L -perm_gen([E]) -> - [[E]]; +perm_gen([E]) -> [[E]]; perm_gen(L) -> - lists:append([ begin - [ [X | Y] || Y <- perm_gen(lists:delete(X, L))] - end || X <- L]). - + lists:append([begin + [[X | Y] || Y <- perm_gen(lists:delete(X, L))] + end + || X <- L]). %% Pick all combinations of Depth nodes from the MemFbers list %% 0 = [] @@ -583,17 +562,15 @@ down_combos(Depth, Members, Down) -> Down2 = [[N | D] || N <- Members, D <- Down], down_combos(Depth - 1, Members, Down2). - %% Generate all rotated versions of an ownership list -rotations([H|T] = L) -> +rotations([H | T] = L) -> rotations(length(L) - 1, T ++ [H], [L]). -rotations(0, _, Acc) -> - lists:reverse(Acc); -rotations(Rem, [H|T] = L, Acc) -> +rotations(0, _, Acc) -> lists:reverse(Acc); +rotations(Rem, [H | T] = L, Acc) -> rotations(Rem - 1, T ++ [H], [L | Acc]). -%% Generate a list with each possible substitution for a name +%% Generate a list with each possible substitution for a name substitutions(L, Names) -> PNames = perm_gen(Names), [substitute(Names, P, L) || P <- PNames]. @@ -607,48 +584,4 @@ substitute(Names, Mapping, L) -> %% Unit Tests %% ------------------------------------------------------------------- --ifdef(TEST). --ifdef(EQC). 
- -property_adjacency_summary_test_() -> - {timeout, 60, ?_test(eqc:quickcheck(eqc:testing_time(30, prop_adjacency_summary())))}. - -longer_list(K, G) -> - ?SIZED(Size, resize(trunc(K*Size), list(resize(Size, G)))). - -%% Compare directly constructing the adjacency matrix against -%% one using prepend/fixup. -prop_adjacency_summary() -> - ?FORALL({OwnersSeed, S}, - {non_empty(longer_list(40, largeint())), ?LET(X, int(), 1 + abs(X))}, - begin - Owners = [list_to_atom("n"++integer_to_list(1 + (abs(I) rem S))) || I <- OwnersSeed], - AM = adjacency_matrix(Owners), - AS = summarize_am(AM), - - {Owners2, _DAM2, FixDAM2} = build(Owners), - AS2 = summarize_am(?DICT:to_list(FixDAM2)), - - - ?WHENFAIL( - begin - io:format(user, "S=~p\nOwners =~p\n", [S, Owners]), - io:format(user, "=== AM ===\n~p\n", [AM]), - io:format(user, "=== FixAM2 ===\n~p\n", [?DICT:to_list(FixDAM2)]), - io:format(user, "=== AS2 ===\n~p\n", [AS2]) - end, - conjunction([{owners, equals(Owners, Owners2)}, - {am2, equals(lists:sort(AS), lists:sort(AS2))}])) - end). - -build(Owners) -> - build(lists:usort(Owners), lists:reverse(Owners), [], ?DICT:new()). - -build(_M, [], Owners, DAM) -> - {Owners, DAM, fixup_dam(Owners, DAM)}; -build(M, [N|Rest], Owners, DAM) -> - {Owners1, DAM1} = prepend(M, N, Owners, DAM), - build(M, Rest, Owners1, DAM1). - --endif. % EQC --endif. % TEST. +%See test - pqc - riak_core_claim_util_qc diff --git a/src/riak_core_claimant.erl b/src/riak_core_claimant.erl index 61c9b6be8..2795b5439 100644 --- a/src/riak_core_claimant.erl +++ b/src/riak_core_claimant.erl @@ -19,58 +19,51 @@ %% ------------------------------------------------------------------- -module(riak_core_claimant). + -behaviour(gen_server). %% API -export([start_link/0]). --export([leave_member/1, - remove_member/1, - force_replace/2, - replace/2, - resize_ring/1, - abort_resize/0, - plan/0, - commit/0, - clear/0, - ring_changed/2 -]). 
+ +-export([leave_member/1, remove_member/1, + force_replace/2, replace/2, resize_ring/1, + abort_resize/0, plan/0, commit/0, clear/0, + ring_changed/2]). + -export([reassign_indices/1]). % helpers for claim sim %% gen_server callbacks --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). --type action() :: leave - | remove - | {replace, node()} - | {force_replace, node()} - | {set_location, string()}. +-type action() :: leave | remove | {replace, node()} | + {force_replace, node()}. --type riak_core_ring() :: riak_core_ring:riak_core_ring(). +-type + riak_core_ring() :: riak_core_ring:riak_core_ring(). %% A tuple representing a given cluster transition: %% {Ring, NewRing} where NewRing = f(Ring) --type ring_transition() :: {riak_core_ring(), riak_core_ring()}. - --record(state, { - last_ring_id, - %% The set of staged cluster changes - changes :: [{node(), action()}], - - %% Ring computed during the last planning stage based on - %% applying a set of staged cluster changes. When commiting - %% changes, the computed ring must match the previous planned - %% ring to be allowed. - next_ring :: riak_core_ring() | undefined, - - %% Random number seed passed to remove_node to ensure the - %% current randomized remove algorithm is deterministic - %% between plan and commit phases - seed}). - --define(ROUT(S,A),ok). -%%-define(ROUT(S,A),?debugFmt(S,A)). -%%-define(ROUT(S,A),io:format(S,A)). +-type ring_transition() :: {riak_core_ring(), + riak_core_ring()}. + +-record(state, + {last_ring_id, + %% The set of staged cluster changes + changes :: [{node(), action()}], + %% Ring computed during the last planning stage based on + %% applying a set of staged cluster changes. When commiting + %% changes, the computed ring must match the previous planned + %% ring to be allowed. 
+ next_ring :: riak_core_ring() | undefined, + %% Random number seed passed to remove_node to ensure the + %% current randomized remove algorithm is deterministic + %% between plan and commit phases + seed}). + +-define(ROUT(S, A), + ok).%%-define(ROUT(S,A),?debugFmt(S,A)). + %%-define(ROUT(S,A),io:format(S,A)). %%%=================================================================== %%% API @@ -88,37 +81,38 @@ stop() -> %% @doc Spawn and register the riak_core_claimant server start_link() -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + gen_server:start_link({local, ?MODULE}, ?MODULE, [], + []). %% @doc Determine how the cluster will be affected by the staged changes, %% returning the set of pending changes as well as a list of ring %% modifications that correspond to each resulting cluster transition %% (eg. the initial transition that applies the staged changes, and %% any additional transitions triggered by later rebalancing). --spec plan() -> {error, term()} | {ok, [action()], [ring_transition()]}. -plan() -> - gen_server:call(claimant(), plan, infinity). +-spec plan() -> {error, term()} | + {ok, [action()], [ring_transition()]}. + +plan() -> gen_server:call(claimant(), plan, infinity). %% @doc Commit the set of staged cluster changes, returning true on success. %% A commit is only allowed to succeed if the ring is ready and if the %% current set of changes matches those computed by the most recent %% call to plan/0. -spec commit() -> ok | {error, term()}. + commit() -> gen_server:call(claimant(), commit, infinity). %% @doc Stage a request for `Node' to leave the cluster. If committed, `Node' %% will handoff all of its data to other nodes in the cluster and then %% shutdown. -leave_member(Node) -> - stage(Node, leave). +leave_member(Node) -> stage(Node, leave). %% @doc Stage a request for `Node' to be forcefully removed from the cluster. %% If committed, all partitions owned by `Node' will immediately be %% re-assigned to other nodes. 
No data on `Node' will be transfered to %% other nodes, and all replicas on `Node' will be lost. -remove_member(Node) -> - stage(Node, remove). +remove_member(Node) -> stage(Node, remove). %% @doc Stage a request for `Node' to be replaced by `NewNode'. If committed, %% `Node' will handoff all of its data to `NewNode' and then shutdown. @@ -144,13 +138,14 @@ force_replace(Node, NewNode) -> %% removed from partitons no longer owner by a node or present %% in the ring. -spec resize_ring(integer()) -> ok | {error, atom()}. + resize_ring(NewRingSize) -> %% use the node making the request. it will be ignored stage(node(), {resize, NewRingSize}). -spec abort_resize() -> ok | {error, atom()}. -abort_resize() -> - stage(node(), abort_resize). + +abort_resize() -> stage(node(), abort_resize). -spec pending_close(riak_core_ring(), any()) -> ok. pending_close(Ring, RingID) -> @@ -162,8 +157,7 @@ set_node_location(Node, Location) -> stage(Node, {set_location, Location}). %% @doc Clear the current set of staged transfers -clear() -> - gen_server:call(claimant(), clear, infinity). +clear() -> gen_server:call(claimant(), clear, infinity). %% @doc This function is called as part of the ring reconciliation logic %% triggered by the gossip subsystem. This is only called on the one @@ -182,14 +176,16 @@ ring_changed(Node, Ring) -> %%%=================================================================== reassign_indices(CState) -> - reassign_indices(CState, [], riak_core_rand:rand_seed(), fun no_log/2). + reassign_indices(CState, [], erlang:timestamp(), + fun no_log/2). %%%=================================================================== %%% Internal API helpers %%%=================================================================== stage(Node, Action) -> - gen_server:call(claimant(), {stage, Node, Action}, infinity). + gen_server:call(claimant(), {stage, Node, Action}, + infinity). 
claimant() -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), @@ -201,55 +197,42 @@ claimant() -> init([]) -> schedule_tick(), - {ok, #state{changes=[], seed=riak_core_rand:rand_seed()}}. + {ok, #state{changes = [], seed = erlang:timestamp()}}. handle_call(clear, _From, State) -> - State2 = clear_staged(State), - {reply, ok, State2}; - + State2 = clear_staged(State), {reply, ok, State2}; handle_call({stage, Node, Action}, _From, State) -> {ok, Ring} = riak_core_ring_manager:get_raw_ring(), - {Reply, State2} = maybe_stage(Node, Action, Ring, State), + {Reply, State2} = maybe_stage(Node, Action, Ring, + State), {reply, Reply, State2}; - handle_call(plan, _From, State) -> {ok, Ring} = riak_core_ring_manager:get_raw_ring(), case riak_core_ring:ring_ready(Ring) of - false -> - Reply = {error, ring_not_ready}, - {reply, Reply, State}; - true -> - {Reply, State2} = generate_plan(Ring, State), - {reply, Reply, State2} + false -> + Reply = {error, ring_not_ready}, {reply, Reply, State}; + true -> + {Reply, State2} = generate_plan(Ring, State), + {reply, Reply, State2} end; - handle_call(commit, _From, State) -> {Reply, State2} = commit_staged(State), {reply, Reply, State2}; - handle_call(_Request, _From, State) -> - Reply = ok, - {reply, Reply, State}. + Reply = ok, {reply, Reply, State}. -handle_cast(_Msg, State) -> - {noreply, State}. +handle_cast(_Msg, State) -> {noreply, State}. handle_info(tick, State) -> - State2 = tick(none, riak_core_ring_manager:get_ring_id(), State), - {noreply, State2}; - + State2 = tick(State), {noreply, State2}; handle_info(reset_ring_id, State) -> - State2 = State#state{last_ring_id=undefined}, + State2 = State#state{last_ring_id = undefined}, {noreply, State2}; +handle_info(_Info, State) -> {noreply, State}. -handle_info(_Info, State) -> - {noreply, State}. +terminate(_Reason, _State) -> ok. -terminate(_Reason, _State) -> - ok. - -code_change(_OldVsn, State, _Extra) -> - {ok, State}. +code_change(_OldVsn, State, _Extra) -> {ok, State}. 
%%%=================================================================== %%% Internal functions @@ -258,90 +241,90 @@ code_change(_OldVsn, State, _Extra) -> %% @private %% @doc Verify that a cluster change request is valid and add it to %% the list of staged changes. -maybe_stage(Node, Action, Ring, State=#state{changes=Changes}) -> +maybe_stage(Node, Action, Ring, + State = #state{changes = Changes}) -> case valid_request(Node, Action, Changes, Ring) of - true -> - Changes2 = orddict:store(Node, Action, Changes), - Changes3 = filter_changes(Changes2, Ring), - State2 = State#state{changes=Changes3}, - {ok, State2}; - Error -> - {Error, State} + true -> + Changes2 = orddict:store(Node, Action, Changes), + Changes3 = filter_changes(Changes2, Ring), + State2 = State#state{changes = Changes3}, + {ok, State2}; + Error -> {Error, State} end. %% @private %% @doc Determine how the staged set of cluster changes will affect %% the cluster. See {@link plan/0} for additional details. -generate_plan(Ring, State=#state{changes=Changes}) -> +generate_plan(Ring, + State = #state{changes = Changes}) -> Changes2 = filter_changes(Changes, Ring), - Joining = [{Node, join} || Node <- riak_core_ring:members(Ring, [joining])], + Joining = [{Node, join} + || Node <- riak_core_ring:members(Ring, [joining])], AllChanges = lists:ukeysort(1, Changes2 ++ Joining), - State2 = State#state{changes=Changes2}, + State2 = State#state{changes = Changes2}, generate_plan(AllChanges, Ring, State2). 
generate_plan([], _, State) -> %% There are no changes to apply {{ok, [], []}, State}; -generate_plan(Changes, Ring, State=#state{seed=Seed}) -> +generate_plan(Changes, Ring, + State = #state{seed = Seed}) -> case compute_all_next_rings(Changes, Seed, Ring) of - {error, invalid_resize_claim} -> - {{error, invalid_resize_claim}, State}; - {ok, NextRings} -> - {_, NextRing} = hd(NextRings), - State2 = State#state{next_ring=NextRing}, - Reply = {ok, Changes, NextRings}, - {Reply, State2} + {error, invalid_resize_claim} -> + {{error, invalid_resize_claim}, State}; + {ok, NextRings} -> + {_, NextRing} = hd(NextRings), + State2 = State#state{next_ring = NextRing}, + Reply = {ok, Changes, NextRings}, + {Reply, State2} end. %% @private %% @doc Commit the set of staged cluster changes. See {@link commit/0} %% for additional details. -commit_staged(State=#state{next_ring=undefined}) -> +commit_staged(State = #state{next_ring = undefined}) -> {{error, nothing_planned}, State}; commit_staged(State) -> case maybe_commit_staged(State) of - {ok, _} -> - State2 = State#state{next_ring=undefined, - changes=[], - seed=riak_core_rand:rand_seed()}, - {ok, State2}; - not_changed -> - {error, State}; - {not_changed, Reason} -> - {{error, Reason}, State} + {ok, _} -> + State2 = State#state{next_ring = undefined, + changes = [], seed = erlang:timestamp()}, + {ok, State2}; + not_changed -> {error, State}; + {not_changed, Reason} -> {{error, Reason}, State} end. %% @private maybe_commit_staged(State) -> - riak_core_ring_manager:ring_trans(fun maybe_commit_staged/2, State). + riak_core_ring_manager:ring_trans(fun maybe_commit_staged/2, + State). 
%% @private -maybe_commit_staged(Ring, State=#state{changes=Changes, seed=Seed}) -> +maybe_commit_staged(Ring, + State = #state{changes = Changes, seed = Seed}) -> Changes2 = filter_changes(Changes, Ring), case compute_next_ring(Changes2, Seed, Ring) of - {error, invalid_resize_claim} -> - {ignore, invalid_resize_claim}; - {ok, NextRing} -> - maybe_commit_staged(Ring, NextRing, State) + {error, invalid_resize_claim} -> + {ignore, invalid_resize_claim}; + {ok, NextRing} -> + maybe_commit_staged(Ring, NextRing, State) end. %% @private -maybe_commit_staged(Ring, NextRing, #state{next_ring=PlannedRing}) -> +maybe_commit_staged(Ring, NextRing, + #state{next_ring = PlannedRing}) -> Claimant = riak_core_ring:claimant(Ring), IsReady = riak_core_ring:ring_ready(Ring), - IsClaimant = (Claimant == node()), + IsClaimant = Claimant == node(), IsSamePlan = same_plan(PlannedRing, NextRing), case {IsReady, IsClaimant, IsSamePlan} of - {false, _, _} -> - {ignore, ring_not_ready}; - {_, false, _} -> - ignore; - {_, _, false} -> - {ignore, plan_changed}; - _ -> - NewRing0 = riak_core_ring:clear_location_changed(NextRing), - NewRing1 = riak_core_ring:increment_vclock(Claimant, NewRing0), - {new_ring, NewRing1} + {false, _, _} -> {ignore, ring_not_ready}; + {_, false, _} -> ignore; + {_, _, false} -> {ignore, plan_changed}; + _ -> + NewRing = riak_core_ring:increment_vclock(Claimant, + NextRing), + {new_ring, NewRing} end. %% @private @@ -352,152 +335,139 @@ maybe_commit_staged(Ring, NextRing, #state{next_ring=PlannedRing}) -> %% call {@link clear/0}. clear_staged(State) -> remove_joining_nodes(), - State#state{changes=[], seed=riak_core_rand:rand_seed()}. + State#state{changes = [], seed = erlang:timestamp()}. %% @private remove_joining_nodes() -> - riak_core_ring_manager:ring_trans(fun remove_joining_nodes/2, ok). + riak_core_ring_manager:ring_trans(fun remove_joining_nodes/2, + ok). 
%% @private remove_joining_nodes(Ring, _) -> Claimant = riak_core_ring:claimant(Ring), - IsClaimant = (Claimant == node()), + IsClaimant = Claimant == node(), Joining = riak_core_ring:members(Ring, [joining]), - AreJoining = (Joining /= []), + AreJoining = Joining /= [], case IsClaimant and AreJoining of - false -> - ignore; - true -> - NewRing = remove_joining_nodes_from_ring(Claimant, Joining, Ring), - {new_ring, NewRing} + false -> ignore; + true -> + NewRing = remove_joining_nodes_from_ring(Claimant, + Joining, Ring), + {new_ring, NewRing} end. %% @private -remove_joining_nodes_from_ring(Claimant, Joining, Ring) -> - NewRing = - lists:foldl(fun(Node, RingAcc) -> - riak_core_ring:set_member(Claimant, RingAcc, Node, - invalid, same_vclock) - end, Ring, Joining), - NewRing2 = riak_core_ring:increment_vclock(Claimant, NewRing), +remove_joining_nodes_from_ring(Claimant, Joining, + Ring) -> + NewRing = lists:foldl(fun (Node, RingAcc) -> + riak_core_ring:set_member(Claimant, RingAcc, + Node, invalid, + same_vclock) + end, + Ring, Joining), + NewRing2 = riak_core_ring:increment_vclock(Claimant, + NewRing), NewRing2. 
%% @private valid_request(Node, Action, Changes, Ring) -> case Action of - leave -> - valid_leave_request(Node, Ring); - remove -> - valid_remove_request(Node, Ring); - {replace, NewNode} -> - valid_replace_request(Node, NewNode, Changes, Ring); - {force_replace, NewNode} -> - valid_force_replace_request(Node, NewNode, Changes, Ring); - {resize, NewRingSize} -> - valid_resize_request(NewRingSize, Changes, Ring); - abort_resize -> - valid_resize_abort_request(Ring); - {set_location, Location} -> - valid_set_location_request(Location, Node, Ring) + leave -> valid_leave_request(Node, Ring); + remove -> valid_remove_request(Node, Ring); + {replace, NewNode} -> + valid_replace_request(Node, NewNode, Changes, Ring); + {force_replace, NewNode} -> + valid_force_replace_request(Node, NewNode, Changes, + Ring); + {resize, NewRingSize} -> + valid_resize_request(NewRingSize, Changes, Ring); + abort_resize -> valid_resize_abort_request(Ring) end. %% @private valid_leave_request(Node, Ring) -> case {riak_core_ring:all_members(Ring), - riak_core_ring:member_status(Ring, Node)} of - {_, invalid} -> - {error, not_member}; - {[Node], _} -> - {error, only_member}; - {_, valid} -> - true; - {_, joining} -> - true; - {_, _} -> - {error, already_leaving} + riak_core_ring:member_status(Ring, Node)} + of + {_, invalid} -> {error, not_member}; + {[Node], _} -> {error, only_member}; + {_, valid} -> true; + {_, joining} -> true; + {_, _} -> {error, already_leaving} end. 
%% @private valid_remove_request(Node, Ring) -> - IsClaimant = (Node == riak_core_ring:claimant(Ring)), - case {IsClaimant, - riak_core_ring:all_members(Ring), - riak_core_ring:member_status(Ring, Node)} of - {true, _, _} -> - {error, is_claimant}; - {_, _, invalid} -> - {error, not_member}; - {_, [Node], _} -> - {error, only_member}; - _ -> - true + IsClaimant = Node == riak_core_ring:claimant(Ring), + case {IsClaimant, riak_core_ring:all_members(Ring), + riak_core_ring:member_status(Ring, Node)} + of + {true, _, _} -> {error, is_claimant}; + {_, _, invalid} -> {error, not_member}; + {_, [Node], _} -> {error, only_member}; + _ -> true end. %% @private valid_replace_request(Node, NewNode, Changes, Ring) -> - AlreadyReplacement = lists:member(NewNode, existing_replacements(Changes)), - NewJoining = - (riak_core_ring:member_status(Ring, NewNode) == joining) - and (not orddict:is_key(NewNode, Changes)), + AlreadyReplacement = lists:member(NewNode, + existing_replacements(Changes)), + NewJoining = (riak_core_ring:member_status(Ring, + NewNode) + == joining) + and not orddict:is_key(NewNode, Changes), case {riak_core_ring:member_status(Ring, Node), - AlreadyReplacement, - NewJoining} of - {invalid, _, _} -> - {error, not_member}; - {leaving, _, _} -> - {error, already_leaving}; - {_, true, _} -> - {error, already_replacement}; - {_, _, false} -> - {error, invalid_replacement}; - _ -> - true + AlreadyReplacement, NewJoining} + of + {invalid, _, _} -> {error, not_member}; + {leaving, _, _} -> {error, already_leaving}; + {_, true, _} -> {error, already_replacement}; + {_, _, false} -> {error, invalid_replacement}; + _ -> true end. 
%% @private -valid_force_replace_request(Node, NewNode, Changes, Ring) -> - IsClaimant = (Node == riak_core_ring:claimant(Ring)), - AlreadyReplacement = lists:member(NewNode, existing_replacements(Changes)), - NewJoining = - (riak_core_ring:member_status(Ring, NewNode) == joining) - and (not orddict:is_key(NewNode, Changes)), +valid_force_replace_request(Node, NewNode, Changes, + Ring) -> + IsClaimant = Node == riak_core_ring:claimant(Ring), + AlreadyReplacement = lists:member(NewNode, + existing_replacements(Changes)), + NewJoining = (riak_core_ring:member_status(Ring, + NewNode) + == joining) + and not orddict:is_key(NewNode, Changes), case {IsClaimant, riak_core_ring:member_status(Ring, Node), - AlreadyReplacement, - NewJoining} of - {true, _, _, _} -> - {error, is_claimant}; - {_, invalid, _, _} -> - {error, not_member}; - {_, _, true, _} -> - {error, already_replacement}; - {_, _, _, false} -> - {error, invalid_replacement}; - _ -> - true + AlreadyReplacement, NewJoining} + of + {true, _, _, _} -> {error, is_claimant}; + {_, invalid, _, _} -> {error, not_member}; + {_, _, true, _} -> {error, already_replacement}; + {_, _, _, false} -> {error, invalid_replacement}; + _ -> true end. 
%% @private %% restrictions preventing resize along with other operations are temporary valid_resize_request(NewRingSize, [], Ring) -> - IsResizing = riak_core_ring:num_partitions(Ring) =/= NewRingSize, - + IsResizing = riak_core_ring:num_partitions(Ring) =/= + NewRingSize, NodeCount = length(riak_core_ring:all_members(Ring)), - Changes = length(riak_core_ring:pending_changes(Ring)) > 0, + Changes = length(riak_core_ring:pending_changes(Ring)) > + 0, case {IsResizing, NodeCount, Changes} of - {true, N, false} when N > 1 -> true; - {false, _, _} -> {error, same_size}; - {_, 1, _} -> {error, single_node}; - {_, _, true} -> {error, pending_changes} + {true, N, false} when N > 1 -> true; + {false, _, _} -> {error, same_size}; + {_, 1, _} -> {error, single_node}; + {_, _, true} -> {error, pending_changes} end. - valid_resize_abort_request(Ring) -> IsResizing = riak_core_ring:is_resizing(Ring), IsPostResize = riak_core_ring:is_post_resize(Ring), case IsResizing andalso not IsPostResize of - true -> true; - false -> {error, not_resizing} + true -> true; + false -> {error, not_resizing} end. %% @private @@ -519,81 +489,84 @@ valid_set_location_request(_Location, Node, Ring) -> %% can become invalid based on other staged changes, or by cluster %% changes that bypass the staging system. filter_changes(Changes, Ring) -> - orddict:filter(fun(Node, Change) -> + orddict:filter(fun (Node, Change) -> filter_changes_pred(Node, Change, Changes, Ring) - end, Changes). + end, + Changes). 
%% @private -filter_changes_pred(Node, {Change, NewNode}, Changes, Ring) - when (Change == replace) or (Change == force_replace) -> - IsMember = (riak_core_ring:member_status(Ring, Node) /= invalid), - IsJoining = (riak_core_ring:member_status(Ring, NewNode) == joining), - NotChanging = (not orddict:is_key(NewNode, Changes)), +filter_changes_pred(Node, {Change, NewNode}, Changes, + Ring) + when (Change == replace) or (Change == force_replace) -> + IsMember = riak_core_ring:member_status(Ring, Node) /= + invalid, + IsJoining = riak_core_ring:member_status(Ring, NewNode) + == joining, + NotChanging = not orddict:is_key(NewNode, Changes), IsMember and IsJoining and NotChanging; filter_changes_pred(Node, _, _, Ring) -> - IsMember = (riak_core_ring:member_status(Ring, Node) /= invalid), + IsMember = riak_core_ring:member_status(Ring, Node) /= + invalid, IsMember. %% @private existing_replacements(Changes) -> - [Node || {_, {Change, Node}} <- Changes, - (Change == replace) or (Change == force_replace)]. + [Node + || {_, {Change, Node}} <- Changes, + (Change == replace) or (Change == force_replace)]. %% @private %% Determine if two rings have logically equal cluster state same_plan(RingA, RingB) -> - (riak_core_ring:all_member_status(RingA) == riak_core_ring:all_member_status(RingB)) andalso - (riak_core_ring:all_owners(RingA) == riak_core_ring:all_owners(RingB)) andalso - (riak_core_ring:pending_changes(RingA) == riak_core_ring:pending_changes(RingB)). + riak_core_ring:all_member_status(RingA) == + riak_core_ring:all_member_status(RingB) + andalso + riak_core_ring:all_owners(RingA) == + riak_core_ring:all_owners(RingB) + andalso + riak_core_ring:pending_changes(RingA) == + riak_core_ring:pending_changes(RingB). schedule_tick() -> - Tick = application:get_env(riak_core, - claimant_tick, - 10000), + Tick = application:get_env(riak_core, claimant_tick, + 10000), erlang:send_after(Tick, ?MODULE, tick). 
-tick(State=#state{last_ring_id=LastID}) -> +tick(State = #state{last_ring_id = LastID}) -> case riak_core_ring_manager:get_ring_id() of - LastID -> - schedule_tick(), - State; - RingID -> - {ok, Ring} = riak_core_ring_manager:get_raw_ring(), - maybe_force_ring_update(Ring), - schedule_tick(), - State#state{last_ring_id=RingID} + LastID -> schedule_tick(), State; + RingID -> + {ok, Ring} = riak_core_ring_manager:get_raw_ring(), + maybe_force_ring_update(Ring), + schedule_tick(), + State#state{last_ring_id = RingID} end. maybe_force_ring_update(Ring) -> - IsClaimant = (riak_core_ring:claimant(Ring) == node()), + IsClaimant = riak_core_ring:claimant(Ring) == node(), IsReady = riak_core_ring:ring_ready(Ring), %% Do not force if we have any joining nodes unless any of them are %% auto-joining nodes. Otherwise, we will force update continuously. - JoinBlock = (are_joining_nodes(Ring) - andalso (auto_joining_nodes(Ring) == [])), - case IsClaimant and IsReady and (not JoinBlock) of - true -> - do_maybe_force_ring_update(Ring); - false -> - ok + JoinBlock = are_joining_nodes(Ring) andalso + auto_joining_nodes(Ring) == [], + case IsClaimant and IsReady and not JoinBlock of + true -> do_maybe_force_ring_update(Ring); + false -> ok end. do_maybe_force_ring_update(Ring) -> - case compute_next_ring([], riak_core_rand:rand_seed(), Ring) of - {ok, NextRing} -> - case same_plan(Ring, NextRing) of - false -> - logger:warning("Forcing update of stalled ring"), - riak_core_ring_manager:force_update(); - true -> - ok - end; - _ -> - ok + case compute_next_ring([], erlang:timestamp(), Ring) of + {ok, NextRing} -> + case same_plan(Ring, NextRing) of + false -> + logger:warning("Forcing update of stalled ring"), + riak_core_ring_manager:force_update(); + true -> ok + end; + _ -> ok end. 
- %% ========================================================================= %% Claimant rebalance/reassign logic %% ========================================================================= @@ -605,41 +578,39 @@ compute_all_next_rings(Changes, Seed, Ring) -> %% @private compute_all_next_rings(Changes, Seed, Ring, Acc) -> case compute_next_ring(Changes, Seed, Ring) of - {error, invalid_resize_claim}=Err -> - Err; - {ok, NextRing} -> - Acc2 = [{Ring, NextRing}|Acc], - case not same_plan(Ring, NextRing) of - true -> - FutureRing = riak_core_ring:future_ring(NextRing), - compute_all_next_rings([], Seed, FutureRing, Acc2); - false -> - {ok, lists:reverse(Acc2)} - end + {error, invalid_resize_claim} = Err -> Err; + {ok, NextRing} -> + Acc2 = [{Ring, NextRing} | Acc], + case not same_plan(Ring, NextRing) of + true -> + FutureRing = riak_core_ring:future_ring(NextRing), + compute_all_next_rings([], Seed, FutureRing, Acc2); + false -> {ok, lists:reverse(Acc2)} + end end. %% @private compute_next_ring(Changes, Seed, Ring) -> - Replacing = [{Node, NewNode} || {Node, {replace, NewNode}} <- Changes], + Replacing = [{Node, NewNode} + || {Node, {replace, NewNode}} <- Changes], Ring2 = apply_changes(Ring, Changes), {_, Ring3} = maybe_handle_joining(node(), Ring2), - {_, Ring4} = do_claimant_quiet(node(), Ring3, Replacing, Seed), + {_, Ring4} = do_claimant_quiet(node(), Ring3, Replacing, + Seed), {Valid, Ring5} = maybe_compute_resize(Ring, Ring4), case Valid of - false -> - {error, invalid_resize_claim}; - true -> - {ok, Ring5} + false -> {error, invalid_resize_claim}; + true -> {ok, Ring5} end. %% @private maybe_compute_resize(Orig, MbResized) -> OrigSize = riak_core_ring:num_partitions(Orig), NewSize = riak_core_ring:num_partitions(MbResized), - case OrigSize =/= NewSize of - false -> {true, MbResized}; - true -> validate_resized_ring(compute_resize(Orig, MbResized)) + false -> {true, MbResized}; + true -> + validate_resized_ring(compute_resize(Orig, MbResized)) end. 
%% @private @@ -652,49 +623,58 @@ maybe_compute_resize(Orig, MbResized) -> compute_resize(Orig, Resized) -> %% need to operate on balanced, future ring (apply changes determined by claim) CState0 = riak_core_ring:future_ring(Resized), - - Type = case riak_core_ring:num_partitions(Orig) < riak_core_ring:num_partitions(Resized) of - true -> larger; - false -> smaller - end, - + Type = case riak_core_ring:num_partitions(Orig) < + riak_core_ring:num_partitions(Resized) + of + true -> larger; + false -> smaller + end, %% Each index in the original ring must perform several transfers %% to properly resize the ring. The first transfer for each index %% is scheduled here. Subsequent transfers are scheduled by vnode - CState1 = lists:foldl(fun({Idx, _} = IdxOwner, CStateAcc) -> + CState1 = lists:foldl(fun ({Idx, _} = IdxOwner, + CStateAcc) -> %% indexes being abandoned in a shrinking ring have %% no next owner - NextOwner = try riak_core_ring:index_owner(CStateAcc, Idx) - catch error:{badmatch, false} -> none + NextOwner = try + riak_core_ring:index_owner(CStateAcc, + Idx) + catch + error:{badmatch, false} -> none end, - schedule_first_resize_transfer(Type, - IdxOwner, + schedule_first_resize_transfer(Type, IdxOwner, NextOwner, CStateAcc) end, - CState0, - riak_core_ring:all_owners(Orig)), - + CState0, riak_core_ring:all_owners(Orig)), riak_core_ring:set_pending_resize(CState1, Orig). %% @private %% @doc determine the first resize transfer a partition should perform with %% the goal of ensuring the transfer will actually have data to send to the %% target. 
-schedule_first_resize_transfer(smaller, {Idx,_}=IdxOwner, none, Resized) -> +schedule_first_resize_transfer(smaller, + {Idx, _} = IdxOwner, none, Resized) -> %% partition no longer exists in shrunk ring, first successor will be %% new owner of its data - Target = hd(riak_core_ring:preflist(<>, Resized)), - riak_core_ring:schedule_resize_transfer(Resized, IdxOwner, Target); -schedule_first_resize_transfer(_Type,{Idx, Owner}=IdxOwner, Owner, Resized) -> + Target = hd(riak_core_ring:preflist(<>, + Resized)), + riak_core_ring:schedule_resize_transfer(Resized, + IdxOwner, Target); +schedule_first_resize_transfer(_Type, + {Idx, Owner} = IdxOwner, Owner, Resized) -> %% partition is not being moved during expansion, first predecessor will %% own at least a portion of its data - Target = hd(chash:predecessors(Idx-1, riak_core_ring:chash(Resized))), - riak_core_ring:schedule_resize_transfer(Resized, IdxOwner, Target); -schedule_first_resize_transfer(_,{Idx, _Owner}=IdxOwner, NextOwner, Resized) -> + Target = hd(chash:predecessors(Idx - 1, + riak_core_ring:chash(Resized))), + riak_core_ring:schedule_resize_transfer(Resized, + IdxOwner, Target); +schedule_first_resize_transfer(_, + {Idx, _Owner} = IdxOwner, NextOwner, Resized) -> %% partition is being moved during expansion, schedule transfer to partition %% on new owner since it will still own some of its data - riak_core_ring:schedule_resize_transfer(Resized, IdxOwner, {Idx, NextOwner}). + riak_core_ring:schedule_resize_transfer(Resized, + IdxOwner, {Idx, NextOwner}). 
%% @doc verify that resized ring was properly claimed (no owners are the dummy %% resized owner) in both the current and future ring @@ -704,25 +684,24 @@ validate_resized_ring(Ring) -> FutureOwners = riak_core_ring:all_owners(FutureRing), Members = riak_core_ring:all_members(Ring), FutureMembers = riak_core_ring:all_members(FutureRing), - Invalid1 = [{Idx, Owner} || {Idx, Owner} <- Owners, - not lists:member(Owner, Members)], - Invalid2 = [{Idx, Owner} || {Idx, Owner} <- FutureOwners, - not lists:member(Owner, FutureMembers)], + Invalid1 = [{Idx, Owner} + || {Idx, Owner} <- Owners, + not lists:member(Owner, Members)], + Invalid2 = [{Idx, Owner} + || {Idx, Owner} <- FutureOwners, + not lists:member(Owner, FutureMembers)], case Invalid1 ++ Invalid2 of - [] -> - {true, Ring}; - _ -> - {false, Ring} + [] -> {true, Ring}; + _ -> {false, Ring} end. %% @private apply_changes(Ring, Changes) -> - NewRing = - lists:foldl( - fun({Node, Cmd}, RingAcc2) -> - RingAcc3 = change({Cmd, Node}, RingAcc2), - RingAcc3 - end, Ring, Changes), + NewRing = lists:foldl(fun ({Node, Cmd}, RingAcc2) -> + RingAcc3 = change({Cmd, Node}, RingAcc2), + RingAcc3 + end, + Ring, Changes), NewRing. 
%% @private @@ -731,12 +710,14 @@ change({join, Node}, Ring) -> Ring2; change({leave, Node}, Ring) -> Members = riak_core_ring:all_members(Ring), - lists:member(Node, Members) orelse throw(invalid_member), + lists:member(Node, Members) orelse + throw(invalid_member), Ring2 = riak_core_ring:leave_member(Node, Ring, Node), Ring2; change({remove, Node}, Ring) -> Members = riak_core_ring:all_members(Ring), - lists:member(Node, Members) orelse throw(invalid_member), + lists:member(Node, Members) orelse + throw(invalid_member), Ring2 = riak_core_ring:remove_member(Node, Ring, Node), Ring2; change({{replace, _NewNode}, Node}, Ring) -> @@ -746,12 +727,14 @@ change({{replace, _NewNode}, Node}, Ring) -> change({{force_replace, NewNode}, Node}, Ring) -> Indices = riak_core_ring:indices(Ring, Node), Reassign = [{Idx, NewNode} || Idx <- Indices], - Ring2 = riak_core_ring:add_member(NewNode, Ring, NewNode), + Ring2 = riak_core_ring:add_member(NewNode, Ring, + NewNode), Ring3 = riak_core_ring:change_owners(Ring2, Reassign), Ring4 = riak_core_ring:remove_member(Node, Ring3, Node), case riak_core_ring:is_resizing(Ring4) of - true -> replace_node_during_resize(Ring4, Node, NewNode); - false -> Ring4 + true -> + replace_node_during_resize(Ring4, Node, NewNode); + false -> Ring4 end; change({{resize, NewRingSize}, _Node}, Ring) -> riak_core_ring:resize(Ring, NewRingSize); @@ -762,9 +745,9 @@ change({{set_location, Location}, Node}, Ring) -> %%noinspection ErlangUnboundVariable internal_ring_changed(Node, CState) -> - {Changed, CState5} = do_claimant(Node, CState, fun log/2), + {Changed, CState5} = do_claimant(Node, CState, + fun log/2), inform_removed_nodes(Node, CState, CState5), - %% Start/stop converge and rebalance delay timers %% (converge delay) %% -- Starts when claimant changes the ring @@ -773,152 +756,159 @@ internal_ring_changed(Node, CState) -> %% -- Starts when next changes from empty to non-empty %% -- Stops when next changes from non-empty to empty %% - IsClaimant = 
(riak_core_ring:claimant(CState5) =:= Node), - WasPending = ([] /= riak_core_ring:pending_changes(CState)), - IsPending = ([] /= riak_core_ring:pending_changes(CState5)), - + IsClaimant = riak_core_ring:claimant(CState5) =:= Node, + WasPending = [] /= + riak_core_ring:pending_changes(CState), + IsPending = [] /= + riak_core_ring:pending_changes(CState5), %% Outer case statement already checks for ring_ready case {IsClaimant, Changed} of - {true, true} -> - %% STATS -%% riak_core_stat:update(converge_timer_end), - %% STATS -%% riak_core_stat:update(converge_timer_begin); - ok; - {true, false} -> - %% STATS -%% riak_core_stat:update(converge_timer_end); - ok; - _ -> - ok + {true, true} -> + %% STATS + %% riak_core_stat:update(converge_timer_end), + %% STATS + %% riak_core_stat:update(converge_timer_begin); + ok; + {true, false} -> + %% STATS + %% riak_core_stat:update(converge_timer_end); + ok; + _ -> ok end, - case {IsClaimant, WasPending, IsPending} of - {true, false, true} -> - %% STATS -%% riak_core_stat:update(rebalance_timer_begin); - ok; - {true, true, false} -> - %% STATS -%% riak_core_stat:update(rebalance_timer_end); - ok; - _ -> - ok + {true, false, true} -> + %% STATS + %% riak_core_stat:update(rebalance_timer_begin); + ok; + {true, true, false} -> + %% STATS + %% riak_core_stat:update(rebalance_timer_end); + ok; + _ -> ok end, - %% Set cluster name if it is undefined - case {IsClaimant, riak_core_ring:cluster_name(CState5)} of - {true, undefined} -> - ClusterName = {Node, riak_core_rand:rand_seed()}, - {_,_} = riak_core_util:rpc_every_member(riak_core_ring_manager, - set_cluster_name, - [ClusterName], - 1000), - ok; - _ -> - ClusterName = riak_core_ring:cluster_name(CState5), - ok + case {IsClaimant, riak_core_ring:cluster_name(CState5)} + of + {true, undefined} -> + ClusterName = {Node, erlang:timestamp()}, + {_, _} = + riak_core_util:rpc_every_member(riak_core_ring_manager, + set_cluster_name, [ClusterName], + 1000), + ok; + _ -> + ClusterName = 
riak_core_ring:cluster_name(CState5), ok end, - case Changed of - true -> - CState6 = riak_core_ring:set_cluster_name(CState5, ClusterName), - riak_core_ring:increment_vclock(Node, CState6); - false -> - CState5 + true -> + CState6 = riak_core_ring:set_cluster_name(CState5, + ClusterName), + riak_core_ring:increment_vclock(Node, CState6); + false -> CState5 end. inform_removed_nodes(Node, OldRing, NewRing) -> CName = riak_core_ring:cluster_name(NewRing), - Exiting = riak_core_ring:members(OldRing, [exiting]) -- [Node], + Exiting = riak_core_ring:members(OldRing, [exiting]) -- + [Node], Invalid = riak_core_ring:members(NewRing, [invalid]), - Changed = ordsets:intersection(ordsets:from_list(Exiting), - ordsets:from_list(Invalid)), + Changed = + ordsets:intersection(ordsets:from_list(Exiting), + ordsets:from_list(Invalid)), %% Tell exiting node to shutdown. - _ = [riak_core_ring_manager:refresh_ring(ExitingNode, CName) || - ExitingNode <- Changed], + _ = [riak_core_ring_manager:refresh_ring(ExitingNode, + CName) + || ExitingNode <- Changed], ok. do_claimant_quiet(Node, CState, Replacing, Seed) -> - do_claimant(Node, CState, Replacing, Seed, fun no_log/2). + do_claimant(Node, CState, Replacing, Seed, + fun no_log/2). do_claimant(Node, CState, Log) -> - do_claimant(Node, CState, [], riak_core_rand:rand_seed(), Log). + do_claimant(Node, CState, [], erlang:timestamp(), Log). 
do_claimant(Node, CState, Replacing, Seed, Log) -> AreJoining = are_joining_nodes(CState), {C1, CState2} = maybe_update_claimant(Node, CState), - {C2, CState3} = maybe_handle_auto_joining(Node, CState2), + {C2, CState3} = maybe_handle_auto_joining(Node, + CState2), case AreJoining of - true -> - %% Do not rebalance if there are joining nodes - Changed = C1 or C2, - CState5 = CState3; - false -> - {C3, CState4} = - maybe_update_ring(Node, CState3, Replacing, Seed, Log), - {C4, CState5} = maybe_remove_exiting(Node, CState4), - Changed = (C1 or C2 or C3 or C4) + true -> + %% Do not rebalance if there are joining nodes + Changed = C1 or C2, + CState5 = CState3; + false -> + {C3, CState4} = maybe_update_ring(Node, CState3, + Replacing, Seed, Log), + {C4, CState5} = maybe_remove_exiting(Node, CState4), + Changed = C1 or C2 or C3 or C4 end, {Changed, CState5}. %% @private maybe_update_claimant(Node, CState) -> - Members = riak_core_ring:members(CState, [valid, leaving]), + Members = riak_core_ring:members(CState, + [valid, leaving]), Claimant = riak_core_ring:claimant(CState), NextClaimant = hd(Members ++ [undefined]), ClaimantMissing = not lists:member(Claimant, Members), - case {ClaimantMissing, NextClaimant} of - {true, Node} -> - %% Become claimant - CState2 = riak_core_ring:set_claimant(CState, Node), - CState3 = riak_core_ring:increment_ring_version(Claimant, CState2), - {true, CState3}; - _ -> - {false, CState} + {true, Node} -> + %% Become claimant + CState2 = riak_core_ring:set_claimant(CState, Node), + CState3 = + riak_core_ring:increment_ring_version(Claimant, + CState2), + {true, CState3}; + _ -> {false, CState} end. %% @private maybe_update_ring(Node, CState, Replacing, Seed, Log) -> Claimant = riak_core_ring:claimant(CState), case Claimant of - Node -> - case riak_core_ring:claiming_members(CState) of - [] -> - %% Consider logging an error/warning here or even - %% intentionally crashing. 
This state makes no logical - %% sense given that it represents a cluster without any - %% active nodes. - {false, CState}; - _ -> - Resizing = riak_core_ring:is_resizing(CState), - {Changed, CState2} = - update_ring(Node, CState, Replacing, Seed, Log, Resizing), - {Changed, CState2} - end; - _ -> - {false, CState} + Node -> + case riak_core_ring:claiming_members(CState) of + [] -> + %% Consider logging an error/warning here or even + %% intentionally crashing. This state makes no logical + %% sense given that it represents a cluster without any + %% active nodes. + {false, CState}; + _ -> + Resizing = riak_core_ring:is_resizing(CState), + {Changed, CState2} = update_ring(Node, CState, + Replacing, Seed, Log, + Resizing), + {Changed, CState2} + end; + _ -> {false, CState} end. %% @private maybe_remove_exiting(Node, CState) -> Claimant = riak_core_ring:claimant(CState), case Claimant of - Node -> - %% Change exiting nodes to invalid, skipping this node. - Exiting = riak_core_ring:members(CState, [exiting]) -- [Node], - Changed = (Exiting /= []), - CState2 = - lists:foldl(fun(ENode, CState0) -> - ClearedCS = - riak_core_ring:clear_member_meta(Node, CState0, ENode), - riak_core_ring:set_member(Node, ClearedCS, ENode, - invalid, same_vclock) - end, CState, Exiting), - {Changed, CState2}; - _ -> - {false, CState} + Node -> + %% Change exiting nodes to invalid, skipping this node. + Exiting = riak_core_ring:members(CState, [exiting]) -- + [Node], + Changed = Exiting /= [], + CState2 = lists:foldl(fun (ENode, CState0) -> + ClearedCS = + riak_core_ring:clear_member_meta(Node, + CState0, + ENode), + riak_core_ring:set_member(Node, + ClearedCS, + ENode, + invalid, + same_vclock) + end, + CState, Exiting), + {Changed, CState2}; + _ -> {false, CState} end. 
%% @private @@ -929,9 +919,13 @@ are_joining_nodes(CState) -> %% @private auto_joining_nodes(CState) -> Joining = riak_core_ring:members(CState, [joining]), -%% case application:get_env(riak_core, staged_joins, true) of false -> Joining; true -> - [Member || Member <- Joining, riak_core_ring:get_member_meta(CState, Member, '$autojoin') == true]. -%% end. + %% case application:get_env(riak_core, staged_joins, true) of false -> Joining; true -> + [Member + || Member <- Joining, + riak_core_ring:get_member_meta(CState, Member, + '$autojoin') + == + true].%% end. %% @private maybe_handle_auto_joining(Node, CState) -> @@ -947,141 +941,152 @@ maybe_handle_joining(Node, CState) -> maybe_handle_joining(Node, Joining, CState) -> Claimant = riak_core_ring:claimant(CState), case Claimant of - Node -> - Changed = (Joining /= []), - CState2 = - lists:foldl(fun(JNode, CState0) -> - riak_core_ring:set_member(Node, CState0, JNode, - valid, same_vclock) - end, CState, Joining), - {Changed, CState2}; - _ -> - {false, CState} + Node -> + Changed = Joining /= [], + CState2 = lists:foldl(fun (JNode, CState0) -> + riak_core_ring:set_member(Node, CState0, + JNode, valid, + same_vclock) + end, + CState, Joining), + {Changed, CState2}; + _ -> {false, CState} end. 
%% @private -update_ring(CNode, CState, Replacing, Seed, Log, false) -> +update_ring(CNode, CState, Replacing, Seed, Log, + false) -> Next0 = riak_core_ring:pending_changes(CState), - - ?ROUT("Members: ~p~n", [riak_core_ring:members(CState, [joining, valid, - leaving, exiting, - invalid])]), + ?ROUT("Members: ~p~n", + [riak_core_ring:members(CState, + [joining, valid, leaving, exiting, + invalid])]), ?ROUT("Updating ring :: next0 : ~p~n", [Next0]), - %% Remove tuples from next for removed nodes - InvalidMembers = riak_core_ring:members(CState, [invalid]), - Next2 = lists:filter(fun(NInfo) -> - {Owner, NextOwner, _} = riak_core_ring:next_owner(NInfo), + InvalidMembers = riak_core_ring:members(CState, + [invalid]), + Next2 = lists:filter(fun (NInfo) -> + {Owner, NextOwner, _} = + riak_core_ring:next_owner(NInfo), not lists:member(Owner, InvalidMembers) and - not lists:member(NextOwner, InvalidMembers) - end, Next0), - CState2 = riak_core_ring:set_pending_changes(CState, Next2), - + not lists:member(NextOwner, InvalidMembers) + end, + Next0), + CState2 = riak_core_ring:set_pending_changes(CState, + Next2), %% Transfer ownership after completed handoff - {RingChanged1, CState3} = transfer_ownership(CState2, Log), + {RingChanged1, CState3} = transfer_ownership(CState2, + Log), ?ROUT("Updating ring :: next1 : ~p~n", [riak_core_ring:pending_changes(CState3)]), - %% Ressign leaving/inactive indices - {RingChanged2, CState4} = reassign_indices(CState3, Replacing, Seed, Log), + {RingChanged2, CState4} = reassign_indices(CState3, + Replacing, Seed, Log), ?ROUT("Updating ring :: next2 : ~p~n", [riak_core_ring:pending_changes(CState4)]), - %% Rebalance the ring as necessary. 
If pending changes exist ring %% is not rebalanced Next3 = rebalance_ring(CNode, CState4), - Log(debug,{"Pending ownership transfers: ~b~n", - [length(riak_core_ring:pending_changes(CState4))]}), - + Log(debug, + {"Pending ownership transfers: ~b~n", + [length(riak_core_ring:pending_changes(CState4))]}), %% Remove transfers to/from down nodes Next4 = handle_down_nodes(CState4, Next3), - - NextChanged = (Next0 /= Next4), - Changed = (NextChanged or RingChanged1 or RingChanged2), + NextChanged = Next0 /= Next4, + Changed = NextChanged or RingChanged1 or RingChanged2, case Changed of - true -> - OldS = ordsets:from_list([{Idx,O,NO} || {Idx,O,NO,_,_} <- Next0]), - NewS = ordsets:from_list([{Idx,O,NO} || {Idx,O,NO,_,_} <- Next4]), - Diff = ordsets:subtract(NewS, OldS), - _ = [Log(next, NChange) || NChange <- Diff], - ?ROUT("Updating ring :: next3 : ~p~n", [Next4]), - CState5 = riak_core_ring:set_pending_changes(CState4, Next4), - CState6 = riak_core_ring:increment_ring_version(CNode, CState5), - {true, CState6}; - false -> - {false, CState} + true -> + OldS = ordsets:from_list([{Idx, O, NO} + || {Idx, O, NO, _, _} <- Next0]), + NewS = ordsets:from_list([{Idx, O, NO} + || {Idx, O, NO, _, _} <- Next4]), + Diff = ordsets:subtract(NewS, OldS), + _ = [Log(next, NChange) || NChange <- Diff], + ?ROUT("Updating ring :: next3 : ~p~n", [Next4]), + CState5 = riak_core_ring:set_pending_changes(CState4, + Next4), + CState6 = riak_core_ring:increment_ring_version(CNode, + CState5), + {true, CState6}; + false -> {false, CState} end; -update_ring(CNode, CState, _Replacing, _Seed, _Log, true) -> - {Installed, CState1} = maybe_install_resized_ring(CState), - {Aborted, CState2} = riak_core_ring:maybe_abort_resize(CState1), +update_ring(CNode, CState, _Replacing, _Seed, _Log, + true) -> + {Installed, CState1} = + maybe_install_resized_ring(CState), + {Aborted, CState2} = + riak_core_ring:maybe_abort_resize(CState1), Changed = Installed orelse Aborted, case Changed of - true -> - CState3 = 
riak_core_ring:increment_ring_version(CNode, CState2), - {true, CState3}; - false -> - {false, CState} + true -> + CState3 = riak_core_ring:increment_ring_version(CNode, + CState2), + {true, CState3}; + false -> {false, CState} end. maybe_install_resized_ring(CState) -> case riak_core_ring:is_resize_complete(CState) of - true -> - {true, riak_core_ring:future_ring(CState)}; - false -> {false, CState} + true -> {true, riak_core_ring:future_ring(CState)}; + false -> {false, CState} end. %% @private transfer_ownership(CState, Log) -> Next = riak_core_ring:pending_changes(CState), %% Remove already completed and transfered changes - Next2 = lists:filter(fun(NInfo={Idx, _, _, _, _}) -> - {_, NewOwner, S} = riak_core_ring:next_owner(NInfo), - not ((S == complete) and - (riak_core_ring:index_owner(CState, Idx) =:= NewOwner)) - end, Next), - - CState2 = lists:foldl( - fun(NInfo={Idx, _, _, _, _}, CState0) -> - case riak_core_ring:next_owner(NInfo) of - {_, Node, complete} -> - Log(ownership, {Idx, Node, CState0}), - riak_core_ring:transfer_node(Idx, Node, - CState0); - _ -> - CState0 - end - end, CState, Next2), - - NextChanged = (Next2 /= Next), - RingChanged = (riak_core_ring:all_owners(CState) /= riak_core_ring:all_owners(CState2)), - Changed = (NextChanged or RingChanged), - CState3 = riak_core_ring:set_pending_changes(CState2, Next2), + Next2 = lists:filter(fun (NInfo = {Idx, _, _, _, _}) -> + {_, NewOwner, S} = + riak_core_ring:next_owner(NInfo), + not + ((S == complete) and + (riak_core_ring:index_owner(CState, Idx) + =:= NewOwner)) + end, + Next), + CState2 = lists:foldl(fun (NInfo = {Idx, _, _, _, _}, + CState0) -> + case riak_core_ring:next_owner(NInfo) of + {_, Node, complete} -> + Log(ownership, {Idx, Node, CState0}), + riak_core_ring:transfer_node(Idx, Node, + CState0); + _ -> CState0 + end + end, + CState, Next2), + NextChanged = Next2 /= Next, + RingChanged = riak_core_ring:all_owners(CState) /= + riak_core_ring:all_owners(CState2), + Changed = NextChanged 
or RingChanged, + CState3 = riak_core_ring:set_pending_changes(CState2, + Next2), {Changed, CState3}. - %% @private reassign_indices(CState, Replacing, Seed, Log) -> Next = riak_core_ring:pending_changes(CState), Invalid = riak_core_ring:members(CState, [invalid]), - CState2 = - lists:foldl(fun(Node, CState0) -> - remove_node(CState0, Node, invalid, - Replacing, Seed, Log) - end, CState, Invalid), + CState2 = lists:foldl(fun (Node, CState0) -> + remove_node(CState0, Node, invalid, Replacing, + Seed, Log) + end, + CState, Invalid), CState3 = case Next of - [] -> - Leaving = riak_core_ring:members(CState, [leaving]), - lists:foldl(fun(Node, CState0) -> - remove_node(CState0, Node, leaving, - Replacing, Seed, Log) - end, CState2, Leaving); - _ -> - CState2 + [] -> + Leaving = riak_core_ring:members(CState, [leaving]), + lists:foldl(fun (Node, CState0) -> + remove_node(CState0, Node, leaving, + Replacing, Seed, Log) + end, + CState2, Leaving); + _ -> CState2 end, Owners1 = riak_core_ring:all_owners(CState), Owners2 = riak_core_ring:all_owners(CState3), - RingChanged = (Owners1 /= Owners2), - NextChanged = (Next /= riak_core_ring:pending_changes(CState3)), + RingChanged = Owners1 /= Owners2, + NextChanged = Next /= + riak_core_ring:pending_changes(CState3), {RingChanged or NextChanged, CState3}. %% @private @@ -1098,29 +1103,29 @@ rebalance_ring(_CNode, [], CState) -> || {{Idx, PrevOwner}, {Idx, NewOwner}} <- Owners3, PrevOwner /= NewOwner], Next; -rebalance_ring(_CNode, Next, _CState) -> - Next. +rebalance_ring(_CNode, Next, _CState) -> Next. 
%% @private handle_down_nodes(CState, Next) -> - LeavingMembers = riak_core_ring:members(CState, [leaving, invalid]), + LeavingMembers = riak_core_ring:members(CState, + [leaving, invalid]), DownMembers = riak_core_ring:members(CState, [down]), Next2 = [begin - OwnerLeaving = lists:member(O, LeavingMembers), - NextDown = lists:member(NO, DownMembers), - case (OwnerLeaving and NextDown) of - true -> - Active = riak_core_ring:active_members(CState) -- [O], - RNode = lists:nth(riak_core_rand:uniform(length(Active)), - Active), - {Idx, O, RNode, Mods, Status}; - _ -> - T - end - end || T={Idx, O, NO, Mods, Status} <- Next], - Next3 = [T || T={_, O, NO, _, _} <- Next2, - not lists:member(O, DownMembers), - not lists:member(NO, DownMembers)], + OwnerLeaving = lists:member(O, LeavingMembers), + NextDown = lists:member(NO, DownMembers), + case OwnerLeaving and NextDown of + true -> + Active = riak_core_ring:active_members(CState) -- [O], + RNode = lists:nth(rand:uniform(length(Active)), Active), + {Idx, O, RNode, Mods, Status}; + _ -> T + end + end + || T = {Idx, O, NO, Mods, Status} <- Next], + Next3 = [T + || T = {_, O, NO, _, _} <- Next2, + not lists:member(O, DownMembers), + not lists:member(NO, DownMembers)], Next3. %% @private @@ -1131,88 +1136,105 @@ reassign_indices_to(Node, NewNode, Ring) -> Ring2. %% @private -remove_node(CState, Node, Status, Replacing, Seed, Log) -> +remove_node(CState, Node, Status, Replacing, Seed, + Log) -> Indices = riak_core_ring:indices(CState, Node), - remove_node(CState, Node, Status, Replacing, Seed, Log, Indices). + remove_node(CState, Node, Status, Replacing, Seed, Log, + Indices). 
%% @private -remove_node(CState, _Node, _Status, _Replacing, _Seed, _Log, []) -> +remove_node(CState, _Node, _Status, _Replacing, _Seed, + _Log, []) -> CState; -remove_node(CState, Node, Status, Replacing, Seed, Log, Indices) -> +remove_node(CState, Node, Status, Replacing, Seed, Log, + Indices) -> CStateT1 = riak_core_ring:change_owners(CState, riak_core_ring:all_next_owners(CState)), case orddict:find(Node, Replacing) of - {ok, NewNode} -> - CStateT2 = reassign_indices_to(Node, NewNode, CStateT1); - error -> - CStateT2 = riak_core_gossip:remove_from_cluster(CStateT1, Node, Seed) + {ok, NewNode} -> + CStateT2 = reassign_indices_to(Node, NewNode, CStateT1); + error -> + CStateT2 = + riak_core_gossip:remove_from_cluster(CStateT1, Node, + Seed) end, - Owners1 = riak_core_ring:all_owners(CState), Owners2 = riak_core_ring:all_owners(CStateT2), Owners3 = lists:zip(Owners1, Owners2), RemovedIndices = case Status of - invalid -> - Indices; - leaving -> - [] + invalid -> Indices; + leaving -> [] end, - Reassign = [{Idx, NewOwner} || {Idx, NewOwner} <- Owners2, - lists:member(Idx, RemovedIndices)], + Reassign = [{Idx, NewOwner} + || {Idx, NewOwner} <- Owners2, + lists:member(Idx, RemovedIndices)], Next = [{Idx, PrevOwner, NewOwner, [], awaiting} || {{Idx, PrevOwner}, {Idx, NewOwner}} <- Owners3, PrevOwner /= NewOwner, not lists:member(Idx, RemovedIndices)], - - _ = [Log(reassign, {Idx, NewOwner, CState}) || {Idx, NewOwner} <- Reassign], - + _ = [Log(reassign, {Idx, NewOwner, CState}) + || {Idx, NewOwner} <- Reassign], %% Unlike rebalance_ring, remove_node can be called when Next is non-empty, %% therefore we need to merge the values. Original Next has priority. 
- Next2 = lists:ukeysort(1, riak_core_ring:pending_changes(CState) ++ Next), - CState2 = riak_core_ring:change_owners(CState, Reassign), - CState3 = riak_core_ring:set_pending_changes(CState2, Next2), + Next2 = lists:ukeysort(1, + riak_core_ring:pending_changes(CState) ++ Next), + CState2 = riak_core_ring:change_owners(CState, + Reassign), + CState3 = riak_core_ring:set_pending_changes(CState2, + Next2), CState3. replace_node_during_resize(CState0, Node, NewNode) -> PostResize = riak_core_ring:is_post_resize(CState0), - CState1 = replace_node_during_resize(CState0, Node, NewNode, PostResize), - riak_core_ring:increment_ring_version(riak_core_ring:claimant(CState1), CState1). + CState1 = replace_node_during_resize(CState0, Node, + NewNode, PostResize), + riak_core_ring:increment_ring_version(riak_core_ring:claimant(CState1), + CState1). -replace_node_during_resize(CState0, Node, NewNode, false) -> %% ongoing xfers +replace_node_during_resize(CState0, Node, NewNode, + false) -> %% ongoing xfers %% for each of the indices being moved from Node to NewNode, reschedule resize %% transfers where the target is owned by Node. - CState1 = riak_core_ring:reschedule_resize_transfers(CState0, Node, NewNode), - + CState1 = + riak_core_ring:reschedule_resize_transfers(CState0, + Node, NewNode), %% since the resized chash is carried directly in state vs. 
being rebuilt via next %% list, perform reassignment - {ok, FutureCHash} = riak_core_ring:resized_ring(CState1), - FutureCState = riak_core_ring:set_chash(CState1, FutureCHash), - ReassignedFuture = reassign_indices_to(Node, NewNode, FutureCState), - ReassignedCHash = riak_core_ring:chash(ReassignedFuture), - riak_core_ring:set_resized_ring(CState1, ReassignedCHash); -replace_node_during_resize(CState, Node, _NewNode, true) -> %% performing cleanup + {ok, FutureCHash} = + riak_core_ring:resized_ring(CState1), + FutureCState = riak_core_ring:set_chash(CState1, + FutureCHash), + ReassignedFuture = reassign_indices_to(Node, NewNode, + FutureCState), + ReassignedCHash = + riak_core_ring:chash(ReassignedFuture), + riak_core_ring:set_resized_ring(CState1, + ReassignedCHash); +replace_node_during_resize(CState, Node, _NewNode, + true) -> %% performing cleanup %% we are simply deleting data at this point, no reason to do that on either node - NewNext = [{I,N,O,M,S} || {I,N,O,M,S} <- riak_core_ring:pending_changes(CState), - N =/= Node], + NewNext = [{I, N, O, M, S} + || {I, N, O, M, S} + <- riak_core_ring:pending_changes(CState), + N =/= Node], riak_core_ring:set_pending_changes(CState, NewNext). -no_log(_, _) -> - ok. +no_log(_, _) -> ok. -log(debug, {Msg, Args}) -> - logger:debug(Msg, Args); +log(debug, {Msg, Args}) -> logger:debug(Msg, Args); log(ownership, {Idx, NewOwner, CState}) -> Owner = riak_core_ring:index_owner(CState, Idx), - logger:debug("(new-owner) ~b :: ~p -> ~p~n", [Idx, Owner, NewOwner]); + logger:debug("(new-owner) ~b :: ~p -> ~p~n", + [Idx, Owner, NewOwner]); log(reassign, {Idx, NewOwner, CState}) -> Owner = riak_core_ring:index_owner(CState, Idx), - logger:debug("(reassign) ~b :: ~p -> ~p~n", [Idx, Owner, NewOwner]); + logger:debug("(reassign) ~b :: ~p -> ~p~n", + [Idx, Owner, NewOwner]); log(next, {Idx, Owner, NewOwner}) -> - logger:debug("(pending) ~b :: ~p -> ~p~n", [Idx, Owner, NewOwner]); -log(_, _) -> - ok. 
+ logger:debug("(pending) ~b :: ~p -> ~p~n", + [Idx, Owner, NewOwner]); +log(_, _) -> ok. %% =================================================================== %% EUnit tests %% =================================================================== - diff --git a/src/riak_core_eventhandler_guard.erl b/src/riak_core_eventhandler_guard.erl index 5381dce4d..b3e05ec2a 100644 --- a/src/riak_core_eventhandler_guard.erl +++ b/src/riak_core_eventhandler_guard.erl @@ -20,41 +20,50 @@ %% %% ------------------------------------------------------------------- -module(riak_core_eventhandler_guard). + -behaviour(gen_server). + -export([start_link/3, start_link/4]). --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). + +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). + -record(state, {handlermod, handler, exitfun}). start_link(HandlerMod, Handler, Args) -> start_link(HandlerMod, Handler, Args, undefined). start_link(HandlerMod, Handler, Args, ExitFun) -> - gen_server:start_link(?MODULE, [HandlerMod, Handler, Args, ExitFun], []). + gen_server:start_link(?MODULE, + [HandlerMod, Handler, Args, ExitFun], []). init([HandlerMod, Handler, Args, ExitFun]) -> - ok = gen_event:add_sup_handler(HandlerMod, Handler, Args), - {ok, #state{handlermod=HandlerMod, handler=Handler, exitfun=ExitFun}}. + ok = gen_event:add_sup_handler(HandlerMod, Handler, + Args), + {ok, + #state{handlermod = HandlerMod, handler = Handler, + exitfun = ExitFun}}. -handle_call(_Request, _From, State) -> {reply, ok, State}. +handle_call(_Request, _From, State) -> + {reply, ok, State}. handle_cast(_Msg, State) -> {noreply, State}. 
- -handle_info({gen_event_EXIT, _Handler, shutdown}, State) -> +handle_info({gen_event_EXIT, _Handler, shutdown}, + State) -> {stop, normal, State}; -handle_info({gen_event_EXIT, _Handler, normal}, State) -> +handle_info({gen_event_EXIT, _Handler, normal}, + State) -> {stop, normal, State}; -handle_info({gen_event_EXIT, Handler, _Reason}, State=#state{exitfun=undefined}) -> +handle_info({gen_event_EXIT, Handler, _Reason}, + State = #state{exitfun = undefined}) -> {stop, {gen_event_EXIT, Handler}, State}; -handle_info({gen_event_EXIT, Handler, Reason}, State=#state{exitfun=ExitFun}) -> +handle_info({gen_event_EXIT, Handler, Reason}, + State = #state{exitfun = ExitFun}) -> ExitFun(Handler, Reason), {stop, {gen_event_EXIT, Handler}, State}; -handle_info(_Info, State) -> - {noreply, State}. +handle_info(_Info, State) -> {noreply, State}. -terminate(_Reason, #state{}) -> - ok. +terminate(_Reason, #state{}) -> ok. code_change(_OldVsn, State, _Extra) -> {ok, State}. - diff --git a/src/riak_core_eventhandler_sup.erl b/src/riak_core_eventhandler_sup.erl index 550069b6c..a9bb53f3e 100644 --- a/src/riak_core_eventhandler_sup.erl +++ b/src/riak_core_eventhandler_sup.erl @@ -22,45 +22,54 @@ %% @doc supervise riak_core_eventhandler_guard processes -module(riak_core_eventhandler_sup). + -behaviour(supervisor). + -export([start_link/0, init/1]). --export([start_guarded_handler/3, start_guarded_handler/4, stop_guarded_handler/3]). + +-export([start_guarded_handler/3, + start_guarded_handler/4, stop_guarded_handler/3]). start_guarded_handler(HandlerMod, Handler, Args) -> - start_guarded_handler(HandlerMod, Handler, Args, undefined). + start_guarded_handler(HandlerMod, Handler, Args, + undefined). 
-start_guarded_handler(HandlerMod, Handler, Args, ExitFun) -> - case supervisor:start_child(?MODULE, handler_spec(HandlerMod, Handler, Args, ExitFun)) of - {ok, _Pid} -> ok; - Other -> Other +start_guarded_handler(HandlerMod, Handler, Args, + ExitFun) -> + case supervisor:start_child(?MODULE, + handler_spec(HandlerMod, Handler, Args, + ExitFun)) + of + {ok, _Pid} -> ok; + Other -> Other end. stop_guarded_handler(HandlerMod, Handler, Args) -> - case lists:member(Handler, gen_event:which_handlers(HandlerMod)) of - true -> - case gen_event:delete_handler(HandlerMod, Handler, Args) of - {error, module_not_found} -> - {error, module_not_found}; - O -> - Id = {HandlerMod, Handler}, - ok = supervisor:terminate_child(?MODULE, Id), - ok = supervisor:delete_child(?MODULE, Id), - O - end; - false -> - {error, module_not_found} + case lists:member(Handler, + gen_event:which_handlers(HandlerMod)) + of + true -> + case gen_event:delete_handler(HandlerMod, Handler, Args) + of + {error, module_not_found} -> {error, module_not_found}; + O -> + Id = {HandlerMod, Handler}, + ok = supervisor:terminate_child(?MODULE, Id), + ok = supervisor:delete_child(?MODULE, Id), + O + end; + false -> {error, module_not_found} end. handler_spec(HandlerMod, Handler, Args, ExitFun) -> {{HandlerMod, Handler}, - {riak_core_eventhandler_guard, start_link, [HandlerMod, Handler, Args, ExitFun]}, - transient, 5000, worker, [riak_core_eventhandler_guard]}. + {riak_core_eventhandler_guard, start_link, + [HandlerMod, Handler, Args, ExitFun]}, + transient, 5000, worker, + [riak_core_eventhandler_guard]}. start_link() -> supervisor:start_link({local, ?MODULE}, ?MODULE, []). %% @private -init([]) -> - {ok, {{one_for_one, 10, 10}, []}}. - - +init([]) -> {ok, {{one_for_one, 10, 10}, []}}. 
diff --git a/src/riak_core_gen_server.erl b/src/riak_core_gen_server.erl deleted file mode 100644 index 145f136aa..000000000 --- a/src/riak_core_gen_server.erl +++ /dev/null @@ -1,1052 +0,0 @@ -%% This file is a copy of gen_server.erl from the R13B-1 Erlang/OTP -%% distribution, with the following modifications: -%% -%% 1) the module name is riak_core_gen_server -%% -%% 2) more efficient handling of selective receives in callbacks -%% riak_core_gen_server processes drain their message queue into an internal -%% buffer before invoking any callback module functions. Messages are -%% dequeued from the buffer for processing. Thus the effective message -%% queue of a riak_core_gen_server process is the concatenation of the internal -%% buffer and the real message queue. -%% As a result of the draining, any selective receive invoked inside a -%% callback is less likely to have to scan a large message queue. -%% -%% 3) riak_core_gen_server:cast is guaranteed to be order-preserving -%% The original code could reorder messages when communicating with a -%% process on a remote node that was not currently connected. -%% -%% 4) The new functions riak_core_gen_server:pcall/3, pcall/4, and pcast/3 -%% allow callers to attach priorities to requests. Requests with -%% higher priorities are processed before requests with lower -%% priorities. The default priority is 0. -%% -%% 5) On return from init/1, the timeout value {binary, Min} creates a -%% binary exponential timeout, where Min is the minimum number of -%% milliseconds permitted, and is also used as the current timeout -%% value. Returning from handle_* with the timeout value set to -%% 'binary' will use the current binary timeout value. handle_info/2 -%% with the Info of 'timeout' will function normally, and supports the -%% return value of {noreply, State, hibernate} which will hibernate -%% the process. 
The current timeout value is: -%% -%% a) doubled if the time spent in hibernation is < 4 * the current value; -%% b) halved if the time spent in hibernation is > 16 * the current value; -%% c) maintained in all other cases -%% -%% Explicit timeouts (i.e. not 'binary') from the handle_* functions -%% are still supported, and do not have any effect on the current -%% timeout value. - -%% All modifications are (C) 2009 LShift Ltd. - -%% ``The contents of this file are subject to the Erlang Public License, -%% Version 1.1, (the "License"); you may not use this file except in -%% compliance with the License. You should have received a copy of the -%% Erlang Public License along with this software. If not, it can be -%% retrieved via the world wide web at http://www.erlang.org/. -%% -%% Software distributed under the License is distributed on an "AS IS" -%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See -%% the License for the specific language governing rights and limitations -%% under the License. -%% -%% The Initial Developer of the Original Code is Ericsson Utvecklings AB. -%% Portions created by Ericsson are Copyright 1999, Ericsson Utvecklings -%% AB. All Rights Reserved.'' -%% -%% $Id$ -%% --module(riak_core_gen_server). - -%%% --------------------------------------------------- -%%% -%%% The idea behind THIS server is that the user module -%%% provides (different) functions to handle different -%%% kind of inputs. -%%% If the Parent process terminates the Module:terminate/2 -%%% function is called. 
-%%% -%%% The user module should export: -%%% -%%% init(Args) -%%% ==> {ok, State} -%%% {ok, State, Timeout} -%%% ignore -%%% {stop, Reason} -%%% -%%% handle_call(Msg, {From, Tag}, State) -%%% -%%% ==> {reply, Reply, State} -%%% {reply, Reply, State, Timeout} -%%% {noreply, State} -%%% {noreply, State, Timeout} -%%% {stop, Reason, Reply, State} -%%% Reason = normal | shutdown | Term terminate(State) is called -%%% -%%% handle_cast(Msg, State) -%%% -%%% ==> {noreply, State} -%%% {noreply, State, Timeout} -%%% {stop, Reason, State} -%%% Reason = normal | shutdown | Term terminate(State) is called -%%% -%%% handle_info(Info, State) Info is e.g. {'EXIT', P, R}, {nodedown, N}, ... -%%% -%%% ==> {noreply, State} -%%% {noreply, State, Timeout} -%%% {stop, Reason, State} -%%% Reason = normal | shutdown | Term, terminate(State) is called -%%% -%%% terminate(Reason, State) Let the user module clean up -%%% always called when server terminates -%%% -%%% ==> ok -%%% -%%% -%%% The work flow (of the server) can be described as follows: -%%% -%%% User module Generic -%%% ----------- ------- -%%% start -----> start -%%% init <----- . -%%% -%%% loop -%%% handle_call <----- . -%%% -----> reply -%%% -%%% handle_cast <----- . -%%% -%%% handle_info <----- . -%%% -%%% terminate <----- . -%%% -%%% -----> reply -%%% -%%% -%%% --------------------------------------------------- - -%% API --export([start/3, start/4, - start_link/3, start_link/4, - call/2, call/3, pcall/3, pcall/4, - cast/2, pcast/3, reply/2, - abcast/2, abcast/3, - multi_call/2, multi_call/3, multi_call/4, - enter_loop/3, enter_loop/4, enter_loop/5, wake_hib/7]). - - -%% System exports --export([system_continue/3, - system_terminate/4, - system_code_change/4, - format_status/2]). - -%% Internal exports --export([init_it/6, print_event/3]). - --import(error_logger, [format/2]). 
- -%%%========================================================================= -%%% API -%%%========================================================================= - --callback init(Args :: term()) -> - {ok, State :: term()} | {ok, State :: term(), timeout() | hibernate} | - {stop, Reason :: term()} | ignore. --callback handle_call(Request :: term(), From :: {pid(), Tag :: term()}, - State :: term()) -> - {reply, Reply :: term(), NewState :: term()} | - {reply, Reply :: term(), NewState :: term(), timeout() | hibernate} | - {noreply, NewState :: term()} | - {noreply, NewState :: term(), timeout() | hibernate} | - {stop, Reason :: term(), Reply :: term(), NewState :: term()} | - {stop, Reason :: term(), NewState :: term()}. --callback handle_cast(Request :: term(), State :: term()) -> - {noreply, NewState :: term()} | - {noreply, NewState :: term(), timeout() | hibernate} | - {stop, Reason :: term(), NewState :: term()}. --callback handle_info(Info :: timeout | term(), State :: term()) -> - {noreply, NewState :: term()} | - {noreply, NewState :: term(), timeout() | hibernate} | - {stop, Reason :: term(), NewState :: term()}. --callback terminate(Reason :: (normal | shutdown | {shutdown, term()} | - term()), - State :: term()) -> - term(). --callback code_change(OldVsn :: (term() | {down, term()}), State :: term(), - Extra :: term()) -> - {ok, NewState :: term()} | {error, Reason :: term()}. - -%%%========================================================================= -%%% Preprocessor -%%%========================================================================= - --ifdef(deprecated_22). -get_log(Debug) -> - sys:get_log(Debug). --else. -get_log(Debug) -> - sys:get_debug(log, Debug, []). --endif. - -%%% ----------------------------------------------------------------- -%%% Starts a generic server. 
-%%% start(Mod, Args, Options) -%%% start(Name, Mod, Args, Options) -%%% start_link(Mod, Args, Options) -%%% start_link(Name, Mod, Args, Options) where: -%%% Name ::= {local, atom()} | {global, atom()} -%%% Mod ::= atom(), callback module implementing the 'real' server -%%% Args ::= term(), init arguments (to Mod:init/1) -%%% Options ::= [{timeout, Timeout} | {debug, [Flag]}] -%%% Flag ::= trace | log | {logfile, File} | statistics | debug -%%% (debug == log && statistics) -%%% Returns: {ok, Pid} | -%%% {error, {already_started, Pid}} | -%%% {error, Reason} -%%% ----------------------------------------------------------------- -start(Mod, Args, Options) -> - gen:start(?MODULE, nolink, Mod, Args, Options). - -start(Name, Mod, Args, Options) -> - gen:start(?MODULE, nolink, Name, Mod, Args, Options). - -start_link(Mod, Args, Options) -> - gen:start(?MODULE, link, Mod, Args, Options). - -start_link(Name, Mod, Args, Options) -> - gen:start(?MODULE, link, Name, Mod, Args, Options). - - -%% ----------------------------------------------------------------- -%% Make a call to a generic server. -%% If the server is located at another node, that node will -%% be monitored. -%% If the client is trapping exits and is linked server termination -%% is handled here (? Shall we do that here (or rely on timeouts) ?). -%% ----------------------------------------------------------------- -call(Name, Request) -> - case catch gen:call(Name, '$gen_call', Request) of - {ok,Res} -> - Res; - {'EXIT',Reason} -> - exit({Reason, {?MODULE, call, [Name, Request]}}) - end. - -call(Name, Request, Timeout) -> - case catch gen:call(Name, '$gen_call', Request, Timeout) of - {ok,Res} -> - Res; - {'EXIT',Reason} -> - exit({Reason, {?MODULE, call, [Name, Request, Timeout]}}) - end. - -pcall(Name, Priority, Request) -> - case catch gen:call(Name, '$gen_pcall', {Priority, Request}) of - {ok,Res} -> - Res; - {'EXIT',Reason} -> - exit({Reason, {?MODULE, pcall, [Name, Priority, Request]}}) - end. 
- -pcall(Name, Priority, Request, Timeout) -> - case catch gen:call(Name, '$gen_pcall', {Priority, Request}, Timeout) of - {ok,Res} -> - Res; - {'EXIT',Reason} -> - exit({Reason, {?MODULE, pcall, [Name, Priority, Request, Timeout]}}) - end. - -%% ----------------------------------------------------------------- -%% Make a cast to a generic server. -%% ----------------------------------------------------------------- -cast({global,Name}, Request) -> - catch global:send(Name, cast_msg(Request)), - ok; -cast({Name,Node}=Dest, Request) when is_atom(Name), is_atom(Node) -> - do_cast(Dest, Request); -cast(Dest, Request) when is_atom(Dest) -> - do_cast(Dest, Request); -cast(Dest, Request) when is_pid(Dest) -> - do_cast(Dest, Request). - -do_cast(Dest, Request) -> - do_send(Dest, cast_msg(Request)), - ok. - -cast_msg(Request) -> {'$gen_cast',Request}. - -pcast({global,Name}, Priority, Request) -> - catch global:send(Name, cast_msg(Priority, Request)), - ok; -pcast({Name,Node}=Dest, Priority, Request) when is_atom(Name), is_atom(Node) -> - do_cast(Dest, Priority, Request); -pcast(Dest, Priority, Request) when is_atom(Dest) -> - do_cast(Dest, Priority, Request); -pcast(Dest, Priority, Request) when is_pid(Dest) -> - do_cast(Dest, Priority, Request). - -do_cast(Dest, Priority, Request) -> - do_send(Dest, cast_msg(Priority, Request)), - ok. - -cast_msg(Priority, Request) -> {'$gen_pcast', {Priority, Request}}. - -%% ----------------------------------------------------------------- -%% Send a reply to the client. -%% ----------------------------------------------------------------- -reply({To, Tag}, Reply) -> - catch To ! {Tag, Reply}. - -%% ----------------------------------------------------------------- -%% Asyncronous broadcast, returns nothing, it's just send'n prey -%%----------------------------------------------------------------- -abcast(Name, Request) when is_atom(Name) -> - do_abcast([node() | nodes()], Name, cast_msg(Request)). 
- -abcast(Nodes, Name, Request) when is_list(Nodes), is_atom(Name) -> - do_abcast(Nodes, Name, cast_msg(Request)). - -do_abcast([Node|Nodes], Name, Msg) when is_atom(Node) -> - do_send({Name,Node},Msg), - do_abcast(Nodes, Name, Msg); -do_abcast([], _,_) -> abcast. - -%%% ----------------------------------------------------------------- -%%% Make a call to servers at several nodes. -%%% Returns: {[Replies],[BadNodes]} -%%% A Timeout can be given -%%% -%%% A middleman process is used in case late answers arrives after -%%% the timeout. If they would be allowed to glog the callers message -%%% queue, it would probably become confused. Late answers will -%%% now arrive to the terminated middleman and so be discarded. -%%% ----------------------------------------------------------------- -multi_call(Name, Req) - when is_atom(Name) -> - do_multi_call([node() | nodes()], Name, Req, infinity). - -multi_call(Nodes, Name, Req) - when is_list(Nodes), is_atom(Name) -> - do_multi_call(Nodes, Name, Req, infinity). - -multi_call(Nodes, Name, Req, infinity) -> - do_multi_call(Nodes, Name, Req, infinity); -multi_call(Nodes, Name, Req, Timeout) - when is_list(Nodes), is_atom(Name), is_integer(Timeout), Timeout >= 0 -> - do_multi_call(Nodes, Name, Req, Timeout). - - -%%----------------------------------------------------------------- -%% enter_loop(Mod, Options, State, , ) ->_ -%% -%% Description: Makes an existing process into a gen_server. -%% The calling process will enter the gen_server receive -%% loop and become a gen_server process. -%% The process *must* have been started using one of the -%% start functions in proc_lib, see proc_lib(3). -%% The user is responsible for any initialization of the -%% process, including registering a name for it. -%%----------------------------------------------------------------- -enter_loop(Mod, Options, State) -> - enter_loop(Mod, Options, State, self(), infinity). 
- -enter_loop(Mod, Options, State, ServerName = {_, _}) -> - enter_loop(Mod, Options, State, ServerName, infinity); - -enter_loop(Mod, Options, State, Timeout) -> - enter_loop(Mod, Options, State, self(), Timeout). - -enter_loop(Mod, Options, State, ServerName, Timeout) -> - Name = get_proc_name(ServerName), - Parent = get_parent(), - Debug = debug_options(Name, Options), - Queue = riak_core_priority_queue:new(), - {Timeout1, TimeoutState} = build_timeout_state(Timeout), - loop(Parent, Name, State, Mod, Timeout1, TimeoutState, Queue, Debug). - -%%%======================================================================== -%%% Gen-callback functions -%%%======================================================================== - -%%% --------------------------------------------------- -%%% Initiate the new process. -%%% Register the name using the Rfunc function -%%% Calls the Mod:init/Args function. -%%% Finally an acknowledge is sent to Parent and the main -%%% loop is entered. -%%% --------------------------------------------------- -init_it(Starter, self, Name, Mod, Args, Options) -> - init_it(Starter, self(), Name, Mod, Args, Options); -init_it(Starter, Parent, Name0, Mod, Args, Options) -> - Name = name(Name0), - Debug = debug_options(Name, Options), - Queue = riak_core_priority_queue:new(), - case catch Mod:init(Args) of - {ok, State} -> - proc_lib:init_ack(Starter, {ok, self()}), - loop(Parent, Name, State, Mod, infinity, undefined, Queue, Debug); - {ok, State, Timeout} -> - proc_lib:init_ack(Starter, {ok, self()}), - {Timeout1, TimeoutState} = build_timeout_state(Timeout), - loop(Parent, Name, State, Mod, Timeout1, TimeoutState, Queue, - Debug); - {stop, Reason} -> - %% For consistency, we must make sure that the - %% registered name (if any) is unregistered before - %% the parent process is notified about the failure. - %% (Otherwise, the parent process could get - %% an 'already_started' error if it immediately - %% tried starting the process again.) 
- unregister_name(Name0), - proc_lib:init_ack(Starter, {error, Reason}), - exit(Reason); - ignore -> - unregister_name(Name0), - proc_lib:init_ack(Starter, ignore), - exit(normal); - {'EXIT', Reason} -> - unregister_name(Name0), - proc_lib:init_ack(Starter, {error, Reason}), - exit(Reason); - Else -> - Error = {bad_return_value, Else}, - proc_lib:init_ack(Starter, {error, Error}), - exit(Error) - end. - -name({local,Name}) -> Name; -name({global,Name}) -> Name; -%% name(Pid) when is_pid(Pid) -> Pid; -%% when R11 goes away, drop the line beneath and uncomment the line above -name(Name) -> Name. - -unregister_name({local,Name}) -> - _ = (catch unregister(Name)); -unregister_name({global,Name}) -> - _ = global:unregister_name(Name); -unregister_name(Pid) when is_pid(Pid) -> - Pid. - -build_timeout_state(Timeout) -> - case Timeout of - {binary, Min} -> {binary, {Min, Min, undefined}}; - _ -> {Timeout, undefined} - end. - -%%%======================================================================== -%%% Internal functions -%%%======================================================================== -%%% --------------------------------------------------- -%%% The MAIN loop. -%%% --------------------------------------------------- -loop(Parent, Name, State, Mod, hibernate, undefined, Queue, Debug) -> - proc_lib:hibernate(?MODULE,wake_hib, - [Parent, Name, State, Mod, undefined, Queue, Debug]); -loop(Parent, Name, State, Mod, hibernate, {Current, Min, undefined}, Queue, - Debug) -> - proc_lib:hibernate(?MODULE,wake_hib,[Parent, Name, State, Mod, - {Current, Min, os:timestamp()}, Queue, Debug]); -loop(Parent, Name, State, Mod, Time, TimeoutState, Queue, Debug) -> - receive - Input -> loop(Parent, Name, State, Mod, - Time, TimeoutState, in(Input, Queue), Debug) - after 0 -> - process_next_msg(Parent, Name, State, Mod, Time, TimeoutState, - Queue, Debug, false) - end. 
- -process_next_msg(Parent, Name, State, Mod, Time, TimeoutState, Queue, - Debug, Hib) -> - case riak_core_priority_queue:out(Queue) of - {{value, Msg}, Queue1} -> - process_msg(Parent, Name, State, Mod, - Time, TimeoutState, Queue1, Debug, Hib, Msg); - {empty, Queue1} -> - Time1 = case {Time, TimeoutState} of - {binary, {Current, _Min, undefined}} -> Current; - _ -> Time - end, - receive - Input -> - loop(Parent, Name, State, Mod, - Time, TimeoutState, in(Input, Queue1), Debug) - after Time1 -> - process_msg(Parent, Name, State, Mod, - Time, TimeoutState, Queue1, Debug, Hib, timeout) - end - end. - -wake_hib(Parent, Name, State, Mod, TimeoutState, Queue, Debug) -> - Msg = receive - Input -> - Input - end, - TimeoutState1 = adjust_hibernate_after(TimeoutState), - process_next_msg(Parent, Name, State, Mod, hibernate, TimeoutState1, - in(Msg, Queue), Debug, true). - -adjust_hibernate_after(undefined) -> - undefined; -adjust_hibernate_after({Current, Min, HibernatedAt}) -> - NapLengthMicros = timer:now_diff(os:timestamp(), HibernatedAt), - CurrentMicros = Current * 1000, - LowTargetMicros = CurrentMicros * 4, - HighTargetMicros = LowTargetMicros * 4, - if - NapLengthMicros < LowTargetMicros -> - %% nap was too short, don't go to sleep as soon - {Current * 2, Min, undefined}; - - NapLengthMicros > HighTargetMicros -> - %% nap was long, try going to sleep sooner - {lists:max([Min, round(Current / 2)]), Min, undefined}; - - true -> - %% nap and timeout seem to be in the right relationship. stay here - {Current, Min, undefined} - end. - -in({'$gen_pcast', {Priority, Msg}}, Queue) -> - riak_core_priority_queue:in({'$gen_cast', Msg}, Priority, Queue); -in({'$gen_pcall', From, {Priority, Msg}}, Queue) -> - riak_core_priority_queue:in({'$gen_call', From, Msg}, Priority, Queue); -in(Input, Queue) -> - riak_core_priority_queue:in(Input, Queue). 
- -process_msg(Parent, Name, State, Mod, Time, TimeoutState, Queue, - Debug, _Hib, Msg) -> - case Msg of - {system, From, Req} -> - sys:handle_system_msg - (Req, From, Parent, ?MODULE, Debug, - [Name, State, Mod, Time, TimeoutState, Queue]); - %% gen_server puts Hib on the end as the 7th arg, but that - %% version of the function seems not to be documented so - %% leaving out for now. - {'EXIT', Parent, Reason} -> - terminate(Reason, Name, Msg, Mod, State, Debug); - _Msg when Debug =:= [] -> - handle_msg(Msg, Parent, Name, State, Mod, TimeoutState, Queue); - _Msg -> - Debug1 = sys:handle_debug(Debug, fun print_event/3, - Name, {in, Msg}), - handle_msg(Msg, Parent, Name, State, Mod, TimeoutState, Queue, - Debug1) - end. - -%%% --------------------------------------------------- -%%% Send/recive functions -%%% --------------------------------------------------- -do_send(Dest, Msg) -> - catch erlang:send(Dest, Msg). - -do_multi_call(Nodes, Name, Req, infinity) -> - Tag = make_ref(), - Monitors = send_nodes(Nodes, Name, Tag, Req), - rec_nodes(Tag, Monitors, Name, undefined); -do_multi_call(Nodes, Name, Req, Timeout) -> - Tag = make_ref(), - Caller = self(), - Receiver = - spawn( - fun() -> - %% Middleman process. Should be unsensitive to regular - %% exit signals. The sychronization is needed in case - %% the receiver would exit before the caller started - %% the monitor. - process_flag(trap_exit, true), - Mref = erlang:monitor(process, Caller), - receive - {Caller,Tag} -> - Monitors = send_nodes(Nodes, Name, Tag, Req), - TimerId = erlang:start_timer(Timeout, self(), ok), - Result = rec_nodes(Tag, Monitors, Name, TimerId), - exit({self(),Tag,Result}); - {'DOWN',Mref,_,_,_} -> - %% Caller died before sending us the go-ahead. - %% Give up silently. - exit(normal) - end - end), - Mref = erlang:monitor(process, Receiver), - Receiver ! {self(),Tag}, - receive - {'DOWN',Mref,_,_,{Receiver,Tag,Result}} -> - Result; - {'DOWN',Mref,_,_,Reason} -> - %% The middleman code failed. 
Or someone did - %% exit(_, kill) on the middleman process => Reason==killed - exit(Reason) - end. - -send_nodes(Nodes, Name, Tag, Req) -> - send_nodes(Nodes, Name, Tag, Req, []). - -send_nodes([Node|Tail], Name, Tag, Req, Monitors) - when is_atom(Node) -> - Monitor = start_monitor(Node, Name), - %% Handle non-existing names in rec_nodes. - catch {Name, Node} ! {'$gen_call', {self(), {Tag, Node}}, Req}, - send_nodes(Tail, Name, Tag, Req, [Monitor | Monitors]); -send_nodes([_Node|Tail], Name, Tag, Req, Monitors) -> - %% Skip non-atom Node - send_nodes(Tail, Name, Tag, Req, Monitors); -send_nodes([], _Name, _Tag, _Req, Monitors) -> - Monitors. - -%% Against old nodes: -%% If no reply has been delivered within 2 secs. (per node) check that -%% the server really exists and wait for ever for the answer. -%% -%% Against contemporary nodes: -%% Wait for reply, server 'DOWN', or timeout from TimerId. - -rec_nodes(Tag, Nodes, Name, TimerId) -> - rec_nodes(Tag, Nodes, Name, [], [], 2000, TimerId). - -rec_nodes(Tag, [{N,R}|Tail], Name, Badnodes, Replies, Time, TimerId ) -> - receive - {'DOWN', R, _, _, _} -> - rec_nodes(Tag, Tail, Name, [N|Badnodes], Replies, Time, TimerId); - {{Tag, N}, Reply} -> %% Tag is bound !!! - unmonitor(R), - rec_nodes(Tag, Tail, Name, Badnodes, - [{N,Reply}|Replies], Time, TimerId); - {timeout, TimerId, _} -> - unmonitor(R), - %% Collect all replies that already have arrived - rec_nodes_rest(Tag, Tail, Name, [N|Badnodes], Replies) - end; -rec_nodes(Tag, [N|Tail], Name, Badnodes, Replies, Time, TimerId) -> - %% R6 node - receive - {nodedown, N} -> - monitor_node(N, false), - rec_nodes(Tag, Tail, Name, [N|Badnodes], Replies, 2000, TimerId); - {{Tag, N}, Reply} -> %% Tag is bound !!! 
- receive {nodedown, N} -> ok after 0 -> ok end, - monitor_node(N, false), - rec_nodes(Tag, Tail, Name, Badnodes, - [{N,Reply}|Replies], 2000, TimerId); - {timeout, TimerId, _} -> - receive {nodedown, N} -> ok after 0 -> ok end, - monitor_node(N, false), - %% Collect all replies that already have arrived - rec_nodes_rest(Tag, Tail, Name, [N | Badnodes], Replies) - after Time -> - case riak_core_util:safe_rpc(N, erlang, whereis, [Name]) of - Pid when is_pid(Pid) -> % It exists try again. - rec_nodes(Tag, [N|Tail], Name, Badnodes, - Replies, infinity, TimerId); - _ -> % badnode - receive {nodedown, N} -> ok after 0 -> ok end, - monitor_node(N, false), - rec_nodes(Tag, Tail, Name, [N|Badnodes], - Replies, 2000, TimerId) - end - end; -rec_nodes(_, [], _, Badnodes, Replies, _, TimerId) -> - case catch erlang:cancel_timer(TimerId) of - false -> % It has already sent it's message - receive - {timeout, TimerId, _} -> ok - after 0 -> - ok - end; - _ -> % Timer was cancelled, or TimerId was 'undefined' - ok - end, - {Replies, Badnodes}. - -%% Collect all replies that already have arrived -rec_nodes_rest(Tag, [{N,R}|Tail], Name, Badnodes, Replies) -> - receive - {'DOWN', R, _, _, _} -> - rec_nodes_rest(Tag, Tail, Name, [N|Badnodes], Replies); - {{Tag, N}, Reply} -> %% Tag is bound !!! - unmonitor(R), - rec_nodes_rest(Tag, Tail, Name, Badnodes, [{N,Reply}|Replies]) - after 0 -> - unmonitor(R), - rec_nodes_rest(Tag, Tail, Name, [N|Badnodes], Replies) - end; -rec_nodes_rest(Tag, [N|Tail], Name, Badnodes, Replies) -> - %% R6 node - receive - {nodedown, N} -> - monitor_node(N, false), - rec_nodes_rest(Tag, Tail, Name, [N|Badnodes], Replies); - {{Tag, N}, Reply} -> %% Tag is bound !!! 
- receive {nodedown, N} -> ok after 0 -> ok end, - monitor_node(N, false), - rec_nodes_rest(Tag, Tail, Name, Badnodes, [{N,Reply}|Replies]) - after 0 -> - receive {nodedown, N} -> ok after 0 -> ok end, - monitor_node(N, false), - rec_nodes_rest(Tag, Tail, Name, [N|Badnodes], Replies) - end; -rec_nodes_rest(_Tag, [], _Name, Badnodes, Replies) -> - {Replies, Badnodes}. - - -%%% --------------------------------------------------- -%%% Monitor functions -%%% --------------------------------------------------- - -start_monitor(Node, Name) when is_atom(Node), is_atom(Name) -> - if node() =:= nonode@nohost, Node =/= nonode@nohost -> - Ref = make_ref(), - self() ! {'DOWN', Ref, process, {Name, Node}, noconnection}, - {Node, Ref}; - true -> - case catch erlang:monitor(process, {Name, Node}) of - {'EXIT', _} -> - %% Remote node is R6 - monitor_node(Node, true), - Node; - Ref when is_reference(Ref) -> - {Node, Ref} - end - end. - -%% Cancels a monitor started with Ref=erlang:monitor(_, _). -unmonitor(Ref) when is_reference(Ref) -> - erlang:demonitor(Ref), - receive - {'DOWN', Ref, _, _, _} -> - true - after 0 -> - true - end. - -%%% --------------------------------------------------- -%%% Message handling functions -%%% --------------------------------------------------- - -dispatch({'$gen_cast', Msg}, Mod, State) -> - Mod:handle_cast(Msg, State); -dispatch(Info, Mod, State) -> - Mod:handle_info(Info, State). 
- -handle_msg({'$gen_call', From, Msg}, - Parent, Name, State, Mod, TimeoutState, Queue) -> - case catch Mod:handle_call(Msg, From, State) of - {reply, Reply, NState} -> - reply(From, Reply), - loop(Parent, Name, NState, Mod, infinity, TimeoutState, Queue, []); - {reply, Reply, NState, Time1} -> - reply(From, Reply), - loop(Parent, Name, NState, Mod, Time1, TimeoutState, Queue, []); - {noreply, NState} -> - loop(Parent, Name, NState, Mod, infinity, TimeoutState, Queue, []); - {noreply, NState, Time1} -> - loop(Parent, Name, NState, Mod, Time1, TimeoutState, Queue, []); - {stop, Reason, Reply, NState} -> - {'EXIT', R} = - (catch terminate(Reason, Name, Msg, Mod, NState, [])), - reply(From, Reply), - exit(R); - Other -> handle_common_reply(Other, Parent, Name, Msg, Mod, State, - TimeoutState, Queue) - end; -handle_msg(Msg, - Parent, Name, State, Mod, TimeoutState, Queue) -> - Reply = (catch dispatch(Msg, Mod, State)), - handle_common_reply(Reply, Parent, Name, Msg, Mod, State, - TimeoutState, Queue). 
- -handle_msg({'$gen_call', From, Msg}, - Parent, Name, State, Mod, TimeoutState, Queue, Debug) -> - case catch Mod:handle_call(Msg, From, State) of - {reply, Reply, NState} -> - Debug1 = reply(Name, From, Reply, NState, Debug), - loop(Parent, Name, NState, Mod, infinity, TimeoutState, Queue, - Debug1); - {reply, Reply, NState, Time1} -> - Debug1 = reply(Name, From, Reply, NState, Debug), - loop(Parent, Name, NState, Mod, Time1, TimeoutState, Queue, Debug1); - {noreply, NState} -> - Debug1 = sys:handle_debug(Debug, fun print_event/3, Name, - {noreply, NState}), - loop(Parent, Name, NState, Mod, infinity, TimeoutState, Queue, - Debug1); - {noreply, NState, Time1} -> - Debug1 = sys:handle_debug(Debug, fun print_event/3, Name, - {noreply, NState}), - loop(Parent, Name, NState, Mod, Time1, TimeoutState, Queue, Debug1); - {stop, Reason, Reply, NState} -> - {'EXIT', R} = - (catch terminate(Reason, Name, Msg, Mod, NState, Debug)), - _ = reply(Name, From, Reply, NState, Debug), - exit(R); - Other -> - handle_common_reply(Other, Parent, Name, Msg, Mod, State, - TimeoutState, Queue, Debug) - end; -handle_msg(Msg, - Parent, Name, State, Mod, TimeoutState, Queue, Debug) -> - Reply = (catch dispatch(Msg, Mod, State)), - handle_common_reply(Reply, Parent, Name, Msg, Mod, State, - TimeoutState, Queue, Debug). - -handle_common_reply(Reply, Parent, Name, Msg, Mod, State, - TimeoutState, Queue) -> - case Reply of - {noreply, NState} -> - loop(Parent, Name, NState, Mod, infinity, TimeoutState, Queue, []); - {noreply, NState, Time1} -> - loop(Parent, Name, NState, Mod, Time1, TimeoutState, Queue, []); - {stop, Reason, NState} -> - terminate(Reason, Name, Msg, Mod, NState, []); - {'EXIT', What} -> - terminate(What, Name, Msg, Mod, State, []); - _ -> - terminate({bad_return_value, Reply}, Name, Msg, Mod, State, []) - end. 
- -handle_common_reply(Reply, Parent, Name, Msg, Mod, State, TimeoutState, Queue, - Debug) -> - case Reply of - {noreply, NState} -> - Debug1 = sys:handle_debug(Debug, fun print_event/3, Name, - {noreply, NState}), - loop(Parent, Name, NState, Mod, infinity, TimeoutState, Queue, - Debug1); - {noreply, NState, Time1} -> - Debug1 = sys:handle_debug(Debug, fun print_event/3, Name, - {noreply, NState}), - loop(Parent, Name, NState, Mod, Time1, TimeoutState, Queue, Debug1); - {stop, Reason, NState} -> - terminate(Reason, Name, Msg, Mod, NState, Debug); - {'EXIT', What} -> - terminate(What, Name, Msg, Mod, State, Debug); - _ -> - terminate({bad_return_value, Reply}, Name, Msg, Mod, State, Debug) - end. - -reply(Name, {To, Tag}, Reply, State, Debug) -> - reply({To, Tag}, Reply), - sys:handle_debug(Debug, fun print_event/3, Name, - {out, Reply, To, State} ). - - -%%----------------------------------------------------------------- -%% Callback functions for system messages handling. -%%----------------------------------------------------------------- -system_continue(Parent, Debug, [Name, State, Mod, Time, TimeoutState, Queue]) -> - loop(Parent, Name, State, Mod, Time, TimeoutState, Queue, Debug). - - --spec system_terminate(_, _, _, [_]) -> no_return(). -system_terminate(Reason, _Parent, Debug, [Name, State, Mod, _Time, - _TimeoutState, _Queue]) -> - terminate(Reason, Name, [], Mod, State, Debug). - -system_code_change([Name, State, Mod, Time, TimeoutState, Queue], _Module, - OldVsn, Extra) -> - case catch Mod:code_change(OldVsn, State, Extra) of - {ok, NewState} -> - {ok, [Name, NewState, Mod, Time, TimeoutState, Queue]}; - Else -> - Else - end. - -%%----------------------------------------------------------------- -%% Format debug messages. Print them as the call-back module sees -%% them, not as the real erlang messages. Use trace for that. 
-%%----------------------------------------------------------------- -print_event(Dev, {in, Msg}, Name) -> - case Msg of - {'$gen_call', {From, _Tag}, Call} -> - io:format(Dev, "*DBG* ~p got call ~p from ~w~n", - [Name, Call, From]); - {'$gen_cast', Cast} -> - io:format(Dev, "*DBG* ~p got cast ~p~n", - [Name, Cast]); - _ -> - io:format(Dev, "*DBG* ~p got ~p~n", [Name, Msg]) - end; -print_event(Dev, {out, Msg, To, State}, Name) -> - io:format(Dev, "*DBG* ~p sent ~p to ~w, new state ~w~n", - [Name, Msg, To, State]); -print_event(Dev, {noreply, State}, Name) -> - io:format(Dev, "*DBG* ~p new state ~w~n", [Name, State]); -print_event(Dev, Event, Name) -> - io:format(Dev, "*DBG* ~p dbg ~p~n", [Name, Event]). - - -%%% --------------------------------------------------- -%%% Terminate the server. -%%% --------------------------------------------------- - -terminate(Reason, Name, Msg, Mod, State, Debug) -> - case catch Mod:terminate(Reason, State) of - {'EXIT', R} -> - error_info(R, Name, Msg, State, Debug), - exit(R); - _ -> - case Reason of - normal -> - exit(normal); - shutdown -> - exit(shutdown); - {shutdown,_}=Shutdown -> - exit(Shutdown); - _ -> - error_info(Reason, Name, Msg, State, Debug), - exit(Reason) - end - end. 
- -error_info(_Reason, application_controller, _Msg, _State, _Debug) -> - %% OTP-5811 Don't send an error report if it's the system process - %% application_controller which is terminating - let init take care - %% of it instead - ok; -error_info(Reason, Name, Msg, State, Debug) -> - Reason1 = - case Reason of - {undef,[{M,F,A}|MFAs]} -> - case code:is_loaded(M) of - false -> - {'module could not be loaded',[{M,F,A}|MFAs]}; - _ -> - case erlang:function_exported(M, F, length(A)) of - true -> - Reason; - false -> - {'function not exported',[{M,F,A}|MFAs]} - end - end; - _ -> - Reason - end, - format("** Generic server ~p terminating \n" - "** Last message in was ~p~n" - "** When Server state == ~p~n" - "** Reason for termination == ~n** ~p~n", - [Name, Msg, State, Reason1]), - sys:print_log(Debug), - ok. - -%%% --------------------------------------------------- -%%% Misc. functions. -%%% --------------------------------------------------- - -opt(Op, [{Op, Value}|_]) -> - {ok, Value}; -opt(Op, [_|Options]) -> - opt(Op, Options); -opt(_, []) -> - false. - -debug_options(Name, Opts) -> - case opt(debug, Opts) of - {ok, Options} -> dbg_options(Name, Options); - _ -> dbg_options(Name, []) - end. - -dbg_options(Name, []) -> - Opts = - case init:get_argument(generic_debug) of - error -> - []; - _ -> - [log, statistics] - end, - dbg_opts(Name, Opts); -dbg_options(Name, Opts) -> - dbg_opts(Name, Opts). - -dbg_opts(Name, Opts) -> - case catch sys:debug_options(Opts) of - {'EXIT',_} -> - format("~p: ignoring erroneous debug options - ~p~n", - [Name, Opts]), - []; - Dbg -> - Dbg - end. 
- -get_proc_name(Pid) when is_pid(Pid) -> - Pid; -get_proc_name({local, Name}) -> - case process_info(self(), registered_name) of - {registered_name, Name} -> - Name; - {registered_name, _Name} -> - exit(process_not_registered); - [] -> - exit(process_not_registered) - end; -get_proc_name({global, Name}) -> - case global:whereis_name(Name) of - undefined -> - exit(process_not_registered_globally); - Pid when Pid =:= self() -> - Name; - _Pid -> - exit(process_not_registered_globally) - end. - -get_parent() -> - case get('$ancestors') of - [Parent | _] when is_pid(Parent)-> - Parent; - [Parent | _] when is_atom(Parent)-> - name_to_pid(Parent); - _ -> - exit(process_was_not_started_by_proc_lib) - end. - -name_to_pid(Name) -> - case whereis(Name) of - undefined -> - case global:whereis_name(Name) of - undefined -> - exit(could_not_find_registerd_name); - Pid -> - Pid - end; - Pid -> - Pid - end. - -%%----------------------------------------------------------------- -%% Status information -%%----------------------------------------------------------------- -format_status(Opt, StatusData) -> - [PDict, SysState, Parent, Debug, [Name, State, Mod, _Time, - TimeoutState, Queue]] = - StatusData, - NameTag = if is_pid(Name) -> - pid_to_list(Name); - is_atom(Name) -> - Name - end, - Header = lists:concat(["Status for generic server ", NameTag]), - Log = sys:get_log(Debug), - Specfic = - case erlang:function_exported(Mod, format_status, 2) of - true -> - case catch Mod:format_status(Opt, [PDict, State]) of - {'EXIT', _} -> [{data, [{"State", State}]}]; - Else -> Else - end; - _ -> - [{data, [{"State", State}]}] - end, - Specfic1 = case TimeoutState of - undefined -> Specfic; - {Current, Min, undefined} -> - [ {"Binary Timeout Current and Min", {Current, Min}} - | Specfic] - end, - [{header, Header}, - {data, [{"Status", SysState}, - {"Parent", Parent}, - {"Logged events", Log}, - {"Queued messages", riak_core_priority_queue:to_list(Queue)}]} | - Specfic1]. 
diff --git a/src/riak_core_gossip.erl b/src/riak_core_gossip.erl index 82e32c46a..ade325237 100644 --- a/src/riak_core_gossip.erl +++ b/src/riak_core_gossip.erl @@ -34,13 +34,14 @@ -behaviour(gen_server). -export([start_link/0, stop/0]). --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). --export ([distribute_ring/1, send_ring/1, send_ring/2, remove_from_cluster/2, - remove_from_cluster/3, random_gossip/1, - recursive_gossip/1, random_recursive_gossip/1, rejoin/2 -]). +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). + +-export([distribute_ring/1, send_ring/1, send_ring/2, + remove_from_cluster/2, remove_from_cluster/3, + random_gossip/1, recursive_gossip/1, + random_recursive_gossip/1, rejoin/2]). %% Default gossip rate: allow at most 45 gossip messages every 10 seconds -define(DEFAULT_LIMIT, {45, 10000}). @@ -54,7 +55,8 @@ %% distribute_ring/1 - %% Distribute a ring to all members of that ring. distribute_ring(Ring) -> - gen_server:cast({?MODULE, node()}, {distribute_ring, Ring}). + gen_server:cast({?MODULE, node()}, + {distribute_ring, Ring}). %% send_ring/1 - %% Send the current node's ring to some other node. @@ -63,16 +65,16 @@ send_ring(ToNode) -> send_ring(node(), ToNode). %% send_ring/2 - %% Send the ring from one node to another node. %% Does nothing if the two nodes are the same. -send_ring(Node, Node) -> - ok; +send_ring(Node, Node) -> ok; send_ring(FromNode, ToNode) -> - gen_server:cast({?MODULE, FromNode}, {send_ring_to, ToNode}). + gen_server:cast({?MODULE, FromNode}, + {send_ring_to, ToNode}). start_link() -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + gen_server:start_link({local, ?MODULE}, ?MODULE, [], + []). -stop() -> - gen_server:cast(?MODULE, stop). +stop() -> gen_server:cast(?MODULE, stop). rejoin(Node, Ring) -> gen_server:cast({?MODULE, Node}, {rejoin, Ring}). 
@@ -80,10 +82,9 @@ rejoin(Node, Ring) -> %% @doc Gossip state to a random node in the ring. random_gossip(Ring) -> case riak_core_ring:random_other_active_node(Ring) of - no_node -> % must be single node cluster - ok; - RandomNode -> - send_ring(node(), RandomNode) + no_node -> % must be single node cluster + ok; + RandomNode -> send_ring(node(), RandomNode) end. %% @doc Gossip state to a fixed set of nodes determined from a binary @@ -98,22 +99,22 @@ recursive_gossip(Ring, Node) -> Nodes = riak_core_ring:active_members(Ring), Tree = riak_core_util:build_tree(2, Nodes, [cycles]), Children = orddict:fetch(Node, Tree), - _ = [send_ring(node(), OtherNode) || OtherNode <- Children], + _ = [send_ring(node(), OtherNode) + || OtherNode <- Children], ok. + recursive_gossip(Ring) -> %% A non-active member will not show-up in the tree decomposition %% and therefore we fallback to random_recursive_gossip as necessary. Active = riak_core_ring:active_members(Ring), case lists:member(node(), Active) of - true -> - recursive_gossip(Ring, node()); - false -> - random_recursive_gossip(Ring) + true -> recursive_gossip(Ring, node()); + false -> random_recursive_gossip(Ring) end. random_recursive_gossip(Ring) -> Active = riak_core_ring:active_members(Ring), - RNode = lists:nth(riak_core_rand:uniform(length(Active)), Active), + RNode = lists:nth(rand:uniform(length(Active)), Active), recursive_gossip(Ring, RNode). %% =================================================================== @@ -123,253 +124,268 @@ random_recursive_gossip(Ring) -> %% @private init(_State) -> schedule_next_reset(), - {Tokens, _} = application:get_env(riak_core, gossip_limit, ?DEFAULT_LIMIT), + {Tokens, _} = application:get_env(riak_core, + gossip_limit, ?DEFAULT_LIMIT), State = #state{gossip_tokens = Tokens}, {ok, State}. -handle_call(_, _From, State) -> - {reply, ok, State}. +handle_call(_, _From, State) -> {reply, ok, State}. 
%% @private -handle_cast({send_ring_to, _Node}, State=#state{gossip_tokens=0}) -> +handle_cast({send_ring_to, _Node}, + State = #state{gossip_tokens = 0}) -> %% Out of gossip tokens, ignore the send request {noreply, State}; handle_cast({send_ring_to, Node}, State) -> {ok, RingOut} = riak_core_ring_manager:get_raw_ring(), riak_core_ring:check_tainted(RingOut, - "Error: riak_core_gossip/send_ring_to :: " - "Sending tainted ring over gossip"), - gen_server:cast({?MODULE, Node}, {reconcile_ring, RingOut}), + "Error: riak_core_gossip/send_ring_to " + ":: Sending tainted ring over gossip"), + gen_server:cast({?MODULE, Node}, + {reconcile_ring, RingOut}), Tokens = State#state.gossip_tokens - 1, - {noreply, State#state{gossip_tokens=Tokens}}; - + {noreply, State#state{gossip_tokens = Tokens}}; handle_cast({distribute_ring, Ring}, State) -> Nodes = riak_core_ring:active_members(Ring), riak_core_ring:check_tainted(Ring, - "Error: riak_core_gossip/distribute_ring :: " - "Sending tainted ring over gossip"), - gen_server:abcast(Nodes, ?MODULE, {reconcile_ring, Ring}), + "Error: riak_core_gossip/distribute_ring " + ":: Sending tainted ring over gossip"), + gen_server:abcast(Nodes, ?MODULE, + {reconcile_ring, Ring}), {noreply, State}; - handle_cast({reconcile_ring, OtherRing}, State) -> %% Compare the two rings, see if there is anything that %% must be done to make them equal... %% STATS % riak_core_stat:update(gossip_received), - riak_core_ring_manager:ring_trans(fun reconcile/2, [OtherRing]), + riak_core_ring_manager:ring_trans(fun reconcile/2, + [OtherRing]), {noreply, State}; - handle_cast(gossip_ring, State) -> % Gossip the ring to some random other node... 
{ok, MyRing} = riak_core_ring_manager:get_raw_ring(), - random_gossip(MyRing), {noreply, State}; - handle_cast({rejoin, OtherRing}, State) -> {ok, Ring} = riak_core_ring_manager:get_raw_ring(), - SameCluster = - (riak_core_ring:cluster_name(Ring) =:= - riak_core_ring:cluster_name(OtherRing)), + SameCluster = riak_core_ring:cluster_name(Ring) =:= + riak_core_ring:cluster_name(OtherRing), case SameCluster of - true -> - OtherNode = riak_core_ring:owner_node(OtherRing), - case riak_core:join(node(), OtherNode, true, true) of - ok -> ok; - {error, Reason} -> - logger:error("Could not rejoin cluster: ~p", [Reason]), - ok - end, - {noreply, State}; - false -> - {noreply, State} + true -> + OtherNode = riak_core_ring:owner_node(OtherRing), + case riak_core:join(node(), OtherNode, true, true) of + ok -> ok; + {error, Reason} -> + logger:error("Could not rejoin cluster: ~p", [Reason]), + ok + end, + {noreply, State}; + false -> {noreply, State} end; - -handle_cast(_, State) -> - {noreply, State}. +handle_cast(_, State) -> {noreply, State}. handle_info(reset_tokens, State) -> schedule_next_reset(), gen_server:cast(?MODULE, gossip_ring), - {Tokens, _} = application:get_env(riak_core, gossip_limit, ?DEFAULT_LIMIT), - {noreply, State#state{gossip_tokens=Tokens}}; - + {Tokens, _} = application:get_env(riak_core, + gossip_limit, ?DEFAULT_LIMIT), + {noreply, State#state{gossip_tokens = Tokens}}; handle_info(_Info, State) -> {noreply, State}. %% @private -terminate(_Reason, _State) -> - ok. +terminate(_Reason, _State) -> ok. %% @private -code_change(_OldVsn, State, _Extra) -> - {ok, State}. - +code_change(_OldVsn, State, _Extra) -> {ok, State}. 
%% =================================================================== %% Internal functions %% =================================================================== schedule_next_reset() -> - {_, Reset} = application:get_env(riak_core, gossip_limit, ?DEFAULT_LIMIT), + {_, Reset} = application:get_env(riak_core, + gossip_limit, ?DEFAULT_LIMIT), erlang:send_after(Reset, ?MODULE, reset_tokens). %%noinspection ErlangUnboundVariable reconcile(Ring0, [OtherRing0]) -> - {Ring, OtherRing} = riak_core_ring:reconcile_names(Ring0, OtherRing0), + {Ring, OtherRing} = + riak_core_ring:reconcile_names(Ring0, OtherRing0), Node = node(), OtherNode = riak_core_ring:owner_node(OtherRing), - Members = riak_core_ring:reconcile_members(Ring, OtherRing), - WrongCluster = (riak_core_ring:cluster_name(Ring) /= - riak_core_ring:cluster_name(OtherRing)), - PreStatus = riak_core_ring:member_status(Members, OtherNode), - IgnoreGossip = (WrongCluster or - (PreStatus =:= invalid) or - (PreStatus =:= down)), - {Changed, Ring2} = - case IgnoreGossip of - true -> - {false, Ring}; - false -> - riak_core_ring:reconcile(OtherRing, Ring) - end, - OtherStatus = riak_core_ring:member_status(Ring2, OtherNode), + Members = riak_core_ring:reconcile_members(Ring, + OtherRing), + WrongCluster = riak_core_ring:cluster_name(Ring) /= + riak_core_ring:cluster_name(OtherRing), + PreStatus = riak_core_ring:member_status(Members, + OtherNode), + IgnoreGossip = WrongCluster or (PreStatus =:= invalid) + or (PreStatus =:= down), + case IgnoreGossip of + true -> Ring2 = Ring, Changed = false; + false -> + {Changed, Ring2} = riak_core_ring:reconcile(OtherRing, + Ring) + end, + OtherStatus = riak_core_ring:member_status(Ring2, + OtherNode), case {WrongCluster, OtherStatus, Changed} of - {true, _OS, _C} -> - %% TODO: Tell other node to stop gossiping to this node. - %% STATS - % riak_core_stat:update(ignored_gossip), - ignore; - {false, down, _C} -> - %% Tell other node to rejoin the cluster. 
- riak_core_gossip:rejoin(OtherNode, Ring2), - ignore; - {false, invalid, _C} -> - %% Exiting/Removed node never saw shutdown cast, re-send. - ClusterName = riak_core_ring:cluster_name(Ring), - riak_core_ring_manager:refresh_ring(OtherNode, ClusterName), - ignore; - {false, _OS, new_ring} -> - Ring3 = riak_core_ring:ring_changed(Node, Ring2), - %% STATS - % riak_core_stat:update(rings_reconciled), - log_membership_changes(Ring, Ring3), - {reconciled_ring, Ring3}; - {false, _OS, _C} -> - ignore + {true, _, _} -> + %% TODO: Tell other node to stop gossiping to this node. + %% STATS + % riak_core_stat:update(ignored_gossip), + ignore; + {_, down, _} -> + %% Tell other node to rejoin the cluster. + riak_core_gossip:rejoin(OtherNode, Ring2), + ignore; + {_, invalid, _} -> + %% Exiting/Removed node never saw shutdown cast, re-send. + ClusterName = riak_core_ring:cluster_name(Ring), + riak_core_ring_manager:refresh_ring(OtherNode, + ClusterName), + ignore; + {_, _, new_ring} -> + Ring3 = riak_core_ring:ring_changed(Node, Ring2), + %% STATS + % riak_core_stat:update(rings_reconciled), + log_membership_changes(Ring, Ring3), + {reconciled_ring, Ring3}; + {_, _, _} -> ignore end. log_membership_changes(OldRing, NewRing) -> OldStatus = riak_core_ring:all_member_status(OldRing), NewStatus = riak_core_ring:all_member_status(NewRing), + do_log_membership_changes(lists:sort(OldStatus), + lists:sort(NewStatus)). - do_log_membership_changes(lists:sort(OldStatus), lists:sort(NewStatus)). 
- -do_log_membership_changes([], []) -> - ok; -do_log_membership_changes([{Node, Status}|Old], [{Node, Status}|New]) -> +do_log_membership_changes([], []) -> ok; +do_log_membership_changes([{Node, Status} | Old], + [{Node, Status} | New]) -> %% No change do_log_membership_changes(Old, New); -do_log_membership_changes([{Node, Status1}|Old], [{Node, Status2}|New]) -> +do_log_membership_changes([{Node, Status1} | Old], + [{Node, Status2} | New]) -> %% State changed, did not join or leave log_node_changed(Node, Status1, Status2), do_log_membership_changes(Old, New); -do_log_membership_changes([{OldNode, _OldStatus}|_]=Old, [{NewNode, NewStatus}|New]) when NewNode < OldNode-> +do_log_membership_changes([{OldNode, _OldStatus} | _] = + Old, + [{NewNode, NewStatus} | New]) + when NewNode < OldNode -> %% Node added log_node_added(NewNode, NewStatus), do_log_membership_changes(Old, New); -do_log_membership_changes([{OldNode, OldStatus}|Old], [{NewNode, _NewStatus}|_]=New) when OldNode < NewNode -> +do_log_membership_changes([{OldNode, OldStatus} | Old], + [{NewNode, _NewStatus} | _] = New) + when OldNode < NewNode -> %% Node removed log_node_removed(OldNode, OldStatus), do_log_membership_changes(Old, New); -do_log_membership_changes([{OldNode, OldStatus}|Old], []) -> +do_log_membership_changes([{OldNode, OldStatus} | Old], + []) -> %% Trailing nodes were removed log_node_removed(OldNode, OldStatus), do_log_membership_changes(Old, []); -do_log_membership_changes([], [{NewNode, NewStatus}|New]) -> +do_log_membership_changes([], + [{NewNode, NewStatus} | New]) -> %% Trailing nodes were added log_node_added(NewNode, NewStatus), do_log_membership_changes([], New). log_node_changed(Node, Old, New) -> - logger:info("'~s' changed from '~s' to '~s'~n", [Node, Old, New]). + logger:info("'~s' changed from '~s' to '~s'~n", + [Node, Old, New]). log_node_added(Node, New) -> - logger:info("'~s' joined cluster with status '~s'~n", [Node, New]). 
+ logger:info("'~s' joined cluster with status '~s'~n", + [Node, New]). log_node_removed(Node, Old) -> - logger:info("'~s' removed from cluster (previously: '~s')~n", [Node, Old]). + logger:info("'~s' removed from cluster (previously: " + "'~s')~n", + [Node, Old]). remove_from_cluster(Ring, ExitingNode) -> - remove_from_cluster(Ring, ExitingNode, riak_core_rand:rand_seed()). + remove_from_cluster(Ring, ExitingNode, + erlang:timestamp()). remove_from_cluster(Ring, ExitingNode, Seed) -> % Get a list of indices owned by the ExitingNode... AllOwners = riak_core_ring:all_owners(Ring), - % Transfer indexes to other nodes... - ExitRing = - case attempt_simple_transfer(Seed, Ring, AllOwners, ExitingNode) of - {ok, NR} -> - NR; - target_n_fail -> - %% re-diagonalize - %% first hand off all claims to *any* one else, - %% just so rebalance doesn't include exiting node - Members = riak_core_ring:claiming_members(Ring), - Other = hd(lists:delete(ExitingNode, Members)), - TempRing = lists:foldl( - fun({I,N}, R) when N == ExitingNode -> - riak_core_ring:transfer_node(I, Other, R); - (_, R) -> R - end, - Ring, - AllOwners), - riak_core_claim:claim_rebalance_n(TempRing, Other) - end, + ExitRing = case attempt_simple_transfer(Seed, Ring, + AllOwners, ExitingNode) + of + {ok, NR} -> NR; + target_n_fail -> + %% re-diagonalize + %% first hand off all claims to *any* one else, + %% just so rebalance doesn't include exiting node + Members = riak_core_ring:claiming_members(Ring), + Other = hd(lists:delete(ExitingNode, Members)), + TempRing = lists:foldl(fun ({I, N}, R) + when N == ExitingNode -> + riak_core_ring:transfer_node(I, + Other, + R); + (_, R) -> R + end, + Ring, AllOwners), + riak_core_claim:claim_rebalance_n(TempRing, Other) + end, ExitRing. 
-attempt_simple_transfer(Seed, Ring, Owners, ExitingNode) -> - TargetN = application:get_env(riak_core, target_n_val, undefined), - attempt_simple_transfer(Seed, Ring, Owners, - TargetN, +attempt_simple_transfer(Seed, Ring, Owners, + ExitingNode) -> + TargetN = application:get_env(riak_core, target_n_val, + undefined), + attempt_simple_transfer(Seed, Ring, Owners, TargetN, ExitingNode, 0, - [{O,-TargetN} || O <- riak_core_ring:claiming_members(Ring), - O /= ExitingNode]). -attempt_simple_transfer(Seed, Ring, [{P, Exit}|Rest], TargetN, Exit, Idx, Last) -> + [{O, -TargetN} + || O <- riak_core_ring:claiming_members(Ring), + O /= ExitingNode]). + +attempt_simple_transfer(Seed, Ring, [{P, Exit} | Rest], + TargetN, Exit, Idx, Last) -> %% handoff - case [ N || {N, I} <- Last, Idx-I >= TargetN ] of - [] -> - target_n_fail; - Candidates -> - %% these nodes don't violate target_n in the reverse direction - StepsToNext = fun(Node) -> - length(lists:takewhile( - fun({_, Owner}) -> Node /= Owner end, - Rest)) - end, - case lists:filter(fun(N) -> - Next = StepsToNext(N), - (Next+1 >= TargetN) - orelse (Next == length(Rest)) - end, - Candidates) of - [] -> - target_n_fail; - Qualifiers -> - %% these nodes don't violate target_n forward - {Rand, Seed2} = riak_core_rand:uniform_s(length(Qualifiers), Seed), - Chosen = lists:nth(Rand, Qualifiers), - %% choose one, and do the rest of the ring - attempt_simple_transfer( - Seed2, - riak_core_ring:transfer_node(P, Chosen, Ring), - Rest, TargetN, Exit, Idx+1, - lists:keyreplace(Chosen, 1, Last, {Chosen, Idx})) - end + case [N || {N, I} <- Last, Idx - I >= TargetN] of + [] -> target_n_fail; + Candidates -> + %% these nodes don't violate target_n in the reverse direction + StepsToNext = fun (Node) -> + length(lists:takewhile(fun ({_, Owner}) -> + Node /= Owner + end, + Rest)) + end, + case lists:filter(fun (N) -> + Next = StepsToNext(N), + Next + 1 >= TargetN orelse + Next == length(Rest) + end, + Candidates) + of + [] -> target_n_fail; + 
Qualifiers -> + %% these nodes don't violate target_n forward + {Rand, Seed2} = rand:uniform_s(length(Qualifiers), + Seed), + Chosen = lists:nth(Rand, Qualifiers), + %% choose one, and do the rest of the ring + attempt_simple_transfer(Seed2, + riak_core_ring:transfer_node(P, Chosen, + Ring), + Rest, TargetN, Exit, Idx + 1, + lists:keyreplace(Chosen, 1, Last, + {Chosen, Idx})) + end end; -attempt_simple_transfer(Seed, Ring, [{_, N}|Rest], TargetN, Exit, Idx, Last) -> +attempt_simple_transfer(Seed, Ring, [{_, N} | Rest], + TargetN, Exit, Idx, Last) -> %% just keep track of seeing this node - attempt_simple_transfer(Seed, Ring, Rest, TargetN, Exit, Idx+1, - lists:keyreplace(N, 1, Last, {N, Idx})); + attempt_simple_transfer(Seed, Ring, Rest, TargetN, Exit, + Idx + 1, lists:keyreplace(N, 1, Last, {N, Idx})); attempt_simple_transfer(_, Ring, [], _, _, _, _) -> {ok, Ring}. diff --git a/src/riak_core_handoff_listener.erl b/src/riak_core_handoff_listener.erl index 18daf4887..fb98c2eea 100644 --- a/src/riak_core_handoff_listener.erl +++ b/src/riak_core_handoff_listener.erl @@ -23,34 +23,43 @@ %% @doc entry point for TCP-based handoff -module(riak_core_handoff_listener). + -behavior(gen_nb_server). + -export([start_link/0]). --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). --export([get_handoff_ip/0, sock_opts/0, new_connection/2]). --record(state, { - ipaddr :: string(), - portnum :: integer() - }). + +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). + +-export([get_handoff_ip/0, sock_opts/0, + new_connection/2]). + +-record(state, + {ipaddr :: string(), portnum :: integer()}). start_link() -> - PortNum = application:get_env(riak_core, handoff_port, undefined), - IpAddr = application:get_env(riak_core, handoff_ip, undefined), - gen_nb_server:start_link(?MODULE, IpAddr, PortNum, [IpAddr, PortNum]). 
+ PortNum = application:get_env(riak_core, handoff_port, + undefined), + IpAddr = application:get_env(riak_core, handoff_ip, + undefined), + gen_nb_server:start_link(?MODULE, IpAddr, PortNum, + [IpAddr, PortNum]). get_handoff_ip() -> - riak_core_gen_server:call(?MODULE, handoff_ip, infinity). + gen_server:call(?MODULE, handoff_ip, infinity). init([IpAddr, PortNum]) -> register(?MODULE, self()), - {ok, #state{portnum=PortNum, ipaddr=IpAddr}}. + {ok, #state{portnum = PortNum, ipaddr = IpAddr}}. -sock_opts() -> [binary, {packet, 4}, {reuseaddr, true}, {backlog, 64}]. +sock_opts() -> + [binary, {packet, 4}, {reuseaddr, true}, {backlog, 64}]. -handle_call(handoff_ip, _From, State=#state{ipaddr=I}) -> +handle_call(handoff_ip, _From, + State = #state{ipaddr = I}) -> {reply, {ok, I}, State}; - -handle_call(handoff_port, _From, State=#state{portnum=P}) -> +handle_call(handoff_port, _From, + State = #state{portnum = P}) -> {reply, {ok, P}, State}. handle_cast(_Msg, State) -> {noreply, State}. @@ -63,14 +72,13 @@ code_change(_OldVsn, State, _Extra) -> {ok, State}. new_connection(Socket, State) -> case riak_core_handoff_manager:add_inbound() of - {ok, Pid} -> - ok = gen_tcp:controlling_process(Socket, Pid), - ok = riak_core_handoff_receiver:set_socket(Pid, Socket), - {ok, State}; - {error, _Reason} -> + {ok, Pid} -> + ok = gen_tcp:controlling_process(Socket, Pid), + ok = riak_core_handoff_receiver:set_socket(Pid, Socket), + {ok, State}; + {error, _Reason} -> %% STATS -%% riak_core_stat:update(rejected_handoffs), - gen_tcp:close(Socket), - {ok, State} + %% riak_core_stat:update(rejected_handoffs), + gen_tcp:close(Socket), + {ok, State} end. 
- diff --git a/src/riak_core_handoff_listener_sup.erl b/src/riak_core_handoff_listener_sup.erl index 0c800bac9..ed7f87126 100644 --- a/src/riak_core_handoff_listener_sup.erl +++ b/src/riak_core_handoff_listener_sup.erl @@ -19,21 +19,22 @@ %% ------------------------------------------------------------------- -module(riak_core_handoff_listener_sup). + -behaviour(supervisor). %% beahvior functions --export([start_link/0, - init/1 - ]). +-export([start_link/0, init/1]). --define(CHILD(I,Type), {I,{I,start_link,[]},permanent,brutal_kill,Type,[I]}). +-define(CHILD(I, Type), + {I, {I, start_link, []}, permanent, brutal_kill, Type, + [I]}). %% begins the supervisor, init/1 will be called -start_link () -> - supervisor:start_link({local,?MODULE},?MODULE,[]). +start_link() -> + supervisor:start_link({local, ?MODULE}, ?MODULE, []). %% @private -init ([]) -> - {ok,{{one_for_one,10,10}, - [?CHILD(riak_core_handoff_listener,worker) - ]}}. +init([]) -> + {ok, + {{one_for_one, 10, 10}, + [?CHILD(riak_core_handoff_listener, worker)]}}. diff --git a/src/riak_core_handoff_manager.erl b/src/riak_core_handoff_manager.erl index 8103546f6..e023a06f7 100644 --- a/src/riak_core_handoff_manager.erl +++ b/src/riak_core_handoff_manager.erl @@ -14,109 +14,108 @@ %% Copyright (c) 2007-2012 Basho Technologies, Inc. All Rights Reserved. -module(riak_core_handoff_manager). + -behaviour(gen_server). %% gen_server api --export([start_link/0, - init/1, - handle_call/3, - handle_cast/2, - handle_info/2, - terminate/2, - code_change/3 - ]). +-export([start_link/0, init/1, handle_call/3, + handle_cast/2, handle_info/2, terminate/2, + code_change/3]). %% exclusion api --export([add_exclusion/2, - get_exclusions/1, - remove_exclusion/2 - ]). +-export([add_exclusion/2, get_exclusions/1, + remove_exclusion/2]). 
%% handoff api --export([add_outbound/6, - add_outbound/7, - add_inbound/0, - xfer/3, - kill_xfer/3, - status/0, - status/1, - status_update/2, - set_concurrency/1, - get_concurrency/0, - set_recv_data/2, - kill_handoffs/0, +-export([add_outbound/6, add_outbound/7, add_inbound/0, + xfer/3, kill_xfer/3, status/0, status/1, + status_update/2, set_concurrency/1, get_concurrency/0, + set_recv_data/2, kill_handoffs/0, kill_handoffs_in_direction/1, - handoff_change_enabled_setting/2 - ]). + handoff_change_enabled_setting/2]). -include("riak_core_handoff.hrl"). -export_type([ho_type/0]). -ifdef(TEST). + -include_lib("eunit/include/eunit.hrl"). + -endif. -record(state, - { excl, - handoffs=[] :: [handoff_status()] - }). + {excl, handoffs = [] :: [handoff_status()]}). %% this can be overridden with riak_core handoff_concurrency --define(HANDOFF_CONCURRENCY,2). +-define(HANDOFF_CONCURRENCY, 2). + -define(HO_EQ(HOA, HOB), - HOA#handoff_status.mod_src_tgt == HOB#handoff_status.mod_src_tgt - andalso HOA#handoff_status.timestamp == HOB#handoff_status.timestamp). + HOA#handoff_status.mod_src_tgt == + HOB#handoff_status.mod_src_tgt + andalso + HOA#handoff_status.timestamp == + HOB#handoff_status.timestamp). %%%=================================================================== %%% API %%%=================================================================== start_link() -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + gen_server:start_link({local, ?MODULE}, ?MODULE, [], + []). init([]) -> - {ok, #state{excl=sets:new(), handoffs=[]}}. - -add_outbound(HOType,Module,Idx,Node,VnodePid,Opts) -> - add_outbound(HOType,Module,Idx,Idx,Node,VnodePid,Opts). 
- -add_outbound(HOType,Module,SrcIdx,TargetIdx,Node,VnodePid,Opts) -> - case application:get_env(riak_core, disable_outbound_handoff) of - {ok, true} -> - {error, max_concurrency}; - _ -> - gen_server:call(?MODULE, - {add_outbound,HOType,Module,SrcIdx,TargetIdx,Node,VnodePid,Opts}, - infinity) + {ok, #state{excl = sets:new(), handoffs = []}}. + +add_outbound(HOType, Module, Idx, Node, VnodePid, + Opts) -> + add_outbound(HOType, Module, Idx, Idx, Node, VnodePid, + Opts). + +add_outbound(HOType, Module, SrcIdx, TargetIdx, Node, + VnodePid, Opts) -> + case application:get_env(riak_core, + disable_outbound_handoff) + of + {ok, true} -> {error, max_concurrency}; + _ -> + gen_server:call(?MODULE, + {add_outbound, HOType, Module, SrcIdx, TargetIdx, + Node, VnodePid, Opts}, + infinity) end. add_inbound() -> - case application:get_env(riak_core, disable_inbound_handoff) of - {ok, true} -> - {error, max_concurrency}; - _ -> - gen_server:call(?MODULE,{add_inbound},infinity) + case application:get_env(riak_core, + disable_inbound_handoff) + of + {ok, true} -> {error, max_concurrency}; + _ -> gen_server:call(?MODULE, {add_inbound}, infinity) end. %% @doc Initiate a transfer from `SrcPartition' to `TargetPartition' %% for the given `Module' using the `FilterModFun' filter. --spec xfer({index(), node()}, mod_partition(), {module(), atom()}) -> ok. -xfer({SrcPartition, SrcOwner}, {Module, TargetPartition}, FilterModFun) -> +-spec xfer({index(), node()}, mod_partition(), + {module(), atom()}) -> ok. + +xfer({SrcPartition, SrcOwner}, + {Module, TargetPartition}, FilterModFun) -> %% NOTE: This will not work with old nodes ReqOrigin = node(), gen_server:cast({?MODULE, SrcOwner}, {send_handoff, repair, Module, - {SrcPartition, TargetPartition}, - ReqOrigin, FilterModFun}). + {SrcPartition, TargetPartition}, ReqOrigin, + FilterModFun}). %% @doc Associate `Data' with the inbound handoff `Recv'. -spec set_recv_data(pid(), proplists:proplist()) -> ok. 
+ set_recv_data(Recv, Data) -> - gen_server:call(?MODULE, {set_recv_data, Recv, Data}, infinity). + gen_server:call(?MODULE, {set_recv_data, Recv, Data}, + infinity). -status() -> - status(none). +status() -> status(none). status(Filter) -> gen_server:call(?MODULE, {status, Filter}, infinity). @@ -124,349 +123,379 @@ status(Filter) -> %% @doc Send status updates `Stats' to the handoff manager for a %% particular handoff identified by `ModSrcTgt'. -spec status_update(mod_src_tgt(), ho_stats()) -> ok. + status_update(ModSrcTgt, Stats) -> - gen_server:cast(?MODULE, {status_update, ModSrcTgt, Stats}). + gen_server:cast(?MODULE, + {status_update, ModSrcTgt, Stats}). set_concurrency(Limit) -> - gen_server:call(?MODULE,{set_concurrency,Limit}, infinity). + gen_server:call(?MODULE, {set_concurrency, Limit}, + infinity). get_concurrency() -> gen_server:call(?MODULE, get_concurrency, infinity). %% @doc Kill the transfer of `ModSrcTarget' with `Reason'. -spec kill_xfer(node(), tuple(), any()) -> ok. + kill_xfer(SrcNode, ModSrcTarget, Reason) -> - gen_server:cast({?MODULE, SrcNode}, {kill_xfer, ModSrcTarget, Reason}). + gen_server:cast({?MODULE, SrcNode}, + {kill_xfer, ModSrcTarget, Reason}). + +kill_handoffs() -> set_concurrency(0). -kill_handoffs() -> - set_concurrency(0). +-spec kill_handoffs_in_direction(inbound | + outbound) -> ok. --spec kill_handoffs_in_direction(inbound | outbound) -> ok. kill_handoffs_in_direction(Direction) -> - gen_server:call(?MODULE, {kill_in_direction, Direction}, infinity). + gen_server:call(?MODULE, {kill_in_direction, Direction}, + infinity). add_exclusion(Module, Index) -> - gen_server:cast(?MODULE, {add_exclusion, {Module, Index}}). + gen_server:cast(?MODULE, + {add_exclusion, {Module, Index}}). remove_exclusion(Module, Index) -> - gen_server:cast(?MODULE, {del_exclusion, {Module, Index}}). + gen_server:cast(?MODULE, + {del_exclusion, {Module, Index}}). get_exclusions(Module) -> - gen_server:call(?MODULE, {get_exclusions, Module}, infinity). 
- + gen_server:call(?MODULE, {get_exclusions, Module}, + infinity). %%%=================================================================== %%% Callbacks %%%=================================================================== -handle_call({get_exclusions, Module}, _From, State=#state{excl=Excl}) -> - Reply = [I || {M, I} <- sets:to_list(Excl), M =:= Module], +handle_call({get_exclusions, Module}, _From, + State = #state{excl = Excl}) -> + Reply = [I + || {M, I} <- sets:to_list(Excl), M =:= Module], {reply, {ok, Reply}, State}; -handle_call({add_outbound,Type,Mod,SrcIdx,TargetIdx,Node,Pid,Opts},_From, - State=#state{handoffs=HS}) -> - case send_handoff(Type,{Mod,SrcIdx,TargetIdx},Node,Pid,HS,Opts) of - {ok,Handoff=#handoff_status{transport_pid=Sender}} -> - HS2 = HS ++ [Handoff], - {reply, {ok,Sender}, State#state{handoffs=HS2}}; - {false,_ExistingHandoff=#handoff_status{transport_pid=Sender}} -> - {reply, {ok,Sender}, State}; - Error -> - {reply, Error, State} +handle_call({add_outbound, Type, Mod, SrcIdx, TargetIdx, + Node, Pid, Opts}, + _From, State = #state{handoffs = HS}) -> + case send_handoff(Type, {Mod, SrcIdx, TargetIdx}, Node, + Pid, HS, Opts) + of + {ok, + Handoff = #handoff_status{transport_pid = Sender}} -> + HS2 = HS ++ [Handoff], + {reply, {ok, Sender}, State#state{handoffs = HS2}}; + {false, + _ExistingHandoff = #handoff_status{transport_pid = + Sender}} -> + {reply, {ok, Sender}, State}; + Error -> {reply, Error, State} end; -handle_call({add_inbound},_From,State=#state{handoffs=HS}) -> +handle_call({add_inbound}, _From, + State = #state{handoffs = HS}) -> case receive_handoff() of - {ok,Handoff=#handoff_status{transport_pid=Receiver}} -> - HS2 = HS ++ [Handoff], - {reply, {ok,Receiver}, State#state{handoffs=HS2}}; - Error -> - {reply, Error, State} + {ok, + Handoff = #handoff_status{transport_pid = Receiver}} -> + HS2 = HS ++ [Handoff], + {reply, {ok, Receiver}, State#state{handoffs = HS2}}; + Error -> {reply, Error, State} end; - 
-handle_call({set_recv_data, Recv, Data}, _From, State=#state{handoffs=HS}) -> - case lists:keyfind(Recv, #handoff_status.transport_pid, HS) of - false -> - throw({error, "set_recv_data called for non-existing receiver", - Recv, Data}); - #handoff_status{}=H -> - H2 = H#handoff_status{ - mod_src_tgt=proplists:get_value(mod_src_tgt, Data), - vnode_pid=proplists:get_value(vnode_pid, Data) - }, - HS2 = lists:keyreplace(Recv, #handoff_status.transport_pid, HS, H2), - {reply, ok, State#state{handoffs=HS2}} +handle_call({set_recv_data, Recv, Data}, _From, + State = #state{handoffs = HS}) -> + case lists:keyfind(Recv, #handoff_status.transport_pid, + HS) + of + false -> + throw({error, + "set_recv_data called for non-existing " + "receiver", + Recv, Data}); + #handoff_status{} = H -> + H2 = H#handoff_status{mod_src_tgt = + proplists:get_value(mod_src_tgt, Data), + vnode_pid = + proplists:get_value(vnode_pid, Data)}, + HS2 = lists:keyreplace(Recv, + #handoff_status.transport_pid, HS, H2), + {reply, ok, State#state{handoffs = HS2}} end; - -handle_call({xfer_status, Xfer}, _From, State=#state{handoffs=HS}) -> +handle_call({xfer_status, Xfer}, _From, + State = #state{handoffs = HS}) -> TP = Xfer#handoff_status.transport_pid, - case lists:keyfind(TP, #handoff_status.transport_pid, HS) of - false -> {reply, not_found, State}; - _ -> {reply, in_progress, State} + case lists:keyfind(TP, #handoff_status.transport_pid, + HS) + of + false -> {reply, not_found, State}; + _ -> {reply, in_progress, State} end; - -handle_call({status, Filter}, _From, State=#state{handoffs=HS}) -> - Status = lists:filter(filter(Filter), [build_status(HO) || HO <- HS]), +handle_call({status, Filter}, _From, + State = #state{handoffs = HS}) -> + Status = lists:filter(filter(Filter), + [build_status(HO) || HO <- HS]), {reply, Status, State}; - -handle_call({set_concurrency,Limit},_From,State=#state{handoffs=HS}) -> - application:set_env(riak_core,handoff_concurrency,Limit), +handle_call({set_concurrency, 
Limit}, _From, + State = #state{handoffs = HS}) -> + application:set_env(riak_core, handoff_concurrency, + Limit), case Limit < erlang:length(HS) of - true -> - %% Note: we don't update the state with the handoffs that we're - %% keeping because we'll still get the 'DOWN' messages with - %% a reason of 'max_concurrency' and we want to be able to do - %% something with that if necessary. - {_Keep,Discard}=lists:split(Limit,HS), - _ = [erlang:exit(Pid,max_concurrency) || - #handoff_status{transport_pid=Pid} <- Discard], - {reply, ok, State}; - false -> - {reply, ok, State} + true -> + %% Note: we don't update the state with the handoffs that we're + %% keeping because we'll still get the 'DOWN' messages with + %% a reason of 'max_concurrency' and we want to be able to do + %% something with that if necessary. + {_Keep, Discard} = lists:split(Limit, HS), + _ = [erlang:exit(Pid, max_concurrency) + || #handoff_status{transport_pid = Pid} <- Discard], + {reply, ok, State}; + false -> {reply, ok, State} end; - handle_call(get_concurrency, _From, State) -> Concurrency = get_concurrency_limit(), {reply, Concurrency, State}; - -handle_call({kill_in_direction, Direction}, _From, State=#state{handoffs=HS}) -> +handle_call({kill_in_direction, Direction}, _From, + State = #state{handoffs = HS}) -> %% TODO (atb): Refactor this to comply with max_concurrency logspam PR's exit codes %% NB. As-is this handles worker termination the same way as set_concurrency; %% no state update is performed here, we let the worker DOWNs mark them %% as dead rather than trimming here. - Kill = [H || H=#handoff_status{direction=D} <- HS, D =:= Direction], - _ = [erlang:exit(Pid, max_concurrency) || - #handoff_status{transport_pid=Pid} <- Kill], + Kill = [H + || H = #handoff_status{direction = D} <- HS, + D =:= Direction], + _ = [erlang:exit(Pid, max_concurrency) + || #handoff_status{transport_pid = Pid} <- Kill], {reply, ok, State}. 
-handle_cast({del_exclusion, {Mod, Idx}}, State=#state{excl=Excl}) -> +handle_cast({del_exclusion, {Mod, Idx}}, + State = #state{excl = Excl}) -> Excl2 = sets:del_element({Mod, Idx}, Excl), - {noreply, State#state{excl=Excl2}}; - -handle_cast({add_exclusion, {Mod, Idx}}, State=#state{excl=Excl}) -> + {noreply, State#state{excl = Excl2}}; +handle_cast({add_exclusion, {Mod, Idx}}, + State = #state{excl = Excl}) -> %% Note: This function used to trigger a ring event after adding an %% exclusion to ensure that an exiting node would eventually shutdown %% after all vnodes had finished handoff. This behavior is now handled %% by riak_core_vnode_manager:maybe_ensure_vnodes_started Excl2 = sets:add_element({Mod, Idx}, Excl), - {noreply, State#state{excl=Excl2}}; - -handle_cast({status_update, ModSrcTgt, StatsUpdate}, State=#state{handoffs=HS}) -> - case lists:keyfind(ModSrcTgt, #handoff_status.mod_src_tgt, HS) of - false -> - logger:error("status_update for non-existing handoff ~p", [ModSrcTgt]), - {noreply, State}; - HO -> - Stats2 = update_stats(StatsUpdate, HO#handoff_status.stats), - HO2 = HO#handoff_status{stats=Stats2}, - HS2 = lists:keyreplace(ModSrcTgt, #handoff_status.mod_src_tgt, HS, HO2), - {noreply, State#state{handoffs=HS2}} + {noreply, State#state{excl = Excl2}}; +handle_cast({status_update, ModSrcTgt, StatsUpdate}, + State = #state{handoffs = HS}) -> + case lists:keyfind(ModSrcTgt, + #handoff_status.mod_src_tgt, HS) + of + false -> + logger:error("status_update for non-existing handoff ~p", + [ModSrcTgt]), + {noreply, State}; + HO -> + Stats2 = update_stats(StatsUpdate, + HO#handoff_status.stats), + HO2 = HO#handoff_status{stats = Stats2}, + HS2 = lists:keyreplace(ModSrcTgt, + #handoff_status.mod_src_tgt, HS, HO2), + {noreply, State#state{handoffs = HS2}} end; - -handle_cast({send_handoff, Type, Mod, {Src, Target}, ReqOrigin, - {FilterMod, FilterFun}=FMF}, - State=#state{handoffs=HS}) -> - Filter = FilterMod:FilterFun(Target), +handle_cast({send_handoff, 
Type, Mod, {Src, Target}, + ReqOrigin, {Module, FilterFun} = FMF}, + State = #state{handoffs = HS}) -> + Filter = Module:FilterFun(Target), %% TODO: make a record? - {ok, VNode} = riak_core_vnode_manager:get_vnode_pid(Src, Mod), - case send_handoff(Type, {Mod, Src, Target}, ReqOrigin, VNode, HS, - {Filter, FMF}, ReqOrigin, []) of - {ok, Handoff} -> - HS2 = HS ++ [Handoff], - {noreply, State#state{handoffs=HS2}}; - _ -> - {noreply, State} + {ok, VNode} = riak_core_vnode_manager:get_vnode_pid(Src, + Mod), + case send_handoff(Type, {Mod, Src, Target}, ReqOrigin, + VNode, HS, {Filter, FMF}, ReqOrigin, []) + of + {ok, Handoff} -> + HS2 = HS ++ [Handoff], + {noreply, State#state{handoffs = HS2}}; + _ -> {noreply, State} end; - handle_cast({kill_xfer, ModSrcTarget, Reason}, State) -> HS = State#state.handoffs, HS2 = kill_xfer_i(ModSrcTarget, Reason, HS), - {noreply, State#state{handoffs=HS2}}. - -handle_info({'DOWN', Ref, process, _Pid, Reason}, State=#state{handoffs=HS}) -> - case lists:keytake(Ref, #handoff_status.transport_mon, HS) of - {value, - #handoff_status{mod_src_tgt={M, S, I}, direction=Dir, vnode_pid=Vnode, - vnode_mon=VnodeM, req_origin=Origin}, - NewHS - } -> - WarnVnode = - case Reason of - %% if the reason the handoff process died was anything other - %% than 'normal' we should log the reason why as an error - normal -> - false; - X when X == max_concurrency orelse - (element(1, X) == shutdown andalso - element(2, X) == max_concurrency) -> - logger:info("An ~w handoff of partition ~w ~w was terminated for reason: ~w~n", [Dir,M,I,Reason]), - true; - _ -> - logger:error("An ~w handoff of partition ~w ~w was terminated for reason: ~w~n", [Dir,M,I,Reason]), - true - end, - - %% if we have the vnode process pid, tell the vnode why the - %% handoff stopped so it can clean up its state - case WarnVnode andalso is_pid(Vnode) of - true -> - riak_core_vnode:handoff_error(Vnode, 'DOWN', Reason); - _ -> - case Origin of - none -> ok; + {noreply, State#state{handoffs = 
HS2}}. + +handle_info({'DOWN', Ref, process, _Pid, Reason}, + State = #state{handoffs = HS}) -> + case lists:keytake(Ref, #handoff_status.transport_mon, + HS) + of + {value, + #handoff_status{mod_src_tgt = {M, S, I}, + direction = Dir, vnode_pid = Vnode, vnode_mon = VnodeM, + req_origin = Origin}, + NewHS} -> + WarnVnode = case Reason of + %% if the reason the handoff process died was anything other + %% than 'normal' we should log the reason why as an error + normal -> false; + X + when X == max_concurrency orelse + element(1, X) == shutdown andalso + element(2, X) == max_concurrency -> + logger:info("An ~w handoff of partition ~w ~w was " + "terminated\n " + " for reason: ~w~n", + [Dir, M, I, Reason]), + true; _ -> - %% Use proplist instead so it's more - %% flexible in future, or does - %% capabilities nullify that? - Msg = {M, S, I}, - riak_core_vnode_manager:xfer_complete(Origin, Msg) - end, - ok - end, - - %% No monitor on vnode for receiver - if VnodeM /= undefined -> demonitor(VnodeM); - true -> ok - end, - - %% removed the handoff from the list of active handoffs - {noreply, State#state{handoffs=NewHS}}; - false -> - case lists:keytake(Ref, #handoff_status.vnode_mon, HS) of - {value, - #handoff_status{mod_src_tgt={M,_,I}, direction=Dir, - transport_pid=Trans, transport_mon=TransM}, - NewHS} -> - %% In this case the vnode died and the handoff - %% sender must be killed. 
- logger:error("An ~w handoff of partition ~w ~w was " - "terminated because the vnode died", - [Dir, M, I]), - demonitor(TransM), - exit(Trans, vnode_died), - {noreply, State#state{handoffs=NewHS}}; - _ -> - {noreply, State} - end + logger:error("An ~w handoff of partition ~w ~w was " + "terminated\n " + " for reason: ~w~n", + [Dir, M, I, Reason]), + true + end, + %% if we have the vnode process pid, tell the vnode why the + %% handoff stopped so it can clean up its state + case WarnVnode andalso is_pid(Vnode) of + true -> + riak_core_vnode:handoff_error(Vnode, 'DOWN', Reason); + _ -> + case Origin of + none -> ok; + _ -> + %% Use proplist instead so it's more + %% flexible in future, or does + %% capabilities nullify that? + Msg = {M, S, I}, + riak_core_vnode_manager:xfer_complete(Origin, Msg) + end, + ok + end, + %% No monitor on vnode for receiver + if VnodeM /= undefined -> demonitor(VnodeM); + true -> ok + end, + %% removed the handoff from the list of active handoffs + {noreply, State#state{handoffs = NewHS}}; + false -> + case lists:keytake(Ref, #handoff_status.vnode_mon, HS) + of + {value, + #handoff_status{mod_src_tgt = {M, _, I}, + direction = Dir, transport_pid = Trans, + transport_mon = TransM}, + NewHS} -> + %% In this case the vnode died and the handoff + %% sender must be killed. + logger:error("An ~w handoff of partition ~w ~w was " + "terminated because the vnode died", + [Dir, M, I]), + demonitor(TransM), + exit(Trans, vnode_died), + {noreply, State#state{handoffs = NewHS}}; + _ -> {noreply, State} + end end. +terminate(_Reason, _State) -> ok. -terminate(_Reason, _State) -> - ok. - - -code_change(_OldVsn, State, _Extra) -> - {ok, State}. - +code_change(_OldVsn, State, _Extra) -> {ok, State}. 
%%%=================================================================== %%% Private %%%=================================================================== build_status(HO) -> - #handoff_status{mod_src_tgt={Mod, SrcP, TargetP}, - src_node=SrcNode, - target_node=TargetNode, - direction=Dir, - status=Status, - timestamp=StartTS, - transport_pid=TPid, - type=Type}=HO, - {status_v2, [{mod, Mod}, - {src_partition, SrcP}, - {target_partition, TargetP}, - {src_node, SrcNode}, - {target_node, TargetNode}, - {direction, Dir}, - {status, Status}, - {start_ts, StartTS}, - {sender_pid, TPid}, - {stats, calc_stats(HO)}, - {type, Type}]}. - -calc_stats(#handoff_status{stats=Stats,timestamp=StartTS,size=Size}) -> + #handoff_status{mod_src_tgt = {Mod, SrcP, TargetP}, + src_node = SrcNode, target_node = TargetNode, + direction = Dir, status = Status, timestamp = StartTS, + transport_pid = TPid, type = Type} = + HO, + {status_v2, + [{mod, Mod}, {src_partition, SrcP}, + {target_partition, TargetP}, {src_node, SrcNode}, + {target_node, TargetNode}, {direction, Dir}, + {status, Status}, {start_ts, StartTS}, + {sender_pid, TPid}, {stats, calc_stats(HO)}, + {type, Type}]}. 
+ +calc_stats(#handoff_status{stats = Stats, + timestamp = StartTS, size = Size}) -> case dict:find(last_update, Stats) of - error -> - no_stats; - {ok, LastUpdate} -> - Objs = dict:fetch(objs, Stats), - Bytes = dict:fetch(bytes, Stats), - CalcSize = get_size(Size), - Done = calc_pct_done(Objs, Bytes, CalcSize), - ElapsedS = timer:now_diff(LastUpdate, StartTS) / 1000000, - ObjsS = round(Objs / ElapsedS), - BytesS = round(Bytes / ElapsedS), - [{objs_total, Objs}, - {objs_per_s, ObjsS}, - {bytes_per_s, BytesS}, - {last_update, LastUpdate}, - {size, CalcSize}, - {pct_done_decimal, Done}] + error -> no_stats; + {ok, LastUpdate} -> + Objs = dict:fetch(objs, Stats), + Bytes = dict:fetch(bytes, Stats), + CalcSize = get_size(Size), + Done = calc_pct_done(Objs, Bytes, CalcSize), + ElapsedS = timer:now_diff(LastUpdate, StartTS) / + 1000000, + ObjsS = round(Objs / ElapsedS), + BytesS = round(Bytes / ElapsedS), + [{objs_total, Objs}, {objs_per_s, ObjsS}, + {bytes_per_s, BytesS}, {last_update, LastUpdate}, + {size, CalcSize}, {pct_done_decimal, Done}] end. -get_size({F, dynamic}) -> - F(); -get_size(S) -> - S. - -calc_pct_done(_, _, undefined) -> - undefined; -calc_pct_done(Objs, _, {Size, objects}) -> - Objs / Size; -calc_pct_done(_, Bytes, {Size, bytes}) -> - Bytes / Size. - -filter(none) -> - fun(_) -> true end; -filter({Key, Value}=_Filter) -> - fun({status_v2, Status}) -> +get_size({F, dynamic}) -> F(); +get_size(S) -> S. + +calc_pct_done(_, _, undefined) -> undefined; +calc_pct_done(Objs, _, {Size, objects}) -> Objs / Size; +calc_pct_done(_, Bytes, {Size, bytes}) -> Bytes / Size. + +filter(none) -> fun (_) -> true end; +filter({Key, Value} = _Filter) -> + fun ({status_v2, Status}) -> case proplists:get_value(Key, Status) of - Value -> true; - _ -> false + Value -> true; + _ -> false end end. 
-resize_transfer_filter(Ring, Mod, Src, Target) -> - fun(K) -> - {_, Hashed} = Mod:object_info(K), - riak_core_ring:is_future_index(Hashed, - Src, - Target, +resize_transfer_filter(Ring, Module, Src, Target) -> + fun (K) -> + {_, Hashed} = Module:object_info(K), + riak_core_ring:is_future_index(Hashed, Src, Target, Ring) end. -resize_transfer_notsent_fun(Ring, Mod, Src) -> - Shrinking = riak_core_ring:num_partitions(Ring) > riak_core_ring:future_num_partitions(Ring), - case Shrinking of - false -> NValMap = DefaultN = undefined; - true -> - NValMap = Mod:nval_map(Ring), - DefaultN = riak_core_bucket:default_object_nval() - end, - fun(Key, Acc) -> record_seen_index(Ring, Shrinking, NValMap, DefaultN, Mod, Src, Key, Acc) end. - -record_seen_index(Ring, Shrinking, NValMap, DefaultN, Mod, Src, Key, Seen) -> - {Bucket, Hashed} = Mod:object_info(Key), +resize_transfer_notsent_fun(Ring, Module, Src) -> + Shrinking = riak_core_ring:num_partitions(Ring) > + riak_core_ring:future_num_partitions(Ring), + {NValMap, DefaultN} = case Shrinking of + false -> {undefined, undefined}; + true -> + {ok, DefN} = application:get_env(riak_core, + target_n_val), + {Module:nval_map(Ring), DefN} + end, + fun (Key, Acc) -> + record_seen_index(Ring, Shrinking, NValMap, DefaultN, + Module, Src, Key, Acc) + end. + +record_seen_index(Ring, Shrinking, NValMap, DefaultN, + Module, Src, Key, Seen) -> + {Bucket, Hashed} = Module:object_info(Key), CheckNVal = case Shrinking of - false -> undefined; - true -> proplists:get_value(Bucket, NValMap, DefaultN) + false -> undefined; + true -> proplists:get_value(Bucket, NValMap, DefaultN) end, - case riak_core_ring:future_index(Hashed, Src, CheckNVal, Ring) of - undefined -> Seen; - FutureIndex -> ordsets:add_element(FutureIndex, Seen) + case riak_core_ring:future_index(Hashed, Src, CheckNVal, + Ring) + of + undefined -> Seen; + FutureIndex -> ordsets:add_element(FutureIndex, Seen) end. 
-get_concurrency_limit () -> - application:get_env(riak_core,handoff_concurrency,?HANDOFF_CONCURRENCY). +get_concurrency_limit() -> + application:get_env(riak_core, handoff_concurrency, + ?HANDOFF_CONCURRENCY). %% true if handoff_concurrency (inbound + outbound) hasn't yet been reached -handoff_concurrency_limit_reached () -> - Receivers=supervisor:count_children(riak_core_handoff_receiver_sup), - Senders=supervisor:count_children(riak_core_handoff_sender_sup), - ActiveReceivers=proplists:get_value(active,Receivers), - ActiveSenders=proplists:get_value(active,Senders), - get_concurrency_limit() =< (ActiveReceivers + ActiveSenders). - -send_handoff(HOType, ModSrcTarget, Node, Pid, HS,Opts) -> - send_handoff(HOType, ModSrcTarget, Node, Pid, HS, {none, none}, none, Opts). +handoff_concurrency_limit_reached() -> + Receivers = + supervisor:count_children(riak_core_handoff_receiver_sup), + Senders = + supervisor:count_children(riak_core_handoff_sender_sup), + ActiveReceivers = proplists:get_value(active, + Receivers), + ActiveSenders = proplists:get_value(active, Senders), + get_concurrency_limit() =< + ActiveReceivers + ActiveSenders. + +send_handoff(HOType, ModSrcTarget, Node, Pid, HS, + Opts) -> + send_handoff(HOType, ModSrcTarget, Node, Pid, HS, + {none, none}, none, Opts). %% @private %% @@ -475,126 +504,115 @@ send_handoff(HOType, ModSrcTarget, Node, Pid, HS,Opts) -> %% function which is a predicate applied to the key. The %% `Origin' is the node this request originated from so a reply %% can't be sent on completion. --spec send_handoff(ho_type(), {module(), index(), index()}, node(), - pid(), list(), - {predicate() | none, {module(), atom()} | none}, node(), [{atom(), term()}]) -> - {ok, handoff_status()} - | {error, max_concurrency} - | {false, handoff_status()}. 
-send_handoff(HOType, {Mod, Src, Target}, Node, Vnode, HS, {Filter, FilterModFun}, Origin, Opts) -> +-spec send_handoff(ho_type(), + {module(), index(), index()}, node(), pid(), list(), + {predicate() | none, {module(), atom()} | none}, node(), + [{atom(), term()}]) -> {ok, handoff_status()} | + {error, max_concurrency} | + {false, handoff_status()}. + +send_handoff(HOType, {Mod, Src, Target}, Node, Vnode, + HS, {Filter, FilterModFun}, Origin, Opts) -> case handoff_concurrency_limit_reached() of - true -> - {error, max_concurrency}; - false -> - ShouldHandoff= - case lists:keyfind({Mod, Src, Target}, #handoff_status.mod_src_tgt, HS) of - false -> - true; - Handoff=#handoff_status{target_node=Node,vnode_pid=Vnode} -> - {false,Handoff}; - #handoff_status{transport_pid=Sender} -> - %% found a running handoff with a different vnode - %% source or a different target node, kill the current - %% one and the new one will start up - erlang:exit(Sender,resubmit_handoff_change), - true + true -> {error, max_concurrency}; + false -> + ShouldHandoff = case lists:keyfind({Mod, Src, Target}, + #handoff_status.mod_src_tgt, HS) + of + false -> true; + Handoff = #handoff_status{target_node = Node, + vnode_pid = Vnode} -> + {false, Handoff}; + #handoff_status{transport_pid = Sender} -> + %% found a running handoff with a different vnode + %% source or a different target node, kill the current + %% one and the new one will start up + erlang:exit(Sender, resubmit_handoff_change), + true + end, + case ShouldHandoff of + true -> + VnodeM = monitor(process, Vnode), + %% start the sender process + BaseOpts = [{src_partition, Src}, + {target_partition, Target}], + case HOType of + repair -> + HOFilter = Filter, + HOAcc0 = undefined, + HONotSentFun = undefined; + resize -> + {ok, Ring} = riak_core_ring_manager:get_my_ring(), + HOFilter = resize_transfer_filter(Ring, Mod, Src, + Target), + HOAcc0 = ordsets:new(), + HONotSentFun = resize_transfer_notsent_fun(Ring, Mod, + Src); + _ -> + 
HOFilter = none, + HOAcc0 = undefined, + HONotSentFun = undefined end, - - case ShouldHandoff of - true -> - VnodeM = monitor(process, Vnode), - %% start the sender process - BaseOpts = [{src_partition, Src}, {target_partition, Target}], - case HOType of - repair -> - HOFilter = Filter, - HOAcc0 = undefined, - HONotSentFun = undefined; - resize -> - {ok, Ring} = riak_core_ring_manager:get_my_ring(), - HOFilter = resize_transfer_filter(Ring, Mod, Src, Target), - HOAcc0 = ordsets:new(), - HONotSentFun = resize_transfer_notsent_fun(Ring, Mod, Src); - _ -> - HOFilter = none, - HOAcc0 = undefined, - HONotSentFun = undefined - end, - HOOpts = [{filter, HOFilter}, - {notsent_acc0, HOAcc0}, - {notsent_fun, HONotSentFun} | BaseOpts], - {ok, Pid} = riak_core_handoff_sender_sup:start_sender(HOType, - Mod, - Node, - Vnode, - HOOpts), - PidM = monitor(process, Pid), - Size = validate_size(proplists:get_value(size, Opts)), - - %% successfully started up a new sender handoff - {ok, #handoff_status{ transport_pid=Pid, - transport_mon=PidM, - direction=outbound, - timestamp=os:timestamp(), - src_node=node(), - target_node=Node, - mod_src_tgt={Mod, Src, Target}, - vnode_pid=Vnode, - vnode_mon=VnodeM, - status=[], - stats=dict:new(), - type=HOType, - req_origin=Origin, - filter_mod_fun=FilterModFun, - size=Size - } - }; - - %% handoff already going, just return it - AlreadyExists={false,_CurrentHandoff} -> - AlreadyExists - end + HOOpts = [{filter, HOFilter}, {notsent_acc0, HOAcc0}, + {notsent_fun, HONotSentFun} + | BaseOpts], + {ok, Pid} = + riak_core_handoff_sender_sup:start_sender(HOType, Mod, + Node, Vnode, + HOOpts), + PidM = monitor(process, Pid), + Size = validate_size(proplists:get_value(size, Opts)), + %% successfully started up a new sender handoff + {ok, + #handoff_status{transport_pid = Pid, + transport_mon = PidM, direction = outbound, + timestamp = os:timestamp(), src_node = node(), + target_node = Node, + mod_src_tgt = {Mod, Src, Target}, + vnode_pid = Vnode, vnode_mon 
= VnodeM, + status = [], stats = dict:new(), type = HOType, + req_origin = Origin, + filter_mod_fun = FilterModFun, size = Size}}; + %% handoff already going, just return it + AlreadyExists = {false, _CurrentHandoff} -> + AlreadyExists + end end. %% spawn a receiver process -receive_handoff () -> +receive_handoff() -> case handoff_concurrency_limit_reached() of - true -> - {error, max_concurrency}; - false -> - {ok,Pid}=riak_core_handoff_receiver_sup:start_receiver(), - PidM = monitor(process, Pid), - - %% successfully started up a new receiver - {ok, #handoff_status{ transport_pid=Pid, - transport_mon=PidM, - direction=inbound, - timestamp=os:timestamp(), - mod_src_tgt={undefined, undefined, undefined}, - src_node=undefined, - target_node=undefined, - status=[], - stats=dict:new(), - req_origin=none - } - } + true -> {error, max_concurrency}; + false -> + {ok, Pid} = + riak_core_handoff_receiver_sup:start_receiver(), + PidM = monitor(process, Pid), + %% successfully started up a new receiver + {ok, + #handoff_status{transport_pid = Pid, + transport_mon = PidM, direction = inbound, + timestamp = os:timestamp(), + mod_src_tgt = {undefined, undefined, undefined}, + src_node = undefined, target_node = undefined, + status = [], stats = dict:new(), req_origin = none}} end. update_stats(StatsUpdate, Stats) -> - #ho_stats{last_update=LU, objs=Objs, bytes=Bytes}=StatsUpdate, + #ho_stats{last_update = LU, objs = Objs, + bytes = Bytes} = + StatsUpdate, Stats2 = dict:update_counter(objs, Objs, Stats), Stats3 = dict:update_counter(bytes, Bytes, Stats2), dict:store(last_update, LU, Stats3). -validate_size(Size={N, U}) when is_number(N) andalso - N > 0 andalso - (U =:= bytes orelse U =:= objects) -> +validate_size(Size = {N, U}) + when is_number(N) andalso + N > 0 andalso (U =:= bytes orelse U =:= objects) -> Size; -validate_size(Size={F, dynamic}) when is_function(F) -> +validate_size(Size = {F, dynamic}) + when is_function(F) -> Size; -validate_size(_) -> - undefined. 
- +validate_size(_) -> undefined. %% @private %% @@ -602,154 +620,137 @@ validate_size(_) -> %% with `Reason'. There might be more than one because repair %% can have two simultaneous inbound xfers. kill_xfer_i(ModSrcTarget, Reason, HS) -> - case lists:keytake(ModSrcTarget, #handoff_status.mod_src_tgt, HS) of - false -> - HS; - {value, Xfer, HS2} -> - #handoff_status{mod_src_tgt={Mod, SrcPartition, TargetPartition}, - type=Type, - target_node=TargetNode, - src_node=SrcNode, - transport_pid=TP - } = Xfer, - Msg = "~p transfer of ~p from ~p ~p to ~p ~p killed for reason ~p", - case Type of - undefined -> - ok; - _ -> - logger:info(Msg, [Type, Mod, SrcNode, SrcPartition, - TargetNode, TargetPartition, Reason]) - end, - exit(TP, {kill_xfer, Reason}), - kill_xfer_i(ModSrcTarget, Reason, HS2) + case lists:keytake(ModSrcTarget, + #handoff_status.mod_src_tgt, HS) + of + false -> HS; + {value, Xfer, HS2} -> + #handoff_status{mod_src_tgt = + {Mod, SrcPartition, TargetPartition}, + type = Type, target_node = TargetNode, + src_node = SrcNode, transport_pid = TP} = + Xfer, + Msg = "~p transfer of ~p from ~p ~p to ~p ~p " + "killed for reason ~p", + case Type of + undefined -> ok; + _ -> + logger:info(Msg, + [Type, Mod, SrcNode, SrcPartition, TargetNode, + TargetPartition, Reason]) + end, + exit(TP, {kill_xfer, Reason}), + kill_xfer_i(ModSrcTarget, Reason, HS2) end. handoff_change_enabled_setting(EnOrDis, Direction) -> SetFun = case EnOrDis of - enable -> fun handoff_enable/1; - disable -> fun handoff_disable/1 + enable -> fun handoff_enable/1; + disable -> fun handoff_disable/1 end, case Direction of - inbound -> - SetFun(inbound); - outbound -> - SetFun(outbound); - both -> - SetFun(inbound), - SetFun(outbound) + inbound -> SetFun(inbound); + outbound -> SetFun(outbound); + both -> SetFun(inbound), SetFun(outbound) end. 
handoff_enable(inbound) -> - application:set_env(riak_core, disable_inbound_handoff, false); + application:set_env(riak_core, disable_inbound_handoff, + false); handoff_enable(outbound) -> - application:set_env(riak_core, disable_outbound_handoff, false). + application:set_env(riak_core, disable_outbound_handoff, + false). handoff_disable(inbound) -> - application:set_env(riak_core, disable_inbound_handoff, true), + application:set_env(riak_core, disable_inbound_handoff, + true), kill_handoffs_in_direction(inbound); handoff_disable(outbound) -> - application:set_env(riak_core, disable_outbound_handoff, true), + application:set_env(riak_core, disable_outbound_handoff, + true), kill_handoffs_in_direction(outbound). %%%=================================================================== %%% Tests %%%=================================================================== --ifdef (TEST). +-ifdef(TEST). -handoff_test_ () -> +handoff_test_() -> {spawn, {setup, - %% called when the tests start and complete... fun () -> {ok, ManPid} = start_link(), - {ok, RSupPid} = riak_core_handoff_receiver_sup:start_link(), - {ok, SSupPid} = riak_core_handoff_sender_sup:start_link(), + {ok, RSupPid} = + riak_core_handoff_receiver_sup:start_link(), + {ok, SSupPid} = + riak_core_handoff_sender_sup:start_link(), [ManPid, RSupPid, SSupPid] end, - fun (PidList) -> lists:foreach(fun(Pid) -> exit(Pid, kill) end, PidList) end, - + fun (PidList) -> + lists:foreach(fun (Pid) -> exit(Pid, kill) end, PidList) + end, %% actual list of test - [?_test(simple_handoff()), - ?_test(config_disable()) - ]}}. - -simple_handoff () -> - ?assertEqual([],status()), + [?_test((simple_handoff())), + ?_test((config_disable()))]}}. 
+simple_handoff() -> + ?assertEqual([], (status())), %% clear handoff_concurrency and make sure a handoff fails - ?assertEqual(ok,set_concurrency(0)), - ?assertEqual({error,max_concurrency},add_inbound()), - ?assertEqual({error,max_concurrency},add_outbound(ownership,riak_kv_vnode, 0,node(),self(),[])), - + ?assertEqual(ok, (set_concurrency(0))), + ?assertEqual({error, max_concurrency}, (add_inbound())), + ?assertEqual({error, max_concurrency}, + (add_outbound(ownership, riak_kv_vnode, 0, node(), + self(), []))), %% allow for a single handoff - ?assertEqual(ok,set_concurrency(1)), - + ?assertEqual(ok, (set_concurrency(1))), %% done ok. -config_disable () -> +config_disable() -> %% expect error log error_logger:tty(false), - - ?assertEqual(ok, handoff_enable(inbound)), - ?assertEqual(ok, handoff_enable(outbound)), - ?assertEqual(ok, set_concurrency(2)), - - ?assertEqual([], status()), - + ?assertEqual(ok, (handoff_enable(inbound))), + ?assertEqual(ok, (handoff_enable(outbound))), + ?assertEqual(ok, (set_concurrency(2))), + ?assertEqual([], (status())), Res = add_inbound(), ?assertMatch({ok, _}, Res), {ok, Pid} = Res, - - ?assertEqual(1, length(status())), - + ?assertEqual(1, (length(status()))), Ref = monitor(process, Pid), - - CatchDownFun = fun() -> + CatchDownFun = fun () -> receive - {'DOWN', Ref, process, Pid, max_concurrency} -> - ok; - Other -> - {error, unexpected_message, Other} - after - 1000 -> - {error, timeout_waiting_for_down_msg} + {'DOWN', Ref, process, Pid, max_concurrency} -> ok; + Other -> {error, unexpected_message, Other} + after 1000 -> {error, timeout_waiting_for_down_msg} end end, - - ?assertEqual(ok, handoff_disable(inbound)), - ?assertEqual(ok, CatchDownFun()), + ?assertEqual(ok, (handoff_disable(inbound))), + ?assertEqual(ok, (CatchDownFun())), %% We use wait_until because it's possible that the handoff manager process %% could get our call to status/0 before it receives the 'DOWN' message, %% so we periodically retry the call for a while 
until we get the answer we %% expect, or until we time out. - Status0 = fun() -> length(status()) =:= 0 end, - ?assertEqual(ok, wait_until(Status0, 500, 1)), - - - ?assertEqual({error, max_concurrency}, add_inbound()), - - ?assertEqual(ok, handoff_enable(inbound)), - ?assertEqual(ok, handoff_enable(outbound)), - ?assertEqual(0, length(status())), - - ?assertMatch({ok, _}, add_inbound()), - ?assertEqual(1, length(status())), + Status0 = fun () -> length(status()) =:= 0 end, + ?assertEqual(ok, (wait_until(Status0, 500, 1))), + ?assertEqual({error, max_concurrency}, (add_inbound())), + ?assertEqual(ok, (handoff_enable(inbound))), + ?assertEqual(ok, (handoff_enable(outbound))), + ?assertEqual(0, (length(status()))), + ?assertMatch({ok, _}, (add_inbound())), + ?assertEqual(1, (length(status()))), error_logger:tty(true). %% Copied from riak_test's rt.erl: wait_until(Fun, Retry, Delay) when Retry > 0 -> Res = Fun(), case Res of - true -> - ok; - _ when Retry == 1 -> - {fail, Res}; - _ -> - timer:sleep(Delay), - wait_until(Fun, Retry-1, Delay) + true -> ok; + _ when Retry == 1 -> {fail, Res}; + _ -> + timer:sleep(Delay), wait_until(Fun, Retry - 1, Delay) end. -endif. diff --git a/src/riak_core_handoff_receiver.erl b/src/riak_core_handoff_receiver.erl index 663a2b2c1..d6558297e 100644 --- a/src/riak_core_handoff_receiver.erl +++ b/src/riak_core_handoff_receiver.erl @@ -21,138 +21,156 @@ %% @doc incoming data handler for TCP-based handoff -module(riak_core_handoff_receiver). + -include("riak_core_handoff.hrl"). --behaviour(riak_core_gen_server). --export([start_link/0, - set_socket/2, + +-behaviour(gen_server). + +-export([start_link/0, set_socket/2, supports_batching/0]). --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). 
- --record(state, {sock :: port() | undefined, - peer :: term(), - recv_timeout_len :: non_neg_integer(), - vnode_timeout_len :: non_neg_integer(), - partition :: non_neg_integer() | undefined, - vnode_mod = riak_kv_vnode:: module(), - vnode :: pid() | undefined, - count = 0 :: non_neg_integer()}). + +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). + +-record(state, + {sock :: port() | undefined, peer :: term(), + recv_timeout_len :: non_neg_integer(), + vnode_timeout_len :: non_neg_integer(), + partition :: non_neg_integer() | undefined, + vnode_mod = riak_kv_vnode :: module(), + vnode :: pid() | undefined, + count = 0 :: non_neg_integer()}). %% set the TCP receive timeout to five minutes to be conservative. -define(RECV_TIMEOUT, 300000). + %% set the timeout for the vnode to process the handoff_data msg to 60s -define(VNODE_TIMEOUT, 60000). --ifdef(deprecated_21). -ssl_handshake(Socket, SslOpts, Timeout) -> - ssl:handshake(Socket, SslOpts, Timeout). --else. -ssl_handshake(Socket, SslOpts, Timeout) -> - ssl:ssl_accept(Socket, SslOpts, Timeout). --endif. - -start_link() -> - riak_core_gen_server:start_link(?MODULE, [], []). +start_link() -> gen_server:start_link(?MODULE, [], []). set_socket(Pid, Socket) -> - riak_core_gen_server:call(Pid, {set_socket, Socket}). + gen_server:call(Pid, {set_socket, Socket}). -supports_batching() -> - true. +supports_batching() -> true. init([]) -> - {ok, #state{recv_timeout_len = application:get_env(riak_core, handoff_receive_timeout, ?RECV_TIMEOUT), - vnode_timeout_len = application:get_env(riak_core, handoff_receive_vnode_timeout, ?VNODE_TIMEOUT)}}. + {ok, + #state{recv_timeout_len = + application:get_env(riak_core, handoff_receive_timeout, + ?RECV_TIMEOUT), + vnode_timeout_len = + application:get_env(riak_core, + handoff_receive_vnode_timeout, + ?VNODE_TIMEOUT)}}. 
handle_call({set_socket, Socket0}, _From, State) -> SockOpts = [{active, once}, {packet, 4}, {header, 1}], ok = inet:setopts(Socket0, SockOpts), Peer = safe_peername(Socket0, inet), Socket = Socket0, - {reply, ok, State#state { sock = Socket, peer = Peer }}. - -handle_info({tcp_closed,_Socket},State=#state{partition=Partition,count=Count, - peer=Peer}) -> - logger:info("Handoff receiver for partition ~p exited after processing ~p" - " objects from ~p", [Partition, Count, Peer]), + {reply, ok, State#state{sock = Socket, peer = Peer}}. + +handle_info({tcp_closed, _Socket}, + State = #state{partition = Partition, count = Count, + peer = Peer}) -> + logger:info("Handoff receiver for partition ~p exited " + "after processing ~p objects from ~p", + [Partition, Count, Peer]), {stop, normal, State}; -handle_info({tcp_error, _Socket, Reason}, State=#state{partition=Partition,count=Count, - peer=Peer}) -> - logger:info("Handoff receiver for partition ~p exited after processing ~p" - " objects from ~p: TCP error ~p", [Partition, Count, Peer, Reason]), +handle_info({tcp_error, _Socket, Reason}, + State = #state{partition = Partition, count = Count, + peer = Peer}) -> + logger:info("Handoff receiver for partition ~p exited " + "after processing ~p objects from ~p: " + "TCP error ~p", + [Partition, Count, Peer, Reason]), {stop, normal, State}; handle_info({tcp, Socket, Data}, State) -> - [MsgType|MsgData] = Data, - case catch(process_message(MsgType, MsgData, State)) of - {'EXIT', Reason} -> - logger:error("Handoff receiver for partition ~p exited abnormally after " - "processing ~p objects from ~p: ~p", [State#state.partition, State#state.count, State#state.peer, Reason]), - {stop, normal, State}; - NewState when is_record(NewState, state) -> - InetMod = inet, - InetMod:setopts(Socket, [{active, once}]), - {noreply, NewState, State#state.recv_timeout_len} + [MsgType | MsgData] = Data, + case catch process_message(MsgType, MsgData, State) of + {'EXIT', Reason} -> + 
logger:error("Handoff receiver for partition ~p exited " + "abnormally after processing ~p objects " + "from ~p: ~p", + [State#state.partition, State#state.count, + State#state.peer, Reason]), + {stop, normal, State}; + NewState when is_record(NewState, state) -> + inet:setopts(Socket, [{active, once}]), + {noreply, NewState, State#state.recv_timeout_len} end; handle_info(timeout, State) -> - logger:error("Handoff receiver for partition ~p timed out after " - "processing ~p objects from ~p.", [State#state.partition, State#state.count, State#state.peer]), + logger:error("Handoff receiver for partition ~p timed " + "out after processing ~p objects from " + "~p.", + [State#state.partition, State#state.count, + State#state.peer]), {stop, normal, State}. -process_message(?PT_MSG_INIT, MsgData, State=#state{vnode_mod=VNodeMod, - peer=Peer}) -> +process_message(?PT_MSG_INIT, MsgData, + State = #state{vnode_mod = VNodeMod, peer = Peer}) -> <> = MsgData, - logger:info("Receiving handoff data for partition ~p:~p from ~p", [VNodeMod, Partition, Peer]), - {ok, VNode} = riak_core_vnode_master:get_vnode_pid(Partition, VNodeMod), + logger:info("Receiving handoff data for partition " + "~p:~p from ~p", + [VNodeMod, Partition, Peer]), + {ok, VNode} = + riak_core_vnode_master:get_vnode_pid(Partition, + VNodeMod), Data = [{mod_src_tgt, {VNodeMod, undefined, Partition}}, {vnode_pid, VNode}], riak_core_handoff_manager:set_recv_data(self(), Data), - State#state{partition=Partition, vnode=VNode}; - + State#state{partition = Partition, vnode = VNode}; process_message(?PT_MSG_BATCH, MsgData, State) -> - lists:foldl(fun(Obj, StateAcc) -> process_message(?PT_MSG_OBJ, Obj, StateAcc) end, - State, - binary_to_term(MsgData)); - -process_message(?PT_MSG_OBJ, MsgData, State=#state{vnode=VNode, count=Count, - vnode_timeout_len=VNodeTimeout}) -> - Msg = {handoff_data, MsgData}, - try gen_fsm:sync_send_all_state_event(VNode, Msg, VNodeTimeout) of - ok -> - State#state{count=Count+1}; - E={error, _} 
-> - exit(E) + lists:foldl(fun (Obj, StateAcc) -> + process_message(?PT_MSG_OBJ, Obj, StateAcc) + end, + State, binary_to_term(MsgData)); +process_message(?PT_MSG_OBJ, MsgData, + State = #state{vnode = VNode, count = Count, + vnode_timeout_len = VNodeTimeout}) -> + try riak_core_vnode:handoff_data(VNode, MsgData, + VNodeTimeout) + of + ok -> State#state{count = Count + 1}; + E = {error, _} -> exit(E) catch - exit:{timeout, _} -> - exit({error, {vnode_timeout, VNodeTimeout, size(MsgData), - binary:part(MsgData, {0,min(size(MsgData),128)})}}) + exit:{timeout, _} -> + exit({error, + {vnode_timeout, VNodeTimeout, size(MsgData), + binary:part(MsgData, {0, min(size(MsgData), 128)})}}) end; -process_message(?PT_MSG_OLDSYNC, MsgData, State=#state{sock=Socket}) -> - gen_tcp:send(Socket, <>), +process_message(?PT_MSG_OLDSYNC, MsgData, + State = #state{sock = Socket}) -> + gen_tcp:send(Socket, <<(?PT_MSG_OLDSYNC):8, "sync">>), <> = MsgData, VNodeMod = binary_to_atom(VNodeModBin, utf8), - State#state{vnode_mod=VNodeMod}; -process_message(?PT_MSG_SYNC, _MsgData, State=#state{sock=Socket}) -> - gen_tcp:send(Socket, <>), + State#state{vnode_mod = VNodeMod}; +process_message(?PT_MSG_SYNC, _MsgData, + State = #state{sock = Socket}) -> + gen_tcp:send(Socket, <<(?PT_MSG_SYNC):8, "sync">>), State; - -process_message(?PT_MSG_VERIFY_NODE, ExpectedName, State=#state{sock=Socket, - peer=Peer}) -> +process_message(?PT_MSG_VERIFY_NODE, ExpectedName, + State = #state{sock = Socket, peer = Peer}) -> case binary_to_term(ExpectedName) of - _Node when _Node =:= node() -> - gen_tcp:send(Socket, <>), - State; - Node -> - logger:error("Handoff from ~p expects us to be ~s but we are ~s.", - [Peer, Node, node()]), - exit({error, {wrong_node, Node}}) + _Node when _Node =:= node() -> + gen_tcp:send(Socket, <<(?PT_MSG_VERIFY_NODE):8>>), + State; + Node -> + logger:error("Handoff from ~p expects us to be ~s " + "but we are ~s.", + [Peer, Node, node()]), + exit({error, {wrong_node, Node}}) end; - 
process_message(?PT_MSG_CONFIGURE, MsgData, State) -> ConfProps = binary_to_term(MsgData), - State#state{vnode_mod=proplists:get_value(vnode_mod, ConfProps), - partition=proplists:get_value(partition, ConfProps)}; -process_message(_, _MsgData, State=#state{sock=Socket}) -> - gen_tcp:send(Socket, <>), + State#state{vnode_mod = + proplists:get_value(vnode_mod, ConfProps), + partition = proplists:get_value(partition, ConfProps)}; +process_message(_, _MsgData, + State = #state{sock = Socket}) -> + gen_tcp:send(Socket, + <<(?PT_MSG_UNKNOWN):8, "unknown_msg">>), State. handle_cast(_Msg, State) -> {noreply, State}. @@ -161,10 +179,10 @@ terminate(_Reason, _State) -> ok. code_change(_OldVsn, State, _Extra) -> {ok, State}. -safe_peername(Skt, Mod) -> - case Mod:peername(Skt) of - {ok, {Host, Port}} -> - {inet_parse:ntoa(Host), Port}; - _ -> - {unknown, unknown} % Real info is {Addr, Port} +safe_peername(Skt, Module) -> + case Module:peername(Skt) of + {ok, {Host, Port}} -> {inet_parse:ntoa(Host), Port}; + _ -> + {unknown, + unknown} % Real info is {Addr, Port} end. diff --git a/src/riak_core_handoff_receiver_sup.erl b/src/riak_core_handoff_receiver_sup.erl index 3f50f0e17..0a9402f10 100644 --- a/src/riak_core_handoff_receiver_sup.erl +++ b/src/riak_core_handoff_receiver_sup.erl @@ -19,6 +19,7 @@ %% ------------------------------------------------------------------- -module(riak_core_handoff_receiver_sup). + -behaviour(supervisor). %% beahvior functions @@ -27,18 +28,19 @@ %% public functions -export([start_receiver/0]). --define(CHILD(I,Type), {I,{I,start_link,[]},temporary,brutal_kill,Type,[I]}). +-define(CHILD(I, Type), + {I, {I, start_link, []}, temporary, brutal_kill, Type, + [I]}). %% begins the supervisor, init/1 will be called -start_link () -> - supervisor:start_link({local,?MODULE},?MODULE,[]). +start_link() -> + supervisor:start_link({local, ?MODULE}, ?MODULE, []). 
%% @private -init ([]) -> - {ok,{{simple_one_for_one,10,10}, - [?CHILD(riak_core_handoff_receiver,worker) - ]}}. +init([]) -> + {ok, + {{simple_one_for_one, 10, 10}, + [?CHILD(riak_core_handoff_receiver, worker)]}}. %% start a sender process -start_receiver () -> - supervisor:start_child(?MODULE,[]). +start_receiver() -> supervisor:start_child(?MODULE, []). diff --git a/src/riak_core_handoff_sender.erl b/src/riak_core_handoff_sender.erl index ad0daa730..6da4f0a37 100644 --- a/src/riak_core_handoff_sender.erl +++ b/src/riak_core_handoff_sender.erl @@ -21,66 +21,62 @@ %% @doc send a partition's data via TCP-based handoff -module(riak_core_handoff_sender). + -export([start_link/4]). + -include("riak_core_vnode.hrl"). + -include("riak_core_handoff.hrl"). --include("stacktrace.hrl"). + -define(ACK_COUNT, 1000). + %% can be set with env riak_core, handoff_timeout -define(TCP_TIMEOUT, 60000). + %% can be set with env riak_core, handoff_status_interval %% note this is in seconds -define(STATUS_INTERVAL, 2). --define(log_info(Str, Args), - logger:info("~p transfer of ~p from ~p ~p to ~p ~p failed " ++ Str, - [Type, Module, SrcNode, SrcPartition, TargetNode, - TargetPartition] ++ Args)). --define(log_fail(Str, Args), - logger:error("~p transfer of ~p from ~p ~p to ~p ~p failed " ++ Str, +-define(LOG_INFO(Str, Args), + logger:info("~p transfer of ~p from ~p ~p to ~p ~p " + "failed " + ++ Str, [Type, Module, SrcNode, SrcPartition, TargetNode, - TargetPartition] ++ Args)). + TargetPartition] + ++ Args)). + +-define(LOG_FAIL(Str, Args), + logger:error("~p transfer of ~p from ~p ~p to ~p ~p " + "failed " + ++ Str, + [Type, Module, SrcNode, SrcPartition, TargetNode, + TargetPartition] + ++ Args)). 
%% Accumulator for the visit item HOF -record(ho_acc, - { - ack :: non_neg_integer(), - error :: ok | {error, any()}, - filter :: function(), - module :: module(), - parent :: pid(), - socket :: any(), - src_target :: {non_neg_integer(), non_neg_integer()}, - stats :: #ho_stats{}, - - total_objects :: non_neg_integer(), - total_bytes :: non_neg_integer(), - - use_batching :: boolean(), - - item_queue :: [binary()], - item_queue_length :: non_neg_integer(), - item_queue_byte_size :: non_neg_integer(), - - acksync_threshold :: non_neg_integer(), - acksync_timer :: timer:tref() | undefined, - - type :: ho_type(), - - notsent_acc :: term(), - notsent_fun :: function() | undefined - }). + {ack :: non_neg_integer(), + error :: ok | {error, any()}, filter :: function(), + module :: module(), parent :: pid(), socket :: any(), + src_target :: {non_neg_integer(), non_neg_integer()}, + stats :: #ho_stats{}, + total_objects :: non_neg_integer(), + total_bytes :: non_neg_integer(), + use_batching :: boolean(), item_queue :: [binary()], + item_queue_length :: non_neg_integer(), + item_queue_byte_size :: non_neg_integer(), + acksync_threshold :: non_neg_integer(), + acksync_timer :: timer:tref() | undefined, + type :: ho_type(), notsent_acc :: term(), + notsent_fun :: function() | undefined}). %%%=================================================================== %%% API %%%=================================================================== start_link(TargetNode, Module, {Type, Opts}, Vnode) -> - Pid = spawn_link(fun()->start_fold(TargetNode, - Module, - {Type, Opts}, - Vnode - ) + Pid = spawn_link(fun () -> + start_fold(TargetNode, Module, {Type, Opts}, Vnode) end), {ok, Pid}. 
@@ -88,29 +84,25 @@ start_link(TargetNode, Module, {Type, Opts}, Vnode) -> %%% Private %%%=================================================================== - -start_fold_(TargetNode, Module, Type, Opts, ParentPid, SrcNode, SrcPartition, TargetPartition) -> +start_fold_(TargetNode, Module, Type, Opts, ParentPid, + SrcNode, SrcPartition, TargetPartition) -> %% Give workers one more chance to abort or get a lock or whatever. - FoldOpts = maybe_call_handoff_started(Module, SrcPartition), - + FoldOpts = maybe_call_handoff_started(Module, + SrcPartition), Filter = get_filter(Opts), - [_Name,Host] = string:tokens(atom_to_list(TargetNode), "@"), + [_Name, Host] = string:tokens(atom_to_list(TargetNode), + "@"), {ok, Port} = get_handoff_port(TargetNode), - TNHandoffIP = - case get_handoff_ip(TargetNode) of - error -> - Host; - {ok, "0.0.0.0"} -> - Host; - {ok, Other} -> - Other - end, - SockOpts = [binary, {packet, 4}, {header,1}, {active, false}], - {ok, Socket} = gen_tcp:connect(TNHandoffIP, Port, SockOpts, 15000), - - + TNHandoffIP = case get_handoff_ip(TargetNode) of + error -> Host; + {ok, "0.0.0.0"} -> Host; + {ok, Other} -> Other + end, + SockOpts = [binary, {packet, 4}, {header, 1}, + {active, false}], + {ok, Socket} = gen_tcp:connect(TNHandoffIP, Port, + SockOpts, 15000), RecvTimeout = get_handoff_receive_timeout(), - %% We want to ensure that the node we think we are talking to %% really is the node we expect. %% The remote node will reply with PT_MSG_VERIFY_NODE if it @@ -118,30 +110,30 @@ start_fold_(TargetNode, Module, Type, Opts, ParentPid, SrcNode, SrcPartition, Ta %% If the node does not support this functionality we %% print an error and keep going with our fingers crossed. 
TargetBin = term_to_binary(TargetNode), - VerifyNodeMsg = <>, + VerifyNodeMsg = <<(?PT_MSG_VERIFY_NODE):8, + TargetBin/binary>>, ok = gen_tcp:send(Socket, VerifyNodeMsg), case gen_tcp:recv(Socket, 0, RecvTimeout) of - {ok,[?PT_MSG_VERIFY_NODE | _]} -> ok; - {ok,[?PT_MSG_UNKNOWN | _]} -> - logger:warning("Could not verify identity of peer ~s.", - [TargetNode]), - ok; - {error, timeout} -> exit({shutdown, timeout}); - {error, closed} -> exit({shutdown, wrong_node}) + {ok, [?PT_MSG_VERIFY_NODE | _]} -> ok; + {ok, [?PT_MSG_UNKNOWN | _]} -> + logger:warning("Could not verify identity of peer ~s.", + [TargetNode]), + ok; + {error, timeout} -> exit({shutdown, timeout}); + {error, closed} -> exit({shutdown, wrong_node}) end, - %% Piggyback the sync command from previous releases to send %% the vnode type across. If talking to older nodes they'll %% just do a sync, newer nodes will decode the module name. %% After 0.12.0 the calls can be switched to use PT_MSG_SYNC %% and PT_MSG_CONFIGURE - VMaster = list_to_atom(atom_to_list(Module) ++ "_master"), + VMaster = list_to_atom(atom_to_list(Module) ++ + "_master"), ModBin = atom_to_binary(Module, utf8), - Msg = <>, + Msg = <<(?PT_MSG_OLDSYNC):8, ModBin/binary>>, ok = gen_tcp:send(Socket, Msg), - - AckSyncThreshold = application:get_env(riak_core, handoff_acksync_threshold, 25), - + AckSyncThreshold = application:get_env(riak_core, + handoff_acksync_threshold, 25), %% Now that handoff_concurrency applies to both outbound and %% inbound conns there is a chance that the receiver may %% decide to reject the senders attempt to start a handoff. @@ -150,350 +142,318 @@ start_fold_(TargetNode, Module, Type, Opts, ParentPid, SrcNode, SrcPartition, Ta %% socket at this point is a rejection by the receiver to %% enforce handoff_concurrency. 
case gen_tcp:recv(Socket, 0, RecvTimeout) of - {ok,[?PT_MSG_OLDSYNC|<<"sync">>]} -> ok; - {error, timeout} -> exit({shutdown, timeout}); - {error, closed} -> exit({shutdown, max_concurrency}) + {ok, [?PT_MSG_OLDSYNC | <<"sync">>]} -> ok; + {error, timeout} -> exit({shutdown, timeout}); + {error, closed} -> exit({shutdown, max_concurrency}) end, - - RemoteSupportsBatching = remote_supports_batching(TargetNode), - - logger:info("Starting ~p transfer of ~p from ~p ~p to ~p ~p", - [Type, Module, SrcNode, SrcPartition, - TargetNode, TargetPartition]), - - M = <>, + RemoteSupportsBatching = + remote_supports_batching(TargetNode), + logger:info("Starting ~p transfer of ~p from ~p ~p " + "to ~p ~p", + [Type, Module, SrcNode, SrcPartition, TargetNode, + TargetPartition]), + M = <<(?PT_MSG_INIT):8, TargetPartition:160/integer>>, ok = gen_tcp:send(Socket, M), StartFoldTime = os:timestamp(), - Stats = #ho_stats{interval_end=future_now(get_status_interval())}, + Stats = #ho_stats{interval_end = + future_now(get_status_interval())}, UnsentAcc0 = get_notsent_acc0(Opts), UnsentFun = get_notsent_fun(Opts), - - Req = riak_core_util:make_fold_req( - fun visit_item/3, - #ho_acc{ack=0, - error=ok, - filter=Filter, - module=Module, - parent=ParentPid, - socket=Socket, - src_target={SrcPartition, TargetPartition}, - stats=Stats, - - total_bytes=0, - total_objects=0, - - use_batching=RemoteSupportsBatching, - - item_queue=[], - item_queue_length=0, - item_queue_byte_size=0, - - acksync_threshold=AckSyncThreshold, - - type=Type, - notsent_acc=UnsentAcc0, - notsent_fun=UnsentFun}, - false, - FoldOpts), + Req = riak_core_util:make_fold_req(fun visit_item/3, + #ho_acc{ack = 0, error = ok, + filter = Filter, module = Module, + parent = ParentPid, + socket = Socket, + src_target = + {SrcPartition, + TargetPartition}, + stats = Stats, total_bytes = 0, + total_objects = 0, + use_batching = + RemoteSupportsBatching, + item_queue = [], + item_queue_length = 0, + item_queue_byte_size = 0, + 
acksync_threshold = + AckSyncThreshold, + type = Type, + notsent_acc = UnsentAcc0, + notsent_fun = UnsentFun}, + false, FoldOpts), %% IFF the vnode is using an async worker to perform the fold %% then sync_command will return error on vnode crash, %% otherwise it will wait forever but vnode crash will be %% caught by handoff manager. I know, this is confusing, a %% new handoff system will be written soon enough. - - AccRecord0 = case riak_core_vnode_master:sync_command( - {SrcPartition, SrcNode}, Req, VMaster, infinity) of - #ho_acc{} = Ret -> - Ret; - Ret -> - logger:error("[handoff] Bad handoff record: ~p", - [Ret]), - Ret + AccRecord0 = case + riak_core_vnode_master:sync_command({SrcPartition, + SrcNode}, + Req, VMaster, infinity) + of + #ho_acc{} = Ret -> Ret; + Ret -> + logger:error("[handoff] Bad handoff record: ~p", [Ret]), + Ret end, %% Send any straggler entries remaining in the buffer: - AccRecord = send_objects(AccRecord0#ho_acc.item_queue, AccRecord0), - + AccRecord = send_objects(AccRecord0#ho_acc.item_queue, + AccRecord0), if AccRecord == {error, vnode_shutdown} -> - ?log_info("because the local vnode was shutdown", []), - throw({be_quiet, error, local_vnode_shutdown_requested}); + ?LOG_INFO("because the local vnode was shutdown", []), + throw({be_quiet, error, + local_vnode_shutdown_requested}); true -> - ok % If not #ho_acc, get badmatch below + ok % If not #ho_acc, get badmatch below end, - #ho_acc{ - error=ErrStatus, - module=Module, - parent=ParentPid, - total_objects=TotalObjects, - total_bytes=TotalBytes, - stats=FinalStats, - acksync_timer=TRef, - notsent_acc=NotSentAcc} = AccRecord, - + #ho_acc{error = ErrStatus, module = Module, + parent = ParentPid, total_objects = TotalObjects, + total_bytes = TotalBytes, stats = FinalStats, + acksync_timer = TRef, notsent_acc = NotSentAcc} = + AccRecord, _ = timer:cancel(TRef), case ErrStatus of - ok -> - %% One last sync to make sure the message has been received. 
- %% post-0.14 vnodes switch to handoff to forwarding immediately - %% so handoff_complete can only be sent once all of the data is - %% written. handle_handoff_data is a sync call, so once - %% we receive the sync the remote side will be up to date. - logger:debug("~p ~p Sending final sync", - [SrcPartition, Module]), - ok = gen_tcp:send(Socket, <>), - - case gen_tcp:recv(Socket, 0, RecvTimeout) of - {ok,[?PT_MSG_SYNC|<<"sync">>]} -> - logger:debug("~p ~p Final sync received", - [SrcPartition, Module]); - {error, timeout} -> exit({shutdown, timeout}) - end, - - FoldTimeDiff = end_fold_time(StartFoldTime), - ThroughputBytes = TotalBytes/FoldTimeDiff, - - ok = logger:info("~p transfer of ~p from ~p ~p to ~p ~p" - " completed: sent ~p bytes in ~p of ~p objects" - " in ~p seconds (~p/second)", - [Type, Module, SrcNode, SrcPartition, TargetNode, TargetPartition, - TotalBytes, - FinalStats#ho_stats.objs, TotalObjects, FoldTimeDiff, - ThroughputBytes]), - case Type of - repair -> ok; - resize -> gen_fsm_compat:send_event(ParentPid, {resize_transfer_complete, - NotSentAcc}); - _ -> gen_fsm_compat:send_event(ParentPid, handoff_complete) - end; - {error, ErrReason} -> - if ErrReason == timeout -> - exit({shutdown, timeout}); - true -> - exit({shutdown, {error, ErrReason}}) - end + ok -> + %% One last sync to make sure the message has been received. + %% post-0.14 vnodes switch to handoff to forwarding immediately + %% so handoff_complete can only be sent once all of the data is + %% written. handle_handoff_data is a sync call, so once + %% we receive the sync the remote side will be up to date. 
+ logger:debug("~p ~p Sending final sync", + [SrcPartition, Module]), + ok = gen_tcp:send(Socket, <<(?PT_MSG_SYNC):8>>), + case gen_tcp:recv(Socket, 0, RecvTimeout) of + {ok, [?PT_MSG_SYNC | <<"sync">>]} -> + logger:debug("~p ~p Final sync received", + [SrcPartition, Module]); + {error, timeout} -> exit({shutdown, timeout}) + end, + FoldTimeDiff = end_fold_time(StartFoldTime), + ThroughputBytes = TotalBytes / FoldTimeDiff, + ok = + logger:info("~p transfer of ~p from ~p ~p to ~p ~p " + "completed: sent ~p bytes in ~p of ~p " + "objects in ~p seconds (~p/second)", + [Type, Module, SrcNode, SrcPartition, TargetNode, + TargetPartition, TotalBytes, + FinalStats#ho_stats.objs, TotalObjects, FoldTimeDiff, + ThroughputBytes]), + case Type of + repair -> ok; + resize -> + riak_core_vnode:resize_transfer_complete(ParentPid, + NotSentAcc); + _ -> riak_core_vnode:handoff_complete(ParentPid) + end; + {error, ErrReason} -> + if ErrReason == timeout -> exit({shutdown, timeout}); + true -> exit({shutdown, {error, ErrReason}}) + end end. 
-start_fold(TargetNode, Module, {Type, Opts}, ParentPid) -> +start_fold(TargetNode, Module, {Type, Opts}, + ParentPid) -> SrcNode = node(), SrcPartition = get_src_partition(Opts), TargetPartition = get_target_partition(Opts), - try - start_fold_(TargetNode, Module, Type, Opts, ParentPid, SrcNode, SrcPartition, TargetPartition) + try start_fold_(TargetNode, Module, Type, Opts, + ParentPid, SrcNode, SrcPartition, TargetPartition) catch - exit:{shutdown,max_concurrency} -> - %% Need to fwd the error so the handoff mgr knows - exit({shutdown, max_concurrency}); - exit:{shutdown, timeout} -> - %% A receive timeout during handoff - %% STATS -%% riak_core_stat:update(handoff_timeouts), - ?log_fail("because of TCP recv timeout", []), - exit({shutdown, timeout}); - exit:{shutdown, {error, Reason}} -> - ?log_fail("because of ~p", [Reason]), - gen_fsm:send_event(ParentPid, {handoff_error, - fold_error, Reason}), - exit({shutdown, {error, Reason}}); - throw:{be_quiet, Err, Reason} -> - gen_fsm_compat:send_event(ParentPid, {handoff_error, Err, Reason}); - Err:Reason:Stacktrace -> - ?log_fail("because of ~p:~p ~p", - [Err, Reason, Stacktrace]), - gen_fsm_compat:send_event(ParentPid, {handoff_error, Err, Reason}) - end. + exit:{shutdown, max_concurrency} -> + %% Need to fwd the error so the handoff mgr knows + exit({shutdown, max_concurrency}); + exit:{shutdown, timeout} -> + %% A receive timeout during handoff + %% STATS + %% riak_core_stat:update(handoff_timeouts), + ?LOG_FAIL("because of TCP recv timeout", []), + exit({shutdown, timeout}); + exit:{shutdown, {error, Reason}} -> + ?LOG_FAIL("because of ~p", [Reason]), + riak_core_vnode:handoff_error(ParentPid, fold_error, + Reason), + exit({shutdown, {error, Reason}}); + {be_quiet, Err, Reason} -> + riak_core_vnode:handoff_error(ParentPid, Err, Reason); + Err:Reason:Stacktrace -> + ?LOG_FAIL("because of ~p:~p ~p", + [Err, Reason, Stacktrace]), + riak_core_vnode:handoff_error(ParentPid, Err, Reason) + end. 
start_visit_item_timer() -> - Ival = case application:get_env(riak_core, handoff_receive_timeout, undefined) of - TO when is_integer(TO) -> - erlang:max(1000, TO div 3); - _ -> - 60*1000 + Ival = case application:get_env(riak_core, + handoff_receive_timeout, undefined) + of + TO when is_integer(TO) -> erlang:max(1000, TO div 3); + _ -> 60 * 1000 end, timer:send_interval(Ival, tick_send_sync). -visit_item(K, V, Acc0 = #ho_acc{acksync_threshold = AccSyncThreshold}) -> +visit_item(K, V, + Acc0 = #ho_acc{acksync_threshold = AccSyncThreshold}) -> %% Eventually, a vnode worker proc will be doing this fold, but we don't %% know the pid of that proc ahead of time. So we have to start the %% timer some time after the fold has started execution on that proc %% ... like now, perhaps. Acc = case get(is_visit_item_timer_set) of - undefined -> - put(is_visit_item_timer_set, true), - {ok, TRef} = start_visit_item_timer(), - Acc0#ho_acc{acksync_timer = TRef}; - _ -> - Acc0 + undefined -> + put(is_visit_item_timer_set, true), + {ok, TRef} = start_visit_item_timer(), + Acc0#ho_acc{acksync_timer = TRef}; + _ -> Acc0 end, receive - tick_send_sync -> - visit_item2(K, V, Acc#ho_acc{ack = AccSyncThreshold}) - after 0 -> - visit_item2(K, V, Acc) + tick_send_sync -> + visit_item2(K, V, Acc#ho_acc{ack = AccSyncThreshold}) + after 0 -> visit_item2(K, V, Acc) end. %% When a tcp error occurs, the ErrStatus argument is set to {error, Reason}. %% Since we can't abort the fold, this clause is just a no-op. -visit_item2(_K, _V, Acc=#ho_acc{error={error, _Reason}}) -> +visit_item2(_K, _V, + Acc = #ho_acc{error = {error, _Reason}}) -> %% When a TCP error occurs, #ho_acc.error is set to {error, Reason}. 
throw(Acc); -visit_item2(K, V, Acc = #ho_acc{ack = _AccSyncThreshold, acksync_threshold = _AccSyncThreshold}) -> - #ho_acc{module=Module, - socket=Sock, - src_target={SrcPartition, TargetPartition}, - stats=Stats - } = Acc, - +visit_item2(K, V, + Acc = #ho_acc{ack = _AccSyncThreshold, + acksync_threshold = _AccSyncThreshold}) -> + #ho_acc{module = Module, socket = Sock, + src_target = {SrcPartition, TargetPartition}, + stats = Stats} = + Acc, RecvTimeout = get_handoff_receive_timeout(), - M = <>, + M = <<(?PT_MSG_OLDSYNC):8, "sync">>, NumBytes = byte_size(M), - Stats2 = incr_bytes(Stats, NumBytes), - Stats3 = maybe_send_status({Module, SrcPartition, TargetPartition}, Stats2), - + Stats3 = maybe_send_status({Module, SrcPartition, + TargetPartition}, + Stats2), case gen_tcp:send(Sock, M) of - ok -> - case gen_tcp:recv(Sock, 0, RecvTimeout) of - {ok,[?PT_MSG_OLDSYNC|<<"sync">>]} -> - Acc2 = Acc#ho_acc{ack=0, error=ok, stats=Stats3}, - visit_item2(K, V, Acc2); - {error, Reason} -> - Acc#ho_acc{ack=0, error={error, Reason}, stats=Stats3} - end; - {error, Reason} -> - Acc#ho_acc{ack=0, error={error, Reason}, stats=Stats3} + ok -> + case gen_tcp:recv(Sock, 0, RecvTimeout) of + {ok, [?PT_MSG_OLDSYNC | <<"sync">>]} -> + Acc2 = Acc#ho_acc{ack = 0, error = ok, stats = Stats3}, + visit_item2(K, V, Acc2); + {error, Reason} -> + Acc#ho_acc{ack = 0, error = {error, Reason}, + stats = Stats3} + end; + {error, Reason} -> + Acc#ho_acc{ack = 0, error = {error, Reason}, + stats = Stats3} end; visit_item2(K, V, Acc) -> - #ho_acc{filter=Filter, - module=Module, - total_objects=TotalObjects, - use_batching=UseBatching, - item_queue=ItemQueue, - item_queue_length=ItemQueueLength, - item_queue_byte_size=ItemQueueByteSize, - notsent_fun=NotSentFun, - notsent_acc=NotSentAcc} = Acc, + #ho_acc{filter = Filter, module = Module, + total_objects = TotalObjects, + use_batching = UseBatching, item_queue = ItemQueue, + item_queue_length = ItemQueueLength, + item_queue_byte_size = ItemQueueByteSize, 
+ notsent_fun = NotSentFun, notsent_acc = NotSentAcc} = + Acc, case Filter(K) of - true -> - case Module:encode_handoff_item(K, V) of - corrupted -> - {Bucket, Key} = K, - logger:warning("Unreadable object ~p/~p discarded", - [Bucket, Key]), - Acc; - BinObj -> - - case UseBatching of - true -> - ItemQueue2 = [BinObj | ItemQueue], - ItemQueueLength2 = ItemQueueLength + 1, - ItemQueueByteSize2 = ItemQueueByteSize + byte_size(BinObj), - - Acc2 = Acc#ho_acc{item_queue_length=ItemQueueLength2, - item_queue_byte_size=ItemQueueByteSize2}, - - %% Unit size is bytes: - HandoffBatchThreshold = application:get_env(riak_core, - handoff_batch_threshold, - 1024*1024), - - case ItemQueueByteSize2 =< HandoffBatchThreshold of - true -> Acc2#ho_acc{item_queue=ItemQueue2}; - false -> send_objects(ItemQueue2, Acc2) - end; - _ -> - #ho_acc{ack=Ack, - socket=Sock, - src_target={SrcPartition, TargetPartition}, - stats=Stats, - total_objects=TotalObjects, - total_bytes=TotalBytes} = Acc, - M = <>, - NumBytes = byte_size(M), - - Stats2 = incr_bytes(incr_objs(Stats), NumBytes), - Stats3 = maybe_send_status({Module, SrcPartition, - TargetPartition}, Stats2), - - case gen_tcp:send(Sock, M) of - ok -> - Acc#ho_acc{ack=Ack+1, - error=ok, - stats=Stats3, - total_bytes=TotalBytes+NumBytes, - total_objects=TotalObjects+1}; - {error, Reason} -> - Acc#ho_acc{error={error, Reason}, stats=Stats3} - end - end - end; - false -> - NewNotSentAcc = handle_not_sent_item(NotSentFun, NotSentAcc, K), - Acc#ho_acc{error=ok, - total_objects=TotalObjects+1, - notsent_acc=NewNotSentAcc} + true -> + case Module:encode_handoff_item(K, V) of + corrupted -> + {Bucket, Key} = K, + logger:warning("Unreadable object ~p/~p discarded", + [Bucket, Key]), + Acc; + BinObj -> + case UseBatching of + true -> + ItemQueue2 = [BinObj | ItemQueue], + ItemQueueLength2 = ItemQueueLength + 1, + ItemQueueByteSize2 = ItemQueueByteSize + + byte_size(BinObj), + Acc2 = Acc#ho_acc{item_queue_length = ItemQueueLength2, + item_queue_byte_size 
= + ItemQueueByteSize2}, + %% Unit size is bytes: + HandoffBatchThreshold = application:get_env(riak_core, + handoff_batch_threshold, + 1024 * 1024), + case ItemQueueByteSize2 =< HandoffBatchThreshold of + true -> Acc2#ho_acc{item_queue = ItemQueue2}; + false -> send_objects(ItemQueue2, Acc2) + end; + _ -> + #ho_acc{ack = Ack, socket = Sock, + src_target = {SrcPartition, TargetPartition}, + stats = Stats, total_objects = TotalObjects, + total_bytes = TotalBytes} = + Acc, + M = <<(?PT_MSG_OBJ):8, BinObj/binary>>, + NumBytes = byte_size(M), + Stats2 = incr_bytes(incr_objs(Stats), NumBytes), + Stats3 = maybe_send_status({Module, SrcPartition, + TargetPartition}, + Stats2), + case gen_tcp:send(Sock, M) of + ok -> + Acc#ho_acc{ack = Ack + 1, error = ok, + stats = Stats3, + total_bytes = TotalBytes + NumBytes, + total_objects = TotalObjects + 1}; + {error, Reason} -> + Acc#ho_acc{error = {error, Reason}, stats = Stats3} + end + end + end; + false -> + NewNotSentAcc = handle_not_sent_item(NotSentFun, + NotSentAcc, K), + Acc#ho_acc{error = ok, total_objects = TotalObjects + 1, + notsent_acc = NewNotSentAcc} end. -handle_not_sent_item(NotSentFun, Acc, Key) when is_function(NotSentFun) -> - NotSentFun(Key, Acc); -handle_not_sent_item(undefined, _, _) -> - undefined. +handle_not_sent_item(undefined, _, _) -> undefined; +handle_not_sent_item(NotSentFun, Acc, Key) + when is_function(NotSentFun) -> + NotSentFun(Key, Acc). 
-send_objects([], Acc) -> - Acc; +send_objects([], Acc) -> Acc; send_objects(ItemsReverseList, Acc) -> - Items = lists:reverse(ItemsReverseList), - - #ho_acc{ack=Ack, - module=Module, - socket=Sock, - src_target={SrcPartition, TargetPartition}, - stats=Stats, - - total_objects=TotalObjects, - total_bytes=TotalBytes, - item_queue_length=NObjects - } = Acc, - + #ho_acc{ack = Ack, module = Module, socket = Sock, + src_target = {SrcPartition, TargetPartition}, + stats = Stats, total_objects = TotalObjects, + total_bytes = TotalBytes, + item_queue_length = NObjects} = + Acc, ObjectList = term_to_binary(Items), - - M = <>, - + M = <<(?PT_MSG_BATCH):8, ObjectList/binary>>, NumBytes = byte_size(M), - - Stats2 = incr_bytes(incr_objs(Stats, NObjects), NumBytes), - Stats3 = maybe_send_status({Module, SrcPartition, TargetPartition}, Stats2), - + Stats2 = incr_bytes(incr_objs(Stats, NObjects), + NumBytes), + Stats3 = maybe_send_status({Module, SrcPartition, + TargetPartition}, + Stats2), case gen_tcp:send(Sock, M) of - ok -> - Acc#ho_acc{ack=Ack+1, error=ok, stats=Stats3, - total_objects=TotalObjects+NObjects, - total_bytes=TotalBytes+NumBytes, - item_queue=[], - item_queue_length=0, - item_queue_byte_size=0}; - {error, Reason} -> - Acc#ho_acc{error={error, Reason}, stats=Stats3} + ok -> + Acc#ho_acc{ack = Ack + 1, error = ok, stats = Stats3, + total_objects = TotalObjects + NObjects, + total_bytes = TotalBytes + NumBytes, item_queue = [], + item_queue_length = 0, item_queue_byte_size = 0}; + {error, Reason} -> + Acc#ho_acc{error = {error, Reason}, stats = Stats3} end. get_handoff_ip(Node) when is_atom(Node) -> - case riak_core_util:safe_rpc(Node, riak_core_handoff_listener, get_handoff_ip, [], - infinity) of - {badrpc, _} -> - error; - Res -> - Res + case riak_core_util:safe_rpc(Node, + riak_core_handoff_listener, get_handoff_ip, [], + infinity) + of + {badrpc, _} -> error; + Res -> Res end. 
get_handoff_port(Node) when is_atom(Node) -> - case catch(riak_core_gen_server:call({riak_core_handoff_listener, Node}, handoff_port, infinity)) of - {'EXIT', _} -> - %% Check old location from previous release - riak_core_gen_server:call({riak_kv_handoff_listener, Node}, handoff_port, infinity); - Other -> Other - end. + gen_server:call({riak_core_handoff_listener, Node}, + handoff_port, infinity). get_handoff_receive_timeout() -> - application:get_env(riak_core, handoff_timeout, ?TCP_TIMEOUT). + application:get_env(riak_core, handoff_timeout, + ?TCP_TIMEOUT). end_fold_time(StartFoldTime) -> EndFoldTime = os:timestamp(), @@ -504,6 +464,7 @@ end_fold_time(StartFoldTime) -> %% @doc Produce the value of `now/0' as if it were called `S' seconds %% in the future. -spec future_now(pos_integer()) -> erlang:timestamp(). + future_now(S) -> {Megas, Secs, Micros} = os:timestamp(), {Megas, Secs + S, Micros}. @@ -512,46 +473,54 @@ future_now(S) -> %% %% @doc Check if the given timestamp `TS' has elapsed. -spec is_elapsed(erlang:timestamp()) -> boolean(). -is_elapsed(TS) -> - os:timestamp() >= TS. + +is_elapsed(TS) -> os:timestamp() >= TS. %% @private %% %% @doc Increment `Stats' byte count by `NumBytes'. --spec incr_bytes(ho_stats(), non_neg_integer()) -> NewStats::ho_stats(). -incr_bytes(Stats=#ho_stats{bytes=Bytes}, NumBytes) -> - Stats#ho_stats{bytes=Bytes + NumBytes}. +-spec incr_bytes(ho_stats(), + non_neg_integer()) -> NewStats :: ho_stats(). + +incr_bytes(Stats = #ho_stats{bytes = Bytes}, + NumBytes) -> + Stats#ho_stats{bytes = Bytes + NumBytes}. -incr_objs(Stats) -> - incr_objs(Stats, 1). +incr_objs(Stats) -> incr_objs(Stats, 1). %% @private %% %% @doc Increment `Stats' object count by NObjs: --spec incr_objs(ho_stats(), non_neg_integer()) -> NewStats::ho_stats(). -incr_objs(Stats=#ho_stats{objs=Objs}, NObjs) -> - Stats#ho_stats{objs=Objs+NObjs}. +-spec incr_objs(ho_stats(), + non_neg_integer()) -> NewStats :: ho_stats(). 
+ +incr_objs(Stats = #ho_stats{objs = Objs}, NObjs) -> + Stats#ho_stats{objs = Objs + NObjs}. %% @private %% %% @doc Check if the interval has elapsed and if so send handoff stats %% for `ModSrcTgt' to the manager and return a new stats record %% `NetStats'. --spec maybe_send_status({module(), non_neg_integer(), non_neg_integer()}, - ho_stats()) -> - NewStats::ho_stats(). -maybe_send_status(ModSrcTgt, Stats=#ho_stats{interval_end=IntervalEnd}) -> +-spec maybe_send_status({module(), non_neg_integer(), + non_neg_integer()}, + ho_stats()) -> NewStats :: ho_stats(). + +maybe_send_status(ModSrcTgt, + Stats = #ho_stats{interval_end = IntervalEnd}) -> case is_elapsed(IntervalEnd) of - true -> - Stats2 = Stats#ho_stats{last_update=os:timestamp()}, - riak_core_handoff_manager:status_update(ModSrcTgt, Stats2), - #ho_stats{interval_end=future_now(get_status_interval())}; - false -> - Stats + true -> + Stats2 = Stats#ho_stats{last_update = os:timestamp()}, + riak_core_handoff_manager:status_update(ModSrcTgt, + Stats2), + #ho_stats{interval_end = + future_now(get_status_interval())}; + false -> Stats end. get_status_interval() -> - application:get_env(riak_core, handoff_status_interval, ?STATUS_INTERVAL). + application:get_env(riak_core, handoff_status_interval, + ?STATUS_INTERVAL). get_src_partition(Opts) -> proplists:get_value(src_partition, Opts). @@ -564,15 +533,16 @@ get_notsent_acc0(Opts) -> get_notsent_fun(Opts) -> case proplists:get_value(notsent_fun, Opts) of - none -> fun(_, _) -> undefined end; - Fun -> Fun + none -> fun (_, _) -> undefined end; + Fun -> Fun end. -spec get_filter(proplists:proplist()) -> predicate(). + get_filter(Opts) -> case proplists:get_value(filter, Opts) of - none -> fun(_) -> true end; - Filter -> Filter + none -> fun (_) -> true end; + Filter -> Filter end. 
%% @private @@ -581,17 +551,17 @@ get_filter(Opts) -> %% otherwise fall back to the slower, object-at-a-time path remote_supports_batching(Node) -> - case catch rpc:call(Node, riak_core_handoff_receiver, - supports_batching, []) of - true -> - logger:debug("remote node supports batching, enabling"), - true; - _ -> - %% whatever the problem here, just revert to the old behavior - %% which shouldn't matter too much for any single handoff - logger:debug("remote node doesn't support batching"), - false + supports_batching, []) + of + true -> + logger:debug("remote node supports batching, enabling"), + true; + _ -> + %% whatever the problem here, just revert to the old behavior + %% which shouldn't matter too much for any single handoff + logger:debug("remote node doesn't support batching"), + false end. %% @private @@ -604,20 +574,20 @@ remote_supports_batching(Node) -> %% decision to cancel the handoff or not e.g. get a lock on behalf of %% the process. maybe_call_handoff_started(Module, SrcPartition) -> - case lists:member({handoff_started, 2}, Module:module_info(exports)) of - true -> - WorkerPid = self(), - case Module:handoff_started(SrcPartition, WorkerPid) of - {ok, FoldOpts} -> - FoldOpts; - {error, max_concurrency} -> - %% Handoff of that partition is busy or can't proceed. Stopping with - %% max_concurrency will cause this partition to be retried again later. - exit({shutdown, max_concurrency}); - {error, Error} -> - exit({shutdown, Error}) - end; - false -> - %% optional callback not implemented, so we carry on, w/ no addition fold options - [] + case lists:member({handoff_started, 2}, + Module:module_info(exports)) + of + true -> + WorkerPid = self(), + case Module:handoff_started(SrcPartition, WorkerPid) of + {ok, FoldOpts} -> FoldOpts; + {error, max_concurrency} -> + %% Handoff of that partition is busy or can't proceed. Stopping with + %% max_concurrency will cause this partition to be retried again later. 
+ exit({shutdown, max_concurrency}); + {error, Error} -> exit({shutdown, Error}) + end; + false -> + %% optional callback not implemented, so we carry on, w/ no addition fold options + [] end. diff --git a/src/riak_core_handoff_sender_sup.erl b/src/riak_core_handoff_sender_sup.erl index 0b485f3b8..cbb503c0d 100644 --- a/src/riak_core_handoff_sender_sup.erl +++ b/src/riak_core_handoff_sender_sup.erl @@ -19,25 +19,27 @@ %% ------------------------------------------------------------------- -module(riak_core_handoff_sender_sup). + -behaviour(supervisor). %% callbacks --export([start_link/0, - init/1 - ]). +-export([start_link/0, init/1]). %% API -export([start_sender/5]). -include("riak_core_handoff.hrl"). --define(CHILD(I,Type), {I,{I,start_link,[]},temporary,brutal_kill,Type,[I]}). + +-define(CHILD(I, Type), + {I, {I, start_link, []}, temporary, brutal_kill, Type, + [I]}). %%%=================================================================== %%% API %%%=================================================================== start_link() -> - supervisor:start_link({local,?MODULE},?MODULE,[]). + supervisor:start_link({local, ?MODULE}, ?MODULE, []). %% @doc Start the handoff process for the module (`Module'), partition %% (`Partition'), and vnode (`VNode') from the local node to the @@ -53,16 +55,19 @@ start_link() -> %% for each unsent key. %% * unsent_acc0 - optional. The intial accumulator value passed to unsent_fun %% for the first unsent key --spec start_sender(ho_type(), atom(), term(), pid(), [{atom(), term()}]) -> {ok, pid()}. +-spec start_sender(ho_type(), atom(), term(), pid(), + [{atom(), term()}]) -> {ok, pid()}. + start_sender(Type, Module, TargetNode, VNode, Opts) -> - supervisor:start_child(?MODULE, [TargetNode, Module, {Type, Opts}, VNode]). + supervisor:start_child(?MODULE, + [TargetNode, Module, {Type, Opts}, VNode]). 
%%%=================================================================== %%% Callbacks %%%=================================================================== %% @private -init ([]) -> - {ok,{{simple_one_for_one,10,10}, - [?CHILD(riak_core_handoff_sender,worker) - ]}}. +init([]) -> + {ok, + {{simple_one_for_one, 10, 10}, + [?CHILD(riak_core_handoff_sender, worker)]}}. diff --git a/src/riak_core_handoff_sup.erl b/src/riak_core_handoff_sup.erl index 06fdfa65d..518f89eaf 100644 --- a/src/riak_core_handoff_sup.erl +++ b/src/riak_core_handoff_sup.erl @@ -19,24 +19,25 @@ %% ------------------------------------------------------------------- -module(riak_core_handoff_sup). + -behaviour(supervisor). %% beahvior functions --export([start_link/0, - init/1 - ]). +-export([start_link/0, init/1]). --define(CHILD(I,Type), {I,{I,start_link,[]},permanent,brutal_kill,Type,[I]}). +-define(CHILD(I, Type), + {I, {I, start_link, []}, permanent, brutal_kill, Type, + [I]}). %% begins the supervisor, init/1 will be called -start_link () -> - supervisor:start_link({local,?MODULE},?MODULE,[]). +start_link() -> + supervisor:start_link({local, ?MODULE}, ?MODULE, []). %% @private -init ([]) -> - {ok,{{one_for_all,10,10}, - [?CHILD(riak_core_handoff_receiver_sup,supervisor), - ?CHILD(riak_core_handoff_sender_sup,supervisor), - ?CHILD(riak_core_handoff_listener_sup,supervisor), - ?CHILD(riak_core_handoff_manager,worker) - ]}}. +init([]) -> + {ok, + {{one_for_all, 10, 10}, + [?CHILD(riak_core_handoff_receiver_sup, supervisor), + ?CHILD(riak_core_handoff_sender_sup, supervisor), + ?CHILD(riak_core_handoff_listener_sup, supervisor), + ?CHILD(riak_core_handoff_manager, worker)]}}. diff --git a/src/riak_core_mochiglobal.erl b/src/riak_core_mochiglobal.erl deleted file mode 100644 index c7fde0027..000000000 --- a/src/riak_core_mochiglobal.erl +++ /dev/null @@ -1,107 +0,0 @@ -%% @author Bob Ippolito -%% @copyright 2010 Mochi Media, Inc. 
-%% @doc Abuse module constant pools as a "read-only shared heap" (since erts 5.6) -%% [1]. --module(riak_core_mochiglobal). --author("Bob Ippolito "). --export([get/1, get/2, put/2, delete/1]). - --spec get(atom()) -> any() | undefined. -%% @equiv get(K, undefined) -get(K) -> - get(K, undefined). - --spec get(atom(), T) -> any() | T. -%% @doc Get the term for K or return Default. -get(K, Default) -> - get(K, Default, key_to_module(K)). - -get(_K, Default, Mod) -> - try Mod:term() - catch error:undef -> - Default - end. - --spec put(atom(), any()) -> ok. -%% @doc Store term V at K, replaces an existing term if present. -put(K, V) -> - put(K, V, key_to_module(K)). - -put(_K, V, Mod) -> - Bin = compile(Mod, V), - code:purge(Mod), - {module, Mod} = code:load_binary(Mod, atom_to_list(Mod) ++ ".erl", Bin), - ok. - --spec delete(atom()) -> boolean(). -%% @doc Delete term stored at K, no-op if non-existent. -delete(K) -> - delete(K, key_to_module(K)). - -delete(_K, Mod) -> - code:purge(Mod), - code:delete(Mod). - --spec key_to_module(atom()) -> atom(). -key_to_module(K) -> - list_to_atom("mochiglobal:" ++ atom_to_list(K)). - --spec compile(atom(), any()) -> binary(). -compile(Module, T) -> - {ok, Module, Bin} = compile:forms(forms(Module, T), - [verbose, report_errors]), - Bin. - --spec forms(atom(), any()) -> [erl_syntax:syntaxTree()]. -forms(Module, T) -> - [erl_syntax:revert(X) || X <- term_to_abstract(Module, term, T)]. - --spec term_to_abstract(atom(), atom(), any()) -> [erl_syntax:syntaxTree()]. -term_to_abstract(Module, Getter, T) -> - [%% -module(Module). - erl_syntax:attribute( - erl_syntax:atom(module), - [erl_syntax:atom(Module)]), - %% -export([Getter/0]). - erl_syntax:attribute( - erl_syntax:atom(export), - [erl_syntax:list( - [erl_syntax:arity_qualifier( - erl_syntax:atom(Getter), - erl_syntax:integer(0))])]), - %% Getter() -> T. - erl_syntax:function( - erl_syntax:atom(Getter), - [erl_syntax:clause([], none, [erl_syntax:abstract(T)])])]. 
- -%% -%% Tests -%% --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). -get_put_delete_test() -> - K = '$$test$$mochiglobal', - delete(K), - ?assertEqual( - bar, - get(K, bar)), - try - ?MODULE:put(K, baz), - ?assertEqual( - baz, - get(K, bar)), - ?MODULE:put(K, wibble), - ?assertEqual( - wibble, - ?MODULE:get(K)) - after - delete(K) - end, - ?assertEqual( - bar, - get(K, bar)), - ?assertEqual( - undefined, - ?MODULE:get(K)), - ok. --endif. diff --git a/src/riak_core_node_watcher.erl b/src/riak_core_node_watcher.erl index 21c4fc46e..eecd1c345 100644 --- a/src/riak_core_node_watcher.erl +++ b/src/riak_core_node_watcher.erl @@ -24,78 +24,85 @@ -behaviour(gen_server). -define(DEFAULT_HEALTH_CHECK_INTERVAL, 60000). + %% API --export([start_link/0, - service_up/2, - service_up/3, - service_up/4, - check_health/1, - suspend_health_checks/0, - resume_health_checks/0, - service_down/1, - service_down/2, - node_up/0, - node_down/0, - services/0, services/1, +-export([start_link/0, service_up/2, service_up/3, + service_up/4, check_health/1, suspend_health_checks/0, + resume_health_checks/0, service_down/1, service_down/2, + node_up/0, node_down/0, services/0, services/1, nodes/1]). %% TEST API -ifdef(TEST). --export([avsn/0, - set_broadcast_module/2]). +-ifdef(PROPER). + +-compile(export_all). + +-endif. + +-export([avsn/0, set_broadcast_module/2]). -endif. -%% gen_server callbacks --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). - --record(state, { status = up, - services = [], - health_checks = [], - healths_enabled = true, - peers = [], - avsn = 0, - bcast_tref, - bcast_mod = {gen_server, abcast}}). 
- --record(health_check, { state = 'waiting' :: 'waiting' | 'checking' | 'suspend', - callback :: {atom(), atom(), [any()]}, - service_pid :: pid(), - checking_pid :: pid() | undefined, - health_failures = 0 :: non_neg_integer(), - callback_failures = 0 :: non_neg_integer(), - interval_tref, - %% how many milliseconds to wait after a check has - %% finished before starting a new one - check_interval = ?DEFAULT_HEALTH_CHECK_INTERVAL :: timeout(), - max_callback_failures = 3, - max_health_failures = 1 }). +%% gen_server callbacks +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). + +-record(state, + {status = up, services = [], health_checks = [], + healths_enabled = true, peers = [], avsn = 0, + bcast_tref, bcast_mod = {gen_server, abcast}}). + +-record(health_check, + {state = waiting :: waiting | checking | suspend, + callback :: {atom(), atom(), [any()]}, + service_pid :: pid(), + checking_pid :: pid() | undefined, + health_failures = 0 :: non_neg_integer(), + callback_failures = 0 :: non_neg_integer(), + interval_tref, + %% how many milliseconds to wait after a check has + %% finished before starting a new one + check_interval = ?DEFAULT_HEALTH_CHECK_INTERVAL :: + timeout(), + max_callback_failures = 3, max_health_failures = 1}). %% =================================================================== %% Public API %% =================================================================== start_link() -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + gen_server:start_link({local, ?MODULE}, ?MODULE, [], + []). service_up(Id, Pid) -> - gen_server:call(?MODULE, {service_up, Id, Pid}, infinity). + gen_server:call(?MODULE, {service_up, Id, Pid}, + infinity). %% @doc {@link service_up/4} with default options. %% @see service_up/4 --spec service_up(Id :: atom(), Pid :: pid(), MFA :: mfa()) -> 'ok'. +-spec service_up(Id :: atom(), Pid :: pid(), + MFA :: mfa()) -> ok. 
+ service_up(Id, Pid, MFA) -> service_up(Id, Pid, MFA, []). --type hc_check_interval_opt() :: {check_interval, timeout()}. --type hc_max_callback_fails_opt() :: {max_callback_failures, non_neg_integer()}. --type hc_max_health_fails_opt() :: {max_health_failures, non_neg_integer()}. +-type hc_check_interval_opt() :: {check_interval, + timeout()}. + +-type + hc_max_callback_fails_opt() :: {max_callback_failures, + non_neg_integer()}. + +-type hc_max_health_fails_opt() :: {max_health_failures, + non_neg_integer()}. + -type health_opt() :: hc_check_interval_opt() | - hc_max_callback_fails_opt() | - hc_max_health_fails_opt(). + hc_max_callback_fails_opt() | hc_max_health_fails_opt(). + -type health_opts() :: [health_opt()]. + %% @doc Create a service that can be declared up or down based on the %% result of a function in addition to usual monitoring. The function can %% be set to be called automatically every interval, or only explicitly. @@ -113,21 +120,24 @@ service_up(Id, Pid, MFA) -> %% of the argument list provided. A service added this way is removed like %% any other, using {@link service_down/1}. %% @see service_up/2 --spec service_up(Id :: atom(), Pid :: pid(), Callback :: mfa(), - Options :: health_opts()) -> 'ok'. -service_up(Id, Pid, {Module, Function, Args}, Options) -> +-spec service_up(Id :: atom(), Pid :: pid(), + Callback :: mfa(), Options :: health_opts()) -> ok. + +service_up(Id, Pid, {Module, Function, Args}, + Options) -> gen_server:call(?MODULE, - {service_up, Id, Pid, {Module, Function, Args}, Options}, + {service_up, Id, Pid, {Module, Function, Args}, + Options}, infinity). %% @doc Force a health check for the given service. If the service does %% not have a health check associated with it, this is ignored. Resets the %% automatic health check timer if there is one. %% @see service_up/4 --spec check_health(Service :: atom()) -> 'ok'. +-spec check_health(Service :: atom()) -> ok. + check_health(Service) -> - ?MODULE ! 
{check_health, Service}, - ok. + (?MODULE) ! {check_health, Service}, ok. suspend_health_checks() -> gen_server:call(?MODULE, suspend_healths, infinity). @@ -139,9 +149,9 @@ service_down(Id) -> gen_server:call(?MODULE, {service_down, Id}, infinity). service_down(Id, true) -> - gen_server:call(?MODULE, {service_down, Id, health_check}, infinitiy); -service_down(Id, false) -> - service_down(Id). + gen_server:call(?MODULE, + {service_down, Id, health_check}, infinitiy); +service_down(Id, false) -> service_down(Id). node_up() -> gen_server:call(?MODULE, {node_status, up}, infinity). @@ -154,15 +164,11 @@ services() -> services(Node) -> case check_node_valid(Node) of - true -> - internal_get_services(Node); - _ -> - invalid_node + true -> internal_get_services(Node); + _ -> invalid_node end. -nodes(Service) -> - internal_get_nodes(Service). - +nodes(Service) -> internal_get_nodes(Service). %% =================================================================== %% Test API @@ -170,11 +176,11 @@ nodes(Service) -> -ifdef(TEST). -avsn() -> - gen_server:call(?MODULE, get_avsn, infinity). +avsn() -> gen_server:call(?MODULE, get_avsn, infinity). set_broadcast_module(Module, Fn) -> - gen_server:call(?MODULE, {set_bcast_mod, Module, Fn}, infinity). + gen_server:call(?MODULE, {set_bcast_mod, Module, Fn}, + infinity). -endif. @@ -185,195 +191,182 @@ set_broadcast_module(Module, Fn) -> init([]) -> %% Trap exits so that terminate/2 will get called process_flag(trap_exit, true), - %% Setup callback notification for ring changes; note that we use the %% supervised variation so that the callback gets removed if this process %% exits watch_for_ring_events(), - %% Watch for node up/down events ok = net_kernel:monitor_nodes(true), - %% Setup ETS table to track node status - ?MODULE = ets:new(?MODULE, [protected, {read_concurrency, true}, named_table]), - + (?MODULE) = ets:new(?MODULE, + [protected, {read_concurrency, true}, named_table]), {ok, schedule_broadcast(#state{})}. 
-handle_call({set_bcast_mod, Module, Fn}, _From, State) -> +handle_call({set_bcast_mod, Module, Fn}, _From, + State) -> %% Call available for swapping out how broadcasts are generated - {reply, ok, State#state {bcast_mod = {Module, Fn}}}; - + {reply, ok, State#state{bcast_mod = {Module, Fn}}}; handle_call(get_avsn, _From, State) -> {reply, State#state.avsn, State}; - handle_call({service_up, Id, Pid}, _From, State) -> %% remove any existing health checks S2 = remove_health_check(Id, State), - S3 = add_service(Id, Pid, S2), - {reply, ok, S3}; - -handle_call({service_up, Id, Pid, MFA, Options}, From, State) -> +handle_call({service_up, Id, Pid, MFA, Options}, From, + State) -> %% update the active set of services if needed. - {reply, _, State1} = handle_call({service_up, Id, Pid}, From, State), - + {reply, _, State1} = handle_call({service_up, Id, Pid}, + From, State), State2 = remove_health_check(Id, State1), - - case application:get_env(riak_core, enable_health_checks, true) of - true -> - %% install the health check - CheckInterval = proplists:get_value(check_interval, Options, - ?DEFAULT_HEALTH_CHECK_INTERVAL), - IntervalTref = case CheckInterval of - infinity -> undefined; - N -> erlang:send_after(N, self(), {check_health, Id}) - end, - CheckRec = #health_check{ - callback = MFA, - check_interval = CheckInterval, - service_pid = Pid, - max_health_failures = proplists:get_value(max_health_failures, Options, 1), - max_callback_failures = proplists:get_value(max_callback_failures, Options, 3), - interval_tref = IntervalTref - }, - Healths = orddict:store(Id, CheckRec, State2#state.health_checks); - false -> - Healths = State2#state.health_checks + case application:get_env(riak_core, + enable_health_checks, true) + of + true -> + %% install the health check + CheckInterval = proplists:get_value(check_interval, + Options, + ?DEFAULT_HEALTH_CHECK_INTERVAL), + IntervalTref = case CheckInterval of + infinity -> undefined; + N -> erlang:send_after(N, self(), 
{check_health, Id}) + end, + CheckRec = #health_check{callback = MFA, + check_interval = CheckInterval, + service_pid = Pid, + max_health_failures = + proplists:get_value(max_health_failures, + Options, 1), + max_callback_failures = + proplists:get_value(max_callback_failures, + Options, 3), + interval_tref = IntervalTref}, + Healths = orddict:store(Id, CheckRec, + State2#state.health_checks); + false -> Healths = State2#state.health_checks end, - {reply, ok, State2#state{health_checks = Healths}}; - handle_call({service_down, Id}, _From, State) -> %% Remove health check if any S2 = remove_health_check(Id, State), - S3 = drop_service(Id, S2), - - {reply, ok, S3}; - + {reply, ok, S3}; handle_call({node_status, Status}, _From, State) -> Transition = {State#state.status, Status}, S2 = case Transition of - {up, down} -> %% up -> down - case State#state.healths_enabled of - true -> - Healths = all_health_fsms(suspend, State#state.health_checks); - false -> - Healths = State#state.health_checks - end, - local_delete(State#state { status = down, health_checks = Healths}); - - {down, up} -> %% down -> up - case State#state.healths_enabled of - true -> - Healths = all_health_fsms(resume, State#state.health_checks); - false -> - Healths = State#state.health_checks - end, - local_update(State#state { status = up, health_checks = Healths }); - - {Status, Status} -> %% noop - State - end, + {up, down} -> %% up -> down + case State#state.healths_enabled of + true -> + Healths = all_health_fsms(suspend, + State#state.health_checks); + false -> Healths = State#state.health_checks + end, + local_delete(State#state{status = down, + health_checks = Healths}); + {down, up} -> %% down -> up + case State#state.healths_enabled of + true -> + Healths = all_health_fsms(resume, + State#state.health_checks); + false -> Healths = State#state.health_checks + end, + local_update(State#state{status = up, + health_checks = Healths}); + {Status, Status} -> %% noop + State + end, {reply, ok, 
update_avsn(S2)}; handle_call(services, _From, State) -> - Res = [Service || {{by_service, Service}, Nds} <- ets:tab2list(?MODULE), - Nds /= []], + Res = [Service + || {{by_service, Service}, Nds} + <- ets:tab2list(?MODULE), + Nds /= []], {reply, lists:sort(Res), State}; -handle_call(suspend_healths, _From, State = #state{healths_enabled=false}) -> +handle_call(suspend_healths, _From, + State = #state{healths_enabled = false}) -> {reply, already_disabled, State}; -handle_call(suspend_healths, _From, State = #state{healths_enabled=true}) -> +handle_call(suspend_healths, _From, + State = #state{healths_enabled = true}) -> logger:info("suspending all health checks"), - Healths = all_health_fsms(suspend, State#state.health_checks), - {reply, ok, update_avsn(State#state{health_checks = Healths, healths_enabled = false})}; -handle_call(resume_healths, _From, State = #state{healths_enabled=true}) -> + Healths = all_health_fsms(suspend, + State#state.health_checks), + {reply, ok, + update_avsn(State#state{health_checks = Healths, + healths_enabled = false})}; +handle_call(resume_healths, _From, + State = #state{healths_enabled = true}) -> {reply, already_enabled, State}; -handle_call(resume_healths, _From, State = #state{healths_enabled=false}) -> +handle_call(resume_healths, _From, + State = #state{healths_enabled = false}) -> logger:info("resuming all health checks"), - Healths = all_health_fsms(resume, State#state.health_checks), - {reply, ok, update_avsn(State#state{health_checks = Healths, healths_enabled = true})}. - + Healths = all_health_fsms(resume, + State#state.health_checks), + {reply, ok, + update_avsn(State#state{health_checks = Healths, + healths_enabled = true})}. handle_cast({ring_update, R}, State) -> %% Ring has changed; determine what peers are new to us %% and broadcast out current status to those peers. 
- Peers0 = ordsets:from_list(riak_core_ring:all_members(R)), + Peers0 = + ordsets:from_list(riak_core_ring:all_members(R)), Peers = ordsets:del_element(node(), Peers0), - S2 = peers_update(Peers, State), {noreply, update_avsn(S2)}; - handle_cast({up, Node, Services}, State) -> S2 = node_up(Node, Services, State), {noreply, update_avsn(S2)}; - handle_cast({down, Node}, State) -> - node_down(Node, State), - {noreply, update_avsn(State)}; - + node_down(Node, State), {noreply, update_avsn(State)}; handle_cast({health_check_result, Pid, R}, State) -> Service = erlang:erase(Pid), - State2 = handle_check_msg({result, Pid, R}, Service, State), + State2 = handle_check_msg({result, Pid, R}, Service, + State), {noreply, State2}. handle_info({nodeup, _Node}, State) -> %% Ignore node up events; nothing to do here... {noreply, State}; - handle_info({nodedown, Node}, State) -> - node_down(Node, State), - {noreply, update_avsn(State)}; - + node_down(Node, State), {noreply, update_avsn(State)}; handle_info({'DOWN', Mref, _, _Pid, _Info}, State) -> %% A sub-system monitored process has terminated. Identify %% the sub-system in question and notify our peers. 
case erlang:get(Mref) of - undefined -> - %% No entry found for this monitor; ignore the message - {noreply, update_avsn(State)}; - - Id -> - %% Remove the id<->mref entries in the pdict - delete_service_mref(Id), - - %% remove any health checks in place - S2 = remove_health_check(Id, State), - - %% Update our list of active services and ETS table - Services = ordsets:del_element(Id, State#state.services), - S3 = local_update(S2#state { services = Services }), - {noreply, update_avsn(S3)} + undefined -> + %% No entry found for this monitor; ignore the message + {noreply, update_avsn(State)}; + Id -> + %% Remove the id<->mref entries in the pdict + delete_service_mref(Id), + %% remove any health checks in place + S2 = remove_health_check(Id, State), + %% Update our list of active services and ETS table + Services = ordsets:del_element(Id, + State#state.services), + S3 = local_update(S2#state{services = Services}), + {noreply, update_avsn(S3)} end; - handle_info({'EXIT', Pid, _Cause} = Msg, State) -> Service = erlang:erase(Pid), State2 = handle_check_msg(Msg, Service, State), {noreply, State2}; - handle_info({check_health, Id}, State) -> State2 = handle_check_msg(check_health, Id, State), {noreply, State2}; - handle_info({gen_event_EXIT, _, _}, State) -> %% Ring event handler has been removed for some reason; re-register watch_for_ring_events(), {noreply, update_avsn(State)}; - handle_info(broadcast, State) -> - S2 = broadcast(State#state.peers, State), - {noreply, S2}. - + S2 = broadcast(State#state.peers, State), {noreply, S2}. terminate(_Reason, State) -> %% Let our peers know that we are shutting down - broadcast(State#state.peers, State#state { status = down }). - - -code_change(_OldVsn, State, _Extra) -> - {ok, State}. - + broadcast(State#state.peers, + State#state{status = down}). +code_change(_OldVsn, State, _Extra) -> {ok, State}. 
%% ==================================================================== %% Internal functions @@ -385,98 +378,81 @@ check_node_valid(Node) -> lists:member(Node, Members). update_avsn(State) -> - State#state { avsn = State#state.avsn + 1 }. + State#state{avsn = State#state.avsn + 1}. watch_for_ring_events() -> Self = self(), - Fn = fun(R) -> - gen_server:cast(Self, {ring_update, R}) + Fn = fun (R) -> gen_server:cast(Self, {ring_update, R}) end, riak_core_ring_events:add_sup_callback(Fn). delete_service_mref(Id) -> %% Cleanup the monitor if one exists case erlang:get(Id) of - undefined -> - ok; - Mref -> - erlang:erase(Mref), - erlang:erase(Id), - erlang:demonitor(Mref) + undefined -> ok; + Mref -> + erlang:erase(Mref), + erlang:erase(Id), + erlang:demonitor(Mref) end. - broadcast(Nodes, State) -> - case (State#state.status) of - up -> - Msg = {up, node(), State#state.services}; - down -> - Msg = {down, node()} + case State#state.status of + up -> Msg = {up, node(), State#state.services}; + down -> Msg = {down, node()} end, - {Mod, Fn} = State#state.bcast_mod, - Mod:Fn(Nodes, ?MODULE, Msg), + {Module, Fn} = State#state.bcast_mod, + Module:Fn(Nodes, ?MODULE, Msg), schedule_broadcast(State). schedule_broadcast(State) -> - case (State#state.bcast_tref) of - undefined -> - ok; - OldTref -> - _ = erlang:cancel_timer(OldTref), - ok + case State#state.bcast_tref of + undefined -> ok; + OldTref -> _ = erlang:cancel_timer(OldTref), ok end, - {ok, Interval} = application:get_env(riak_core, gossip_interval), + {ok, Interval} = application:get_env(riak_core, + gossip_interval), Tref = erlang:send_after(Interval, self(), broadcast), - State#state { bcast_tref = Tref }. + State#state{bcast_tref = Tref}. is_peer(Node, State) -> ordsets:is_element(Node, State#state.peers). -is_node_up(Node) -> - ets:member(?MODULE, Node). - +is_node_up(Node) -> ets:member(?MODULE, Node). 
node_up(Node, Services, State) -> case is_peer(Node, State) of - true -> - %% Before we alter the ETS table, see if this node was previously - %% down. In that situation, we'll go ahead and broadcast out. - S2 = case is_node_up(Node) of - false -> - broadcast([Node], State); - true -> - State - end, - - case node_update(Node, Services) of - [] -> - ok; - AffectedServices -> - riak_core_node_watcher_events:service_update(AffectedServices) - end, - S2; - - false -> - State + true -> + %% Before we alter the ETS table, see if this node was previously + %% down. In that situation, we'll go ahead and broadcast out. + S2 = case is_node_up(Node) of + false -> broadcast([Node], State); + true -> State + end, + case node_update(Node, Services) of + [] -> ok; + AffectedServices -> + riak_core_node_watcher_events:service_update(AffectedServices) + end, + S2; + false -> State end. node_down(Node, State) -> case is_peer(Node, State) of - true -> - case node_delete(Node) of - [] -> - ok; - AffectedServices -> - riak_core_node_watcher_events:service_update(AffectedServices) - end; - false -> - ok + true -> + case node_delete(Node) of + [] -> ok; + AffectedServices -> + riak_core_node_watcher_events:service_update(AffectedServices) + end; + false -> ok end. - node_delete(Node) -> Services = internal_get_services(Node), - _ = [internal_delete(Node, Service) || Service <- Services], + _ = [internal_delete(Node, Service) + || Service <- Services], ets:delete(?MODULE, Node), Services. @@ -485,172 +461,163 @@ node_update(Node, Services) -> %% know and determine what's changed (if anything). 
Now = riak_core_util:moment(), NewStatus = ordsets:from_list(Services), - OldStatus = ordsets:from_list(internal_get_services(Node)), - - Added = ordsets:subtract(NewStatus, OldStatus), - Deleted = ordsets:subtract(OldStatus, NewStatus), - + OldStatus = + ordsets:from_list(internal_get_services(Node)), + Added = ordsets:subtract(NewStatus, OldStatus), + Deleted = ordsets:subtract(OldStatus, NewStatus), %% Update ets table with changes; make sure to touch unchanged %% service with latest timestamp _ = [internal_delete(Node, Ss) || Ss <- Deleted], _ = [internal_insert(Node, Ss) || Ss <- Added], - %% Keep track of the last time we recv'd data from a node ets:insert(?MODULE, {Node, Now}), - %% Return the list of affected services (added or deleted) ordsets:union(Added, Deleted). -local_update(#state { status = down } = State) -> +local_update(#state{status = down} = State) -> %% Ignore subsystem changes when we're marked as down State; local_update(State) -> %% Update our local ETS table case node_update(node(), State#state.services) of - [] -> - %% No material changes; no local notification necessary - ok; - - AffectedServices -> - %% Generate a local notification about the affected services and - %% also broadcast our status - riak_core_node_watcher_events:service_update(AffectedServices) + [] -> + %% No material changes; no local notification necessary + ok; + AffectedServices -> + %% Generate a local notification about the affected services and + %% also broadcast our status + riak_core_node_watcher_events:service_update(AffectedServices) end, broadcast(State#state.peers, State). 
local_delete(State) -> case node_delete(node()) of - [] -> - %% No services changed; no local notification required - ok; - - AffectedServices -> - riak_core_node_watcher_events:service_update(AffectedServices) + [] -> + %% No services changed; no local notification required + ok; + AffectedServices -> + riak_core_node_watcher_events:service_update(AffectedServices) end, broadcast(State#state.peers, State). peers_update(NewPeers, State) -> %% Identify what peers have been added and deleted - Added = ordsets:subtract(NewPeers, State#state.peers), + Added = ordsets:subtract(NewPeers, State#state.peers), Deleted = ordsets:subtract(State#state.peers, NewPeers), - %% For peers that have been deleted, remove their entries from %% the ETS table; we no longer care about their status - Services0 = (lists:foldl(fun(Node, Acc) -> - S = node_delete(Node), - S ++ Acc - end, [], Deleted)), + Services0 = lists:foldl(fun (Node, Acc) -> + S = node_delete(Node), S ++ Acc + end, + [], Deleted), Services = ordsets:from_list(Services0), - %% Notify local parties if any services are affected by this change case Services of - [] -> - ok; - _ -> - riak_core_node_watcher_events:service_update(Services) + [] -> ok; + _ -> + riak_core_node_watcher_events:service_update(Services) end, - %% Broadcast our current status to new peers - broadcast(Added, State#state { peers = NewPeers }). + broadcast(Added, State#state{peers = NewPeers}). internal_delete(Node, Service) -> Svcs = internal_get_services(Node), - ets:insert(?MODULE, {{by_node, Node}, Svcs -- [Service]}), + ets:insert(?MODULE, + {{by_node, Node}, Svcs -- [Service]}), Nds = internal_get_nodes(Service), - ets:insert(?MODULE, {{by_service, Service}, Nds -- [Node]}). + ets:insert(?MODULE, + {{by_service, Service}, Nds -- [Node]}). 
internal_insert(Node, Service) -> %% Remove Service & node before adding: avoid accidental duplicates Svcs = internal_get_services(Node) -- [Service], - ets:insert(?MODULE, {{by_node, Node}, [Service|Svcs]}), + ets:insert(?MODULE, + {{by_node, Node}, [Service | Svcs]}), Nds = internal_get_nodes(Service) -- [Node], - ets:insert(?MODULE, {{by_service, Service}, [Node|Nds]}). + ets:insert(?MODULE, + {{by_service, Service}, [Node | Nds]}). internal_get_services(Node) -> case ets:lookup(?MODULE, {by_node, Node}) of - [{{by_node, Node}, Ss}] -> - Ss; - [] -> - [] + [{{by_node, Node}, Ss}] -> Ss; + [] -> [] end. internal_get_nodes(Service) -> case ets:lookup(?MODULE, {by_service, Service}) of - [{{by_service, Service}, Ns}] -> - Ns; - [] -> - [] + [{{by_service, Service}, Ns}] -> Ns; + [] -> [] end. add_service(ServiceId, Pid, State) -> %% Update the set of active services locally - Services = ordsets:add_element(ServiceId, State#state.services), - S2 = State#state { services = Services }, - + Services = ordsets:add_element(ServiceId, + State#state.services), + S2 = State#state{services = Services}, %% Remove any existing mrefs for this service delete_service_mref(ServiceId), - %% Setup a monitor for the Pid representing this service Mref = erlang:monitor(process, Pid), erlang:put(Mref, ServiceId), erlang:put(ServiceId, Mref), - %% Update our local ETS table and broadcast S3 = local_update(S2), update_avsn(S3). drop_service(ServiceId, State) -> %% Update the set of active services locally - Services = ordsets:del_element(ServiceId, State#state.services), - S2 = State#state { services = Services }, - + Services = ordsets:del_element(ServiceId, + State#state.services), + S2 = State#state{services = Services}, %% Remove any existing mrefs for this service delete_service_mref(ServiceId), - %% Update local ETS table and broadcast S3 = local_update(S2), - update_avsn(S3). 
-handle_check_msg(_Msg, undefined, State) -> - State; -handle_check_msg(_Msg, _ServiceId, #state{status = down} = State) -> +handle_check_msg(_Msg, undefined, State) -> State; +handle_check_msg(_Msg, _ServiceId, + #state{status = down} = State) -> %% most likely a late message State; handle_check_msg(Msg, ServiceId, State) -> - case orddict:find(ServiceId, State#state.health_checks) of - error -> - State; - {ok, Check} -> - CheckReturn = health_fsm(Msg, ServiceId, Check), - handle_check_return(CheckReturn, ServiceId, State) + case orddict:find(ServiceId, State#state.health_checks) + of + error -> State; + {ok, Check} -> + CheckReturn = health_fsm(Msg, ServiceId, Check), + handle_check_return(CheckReturn, ServiceId, State) end. -handle_check_return({remove, _Check}, ServiceId, State) -> - Healths = orddict:erase(ServiceId, State#state.health_checks), +handle_check_return({remove, _Check}, ServiceId, + State) -> + Healths = orddict:erase(ServiceId, + State#state.health_checks), State#state{health_checks = Healths}; handle_check_return({ok, Check}, ServiceId, State) -> - Healths = orddict:store(ServiceId, Check, State#state.health_checks), + Healths = orddict:store(ServiceId, Check, + State#state.health_checks), State#state{health_checks = Healths}; handle_check_return({up, Check}, ServiceId, State) -> #health_check{service_pid = Pid} = Check, - Healths = orddict:store(ServiceId, Check, State#state.health_checks), + Healths = orddict:store(ServiceId, Check, + State#state.health_checks), S2 = State#state{health_checks = Healths}, add_service(ServiceId, Pid, S2); handle_check_return({down, Check}, ServiceId, State) -> - Healths = orddict:store(ServiceId, Check, State#state.health_checks), + Healths = orddict:store(ServiceId, Check, + State#state.health_checks), S2 = State#state{health_checks = Healths}, drop_service(ServiceId, S2). 
remove_health_check(ServiceId, State) -> #state{health_checks = Healths} = State, Healths2 = case orddict:find(ServiceId, Healths) of - error -> - Healths; - {ok, Check} -> - {_, _} = health_fsm(remove, ServiceId, Check), - orddict:erase(ServiceId, Healths) - end, + error -> Healths; + {ok, Check} -> + {_, _} = health_fsm(remove, ServiceId, Check), + orddict:erase(ServiceId, Healths) + end, State#state{health_checks = Healths2}. %% health checks are an fsm to make mental modeling easier. @@ -664,157 +631,167 @@ remove_health_check(ServiceId, State) -> %% remove health check %% health check finished -health_fsm(Msg, Service, #health_check{state = StateName} = Check) -> - {Reply, NextState, Check2} = health_fsm(StateName, Msg, Service, Check), +health_fsm(Msg, Service, + #health_check{state = StateName} = Check) -> + {Reply, NextState, Check2} = health_fsm(StateName, Msg, + Service, Check), Check3 = Check2#health_check{state = NextState}, {Reply, Check3}. %% suspend state health_fsm(suspend, resume, Service, InCheck) -> - #health_check{health_failures = N, check_interval = V} = InCheck, + #health_check{health_failures = N, check_interval = V} = + InCheck, Tref = next_health_tref(N, V, Service), - OutCheck = InCheck#health_check{ - interval_tref = Tref - }, + OutCheck = InCheck#health_check{interval_tref = Tref}, {ok, waiting, OutCheck}; - health_fsm(suspend, remove, _Service, InCheck) -> {remove, suspend, InCheck}; - %% message handling when checking state health_fsm(checking, suspend, _Service, InCheck) -> #health_check{checking_pid = Pid} = InCheck, erlang:erase(Pid), - {ok, suspend, InCheck#health_check{checking_pid = undefined}}; - + {ok, suspend, + InCheck#health_check{checking_pid = undefined}}; health_fsm(checking, check_health, _Service, InCheck) -> {ok, checking, InCheck}; - health_fsm(checking, remove, _Service, InCheck) -> {remove, checking, InCheck}; - -health_fsm(checking, {result, Pid, Cause}, Service, #health_check{checking_pid = Pid} = InCheck) -> 
+health_fsm(checking, {result, Pid, Cause}, Service, + #health_check{checking_pid = Pid} = InCheck) -> %% handle result from checking pid - #health_check{health_failures = HPFails, max_health_failures = HPMaxFails} = InCheck, - {Reply, HPFails1} = handle_fsm_exit(Cause, HPFails, HPMaxFails), - Tref = next_health_tref(HPFails1, InCheck#health_check.check_interval, Service), - OutCheck = InCheck#health_check{ - checking_pid = undefined, - health_failures = HPFails1, - callback_failures = 0, - interval_tref = Tref - }, + #health_check{health_failures = HPFails, + max_health_failures = HPMaxFails} = + InCheck, + {Reply, HPFails1} = handle_fsm_exit(Cause, HPFails, + HPMaxFails), + Tref = next_health_tref(HPFails1, + InCheck#health_check.check_interval, Service), + OutCheck = InCheck#health_check{checking_pid = + undefined, + health_failures = HPFails1, + callback_failures = 0, + interval_tref = Tref}, {Reply, waiting, OutCheck}; - -health_fsm(checking, {'EXIT', Pid, Cause}, Service, #health_check{checking_pid = Pid} = InCheck) - when Cause =/= normal -> - logger:error("health check process for ~p error'ed: ~p", [Service, Cause]), +health_fsm(checking, {'EXIT', Pid, Cause}, Service, + #health_check{checking_pid = Pid} = InCheck) + when Cause =/= normal -> + logger:error("health check process for ~p error'ed: " + " ~p", + [Service, Cause]), Fails = InCheck#health_check.callback_failures + 1, - if - Fails == InCheck#health_check.max_callback_failures -> - logger:error("health check callback for ~p failed too " - "many times, disabling.", [Service]), - {down, suspend, InCheck#health_check{checking_pid = undefined, - callback_failures = Fails}}; - Fails < InCheck#health_check.max_callback_failures -> - #health_check{health_failures = N, check_interval = Inter} = InCheck, - Tref = next_health_tref(N, Inter, Service), - OutCheck = InCheck#health_check{checking_pid = undefined, - callback_failures = Fails, interval_tref = Tref}, - {ok, waiting, OutCheck}; - true -> - %% likely 
a late message, or a faker - {ok, suspend, InCheck#health_check{checking_pid = undefined, - callback_failures = Fails}} + if Fails == + InCheck#health_check.max_callback_failures -> + logger:error("health check callback for ~p failed " + "too many times, disabling.", + [Service]), + {down, suspend, + InCheck#health_check{checking_pid = undefined, + callback_failures = Fails}}; + Fails < InCheck#health_check.max_callback_failures -> + #health_check{health_failures = N, + check_interval = Inter} = + InCheck, + Tref = next_health_tref(N, Inter, Service), + OutCheck = InCheck#health_check{checking_pid = + undefined, + callback_failures = Fails, + interval_tref = Tref}, + {ok, waiting, OutCheck}; + true -> + %% likely a late message, or a faker + {ok, suspend, + InCheck#health_check{checking_pid = undefined, + callback_failures = Fails}} end; - %% message handling when in a waiting state health_fsm(waiting, suspend, _Service, InCheck) -> case InCheck#health_check.interval_tref of - undefined -> ok; - _ -> - _ = erlang:cancel_timer(InCheck#health_check.interval_tref), - ok + undefined -> ok; + _ -> + _ = + erlang:cancel_timer(InCheck#health_check.interval_tref), + ok end, - {ok, suspend, InCheck#health_check{interval_tref = undefined}}; - + {ok, suspend, + InCheck#health_check{interval_tref = undefined}}; health_fsm(waiting, check_health, Service, InCheck) -> InCheck1 = start_health_check(Service, InCheck), {ok, checking, InCheck1}; - health_fsm(waiting, remove, _Service, InCheck) -> case InCheck#health_check.interval_tref of - undefined -> ok; - Tref -> - _ = erlang:cancel_timer(Tref), - ok + undefined -> ok; + Tref -> _ = erlang:cancel_timer(Tref), ok end, - OutCheck = InCheck#health_check{interval_tref = undefined}, + OutCheck = InCheck#health_check{interval_tref = + undefined}, {remove, waiting, OutCheck}; - %% fallthrough handling health_fsm(StateName, _Msg, _Service, Health) -> {ok, StateName, Health}. 
-handle_fsm_exit(true, HPFails, MaxHPFails) when HPFails >= MaxHPFails -> +handle_fsm_exit(true, HPFails, MaxHPFails) + when HPFails >= MaxHPFails -> %% service was failed, but recovered {up, 0}; - -handle_fsm_exit(true, HPFails, MaxHPFails) when HPFails < MaxHPFails -> +handle_fsm_exit(true, HPFails, MaxHPFails) + when HPFails < MaxHPFails -> %% service never fully failed {ok, 0}; - -handle_fsm_exit(false, HPFails, MaxHPFails) when HPFails + 1 == MaxHPFails -> +handle_fsm_exit(false, HPFails, MaxHPFails) + when HPFails + 1 == MaxHPFails -> %% service has failed enough to go down {down, HPFails + 1}; - handle_fsm_exit(false, HPFails, __) -> %% all other cases handled, this is health continues to fail {ok, HPFails + 1}. -start_health_check(Service, #health_check{checking_pid = undefined} = CheckRec) -> +start_health_check(Service, + #health_check{checking_pid = undefined} = CheckRec) -> {Mod, Func, Args} = CheckRec#health_check.callback, Pid = CheckRec#health_check.service_pid, case CheckRec#health_check.interval_tref of - undefined -> ok; - Tref -> - _ = erlang:cancel_timer(Tref), - ok + undefined -> ok; + Tref -> _ = erlang:cancel_timer(Tref), ok end, - CheckingPid = proc_lib:spawn_link(fun() -> - case erlang:apply(Mod, Func, [Pid | Args]) of - R when R =:= true orelse R =:= false -> - health_check_result(self(), R); - Else -> exit(Else) - end - end), + CheckingPid = proc_lib:spawn_link(fun () -> + case erlang:apply(Mod, Func, + [Pid | Args]) + of + R + when R =:= true orelse + R =:= false -> + health_check_result(self(), + R); + Else -> exit(Else) + end + end), erlang:put(CheckingPid, Service), CheckRec#health_check{state = checking, checking_pid = CheckingPid, interval_tref = undefined}; -start_health_check(_Service, Check) -> - Check. +start_health_check(_Service, Check) -> Check. health_check_result(CheckPid, Result) -> - gen_server:cast(?MODULE, {health_check_result, CheckPid, Result}). + gen_server:cast(?MODULE, + {health_check_result, CheckPid, Result}). 
-next_health_tref(_, infinity, _) -> - undefined; +next_health_tref(_, infinity, _) -> undefined; next_health_tref(N, V, Service) -> Time = determine_time(N, V), - erlang:send_after(Time, self(), {check_health, Service}). + erlang:send_after(Time, self(), + {check_health, Service}). all_health_fsms(Msg, Healths) -> - [begin - {ok, C1} = health_fsm(Msg, S, C), - {S, C1} - end || {S, C} <- Healths]. + [begin {ok, C1} = health_fsm(Msg, S, C), {S, C1} end + || {S, C} <- Healths]. -determine_time(Failures, BaseInterval) when Failures < 4 -> +determine_time(Failures, BaseInterval) + when Failures < 4 -> BaseInterval; - -determine_time(Failures, BaseInterval) when Failures < 11 -> - erlang:trunc(BaseInterval * (math:pow(Failures, 1.3))); - -determine_time(Failures, BaseInterval) when Failures > 10 -> +determine_time(Failures, BaseInterval) + when Failures < 11 -> + erlang:trunc(BaseInterval * math:pow(Failures, 1.3)); +determine_time(Failures, BaseInterval) + when Failures > 10 -> BaseInterval * 20. diff --git a/src/riak_core_node_watcher_events.erl b/src/riak_core_node_watcher_events.erl index bbac031f5..a141c9bc2 100644 --- a/src/riak_core_node_watcher_events.erl +++ b/src/riak_core_node_watcher_events.erl @@ -24,27 +24,22 @@ -behaviour(gen_event). %% API --export([start_link/0, - add_handler/2, - add_sup_handler/2, - add_guarded_handler/2, - add_callback/1, - add_sup_callback/1, - add_guarded_callback/1, +-export([start_link/0, add_handler/2, add_sup_handler/2, + add_guarded_handler/2, add_callback/1, + add_sup_callback/1, add_guarded_callback/1, service_update/1]). %% gen_event callbacks -export([init/1, handle_event/2, handle_call/2, handle_info/2, terminate/2, code_change/3]). --record(state, { callback }). +-record(state, {callback}). %% =================================================================== %% API functions %% =================================================================== -start_link() -> - gen_event:start_link({local, ?MODULE}). 
+start_link() -> gen_event:start_link({local, ?MODULE}). add_handler(Handler, Args) -> gen_event:add_handler(?MODULE, Handler, Args). @@ -53,21 +48,24 @@ add_sup_handler(Handler, Args) -> gen_event:add_sup_handler(?MODULE, Handler, Args). add_guarded_handler(Handler, Args) -> - riak_core:add_guarded_event_handler(?MODULE, Handler, Args). + riak_core:add_guarded_event_handler(?MODULE, Handler, + Args). add_callback(Fn) when is_function(Fn) -> - gen_event:add_handler(?MODULE, {?MODULE, make_ref()}, [Fn]). + gen_event:add_handler(?MODULE, {?MODULE, make_ref()}, + [Fn]). add_sup_callback(Fn) when is_function(Fn) -> - gen_event:add_sup_handler(?MODULE, {?MODULE, make_ref()}, [Fn]). + gen_event:add_sup_handler(?MODULE, + {?MODULE, make_ref()}, [Fn]). add_guarded_callback(Fn) when is_function(Fn) -> - riak_core:add_guarded_event_handler(?MODULE, {?MODULE, make_ref()}, [Fn]). + riak_core:add_guarded_event_handler(?MODULE, + {?MODULE, make_ref()}, [Fn]). service_update(Services) -> gen_event:notify(?MODULE, {service_update, Services}). - %% =================================================================== %% gen_event callbacks %% =================================================================== @@ -75,21 +73,15 @@ service_update(Services) -> init([Fn]) -> %% Get the initial list of available services Fn(riak_core_node_watcher:services()), - {ok, #state { callback = Fn }}. + {ok, #state{callback = Fn}}. handle_event({service_update, Services}, State) -> - (State#state.callback)(Services), - {ok, State}. - -handle_call(_Request, State) -> - {ok, ok, State}. + (State#state.callback)(Services), {ok, State}. -handle_info(_Info, State) -> - {ok, State}. +handle_call(_Request, State) -> {ok, ok, State}. -terminate(_Reason, _State) -> - ok. +handle_info(_Info, State) -> {ok, State}. -code_change(_OldVsn, State, _Extra) -> - {ok, State}. +terminate(_Reason, _State) -> ok. +code_change(_OldVsn, State, _Extra) -> {ok, State}. 
diff --git a/src/riak_core_priority_queue.erl b/src/riak_core_priority_queue.erl index b2cea75e4..b3cfad917 100644 --- a/src/riak_core_priority_queue.erl +++ b/src/riak_core_priority_queue.erl @@ -52,242 +52,256 @@ %% calls into the same function knowing that ordinary queues represent %% a base case. - -module(riak_core_priority_queue). --export([new/0, is_queue/1, is_empty/1, len/1, to_list/1, in/2, in/3, - out/1, out/2, pout/1, join/2]). +-export([new/0, is_queue/1, is_empty/1, len/1, + to_list/1, in/2, in/3, out/1, out/2, pout/1, join/2]). %%---------------------------------------------------------------------------- -type priority() :: integer(). + -type squeue() :: {queue, [any()], [any()]}. --type pqueue() :: squeue() | {pqueue, [{priority(), squeue()}]}. +-type pqueue() :: squeue() | + {pqueue, [{priority(), squeue()}]}. %%---------------------------------------------------------------------------- -spec new() -> pqueue(). -new() -> {queue, [], []}. +new() -> {queue, [], []}. -spec is_queue(any()) -> boolean(). + is_queue({queue, R, F}) when is_list(R), is_list(F) -> true; is_queue({pqueue, Queues}) when is_list(Queues) -> - lists:all(fun ({P, Q}) -> is_integer(P) andalso is_queue(Q) end, Queues); -is_queue(_) -> - false. + lists:all(fun ({P, Q}) -> + is_integer(P) andalso is_queue(Q) + end, + Queues); +is_queue(_) -> false. -spec is_empty(pqueue()) -> boolean(). -is_empty({queue, [], []}) -> - true; -is_empty(_) -> - false. + +is_empty({queue, [], []}) -> true; +is_empty(_) -> false. -spec len(pqueue()) -> non_neg_integer(). + len({queue, R, F}) when is_list(R), is_list(F) -> length(R) + length(F); len({pqueue, Queues}) -> lists:sum([len(Q) || {_, Q} <- Queues]). -spec to_list(pqueue()) -> [{priority(), any()}]. 
-to_list({queue, In, Out}) when is_list(In), is_list(Out) -> + +to_list({queue, In, Out}) + when is_list(In), is_list(Out) -> [{0, V} || V <- Out ++ lists:reverse(In, [])]; to_list({pqueue, Queues}) -> [{-P, V} || {P, Q} <- Queues, {0, V} <- to_list(Q)]. -spec in(any(), pqueue()) -> pqueue(). -in(Item, Q) -> - in(Item, 0, Q). + +in(Item, Q) -> in(Item, 0, Q). -spec in(any(), priority(), pqueue()) -> pqueue(). -in(X, 0, {queue, [_] = In, []}) -> - {queue, [X], In}; -in(X, 0, {queue, In, Out}) when is_list(In), is_list(Out) -> - {queue, [X|In], Out}; + +in(X, 0, {queue, [_] = In, []}) -> {queue, [X], In}; +in(X, 0, {queue, In, Out}) + when is_list(In), is_list(Out) -> + {queue, [X | In], Out}; in(X, Priority, _Q = {queue, [], []}) -> in(X, Priority, {pqueue, []}); in(X, Priority, Q = {queue, _, _}) -> in(X, Priority, {pqueue, [{0, Q}]}); in(X, Priority, {pqueue, Queues}) -> P = -Priority, - {pqueue, case lists:keysearch(P, 1, Queues) of - {value, {_, Q}} -> - lists:keyreplace(P, 1, Queues, {P, in(X, Q)}); - false -> - lists:keysort(1, [{P, {queue, [X], []}} | Queues]) - end}. - --spec out(pqueue()) -> {(empty | {value, any()}), pqueue()}. -out({queue, [], []} = Q) -> - {empty, Q}; -out({queue, [V], []}) -> - {{value, V}, {queue, [], []}}; -out({queue, [Y|In], []}) -> - [V|Out] = lists:reverse(In, []), + {pqueue, + case lists:keysearch(P, 1, Queues) of + {value, {_, Q}} -> + lists:keyreplace(P, 1, Queues, {P, in(X, Q)}); + false -> + lists:keysort(1, [{P, {queue, [X], []}} | Queues]) + end}. + +-spec out(pqueue()) -> {empty | {value, any()}, + pqueue()}. 
+ +out({queue, [], []} = Q) -> {empty, Q}; +out({queue, [V], []}) -> {{value, V}, {queue, [], []}}; +out({queue, [Y | In], []}) -> + [V | Out] = lists:reverse(In, []), {{value, V}, {queue, [Y], Out}}; out({queue, In, [V]}) when is_list(In) -> - {{value,V}, r2f(In)}; -out({queue, In,[V|Out]}) when is_list(In) -> + {{value, V}, r2f(In)}; +out({queue, In, [V | Out]}) when is_list(In) -> {{value, V}, {queue, In, Out}}; out({pqueue, [{P, Q} | Queues]}) -> {R, Q1} = out(Q), NewQ = case is_empty(Q1) of - true -> case Queues of - [] -> {queue, [], []}; - [{0, OnlyQ}] -> OnlyQ; - [_|_] -> {pqueue, Queues} - end; - false -> {pqueue, [{P, Q1} | Queues]} + true -> + case Queues of + [] -> {queue, [], []}; + [{0, OnlyQ}] -> OnlyQ; + [_ | _] -> {pqueue, Queues} + end; + false -> {pqueue, [{P, Q1} | Queues]} end, {R, NewQ}. --spec out(priority(), pqueue()) -> {(empty | {value, any()}), pqueue()}. -out(_Priority, {queue, [], []} = Q) -> - {empty, Q}; +-spec out(priority(), pqueue()) -> {empty | + {value, any()}, + pqueue()}. + +out(_Priority, {queue, [], []} = Q) -> {empty, Q}; out(Priority, {queue, _, _} = Q) when Priority =< 0 -> out(Q); -out(_Priority, {queue, _, _} = Q) -> - {empty, Q}; -out(Priority, {pqueue, [{P, _Q} | _Queues]} = Q) when Priority =< (-P) -> +out(_Priority, {queue, _, _} = Q) -> {empty, Q}; +out(Priority, {pqueue, [{P, _Q} | _Queues]} = Q) + when Priority =< -P -> out(Q); -out(_Priority, {pqueue, [_|_]} = Q) -> - {empty, Q}. +out(_Priority, {pqueue, [_ | _]} = Q) -> {empty, Q}. +-spec pout(pqueue()) -> {empty | + {value, any(), priority()}, + pqueue()}. --spec pout(pqueue()) -> {(empty | {value, any(), priority()}), pqueue()}. 
-pout({queue, [], []} = Q) -> - {empty, Q}; +pout({queue, [], []} = Q) -> {empty, Q}; pout({queue, _, _} = Q) -> - {{value, V}, Q1} = out(Q), - {{value, V, 0}, Q1}; + {{value, V}, Q1} = out(Q), {{value, V, 0}, Q1}; pout({pqueue, [{P, Q} | Queues]}) -> {{value, V}, Q1} = out(Q), NewQ = case is_empty(Q1) of - true -> case Queues of - [] -> {queue, [], []}; - [{0, OnlyQ}] -> OnlyQ; - [_|_] -> {pqueue, Queues} - end; - false -> {pqueue, [{P, Q1} | Queues]} + true -> + case Queues of + [] -> {queue, [], []}; + [{0, OnlyQ}] -> OnlyQ; + [_ | _] -> {pqueue, Queues} + end; + false -> {pqueue, [{P, Q1} | Queues]} end, {{value, V, -P}, NewQ}. -spec join(pqueue(), pqueue()) -> pqueue(). -join(A, {queue, [], []}) -> - A; -join({queue, [], []}, B) -> - B; + +join(A, {queue, [], []}) -> A; +join({queue, [], []}, B) -> B; join({queue, AIn, AOut}, {queue, BIn, BOut}) -> {queue, BIn, AOut ++ lists:reverse(AIn, BOut)}; join(A = {queue, _, _}, {pqueue, BPQ}) -> - {Pre, Post} = lists:splitwith(fun ({P, _}) -> P < 0 end, BPQ), + {Pre, Post} = lists:splitwith(fun ({P, _}) -> P < 0 end, + BPQ), Post1 = case Post of - [] -> [ {0, A} ]; - [ {0, ZeroQueue} | Rest ] -> [ {0, join(A, ZeroQueue)} | Rest ]; - _ -> [ {0, A} | Post ] + [] -> [{0, A}]; + [{0, ZeroQueue} | Rest] -> + [{0, join(A, ZeroQueue)} | Rest]; + _ -> [{0, A} | Post] end, {pqueue, Pre ++ Post1}; join({pqueue, APQ}, B = {queue, _, _}) -> - {Pre, Post} = lists:splitwith(fun ({P, _}) -> P < 0 end, APQ), + {Pre, Post} = lists:splitwith(fun ({P, _}) -> P < 0 end, + APQ), Post1 = case Post of - [] -> [ {0, B} ]; - [ {0, ZeroQueue} | Rest ] -> [ {0, join(ZeroQueue, B)} | Rest ]; - _ -> [ {0, B} | Post ] + [] -> [{0, B}]; + [{0, ZeroQueue} | Rest] -> + [{0, join(ZeroQueue, B)} | Rest]; + _ -> [{0, B} | Post] end, {pqueue, Pre ++ Post1}; join({pqueue, APQ}, {pqueue, BPQ}) -> {pqueue, merge(APQ, BPQ, [])}. 
-merge([], BPQ, Acc) -> - lists:reverse(Acc, BPQ); -merge(APQ, [], Acc) -> - lists:reverse(Acc, APQ); -merge([{P, A}|As], [{P, B}|Bs], Acc) -> - merge(As, Bs, [ {P, join(A, B)} | Acc ]); -merge([{PA, A}|As], Bs = [{PB, _}|_], Acc) when PA < PB -> - merge(As, Bs, [ {PA, A} | Acc ]); -merge(As = [{_, _}|_], [{PB, B}|Bs], Acc) -> - merge(As, Bs, [ {PB, B} | Acc ]). - -r2f([]) -> {queue, [], []}; +merge([], BPQ, Acc) -> lists:reverse(Acc, BPQ); +merge(APQ, [], Acc) -> lists:reverse(Acc, APQ); +merge([{P, A} | As], [{P, B} | Bs], Acc) -> + merge(As, Bs, [{P, join(A, B)} | Acc]); +merge([{PA, A} | As], Bs = [{PB, _} | _], Acc) + when PA < PB -> + merge(As, Bs, [{PA, A} | Acc]); +merge(As = [{_, _} | _], [{PB, B} | Bs], Acc) -> + merge(As, Bs, [{PB, B} | Acc]). + +r2f([]) -> {queue, [], []}; r2f([_] = R) -> {queue, [], R}; -r2f([X,Y]) -> {queue, [X], [Y]}; -r2f([X,Y|R]) -> {queue, [X,Y], lists:reverse(R, [])}. +r2f([X, Y]) -> {queue, [X], [Y]}; +r2f([X, Y | R]) -> + {queue, [X, Y], lists:reverse(R, [])}. -ifdef(TEST). + -include_lib("eunit/include/eunit.hrl"). 
simple_case(Order) -> - Queue = ?MODULE:new(), - ?assertEqual(true, ?MODULE:is_queue(Queue)), - ?assertEqual(true, ?MODULE:is_empty(Queue)), - ?assertEqual(0, ?MODULE:len(Queue)), - ?assertEqual([], ?MODULE:to_list(Queue)), + Queue = (?MODULE):new(), + ?assertEqual(true, ((?MODULE):is_queue(Queue))), + ?assertEqual(true, ((?MODULE):is_empty(Queue))), + ?assertEqual(0, ((?MODULE):len(Queue))), + ?assertEqual([], ((?MODULE):to_list(Queue))), case Order of - forward -> - Queue2 = ?MODULE:in(low, Queue), - Queue3 = ?MODULE:in(mid, 500, Queue2), - Queue4 = ?MODULE:in(high, 1000, Queue3); - reverse -> - Queue2 = ?MODULE:in(high, 1000, Queue), - Queue3 = ?MODULE:in(mid, 500, Queue2), - Queue4 = ?MODULE:in(low, Queue3); - mixed -> - Queue2 = ?MODULE:in(high, 1000, Queue), - Queue3 = ?MODULE:in(low, Queue2), - Queue4 = ?MODULE:in(mid, 500, Queue3) + forward -> + Queue2 = (?MODULE):in(low, Queue), + Queue3 = (?MODULE):in(mid, 500, Queue2), + Queue4 = (?MODULE):in(high, 1000, Queue3); + reverse -> + Queue2 = (?MODULE):in(high, 1000, Queue), + Queue3 = (?MODULE):in(mid, 500, Queue2), + Queue4 = (?MODULE):in(low, Queue3); + mixed -> + Queue2 = (?MODULE):in(high, 1000, Queue), + Queue3 = (?MODULE):in(low, Queue2), + Queue4 = (?MODULE):in(mid, 500, Queue3) end, - ?assertEqual(false, ?MODULE:is_empty(Queue4)), - ?assertEqual(3, ?MODULE:len(Queue4)), - ?assertMatch({{value, high}, _}, ?MODULE:out(Queue4)), - {{value, high}, Queue5} = ?MODULE:out(Queue4), - ?assertMatch({{value, mid}, _}, ?MODULE:out(Queue5)), - {{value, mid}, Queue6} = ?MODULE:out(Queue5), - ?assertMatch({{value, low}, _}, ?MODULE:out(Queue6)), - {{value, low}, Queue7} = ?MODULE:out(Queue6), - ?assertEqual(0, ?MODULE:len(Queue7)), - - ?assertEqual(true, ?MODULE:is_queue(Queue2)), - ?assertEqual(true, ?MODULE:is_queue(Queue3)), - ?assertEqual(true, ?MODULE:is_queue(Queue4)), - ?assertEqual(false, ?MODULE:is_queue([])), + ?assertEqual(false, ((?MODULE):is_empty(Queue4))), + ?assertEqual(3, ((?MODULE):len(Queue4))), + 
?assertMatch({{value, high}, _}, + ((?MODULE):out(Queue4))), + {{value, high}, Queue5} = (?MODULE):out(Queue4), + ?assertMatch({{value, mid}, _}, + ((?MODULE):out(Queue5))), + {{value, mid}, Queue6} = (?MODULE):out(Queue5), + ?assertMatch({{value, low}, _}, + ((?MODULE):out(Queue6))), + {{value, low}, Queue7} = (?MODULE):out(Queue6), + ?assertEqual(0, ((?MODULE):len(Queue7))), + ?assertEqual(true, ((?MODULE):is_queue(Queue2))), + ?assertEqual(true, ((?MODULE):is_queue(Queue3))), + ?assertEqual(true, ((?MODULE):is_queue(Queue4))), + ?assertEqual(false, ((?MODULE):is_queue([]))), ok. merge_case() -> - QueueA1 = ?MODULE:new(), - QueueA2 = ?MODULE:in(1, QueueA1), - QueueA3 = ?MODULE:in(3, QueueA2), - QueueA4 = ?MODULE:in(5, QueueA3), - - QueueB1 = ?MODULE:new(), - QueueB2 = ?MODULE:in(2, QueueB1), - QueueB3 = ?MODULE:in(4, QueueB2), - QueueB4 = ?MODULE:in(6, QueueB3), - - Merged1 = ?MODULE:join(QueueA4, QueueB4), - ?assertEqual([{0,1},{0,3},{0,5},{0,2},{0,4},{0,6}], - ?MODULE:to_list(Merged1)), - - QueueC1 = ?MODULE:new(), - QueueC2 = ?MODULE:in(1, 10, QueueC1), - QueueC3 = ?MODULE:in(3, 30, QueueC2), - QueueC4 = ?MODULE:in(5, 50, QueueC3), - - QueueD1 = ?MODULE:new(), - QueueD2 = ?MODULE:in(2, 20, QueueD1), - QueueD3 = ?MODULE:in(4, 40, QueueD2), - QueueD4 = ?MODULE:in(6, 60, QueueD3), - - Merged2 = ?MODULE:join(QueueC4, QueueD4), - ?assertEqual([{60,6},{50,5},{40,4},{30,3},{20,2},{10,1}], - ?MODULE:to_list(Merged2)), + QueueA1 = (?MODULE):new(), + QueueA2 = (?MODULE):in(1, QueueA1), + QueueA3 = (?MODULE):in(3, QueueA2), + QueueA4 = (?MODULE):in(5, QueueA3), + QueueB1 = (?MODULE):new(), + QueueB2 = (?MODULE):in(2, QueueB1), + QueueB3 = (?MODULE):in(4, QueueB2), + QueueB4 = (?MODULE):in(6, QueueB3), + Merged1 = (?MODULE):join(QueueA4, QueueB4), + ?assertEqual([{0, 1}, {0, 3}, {0, 5}, {0, 2}, {0, 4}, + {0, 6}], + ((?MODULE):to_list(Merged1))), + QueueC1 = (?MODULE):new(), + QueueC2 = (?MODULE):in(1, 10, QueueC1), + QueueC3 = (?MODULE):in(3, 30, QueueC2), + QueueC4 = 
(?MODULE):in(5, 50, QueueC3), + QueueD1 = (?MODULE):new(), + QueueD2 = (?MODULE):in(2, 20, QueueD1), + QueueD3 = (?MODULE):in(4, 40, QueueD2), + QueueD4 = (?MODULE):in(6, 60, QueueD3), + Merged2 = (?MODULE):join(QueueC4, QueueD4), + ?assertEqual([{60, 6}, {50, 5}, {40, 4}, {30, 3}, + {20, 2}, {10, 1}], + ((?MODULE):to_list(Merged2))), ok. basic_test() -> simple_case(forward), -simple_case(reverse), + simple_case(reverse), simple_case(mixed), merge_case(), ok. diff --git a/src/riak_core_rand.erl b/src/riak_core_rand.erl deleted file mode 100644 index 277259fa3..000000000 --- a/src/riak_core_rand.erl +++ /dev/null @@ -1,65 +0,0 @@ -%% Generalized random module that offers a backwards compatible API -%% around some of the changes in rand, crypto and for time units. - --module(riak_core_rand). - -%% API --export([ - uniform/0, - uniform/1, - uniform_s/2, - seed/0, - seed/1, - rand_seed/0, - rand_bytes/1 - ]). - -%% As the algorithm is not changed in any place we can use the default -%% algorithm for all call here. --define(ALGO, exsplus). - -uniform() -> - rand:uniform(). - -uniform(N) -> - rand:uniform(N). - -%% The old random:uniform_s took a 3 touple however this is no longer -%% the case, so what we need to do if we see such a situation is to first -%% create a state using seed_s (which can take the old data) and then -%% using uniform_s with this newly generated state. -%% -%% Note that seed_s does **not** change the current seed but just -%% create a new seed state. -uniform_s(N, {A, B, C}) -> - State = rand:seed_s(?ALGO, {A, B, C}), - rand:uniform_s(N, State); -uniform_s(N, State) -> - rand:uniform_s(N, State). - -seed() -> - rand:seed(?ALGO). - -%% We are a bit tricky here, while random:seed did return the **prior** seed -%% rand:seed will return the **new** seed. We can work around this by first -%% getting the exported seed then using this instead. --spec seed({integer(),integer(),integer()} | rand:export_state()) -> - rand:export_state() | undefined. 
-seed({_, _, _} = Seed) -> - Old = rand:export_seed(), - _New = rand:seed(?ALGO, Seed), - Old; -seed(Seed) -> - Old = rand:export_seed(), - _New = rand:seed(Seed), - Old. - -rand_bytes(Size) -> - crypto:strong_rand_bytes(Size). - -%%%=================================================================== -%%% General functions -%%%=================================================================== - -rand_seed() -> - erlang:timestamp(). diff --git a/src/riak_core_ring.erl b/src/riak_core_ring.erl index d411150bd..e9ec2ebe3 100644 --- a/src/riak_core_ring.erl +++ b/src/riak_core_ring.erl @@ -28,164 +28,107 @@ -module(riak_core_ring). --export([all_members/1, - all_owners/1, - all_preflists/2, - diff_nodes/2, - equal_rings/2, - fresh/0, - fresh/1, - fresh/2, - get_meta/2, - get_buckets/1, - index_owner/2, - my_indices/1, - num_partitions/1, - owner_node/1, - preflist/2, - random_node/1, - random_other_index/1, - random_other_index/2, - random_other_node/1, - reconcile/2, - rename_node/3, - responsible_index/2, - transfer_node/3, - update_meta/3, - remove_meta/2]). 
- --export([cluster_name/1, -%% upgrade/1, -%% downgrade/2, - set_tainted/1, - check_tainted/2, - unset_tainted/1, - set_lastgasp/1, - check_lastgasp/1, - unset_lastgasp/1, - nearly_equal/2, - claimant/1, - member_status/2, - pretty_print/2, - all_member_status/1, - update_member_meta/5, - clear_member_meta/3, - get_member_meta/3, - add_member/3, - remove_member/3, - leave_member/3, - exit_member/3, - down_member/3, - set_member/4, - set_member/5, - members/2, - has_location_changed/1, - clear_location_changed/1, - set_node_location/3, - get_nodes_locations/1, - set_claimant/2, - increment_vclock/2, - ring_version/1, - increment_ring_version/2, - set_pending_changes/2, - active_members/1, - claiming_members/1, - ready_members/1, - random_other_active_node/1, - down_members/1, - set_owner/2, - indices/2, - future_indices/2, - future_ring/1, - disowning_indices/2, - cancel_transfers/1, - pending_changes/1, - next_owner/1, - next_owner/2, - next_owner/3, - completed_next_owners/2, - all_next_owners/1, - change_owners/2, - handoff_complete/3, - ring_ready/0, - ring_ready/1, - ring_ready_info/1, - ring_changed/2, - set_cluster_name/2, - reconcile_names/2, - reconcile_members/2, - is_primary/2, - chash/1, - set_chash/2, - resize/2, - set_pending_resize/2, - set_pending_resize_abort/1, - maybe_abort_resize/1, - schedule_resize_transfer/3, - awaiting_resize_transfer/3, - resize_transfer_status/4, - resize_transfer_complete/4, +-export([all_members/1, all_owners/1, all_preflists/2, + diff_nodes/2, equal_rings/2, fresh/0, fresh/1, fresh/2, + get_meta/2, index_owner/2, my_indices/1, + num_partitions/1, owner_node/1, preflist/2, + random_node/1, random_other_index/1, + random_other_index/2, random_other_node/1, reconcile/2, + rename_node/3, responsible_index/2, transfer_node/3, + update_meta/3, remove_meta/2]). 
+ +-export([cluster_name/1, set_tainted/1, check_tainted/2, + nearly_equal/2, claimant/1, member_status/2, + pretty_print/2, all_member_status/1, + update_member_meta/5, clear_member_meta/3, + get_member_meta/3, add_member/3, remove_member/3, + leave_member/3, exit_member/3, down_member/3, + set_member/4, set_member/5, members/2, set_claimant/2, + increment_vclock/2, ring_version/1, + increment_ring_version/2, set_pending_changes/2, + active_members/1, claiming_members/1, ready_members/1, + random_other_active_node/1, down_members/1, set_owner/2, + indices/2, future_indices/2, future_ring/1, + disowning_indices/2, cancel_transfers/1, + pending_changes/1, next_owner/1, next_owner/2, + next_owner/3, completed_next_owners/2, + all_next_owners/1, change_owners/2, handoff_complete/3, + ring_ready/0, ring_ready/1, ring_ready_info/1, + ring_changed/2, set_cluster_name/2, reconcile_names/2, + reconcile_members/2, is_primary/2, chash/1, set_chash/2, + resize/2, set_pending_resize/2, + set_pending_resize_abort/1, maybe_abort_resize/1, + schedule_resize_transfer/3, awaiting_resize_transfer/3, + resize_transfer_status/4, resize_transfer_complete/4, complete_resize_transfers/3, - reschedule_resize_transfers/3, - is_resizing/1, - is_post_resize/1, - is_resize_complete/1, - resized_ring/1, - set_resized_ring/2, - future_index/3, - future_index/4, - future_index/5, - is_future_index/4, - future_owner/2, - future_num_partitions/1, - vnode_type/2, + reschedule_resize_transfers/3, is_resizing/1, + is_post_resize/1, is_resize_complete/1, resized_ring/1, + set_resized_ring/2, future_index/3, future_index/4, + future_index/5, is_future_index/4, future_owner/2, + future_num_partitions/1, vnode_type/2, deletion_complete/3]). --export_type([riak_core_ring/0, ring_size/0, partition_id/0]). + %% upgrade/1, + %% downgrade/2, + +-export_type([riak_core_ring/0, ring_size/0, + partition_id/0]). -ifdef(TEST). + -include_lib("eunit/include/eunit.hrl"). + -endif. 
--record(chstate, { - nodename :: term(), % the Node responsible for this chstate - vclock :: vclock:vclock() | undefined, % for this chstate object, entries are - % {Node, Ctr} - chring :: chash:chash() | undefined, % chash ring of {IndexAsInt, Node} mappings - meta :: dict:dict() | undefined, - % dict of cluster-wide other data (primarily - % bucket N-value, etc) - clustername :: {term(), term()} | undefined, - next :: [{integer(), term(), term(), [module()], awaiting | complete}], - members :: [{node(), {member_status(), vclock:vclock(), [{atom(), term()}]}}] | undefined, - claimant :: term(), - seen :: [{term(), vclock:vclock()}] | undefined, - rvsn :: vclock:vclock() | undefined - }). - - --type member_status() :: joining | valid | invalid | leaving | exiting | down. +-record(chstate, + {nodename :: + term(), % the Node responsible for this chstate + vclock :: + vclock:vclock() | + undefined, % for this chstate object, entries are + % {Node, Ctr} + chring :: + chash:chash() | + undefined, % chash ring of {IndexAsInt, Node} mappings + meta :: dict:dict() | undefined, + % dict of cluster-wide other data (primarily N-value, etc) + clustername :: {term(), term()} | undefined, + next :: + [{integer(), term(), term(), [module()], + awaiting | complete}], + members :: + [{node(), + {member_status(), vclock:vclock(), + [{atom(), term()}]}}] | + undefined, + claimant :: term(), + seen :: [{term(), vclock:vclock()}] | undefined, + rvsn :: vclock:vclock() | undefined}). + +-type member_status() :: joining | valid | invalid | + leaving | exiting | down. %% type meta_entry(). Record for each entry in #chstate.meta --record(meta_entry, { - value, % The value stored under this entry - lastmod % The last modified time of this entry, - % from calendar:datetime_to_gregorian_seconds( - % calendar:universal_time()), -}). +-record(meta_entry, + {value, % The value stored under this entry + lastmod}). 
% The last modified time of this entry, + % from calendar:datetime_to_gregorian_seconds( + % calendar:universal_time()), %% @type riak_core_ring(). Opaque data type used for partition ownership -type riak_core_ring() :: #chstate{}. + -type chstate() :: riak_core_ring(). -type pending_change() :: {Owner :: node(), - NextOwner :: node(), - awaiting | complete} - | {undefined, undefined, undefined}. + NextOwner :: node(), awaiting | complete} | + {undefined, undefined, undefined}. --type resize_transfer() :: {{integer(),term()}, ordsets:ordset(node()), awaiting | complete}. +-type resize_transfer() :: {{integer(), term()}, + ordsets:ordset(node()), awaiting | complete}. -type ring_size() :: non_neg_integer(). + %% @type partition_id(). This integer represents a value in the range [0, ring_size-1]. -type partition_id() :: non_neg_integer(). @@ -196,17 +139,13 @@ set_tainted(Ring) -> update_meta(riak_core_ring_tainted, true, Ring). -check_tainted(Ring=#chstate{}, Msg) -> - Exit = application:get_env(riak_core, exit_when_tainted, false), +check_tainted(Ring = #chstate{}, Msg) -> + Exit = application:get_env(riak_core, exit_when_tainted, + false), case {get_meta(riak_core_ring_tainted, Ring), Exit} of - {{ok, true}, true} -> - riak_core:stop(Msg), - ok; - {{ok, true}, false} -> - logger:error(Msg), - ok; - _ -> - ok + {{ok, true}, true} -> riak_core:stop(Msg), ok; + {{ok, true}, false} -> logger:error(Msg), ok; + _ -> ok end. -spec unset_tainted(chstate()) -> chstate(). @@ -235,34 +174,42 @@ unset_lastgasp(Ring) -> %% descendant of RingA's vclock. This matches the changes that the %% fix-up logic may make to a ring. -spec nearly_equal(chstate(), chstate()) -> boolean(). 
+ nearly_equal(RingA, RingB) -> - TestVC = vclock:descends(RingB#chstate.vclock, RingA#chstate.vclock), - RingA2 = RingA#chstate{vclock=undefined, meta=undefined}, - RingB2 = RingB#chstate{vclock=undefined, meta=undefined}, - TestRing = (RingA2 =:= RingB2), + TestVC = vclock:descends(RingB#chstate.vclock, + RingA#chstate.vclock), + RingA2 = RingA#chstate{vclock = undefined, + meta = undefined}, + RingB2 = RingB#chstate{vclock = undefined, + meta = undefined}, + TestRing = RingA2 =:= RingB2, TestVC and TestRing. %% @doc Determine if a given Index/Node `IdxNode' combination is a %% primary. --spec is_primary(chstate(), {chash:index_as_int(), node()}) -> boolean(). +-spec is_primary(chstate(), + {chash:index_as_int(), node()}) -> boolean(). + is_primary(Ring, IdxNode) -> Owners = all_owners(Ring), lists:member(IdxNode, Owners). %% @doc Return the `CHash' of the ring. --spec chash(chstate()) -> CHash::chash:chash(). -chash(#chstate{chring=CHash}) -> - CHash. +-spec chash(chstate()) -> CHash :: chash:chash(). + +chash(#chstate{chring = CHash}) -> CHash. set_chash(State, CHash) -> - State#chstate{chring=CHash}. + State#chstate{chring = CHash}. %% @doc Produce a list of all nodes that are members of the cluster --spec all_members(State :: chstate()) -> [Node :: term()]. -all_members(#chstate{members=Members}) -> +-spec all_members(State :: chstate()) -> [Node :: + term()]. + +all_members(#chstate{members = Members}) -> get_members(Members). -members(#chstate{members=Members}, Types) -> +members(#chstate{members = Members}, Types) -> get_members(Members, Types). -spec has_location_changed(chstate()) -> boolean(). @@ -288,44 +235,55 @@ get_nodes_locations(?CHSTATE{members =Members} = ChState) -> dict:filter(fun(Node, _) -> lists:member(Node, Nodes) end, Value). %% @doc Produce a list of all active (not marked as down) cluster members -active_members(#chstate{members=Members}) -> - get_members(Members, [joining, valid, leaving, exiting]). 
+active_members(#chstate{members = Members}) -> + get_members(Members, + [joining, valid, leaving, exiting]). %% @doc Returns a list of members guaranteed safe for requests -ready_members(#chstate{members=Members}) -> +ready_members(#chstate{members = Members}) -> get_members(Members, [valid, leaving]). %% @doc Provide all ownership information in the form of {Index,Node} pairs. --spec all_owners(State :: chstate()) -> [{Index :: integer(), Node :: term()}]. -all_owners(State) -> - chash:nodes(State#chstate.chring). +-spec all_owners(State :: chstate()) -> [{Index :: + integer(), + Node :: term()}]. + +all_owners(State) -> chash:nodes(State#chstate.chring). %% @doc Provide every preflist in the ring, truncated at N. --spec all_preflists(State :: chstate(), N :: integer()) -> - [[{Index :: integer(), Node :: term()}]]. +-spec all_preflists(State :: chstate(), + N :: integer()) -> [[{Index :: integer(), + Node :: term()}]]. + all_preflists(State, N) -> - [lists:sublist(preflist(Key, State),N) || - Key <- [<<(I+1):160/integer>> || - {I,_Owner} <- ?MODULE:all_owners(State)]]. + [lists:sublist(preflist(Key, State), N) + || Key + <- [<<(I + 1):160/integer>> + || {I, _Owner} <- (?MODULE):all_owners(State)]]. %% @doc For two rings, return the list of owners that have differing ownership. -spec diff_nodes(chstate(), chstate()) -> [node()]. -diff_nodes(State1,State2) -> - AO = lists:zip(all_owners(State1),all_owners(State2)), - AllDiff = [[N1,N2] || {{I,N1},{I,N2}} <- AO, N1 =/= N2], + +diff_nodes(State1, State2) -> + AO = lists:zip(all_owners(State1), all_owners(State2)), + AllDiff = [[N1, N2] + || {{I, N1}, {I, N2}} <- AO, N1 =/= N2], lists:usort(lists:flatten(AllDiff)). -spec equal_rings(chstate(), chstate()) -> boolean(). 
-equal_rings(_A=#chstate{chring=RA,meta=MA},_B=#chstate{chring=RB,meta=MB}) -> + +equal_rings(_A = #chstate{chring = RA, meta = MA}, + _B = #chstate{chring = RB, meta = MB}) -> MDA = lists:sort(dict:to_list(MA)), MDB = lists:sort(dict:to_list(MB)), case MDA =:= MDB of - false -> false; - true -> RA =:= RB + false -> false; + true -> RA =:= RB end. %% @doc This is used only when this node is creating a brand new cluster. -spec fresh() -> chstate(). + fresh() -> % use this when starting a new cluster via this node fresh(node()). @@ -333,31 +291,34 @@ fresh() -> %% @doc Equivalent to fresh/0 but allows specification of the local node name. %% Called by fresh/0, and otherwise only intended for testing purposes. -spec fresh(NodeName :: term()) -> chstate(). + fresh(NodeName) -> - fresh(application:get_env(riak_core, ring_creation_size, undefined), NodeName). + fresh(application:get_env(riak_core, ring_creation_size, + undefined), + NodeName). %% @doc Equivalent to fresh/1 but allows specification of the ring size. %% Called by fresh/1, and otherwise only intended for testing purposes. --spec fresh(ring_size(), NodeName :: term()) -> chstate(). +-spec fresh(ring_size(), + NodeName :: term()) -> chstate(). + fresh(RingSize, NodeName) -> - VClock=vclock:increment(NodeName, vclock:fresh()), - #chstate{nodename=NodeName, - clustername={NodeName, erlang:timestamp()}, - members=[{NodeName, {valid, VClock, [{gossip_vsn, 2}]}}], - chring=chash:fresh(RingSize, NodeName), - next=[], - claimant=NodeName, - seen=[{NodeName, VClock}], - rvsn=VClock, - vclock=VClock, - meta=dict:new()}. + VClock = vclock:increment(NodeName, vclock:fresh()), + #chstate{nodename = NodeName, + clustername = {NodeName, erlang:timestamp()}, + members = + [{NodeName, {valid, VClock, [{gossip_vsn, 2}]}}], + chring = chash:fresh(RingSize, NodeName), next = [], + claimant = NodeName, seen = [{NodeName, VClock}], + rvsn = VClock, vclock = VClock, meta = dict:new()}. 
%% @doc change the size of the ring to `NewRingSize'. If the ring %% is larger than the current ring any new indexes will be owned %% by a dummy host -spec resize(chstate(), ring_size()) -> chstate(). + resize(State, NewRingSize) -> - NewRing = lists:foldl(fun({Idx,Owner}, RingAcc) -> + NewRing = lists:foldl(fun ({Idx, Owner}, RingAcc) -> chash:update(Idx, Owner, RingAcc) end, chash:fresh(NewRingSize, '$dummyhost@resized'), @@ -365,36 +326,31 @@ resize(State, NewRingSize) -> set_chash(State, NewRing). % @doc Return a value from the cluster metadata dict --spec get_meta(Key :: term(), State :: chstate()) -> - {ok, term()} | undefined. +-spec get_meta(Key :: term(), + State :: chstate()) -> {ok, term()} | undefined. + get_meta(Key, State) -> case dict:find(Key, State#chstate.meta) of - error -> undefined; - {ok, '$removed'} -> undefined; - {ok, M} when M#meta_entry.value =:= '$removed' -> undefined; - {ok, M} -> {ok, M#meta_entry.value} + error -> undefined; + {ok, '$removed'} -> undefined; + {ok, M} when M#meta_entry.value =:= '$removed' -> + undefined; + {ok, M} -> {ok, M#meta_entry.value} end. --spec get_meta(term(), term(), chstate()) -> {ok, term()}. +-spec get_meta(term(), term(), chstate()) -> {ok, + term()}. + get_meta(Key, Default, State) -> case get_meta(Key, State) of - undefined -> {ok, Default}; - Res -> Res + undefined -> {ok, Default}; + Res -> Res end. -%% @doc return the names of all the custom buckets stored in the ring. --spec get_buckets(State :: chstate()) -> [term()]. -get_buckets(State) -> - Keys = dict:fetch_keys(State#chstate.meta), - lists:foldl( - fun({bucket, Bucket}, Acc) -> - [Bucket|Acc]; - (_, Acc) -> - Acc - end, [], Keys). - %% @doc Return the node that owns the given index. --spec index_owner(State :: chstate(), Idx :: chash:index_as_int()) -> Node :: term(). +-spec index_owner(State :: chstate(), + Idx :: chash:index_as_int()) -> Node :: term(). 
+ index_owner(State, Idx) -> {Idx, Owner} = lists:keyfind(Idx, 1, all_owners(State)), Owner. @@ -402,131 +358,169 @@ index_owner(State, Idx) -> %% @doc Return the node that will own this index after transtions have completed %% this function will error if the ring is shrinking and Idx no longer exists %% in it --spec future_owner(chstate(), chash:index_as_int()) -> term(). +-spec future_owner(chstate(), + chash:index_as_int()) -> term(). + future_owner(State, Idx) -> index_owner(future_ring(State), Idx). %% @doc Return all partition indices owned by the node executing this function. --spec my_indices(State :: chstate()) -> [chash:index_as_int()]. +-spec my_indices(State :: + chstate()) -> [chash:index_as_int()]. + my_indices(State) -> - [I || {I,Owner} <- ?MODULE:all_owners(State), Owner =:= node()]. + [I + || {I, Owner} <- (?MODULE):all_owners(State), + Owner =:= node()]. %% @doc Return the number of partitions in this Riak ring. --spec num_partitions(State :: chstate()) -> pos_integer(). +-spec num_partitions(State :: + chstate()) -> pos_integer(). + num_partitions(State) -> chash:size(State#chstate.chring). -spec future_num_partitions(chstate()) -> pos_integer(). -future_num_partitions(State=#chstate{chring=CHRing}) -> + +future_num_partitions(State = #chstate{chring = + CHRing}) -> case resized_ring(State) of - {ok, C} -> chash:size(C); - undefined -> chash:size(CHRing) + {ok, C} -> chash:size(C); + undefined -> chash:size(CHRing) end. %% @doc Return the node that is responsible for a given chstate. -spec owner_node(State :: chstate()) -> Node :: term(). -owner_node(State) -> - State#chstate.nodename. + +owner_node(State) -> State#chstate.nodename. %% @doc For a given object key, produce the ordered list of %% {partition,node} pairs that could be responsible for that object. --spec preflist(Key :: binary(), State :: chstate()) -> - [{Index :: chash:index_as_int(), Node :: term()}]. -preflist(Key, State) -> chash:successors(Key, State#chstate.chring). 
+-spec preflist(Key :: binary(), + State :: chstate()) -> [{Index :: chash:index_as_int(), + Node :: term()}]. + +preflist(Key, State) -> + chash:successors(Key, State#chstate.chring). %% @doc Return a randomly-chosen node from amongst the owners. -spec random_node(State :: chstate()) -> Node :: term(). + random_node(State) -> L = all_members(State), - lists:nth(riak_core_rand:uniform(length(L)), L). + lists:nth(rand:uniform(length(L)), L). %% @doc Return a partition index not owned by the node executing this function. %% If this node owns all partitions, return any index. --spec random_other_index(State :: chstate()) -> chash:index_as_int(). +-spec random_other_index(State :: + chstate()) -> chash:index_as_int(). + random_other_index(State) -> - L = [I || {I,Owner} <- ?MODULE:all_owners(State), Owner =/= node()], + L = [I + || {I, Owner} <- (?MODULE):all_owners(State), + Owner =/= node()], case L of - [] -> hd(my_indices(State)); - _ -> lists:nth(riak_core_rand:uniform(length(L)), L) + [] -> hd(my_indices(State)); + _ -> lists:nth(rand:uniform(length(L)), L) end. --spec random_other_index(State :: chstate(), Exclude :: [term()]) -> chash:index_as_int() | no_indices. -random_other_index(State, Exclude) when is_list(Exclude) -> - L = [I || {I, Owner} <- ?MODULE:all_owners(State), - Owner =/= node(), - not lists:member(I, Exclude)], +-spec random_other_index(State :: chstate(), + Exclude :: [term()]) -> chash:index_as_int() | + no_indices. + +random_other_index(State, Exclude) + when is_list(Exclude) -> + L = [I + || {I, Owner} <- (?MODULE):all_owners(State), + Owner =/= node(), not lists:member(I, Exclude)], case L of - [] -> no_indices; - _ -> lists:nth(riak_core_rand:uniform(length(L)), L) + [] -> no_indices; + _ -> lists:nth(rand:uniform(length(L)), L) end. %% @doc Return a randomly-chosen node from amongst the owners other than this one. --spec random_other_node(State :: chstate()) -> Node :: term() | no_node. 
+-spec random_other_node(State :: chstate()) -> Node :: + term() | no_node. + random_other_node(State) -> case lists:delete(node(), all_members(State)) of - [] -> - no_node; - L -> - lists:nth(riak_core_rand:uniform(length(L)), L) + [] -> no_node; + L -> lists:nth(rand:uniform(length(L)), L) end. %% @doc Return a randomly-chosen active node other than this one. --spec random_other_active_node(State :: chstate()) -> Node :: term() | no_node. +-spec random_other_active_node(State :: + chstate()) -> Node :: term() | no_node. + random_other_active_node(State) -> case lists:delete(node(), active_members(State)) of - [] -> - no_node; - L -> - lists:nth(riak_core_rand:uniform(length(L)), L) + [] -> no_node; + L -> lists:nth(rand:uniform(length(L)), L) end. %% @doc Incorporate another node's state into our view of the Riak world. --spec reconcile(ExternState :: chstate(), MyState :: chstate()) -> - {no_change | new_ring, chstate()}. +-spec reconcile(ExternState :: chstate(), + MyState :: chstate()) -> {no_change | new_ring, + chstate()}. + reconcile(ExternState, MyState) -> check_tainted(ExternState, - "Error: riak_core_ring/reconcile :: " - "reconciling tainted external ring"), + "Error: riak_core_ring/reconcile :: reconcilin" + "g tainted external ring"), check_tainted(MyState, - "Error: riak_core_ring/reconcile :: " - "reconciling tainted internal ring"), - case check_lastgasp(ExternState) of - true -> - {no_change, MyState}; - false -> - case internal_reconcile(MyState, ExternState) of - {false, State} -> - {no_change, State}; - {true, State} -> - {new_ring, State} - end + "Error: riak_core_ring/reconcile :: reconcilin" + "g tainted internal ring"), + case internal_reconcile(MyState, ExternState) of + {false, State} -> {no_change, State}; + {true, State} -> {new_ring, State} end. %% @doc Rename OldNode to NewNode in a Riak ring. --spec rename_node(State :: chstate(), OldNode :: atom(), NewNode :: atom()) -> - chstate(). 
-rename_node(State=#chstate{chring=Ring, nodename=ThisNode, members=Members, - claimant=Claimant, seen=Seen}, OldNode, NewNode) - when is_atom(OldNode), is_atom(NewNode) -> - State#chstate{ - chring=lists:foldl( - fun({Idx, Owner}, AccIn) -> - case Owner of - OldNode -> - chash:update(Idx, NewNode, AccIn); - _ -> AccIn - end - end, Ring, riak_core_ring:all_owners(State)), - members=orddict:from_list(proplists:substitute_aliases([{OldNode, NewNode}], Members)), - seen=orddict:from_list(proplists:substitute_aliases([{OldNode, NewNode}], Seen)), - nodename=case ThisNode of OldNode -> NewNode; _ -> ThisNode end, - claimant=case Claimant of OldNode -> NewNode; _ -> Claimant end, - vclock=vclock:increment(NewNode, State#chstate.vclock)}. +-spec rename_node(State :: chstate(), OldNode :: atom(), + NewNode :: atom()) -> chstate(). + +rename_node(State = #chstate{chring = Ring, + nodename = ThisNode, members = Members, + claimant = Claimant, seen = Seen}, + OldNode, NewNode) + when is_atom(OldNode), is_atom(NewNode) -> + State#chstate{chring = + lists:foldl(fun ({Idx, Owner}, AccIn) -> + case Owner of + OldNode -> + chash:update(Idx, NewNode, + AccIn); + _ -> AccIn + end + end, + Ring, riak_core_ring:all_owners(State)), + members = + orddict:from_list(proplists:substitute_aliases([{OldNode, + NewNode}], + Members)), + seen = + orddict:from_list(proplists:substitute_aliases([{OldNode, + NewNode}], + Seen)), + nodename = + case ThisNode of + OldNode -> NewNode; + _ -> ThisNode + end, + claimant = + case Claimant of + OldNode -> NewNode; + _ -> Claimant + end, + vclock = + vclock:increment(NewNode, State#chstate.vclock)}. %% @doc Determine the integer ring index responsible %% for a chash key. --spec responsible_index(binary(), chstate()) -> integer(). -responsible_index(ChashKey, #chstate{chring=Ring}) -> +-spec responsible_index(binary(), + chstate()) -> integer(). + +responsible_index(ChashKey, #chstate{chring = Ring}) -> <> = ChashKey, chash:next_index(IndexAsInt, Ring). 
@@ -538,69 +532,69 @@ responsible_index(ChashKey, #chstate{chring=Ring}) -> %% for `CHashKey' in the future ring. For regular transitions %% the returned index will always be `OrigIdx'. If the ring is %% resizing the index may be different --spec future_index(chash:index(), - integer(), +-spec future_index(chash:index(), integer(), chstate()) -> integer() | undefined. + future_index(CHashKey, OrigIdx, State) -> future_index(CHashKey, OrigIdx, undefined, State). --spec future_index(chash:index(), - integer(), - undefined | integer(), - chstate()) -> integer() | undefined. +-spec future_index(chash:index(), integer(), + undefined | integer(), chstate()) -> integer() | + undefined. + future_index(CHashKey, OrigIdx, NValCheck, State) -> OrigCount = num_partitions(State), NextCount = future_num_partitions(State), - future_index(CHashKey, OrigIdx, NValCheck, OrigCount, NextCount). + future_index(CHashKey, OrigIdx, NValCheck, OrigCount, + NextCount). -future_index(CHashKey, OrigIdx, NValCheck, OrigCount, NextCount) -> +future_index(CHashKey, OrigIdx, NValCheck, OrigCount, + NextCount) -> <> = CHashKey, OrigInc = chash:ring_increment(OrigCount), NextInc = chash:ring_increment(NextCount), - %% Determine position in the ring of partition that owns key (head of preflist) %% Position is 1-based starting from partition (0 + ring increment), e.g. %% index 0 is always position N. - OwnerPos = ((CHashInt div OrigInc) + 1), - + OwnerPos = CHashInt div OrigInc + 1, %% Determine position of the source partition in the ring %% if OrigIdx is 0 we know the position is OrigCount (number of partitions) OrigPos = case OrigIdx of - 0 -> OrigCount; - _ -> OrigIdx div OrigInc + 0 -> OrigCount; + _ -> OrigIdx div OrigInc end, - %% The distance between the key's owner (head of preflist) and the source partition %% is the position of the source in the preflist, the distance may be negative %% in which case we have wrapped around the ring. 
distance of zero means the source %% is the head of the preflist. OrigDist = case OrigPos - OwnerPos of - P when P < 0 -> OrigCount + P; - P -> P + P when P < 0 -> OrigCount + P; + P -> P end, - %% In the case that the ring is shrinking the future index for a key whose position %% in the preflist is >= ring size may be calculated, any transfer is invalid in %% this case, return undefined. The position may also be >= an optional N value for %% the key, if this is true undefined is also returned - case check_invalid_future_index(OrigDist, NextCount, NValCheck) of - true -> undefined; - false -> - %% Determine the partition (head of preflist) that will own the key in the future ring - FuturePos = ((CHashInt div NextInc) + 1), - NextOwner = FuturePos * NextInc, - - %% Determine the partition that the key should be transferred to (has same position - %% in future preflist as source partition does in current preflist) - RingTop = trunc(math:pow(2,160)-1), - (NextOwner + (NextInc * OrigDist)) rem RingTop + case check_invalid_future_index(OrigDist, NextCount, + NValCheck) + of + true -> undefined; + false -> + %% Determine the partition (head of preflist) that will own the key in the future ring + FuturePos = CHashInt div NextInc + 1, + NextOwner = FuturePos * NextInc, + %% Determine the partition that the key should be transferred to (has same position + %% in future preflist as source partition does in current preflist) + RingTop = trunc(math:pow(2, 160) - 1), + (NextOwner + NextInc * OrigDist) rem RingTop end. -check_invalid_future_index(OrigDist, NextCount, NValCheck) -> +check_invalid_future_index(OrigDist, NextCount, + NValCheck) -> OverRingSize = OrigDist >= NextCount, OverNVal = case NValCheck of - undefined -> false; - _ -> OrigDist >= NValCheck + undefined -> false; + _ -> OrigDist >= NValCheck end, OverRingSize orelse OverNVal. @@ -608,158 +602,159 @@ check_invalid_future_index(OrigDist, NextCount, NValCheck) -> %% in the current preflist for the key. 
Returns true if `TargetIdx' %% is in the same position in the future preflist for that key. %% @see future_index/4 --spec is_future_index(chash:index(), integer(), integer(), chstate()) -> boolean(). +-spec is_future_index(chash:index(), integer(), + integer(), chstate()) -> boolean(). + is_future_index(CHashKey, OrigIdx, TargetIdx, State) -> - FutureIndex = future_index(CHashKey, OrigIdx, undefined, State), + FutureIndex = future_index(CHashKey, OrigIdx, undefined, + State), FutureIndex =:= TargetIdx. --spec transfer_node(Idx :: integer(), Node :: term(), MyState :: chstate()) -> - chstate(). +-spec transfer_node(Idx :: integer(), Node :: term(), + MyState :: chstate()) -> chstate(). + transfer_node(Idx, Node, MyState) -> case chash:lookup(Idx, MyState#chstate.chring) of - Node -> - MyState; - _ -> - Me = MyState#chstate.nodename, - VClock = vclock:increment(Me, MyState#chstate.vclock), - CHRing = chash:update(Idx, Node, MyState#chstate.chring), - MyState#chstate{vclock=VClock,chring=CHRing} + Node -> MyState; + _ -> + Me = MyState#chstate.nodename, + VClock = vclock:increment(Me, MyState#chstate.vclock), + CHRing = chash:update(Idx, Node, + MyState#chstate.chring), + MyState#chstate{vclock = VClock, chring = CHRing} end. % @doc Set a key in the cluster metadata dict --spec update_meta(Key :: term(), Val :: term(), State :: chstate()) -> chstate(). +-spec update_meta(Key :: term(), Val :: term(), + State :: chstate()) -> chstate(). 
+ update_meta(Key, Val, State) -> Change = case dict:find(Key, State#chstate.meta) of - {ok, OldM} -> - Val /= OldM#meta_entry.value; - error -> - true + {ok, OldM} -> Val /= OldM#meta_entry.value; + error -> true end, if Change -> - M = #meta_entry { - lastmod = calendar:datetime_to_gregorian_seconds( - calendar:universal_time()), - value = Val - }, - VClock = vclock:increment(State#chstate.nodename, - State#chstate.vclock), - State#chstate{vclock=VClock, - meta=dict:store(Key, M, State#chstate.meta)}; - true -> - State + M = #meta_entry{lastmod = + calendar:datetime_to_gregorian_seconds(calendar:universal_time()), + value = Val}, + VClock = vclock:increment(State#chstate.nodename, + State#chstate.vclock), + State#chstate{vclock = VClock, + meta = dict:store(Key, M, State#chstate.meta)}; + true -> State end. %% @doc Logical delete of a key in the cluster metadata dict --spec remove_meta(Key :: term(), State :: chstate()) -> chstate(). +-spec remove_meta(Key :: term(), + State :: chstate()) -> chstate(). + remove_meta(Key, State) -> case dict:find(Key, State#chstate.meta) of - {ok, _} -> update_meta(Key, '$removed', State); - error -> State + {ok, _} -> update_meta(Key, '$removed', State); + error -> State end. %% @doc Return the current claimant. -spec claimant(State :: chstate()) -> node(). -claimant(#chstate{claimant=Claimant}) -> - Claimant. + +claimant(#chstate{claimant = Claimant}) -> Claimant. set_claimant(State, Claimant) -> - State#chstate{claimant=Claimant}. + State#chstate{claimant = Claimant}. %% @doc Returns the unique identifer for this cluster. -spec cluster_name(State :: chstate()) -> term(). -cluster_name(State) -> - State#chstate.clustername. + +cluster_name(State) -> State#chstate.clustername. %% @doc Sets the unique identifer for this cluster. set_cluster_name(State, Name) -> - State#chstate{clustername=Name}. + State#chstate{clustername = Name}. 
-reconcile_names(RingA=#chstate{clustername=NameA}, - RingB=#chstate{clustername=NameB}) -> +reconcile_names(RingA = #chstate{clustername = NameA}, + RingB = #chstate{clustername = NameB}) -> case (NameA =:= undefined) or (NameB =:= undefined) of - true -> - {RingA#chstate{clustername=undefined}, - RingB#chstate{clustername=undefined}}; - false -> - {RingA, RingB} + true -> + {RingA#chstate{clustername = undefined}, + RingB#chstate{clustername = undefined}}; + false -> {RingA, RingB} end. increment_vclock(Node, State) -> VClock = vclock:increment(Node, State#chstate.vclock), - State#chstate{vclock=VClock}. + State#chstate{vclock = VClock}. -ring_version(#chstate{rvsn=RVsn}) -> - RVsn. +ring_version(#chstate{rvsn = RVsn}) -> RVsn. increment_ring_version(Node, State) -> RVsn = vclock:increment(Node, State#chstate.rvsn), - State#chstate{rvsn=RVsn}. + State#chstate{rvsn = RVsn}. %% @doc Returns the current membership status for a node in the cluster. --spec member_status(chstate() | [node()], Node :: node()) -> member_status(). -member_status(#chstate{members=Members}, Node) -> +-spec member_status(chstate() | [node()], + Node :: node()) -> member_status(). + +member_status(#chstate{members = Members}, Node) -> member_status(Members, Node); member_status(Members, Node) -> case orddict:find(Node, Members) of - {ok, {Status, _, _}} -> - Status; - _ -> - invalid + {ok, {Status, _, _}} -> Status; + _ -> invalid end. %% @doc Returns the current membership status for all nodes in the cluster. --spec all_member_status(State :: chstate()) -> [{node(), member_status()}]. -all_member_status(#chstate{members=Members}) -> - [{Node, Status} || {Node, {Status, _VC, _}} <- Members, Status /= invalid]. +-spec all_member_status(State :: chstate()) -> [{node(), + member_status()}]. + +all_member_status(#chstate{members = Members}) -> + [{Node, Status} + || {Node, {Status, _VC, _}} <- Members, + Status /= invalid]. 
get_member_meta(State, Member, Key) -> case orddict:find(Member, State#chstate.members) of - error -> undefined; - {ok, {_, _, Meta}} -> - case orddict:find(Key, Meta) of - error -> - undefined; - {ok, Value} -> - Value - end + error -> undefined; + {ok, {_, _, Meta}} -> + case orddict:find(Key, Meta) of + error -> undefined; + {ok, Value} -> Value + end end. %% @doc Set a key in the member metadata orddict update_member_meta(Node, State, Member, Key, Val) -> VClock = vclock:increment(Node, State#chstate.vclock), - State2 = update_member_meta(Node, State, Member, Key, Val, same_vclock), - State2#chstate{vclock=VClock}. + State2 = update_member_meta(Node, State, Member, Key, + Val, same_vclock), + State2#chstate{vclock = VClock}. -update_member_meta(Node, State, Member, Key, Val, same_vclock) -> +update_member_meta(Node, State, Member, Key, Val, + same_vclock) -> Members = State#chstate.members, case orddict:is_key(Member, Members) of - true -> - Members2 = orddict:update(Member, - fun({Status, VC, MD}) -> - {Status, - vclock:increment(Node, VC), - orddict:store(Key, Val, MD)} - end, - Members), - State#chstate{members=Members2}; - false -> - State + true -> + Members2 = orddict:update(Member, + fun ({Status, VC, MD}) -> + {Status, vclock:increment(Node, VC), + orddict:store(Key, Val, MD)} + end, + Members), + State#chstate{members = Members2}; + false -> State end. clear_member_meta(Node, State, Member) -> Members = State#chstate.members, case orddict:is_key(Member, Members) of - true -> - Members2 = orddict:update(Member, - fun({Status, VC, _MD}) -> - {Status, - vclock:increment(Node, VC), - orddict:new()} - end, - Members), - State#chstate{members=Members2}; - false -> - State + true -> + Members2 = orddict:update(Member, + fun ({Status, VC, _MD}) -> + {Status, vclock:increment(Node, VC), + orddict:new()} + end, + Members), + State#chstate{members = Members2}; + false -> State end. 
add_member(PNode, State, Node) -> @@ -780,83 +775,99 @@ down_member(PNode, State, Node) -> set_member(Node, CState, Member, Status) -> VClock = vclock:increment(Node, CState#chstate.vclock), - CState2 = set_member(Node, CState, Member, Status, same_vclock), - CState2#chstate{vclock=VClock}. + CState2 = set_member(Node, CState, Member, Status, + same_vclock), + CState2#chstate{vclock = VClock}. set_member(Node, CState, Member, Status, same_vclock) -> Members2 = orddict:update(Member, - fun({_, VC, MD}) -> + fun ({_, VC, MD}) -> {Status, vclock:increment(Node, VC), MD} end, - {Status, vclock:increment(Node, - vclock:fresh()), []}, + {Status, vclock:increment(Node, vclock:fresh()), + []}, CState#chstate.members), - CState#chstate{members=Members2}. + CState#chstate{members = Members2}. %% @doc Return a list of all members of the cluster that are eligible to %% claim partitions. --spec claiming_members(State :: chstate()) -> [Node :: node()]. -claiming_members(#chstate{members=Members}) -> +-spec claiming_members(State :: chstate()) -> [Node :: + node()]. + +claiming_members(#chstate{members = Members}) -> get_members(Members, [joining, valid, down]). %% @doc Return a list of all members of the cluster that are marked as down. --spec down_members(State :: chstate()) -> [Node :: node()]. -down_members(#chstate{members=Members}) -> +-spec down_members(State :: chstate()) -> [Node :: + node()]. + +down_members(#chstate{members = Members}) -> get_members(Members, [down]). %% @doc Set the node that is responsible for a given chstate. --spec set_owner(State :: chstate(), Node :: node()) -> chstate(). +-spec set_owner(State :: chstate(), + Node :: node()) -> chstate(). + set_owner(State, Node) -> - State#chstate{nodename=Node}. + State#chstate{nodename = Node}. %% @doc Return all partition indices owned by a node. --spec indices(State :: chstate(), Node :: node()) -> [integer()]. +-spec indices(State :: chstate(), + Node :: node()) -> [integer()]. 
+ indices(State, Node) -> AllOwners = all_owners(State), [Idx || {Idx, Owner} <- AllOwners, Owner =:= Node]. %% @doc Return all partition indices that will be owned by a node after all %% pending ownership transfers have completed. --spec future_indices(State :: chstate(), Node :: node()) -> [integer()]. +-spec future_indices(State :: chstate(), + Node :: node()) -> [integer()]. + future_indices(State, Node) -> indices(future_ring(State), Node). --spec all_next_owners(chstate()) -> [{integer(), term()}]. +-spec all_next_owners(chstate()) -> [{integer(), + term()}]. + all_next_owners(CState) -> Next = riak_core_ring:pending_changes(CState), [{Idx, NextOwner} || {Idx, _, NextOwner, _, _} <- Next]. %% @private change_owners(CState, Reassign) -> - lists:foldl(fun({Idx, NewOwner}, CState0) -> + lists:foldl(fun ({Idx, NewOwner}, CState0) -> %% if called for indexes not in the current ring (during resizing) %% ignore the error try riak_core_ring:transfer_node(Idx, NewOwner, CState0) catch - error:{badmatch, _} -> CState0 + error:{badmatch, _} -> CState0 end - end, CState, Reassign). + end, + CState, Reassign). %% @doc Return all indices that a node is scheduled to give to another. disowning_indices(State, Node) -> case is_resizing(State) of - false -> - [Idx || {Idx, Owner, _NextOwner, _Mods, _Status} <- State#chstate.next, - Owner =:= Node]; - true -> - [Idx || {Idx, Owner} <- all_owners(State), - Owner =:= Node, - disowned_during_resize(State, Idx, Owner)] + false -> + [Idx + || {Idx, Owner, _NextOwner, _Mods, _Status} + <- State#chstate.next, + Owner =:= Node]; + true -> + [Idx + || {Idx, Owner} <- all_owners(State), Owner =:= Node, + disowned_during_resize(State, Idx, Owner)] end. 
disowned_during_resize(CState, Idx, Owner) -> %% catch error when index doesn't exist, we are disowning it if its going away - NextOwner = try future_owner(CState, Idx) - catch _:_ -> undefined + NextOwner = try future_owner(CState, Idx) catch + _:_ -> undefined end, case NextOwner of - Owner -> false; - _ -> true + Owner -> false; + _ -> true end. %% @doc Returns a list of all pending ownership transfers. @@ -865,16 +876,17 @@ pending_changes(State) -> State#chstate.next. set_pending_changes(State, Transfers) -> - State#chstate{next=Transfers}. + State#chstate{next = Transfers}. %% @doc Given a ring, `Resizing', that has been resized (and presumably rebalanced) %% schedule a resize transition for `Orig'. --spec set_pending_resize(chstate(), chstate()) -> chstate(). +-spec set_pending_resize(chstate(), + chstate()) -> chstate(). + set_pending_resize(Resizing, Orig) -> %% all existing indexes must transfer data when the ring is being resized - Next = [{Idx, Owner, '$resize', [], awaiting} || - {Idx, Owner} <- riak_core_ring:all_owners(Orig)], - + Next = [{Idx, Owner, '$resize', [], awaiting} + || {Idx, Owner} <- riak_core_ring:all_owners(Orig)], %% Whether or not the ring is shrinking or expanding, some %% ownership may be shared between the old and new ring. To prevent %% degenerate cases where partitions whose ownership does not @@ -882,7 +894,8 @@ set_pending_resize(Resizing, Orig) -> %% ignore on each subsequent transfer, we move them to the front %% of the next list which is treated as ordered. 
FutureOwners = riak_core_ring:all_owners(Resizing), - SortedNext = lists:sort(fun({Idx, Owner, _, _, _}, _) -> + SortedNext = lists:sort(fun ({Idx, Owner, _, _, _}, + _) -> %% we only need to check one element because the end result %% is the same as if we checked both: %% @@ -891,135 +904,174 @@ set_pending_resize(Resizing, Orig) -> %% false, false -> false %% false, true -> false lists:member({Idx, Owner}, FutureOwners) - end, Next), - + end, + Next), %% Resizing is assumed to have a modified chring, we need to put back %% the original chring to not install the resized one pre-emptively. The %% resized ring is stored in ring metadata for later use FutureCHash = chash(Resizing), ResetRing = set_chash(Resizing, chash(Orig)), - set_resized_ring(set_pending_changes(ResetRing, SortedNext), FutureCHash). + set_resized_ring(set_pending_changes(ResetRing, + SortedNext), + FutureCHash). + +-spec maybe_abort_resize(chstate()) -> {boolean(), + chstate()}. --spec maybe_abort_resize(chstate()) -> {boolean(), chstate()}. maybe_abort_resize(State) -> Resizing = is_resizing(State), PostResize = is_post_resize(State), PendingAbort = is_resize_aborted(State), - case PendingAbort andalso Resizing andalso not PostResize of - true -> - State1 = State#chstate{next=[]}, - State2 = clear_all_resize_transfers(State1), - State3 = remove_meta('$resized_ring_abort', State2), - {true, remove_meta('$resized_ring', State3)}; - false -> - {false, State} + case PendingAbort andalso + Resizing andalso not PostResize + of + true -> + State1 = State#chstate{next = []}, + State2 = clear_all_resize_transfers(State1), + State3 = remove_meta('$resized_ring_abort', State2), + {true, remove_meta('$resized_ring', State3)}; + false -> {false, State} end. - -spec set_pending_resize_abort(chstate()) -> chstate(). + set_pending_resize_abort(State) -> update_meta('$resized_ring_abort', true, State). -spec schedule_resize_transfer(chstate(), {integer(), term()}, integer() | {integer(), term()}) -> chstate(). 
-schedule_resize_transfer(State, Source, TargetIdx) when is_integer(TargetIdx) -> + +schedule_resize_transfer(State, Source, TargetIdx) + when is_integer(TargetIdx) -> TargetNode = index_owner(future_ring(State), TargetIdx), - schedule_resize_transfer(State, Source, {TargetIdx, TargetNode}); + schedule_resize_transfer(State, Source, + {TargetIdx, TargetNode}); schedule_resize_transfer(State, Source, Source) -> State; schedule_resize_transfer(State, Source, Target) -> Transfers = resize_transfers(State, Source), %% ignore if we have already scheduled a transfer from source -> target case lists:keymember(Target, 1, Transfers) of - true -> State; - false -> - Transfers1 = lists:keystore(Target, 1, Transfers, - {Target, ordsets:new(), awaiting}), - set_resize_transfers(State, Source, Transfers1) + true -> State; + false -> + Transfers1 = lists:keystore(Target, 1, Transfers, + {Target, ordsets:new(), awaiting}), + set_resize_transfers(State, Source, Transfers1) end. %% @doc reassign all outbound and inbound resize transfers from `Node' to `NewNode' --spec reschedule_resize_transfers(chstate(), term(), term()) -> chstate(). -reschedule_resize_transfers(State=#chstate{next=Next}, Node, NewNode) -> - {NewNext, NewState} = lists:mapfoldl( - fun(Entry, StateAcc) -> reschedule_resize_operation(Node, NewNode, - Entry, StateAcc) - end, - State, Next), - NewState#chstate{next=NewNext}. - -reschedule_resize_operation(N, NewNode, {Idx, N, '$resize', _Mods, _Status}, State) -> - NewEntry = {Idx, NewNode, '$resize', ordsets:new(), awaiting}, - NewState = reschedule_outbound_resize_transfers(State, Idx, N, NewNode), +-spec reschedule_resize_transfers(chstate(), term(), + term()) -> chstate(). + +reschedule_resize_transfers(State = #chstate{next = + Next}, + Node, NewNode) -> + {NewNext, NewState} = lists:mapfoldl(fun (Entry, + StateAcc) -> + reschedule_resize_operation(Node, + NewNode, + Entry, + StateAcc) + end, + State, Next), + NewState#chstate{next = NewNext}. 
+ +reschedule_resize_operation(N, NewNode, + {Idx, N, '$resize', _Mods, _Status}, State) -> + NewEntry = {Idx, NewNode, '$resize', ordsets:new(), + awaiting}, + NewState = reschedule_outbound_resize_transfers(State, + Idx, N, NewNode), {NewEntry, NewState}; reschedule_resize_operation(Node, NewNode, - {Idx, OtherNode, '$resize', _Mods, _Status}=Entry, + {Idx, OtherNode, '$resize', _Mods, _Status} = Entry, State) -> - {Changed, NewState} = reschedule_inbound_resize_transfers({Idx, OtherNode}, Node, - NewNode, State), + {Changed, NewState} = + reschedule_inbound_resize_transfers({Idx, OtherNode}, + Node, NewNode, State), case Changed of - true -> - NewEntry = {Idx, OtherNode, '$resize', ordsets:new(), awaiting}, - {NewEntry, NewState}; - false -> - {Entry, State} + true -> + NewEntry = {Idx, OtherNode, '$resize', ordsets:new(), + awaiting}, + {NewEntry, NewState}; + false -> {Entry, State} end. -reschedule_inbound_resize_transfers(Source, Node, NewNode, State) -> - F = fun(Transfer, Acc) -> - {NewXfer, NewAcc} = reschedule_inbound_resize_transfer(Transfer, Node, NewNode), +reschedule_inbound_resize_transfers(Source, Node, + NewNode, State) -> + F = fun (Transfer, Acc) -> + {NewXfer, NewAcc} = + reschedule_inbound_resize_transfer(Transfer, Node, + NewNode), {NewXfer, NewAcc orelse Acc} end, - {ResizeTransfers, Changed} = lists:mapfoldl(F, false, resize_transfers(State, Source)), - {Changed, set_resize_transfers(State, Source, ResizeTransfers)}. - -reschedule_inbound_resize_transfer({{Idx, Target}, _, _}, Target, NewNode) -> + {ResizeTransfers, Changed} = lists:mapfoldl(F, false, + resize_transfers(State, + Source)), + {Changed, + set_resize_transfers(State, Source, ResizeTransfers)}. + +reschedule_inbound_resize_transfer({{Idx, Target}, _, + _}, + Target, NewNode) -> {{{Idx, NewNode}, ordsets:new(), awaiting}, true}; reschedule_inbound_resize_transfer(Transfer, _, _) -> {Transfer, false}. 
-reschedule_outbound_resize_transfers(State, Idx, Node, NewNode) -> +reschedule_outbound_resize_transfers(State, Idx, Node, + NewNode) -> OldSource = {Idx, Node}, NewSource = {Idx, NewNode}, Transfers = resize_transfers(State, OldSource), - F = fun({I,N}) when N =:= Node -> {I,NewNode}; - (T) -> T + F = fun ({I, N}) when N =:= Node -> {I, NewNode}; + (T) -> T end, - NewTransfers = [{F(Target), ordsets:new(), awaiting} || {Target, _, _} <- Transfers], - set_resize_transfers(clear_resize_transfers(OldSource, State), NewSource, NewTransfers). + NewTransfers = [{F(Target), ordsets:new(), awaiting} + || {Target, _, _} <- Transfers], + set_resize_transfers(clear_resize_transfers(OldSource, + State), + NewSource, NewTransfers). %% @doc returns the first awaiting resize_transfer for a {SourceIdx, SourceNode} %% pair. If all transfers for the pair are complete, undefined is returned --spec awaiting_resize_transfer(chstate(), {integer(), term()}, atom()) -> - {integer(), term()} | undefined. +-spec awaiting_resize_transfer(chstate(), + {integer(), term()}, atom()) -> {integer(), + term()} | + undefined. + awaiting_resize_transfer(State, Source, Mod) -> ResizeTransfers = resize_transfers(State, Source), - Awaiting = [{Target, Mods, Status} || {Target, Mods, Status} <- ResizeTransfers, - Status =/= complete, - not ordsets:is_element(Mod, Mods)], + Awaiting = [{Target, Mods, Status} + || {Target, Mods, Status} <- ResizeTransfers, + Status =/= complete, not ordsets:is_element(Mod, Mods)], case Awaiting of - [] -> undefined; - [{Target, _, _} | _] -> Target + [] -> undefined; + [{Target, _, _} | _] -> Target end. %% @doc return the status of a resize_transfer for `Source' (an index-node pair). undefined %% is returned if no such transfer is scheduled. complete is returned if the transfer %% is marked as such or `Mod' is contained in the completed modules set. 
awaiting is %% returned otherwise --spec resize_transfer_status(chstate(), {integer(), term()}, {integer(), term()}, atom()) -> - awaiting | complete | undefined. +-spec resize_transfer_status(chstate(), + {integer(), term()}, {integer(), term()}, + atom()) -> awaiting | complete | undefined. + resize_transfer_status(State, Source, Target, Mod) -> ResizeTransfers = resize_transfers(State, Source), - IsComplete = case lists:keyfind(Target, 1, ResizeTransfers) of - false -> undefined; - {Target, _, complete} -> true; - {Target, Mods, awaiting} -> ordsets:is_element(Mod, Mods) + IsComplete = case lists:keyfind(Target, 1, + ResizeTransfers) + of + false -> undefined; + {Target, _, complete} -> true; + {Target, Mods, awaiting} -> + ordsets:is_element(Mod, Mods) end, case IsComplete of - true -> complete; - false -> awaiting; - undefined -> undefined + true -> complete; + false -> awaiting; + undefined -> undefined end. %% @doc mark a resize_transfer from `Source' to `Target' for `Mod' complete. @@ -1028,75 +1080,94 @@ resize_transfer_status(State, Source, Target, Mod) -> %% for `Source' that need to be started to be scheduled before calling %% this fuction -spec resize_transfer_complete(chstate(), - {integer(),term()}, - {integer(),term()}, + {integer(), term()}, {integer(), term()}, atom()) -> chstate(). 
-resize_transfer_complete(State, {SrcIdx, _}=Source, Target, Mod) -> + +resize_transfer_complete(State, {SrcIdx, _} = Source, + Target, Mod) -> ResizeTransfers = resize_transfers(State, Source), Transfer = lists:keyfind(Target, 1, ResizeTransfers), case Transfer of - {Target, Mods, Status} -> - VNodeMods = - ordsets:from_list([VMod || {_, VMod} <- riak_core:vnode_modules()]), - Mods2 = ordsets:add_element(Mod, Mods), - Status2 = case {Status, Mods2} of - {complete, _} -> complete; - {awaiting, VNodeMods} -> complete; - _ -> awaiting - end, - ResizeTransfers2 = lists:keyreplace(Target, 1, ResizeTransfers, - {Target, Mods2, Status2}), - State1 = set_resize_transfers(State, Source, ResizeTransfers2), - AllComplete = lists:all(fun({_, _, complete}) -> true; - ({_, Ms, awaiting}) -> ordsets:is_element(Mod, Ms) - end, ResizeTransfers2), - case AllComplete of - true -> - transfer_complete(State1, SrcIdx, Mod); - false -> State1 - end; - _ -> State + {Target, Mods, Status} -> + VNodeMods = ordsets:from_list([VMod + || {_, VMod} + <- riak_core:vnode_modules()]), + Mods2 = ordsets:add_element(Mod, Mods), + Status2 = case {Status, Mods2} of + {complete, _} -> complete; + {awaiting, VNodeMods} -> complete; + _ -> awaiting + end, + ResizeTransfers2 = lists:keyreplace(Target, 1, + ResizeTransfers, + {Target, Mods2, Status2}), + State1 = set_resize_transfers(State, Source, + ResizeTransfers2), + AllComplete = lists:all(fun ({_, _, complete}) -> true; + ({_, Ms, awaiting}) -> + ordsets:is_element(Mod, Ms) + end, + ResizeTransfers2), + case AllComplete of + true -> transfer_complete(State1, SrcIdx, Mod); + false -> State1 + end; + _ -> State end. -spec is_resizing(chstate()) -> boolean(). + is_resizing(State) -> case resized_ring(State) of - undefined -> false; - {ok, _} -> true + undefined -> false; + {ok, _} -> true end. -spec is_post_resize(chstate()) -> boolean(). 
+ is_post_resize(State) -> case get_meta('$resized_ring', State) of - {ok, '$cleanup'} -> true; - _ -> false + {ok, '$cleanup'} -> true; + _ -> false end. -spec is_resize_aborted(chstate()) -> boolean(). + is_resize_aborted(State) -> case get_meta('$resized_ring_abort', State) of - {ok, true} -> true; - _ -> false + {ok, true} -> true; + _ -> false end. -spec is_resize_complete(chstate()) -> boolean(). -is_resize_complete(#chstate{next=Next}) -> - not lists:any(fun({_, _, _, _, awaiting}) -> true; - ({_, _, _, _, complete}) -> false - end, - Next). --spec complete_resize_transfers(chstate(), {integer(),term()}, atom()) -> [{integer(),term()}]. +is_resize_complete(#chstate{next = Next}) -> + not + lists:any(fun ({_, _, _, _, awaiting}) -> true; + ({_, _, _, _, complete}) -> false + end, + Next). + +-spec complete_resize_transfers(chstate(), + {integer(), term()}, atom()) -> [{integer(), + term()}]. + complete_resize_transfers(State, Source, Mod) -> - [Target || {Target, Mods, Status} <- resize_transfers(State, Source), - Status =:= complete orelse ordsets:is_element(Mod, Mods)]. + [Target + || {Target, Mods, Status} + <- resize_transfers(State, Source), + Status =:= complete orelse + ordsets:is_element(Mod, Mods)]. + +-spec deletion_complete(chstate(), integer(), + atom()) -> chstate(). --spec deletion_complete(chstate(), integer(), atom()) -> chstate(). deletion_complete(State, Idx, Mod) -> transfer_complete(State, Idx, Mod). --spec resize_transfers(chstate(), {integer(), term()}) -> - [resize_transfer()]. +-spec resize_transfers(chstate(), + {integer(), term()}) -> [resize_transfer()]. + resize_transfers(State, Source) -> {ok, Transfers} = get_meta({resize, Source}, [], State), Transfers. @@ -1104,114 +1175,117 @@ resize_transfers(State, Source) -> -spec set_resize_transfers(chstate(), {integer(), term()}, [resize_transfer()]) -> chstate(). + set_resize_transfers(State, Source, Transfers) -> update_meta({resize, Source}, Transfers, State). 
clear_all_resize_transfers(State) -> - lists:foldl(fun clear_resize_transfers/2, State, all_owners(State)). + lists:foldl(fun clear_resize_transfers/2, State, + all_owners(State)). clear_resize_transfers(Source, State) -> remove_meta({resize, Source}, State). --spec resized_ring(chstate()) -> {ok, chash:chash()} | undefined. +-spec resized_ring(chstate()) -> {ok, chash:chash()} | + undefined. + resized_ring(State) -> case get_meta('$resized_ring', State) of - {ok, '$cleanup'} -> {ok, State#chstate.chring}; - {ok, CHRing} -> {ok, CHRing}; - _ -> undefined + {ok, '$cleanup'} -> {ok, State#chstate.chring}; + {ok, CHRing} -> {ok, CHRing}; + _ -> undefined end. --spec set_resized_ring(chstate(), chash:chash()) -> chstate(). +-spec set_resized_ring(chstate(), + chash:chash()) -> chstate(). + set_resized_ring(State, FutureCHash) -> update_meta('$resized_ring', FutureCHash, State). cleanup_after_resize(State) -> update_meta('$resized_ring', '$cleanup', State). +-spec vnode_type(chstate(), integer()) -> primary | + {fallback, term()} | future_primary | + resized_primary. --spec vnode_type(chstate(),integer()) -> primary | - {fallback, term()} | - future_primary | - resized_primary. vnode_type(State, Idx) -> vnode_type(State, Idx, node()). vnode_type(State, Idx, Node) -> try index_owner(State, Idx) of - Node -> - primary; - Owner -> - case next_owner(State, Idx) of - {_, Node, _} -> - future_primary; - _ -> - {fallback, Owner} - end + Node -> primary; + Owner -> + case next_owner(State, Idx) of + {_, Node, _} -> future_primary; + _ -> {fallback, Owner} + end catch - error:{badmatch, _} -> - %% idx doesn't exist so must be an index in a resized ring - resized_primary + error:{badmatch, _} -> + %% idx doesn't exist so must be an index in a resized ring + resized_primary end. %% @doc Return details for a pending partition ownership change. --spec next_owner(State :: chstate(), Idx :: integer()) -> pending_change(). 
+-spec next_owner(State :: chstate(), + Idx :: integer()) -> pending_change(). + next_owner(State, Idx) -> case lists:keyfind(Idx, 1, State#chstate.next) of - false -> - {undefined, undefined, undefined}; - NInfo -> - next_owner(NInfo) + false -> {undefined, undefined, undefined}; + NInfo -> next_owner(NInfo) end. %% @doc Return details for a pending partition ownership change. -spec next_owner(State :: chstate(), Idx :: integer(), Mod :: module()) -> pending_change(). + next_owner(State, Idx, Mod) -> NInfo = lists:keyfind(Idx, 1, State#chstate.next), next_owner_status(NInfo, Mod). next_owner_status(NInfo, Mod) -> case NInfo of - false -> - {undefined, undefined, undefined}; - {_, Owner, NextOwner, _Transfers, complete} -> - {Owner, NextOwner, complete}; - {_, Owner, NextOwner, Transfers, _Status} -> - case ordsets:is_element(Mod, Transfers) of - true -> - {Owner, NextOwner, complete}; - false -> - {Owner, NextOwner, awaiting} - end + false -> {undefined, undefined, undefined}; + {_, Owner, NextOwner, _Transfers, complete} -> + {Owner, NextOwner, complete}; + {_, Owner, NextOwner, Transfers, _Status} -> + case ordsets:is_element(Mod, Transfers) of + true -> {Owner, NextOwner, complete}; + false -> {Owner, NextOwner, awaiting} + end end. %% @private next_owner({_, Owner, NextOwner, _Transfers, Status}) -> {Owner, NextOwner, Status}. -completed_next_owners(Mod, #chstate{next=Next}) -> - [{Idx, O, NO} || NInfo={Idx, _, _, _, _} <- Next, - {O, NO, complete} <- [next_owner_status(NInfo, Mod)]]. +completed_next_owners(Mod, #chstate{next = Next}) -> + [{Idx, O, NO} + || NInfo = {Idx, _, _, _, _} <- Next, + {O, NO, complete} <- [next_owner_status(NInfo, Mod)]]. %% @doc Returns true if all cluster members have seen the current ring. -spec ring_ready(State :: chstate()) -> boolean(). 
+ ring_ready(State0) -> check_tainted(State0, - "Error: riak_core_ring/ring_ready called on tainted ring"), + "Error: riak_core_ring/ring_ready called " + "on tainted ring"), Owner = owner_node(State0), State = update_seen(Owner, State0), Seen = State#chstate.seen, - Members = get_members(State#chstate.members, [valid, leaving, exiting]), + Members = get_members(State#chstate.members, + [valid, leaving, exiting]), VClock = State#chstate.vclock, R = [begin - case orddict:find(Node, Seen) of - error -> - false; - {ok, VC} -> - vclock:equal(VClock, VC) - end - end || Node <- Members], - Ready = lists:all(fun(X) -> X =:= true end, R), + case orddict:find(Node, Seen) of + error -> false; + {ok, VC} -> vclock:equal(VClock, VC) + end + end + || Node <- Members], + Ready = lists:all(fun (X) -> X =:= true end, R), Ready. ring_ready() -> @@ -1222,79 +1296,86 @@ ring_ready_info(State0) -> Owner = owner_node(State0), State = update_seen(Owner, State0), Seen = State#chstate.seen, - Members = get_members(State#chstate.members, [valid, leaving, exiting]), - RecentVC = - orddict:fold(fun(_, VC, Recent) -> - case vclock:descends(VC, Recent) of - true -> - VC; - false -> - Recent - end - end, State#chstate.vclock, Seen), - Outdated = - orddict:filter(fun(Node, VC) -> - (not vclock:equal(VC, RecentVC)) - and lists:member(Node, Members) - end, Seen), + Members = get_members(State#chstate.members, + [valid, leaving, exiting]), + RecentVC = orddict:fold(fun (_, VC, Recent) -> + case vclock:descends(VC, Recent) of + true -> VC; + false -> Recent + end + end, + State#chstate.vclock, Seen), + Outdated = orddict:filter(fun (Node, VC) -> + not vclock:equal(VC, RecentVC) and + lists:member(Node, Members) + end, + Seen), Outdated. %% @doc Marks a pending transfer as completed. --spec handoff_complete(State :: chstate(), Idx :: integer(), - Mod :: module()) -> chstate(). +-spec handoff_complete(State :: chstate(), + Idx :: integer(), Mod :: module()) -> chstate(). 
+ handoff_complete(State, Idx, Mod) -> transfer_complete(State, Idx, Mod). ring_changed(Node, State) -> check_tainted(State, - "Error: riak_core_ring/ring_changed called on tainted ring"), + "Error: riak_core_ring/ring_changed called " + "on tainted ring"), internal_ring_changed(Node, State). %% @doc Return the ring that will exist after all pending ownership transfers %% have completed. -spec future_ring(chstate()) -> chstate(). + future_ring(State) -> future_ring(State, is_resizing(State)). future_ring(State, false) -> - FutureState = change_owners(State, all_next_owners(State)), + FutureState = change_owners(State, + all_next_owners(State)), %% Individual nodes will move themselves from leaving to exiting if they %% have no ring ownership, this is implemented in riak_core_ring_handler. %% Emulate it here to return similar ring. - Leaving = get_members(FutureState#chstate.members, [leaving]), - FutureState2 = - lists:foldl(fun(Node, StateAcc) -> - case indices(StateAcc, Node) of - [] -> - riak_core_ring:exit_member(Node, StateAcc, Node); - _ -> - StateAcc - end - end, FutureState, Leaving), - FutureState2#chstate{next=[]}; -future_ring(State0=#chstate{next=OldNext}, true) -> - case is_post_resize(State0) of - false -> - {ok, FutureCHash} = resized_ring(State0), - State1 = cleanup_after_resize(State0), - State2 = clear_all_resize_transfers(State1), - Resized = State2#chstate{chring=FutureCHash}, - Next = lists:foldl(fun({Idx, Owner, '$resize', _, _}, Acc) -> - DeleteEntry = {Idx, Owner, '$delete', [], awaiting}, - %% catch error when index doesn't exist in new ring - try index_owner(Resized, Idx) of - Owner -> Acc; - _ -> [DeleteEntry | Acc] - catch - error:{badmatch, _} -> [DeleteEntry | Acc] + Leaving = get_members(FutureState#chstate.members, + [leaving]), + FutureState2 = lists:foldl(fun (Node, StateAcc) -> + case indices(StateAcc, Node) of + [] -> + riak_core_ring:exit_member(Node, + StateAcc, + Node); + _ -> StateAcc end end, - [], - OldNext), - 
Resized#chstate{next=Next}; - true -> - State1 = remove_meta('$resized_ring', State0), - State1#chstate{next=[]} + FutureState, Leaving), + FutureState2#chstate{next = []}; +future_ring(State0 = #chstate{next = OldNext}, true) -> + case is_post_resize(State0) of + false -> + {ok, FutureCHash} = resized_ring(State0), + State1 = cleanup_after_resize(State0), + State2 = clear_all_resize_transfers(State1), + Resized = State2#chstate{chring = FutureCHash}, + Next = lists:foldl(fun ({Idx, Owner, '$resize', _, _}, + Acc) -> + DeleteEntry = {Idx, Owner, '$delete', [], + awaiting}, + %% catch error when index doesn't exist in new ring + try index_owner(Resized, Idx) of + Owner -> Acc; + _ -> [DeleteEntry | Acc] + catch + error:{badmatch, _} -> + [DeleteEntry | Acc] + end + end, + [], OldNext), + Resized#chstate{next = Next}; + true -> + State1 = remove_meta('$resized_ring', State0), + State1#chstate{next = []} end. pretty_print(Ring, Opts) -> @@ -1302,56 +1383,55 @@ pretty_print(Ring, Opts) -> OptLegend = lists:member(legend, Opts), Out = proplists:get_value(out, Opts, standard_io), TargetN = proplists:get_value(target_n, Opts, - application:get_env(riak_core, target_n_val, undefined)), - + application:get_env(riak_core, target_n_val, + undefined)), Owners = riak_core_ring:all_members(Ring), Indices = riak_core_ring:all_owners(Ring), RingSize = length(Indices), - Numeric = OptNumeric orelse (length(Owners) > 26), + Numeric = OptNumeric orelse length(Owners) > 26, case Numeric of - true -> - Ids = [integer_to_list(N) || N <- lists:seq(1, length(Owners))]; - false -> - Ids = [[Letter] || Letter <- lists:seq(97, 96+length(Owners))] + true -> + Ids = [integer_to_list(N) + || N <- lists:seq(1, length(Owners))]; + false -> + Ids = [[Letter] + || Letter <- lists:seq(97, 96 + length(Owners))] end, Names = lists:zip(Owners, Ids), case OptLegend of - true -> - io:format(Out, "~36..=s Nodes ~36..=s~n", ["", ""]), - _ = [begin - NodeIndices = [Idx || {Idx,Owner} <- Indices, - Owner =:= 
Node], + true -> + io:format(Out, "~36..=s Nodes ~36..=s~n", ["", ""]), + _ = [begin + NodeIndices = [Idx + || {Idx, Owner} <- Indices, Owner =:= Node], RingPercent = length(NodeIndices) * 100 / RingSize, io:format(Out, "Node ~s: ~w (~5.1f%) ~s~n", [Name, length(NodeIndices), RingPercent, Node]) - end || {Node, Name} <- Names], - io:format(Out, "~36..=s Ring ~37..=s~n", ["", ""]); - false -> - ok + end + || {Node, Name} <- Names], + io:format(Out, "~36..=s Ring ~37..=s~n", ["", ""]); + false -> ok end, - case Numeric of - true -> - Ownership = - [orddict:fetch(Owner, Names) || {_Idx, Owner} <- Indices], - io:format(Out, "~p~n", [Ownership]); - false -> - lists:foldl(fun({_, Owner}, N) -> - Name = orddict:fetch(Owner, Names), - case N rem TargetN of - 0 -> - io:format(Out, "~s|", [[Name]]); - _ -> - io:format(Out, "~s", [[Name]]) - end, - N+1 - end, 1, Indices), - io:format(Out, "~n", []) + true -> + Ownership = [orddict:fetch(Owner, Names) + || {_Idx, Owner} <- Indices], + io:format(Out, "~p~n", [Ownership]); + false -> + lists:foldl(fun ({_, Owner}, N) -> + Name = orddict:fetch(Owner, Names), + case N rem TargetN of + 0 -> io:format(Out, "~s|", [[Name]]); + _ -> io:format(Out, "~s", [[Name]]) + end, + N + 1 + end, + 1, Indices), + io:format(Out, "~n", []) end. %% @doc Return a ring with all transfers cancelled - for claim sim -cancel_transfers(Ring) -> - Ring#chstate{next=[]}. +cancel_transfers(Ring) -> Ring#chstate{next = []}. %% ==================================================================== %% Internal functions @@ -1361,23 +1441,26 @@ cancel_transfers(Ring) -> internal_ring_changed(Node, CState0) -> CState = update_seen(Node, CState0), case ring_ready(CState) of - false -> - CState; - true -> - riak_core_claimant:ring_changed(Node, CState) + false -> CState; + true -> riak_core_claimant:ring_changed(Node, CState) end. 
%% @private -merge_meta({N1,M1}, {N2,M2}) -> - Meta = dict:merge(fun(_,D1,D2) -> pick_val({N1,D1}, {N2,D2}) end, M1, M2), +merge_meta({N1, M1}, {N2, M2}) -> + Meta = dict:merge(fun (_, D1, D2) -> + pick_val({N1, D1}, {N2, D2}) + end, + M1, M2), log_meta_merge(M1, M2, Meta), Meta. %% @private -pick_val({N1,M1}, {N2,M2}) -> - case {M1#meta_entry.lastmod, N1} > {M2#meta_entry.lastmod, N2} of - true -> M1; - false -> M2 +pick_val({N1, M1}, {N2, M2}) -> + case {M1#meta_entry.lastmod, N1} > + {M2#meta_entry.lastmod, N2} + of + true -> M1; + false -> M2 end. %% @private @@ -1391,9 +1474,11 @@ log_meta_merge(M1, M2, Meta) -> %% Log result of a ring reconcile. In the case of ring churn, %% subsequent log messages will allow us to track ring versions. %% Handle legacy rings as well. -log_ring_result(#chstate{vclock=V,members=Members,next=Next}) -> - logger:debug("Updated ring vclock: ~p, Members: ~p, Next: ~p", - [V, Members, Next]). +log_ring_result(#chstate{vclock = V, members = Members, + next = Next}) -> + logger:debug("Updated ring vclock: ~p, Members: ~p, " + "Next: ~p", + [V, Members, Next]). %% @private internal_reconcile(State, OtherState) -> @@ -1401,10 +1486,9 @@ internal_reconcile(State, OtherState) -> State2 = update_seen(VNode, State), OtherState2 = update_seen(VNode, OtherState), Seen = reconcile_seen(State2, OtherState2), - State3 = State2#chstate{seen=Seen}, - OtherState3 = OtherState2#chstate{seen=Seen}, + State3 = State2#chstate{seen = Seen}, + OtherState3 = OtherState2#chstate{seen = Seen}, SeenChanged = not equal_seen(State, State3), - %% Try to reconcile based on vector clock, chosing the most recent state. VC1 = State3#chstate.vclock, VC2 = OtherState3#chstate.vclock, @@ -1413,45 +1497,50 @@ internal_reconcile(State, OtherState) -> %% merge to be deterministic here, hence the additional logic. 
VMerge1 = vclock:merge([VC1, VC2]), VMerge2 = vclock:merge([VC2, VC1]), - case {vclock:equal(VMerge1, VMerge2), VMerge1 < VMerge2} of - {true, _} -> - VC3 = VMerge1; - {_, true} -> - VC3 = VMerge1; - {_, false} -> - VC3 = VMerge2 + case {vclock:equal(VMerge1, VMerge2), VMerge1 < VMerge2} + of + {true, _} -> VC3 = VMerge1; + {_, true} -> VC3 = VMerge1; + {_, false} -> VC3 = VMerge2 end, - Newer = vclock:descends(VC1, VC2), Older = vclock:descends(VC2, VC1), Equal = equal_cstate(State3, OtherState3), case {Equal, Newer, Older} of - {_, true, false} -> - {SeenChanged, State3#chstate{vclock=VC3}}; - {_, false, true} -> - {true, OtherState3#chstate{nodename=VNode, vclock=VC3}}; - {true, _, _} -> - {SeenChanged, State3#chstate{vclock=VC3}}; - {_, true, true} -> - %% Exceptional condition that should only occur during - %% rolling upgrades and manual setting of the ring. - %% Merge as a divergent case. - State4 = reconcile_divergent(VNode, State3, OtherState3), - {true, State4#chstate{nodename=VNode}}; - {_, false, false} -> - %% Unable to reconcile based on vector clock, merge rings. - State4 = reconcile_divergent(VNode, State3, OtherState3), - {true, State4#chstate{nodename=VNode}} + {_, true, false} -> + {SeenChanged, State3#chstate{vclock = VC3}}; + {_, false, true} -> + {true, + OtherState3#chstate{nodename = VNode, vclock = VC3}}; + {true, _, _} -> + {SeenChanged, State3#chstate{vclock = VC3}}; + {_, true, true} -> + %% Exceptional condition that should only occur during + %% rolling upgrades and manual setting of the ring. + %% Merge as a divergent case. + State4 = reconcile_divergent(VNode, State3, + OtherState3), + {true, State4#chstate{nodename = VNode}}; + {_, false, false} -> + %% Unable to reconcile based on vector clock, merge rings. + State4 = reconcile_divergent(VNode, State3, + OtherState3), + {true, State4#chstate{nodename = VNode}} end. 
%% @private reconcile_divergent(VNode, StateA, StateB) -> - VClock = vclock:increment(VNode, vclock:merge([StateA#chstate.vclock, - StateB#chstate.vclock])), + VClock = vclock:increment(VNode, + vclock:merge([StateA#chstate.vclock, + StateB#chstate.vclock])), Members = reconcile_members(StateA, StateB), - Meta = merge_meta({StateA#chstate.nodename, StateA#chstate.meta}, {StateB#chstate.nodename, StateB#chstate.meta}), - NewState = reconcile_ring(StateA, StateB, get_members(Members)), - NewState1 = NewState#chstate{vclock=VClock, members=Members, meta=Meta}, + Meta = merge_meta({StateA#chstate.nodename, + StateA#chstate.meta}, + {StateB#chstate.nodename, StateB#chstate.meta}), + NewState = reconcile_ring(StateA, StateB, + get_members(Members)), + NewState1 = NewState#chstate{vclock = VClock, + members = Members, meta = Meta}, log_ring_result(NewState1), NewState1. @@ -1459,50 +1548,50 @@ reconcile_divergent(VNode, StateA, StateB) -> %% @doc Merge two members list using status vector clocks when possible, %% and falling back to manual merge for divergent cases. reconcile_members(StateA, StateB) -> - orddict:merge( - fun(_K, {Valid1, VC1, Meta1}, {Valid2, VC2, Meta2}) -> - New1 = vclock:descends(VC1, VC2), - New2 = vclock:descends(VC2, VC1), - MergeVC = vclock:merge([VC1, VC2]), - case {New1, New2} of - {true, false} -> - MergeMeta = lists:ukeysort(1, Meta1 ++ Meta2), - {Valid1, MergeVC, MergeMeta}; - {false, true} -> - MergeMeta = lists:ukeysort(1, Meta2 ++ Meta1), - {Valid2, MergeVC, MergeMeta}; - {_, _} -> - MergeMeta = lists:ukeysort(1, Meta1 ++ Meta2), - {merge_status(Valid1, Valid2), MergeVC, MergeMeta} - end - end, - StateA#chstate.members, - StateB#chstate.members). 
+ orddict:merge(fun (_K, {Valid1, VC1, Meta1}, + {Valid2, VC2, Meta2}) -> + New1 = vclock:descends(VC1, VC2), + New2 = vclock:descends(VC2, VC1), + MergeVC = vclock:merge([VC1, VC2]), + case {New1, New2} of + {true, false} -> + MergeMeta = lists:ukeysort(1, Meta1 ++ Meta2), + {Valid1, MergeVC, MergeMeta}; + {false, true} -> + MergeMeta = lists:ukeysort(1, Meta2 ++ Meta1), + {Valid2, MergeVC, MergeMeta}; + {_, _} -> + MergeMeta = lists:ukeysort(1, Meta1 ++ Meta2), + {merge_status(Valid1, Valid2), MergeVC, + MergeMeta} + end + end, + StateA#chstate.members, StateB#chstate.members). %% @private reconcile_seen(StateA, StateB) -> - orddict:merge(fun(_, VC1, VC2) -> + orddict:merge(fun (_, VC1, VC2) -> vclock:merge([VC1, VC2]) - end, StateA#chstate.seen, StateB#chstate.seen). + end, + StateA#chstate.seen, StateB#chstate.seen). %% @private -merge_next_status(complete, _) -> - complete; -merge_next_status(_, complete) -> - complete; -merge_next_status(awaiting, awaiting) -> - awaiting. +merge_next_status(complete, _) -> complete; +merge_next_status(_, complete) -> complete; +merge_next_status(awaiting, awaiting) -> awaiting. %% @private %% @doc Merge two next lists that must be of the same size and have %% the same Idx/Owner pair. reconcile_next(Next1, Next2) -> - lists:zipwith(fun({Idx, Owner, Node, Transfers1, Status1}, - {Idx, Owner, Node, Transfers2, Status2}) -> + lists:zipwith(fun ({Idx, Owner, Node, Transfers1, + Status1}, + {Idx, Owner, Node, Transfers2, Status2}) -> {Idx, Owner, Node, ordsets:union(Transfers1, Transfers2), merge_next_status(Status1, Status2)} - end, Next1, Next2). + end, + Next1, Next2). %% @private %% @doc Merge two next lists that may be of different sizes and @@ -1511,177 +1600,175 @@ reconcile_next(Next1, Next2) -> %% the merge is the same as in reconcile_next/2. 
reconcile_divergent_next(BaseNext, OtherNext) -> MergedNext = substitute(1, BaseNext, OtherNext), - lists:zipwith(fun({Idx, Owner1, Node1, Transfers1, Status1}, - {Idx, Owner2, Node2, Transfers2, Status2}) -> - Same = ({Owner1, Node1} =:= {Owner2, Node2}), + lists:zipwith(fun ({Idx, Owner1, Node1, Transfers1, + Status1}, + {Idx, Owner2, Node2, Transfers2, Status2}) -> + Same = {Owner1, Node1} =:= {Owner2, Node2}, case {Same, Status1, Status2} of - {false, _, _} -> - {Idx, Owner1, Node1, Transfers1, Status1}; - _ -> - {Idx, Owner1, Node1, - ordsets:union(Transfers1, Transfers2), - merge_next_status(Status1, Status2)} + {false, _, _} -> + {Idx, Owner1, Node1, Transfers1, Status1}; + _ -> + {Idx, Owner1, Node1, + ordsets:union(Transfers1, Transfers2), + merge_next_status(Status1, Status2)} end - end, BaseNext, MergedNext). + end, + BaseNext, MergedNext). %% @private substitute(Idx, TL1, TL2) -> - lists:map(fun(T) -> + lists:map(fun (T) -> Key = element(Idx, T), case lists:keyfind(Key, Idx, TL2) of - false -> - T; - T2 -> - T2 + false -> T; + T2 -> T2 end - end, TL1). + end, + TL1). %% @private -reconcile_ring(StateA=#chstate{claimant=Claimant1, rvsn=VC1, next=Next1}, - StateB=#chstate{claimant=Claimant2, rvsn=VC2, next=Next2}, +reconcile_ring(StateA = #chstate{claimant = Claimant1, + rvsn = VC1, next = Next1}, + StateB = #chstate{claimant = Claimant2, rvsn = VC2, + next = Next2}, Members) -> %% Try to reconcile based on the ring version (rvsn) vector clock. 
V1Newer = vclock:descends(VC1, VC2), V2Newer = vclock:descends(VC2, VC1), - EqualVC = (vclock:equal(VC1, VC2) and (Claimant1 =:= Claimant2)), + EqualVC = vclock:equal(VC1, VC2) and + (Claimant1 =:= Claimant2), case {EqualVC, V1Newer, V2Newer} of - {true, _, _} -> - Next = reconcile_next(Next1, Next2), - StateA#chstate{next=Next}; - {_, true, false} -> - Next = reconcile_divergent_next(Next1, Next2), - StateA#chstate{next=Next}; - {_, false, true} -> - Next = reconcile_divergent_next(Next2, Next1), - StateB#chstate{next=Next}; - {_, _, _} -> - %% Ring versions were divergent, so fall back to reconciling based - %% on claimant. Under normal operation, divergent ring versions - %% should only occur if there are two different claimants, and one - %% claimant is invalid. For example, when a claimant is removed and - %% a new claimant has just taken over. We therefore chose the ring - %% with the valid claimant. - CValid1 = lists:member(Claimant1, Members), - CValid2 = lists:member(Claimant2, Members), - case {CValid1, CValid2} of - {true, false} -> - Next = reconcile_divergent_next(Next1, Next2), - StateA#chstate{next=Next}; - {false, true} -> - Next = reconcile_divergent_next(Next2, Next1), - StateB#chstate{next=Next}; - {false, false} -> - %% This can occur when removed/down nodes are still - %% up and gossip to each other. We need to pick a - %% claimant to handle this case, although the choice - %% is irrelevant as a correct valid claimant will - %% eventually emerge when the ring converges. - case Claimant1 < Claimant2 of - true -> - Next = reconcile_divergent_next(Next1, Next2), - StateA#chstate{next=Next}; - false -> - Next = reconcile_divergent_next(Next2, Next1), - StateB#chstate{next=Next} - end; - {true, true} -> - %% This should never happen in normal practice. - %% But, we need to handle it for exceptional cases. 
-                    case Claimant1 < Claimant2 of
-                        true ->
-                            Next = reconcile_divergent_next(Next1, Next2),
-                            StateA#chstate{next=Next};
-                        false ->
-                            Next = reconcile_divergent_next(Next2, Next1),
-                            StateB#chstate{next=Next}
-                    end
-            end
+      {true, _, _} ->
+	  Next = reconcile_next(Next1, Next2),
+	  StateA#chstate{next = Next};
+      {_, true, false} ->
+	  Next = reconcile_divergent_next(Next1, Next2),
+	  StateA#chstate{next = Next};
+      {_, false, true} ->
+	  Next = reconcile_divergent_next(Next2, Next1),
+	  StateB#chstate{next = Next};
+      {_, _, _} ->
+	  %% Ring versions were divergent, so fall back to reconciling based
+	  %% on claimant. Under normal operation, divergent ring versions
+	  %% should only occur if there are two different claimants, and one
+	  %% claimant is invalid. For example, when a claimant is removed and
+	  %% a new claimant has just taken over. We therefore choose the ring
+	  %% with the valid claimant.
+	  CValid1 = lists:member(Claimant1, Members),
+	  CValid2 = lists:member(Claimant2, Members),
+	  case {CValid1, CValid2} of
+	    {true, false} ->
+		Next = reconcile_divergent_next(Next1, Next2),
+		StateA#chstate{next = Next};
+	    {false, true} ->
+		Next = reconcile_divergent_next(Next2, Next1),
+		StateB#chstate{next = Next};
+	    {false, false} ->
+		%% This can occur when removed/down nodes are still
+		%% up and gossip to each other. We need to pick a
+		%% claimant to handle this case, although the choice
+		%% is irrelevant as a correct valid claimant will
+		%% eventually emerge when the ring converges.
+		%% TODO: the {false, false} and {true, true} branches are identical; consider collapsing them to avoid repetition
+		case Claimant1 < Claimant2 of
+		  true ->
+		      Next = reconcile_divergent_next(Next1, Next2),
+		      StateA#chstate{next = Next};
+		  false ->
+		      Next = reconcile_divergent_next(Next2, Next1),
+		      StateB#chstate{next = Next}
+		end;
+	    {true, true} ->
+		%% This should never happen in normal practice.
+		%% But, we need to handle it for exceptional cases.
+ case Claimant1 < Claimant2 of + true -> + Next = reconcile_divergent_next(Next1, Next2), + StateA#chstate{next = Next}; + false -> + Next = reconcile_divergent_next(Next2, Next1), + StateB#chstate{next = Next} + end + end end. %% @private -merge_status(invalid, _) -> - invalid; -merge_status(_, invalid) -> - invalid; -merge_status(down, _) -> - down; -merge_status(_, down) -> - down; -merge_status(joining, _) -> - joining; -merge_status(_, joining) -> - joining; -merge_status(valid, _) -> - valid; -merge_status(_, valid) -> - valid; -merge_status(exiting, _) -> - exiting; -merge_status(_, exiting) -> - exiting; -merge_status(leaving, _) -> - leaving; -merge_status(_, leaving) -> - leaving; -merge_status(_, _) -> - invalid. +merge_status(invalid, _) -> invalid; +merge_status(_, invalid) -> invalid; +merge_status(down, _) -> down; +merge_status(_, down) -> down; +merge_status(joining, _) -> joining; +merge_status(_, joining) -> joining; +merge_status(valid, _) -> valid; +merge_status(_, valid) -> valid; +merge_status(exiting, _) -> exiting; +merge_status(_, exiting) -> exiting; +merge_status(leaving, _) -> leaving; +merge_status(_, leaving) -> leaving; +merge_status(_, _) -> invalid. 
%% @private -transfer_complete(CState=#chstate{next=Next, vclock=VClock}, Idx, Mod) -> - {Idx, Owner, NextOwner, Transfers, Status} = lists:keyfind(Idx, 1, Next), +transfer_complete(CState = #chstate{next = Next, + vclock = VClock}, + Idx, Mod) -> + {Idx, Owner, NextOwner, Transfers, Status} = + lists:keyfind(Idx, 1, Next), Transfers2 = ordsets:add_element(Mod, Transfers), - VNodeMods = - ordsets:from_list([VMod || {_, VMod} <- riak_core:vnode_modules()]), + VNodeMods = ordsets:from_list([VMod + || {_, VMod} <- riak_core:vnode_modules()]), Status2 = case {Status, Transfers2} of - {complete, _} -> - complete; - {awaiting, VNodeMods} -> - complete; - _ -> - awaiting + {complete, _} -> complete; + {awaiting, VNodeMods} -> complete; + _ -> awaiting end, Next2 = lists:keyreplace(Idx, 1, Next, {Idx, Owner, NextOwner, Transfers2, Status2}), VClock2 = vclock:increment(Owner, VClock), - CState#chstate{next=Next2, vclock=VClock2}. + CState#chstate{next = Next2, vclock = VClock2}. %% @private get_members(Members) -> - get_members(Members, [joining, valid, leaving, exiting, down]). + get_members(Members, + [joining, valid, leaving, exiting, down]). %% @private get_members(Members, Types) -> - [Node || {Node, {V, _, _}} <- Members, lists:member(V, Types)]. + [Node + || {Node, {V, _, _}} <- Members, + lists:member(V, Types)]. %% @private -update_seen(Node, CState=#chstate{vclock=VClock, seen=Seen}) -> +update_seen(Node, + CState = #chstate{vclock = VClock, seen = Seen}) -> Seen2 = orddict:update(Node, - fun(SeenVC) -> - vclock:merge([SeenVC, VClock]) - end, + fun (SeenVC) -> vclock:merge([SeenVC, VClock]) end, VClock, Seen), - CState#chstate{seen=Seen2}. + CState#chstate{seen = Seen2}. %% @private equal_cstate(StateA, StateB) -> equal_cstate(StateA, StateB, false). 
equal_cstate(StateA, StateB, false) -> - T1 = equal_members(StateA#chstate.members, StateB#chstate.members), - T2 = vclock:equal(StateA#chstate.rvsn, StateB#chstate.rvsn), + T1 = equal_members(StateA#chstate.members, + StateB#chstate.members), + T2 = vclock:equal(StateA#chstate.rvsn, + StateB#chstate.rvsn), T3 = equal_seen(StateA, StateB), T4 = equal_rings(StateA, StateB), - %% Clear fields checked manually and test remaining through equality. %% Note: We do not consider cluster name in equality. - StateA2=StateA#chstate{nodename=undefined, members=undefined, vclock=undefined, - rvsn=undefined, seen=undefined, chring=undefined, - meta=undefined, clustername=undefined}, - StateB2=StateB#chstate{nodename=undefined, members=undefined, vclock=undefined, - rvsn=undefined, seen=undefined, chring=undefined, - meta=undefined, clustername=undefined}, - T5 = (StateA2 =:= StateB2), - + StateA2 = StateA#chstate{nodename = undefined, + members = undefined, vclock = undefined, + rvsn = undefined, seen = undefined, + chring = undefined, meta = undefined, + clustername = undefined}, + StateB2 = StateB#chstate{nodename = undefined, + members = undefined, vclock = undefined, + rvsn = undefined, seen = undefined, + chring = undefined, meta = undefined, + clustername = undefined}, + T5 = StateA2 =:= StateB2, T1 andalso T2 andalso T3 andalso T4 andalso T5. remaining_fields(#chstate_v2{next = Next, claimant = Claimant}) -> @@ -1689,31 +1776,34 @@ remaining_fields(#chstate_v2{next = Next, claimant = Claimant}) -> %% @private equal_members(M1, M2) -> - L = orddict:merge(fun(_, {Status1, VC1, Meta1}, {Status2, VC2, Meta2}) -> - (Status1 =:= Status2) andalso - vclock:equal(VC1, VC2) andalso - (Meta1 =:= Meta2) - end, M1, M2), + L = orddict:merge(fun (_, {Status1, VC1, Meta1}, + {Status2, VC2, Meta2}) -> + Status1 =:= Status2 andalso + vclock:equal(VC1, VC2) andalso Meta1 =:= Meta2 + end, + M1, M2), {_, R} = lists:unzip(L), - lists:all(fun(X) -> X =:= true end, R). 
+ lists:all(fun (X) -> X =:= true end, R). %% @private equal_seen(StateA, StateB) -> Seen1 = filtered_seen(StateA), Seen2 = filtered_seen(StateB), - L = orddict:merge(fun(_, VC1, VC2) -> + L = orddict:merge(fun (_, VC1, VC2) -> vclock:equal(VC1, VC2) - end, Seen1, Seen2), + end, + Seen1, Seen2), {_, R} = lists:unzip(L), - lists:all(fun(X) -> X =:= true end, R). + lists:all(fun (X) -> X =:= true end, R). %% @private -filtered_seen(State=#chstate{seen=Seen}) -> +filtered_seen(State = #chstate{seen = Seen}) -> case get_members(State#chstate.members) of - [] -> - Seen; - Members -> - orddict:filter(fun(N, _) -> lists:member(N, Members) end, Seen) + [] -> Seen; + Members -> + orddict:filter(fun (N, _) -> lists:member(N, Members) + end, + Seen) end. %% =================================================================== @@ -1724,195 +1814,218 @@ filtered_seen(State=#chstate{seen=Seen}) -> sequence_test() -> I1 = 365375409332725729550921208179070754913983135744, I2 = 730750818665451459101842416358141509827966271488, - A = fresh(4,a), - B1 = A#chstate{nodename=b}, + A = fresh(4, a), + B1 = A#chstate{nodename = b}, B2 = transfer_node(I1, b, B1), - ?assertEqual(B2, transfer_node(I1, b, B2)), - {no_change, A1} = reconcile(B1,A), - C1 = A#chstate{nodename=c}, + ?assertEqual(B2, (transfer_node(I1, b, B2))), + {no_change, A1} = reconcile(B1, A), + C1 = A#chstate{nodename = c}, C2 = transfer_node(I1, c, C1), - {new_ring, A2} = reconcile(C2,A1), - {new_ring, A3} = reconcile(B2,A2), - C3 = transfer_node(I2,c,C2), - {new_ring, C4} = reconcile(A3,C3), - {new_ring, A4} = reconcile(C4,A3), - {new_ring, B3} = reconcile(A4,B2), - ?assertEqual(A4#chstate.chring, B3#chstate.chring), - ?assertEqual(B3#chstate.chring, C4#chstate.chring). 
+ {new_ring, A2} = reconcile(C2, A1), + {new_ring, A3} = reconcile(B2, A2), + C3 = transfer_node(I2, c, C2), + {new_ring, C4} = reconcile(A3, C3), + {new_ring, A4} = reconcile(C4, A3), + {new_ring, B3} = reconcile(A4, B2), + ?assertEqual((A4#chstate.chring), (B3#chstate.chring)), + ?assertEqual((B3#chstate.chring), (C4#chstate.chring)). param_fresh_test() -> - application:set_env(riak_core,ring_creation_size,4), - ?assert(equal_cstate(fresh(), fresh(4, node()))), - ?assertEqual(owner_node(fresh()),node()). + application:set_env(riak_core, ring_creation_size, 4), + ?assert((equal_cstate(fresh(), fresh(4, node())))), + ?assertEqual((owner_node(fresh())), (node())). index_test() -> - Ring0 = fresh(2,node()), - Ring1 = transfer_node(0,x,Ring0), - ?assertEqual(0,random_other_index(Ring0)), - ?assertEqual(0,random_other_index(Ring1)), - ?assertEqual(node(),index_owner(Ring0,0)), - ?assertEqual(x,index_owner(Ring1,0)), - ?assertEqual(lists:sort([x,node()]),lists:sort(diff_nodes(Ring0,Ring1))). + Ring0 = fresh(2, node()), + Ring1 = transfer_node(0, x, Ring0), + ?assertEqual(0, (random_other_index(Ring0))), + ?assertEqual(0, (random_other_index(Ring1))), + ?assertEqual((node()), (index_owner(Ring0, 0))), + ?assertEqual(x, (index_owner(Ring1, 0))), + ?assertEqual((lists:sort([x, node()])), + (lists:sort(diff_nodes(Ring0, Ring1)))). reconcile_test() -> - Ring0 = fresh(2,node()), - Ring1 = transfer_node(0,x,Ring0), + Ring0 = fresh(2, node()), + Ring1 = transfer_node(0, x, Ring0), %% Only members and seen should have changed - {new_ring, Ring2} = reconcile(fresh(2,someone_else),Ring1), - ?assertNot(equal_cstate(Ring1, Ring2, false)), - RingB0 = fresh(2,node()), - RingB1 = transfer_node(0,x,RingB0), - RingB2 = RingB1#chstate{nodename=b}, - ?assertMatch({no_change,_},reconcile(Ring1,RingB2)), - {no_change, RingB3} = reconcile(Ring1,RingB2), - ?assert(equal_cstate(RingB2, RingB3)). 
+ {new_ring, Ring2} = reconcile(fresh(2, someone_else), + Ring1), + ?assertNot((equal_cstate(Ring1, Ring2, false))), + RingB0 = fresh(2, node()), + RingB1 = transfer_node(0, x, RingB0), + RingB2 = RingB1#chstate{nodename = b}, + ?assertMatch({no_change, _}, + (reconcile(Ring1, RingB2))), + {no_change, RingB3} = reconcile(Ring1, RingB2), + ?assert((equal_cstate(RingB2, RingB3))). metadata_inequality_test() -> - Ring0 = fresh(2,node()), - Ring1 = update_meta(key,val,Ring0), - ?assertNot(equal_rings(Ring0,Ring1)), - ?assertEqual(Ring1#chstate.meta, - merge_meta({'node0', Ring0#chstate.meta}, {'node1', Ring1#chstate.meta})), + Ring0 = fresh(2, node()), + Ring1 = update_meta(key, val, Ring0), + ?assertNot((equal_rings(Ring0, Ring1))), + ?assertEqual((Ring1#chstate.meta), + (merge_meta({node0, Ring0#chstate.meta}, + {node1, Ring1#chstate.meta}))), timer:sleep(1001), % ensure that lastmod is at least a second later - Ring2 = update_meta(key,val2,Ring1), - ?assertEqual(get_meta(key,Ring2), - get_meta(key,#chstate{meta= - merge_meta({'node1',Ring1#chstate.meta}, - {'node2',Ring2#chstate.meta})})), - ?assertEqual(get_meta(key,Ring2), - get_meta(key,#chstate{meta= - merge_meta({'node2',Ring2#chstate.meta}, - {'node1',Ring1#chstate.meta})})). + Ring2 = update_meta(key, val2, Ring1), + ?assertEqual((get_meta(key, Ring2)), + (get_meta(key, + #chstate{meta = + merge_meta({node1, Ring1#chstate.meta}, + {node2, + Ring2#chstate.meta})}))), + ?assertEqual((get_meta(key, Ring2)), + (get_meta(key, + #chstate{meta = + merge_meta({node2, Ring2#chstate.meta}, + {node1, + Ring1#chstate.meta})}))). 
metadata_remove_test() -> Ring0 = fresh(2, node()), - ?assert(equal_rings(Ring0, remove_meta(key, Ring0))), - Ring1 = update_meta(key,val,Ring0), + ?assert((equal_rings(Ring0, remove_meta(key, Ring0)))), + Ring1 = update_meta(key, val, Ring0), timer:sleep(1001), % ensure that lastmod is at least one second later - Ring2 = remove_meta(key,Ring1), - ?assertEqual(undefined, get_meta(key, Ring2)), - ?assertEqual(undefined, get_meta(key, #chstate{meta=merge_meta({'node1',Ring1#chstate.meta}, {'node2',Ring2#chstate.meta})})), - ?assertEqual(undefined, get_meta(key, #chstate{meta=merge_meta({'node2',Ring2#chstate.meta}, {'node1',Ring1#chstate.meta})})). + Ring2 = remove_meta(key, Ring1), + ?assertEqual(undefined, (get_meta(key, Ring2))), + ?assertEqual(undefined, + (get_meta(key, + #chstate{meta = + merge_meta({node1, Ring1#chstate.meta}, + {node2, + Ring2#chstate.meta})}))), + ?assertEqual(undefined, + (get_meta(key, + #chstate{meta = + merge_meta({node2, Ring2#chstate.meta}, + {node1, + Ring1#chstate.meta})}))). rename_test() -> Ring0 = fresh(2, node()), - Ring = rename_node(Ring0, node(), 'new@new'), - ?assertEqual('new@new', owner_node(Ring)), - ?assertEqual(['new@new'], all_members(Ring)). + Ring = rename_node(Ring0, node(), new@new), + ?assertEqual(new@new, (owner_node(Ring))), + ?assertEqual([new@new], (all_members(Ring))). exclusion_test() -> Ring0 = fresh(2, node()), - Ring1 = transfer_node(0,x,Ring0), - ?assertEqual(0, random_other_index(Ring1,[730750818665451459101842416358141509827966271488])), - ?assertEqual(no_indices, random_other_index(Ring1, [0])), - ?assertEqual([{730750818665451459101842416358141509827966271488,node()},{0,x}], - preflist(<<1:160/integer>>, Ring1)). 
+ Ring1 = transfer_node(0, x, Ring0), + ?assertEqual(0, + (random_other_index(Ring1, + [730750818665451459101842416358141509827966271488]))), + ?assertEqual(no_indices, + (random_other_index(Ring1, [0]))), + ?assertEqual([{730750818665451459101842416358141509827966271488, + node()}, + {0, x}], + (preflist(<<1:160/integer>>, Ring1))). random_other_node_test() -> Ring0 = fresh(2, node()), - ?assertEqual(no_node, random_other_node(Ring0)), - Ring1 = add_member(node(), Ring0, 'new@new'), - Ring2 = transfer_node(0, 'new@new', Ring1), - ?assertEqual('new@new', random_other_node(Ring2)). + ?assertEqual(no_node, (random_other_node(Ring0))), + Ring1 = add_member(node(), Ring0, new@new), + Ring2 = transfer_node(0, new@new, Ring1), + ?assertEqual(new@new, (random_other_node(Ring2))). membership_test() -> RingA1 = fresh(nodeA), - ?assertEqual([nodeA], all_members(RingA1)), - + ?assertEqual([nodeA], (all_members(RingA1))), RingA2 = add_member(nodeA, RingA1, nodeB), RingA3 = add_member(nodeA, RingA2, nodeC), - ?assertEqual([nodeA, nodeB, nodeC], all_members(RingA3)), - + ?assertEqual([nodeA, nodeB, nodeC], + (all_members(RingA3))), RingA4 = remove_member(nodeA, RingA3, nodeC), - ?assertEqual([nodeA, nodeB], all_members(RingA4)), - + ?assertEqual([nodeA, nodeB], (all_members(RingA4))), %% Node should stay removed {_, RingA5} = reconcile(RingA3, RingA4), - ?assertEqual([nodeA, nodeB], all_members(RingA5)), - + ?assertEqual([nodeA, nodeB], (all_members(RingA5))), %% Add node in parallel, check node stays removed RingB1 = add_member(nodeB, RingA3, nodeC), {_, RingA6} = reconcile(RingB1, RingA5), - ?assertEqual([nodeA, nodeB], all_members(RingA6)), - + ?assertEqual([nodeA, nodeB], (all_members(RingA6))), %% Add node as parallel descendent, check node is added RingB2 = add_member(nodeB, RingA6, nodeC), {_, RingA7} = reconcile(RingB2, RingA6), - ?assertEqual([nodeA, nodeB, nodeC], all_members(RingA7)), - - Priority = [{invalid,1}, {down,2}, {joining,3}, {valid,4}, {exiting,5}, - 
{leaving,6}], + ?assertEqual([nodeA, nodeB, nodeC], + (all_members(RingA7))), + Priority = [{invalid, 1}, {down, 2}, {joining, 3}, + {valid, 4}, {exiting, 5}, {leaving, 6}], RingX1 = fresh(nodeA), RingX2 = add_member(nodeA, RingX1, nodeB), RingX3 = add_member(nodeA, RingX2, nodeC), - ?assertEqual(joining, member_status(RingX3, nodeC)), - + ?assertEqual(joining, (member_status(RingX3, nodeC))), %% Parallel/sibling status changes merge based on priority [begin - RingT1 = set_member(nodeA, RingX3, nodeC, StatusA), - ?assertEqual(StatusA, member_status(RingT1, nodeC)), - RingT2 = set_member(nodeB, RingX3, nodeC, StatusB), - ?assertEqual(StatusB, member_status(RingT2, nodeC)), - StatusC = case PriorityA < PriorityB of - true -> StatusA; - false -> StatusB - end, - {_, RingT3} = reconcile(RingT2, RingT1), - ?assertEqual(StatusC, member_status(RingT3, nodeC)) - end || {StatusA, PriorityA} <- Priority, - {StatusB, PriorityB} <- Priority], - + RingT1 = set_member(nodeA, RingX3, nodeC, StatusA), + ?assertEqual(StatusA, (member_status(RingT1, nodeC))), + RingT2 = set_member(nodeB, RingX3, nodeC, StatusB), + ?assertEqual(StatusB, (member_status(RingT2, nodeC))), + StatusC = case PriorityA < PriorityB of + true -> StatusA; + false -> StatusB + end, + {_, RingT3} = reconcile(RingT2, RingT1), + ?assertEqual(StatusC, (member_status(RingT3, nodeC))) + end + || {StatusA, PriorityA} <- Priority, + {StatusB, PriorityB} <- Priority], %% Related status changes merge to descendant [begin - RingT1 = set_member(nodeA, RingX3, nodeC, StatusA), - ?assertEqual(StatusA, member_status(RingT1, nodeC)), - RingT2 = set_member(nodeB, RingT1, nodeC, StatusB), - ?assertEqual(StatusB, member_status(RingT2, nodeC)), - RingT3 = set_member(nodeA, RingT1, nodeA, valid), - {_, RingT4} = reconcile(RingT2, RingT3), - ?assertEqual(StatusB, member_status(RingT4, nodeC)) - end || {StatusA, _} <- Priority, - {StatusB, _} <- Priority], + RingT1 = set_member(nodeA, RingX3, nodeC, StatusA), + ?assertEqual(StatusA, 
(member_status(RingT1, nodeC))), + RingT2 = set_member(nodeB, RingT1, nodeC, StatusB), + ?assertEqual(StatusB, (member_status(RingT2, nodeC))), + RingT3 = set_member(nodeA, RingT1, nodeA, valid), + {_, RingT4} = reconcile(RingT2, RingT3), + ?assertEqual(StatusB, (member_status(RingT4, nodeC))) + end + || {StatusA, _} <- Priority, {StatusB, _} <- Priority], ok. ring_version_test() -> Ring1 = fresh(nodeA), Ring2 = add_member(node(), Ring1, nodeA), Ring3 = add_member(node(), Ring2, nodeB), - ?assertEqual(nodeA, claimant(Ring3)), - #chstate{rvsn=RVsn, vclock=VClock} = Ring3, - + ?assertEqual(nodeA, (claimant(Ring3))), + #chstate{rvsn = RVsn, vclock = VClock} = Ring3, RingA1 = transfer_node(0, nodeA, Ring3), - RingA2 = RingA1#chstate{vclock=vclock:increment(nodeA, VClock)}, + RingA2 = RingA1#chstate{vclock = + vclock:increment(nodeA, VClock)}, RingB1 = transfer_node(0, nodeB, Ring3), - RingB2 = RingB1#chstate{vclock=vclock:increment(nodeB, VClock)}, - + RingB2 = RingB1#chstate{vclock = + vclock:increment(nodeB, VClock)}, %% RingA1 has most recent ring version - {_, RingT1} = reconcile(RingA2#chstate{rvsn=vclock:increment(nodeA, RVsn)}, + {_, RingT1} = reconcile(RingA2#chstate{rvsn = + vclock:increment(nodeA, RVsn)}, RingB2), - ?assertEqual(nodeA, index_owner(RingT1,0)), - + ?assertEqual(nodeA, (index_owner(RingT1, 0))), %% RingB1 has most recent ring version {_, RingT2} = reconcile(RingA2, - RingB2#chstate{rvsn=vclock:increment(nodeB, RVsn)}), - ?assertEqual(nodeB, index_owner(RingT2,0)), - + RingB2#chstate{rvsn = + vclock:increment(nodeB, RVsn)}), + ?assertEqual(nodeB, (index_owner(RingT2, 0))), %% Divergent ring versions, merge based on claimant - {_, RingT3} = reconcile(RingA2#chstate{rvsn=vclock:increment(nodeA, RVsn)}, - RingB2#chstate{rvsn=vclock:increment(nodeB, RVsn)}), - ?assertEqual(nodeA, index_owner(RingT3,0)), - + {_, RingT3} = reconcile(RingA2#chstate{rvsn = + vclock:increment(nodeA, RVsn)}, + RingB2#chstate{rvsn = + vclock:increment(nodeB, RVsn)}), + 
?assertEqual(nodeA, (index_owner(RingT3, 0))), %% Divergent ring versions, one valid claimant. Merge on claimant. - RingA3 = RingA2#chstate{claimant=nodeA}, + RingA3 = RingA2#chstate{claimant = nodeA}, RingA4 = remove_member(nodeA, RingA3, nodeB), - RingB3 = RingB2#chstate{claimant=nodeB}, + RingB3 = RingB2#chstate{claimant = nodeB}, RingB4 = remove_member(nodeB, RingB3, nodeA), - {_, RingT4} = reconcile(RingA4#chstate{rvsn=vclock:increment(nodeA, RVsn)}, - RingB3#chstate{rvsn=vclock:increment(nodeB, RVsn)}), - ?assertEqual(nodeA, index_owner(RingT4,0)), - {_, RingT5} = reconcile(RingA3#chstate{rvsn=vclock:increment(nodeA, RVsn)}, - RingB4#chstate{rvsn=vclock:increment(nodeB, RVsn)}), - ?assertEqual(nodeB, index_owner(RingT5,0)). + {_, RingT4} = reconcile(RingA4#chstate{rvsn = + vclock:increment(nodeA, RVsn)}, + RingB3#chstate{rvsn = + vclock:increment(nodeB, RVsn)}), + ?assertEqual(nodeA, (index_owner(RingT4, 0))), + {_, RingT5} = reconcile(RingA3#chstate{rvsn = + vclock:increment(nodeA, RVsn)}, + RingB4#chstate{rvsn = + vclock:increment(nodeB, RVsn)}), + ?assertEqual(nodeB, (index_owner(RingT5, 0))). 
reconcile_next_test() -> Next1 = [{0, nodeA, nodeB, [riak_pipe_vnode], awaiting}, @@ -1921,11 +2034,11 @@ reconcile_next_test() -> Next2 = [{0, nodeA, nodeB, [riak_kv_vnode], complete}, {1, nodeA, nodeB, [], awaiting}, {2, nodeA, nodeB, [], awaiting}], - Next3 = [{0, nodeA, nodeB, [riak_kv_vnode, riak_pipe_vnode], complete}, + Next3 = [{0, nodeA, nodeB, + [riak_kv_vnode, riak_pipe_vnode], complete}, {1, nodeA, nodeB, [riak_pipe_vnode], awaiting}, {2, nodeA, nodeB, [riak_pipe_vnode], complete}], - ?assertEqual(Next3, reconcile_next(Next1, Next2)), - + ?assertEqual(Next3, (reconcile_next(Next1, Next2))), Next4 = [{0, nodeA, nodeB, [riak_pipe_vnode], awaiting}, {1, nodeA, nodeB, [], awaiting}, {2, nodeA, nodeB, [riak_pipe_vnode], awaiting}], @@ -1933,28 +2046,33 @@ reconcile_next_test() -> {2, nodeA, nodeB, [riak_kv_vnode], complete}], Next6 = [{0, nodeA, nodeB, [riak_pipe_vnode], awaiting}, {1, nodeA, nodeB, [], awaiting}, - {2, nodeA, nodeB, [riak_kv_vnode, riak_pipe_vnode], complete}], - ?assertEqual(Next6, reconcile_divergent_next(Next4, Next5)). + {2, nodeA, nodeB, [riak_kv_vnode, riak_pipe_vnode], + complete}], + ?assertEqual(Next6, + (reconcile_divergent_next(Next4, Next5))). 
resize_test() -> Ring0 = fresh(4, a), Ring1 = resize(Ring0, 8), Ring2 = resize(Ring0, 2), - ?assertEqual(8, num_partitions(Ring1)), - ?assertEqual(2, num_partitions(Ring2)), + ?assertEqual(8, (num_partitions(Ring1))), + ?assertEqual(2, (num_partitions(Ring2))), valid_resize(Ring0, Ring1), valid_resize(Ring0, Ring1), - Ring3 = set_pending_resize(Ring2, Ring0), - ?assertEqual(num_partitions(Ring0), num_partitions(Ring3)), - ?assertEqual(num_partitions(Ring2), future_num_partitions(Ring3)), - ?assertEqual(num_partitions(Ring2), num_partitions(future_ring(Ring3))), - + ?assertEqual((num_partitions(Ring0)), + (num_partitions(Ring3))), + ?assertEqual((num_partitions(Ring2)), + (future_num_partitions(Ring3))), + ?assertEqual((num_partitions(Ring2)), + (num_partitions(future_ring(Ring3)))), Key = <<0:160/integer>>, OrigIdx = element(1, hd(preflist(Key, Ring0))), %% for non-resize transitions index should be the same - ?assertEqual(OrigIdx, future_index(Key, OrigIdx, undefined, Ring0)), - ?assertEqual(element(1, hd(preflist(Key, Ring2))), future_index(Key, OrigIdx, undefined, Ring3)). + ?assertEqual(OrigIdx, + (future_index(Key, OrigIdx, undefined, Ring0))), + ?assertEqual((element(1, hd(preflist(Key, Ring2)))), + (future_index(Key, OrigIdx, undefined, Ring3))). lasgasp_test() -> RingA = fresh(4, a), @@ -1969,53 +2087,72 @@ lasgasp_test() -> resize_xfer_test_() -> {setup, - fun() -> + fun () -> meck:unload(), meck:new(riak_core, [passthrough]), meck:expect(riak_core, vnode_modules, - fun() -> [{some_app, fake_vnode}, {other_app, other_vnode}] end) + fun () -> + [{some_app, fake_vnode}, + {other_app, other_vnode}] + end) end, - fun(_) -> meck:unload() end, - fun test_resize_xfers/0}. + fun (_) -> meck:unload() end, fun test_resize_xfers/0}. 
test_resize_xfers() -> Ring0 = riak_core_ring:fresh(4, a), Ring1 = set_pending_resize(resize(Ring0, 8), Ring0), Source1 = {0, a}, - Target1 = {730750818665451459101842416358141509827966271488, a}, - TargetIdx2 = 365375409332725729550921208179070754913983135744, - Ring2 = schedule_resize_transfer(Ring1, Source1, Target1), - ?assertEqual(Target1, awaiting_resize_transfer(Ring2, Source1, fake_vnode)), - ?assertEqual(awaiting, resize_transfer_status(Ring2, Source1, Target1, fake_vnode)), + Target1 = + {730750818665451459101842416358141509827966271488, a}, + TargetIdx2 = + 365375409332725729550921208179070754913983135744, + Ring2 = schedule_resize_transfer(Ring1, Source1, + Target1), + ?assertEqual(Target1, + (awaiting_resize_transfer(Ring2, Source1, fake_vnode))), + ?assertEqual(awaiting, + (resize_transfer_status(Ring2, Source1, Target1, + fake_vnode))), %% use Target1 since we haven't used it as a source index - ?assertEqual(undefined, awaiting_resize_transfer(Ring2, Target1, fake_vnode)), - ?assertEqual(undefined, resize_transfer_status(Ring2, Target1, Source1, fake_vnode)), - - Ring3 = schedule_resize_transfer(Ring2, Source1, TargetIdx2), - Ring4 = resize_transfer_complete(Ring3, Source1, Target1, fake_vnode), - ?assertEqual({TargetIdx2, a}, awaiting_resize_transfer(Ring4, Source1, fake_vnode)), - ?assertEqual(awaiting, resize_transfer_status(Ring4, Source1, {TargetIdx2, a}, fake_vnode)), - ?assertEqual(complete, resize_transfer_status(Ring4, Source1, Target1, fake_vnode)), - - Ring5 = resize_transfer_complete(Ring4, Source1, {TargetIdx2, a}, fake_vnode), - {_, '$resize', Status1} = next_owner(Ring5, 0, fake_vnode), + ?assertEqual(undefined, + (awaiting_resize_transfer(Ring2, Target1, fake_vnode))), + ?assertEqual(undefined, + (resize_transfer_status(Ring2, Target1, Source1, + fake_vnode))), + Ring3 = schedule_resize_transfer(Ring2, Source1, + TargetIdx2), + Ring4 = resize_transfer_complete(Ring3, Source1, + Target1, fake_vnode), + ?assertEqual({TargetIdx2, a}, + 
(awaiting_resize_transfer(Ring4, Source1, fake_vnode))), + ?assertEqual(awaiting, + (resize_transfer_status(Ring4, Source1, {TargetIdx2, a}, + fake_vnode))), + ?assertEqual(complete, + (resize_transfer_status(Ring4, Source1, Target1, + fake_vnode))), + Ring5 = resize_transfer_complete(Ring4, Source1, + {TargetIdx2, a}, fake_vnode), + {_, '$resize', Status1} = next_owner(Ring5, 0, + fake_vnode), ?assertEqual(complete, Status1), - - Ring6 = resize_transfer_complete(Ring5, Source1, {TargetIdx2, a}, other_vnode), - Ring7 = resize_transfer_complete(Ring6, Source1, Target1, other_vnode), - {_, '$resize', Status2} = next_owner(Ring7, 0, fake_vnode), + Ring6 = resize_transfer_complete(Ring5, Source1, + {TargetIdx2, a}, other_vnode), + Ring7 = resize_transfer_complete(Ring6, Source1, + Target1, other_vnode), + {_, '$resize', Status2} = next_owner(Ring7, 0, + fake_vnode), ?assertEqual(complete, Status2), - {_, '$resize', Status3} = next_owner(Ring7, 0, other_vnode), + {_, '$resize', Status3} = next_owner(Ring7, 0, + other_vnode), ?assertEqual(complete, Status3), {_, '$resize', complete} = next_owner(Ring7, 0). valid_resize(Ring0, Ring1) -> - lists:foreach(fun({Idx, Owner}) -> + lists:foreach(fun ({Idx, Owner}) -> case lists:keyfind(Idx, 1, all_owners(Ring0)) of - false -> - ?assertEqual('$dummyhost@resized', Owner); - {Idx, OrigOwner} -> - ?assertEqual(OrigOwner, Owner) + false -> ?assertEqual('$dummyhost@resized', Owner); + {Idx, OrigOwner} -> ?assertEqual(OrigOwner, Owner) end end, all_owners(Ring1)). diff --git a/src/riak_core_ring_events.erl b/src/riak_core_ring_events.erl index 72e084cb5..4b7b44d4f 100644 --- a/src/riak_core_ring_events.erl +++ b/src/riak_core_ring_events.erl @@ -24,30 +24,23 @@ -behaviour(gen_event). 
%% API --export([start_link/0, - add_handler/2, - add_sup_handler/2, - add_guarded_handler/2, - add_callback/1, - add_sup_callback/1, - add_guarded_callback/1, - ring_update/1, - force_update/0, - ring_sync_update/1, +-export([start_link/0, add_handler/2, add_sup_handler/2, + add_guarded_handler/2, add_callback/1, + add_sup_callback/1, add_guarded_callback/1, + ring_update/1, force_update/0, ring_sync_update/1, force_sync_update/0]). %% gen_event callbacks -export([init/1, handle_event/2, handle_call/2, handle_info/2, terminate/2, code_change/3]). --record(state, { callback }). +-record(state, {callback}). %% =================================================================== %% API functions %% =================================================================== -start_link() -> - gen_event:start_link({local, ?MODULE}). +start_link() -> gen_event:start_link({local, ?MODULE}). add_handler(Handler, Args) -> gen_event:add_handler(?MODULE, Handler, Args). @@ -56,16 +49,20 @@ add_sup_handler(Handler, Args) -> gen_event:add_sup_handler(?MODULE, Handler, Args). add_guarded_handler(Handler, Args) -> - riak_core:add_guarded_event_handler(?MODULE, Handler, Args). + riak_core:add_guarded_event_handler(?MODULE, Handler, + Args). add_callback(Fn) when is_function(Fn) -> - gen_event:add_handler(?MODULE, {?MODULE, make_ref()}, [Fn]). + gen_event:add_handler(?MODULE, {?MODULE, make_ref()}, + [Fn]). add_sup_callback(Fn) when is_function(Fn) -> - gen_event:add_sup_handler(?MODULE, {?MODULE, make_ref()}, [Fn]). + gen_event:add_sup_handler(?MODULE, + {?MODULE, make_ref()}, [Fn]). add_guarded_callback(Fn) when is_function(Fn) -> - riak_core:add_guarded_event_handler(?MODULE, {?MODULE, make_ref()}, [Fn]). + riak_core:add_guarded_event_handler(?MODULE, + {?MODULE, make_ref()}, [Fn]). 
force_update() -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), @@ -88,21 +85,15 @@ ring_sync_update(Ring) -> init([Fn]) -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), Fn(Ring), - {ok, #state { callback = Fn }}. + {ok, #state{callback = Fn}}. handle_event({ring_update, Ring}, State) -> - (State#state.callback)(Ring), - {ok, State}. + (State#state.callback)(Ring), {ok, State}. -handle_call(_Request, State) -> - {ok, ok, State}. +handle_call(_Request, State) -> {ok, ok, State}. -handle_info(_Info, State) -> - {ok, State}. +handle_info(_Info, State) -> {ok, State}. -terminate(_Reason, _State) -> - ok. - -code_change(_OldVsn, State, _Extra) -> - {ok, State}. +terminate(_Reason, _State) -> ok. +code_change(_OldVsn, State, _Extra) -> {ok, State}. diff --git a/src/riak_core_ring_handler.erl b/src/riak_core_ring_handler.erl index f675164c5..c61a49359 100644 --- a/src/riak_core_ring_handler.erl +++ b/src/riak_core_ring_handler.erl @@ -15,14 +15,16 @@ %% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved. -module(riak_core_ring_handler). + -behaviour(gen_event). %% gen_event callbacks -export([init/1, handle_event/2, handle_call/2, handle_info/2, terminate/2, code_change/3]). + -export([ensure_vnodes_started/1]). --record(state, {}). +-record(state, {}). %% =================================================================== %% gen_event callbacks @@ -34,25 +36,18 @@ init([]) -> ensure_vnodes_started(Ring), {ok, #state{}}. - handle_event({ring_update, Ring}, State) -> maybe_start_vnode_proxies(Ring), maybe_stop_vnode_proxies(Ring), {ok, State}. -handle_call(_Event, State) -> - {ok, ok, State}. - -handle_info(_Info, State) -> - {ok, State}. - -terminate(_Reason, _State) -> - ok. +handle_call(_Event, State) -> {ok, ok, State}. -code_change(_OldVsn, State, _Extra) -> - {ok, State}. +handle_info(_Info, State) -> {ok, State}. +terminate(_Reason, _State) -> ok. +code_change(_OldVsn, State, _Extra) -> {ok, State}. 
%% =================================================================== %% Internal functions @@ -60,160 +55,152 @@ code_change(_OldVsn, State, _Extra) -> ensure_vnodes_started(Ring) -> case riak_core:vnode_modules() of - [] -> - ok; - AppMods -> - case ensure_vnodes_started(AppMods, Ring, []) of - [] -> - Ready = riak_core_ring:ring_ready(Ring), - FutureIndices = riak_core_ring:future_indices(Ring, node()), - Status = riak_core_ring:member_status(Ring, node()), - case {Ready, FutureIndices, Status} of - {true, [], leaving} -> - case ready_to_exit(AppMods) of - true -> - exit_ring_trans(), - maybe_shutdown(Ring); - false -> - ok - end; - {_, _, invalid} -> - riak_core_ring_manager:refresh_my_ring(); - {_, _, exiting} -> - %% Deliberately do nothing. - ok; - {_, _, _} -> - ok - end; - _ -> ok - end + [] -> ok; + AppMods -> + case ensure_vnodes_started(AppMods, Ring, []) of + [] -> + Ready = riak_core_ring:ring_ready(Ring), + FutureIndices = riak_core_ring:future_indices(Ring, + node()), + Status = riak_core_ring:member_status(Ring, node()), + case {Ready, FutureIndices, Status} of + {true, [], leaving} -> + case ready_to_exit(AppMods) of + true -> exit_ring_trans(), maybe_shutdown(Ring); + false -> ok + end; + {_, _, invalid} -> + riak_core_ring_manager:refresh_my_ring(); + {_, _, exiting} -> + %% Deliberately do nothing. + ok; + {_, _, _} -> ok + end; + _ -> ok + end end. %% Shutdown if we are the only node in the cluster maybe_shutdown(Ring) -> case riak_core_ring:random_other_node(Ring) of - no_node -> - riak_core_ring_manager:refresh_my_ring(); - _ -> - ok + no_node -> riak_core_ring_manager:refresh_my_ring(); + _ -> ok end. exit_ring_trans() -> - riak_core_ring_manager:ring_trans( - fun(Ring2, _) -> - Ring3 = riak_core_ring:exit_member(node(), Ring2, node()), - {new_ring, Ring3} - end, []). 
- -ready_to_exit([]) -> - true; -ready_to_exit([{_App, Mod} | AppMods]) -> - case erlang:function_exported(Mod, ready_to_exit, 0) andalso - (not Mod:ready_to_exit()) of - true -> - false; - false -> - ready_to_exit(AppMods) + riak_core_ring_manager:ring_trans(fun (Ring2, _) -> + Ring3 = + riak_core_ring:exit_member(node(), + Ring2, + node()), + {new_ring, Ring3} + end, + []). + +ready_to_exit([]) -> true; +ready_to_exit([{_App, Module} | AppMods]) -> + case erlang:function_exported(Module, ready_to_exit, 0) + andalso not Module:ready_to_exit() + of + true -> false; + false -> ready_to_exit(AppMods) end. ensure_vnodes_started([], _Ring, Acc) -> lists:flatten(Acc); -ensure_vnodes_started([{App, Mod}|T], Ring, Acc) -> - ensure_vnodes_started(T, Ring, [ensure_vnodes_started({App,Mod},Ring)|Acc]). +ensure_vnodes_started([{App, Mod} | T], Ring, Acc) -> + ensure_vnodes_started(T, Ring, + [ensure_vnodes_started({App, Mod}, Ring) | Acc]). -ensure_vnodes_started({App,Mod}, Ring) -> - Startable = startable_vnodes(Mod, Ring), +ensure_vnodes_started({App, Module}, Ring) -> + Startable = startable_vnodes(Module, Ring), %% NOTE: This following is a hack. There's a basic %% dependency/race between riak_core (want to start vnodes %% right away to trigger possible handoffs) and riak_kv %% (needed to support those vnodes). The hack does not fix %% that dependency: internal techdebt todo list #A7 does. - spawn_link(fun() -> - %% Use a registered name as a lock to prevent the same - %% vnode module from being started twice. - RegName = list_to_atom( - "riak_core_ring_handler_ensure_" - ++ atom_to_list(Mod)), - try erlang:register(RegName, self()) - catch error:badarg -> - exit(normal) + spawn_link(fun () -> + %% Use a registered name as a lock to prevent the same + %% vnode module from being started twice. 
+ ModList = atom_to_list(Module), + RegName = "riak_core_ring_handler_ensure_" ++ ModList, + try erlang:register(list_to_atom(RegName), self()) catch + error:badarg -> exit(normal) end, - %% Let the app finish starting... ok = riak_core:wait_for_application(App), - %% Start the vnodes. HasStartVnodes = lists:member({start_vnodes, 1}, - Mod:module_info(exports)), + Module:module_info(exports)), case HasStartVnodes of - true -> - Mod:start_vnodes(Startable); - false -> - [Mod:start_vnode(I) || I <- Startable] + true -> Module:start_vnodes(Startable); + false -> [Module:start_vnode(I) || I <- Startable] end, - %% Mark the service as up. SupName = list_to_atom(atom_to_list(App) ++ "_sup"), SupPid = erlang:whereis(SupName), case riak_core:health_check(App) of - undefined -> - riak_core_node_watcher:service_up(App, SupPid); - HealthMFA -> - riak_core_node_watcher:service_up(App, - SupPid, - HealthMFA) + undefined -> + riak_core_node_watcher:service_up(App, SupPid); + HealthMFA -> + riak_core_node_watcher:service_up(App, SupPid, + HealthMFA) end, exit(normal) end), Startable. 
- startable_vnodes(Mod, Ring) -> AllMembers = riak_core_ring:all_members(Ring), case {length(AllMembers), hd(AllMembers) =:= node()} of - {1, true} -> - riak_core_ring:my_indices(Ring); - _ -> - {ok, ModExcl} = riak_core_handoff_manager:get_exclusions(Mod), - Excl = ModExcl -- riak_core_ring:disowning_indices(Ring, node()), - case riak_core_ring:random_other_index(Ring, Excl) of - no_indices -> - case length(Excl) =:= riak_core_ring:num_partitions(Ring) of - true -> - []; - false -> - riak_core_ring:my_indices(Ring) - end; - RO -> - [RO | riak_core_ring:my_indices(Ring)] - end + {1, true} -> riak_core_ring:my_indices(Ring); + _ -> + {ok, ModExcl} = + riak_core_handoff_manager:get_exclusions(Mod), + Excl = ModExcl -- + riak_core_ring:disowning_indices(Ring, node()), + case riak_core_ring:random_other_index(Ring, Excl) of + no_indices -> + case length(Excl) =:= + riak_core_ring:num_partitions(Ring) + of + true -> []; + false -> riak_core_ring:my_indices(Ring) + end; + RO -> [RO | riak_core_ring:my_indices(Ring)] + end end. maybe_start_vnode_proxies(Ring) -> - Mods = [M || {_,M} <- riak_core:vnode_modules()], + Mods = [M || {_, M} <- riak_core:vnode_modules()], Size = riak_core_ring:num_partitions(Ring), FutureSize = riak_core_ring:future_num_partitions(Ring), Larger = Size < FutureSize, case Larger of - true -> - FutureIdxs = riak_core_ring:all_owners(riak_core_ring:future_ring(Ring)), - _ = [riak_core_vnode_proxy_sup:start_proxy(Mod, Idx) || {Idx, _} <- FutureIdxs, - Mod <- Mods], - ok; - false -> - ok + true -> + FutureIdxs = + riak_core_ring:all_owners(riak_core_ring:future_ring(Ring)), + _ = [riak_core_vnode_proxy_sup:start_proxy(Mod, Idx) + || {Idx, _} <- FutureIdxs, Mod <- Mods], + ok; + false -> ok end. 
maybe_stop_vnode_proxies(Ring) -> Mods = [M || {_, M} <- riak_core:vnode_modules()], case riak_core_ring:pending_changes(Ring) of - [] -> - Idxs = [{I,M} || {I,_} <- riak_core_ring:all_owners(Ring), M <- Mods], - ProxySpecs = supervisor:which_children(riak_core_vnode_proxy_sup), - Running = [{I,M} || {{M,I},_,_,_} <- ProxySpecs, lists:member(M, Mods)], - ToShutdown = Running -- Idxs, - _ = [riak_core_vnode_proxy_sup:stop_proxy(M,I) || {I, M} <- ToShutdown], - ok; - _ -> - ok + [] -> + Idxs = [{I, M} + || {I, _} <- riak_core_ring:all_owners(Ring), + M <- Mods], + ProxySpecs = + supervisor:which_children(riak_core_vnode_proxy_sup), + Running = [{I, M} + || {{M, I}, _, _, _} <- ProxySpecs, + lists:member(M, Mods)], + ToShutdown = Running -- Idxs, + _ = [riak_core_vnode_proxy_sup:stop_proxy(M, I) + || {I, M} <- ToShutdown], + ok; + _ -> ok end. diff --git a/src/riak_core_ring_manager.erl b/src/riak_core_ring_manager.erl index 737a88499..6149e7f5a 100644 --- a/src/riak_core_ring_manager.erl +++ b/src/riak_core_ring_manager.erl @@ -24,28 +24,23 @@ %% %% Numerous processes concurrently read and access the ring in a %% variety of time sensitive code paths. To make this efficient, -%% `riak_core' uses `mochiglobal' which exploits the Erlang constant -%% pool to provide constant-time access to the ring without needing -%% to copy data into individual process heaps. -%% -%% However, updating a `mochiglobal' value is very slow, and becomes slower -%% the larger the item being stored. With large rings, the delay can -%% become too long during periods of high ring churn, where hundreds of -%% ring events are being triggered a second. +%% `riak_core' uses `persistent_term' to provide constant-time access +%% to the ring without needing to copy data into individual process heaps. +%% See http://erlang.org/doc/man/persistent_term.html %% %% As of Riak 1.4, `riak_core' uses a hybrid approach to solve this %% problem. 
When a ring is first written, it is written to a shared ETS %% table. If no ring events have occurred for 90 seconds, the ring is -%% then promoted to `mochiglobal'. This provides fast updates during +%% then promoted to `persistent_term'. This provides fast updates during %% periods of ring churn, while eventually providing very fast reads %% after the ring stabilizes. The downside is that reading from the ETS -%% table before promotion is slower than `mochiglobal', and requires +%% table before promotion is slower than `persistent_term', and requires %% copying the ring into individual process heaps. %% %% To alleviate the slow down while in the ETS phase, `riak_core' %% exploits the fact that most time sensitive operations access the ring -%% in order to read only a subset of its data: bucket properties and -%% partition ownership. Therefore, these pieces of information are +%% in order to read only a subset of its data: partition ownership. +%% Therefore, these pieces of information are %% extracted from the ring and stored in the ETS table as well to %% minimize copying overhead. Furthermore, the partition ownership %% information (represented by the {@link chash} structure) is converted @@ -56,38 +51,27 @@ %% structure for normal operations. %% %% As of Riak 1.4, it is therefore recommended that operations that -%% can be performed by directly using the bucket properties API or -%% `chashbin' structure do so using those methods rather than -%% retrieving the ring via `get_my_ring/0' or `get_raw_ring/0'. +%% can be performed by directly using the `chashbin' structure. +%% Do so using that method rather than retrieving the ring via +%% `get_my_ring/0' or `get_raw_ring/0'. -module(riak_core_ring_manager). + -define(RING_KEY, riak_ring). + -behaviour(gen_server). 
--export([start_link/0, - start_link/1, - get_my_ring/0, - get_raw_ring/0, - get_raw_ring_chashbin/0, - get_chash_bin/0, - get_ring_id/0, - get_bucket_meta/1, - refresh_my_ring/0, - refresh_ring/2, - set_my_ring/1, - write_ringfile/0, - prune_ringfiles/0, - read_ringfile/1, - find_latest_ringfile/0, - force_update/0, - do_write_ringfile/1, - ring_trans/2, - run_fixups/3, - set_cluster_name/1, +-export([start_link/0, start_link/1, get_my_ring/0, + get_raw_ring/0, get_raw_ring_chashbin/0, + get_chash_bin/0, get_ring_id/0, refresh_my_ring/0, + refresh_ring/2, set_my_ring/1, write_ringfile/0, + prune_ringfiles/0, read_ringfile/1, + find_latest_ringfile/0, force_update/0, + do_write_ringfile/1, ring_trans/2, set_cluster_name/1, is_stable_ring/0]). --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). -ifdef(TEST). @@ -95,17 +79,18 @@ -endif. +-record(state, + {mode, raw_ring, ring_changed_time, inactivity_timer}). --record(state, { - mode, - raw_ring, - ring_changed_time, - inactivity_timer - }). +-export([setup_ets/1, cleanup_ets/1, set_ring_global/1, + promote_ring/0]). + + %% For EUnit testing --export([setup_ets/1, cleanup_ets/1, set_ring_global/1]). %% For EUnit testing -ifdef(TEST). + -include_lib("eunit/include/eunit.hrl"). + -endif. -define(ETS, ets_riak_core_ring_manager). @@ -117,49 +102,47 @@ %% =================================================================== start_link() -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [live], []). - + gen_server:start_link({local, ?MODULE}, ?MODULE, [live], + []). %% Testing entry point start_link(test) -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [test], []). + gen_server:start_link({local, ?MODULE}, ?MODULE, [test], + []). +-spec get_my_ring() -> {ok, + riak_core_ring:riak_core_ring()} | + {error, any()}. 
-%% @spec get_my_ring() -> {ok, riak_core_ring:riak_core_ring()} | {error, Reason} get_my_ring() -> - Ring = case riak_core_mochiglobal:get(?RING_KEY) of - ets -> - case ets:lookup(?ETS, ring) of - [{_, RingETS}] -> - RingETS; - _ -> - undefined - end; - RingMochi -> - RingMochi + Ring = case persistent_term:get(?RING_KEY, undefined) of + ets -> + case ets:lookup(?ETS, ring) of + [{_, RingETS}] -> RingETS; + _ -> undefined + end; + RingMochi -> RingMochi end, case Ring of - Ring when is_tuple(Ring) -> {ok, Ring}; - undefined -> {error, no_ring} + Ring when is_tuple(Ring) -> {ok, Ring}; + undefined -> {error, no_ring} end. get_raw_ring() -> - try - Ring = ets:lookup_element(?ETS, raw_ring, 2), + try Ring = ets:lookup_element(?ETS, raw_ring, 2), {ok, Ring} catch - _:_ -> - gen_server:call(?MODULE, get_raw_ring, infinity) + _:_ -> gen_server:call(?MODULE, get_raw_ring, infinity) end. get_raw_ring_chashbin() -> - try - Ring = ets:lookup_element(?ETS, raw_ring, 2), + try Ring = ets:lookup_element(?ETS, raw_ring, 2), {ok, CHBin} = get_chash_bin(), {ok, Ring, CHBin} catch - _:_ -> - gen_server:call(?MODULE, get_raw_ring_chashbin, infinity) + _:_ -> + gen_server:call(?MODULE, get_raw_ring_chashbin, + infinity) end. %% @spec refresh_my_ring() -> ok @@ -167,7 +150,8 @@ refresh_my_ring() -> gen_server:call(?MODULE, refresh_my_ring, infinity). refresh_ring(Node, ClusterName) -> - gen_server:cast({?MODULE, Node}, {refresh_my_ring, ClusterName}). + gen_server:cast({?MODULE, Node}, + {refresh_my_ring, ClusterName}). %% @spec set_my_ring(riak_core_ring:riak_core_ring()) -> ok set_my_ring(Ring) -> @@ -175,38 +159,15 @@ set_my_ring(Ring) -> get_ring_id() -> case ets:lookup(?ETS, id) of - [{_, Id}] -> - Id; - _ -> - {0,0} - end. - -%% @doc Return metadata for the given bucket. 
If a bucket -%% for the non-default type is provided {error, no_type} -%% is returned when the type does not exist -get_bucket_meta({<<"default">>, Name}) -> - get_bucket_meta(Name); -get_bucket_meta({_Type, _Name}=Bucket) -> - %% reads from cluster metadata ets table - %% these aren't stored in ring manager ever - riak_core_bucket:get_bucket(Bucket); -get_bucket_meta(Bucket) -> - case ets:lookup(?ETS, {bucket, Bucket}) of - [] -> - undefined; - [{_, undefined}] -> - undefined; - [{_, Meta}] -> - {ok, Meta} + [{_, Id}] -> Id; + _ -> {0, 0} end. %% @doc Return the {@link chashbin} generated from the current ring get_chash_bin() -> case ets:lookup(?ETS, chashbin) of - [{chashbin, CHBin}] -> - {ok, CHBin}; - _ -> - {error, no_ring} + [{chashbin, CHBin}] -> {ok, CHBin}; + _ -> {error, no_ring} end. %% @spec write_ringfile() -> ok @@ -214,10 +175,12 @@ write_ringfile() -> gen_server:cast(?MODULE, write_ringfile). ring_trans(Fun, Args) -> - gen_server:call(?MODULE, {ring_trans, Fun, Args}, infinity). + gen_server:call(?MODULE, {ring_trans, Fun, Args}, + infinity). set_cluster_name(Name) -> - gen_server:call(?MODULE, {set_cluster_name, Name}, infinity). + gen_server:call(?MODULE, {set_cluster_name, Name}, + infinity). is_stable_ring() -> gen_server:call(?MODULE, is_stable_ring, infinity). @@ -225,119 +188,128 @@ is_stable_ring() -> %% @doc Exposed for support/debug purposes. Forces the node to change its %% ring in a manner that will trigger reconciliation on gossip. force_update() -> - ring_trans( - fun(Ring, _) -> - NewRing = riak_core_ring:update_member_meta(node(), Ring, node(), - unused, erlang:timestamp()), - {new_ring, NewRing} - end, []), + ring_trans(fun (Ring, _) -> + NewRing = riak_core_ring:update_member_meta(node(), + Ring, node(), + unused, + erlang:timestamp()), + {new_ring, NewRing} + end, + []), ok. 
do_write_ringfile(Ring) -> case ring_dir() of - "" -> nop; - Dir -> - {{Year, Month, Day},{Hour, Minute, Second}} = calendar:universal_time(), - TS = io_lib:format(".~B~2.10.0B~2.10.0B~2.10.0B~2.10.0B~2.10.0B", - [Year, Month, Day, Hour, Minute, Second]), - Cluster = application:get_env(riak_core, cluster_name, undefined), - FN = Dir ++ "/riak_core_ring." ++ Cluster ++ TS, - do_write_ringfile(Ring, FN) + "" -> nop; + Dir -> + {{Year, Month, Day}, {Hour, Minute, Second}} = + calendar:universal_time(), + TS = + io_lib:format(".~B~2.10.0B~2.10.0B~2.10.0B~2.10.0B~2.10.0B", + [Year, Month, Day, Hour, Minute, Second]), + {ok, Cluster} = application:get_env(riak_core, + cluster_name), + FN = Dir ++ "/riak_core_ring." ++ Cluster ++ TS, + do_write_ringfile(Ring, FN) end. do_write_ringfile(Ring, FN) -> ok = filelib:ensure_dir(FN), - try - false = riak_core_ring:check_lastgasp(Ring), - ok = riak_core_util:replace_file(FN, term_to_binary(Ring)) + try ok = riak_core_util:replace_file(FN, + term_to_binary(Ring)) catch - _:Err -> - logger:error("Unable to write ring to \"~s\" - ~p\n", [FN, Err]), - {error,Err} + _:Err -> + logger:error("Unable to write ring to \"~s\" - ~p\n", + [FN, Err]), + {error, Err} end. - %% @spec find_latest_ringfile() -> string() find_latest_ringfile() -> Dir = ring_dir(), case file:list_dir(Dir) of - {ok, Filenames} -> - Cluster = application:get_env(riak_core, cluster_name, undefined), - Timestamps = [list_to_integer(TS) || {"riak_core_ring", C1, TS} <- - [list_to_tuple(string:tokens(FN, ".")) || FN <- Filenames], - C1 =:= Cluster], - SortedTimestamps = lists:reverse(lists:sort(Timestamps)), - case SortedTimestamps of - [Latest | _] -> - {ok, Dir ++ "/riak_core_ring." ++ Cluster ++ "." 
++ integer_to_list(Latest)}; - _ -> - {error, not_found} - end; - {error, Reason} -> - {error, Reason} + {ok, Filenames} -> + {ok, Cluster} = application:get_env(riak_core, + cluster_name), + Timestamps = [list_to_integer(TS) + || {"riak_core_ring", C1, TS} + <- [list_to_tuple(string:tokens(FN, ".")) + || FN <- Filenames], + C1 =:= Cluster], + SortedTimestamps = + lists:reverse(lists:sort(Timestamps)), + case SortedTimestamps of + [Latest | _] -> + {ok, + Dir ++ + "/riak_core_ring." ++ + Cluster ++ "." ++ integer_to_list(Latest)}; + _ -> {error, not_found} + end; + {error, Reason} -> {error, Reason} end. %% @spec read_ringfile(string()) -> riak_core_ring:riak_core_ring() | {error, any()} read_ringfile(RingFile) -> case file:read_file(RingFile) of - {ok, Binary} -> - R = binary_to_term(Binary), - false = riak_core_ring:check_lastgasp(R), - R; - {error, Reason} -> - {error, Reason} + {ok, Binary} -> binary_to_term(Binary); + {error, Reason} -> {error, Reason} end. %% @spec prune_ringfiles() -> ok | {error, Reason} prune_ringfiles() -> case ring_dir() of - "" -> ok; - Dir -> - Cluster = application:get_env(riak_core, cluster_name, undefined), - case file:list_dir(Dir) of - {error,enoent} -> ok; - {error, Reason} -> - {error, Reason}; - {ok, []} -> ok; - {ok, Filenames} -> - Timestamps = [TS || {"riak_core_ring", C1, TS} <- - [list_to_tuple(string:tokens(FN, ".")) || FN <- Filenames], - C1 =:= Cluster], - if Timestamps /= [] -> - %% there are existing ring files - TSPat = [io_lib:fread("~4d~2d~2d~2d~2d~2d",TS) || - TS <- Timestamps], - TSL = lists:reverse(lists:sort([TS || - {ok,TS,[]} <- TSPat])), - Keep = prune_list(TSL), - KeepTSs = [lists:flatten( - io_lib:format( - "~B~2.10.0B~2.10.0B~2.10.0B~2.10.0B~2.10.0B",K)) - || K <- Keep], - DelFNs = [Dir ++ "/" ++ FN || FN <- Filenames, - lists:all(fun(TS) -> - string:str(FN,TS)=:=0 - end, KeepTSs)], - _ = [file:delete(DelFN) || DelFN <- DelFNs], - ok; - true -> - %% directory wasn't empty, but there are no ring - %% files 
in it - ok - end - end + "" -> ok; + Dir -> + Cluster = application:get_env(riak_core, cluster_name, + undefined), + case file:list_dir(Dir) of + {error, enoent} -> ok; + {error, Reason} -> {error, Reason}; + {ok, []} -> ok; + {ok, Filenames} -> + Timestamps = [TS + || {"riak_core_ring", C1, TS} + <- [list_to_tuple(string:tokens(FN, ".")) + || FN <- Filenames], + C1 =:= Cluster], + if Timestamps /= [] -> + %% there are existing ring files + TSPat = [io_lib:fread("~4d~2d~2d~2d~2d~2d", TS) + || TS <- Timestamps], + TSL = lists:reverse(lists:sort([TS + || {ok, TS, []} + <- TSPat])), + Keep = prune_list(TSL), + KeepTSs = + [lists:flatten(io_lib:format("~B~2.10.0B~2.10.0B~2.10.0B~2.10.0B~2.10.0B", + K)) + || K <- Keep], + DelFNs = [Dir ++ "/" ++ FN + || FN <- Filenames, + lists:all(fun (TS) -> + string:str(FN, TS) =:= 0 + end, + KeepTSs)], + _ = [file:delete(DelFN) || DelFN <- DelFNs], + ok; + true -> + %% directory wasn't empty, but there are no ring + %% files in it + ok + end + end end. -ifdef(TEST). + %% @private (only used for test instances) stop() -> - try - gen_server:call(?MODULE, stop) - catch - exit:{noproc, _} -> ok + try gen_server:call(?MODULE, stop) catch + exit:{noproc, _} -> ok end. --endif. +-endif. %% =================================================================== %% gen_server callbacks @@ -350,36 +322,36 @@ init([Mode]) -> riak_core_ring_events:ring_update(Ring), {ok, State}. 
-reload_ring(test) -> - riak_core_ring:fresh(16,node()); +reload_ring(test) -> riak_core_ring:fresh(16, node()); reload_ring(live) -> case riak_core_ring_manager:find_latest_ringfile() of - {ok, RingFile} -> - case riak_core_ring_manager:read_ringfile(RingFile) of - {error, Reason} -> - logger:critical("Failed to read ring file: ~p", - [riak_core_util:posix_error(Reason)]), - throw({error, Reason}); - Ring -> - Ring - end; - {error, not_found} -> - logger:warning("No ring file available."), - riak_core_ring:fresh(); - {error, Reason} -> - logger:critical("Failed to load ring file: ~p", - [riak_core_util:posix_error(Reason)]), - throw({error, Reason}) + {ok, RingFile} -> + case riak_core_ring_manager:read_ringfile(RingFile) of + {error, Reason} -> + logger:critical("Failed to read ring file: ~p", + [riak_core_util:posix_error(Reason)]), + throw({error, Reason}); + Ring -> Ring + end; + {error, not_found} -> + logger:warning("No ring file available."), + riak_core_ring:fresh(); + {error, Reason} -> + logger:critical("Failed to load ring file: ~p", + [riak_core_util:posix_error(Reason)]), + throw({error, Reason}) end. 
-handle_call(get_raw_ring, _From, #state{raw_ring=Ring} = State) -> +handle_call(get_raw_ring, _From, + #state{raw_ring = Ring} = State) -> {reply, {ok, Ring}, State}; -handle_call(get_raw_ring_chashbin, _From, #state{raw_ring=Ring} = State) -> +handle_call(get_raw_ring_chashbin, _From, + #state{raw_ring = Ring} = State) -> {ok, CHBin} = get_chash_bin(), {reply, {ok, Ring, CHBin}, State}; handle_call({set_my_ring, Ring}, _From, State) -> State2 = prune_write_notify_ring(Ring, State), - {reply,ok,State2}; + {reply, ok, State2}; handle_call(refresh_my_ring, _From, State) -> %% Pompt the claimant before creating a fresh ring for shutdown, so that %% any final actions can be taken @@ -393,248 +365,174 @@ handle_call(refresh_my_ring, _From, State) -> %% state global ring has the last gasp, but not the persisted ring (so that %% on restart there will be no last gasp indicator. ok = do_write_ringfile(FreshRing), - %% Handoff is complete and fresh ring is written %% so we can safely stop now. riak_core:stop("node removal completed, exiting."), - - {reply,ok,State2}; -handle_call({ring_trans, Fun, Args}, _From, State=#state{raw_ring=Ring}) -> + {reply, ok, State2}; +handle_call({ring_trans, Fun, Args}, _From, + State = #state{raw_ring = Ring}) -> case catch Fun(Ring, Args) of - {new_ring, NewRing} -> - State2 = prune_write_notify_ring(NewRing, State), - riak_core_gossip:random_recursive_gossip(NewRing), - {reply, {ok, NewRing}, State2}; - {set_only, NewRing} -> - State2 = prune_write_ring(NewRing, State), - {reply, {ok, NewRing}, State2}; - {reconciled_ring, NewRing} -> - State2 = prune_write_notify_ring(NewRing, State), - riak_core_gossip:recursive_gossip(NewRing), - {reply, {ok, NewRing}, State2}; - ignore -> - {reply, not_changed, State}; - {ignore, Reason} -> - {reply, {not_changed, Reason}, State}; - Other -> - logger:error("ring_trans: invalid return value: ~p", - [Other]), - {reply, not_changed, State} + {new_ring, NewRing} -> + State2 = 
prune_write_notify_ring(NewRing, State), + riak_core_gossip:random_recursive_gossip(NewRing), + {reply, {ok, NewRing}, State2}; + {set_only, NewRing} -> + State2 = prune_write_ring(NewRing, State), + {reply, {ok, NewRing}, State2}; + {reconciled_ring, NewRing} -> + State2 = prune_write_notify_ring(NewRing, State), + riak_core_gossip:recursive_gossip(NewRing), + {reply, {ok, NewRing}, State2}; + ignore -> {reply, not_changed, State}; + {ignore, Reason} -> + {reply, {not_changed, Reason}, State}; + Other -> + logger:error("ring_trans: invalid return value: ~p", + [Other]), + {reply, not_changed, State} end; -handle_call({set_cluster_name, Name}, _From, State=#state{raw_ring=Ring}) -> +handle_call({set_cluster_name, Name}, _From, + State = #state{raw_ring = Ring}) -> NewRing = riak_core_ring:set_cluster_name(Ring, Name), State2 = prune_write_notify_ring(NewRing, State), {reply, ok, State2}; handle_call(is_stable_ring, _From, State) -> {IsStable, _DeltaMS} = is_stable_ring(State), {reply, IsStable, State}; - handle_call(stop, _From, State) -> - {stop,normal, ok, State}. + {stop, normal, ok, State}. handle_cast({refresh_my_ring, ClusterName}, State) -> {ok, Ring} = get_my_ring(), case riak_core_ring:cluster_name(Ring) of - ClusterName -> - handle_cast(refresh_my_ring, State); - _ -> - {noreply, State} + ClusterName -> handle_cast(refresh_my_ring, State); + _ -> {noreply, State} end; handle_cast(refresh_my_ring, State) -> - {_, _, State2} = handle_call(refresh_my_ring, undefined, State), + {_, _, State2} = handle_call(refresh_my_ring, undefined, + State), {noreply, State2}; - -handle_cast(write_ringfile, test) -> - {noreply,test}; - -handle_cast(write_ringfile, State=#state{raw_ring=Ring}) -> - ok = do_write_ringfile(Ring), - {noreply,State}. - +handle_cast(write_ringfile, test) -> {noreply, test}; +handle_cast(write_ringfile, + State = #state{raw_ring = Ring}) -> + ok = do_write_ringfile(Ring), {noreply, State}. 
handle_info(inactivity_timeout, State) -> case is_stable_ring(State) of - {true,DeltaMS} -> - logger:debug("Promoting ring after ~p", [DeltaMS]), - promote_ring(), - State2 = State#state{inactivity_timer=undefined}, - {noreply, State2}; - {false,DeltaMS} -> - Remaining = ?PROMOTE_TIMEOUT - DeltaMS, - State2 = set_timer(Remaining, State), - {noreply, State2} + {true, DeltaMS} -> + logger:debug("Promoting ring after ~p", [DeltaMS]), + promote_ring(), + State2 = State#state{inactivity_timer = undefined}, + {noreply, State2}; + {false, DeltaMS} -> + Remaining = (?PROMOTE_TIMEOUT) - DeltaMS, + State2 = set_timer(Remaining, State), + {noreply, State2} end; -handle_info(_Info, State) -> - {noreply, State}. - +handle_info(_Info, State) -> {noreply, State}. %% @private -terminate(_Reason, _State) -> - ok. - +terminate(_Reason, _State) -> ok. %% @private -code_change(_OldVsn, State, _Extra) -> - {ok, State}. - +code_change(_OldVsn, State, _Extra) -> {ok, State}. %% =================================================================== %% Internal functions %% =================================================================== ring_dir() -> - case application:get_env(riak_core, ring_state_dir, undefined) of - undefined -> - filename:join(application:get_env(riak_core, platform_data_dir, "data"), "ring"); - D -> - D + case application:get_env(riak_core, ring_state_dir, + undefined) + of + undefined -> + filename:join(application:get_env(riak_core, + platform_data_dir, "data"), + "ring"); + D -> D end. -prune_list([X|Rest]) -> - lists:usort(lists:append([[X],back(1,X,Rest),back(2,X,Rest), - back(3,X,Rest),back(4,X,Rest),back(5,X,Rest)])). -back(_N,_X,[]) -> []; -back(N,X,[H|T]) -> - case lists:nth(N,X) =:= lists:nth(N,H) of - true -> back(N,X,T); - false -> [H] - end. +prune_list([X | Rest]) -> + lists:usort(lists:append([[X], back(1, X, Rest), + back(2, X, Rest), back(3, X, Rest), + back(4, X, Rest), back(5, X, Rest)])). 
-%% @private -run_fixups([], _Bucket, BucketProps) -> - BucketProps; -run_fixups([{App, Fixup}|T], BucketName, BucketProps) -> - BP = try Fixup:fixup(BucketName, BucketProps) of - {ok, NewBucketProps} -> - NewBucketProps; - {error, Reason} -> - logger:error("Error while running bucket fixup module " - "~p from application ~p on bucket ~p: ~p", [Fixup, App, - BucketName, Reason]), - BucketProps - catch - What:Why -> - logger:error("Crash while running bucket fixup module " - "~p from application ~p on bucket ~p : ~p:~p", [Fixup, App, - BucketName, What, Why]), - BucketProps - end, - run_fixups(T, BucketName, BP). +back(_N, _X, []) -> []; +back(N, X, [H | T]) -> + case lists:nth(N, X) =:= lists:nth(N, H) of + true -> back(N, X, T); + false -> [H] + end. set_ring(Ring, State) -> set_ring_global(Ring), Now = os:timestamp(), - State2 = State#state{raw_ring=Ring, ring_changed_time=Now}, + State2 = State#state{raw_ring = Ring, + ring_changed_time = Now}, State3 = maybe_set_timer(?PROMOTE_TIMEOUT, State2), State3. -maybe_set_timer(Duration, State=#state{inactivity_timer=undefined}) -> +maybe_set_timer(Duration, + State = #state{inactivity_timer = undefined}) -> set_timer(Duration, State); -maybe_set_timer(_Duration, State) -> - State. +maybe_set_timer(_Duration, State) -> State. set_timer(Duration, State) -> - Timer = erlang:send_after(Duration, self(), inactivity_timeout), - State#state{inactivity_timer=Timer}. + Timer = erlang:send_after(Duration, self(), + inactivity_timeout), + State#state{inactivity_timer = Timer}. setup_ets(Mode) -> %% Destroy prior version of ETS table. This is necessary for certain %% eunit tests, but is unneeded for normal Riak operation. 
catch ets:delete(?ETS), Access = case Mode of - live -> protected; - test -> public + live -> protected; + test -> public end, - ?ETS = ets:new(?ETS, [named_table, Access, {read_concurrency, true}]), + (?ETS) = ets:new(?ETS, + [named_table, Access, {read_concurrency, true}]), Id = reset_ring_id(), - ets:insert(?ETS, [{changes, 0}, {promoted, 0}, {id, Id}]), + ets:insert(?ETS, + [{changes, 0}, {promoted, 0}, {id, Id}]), ok. -cleanup_ets(test) -> - ets:delete(?ETS). +cleanup_ets(test) -> ets:delete(?ETS). reset_ring_id() -> - %% Maintain ring id epoch using mochiglobal to ensure ring id remains + %% Maintain ring id epoch using persistent_term to ensure ring id remains %% monotonic even if the riak_core_ring_manager crashes and restarts - Epoch = case riak_core_mochiglobal:get(riak_ring_id_epoch) of - undefined -> - 0; - Value -> - Value + Epoch = case persistent_term:get(riak_ring_id_epoch, + undefined) + of + undefined -> 0; + Value -> Value end, - riak_core_mochiglobal:put(riak_ring_id_epoch, Epoch + 1), + persistent_term:put(riak_ring_id_epoch, Epoch + 1), {Epoch + 1, 0}. -%% Set the ring in mochiglobal/ETS. Exported during unit testing +%% Set the ring in persistent_term/ETS. Exported during unit testing %% to make test setup simpler - no need to spin up a riak_core_ring_manager %% process. set_ring_global(Ring) -> - DefaultProps = case application:get_env(riak_core, default_bucket_props) of - {ok, Val} -> - Val; - _ -> - [] - end, - %% run fixups on the ring before storing it in mochiglobal - FixedRing = case riak_core:bucket_fixups() of - [] -> Ring; - Fixups -> - Buckets = riak_core_ring:get_buckets(Ring), - lists:foldl( - fun(Bucket, AccRing) -> - BucketProps = riak_core_bucket:get_bucket(Bucket, Ring), - %% Merge anything in the default properties but not in - %% the bucket's properties. This is to ensure default - %% properties added after the bucket is created are - %% inherited to the bucket. 
- MergedProps = riak_core_bucket:merge_props( - BucketProps, DefaultProps), - - %% fixup the ring - NewBucketProps = run_fixups(Fixups, Bucket, MergedProps), - %% update the bucket in the ring - riak_core_ring:update_meta({bucket,Bucket}, - NewBucketProps, - AccRing) - end, Ring, Buckets) - end, %% Mark ring as tainted to check if it is ever leaked over gossip or %% relied upon for any non-local ring operations. - TaintedRing = riak_core_ring:set_tainted(FixedRing), - - %% Extract bucket properties and place into ETS table. We want all bucket - %% additions, modifications, and deletions to appear in a single atomic - %% operation. Since ETS does not provide a means to change + delete - %% multiple values in a single operation, we emulate the deletion by - %% overwriting all deleted buckets with the "undefined" atom that has - %% special meaning in `riak_core_bucket:get_bucket_props/2`. We then - %% cleanup these values in a subsequent `ets:match_delete`. - OldBuckets = ets:select(?ETS, [{{{bucket, '$1'}, '_'}, [], ['$1']}]), - BucketDefaults = [{{bucket, Bucket}, undefined} || Bucket <- OldBuckets], - BucketMeta = - [{{bucket, Bucket}, Meta} - || Bucket <- riak_core_ring:get_buckets(TaintedRing), - {ok,Meta} <- [riak_core_ring:get_meta({bucket, Bucket}, TaintedRing)]], - BucketMeta2 = lists:ukeysort(1, BucketMeta ++ BucketDefaults), - CHBin = chashbin:create(riak_core_ring:chash(TaintedRing)), + TaintedRing = riak_core_ring:set_tainted(Ring), + CHBin = + chashbin:create(riak_core_ring:chash(TaintedRing)), {Epoch, Id} = ets:lookup_element(?ETS, id, 2), - Actions = [{ring, TaintedRing}, - {raw_ring, Ring}, - {id, {Epoch,Id+1}}, - {chashbin, CHBin} | BucketMeta2], + Actions = [{ring, TaintedRing}, {raw_ring, Ring}, + {id, {Epoch, Id + 1}}, {chashbin, CHBin}], ets:insert(?ETS, Actions), - ets:match_delete(?ETS, {{bucket, '_'}, undefined}), - case riak_core_mochiglobal:get(?RING_KEY) of - ets -> - ok; - _ -> - riak_core_mochiglobal:put(?RING_KEY, ets) + case 
persistent_term:get(?RING_KEY, undefined) of + ets -> ok; + _ -> persistent_term:put(?RING_KEY, ets) end, ok. promote_ring() -> {ok, Ring} = get_my_ring(), - riak_core_mochiglobal:put(?RING_KEY, Ring). + persistent_term:put(?RING_KEY, Ring). %% Persist a new ring file, set the global value and notify any listeners prune_write_notify_ring(Ring, State) -> @@ -643,16 +541,18 @@ prune_write_notify_ring(Ring, State) -> State2. prune_write_ring(Ring, State) -> - riak_core_ring:check_tainted(Ring, "Error: Persisting tainted ring"), + riak_core_ring:check_tainted(Ring, + "Error: Persisting tainted ring"), ok = riak_core_ring_manager:prune_ringfiles(), _ = do_write_ringfile(Ring), State2 = set_ring(Ring, State), State2. -is_stable_ring(#state{ring_changed_time=Then}) -> - DeltaUS = erlang:max(0, timer:now_diff(os:timestamp(), Then)), +is_stable_ring(#state{ring_changed_time = Then}) -> + DeltaUS = erlang:max(0, + timer:now_diff(os:timestamp(), Then)), DeltaMS = DeltaUS div 1000, - IsStable = DeltaMS >= ?PROMOTE_TIMEOUT, + IsStable = DeltaMS >= (?PROMOTE_TIMEOUT), {IsStable, DeltaMS}. %% =================================================================== @@ -661,68 +561,81 @@ is_stable_ring(#state{ring_changed_time=Then}) -> -ifdef(TEST). back_test() -> - X = [1,2,3], - List1 = [[1,2,3],[4,2,3], [7,8,3], [11,12,13], [1,2,3]], - List2 = [[7,8,9], [1,2,3]], - List3 = [[1,2,3]], - ?assertEqual([[4,2,3]], back(1, X, List1)), - ?assertEqual([[7,8,9]], back(1, X, List2)), - ?assertEqual([], back(1, X, List3)), - ?assertEqual([[7,8,3]], back(2, X, List1)), - ?assertEqual([[11,12,13]], back(3, X, List1)). 
+ X = [1, 2, 3], + List1 = [[1, 2, 3], [4, 2, 3], [7, 8, 3], [11, 12, 13], + [1, 2, 3]], + List2 = [[7, 8, 9], [1, 2, 3]], + List3 = [[1, 2, 3]], + ?assertEqual([[4, 2, 3]], (back(1, X, List1))), + ?assertEqual([[7, 8, 9]], (back(1, X, List2))), + ?assertEqual([], (back(1, X, List3))), + ?assertEqual([[7, 8, 3]], (back(2, X, List1))), + ?assertEqual([[11, 12, 13]], (back(3, X, List1))). prune_list_test() -> - TSList1 = [[2011,2,28,16,32,16],[2011,2,28,16,32,36],[2011,2,28,16,30,27],[2011,2,28,16,32,16],[2011,2,28,16,32,36]], - TSList2 = [[2011,2,28,16,32,36],[2011,2,28,16,31,16],[2011,2,28,16,30,27],[2011,2,28,16,32,16],[2011,2,28,16,32,36]], - PrunedList1 = [[2011,2,28,16,30,27],[2011,2,28,16,32,16]], - PrunedList2 = [[2011,2,28,16,31,16],[2011,2,28,16,32,36]], - ?assertEqual(PrunedList1, prune_list(TSList1)), - ?assertEqual(PrunedList2, prune_list(TSList2)). + TSList1 = [[2011, 2, 28, 16, 32, 16], + [2011, 2, 28, 16, 32, 36], [2011, 2, 28, 16, 30, 27], + [2011, 2, 28, 16, 32, 16], [2011, 2, 28, 16, 32, 36]], + TSList2 = [[2011, 2, 28, 16, 32, 36], + [2011, 2, 28, 16, 31, 16], [2011, 2, 28, 16, 30, 27], + [2011, 2, 28, 16, 32, 16], [2011, 2, 28, 16, 32, 36]], + PrunedList1 = [[2011, 2, 28, 16, 30, 27], + [2011, 2, 28, 16, 32, 16]], + PrunedList2 = [[2011, 2, 28, 16, 31, 16], + [2011, 2, 28, 16, 32, 36]], + ?assertEqual(PrunedList1, (prune_list(TSList1))), + ?assertEqual(PrunedList2, (prune_list(TSList2))). set_ring_global_test() -> setup_ets(test), - application:set_env(riak_core,ring_creation_size, 4), + application:set_env(riak_core, ring_creation_size, 4), Ring = riak_core_ring:fresh(), set_ring_global(Ring), promote_ring(), - ?assert(riak_core_ring:nearly_equal(Ring, riak_core_mochiglobal:get(?RING_KEY))), + ?assert((riak_core_ring:nearly_equal(Ring, + persistent_term:get(?RING_KEY, + undefined)))), cleanup_ets(test). 
set_my_ring_test() -> setup_ets(test), - application:set_env(riak_core,ring_creation_size, 4), + application:set_env(riak_core, ring_creation_size, 4), Ring = riak_core_ring:fresh(), set_ring_global(Ring), {ok, MyRing} = get_my_ring(), - ?assert(riak_core_ring:nearly_equal(Ring, MyRing)), + ?assert((riak_core_ring:nearly_equal(Ring, MyRing))), cleanup_ets(test). refresh_my_ring_test() -> - {spawn, fun() -> - setup_ets(test), - Core_Settings = [{ring_creation_size, 4}, - {ring_state_dir, "_build/test/tmp"}, - {cluster_name, "test"}], - [begin - put({?MODULE,AppKey}, application:get_env(riak_core, AppKey, undefined)), - ok = application:set_env(riak_core, AppKey, Val) - end || {AppKey, Val} <- Core_Settings], - stop_core_processes(), - riak_core_ring_events:start_link(), - riak_core_ring_manager:start_link(test), - riak_core_vnode_sup:start_link(), - riak_core_vnode_master:start_link(riak_core_vnode), - riak_core_test_util:setup_mockring1(), - ?assertEqual(ok, riak_core_ring_manager:refresh_my_ring()), - stop_core_processes(), - %% Cleanup the ring file created for this test - {ok, RingFile} = find_latest_ringfile(), - file:delete(RingFile), - [ok = application:set_env(riak_core, AppKey, get({?MODULE, AppKey})) - || {AppKey, _Val} <- Core_Settings], - ok - end - }. 
+ {spawn, + fun () -> + setup_ets(test), + Core_Settings = [{ring_creation_size, 4}, + {ring_state_dir, "_build/test/tmp"}, + {cluster_name, "test"}], + [begin + put({?MODULE, AppKey}, + application:get_env(riak_core, AppKey, undefined)), + ok = application:set_env(riak_core, AppKey, Val) + end + || {AppKey, Val} <- Core_Settings], + stop_core_processes(), + riak_core_ring_events:start_link(), + riak_core_ring_manager:start_link(test), + riak_core_vnode_sup:start_link(), + riak_core_vnode_master:start_link(riak_core_vnode), + riak_core_test_util:setup_mockring1(), + ?assertEqual(ok, + (riak_core_ring_manager:refresh_my_ring())), + stop_core_processes(), + %% Cleanup the ring file created for this test + {ok, RingFile} = find_latest_ringfile(), + file:delete(RingFile), + [ok = application:set_env(riak_core, AppKey, + get({?MODULE, AppKey})) + || {AppKey, _Val} <- Core_Settings], + ok + end}. stop_core_processes() -> riak_core_test_util:stop_pid(riak_core_ring_events), @@ -731,47 +644,52 @@ stop_core_processes() -> riak_core_test_util:stop_pid(riak_core_vnode_master). -define(TEST_RINGDIR, "_build/test_ring"). --define(TEST_RINGFILE, (?TEST_RINGDIR ++ "/ring")). --define(TMP_RINGFILE, (?TEST_RINGFILE ++ ".tmp")). + +-define(TEST_RINGFILE, (?TEST_RINGDIR) ++ "/ring"). + +-define(TMP_RINGFILE, (?TEST_RINGFILE) ++ ".tmp"). 
do_write_ringfile_test() -> + application:set_env(riak_core, cluster_name, "test"), %% Make sure no data exists from previous runs file:change_mode(?TMP_RINGFILE, 8#00644), file:delete(?TMP_RINGFILE), - %% Check happy path - GenR = fun(Name) -> riak_core_ring:fresh(64, Name) end, - ?assertEqual(ok, do_write_ringfile(GenR(happy), ?TEST_RINGFILE)), - + GenR = fun (Name) -> riak_core_ring:fresh(64, Name) end, + ?assertEqual(ok, + (do_write_ringfile(GenR(happy), ?TMP_RINGFILE))), %% errors expected error_logger:tty(false), - %% Check write fails (create .tmp file with no write perms) - ok = file:write_file(?TMP_RINGFILE, <<"no write for you">>), + ok = file:write_file(?TMP_RINGFILE, + <<"no write for you">>), ok = file:change_mode(?TMP_RINGFILE, 8#00444), - ?assertMatch({error,_}, do_write_ringfile(GenR(tmp_perms), ?TEST_RINGFILE)), + ?assertMatch({error, _}, + (do_write_ringfile(GenR(tmp_perms), ?TEST_RINGFILE))), ok = file:change_mode(?TMP_RINGFILE, 8#00644), ok = file:delete(?TMP_RINGFILE), - %% Check rename fails ok = file:change_mode(?TEST_RINGDIR, 8#00444), - ?assertMatch({error,_}, do_write_ringfile(GenR(ring_perms), ?TEST_RINGFILE)), + ?assertMatch({error, _}, + (do_write_ringfile(GenR(ring_perms), ?TEST_RINGFILE))), ok = file:change_mode(?TEST_RINGDIR, 8#00755), - error_logger:tty(true), - %% Cleanup the ring file created for this test - {ok, RingFile} = find_latest_ringfile(), - file:delete(RingFile). + file:delete(?TMP_RINGFILE). 
is_stable_ring_test() -> - {A,B,C} = Now = os:timestamp(), - TimeoutSecs = ?PROMOTE_TIMEOUT div 1000, - Within = {A, B - (TimeoutSecs div 2), C}, + {A, B, C} = Now = os:timestamp(), + TimeoutSecs = (?PROMOTE_TIMEOUT) div 1000, + Within = {A, B - TimeoutSecs div 2, C}, Outside = {A, B - (TimeoutSecs + 1), C}, - ?assertMatch({true,_},is_stable_ring(#state{ring_changed_time={0,0,0}})), - ?assertMatch({true,_},is_stable_ring(#state{ring_changed_time=Outside})), - ?assertMatch({false,_},is_stable_ring(#state{ring_changed_time=Within})), - ?assertMatch({false,_},is_stable_ring(#state{ring_changed_time=Now})). + ?assertMatch({true, _}, + (is_stable_ring(#state{ring_changed_time = + {0, 0, 0}}))), + ?assertMatch({true, _}, + (is_stable_ring(#state{ring_changed_time = Outside}))), + ?assertMatch({false, _}, + (is_stable_ring(#state{ring_changed_time = Within}))), + ?assertMatch({false, _}, + (is_stable_ring(#state{ring_changed_time = Now}))). -endif. diff --git a/src/riak_core_ring_util.erl b/src/riak_core_ring_util.erl index 5b27783e4..28307b9a1 100644 --- a/src/riak_core_ring_util.erl +++ b/src/riak_core_ring_util.erl @@ -21,30 +21,24 @@ %% ------------------------------------------------------------------- -module(riak_core_ring_util). --export([assign/2, - check_ring/0, - check_ring/1, - check_ring/2, - hash_to_partition_id/2, - partition_id_to_hash/2, - hash_is_partition_boundary/2]). +-export([assign/2, check_ring/0, check_ring/1, + check_ring/2, hash_to_partition_id/2, + partition_id_to_hash/2, hash_is_partition_boundary/2]). -ifdef(TEST). --ifdef(EQC). --export([prop_ids_are_boundaries/0, prop_reverse/0, - prop_monotonic/0, prop_only_boundaries/0]). --include_lib("eqc/include/eqc.hrl"). --endif. -include_lib("eunit/include/eunit.hrl"). + -endif. 
%% @doc Forcibly assign a partition to a specific node assign(Partition, ToNode) -> - F = fun(Ring, _) -> - {new_ring, riak_core_ring:transfer_node(Partition, ToNode, Ring)} + F = fun (Ring, _) -> + {new_ring, + riak_core_ring:transfer_node(Partition, ToNode, Ring)} end, - {ok, _NewRing} = riak_core_ring_manager:ring_trans(F, undefined), + {ok, _NewRing} = riak_core_ring_manager:ring_trans(F, + undefined), ok. %% @doc Check the local ring for any preflists that do not satisfy n_val @@ -53,214 +47,109 @@ check_ring() -> check_ring(R). check_ring(Ring) -> - {ok, Props} = application:get_env(riak_core, default_bucket_props), - {n_val, Nval} = lists:keyfind(n_val, 1, Props), + {ok, Nval} = application:get_env(riak_core, + target_n_val), check_ring(Ring, Nval). %% @doc Check a ring for any preflists that do not satisfy n_val check_ring(Ring, Nval) -> Preflists = riak_core_ring:all_preflists(Ring, Nval), - lists:foldl(fun(PL,Acc) -> - PLNodes = lists:usort([Node || {_,Node} <- PL]), + lists:foldl(fun (PL, Acc) -> + PLNodes = lists:usort([Node || {_, Node} <- PL]), case length(PLNodes) of - Nval -> - Acc; - _ -> - ordsets:add_element(PL, Acc) + Nval -> Acc; + _ -> ordsets:add_element(PL, Acc) end - end, [], Preflists). + end, + [], Preflists). + +-spec hash_to_partition_id(chash:index() | + chash:index_as_int(), + riak_core_ring:ring_size()) -> riak_core_ring:partition_id(). --spec hash_to_partition_id(chash:index() | chash:index_as_int(), - riak_core_ring:ring_size()) -> - riak_core_ring:partition_id(). %% @doc Map a key hash (as binary or integer) to a partition ID [0, ring_size) -hash_to_partition_id(CHashKey, RingSize) when is_binary(CHashKey) -> +hash_to_partition_id(CHashKey, RingSize) + when is_binary(CHashKey) -> <> = CHashKey, hash_to_partition_id(CHashInt, RingSize); hash_to_partition_id(CHashInt, RingSize) -> CHashInt div chash:ring_increment(RingSize). --spec partition_id_to_hash(riak_core_ring:partition_id(), pos_integer()) -> - chash:index_as_int(). 
+-spec + partition_id_to_hash(riak_core_ring:partition_id(), + pos_integer()) -> chash:index_as_int(). + %% @doc Identify the first key hash (integer form) in a partition ID [0, ring_size) partition_id_to_hash(Id, RingSize) -> Id * chash:ring_increment(RingSize). +-spec hash_is_partition_boundary(chash:index() | + chash:index_as_int(), + pos_integer()) -> boolean(). --spec hash_is_partition_boundary(chash:index() | chash:index_as_int(), - pos_integer()) -> - boolean(). %% @doc For user-facing tools, indicate whether a specified hash value %% is a valid "boundary" value (first hash in some partition) -hash_is_partition_boundary(CHashKey, RingSize) when is_binary(CHashKey) -> +hash_is_partition_boundary(CHashKey, RingSize) + when is_binary(CHashKey) -> <> = CHashKey, hash_is_partition_boundary(CHashInt, RingSize); hash_is_partition_boundary(CHashInt, RingSize) -> CHashInt rem chash:ring_increment(RingSize) =:= 0. - %% =================================================================== %% EUnit tests %% =================================================================== -ifdef(TEST). -%% The EQC properties below are more comprehensive tests for hashes as -%% integers; use pure unit tests to make certain that binary hashes +%% Use pure unit tests to make certain that binary hashes %% are handled. %% Partition boundaries are reversable. reverse_test() -> - IntIndex = riak_core_ring_util:partition_id_to_hash(31, 32), + IntIndex = riak_core_ring_util:partition_id_to_hash(31, + 32), HashIndex = <>, - ?assertEqual(31, riak_core_ring_util:hash_to_partition_id(HashIndex, 32)), - ?assertEqual(0, riak_core_ring_util:hash_to_partition_id(<<0:160>>, 32)). + ?assertEqual(31, + (riak_core_ring_util:hash_to_partition_id(HashIndex, + 32))), + ?assertEqual(0, + (riak_core_ring_util:hash_to_partition_id(<<0:160>>, + 32))). %% Index values somewhere in the middle of a partition can be mapped %% to partition IDs. 
partition_test() -> - IntIndex = riak_core_ring_util:partition_id_to_hash(20, 32) + - chash:ring_increment(32) div 3, + IntIndex = riak_core_ring_util:partition_id_to_hash(20, + 32) + + chash:ring_increment(32) div 3, HashIndex = <>, - ?assertEqual(20, riak_core_ring_util:hash_to_partition_id(HashIndex, 32)). + ?assertEqual(20, + (riak_core_ring_util:hash_to_partition_id(HashIndex, + 32))). %% Index values divisible by partition size are boundary values, others are not boundary_test() -> - BoundaryIndex = riak_core_ring_util:partition_id_to_hash(15, 32), - ?assert(riak_core_ring_util:hash_is_partition_boundary(<>, 32)), - ?assertNot(riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + 32):160>>, 32)), - ?assertNot(riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex - 32):160>>, 32)), - ?assertNot(riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + 1):160>>, 32)), - ?assertNot(riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex - 1):160>>, 32)), - ?assertNot(riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + 2):160>>, 32)), - ?assertNot(riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + 10):160>>, 32)). - --ifdef(EQC). - --define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> - io:format(user, Str, Args) end, P)). --define(TEST_TIME_SECS, 5). - --define(HASHMAX, 1 bsl 160 - 1). --define(RINGSIZEEXPMAX, 11). --define(RINGSIZE(X), (1 bsl X)). %% We'll generate powers of 2 with choose() and convert that to a ring size with this macro --define(PARTITIONSIZE(X), ((1 bsl 160) div (X))). - -ids_are_boundaries_test_() -> - {timeout, ?TEST_TIME_SECS+5, [?_assert(test_ids_are_boundaries() =:= true)]}. - -test_ids_are_boundaries() -> - test_ids_are_boundaries(?TEST_TIME_SECS). - -test_ids_are_boundaries(TestTimeSecs) -> - eqc:quickcheck(eqc:testing_time(TestTimeSecs, ?QC_OUT(prop_ids_are_boundaries()))). - -reverse_test_() -> - {timeout, ?TEST_TIME_SECS+5, [?_assert(test_reverse() =:= true)]}. 
- -test_reverse() -> - test_reverse(?TEST_TIME_SECS). - -test_reverse(TestTimeSecs) -> - eqc:quickcheck(eqc:testing_time(TestTimeSecs, ?QC_OUT(prop_reverse()))). - - -monotonic_test_() -> - {timeout, ?TEST_TIME_SECS+5, [?_assert(test_monotonic() =:= true)]}. - -test_monotonic() -> - test_monotonic(?TEST_TIME_SECS). - -test_monotonic(TestTimeSecs) -> - eqc:quickcheck(eqc:testing_time(TestTimeSecs, ?QC_OUT(prop_monotonic()))). - - -%% `prop_only_boundaries' should run a little longer: not quite as -%% fast, need to scan a larger portion of hash space to establish -%% correctness -only_boundaries_test_() -> - {timeout, ?TEST_TIME_SECS+15, [?_assert(test_only_boundaries() =:= true)]}. - -test_only_boundaries() -> - test_only_boundaries(?TEST_TIME_SECS+10). - -test_only_boundaries(TestTimeSecs) -> - eqc:quickcheck(eqc:testing_time(TestTimeSecs, ?QC_OUT(prop_only_boundaries()))). - -%% Partition IDs should map to hash values which are partition boundaries -prop_ids_are_boundaries() -> - ?FORALL(RingPower, choose(2, ?RINGSIZEEXPMAX), - ?FORALL(PartitionId, choose(0, ?RINGSIZE(RingPower) - 1), - begin - RingSize = ?RINGSIZE(RingPower), - BoundaryHash = - riak_core_ring_util:partition_id_to_hash(PartitionId, - RingSize), - equals(true, - riak_core_ring_util:hash_is_partition_boundary(BoundaryHash, - RingSize)) - end - )). - -%% Partition IDs should map to hash values which map back to the same partition IDs -prop_reverse() -> - ?FORALL(RingPower, choose(2, ?RINGSIZEEXPMAX), - ?FORALL(PartitionId, choose(0, ?RINGSIZE(RingPower) - 1), - begin - RingSize = ?RINGSIZE(RingPower), - BoundaryHash = - riak_core_ring_util:partition_id_to_hash(PartitionId, - RingSize), - equals(PartitionId, - riak_core_ring_util:hash_to_partition_id( - BoundaryHash, RingSize)) - end - )). - -%% For any given hash value, any larger hash value maps to a partition -%% ID of greater or equal value. 
-prop_monotonic() -> - ?FORALL(RingPower, choose(2, ?RINGSIZEEXPMAX), - ?FORALL(HashValue, choose(0, ?HASHMAX - 1), - ?FORALL(GreaterHash, choose(HashValue + 1, ?HASHMAX), - begin - RingSize = ?RINGSIZE(RingPower), - LowerPartition = - riak_core_ring_util:hash_to_partition_id(HashValue, - RingSize), - GreaterPartition = - riak_core_ring_util:hash_to_partition_id(GreaterHash, - RingSize), - LowerPartition =< GreaterPartition - end - ))). - -%% Hash values which are listed in the ring structure are boundary -%% values -ring_to_set({_RingSize, PropList}) -> - ordsets:from_list(lists:map(fun({Hash, dummy}) -> Hash end, PropList)). - -find_near_boundaries(RingSize, PartitionSize) -> - ?LET({Id, Offset}, {choose(1, RingSize-1), choose(-(RingSize*2), (RingSize*2))}, - Id * PartitionSize + Offset). - -prop_only_boundaries() -> - ?FORALL(RingPower, choose(2, ?RINGSIZEEXPMAX), - ?FORALL({HashValue, BoundarySet}, - {frequency([ - {5, choose(0, ?HASHMAX)}, - {2, find_near_boundaries(?RINGSIZE(RingPower), - ?PARTITIONSIZE(?RINGSIZE(RingPower)))}]), - ring_to_set(chash:fresh(?RINGSIZE(RingPower), dummy))}, - begin - RingSize = ?RINGSIZE(RingPower), - HashIsInRing = ordsets:is_element(HashValue, BoundarySet), - HashIsPartitionBoundary = - riak_core_ring_util:hash_is_partition_boundary(HashValue, - RingSize), - equals(HashIsPartitionBoundary, HashIsInRing) - end - )). 
+ BoundaryIndex = + riak_core_ring_util:partition_id_to_hash(15, 32), + ?assert((riak_core_ring_util:hash_is_partition_boundary(<>, + 32))), + ?assertNot((riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + + 32):160>>, + 32))), + ?assertNot((riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + - 32):160>>, + 32))), + ?assertNot((riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + + 1):160>>, + 32))), + ?assertNot((riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + - 1):160>>, + 32))), + ?assertNot((riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + + 2):160>>, + 32))), + ?assertNot((riak_core_ring_util:hash_is_partition_boundary(<<(BoundaryIndex + + 10):160>>, + 32))). --endif. % EQC -endif. % TEST diff --git a/src/riak_core_send_msg.erl b/src/riak_core_send_msg.erl index 80dda126c..41138893a 100644 --- a/src/riak_core_send_msg.erl +++ b/src/riak_core_send_msg.erl @@ -22,19 +22,24 @@ -module(riak_core_send_msg). --export([reply_unreliable/2, - cast_unreliable/2, - send_event_unreliable/2, - bang_unreliable/2]). +-export([reply_unreliable/2, cast_unreliable/2, + send_event_unreliable/2, bang_unreliable/2]). -ifdef(TEST). + -ifdef(PULSE). + -compile(export_all). + -compile({parse_transform, pulse_instrument}). --compile({pulse_replace_module, [{gen_fsm, pulse_gen_fsm}, - {gen_fsm_compat, pulse_gen_fsm}, - {gen_server, pulse_gen_server}]}). + +-compile({pulse_replace_module, + [{gen_fsm, pulse_gen_fsm}, + {gen_fsm_compat, pulse_gen_fsm}, + {gen_server, pulse_gen_server}]}). + -endif. + -endif. %% NOTE: We'ed peeked inside gen_server.erl's guts to see its internals. @@ -45,13 +50,14 @@ cast_unreliable(Dest, Request) -> bang_unreliable(Dest, {'$gen_cast', Request}). %% NOTE: We'ed peeked inside gen_fsm.erl's guts to see its internals. 
-send_event_unreliable({global, _Name} = GlobalTo, Event) -> +send_event_unreliable({global, _Name} = GlobalTo, + Event) -> erlang:error({unimplemented_send, GlobalTo, Event}); -send_event_unreliable({via, _Mod, _Name} = ViaTo, Event) -> +send_event_unreliable({via, _Module, _Name} = ViaTo, + Event) -> erlang:error({unimplemented_send, ViaTo, Event}); send_event_unreliable(Name, Event) -> - bang_unreliable(Name, {'$gen_event', Event}), - ok. + bang_unreliable(Name, {'$gen_event', Event}), ok. bang_unreliable(Dest, Msg) -> catch erlang:send(Dest, Msg, [noconnect, nosuspend]), diff --git a/src/riak_core_status.erl b/src/riak_core_status.erl index fce67152e..bf8d8e3a2 100644 --- a/src/riak_core_status.erl +++ b/src/riak_core_status.erl @@ -20,62 +20,51 @@ %% %% ------------------------------------------------------------------- -module(riak_core_status). --export([ringready/0, - all_active_transfers/0, - transfers/0, - partitions/2, - ring_status/0]). --spec(ringready() -> {ok, [atom()]} | {error, any()}). +-export([ringready/0, all_active_transfers/0, + transfers/0, partitions/2, ring_status/0]). + +-spec ringready() -> {ok, [atom()]} | {error, any()}. + ringready() -> case get_rings() of - {[], Rings} -> - {N1,R1}=hd(Rings), - case rings_match(hash_ring(R1), tl(Rings)) of - true -> - Nodes = [N || {N,_} <- Rings], - {ok, Nodes}; - - {false, N2} -> - {error, {different_owners, N1, N2}} - end; - - {Down, _Rings} -> - {error, {nodes_down, Down}} + {[], Rings} -> + {N1, R1} = hd(Rings), + case rings_match(hash_ring(R1), tl(Rings)) of + true -> Nodes = [N || {N, _} <- Rings], {ok, Nodes}; + {false, N2} -> {error, {different_owners, N1, N2}} + end; + {Down, _Rings} -> {error, {nodes_down, Down}} end. +-spec transfers() -> {[atom()], + [{waiting_to_handoff, atom(), integer()} | + {stopped, atom(), integer()}]}. --spec(transfers() -> {[atom()], [{waiting_to_handoff, atom(), integer()} | - {stopped, atom(), integer()}]}). 
transfers() -> {Down, Rings} = get_rings(), - %% Work out which vnodes are running and which partitions they claim - F = fun({N,R}, Acc) -> + F = fun ({N, R}, Acc) -> {_Pri, Sec, Stopped} = partitions(N, R), Acc1 = case Sec of - [] -> - []; - _ -> - [{waiting_to_handoff, N, length(Sec)}] + [] -> []; + _ -> [{waiting_to_handoff, N, length(Sec)}] end, case Stopped of - [] -> - Acc1 ++ Acc; - _ -> - Acc1 ++ [{stopped, N, length(Stopped)} | Acc] + [] -> Acc1 ++ Acc; + _ -> Acc1 ++ [{stopped, N, length(Stopped)} | Acc] end end, {Down, lists:foldl(F, [], Rings)}. %% @doc Produce status for all active transfers in the cluster. --spec all_active_transfers() -> {Xfers::list(), Down::list()}. +-spec all_active_transfers() -> {Xfers :: list(), + Down :: list()}. + all_active_transfers() -> {Xfers, Down} = riak_core_util:rpc_every_member(riak_core_handoff_manager, - status, - [{direction, outbound}], - 5000), + status, [{direction, outbound}], 5000), {Xfers, Down}. ring_status() -> @@ -83,49 +72,53 @@ ring_status() -> %% are running on each node. 
{ok, Ring} = riak_core_ring_manager:get_raw_ring(), {AllMods, Down} = - riak_core_util:rpc_every_member_ann(riak_core, vnode_modules, [], 1000), - + riak_core_util:rpc_every_member_ann(riak_core, + vnode_modules, [], 1000), %% Check if the claimant is running and if it believes the ring is ready Claimant = riak_core_ring:claimant(Ring), - case riak_core_util:safe_rpc(Claimant, riak_core_ring, ring_ready, [], 5000) of - {badrpc, _} -> - Down2 = lists:usort([Claimant|Down]), - RingReady = undefined; - RingReady -> - Down2 = Down, - RingReady = RingReady + case riak_core_util:safe_rpc(Claimant, riak_core_ring, + ring_ready, [], 5000) + of + {badrpc, _} -> + Down2 = lists:usort([Claimant | Down]), + RingReady = undefined; + RingReady -> Down2 = Down, RingReady = RingReady end, - %% Get the list of pending ownership changes Changes = riak_core_ring:pending_changes(Ring), %% Group pending changes by (Owner, NextOwner) - Merged = lists:foldl( - fun({Idx, Owner, NextOwner, Mods, Status}, Acc) -> - orddict:append({Owner, NextOwner}, - {Idx, Mods, Status}, - Acc) - end, [], Changes), - + Merged = lists:foldl(fun ({Idx, Owner, NextOwner, Mods, + Status}, + Acc) -> + orddict:append({Owner, NextOwner}, + {Idx, Mods, Status}, Acc) + end, + [], Changes), %% For each pending transfer, determine which vnode modules have completed %% handoff and which we are still waiting on. 
%% Final result is of the form: %% [{Owner, NextOwner}, [{Index, WaitingMods, CompletedMods, Status}]] - TransferStatus = - orddict:map( - fun({Owner, _}, Transfers) -> - case orddict:find(Owner, AllMods) of - error -> - [{Idx, down, Mods, Status} - || {Idx, Mods, Status} <- Transfers]; - {ok, OwnerMods} -> - NodeMods = [Mod || {_App, Mod} <- OwnerMods], - [{Idx, NodeMods -- Mods, Mods, Status} - || {Idx, Mods, Status} <- Transfers] - end - end, Merged), - + TransferStatus = orddict:map(fun ({Owner, _}, + Transfers) -> + case orddict:find(Owner, AllMods) of + error -> + [{Idx, down, Mods, Status} + || {Idx, Mods, Status} + <- Transfers]; + {ok, OwnerMods} -> + NodeMods = [Mod + || {_App, Mod} + <- OwnerMods], + [{Idx, NodeMods -- Mods, Mods, + Status} + || {Idx, Mods, Status} + <- Transfers] + end + end, + Merged), MarkedDown = riak_core_ring:down_members(Ring), - {Claimant, RingReady, Down2, MarkedDown, TransferStatus}. + {Claimant, RingReady, Down2, MarkedDown, + TransferStatus}. %% =================================================================== %% Internal functions @@ -133,10 +126,12 @@ ring_status() -> %% Retrieve the rings for all other nodes by RPC get_rings() -> - {RawRings, Down} = riak_core_util:rpc_every_member( - riak_core_ring_manager, get_my_ring, [], 30000), -%% RawRings2 = [R || {ok, R} <- RawRings], - Rings = orddict:from_list([{riak_core_ring:owner_node(R), R} || R <- RawRings]), + {RawRings, Down} = + riak_core_util:rpc_every_member(riak_core_ring_manager, + get_my_ring, [], 30000), + Rings = + orddict:from_list([{riak_core_ring:owner_node(R), R} + || {ok, R} <- RawRings]), {lists:sort(Down), Rings}. %% Produce a hash of the 'chash' portion of the ring @@ -144,32 +139,35 @@ hash_ring(R) -> erlang:phash2(riak_core_ring:all_owners(R)). 
%% Check if all rings match given a hash and a list of [{N,P}] to check -rings_match(_, []) -> - true; +rings_match(_, []) -> true; rings_match(R1hash, [{N2, R2} | Rest]) -> case hash_ring(R2) of - R1hash -> - rings_match(R1hash, Rest); - _ -> - {false, N2} + R1hash -> rings_match(R1hash, Rest); + _ -> {false, N2} end. %% Get a list of active partition numbers - regardless of vnode type --spec active_partitions(node()) -> ordsets:ordset(non_neg_integer()). +-spec + active_partitions(node()) -> ordsets:ordset(non_neg_integer()). + active_partitions(Node) -> - case riak_core_util:safe_rpc(Node, riak_core_vnode_manager, all_vnodes, [], 30000) of - {badrpc, _} -> ordsets:new(); - VNodes -> - lists:foldl(fun({_, P, _}, Ps) -> - ordsets:add_element(P, Ps) - end, ordsets:new(), VNodes) + case riak_core_util:safe_rpc(Node, + riak_core_vnode_manager, all_vnodes, [], 30000) + of + {badrpc, _} -> ordsets:new(); + VNodes -> + lists:foldl(fun ({_, P, _}, Ps) -> + ordsets:add_element(P, Ps) + end, + ordsets:new(), VNodes) end. %% Return a list of active primary partitions, active secondary partitions (to be handed off) %% and stopped partitions that should be started partitions(Node, Ring) -> Owners = riak_core_ring:all_owners(Ring), - Owned = ordsets:from_list(owned_partitions(Owners, Node)), + Owned = ordsets:from_list(owned_partitions(Owners, + Node)), Active = active_partitions(Node), Stopped = ordsets:subtract(Owned, Active), Secondary = ordsets:subtract(Active, Owned), @@ -179,3 +177,29 @@ partitions(Node, Ring) -> %% Return the list of partitions owned by a node owned_partitions(Owners, Node) -> [P || {P, Owner} <- Owners, Owner =:= Node]. + +%% =================================================================== +%% Unit tests +%% =================================================================== +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). + +-define(TEST_RINGDIR, "_build/test_ring"). + +-define(TEST_RINGFILE, (?TEST_RINGDIR) ++ "/ring"). 
+ +-define(TMP_RINGFILE, (?TEST_RINGFILE) ++ ".tmp"). + +set_my_ring_test() -> + riak_core_ring_manager:setup_ets(test), + application:set_env(riak_core, ring_creation_size, 4), + Ring = riak_core_ring:fresh(), + riak_core_ring_manager:set_ring_global(Ring), + {ok, MyRing} = riak_core_ring_manager:get_my_ring(), + ?assert((riak_core_ring:nearly_equal(Ring, MyRing))), + %% this call should not crash + get_rings(), + riak_core_ring_manager:cleanup_ets(test). + +-endif. diff --git a/src/riak_core_sup.erl b/src/riak_core_sup.erl index 5ce5d25ed..5b875eb0f 100644 --- a/src/riak_core_sup.erl +++ b/src/riak_core_sup.erl @@ -31,8 +31,13 @@ -export([init/1]). %% Helper macro for declaring children of supervisor --define(CHILD(I, Type, Timeout, Args), {I, {I, start_link, Args}, permanent, Timeout, Type, [I]}). --define(CHILD(I, Type, Timeout), ?CHILD(I, Type, Timeout, [])). +-define(CHILD(I, Type, Timeout, Args), + {I, {I, start_link, Args}, permanent, Timeout, Type, + [I]}). + +-define(CHILD(I, Type, Timeout), + ?CHILD(I, Type, Timeout, [])). + -define(CHILD(I, Type), ?CHILD(I, Type, 5000)). 
%% =================================================================== @@ -47,20 +52,16 @@ start_link() -> %% =================================================================== init([]) -> - - Children = lists:flatten( - [ - ?CHILD(riak_core_vnode_sup, supervisor, 305000), - ?CHILD(riak_core_eventhandler_sup, supervisor), - ?CHILD(riak_core_handoff_sup, supervisor), - ?CHILD(riak_core_ring_events, worker), - ?CHILD(riak_core_ring_manager, worker), - ?CHILD(riak_core_vnode_proxy_sup, supervisor), - ?CHILD(riak_core_node_watcher_events, worker), - ?CHILD(riak_core_node_watcher, worker), - ?CHILD(riak_core_vnode_manager, worker), - ?CHILD(riak_core_gossip, worker), - ?CHILD(riak_core_claimant, worker) - ]), - + Children = lists:flatten([?CHILD(riak_core_vnode_sup, + supervisor, 305000), + ?CHILD(riak_core_eventhandler_sup, supervisor), + ?CHILD(riak_core_handoff_sup, supervisor), + ?CHILD(riak_core_ring_events, worker), + ?CHILD(riak_core_ring_manager, worker), + ?CHILD(riak_core_vnode_proxy_sup, supervisor), + ?CHILD(riak_core_node_watcher_events, worker), + ?CHILD(riak_core_node_watcher, worker), + ?CHILD(riak_core_vnode_manager, worker), + ?CHILD(riak_core_gossip, worker), + ?CHILD(riak_core_claimant, worker)]), {ok, {{one_for_one, 10, 10}, Children}}. diff --git a/src/riak_core_test_util.erl b/src/riak_core_test_util.erl index 8646f51ee..fad058bf9 100644 --- a/src/riak_core_test_util.erl +++ b/src/riak_core_test_util.erl @@ -25,25 +25,19 @@ -module(riak_core_test_util). -ifdef(TEST). --export([setup_mockring1/0, - fake_ring/2, - stop_pid/1, - wait_for_pid/1, - stop_pid/2, - unlink_named_process/1]). + +-export([setup_mockring1/0, fake_ring/2, stop_pid/1, + wait_for_pid/1, stop_pid/2, unlink_named_process/1]). + -include_lib("eunit/include/eunit.hrl"). -stop_pid(undefined) -> - ok; +stop_pid(undefined) -> ok; stop_pid(Name) when is_atom(Name) -> stop_pid(whereis(Name)); -stop_pid(Other) when not is_pid(Other) -> - ok; -stop_pid(Pid) -> - stop_pid(Pid, kill). 
+stop_pid(Other) when not is_pid(Other) -> ok; +stop_pid(Pid) -> stop_pid(Pid, kill). -stop_pid(Other, _ExitType) when not is_pid(Other) -> - ok; +stop_pid(Other, _ExitType) when not is_pid(Other) -> ok; stop_pid(Pid, ExitType) -> unlink(Pid), exit(Pid, ExitType), @@ -52,53 +46,59 @@ stop_pid(Pid, ExitType) -> wait_for_pid(Pid) -> Mref = erlang:monitor(process, Pid), receive - {'DOWN', Mref, process, _, _} -> - ok - after - 5000 -> - {error, didnotexit} + {'DOWN', Mref, process, _, _} -> ok + after 5000 -> {error, didnotexit} end. - unlink_named_process(Name) when is_atom(Name) -> unlink(whereis(Name)). setup_mockring1() -> % requires a running riak_core_ring_manager, in test-mode is ok - Ring0 = riak_core_ring:fresh(16,node()), - Ring1 = riak_core_ring:add_member(node(), Ring0, 'othernode@otherhost'), - Ring2 = riak_core_ring:add_member(node(), Ring1, 'othernode2@otherhost2'), - - Ring3 = lists:foldl(fun(_,R) -> - riak_core_ring:transfer_node( - hd(riak_core_ring:my_indices(R)), - 'othernode@otherhost', R) end, - Ring2,[1,2,3,4,5,6]), - Ring = lists:foldl(fun(_,R) -> - riak_core_ring:transfer_node( - hd(riak_core_ring:my_indices(R)), - 'othernode2@otherhost2', R) end, - Ring3,[1,2,3,4,5,6]), + Ring0 = riak_core_ring:fresh(16, node()), + Ring1 = riak_core_ring:add_member(node(), Ring0, + othernode@otherhost), + Ring2 = riak_core_ring:add_member(node(), Ring1, + othernode2@otherhost2), + Ring3 = lists:foldl(fun (_, R) -> + riak_core_ring:transfer_node(hd(riak_core_ring:my_indices(R)), + othernode@otherhost, + R) + end, + Ring2, [1, 2, 3, 4, 5, 6]), + Ring = lists:foldl(fun (_, R) -> + riak_core_ring:transfer_node(hd(riak_core_ring:my_indices(R)), + othernode2@otherhost2, + R) + end, + Ring3, [1, 2, 3, 4, 5, 6]), riak_core_ring_manager:set_ring_global(Ring). 
fake_ring(Size, NumNodes) -> - ManyNodes = [list_to_atom("dev" ++ integer_to_list(X) ++ "@127.0.0.1") + ManyNodes = [list_to_atom("dev" ++ + integer_to_list(X) ++ "@127.0.0.1") || _ <- lists:seq(0, Size div NumNodes), X <- lists:seq(1, NumNodes)], Nodes = lists:sublist(ManyNodes, Size), Inc = chash:ring_increment(Size), - Indices = lists:seq(0, (Size-1)*Inc, Inc), + Indices = lists:seq(0, (Size - 1) * Inc, Inc), Owners = lists:zip(Indices, Nodes), - [Node|OtherNodes] = Nodes, + [Node | OtherNodes] = Nodes, Ring = riak_core_ring:fresh(Size, Node), - Ring2 = lists:foldl(fun(OtherNode, RingAcc) -> - RingAcc2 = riak_core_ring:add_member(Node, RingAcc, OtherNode), - riak_core_ring:set_member(Node, RingAcc2, OtherNode, - valid, same_vclock) - end, Ring, OtherNodes), - Ring3 = lists:foldl(fun({Idx, Owner}, RingAcc) -> - riak_core_ring:transfer_node(Idx, Owner, RingAcc) - end, Ring2, Owners), + Ring2 = lists:foldl(fun (OtherNode, RingAcc) -> + RingAcc2 = riak_core_ring:add_member(Node, + RingAcc, + OtherNode), + riak_core_ring:set_member(Node, RingAcc2, + OtherNode, valid, + same_vclock) + end, + Ring, OtherNodes), + Ring3 = lists:foldl(fun ({Idx, Owner}, RingAcc) -> + riak_core_ring:transfer_node(Idx, Owner, + RingAcc) + end, + Ring2, Owners), Ring3. -endif. %TEST. diff --git a/src/riak_core_util.erl b/src/riak_core_util.erl index 8ffed9312..a36dcd622 100644 --- a/src/riak_core_util.erl +++ b/src/riak_core_util.erl @@ -21,83 +21,51 @@ %% @doc Various functions that are useful throughout Riak. -module(riak_core_util). 
--export([moment/0, - make_tmp_dir/0, - replace_file/2, - compare_dates/2, - reload_all/1, - integer_to_list/2, - unique_id_62/0, - str_to_node/1, - chash_key/1, chash_key/2, - chash_std_keyfun/1, - chash_bucketonly_keyfun/1, - mkclientid/1, - start_app_deps/1, - build_tree/3, - orddict_delta/2, - safe_rpc/4, - safe_rpc/5, - rpc_every_member/4, - rpc_every_member_ann/4, - count/2, - keydelete/2, - multi_keydelete/2, - multi_keydelete/3, - compose/1, - compose/2, - pmap/2, - pmap/3, - multi_rpc/4, - multi_rpc/5, - multi_rpc_ann/4, - multi_rpc_ann/5, - multicall_ann/4, - multicall_ann/5, - shuffle/1, - is_arch/1, - format_ip_and_port/2, - peername/2, - sockname/2, - sha/1, - md5/1, - make_fold_req/1, - make_fold_req/2, - make_fold_req/4, - make_newest_fold_req/1, - proxy_spawn/1, - proxy/2, - enable_job_class/1, - enable_job_class/2, - disable_job_class/1, - disable_job_class/2, - job_class_enabled/1, - job_class_enabled/2, +-export([moment/0, make_tmp_dir/0, replace_file/2, + compare_dates/2, reload_all/1, integer_to_list/2, + unique_id_62/0, str_to_node/1, chash_key/1, chash_key/2, + chash_std_keyfun/1, chash_bucketonly_keyfun/1, + mkclientid/1, start_app_deps/1, build_tree/3, + orddict_delta/2, safe_rpc/4, safe_rpc/5, + rpc_every_member/4, rpc_every_member_ann/4, count/2, + keydelete/2, multi_keydelete/2, multi_keydelete/3, + compose/1, compose/2, pmap/2, pmap/3, multi_rpc/4, + multi_rpc/5, multi_rpc_ann/4, multi_rpc_ann/5, + multicall_ann/4, multicall_ann/5, shuffle/1, is_arch/1, + format_ip_and_port/2, peername/2, sockname/2, sha/1, + md5/1, make_fold_req/1, make_fold_req/2, + make_fold_req/4, make_newest_fold_req/1, proxy_spawn/1, + proxy/2, enable_job_class/1, enable_job_class/2, + disable_job_class/1, disable_job_class/2, + job_class_enabled/1, job_class_enabled/2, job_class_disabled_message/2, - report_job_request_disposition/6, - responsible_preflists/1, - responsible_preflists/2, - get_index_n/1, - preflist_siblings/1, - posix_error/1 - ]). 
+ report_job_request_disposition/6, get_index_n/1, + posix_error/1]). -include("riak_core_vnode.hrl"). -ifdef(TEST). --ifdef(EQC). --include_lib("eqc/include/eqc.hrl"). --endif. %% EQC + +-ifdef(PROPER). + +-include_lib("proper/include/proper.hrl"). + +%-compile(export_all). +-endif. + -include_lib("eunit/include/eunit.hrl"). --export([counter_loop/1,incr_counter/1,decr_counter/1]). --endif. %% TEST --type riak_core_ring() :: riak_core_ring:riak_core_ring(). +-export([counter_loop/1, incr_counter/1, + decr_counter/1]). + +-endif. + -type index() :: non_neg_integer(). + -type index_n() :: {index(), pos_integer()}. %% R14 Compatibility --compile({no_auto_import,[integer_to_list/2]}). +-compile({no_auto_import, [{integer_to_list, 2}]}). %% =================================================================== %% Public API @@ -108,10 +76,11 @@ -define(SEC_TO_EPOCH, 62167219200). posix_error(Error) -> - case erl_posix_msg:message(Error) of - "unknown POSIX error" -> lists:flatten(io_lib:format("~p", [Error])); - Message -> Message - end. + case erl_posix_msg:message(Error) of + "unknown POSIX error" -> + lists:flatten(io_lib:format("~p", [Error])); + Message -> Message + end. %% @spec moment() -> integer() %% @doc Get the current "moment". Current implementation is the @@ -120,12 +89,12 @@ posix_error(Error) -> moment() -> {Mega, Sec, _Micro} = os:timestamp(), - (Mega * 1000000) + Sec + ?SEC_TO_EPOCH. + Mega * 1000000 + Sec + (?SEC_TO_EPOCH). %% @spec compare_dates(string(), string()) -> boolean() %% @doc Compare two RFC1123 date strings or two now() tuples (or one %% of each). Return true if date A is later than date B. -compare_dates(A={_,_,_}, B={_,_,_}) -> +compare_dates(A = {_, _, _}, B = {_, _, _}) -> %% assume 3-tuples are now() times A > B; compare_dates(A, B) when is_list(A) -> @@ -135,9 +104,9 @@ compare_dates(A, B) when is_list(B) -> compare_dates(A, rfc1123_to_now(B)). 
rfc1123_to_now(String) when is_list(String) -> - GSec = calendar:datetime_to_gregorian_seconds( - httpd_util:convert_request_date(String)), - ESec = GSec-?SEC_TO_EPOCH, + GSec = + calendar:datetime_to_gregorian_seconds(httpd_util:convert_request_date(String)), + ESec = GSec - (?SEC_TO_EPOCH), Sec = ESec rem 1000000, MSec = ESec div 1000000, {MSec, Sec, 0}. @@ -147,37 +116,35 @@ rfc1123_to_now(String) when is_list(String) -> %% to the new directory. make_tmp_dir() -> TmpId = io_lib:format("riptemp.~p", - [erlang:phash2({riak_core_rand:uniform(),self()})]), + [erlang:phash2({rand:uniform(), self()})]), TempDir = filename:join("/tmp", TmpId), case filelib:is_dir(TempDir) of - true -> make_tmp_dir(); - false -> - ok = file:make_dir(TempDir), - TempDir + true -> make_tmp_dir(); + false -> ok = file:make_dir(TempDir), TempDir end. %% @doc Atomically/safely (to some reasonable level of durablity) %% replace file `FN' with `Data'. NOTE: since 2.0.3 semantic changed %% slightly: If `FN' cannot be opened, will not error with a %% `badmatch', as before, but will instead return `{error, Reason}' --spec replace_file(string(), iodata()) -> ok | {error, term()}. +-spec replace_file(string(), iodata()) -> ok | + {error, term()}. + replace_file(FN, Data) -> TmpFN = FN ++ ".tmp", case file:open(TmpFN, [write, raw]) of - {ok, FH} -> - try - ok = file:write(FH, Data), - ok = file:sync(FH), - ok = file:close(FH), - ok = file:rename(TmpFN, FN), - {ok, Contents} = read_file(FN), - true = (Contents == iolist_to_binary(Data)), - ok - catch _:Err -> - {error, Err} - end; - Err -> - Err + {ok, FH} -> + try ok = file:write(FH, Data), + ok = file:sync(FH), + ok = file:close(FH), + ok = file:rename(TmpFN, FN), + {ok, Contents} = read_file(FN), + true = Contents == iolist_to_binary(Data), + ok + catch + _:Err -> {error, Err} + end; + Err -> Err end. 
%% @doc Similar to {@link file:read_file/1} but uses raw file `I/O' @@ -189,24 +156,20 @@ read_file(FName) -> read_file(FD, Acc) -> case file:read(FD, 4096) of - {ok, Data} -> - read_file(FD, [Data|Acc]); - eof -> - lists:reverse(Acc) + {ok, Data} -> read_file(FD, [Data | Acc]); + eof -> lists:reverse(Acc) end. %% @spec integer_to_list(Integer :: integer(), Base :: integer()) -> %% string() %% @doc Convert an integer to its string representation in the given %% base. Bases 2-62 are supported. -integer_to_list(I, 10) -> - erlang:integer_to_list(I); +integer_to_list(I, 10) -> erlang:integer_to_list(I); integer_to_list(I, Base) - when is_integer(I), is_integer(Base),Base >= 2, Base =< 1+$Z-$A+10+1+$z-$a -> - if I < 0 -> - [$-|integer_to_list(-I, Base, [])]; - true -> - integer_to_list(I, Base, []) + when is_integer(I), is_integer(Base), Base >= 2, + Base =< 1 + $Z - $A + 10 + 1 + $z - $a -> + if I < 0 -> [$- | integer_to_list(-I, Base, [])]; + true -> integer_to_list(I, Base, []) end; integer_to_list(I, Base) -> erlang:error(badarg, [I, Base]). @@ -215,102 +178,96 @@ integer_to_list(I, Base) -> integer_to_list(I0, Base, R0) -> D = I0 rem Base, I1 = I0 div Base, - R1 = if D >= 36 -> - [D-36+$a|R0]; - D >= 10 -> - [D-10+$A|R0]; - true -> - [D+$0|R0] - end, - if I1 =:= 0 -> - R1; - true -> - integer_to_list(I1, Base, R1) + R1 = if D >= 36 -> [D - 36 + $a | R0]; + D >= 10 -> [D - 10 + $A | R0]; + true -> [D + $0 | R0] + end, + if I1 =:= 0 -> R1; + true -> integer_to_list(I1, Base, R1) end. -sha(Bin) -> - crypto:hash(sha, Bin). +sha(Bin) -> crypto:hash(sha, Bin). -md5(Bin) -> - crypto:hash(md5, Bin). +md5(Bin) -> crypto:hash(md5, Bin). -%% @spec unique_id_62() -> string() %% @doc Create a random identifying integer, returning its string %% representation in base 62. +-spec unique_id_62() -> string(). 
+ unique_id_62() -> - Rand = sha(term_to_binary({make_ref(), os:timestamp()})), + Rand = sha(term_to_binary({make_ref(), + os:timestamp()})), <> = Rand, integer_to_list(I, 62). -%% @spec reload_all(Module :: atom()) -> -%% [{purge_response(), load_file_response()}] -%% @type purge_response() = boolean() -%% @type load_file_response() = {module, Module :: atom()}| -%% 2 {error, term()} +%% purge_response() = boolean() +%% load_file_response() = {module, Module :: atom()}| +%% 2 {error, term()} %% @doc Ask each member node of the riak ring to reload the given %% Module. Return is a list of the results of code:purge/1 %% and code:load_file/1 on each node. +-spec reload_all(Module :: atom()) -> [{boolean(), + {module, Module :: atom()} | + {error, term()}}]. + reload_all(Module) -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), [{safe_rpc(Node, code, purge, [Module]), - safe_rpc(Node, code, load_file, [Module])} || - Node <- riak_core_ring:all_members(Ring)]. + safe_rpc(Node, code, load_file, [Module])} + || Node <- riak_core_ring:all_members(Ring)]. %% @spec mkclientid(RemoteNode :: term()) -> ClientID :: list() %% @doc Create a unique-enough id for vclock clients. mkclientid(RemoteNode) -> - {{Y,Mo,D},{H,Mi,S}} = erlang:universaltime(), - {_,_,NowPart} = os:timestamp(), - Id = erlang:phash2([Y,Mo,D,H,Mi,S,node(),RemoteNode,NowPart,self()]), + {{Y, Mo, D}, {H, Mi, S}} = erlang:universaltime(), + {_, _, NowPart} = os:timestamp(), + Id = erlang:phash2([Y, Mo, D, H, Mi, S, node(), + RemoteNode, NowPart, self()]), <>. %% @spec chash_key(BKey :: riak_object:bkey()) -> chash:index() %% @doc Create a binary used for determining replica placement. -chash_key({Bucket,_Key}=BKey) -> - BucketProps = riak_core_bucket:get_bucket(Bucket), - chash_key(BKey, BucketProps). +chash_key(BKey) -> + %% TODO remove + %% BucketProps = riak_core_bucket:get_bucket(Bucket), + chash_key(BKey, undefined). 
%% @spec chash_key(BKey :: riak_object:bkey(), [{atom(), any()}]) -> %% chash:index() %% @doc Create a binary used for determining replica placement. -chash_key({Bucket,Key}, _BucketProps) -> - %{_, {M, F}} = lists:keyfind(chash_keyfun, 1, BucketProps), - %M:F({Bucket,Key}). - % FIX static keyfun - chash_std_keyfun({Bucket, Key}). +chash_key({Bucket, Key}, _BucketProps) -> + % static keyfun + chash_std_keyfun({Bucket, Key}). %% @spec chash_std_keyfun(BKey :: riak_object:bkey()) -> chash:index() %% @doc Default object/ring hashing fun, direct passthrough of bkey. -chash_std_keyfun({Bucket, Key}) -> chash:key_of({Bucket, Key}). +chash_std_keyfun({Bucket, Key}) -> + chash:key_of({Bucket, Key}). %% @spec chash_bucketonly_keyfun(BKey :: riak_object:bkey()) -> chash:index() %% @doc Object/ring hashing fun that ignores Key, only uses Bucket. -chash_bucketonly_keyfun({Bucket, _Key}) -> chash:key_of(Bucket). +chash_bucketonly_keyfun({Bucket, _Key}) -> + chash:key_of(Bucket). str_to_node(Node) when is_atom(Node) -> str_to_node(atom_to_list(Node)); str_to_node(NodeStr) -> case string:tokens(NodeStr, "@") of - [NodeName] -> - %% Node name only; no host name. If the local node has a hostname, - %% append it - case node_hostname() of - [] -> - list_to_atom(NodeName); - Hostname -> - list_to_atom(NodeName ++ "@" ++ Hostname) - end; - _ -> - list_to_atom(NodeStr) + [NodeName] -> + %% Node name only; no host name. If the local node has a hostname, + %% append it + case node_hostname() of + [] -> list_to_atom(NodeName); + Hostname -> list_to_atom(NodeName ++ "@" ++ Hostname) + end; + _ -> list_to_atom(NodeStr) end. node_hostname() -> NodeStr = atom_to_list(node()), case string:tokens(NodeStr, "@") of - [_NodeName, Hostname] -> - Hostname; - _ -> - [] + [_NodeName, Hostname] -> Hostname; + _ -> [] end. %% @spec start_app_deps(App :: atom()) -> ok @@ -320,25 +277,24 @@ start_app_deps(App) -> _ = [ensure_started(A) || A <- DepApps], ok. 
- %% @spec ensure_started(Application :: atom()) -> ok %% @doc Start the named application if not already started. ensure_started(App) -> case application:start(App) of - ok -> - ok; - {error, {already_started, App}} -> - ok + ok -> ok; + {error, {already_started, App}} -> ok end. %% @doc Applies `Pred' to each element in `List', and returns a count of how many %% applications returned `true'. --spec count(fun((term()) -> boolean()), [term()]) -> non_neg_integer(). +-spec count(fun((term()) -> boolean()), + [term()]) -> non_neg_integer(). + count(Pred, List) -> - FoldFun = fun(E, A) -> + FoldFun = fun (E, A) -> case Pred(E) of - false -> A; - true -> A + 1 + false -> A; + true -> A + 1 end end, lists:foldl(FoldFun, 0, List). @@ -347,6 +303,7 @@ count(Pred, List) -> %% first element compares equal to `Key' is deleted, if there is such a tuple. %% Equivalent to `lists:keydelete(Key, 1, TupleList)'. -spec keydelete(atom(), [tuple()]) -> [tuple()]. + keydelete(Key, TupleList) -> lists:keydelete(Key, 1, TupleList). @@ -354,162 +311,157 @@ keydelete(Key, TupleList) -> %% first element compares equal to any key in `KeysToDelete' is deleted, if %% there is such a tuple. -spec multi_keydelete([atom()], [tuple()]) -> [tuple()]. + multi_keydelete(KeysToDelete, TupleList) -> multi_keydelete(KeysToDelete, 1, TupleList). %% @doc Returns a copy of `TupleList' where the Nth occurrence of a tuple whose %% first element compares equal to any key in `KeysToDelete' is deleted, if %% there is such a tuple. --spec multi_keydelete([atom()], non_neg_integer(), [tuple()]) -> [tuple()]. +-spec multi_keydelete([atom()], non_neg_integer(), + [tuple()]) -> [tuple()]. + multi_keydelete(KeysToDelete, N, TupleList) -> - lists:foldl( - fun(Key, Acc) -> lists:keydelete(Key, N, Acc) end, - TupleList, - KeysToDelete). + lists:foldl(fun (Key, Acc) -> + lists:keydelete(Key, N, Acc) + end, + TupleList, KeysToDelete). 
%% @doc Function composition: returns a function that is the composition of %% `F' and `G'. --spec compose(F :: fun((B) -> C), G :: fun((A) -> B)) -> fun((A) -> C). -compose(F, G) when is_function(F, 1), is_function(G, 1) -> - fun(X) -> - F(G(X)) - end. +-spec compose(F :: fun((B) -> C), + G :: fun((A) -> B)) -> fun((A) -> C). + +compose(F, G) + when is_function(F, 1), is_function(G, 1) -> + fun (X) -> F(G(X)) end. %% @doc Function composition: returns a function that is the composition of all %% functions in the `Funs' list. Note that functions are composed from right to %% left, so the final function in the `Funs' will be the first one invoked when %% invoking the composed function. --spec compose([fun((any()) -> any())]) -> fun((any()) -> any()). -compose([Fun]) -> - Fun; +-spec + compose([fun((any()) -> any())]) -> fun((any()) -> any()). + +compose([Fun]) -> Fun; compose(Funs) when is_list(Funs) -> - [Fun|Rest] = lists:reverse(Funs), + [Fun | Rest] = lists:reverse(Funs), lists:foldl(fun compose/2, Fun, Rest). %% @doc Invoke function `F' over each element of list `L' in parallel, %% returning the results in the same order as the input list. --spec pmap(F, L1) -> L2 when - F :: fun((A) -> B), - L1 :: [A], - L2 :: [B]. +-spec pmap(F, L1) -> L2 when F :: fun((A) -> B), + L1 :: [A], L2 :: [B]. + pmap(F, L) -> Parent = self(), - lists:foldl( - fun(X, N) -> - spawn_link(fun() -> - Parent ! {pmap, N, F(X)} - end), - N+1 - end, 0, L), - L2 = [receive {pmap, N, R} -> {N,R} end || _ <- L], + lists:foldl(fun (X, N) -> + spawn_link(fun () -> Parent ! {pmap, N, F(X)} end), + N + 1 + end, + 0, L), + L2 = [receive {pmap, N, R} -> {N, R} end || _ <- L], L3 = lists:keysort(1, L2), - [R || {_,R} <- L3]. - --record(pmap_acc,{ - mapper, - fn, - n_pending=0, - pending=sets:new(), - n_done=0, - done=[], - max_concurrent=1 - }). + [R || {_, R} <- L3]. + +-record(pmap_acc, + {mapper, fn, n_pending = 0, pending = sets:new(), + n_done = 0, done = [], max_concurrent = 1}). 
%% @doc Parallel map with a cap on the number of concurrent worker processes. %% Note: Worker processes are linked to the parent, so a crash propagates. --spec pmap(Fun::function(), List::list(), MaxP::integer()) -> list(). +-spec pmap(Fun :: function(), List :: list(), + MaxP :: integer()) -> list(). + pmap(Fun, List, MaxP) when MaxP < 1 -> pmap(Fun, List, 1); -pmap(Fun, List, MaxP) when is_function(Fun), is_list(List), is_integer(MaxP) -> +pmap(Fun, List, MaxP) + when is_function(Fun), is_list(List), + is_integer(MaxP) -> Mapper = self(), - #pmap_acc{pending=Pending, done=Done} = - lists:foldl(fun pmap_worker/2, - #pmap_acc{mapper=Mapper, - fn=Fun, - max_concurrent=MaxP}, - List), + #pmap_acc{pending = Pending, done = Done} = + lists:foldl(fun pmap_worker/2, + #pmap_acc{mapper = Mapper, fn = Fun, + max_concurrent = MaxP}, + List), All = pmap_collect_rest(Pending, Done), % Restore input order Sorted = lists:keysort(1, All), - [ R || {_, R} <- Sorted ]. + [R || {_, R} <- Sorted]. %% @doc Fold function for {@link pmap/3} that spawns up to a max number of %% workers to execute the mapping function over the input list. -pmap_worker(X, Acc = #pmap_acc{n_pending=NP, - pending=Pending, - n_done=ND, - max_concurrent=MaxP, - mapper=Mapper, - fn=Fn}) - when NP < MaxP -> - Worker = - spawn_link(fun() -> - R = Fn(X), - Mapper ! {pmap_result, self(), {NP+ND, R}} - end), - Acc#pmap_acc{n_pending=NP+1, pending=sets:add_element(Worker, Pending)}; -pmap_worker(X, Acc = #pmap_acc{n_pending=NP, - pending=Pending, - n_done=ND, - done=Done, - max_concurrent=MaxP}) - when NP == MaxP -> +pmap_worker(X, + Acc = #pmap_acc{n_pending = NP, pending = Pending, + n_done = ND, max_concurrent = MaxP, mapper = Mapper, + fn = Fn}) + when NP < MaxP -> + Worker = spawn_link(fun () -> + R = Fn(X), + Mapper ! 
{pmap_result, self(), {NP + ND, R}} + end), + Acc#pmap_acc{n_pending = NP + 1, + pending = sets:add_element(Worker, Pending)}; +pmap_worker(X, + Acc = #pmap_acc{n_pending = NP, pending = Pending, + n_done = ND, done = Done, max_concurrent = MaxP}) + when NP == MaxP -> {Result, NewPending} = pmap_collect_one(Pending), - pmap_worker(X, Acc#pmap_acc{n_pending=NP-1, pending=NewPending, - n_done=ND+1, done=[Result|Done]}). + pmap_worker(X, + Acc#pmap_acc{n_pending = NP - 1, pending = NewPending, + n_done = ND + 1, done = [Result | Done]}). %% @doc Waits for one pending pmap task to finish pmap_collect_one(Pending) -> receive - {pmap_result, Pid, Result} -> - Size = sets:size(Pending), - NewPending = sets:del_element(Pid, Pending), - case sets:size(NewPending) of - Size -> - pmap_collect_one(Pending); - _ -> - {Result, NewPending} - end + {pmap_result, Pid, Result} -> + Size = sets:size(Pending), + NewPending = sets:del_element(Pid, Pending), + case sets:size(NewPending) of + Size -> pmap_collect_one(Pending); + _ -> {Result, NewPending} + end end. pmap_collect_rest(Pending, Done) -> case sets:size(Pending) of - 0 -> - Done; - _ -> - {Result, NewPending} = pmap_collect_one(Pending), - pmap_collect_rest(NewPending, [Result | Done]) + 0 -> Done; + _ -> + {Result, NewPending} = pmap_collect_one(Pending), + pmap_collect_rest(NewPending, [Result | Done]) end. - %% @doc Wraps an rpc:call/4 in a try/catch to handle the case where the %% 'rex' process is not running on the remote node. This is safe in %% the sense that it won't crash the calling process if the rex %% process is down. --spec safe_rpc(Node :: node(), Module :: atom(), Function :: atom(), - Args :: [any()]) -> {'badrpc', any()} | any(). +-spec safe_rpc(Node :: node(), Module :: atom(), + Function :: atom(), Args :: [any()]) -> {badrpc, + any()} | + any(). 
+ safe_rpc(Node, Module, Function, Args) -> try rpc:call(Node, Module, Function, Args) of - Result -> - Result + Result -> Result catch - exit:{noproc, _NoProcDetails} -> - {badrpc, rpc_process_down} + exit:{noproc, _NoProcDetails} -> + {badrpc, rpc_process_down} end. %% @doc Wraps an rpc:call/5 in a try/catch to handle the case where the %% 'rex' process is not running on the remote node. This is safe in %% the sense that it won't crash the calling process if the rex %% process is down. --spec safe_rpc(Node :: node(), Module :: atom(), Function :: atom(), - Args :: [any()], Timeout :: timeout()) -> {'badrpc', any()} | any(). +-spec safe_rpc(Node :: node(), Module :: atom(), + Function :: atom(), Args :: [any()], + Timeout :: timeout()) -> {badrpc, any()} | any(). + safe_rpc(Node, Module, Function, Args, Timeout) -> try rpc:call(Node, Module, Function, Args, Timeout) of - Result -> - Result + Result -> Result catch - 'EXIT':{noproc, _NoProcDetails} -> - {badrpc, rpc_process_down} + 'EXIT':{noproc, _NoProcDetails} -> + {badrpc, rpc_process_down} end. %% @spec rpc_every_member(atom(), atom(), [term()], integer()|infinity) @@ -527,36 +479,44 @@ rpc_every_member(Module, Function, Args, Timeout) -> rpc_every_member_ann(Module, Function, Args, Timeout) -> {ok, MyRing} = riak_core_ring_manager:get_my_ring(), Nodes = riak_core_ring:all_members(MyRing), - {Results, Down} = multicall_ann(Nodes, Module, Function, Args, Timeout), + {Results, Down} = multicall_ann(Nodes, Module, Function, + Args, Timeout), {Results, Down}. %% @doc Perform an RPC call to a list of nodes in parallel, returning the %% results in the same order as the input list. --spec multi_rpc([node()], module(), atom(), [any()]) -> [any()]. +-spec multi_rpc([node()], module(), atom(), + [any()]) -> [any()]. + multi_rpc(Nodes, Mod, Fun, Args) -> multi_rpc(Nodes, Mod, Fun, Args, infinity). %% @doc Perform an RPC call to a list of nodes in parallel, returning the %% results in the same order as the input list. 
--spec multi_rpc([node()], module(), atom(), [any()], timeout()) -> [any()]. +-spec multi_rpc([node()], module(), atom(), [any()], + timeout()) -> [any()]. + multi_rpc(Nodes, Mod, Fun, Args, Timeout) -> - pmap(fun(Node) -> + pmap(fun (Node) -> safe_rpc(Node, Mod, Fun, Args, Timeout) - end, Nodes). + end, + Nodes). %% @doc Perform an RPC call to a list of nodes in parallel, returning the %% results in the same order as the input list. Each result is tagged %% with the corresponding node name. --spec multi_rpc_ann([node()], module(), atom(), [any()]) - -> [{node(), any()}]. +-spec multi_rpc_ann([node()], module(), atom(), + [any()]) -> [{node(), any()}]. + multi_rpc_ann(Nodes, Mod, Fun, Args) -> multi_rpc_ann(Nodes, Mod, Fun, Args, infinity). %% @doc Perform an RPC call to a list of nodes in parallel, returning the %% results in the same order as the input list. Each result is tagged %% with the corresponding node name. --spec multi_rpc_ann([node()], module(), atom(), [any()], timeout()) - -> [{node(), any()}]. +-spec multi_rpc_ann([node()], module(), atom(), [any()], + timeout()) -> [{node(), any()}]. + multi_rpc_ann(Nodes, Mod, Fun, Args, Timeout) -> Results = multi_rpc(Nodes, Mod, Fun, Args, Timeout), lists:zip(Nodes, Results). @@ -566,8 +526,10 @@ multi_rpc_ann(Nodes, Mod, Fun, Args, Timeout) -> %% of nodes that are down/unreachable. The results will be returned in %% the same order as the input list, and each result is tagged with the %% corresponding node name. --spec multicall_ann([node()], module(), atom(), [any()]) - -> {Results :: [{node(), any()}], Down :: [node()]}. +-spec multicall_ann([node()], module(), atom(), + [any()]) -> {Results :: [{node(), any()}], + Down :: [node()]}. + multicall_ann(Nodes, Mod, Fun, Args) -> multicall_ann(Nodes, Mod, Fun, Args, infinity). @@ -576,14 +538,17 @@ multicall_ann(Nodes, Mod, Fun, Args) -> %% of nodes that are down/unreachable. 
The results will be returned in %% the same order as the input list, and each result is tagged with the %% corresponding node name. --spec multicall_ann([node()], module(), atom(), [any()], timeout()) - -> {Results :: [{node(), any()}], Down :: [node()]}. +-spec multicall_ann([node()], module(), atom(), [any()], + timeout()) -> {Results :: [{node(), any()}], + Down :: [node()]}. + multicall_ann(Nodes, Mod, Fun, Args, Timeout) -> L = multi_rpc_ann(Nodes, Mod, Fun, Args, Timeout), - {Results, DownAnn} = - lists:partition(fun({_, Result}) -> - Result /= {badrpc, nodedown} - end, L), + {Results, DownAnn} = lists:partition(fun ({_, + Result}) -> + Result /= {badrpc, nodedown} + end, + L), {Down, _} = lists:unzip(DownAnn), {Results, Down}. @@ -594,96 +559,98 @@ multicall_ann(Nodes, Mod, Fun, Args, Timeout) -> %% the array is logically wrapped around to ensure leaf nodes also %% have children by giving them backedges to other elements. --spec build_tree(N :: integer(), Nodes :: [term()], Opts :: [term()]) - -> orddict:orddict(). +-spec build_tree(N :: integer(), Nodes :: [term()], + Opts :: [term()]) -> orddict:orddict(). + build_tree(N, Nodes, Opts) -> case lists:member(cycles, Opts) of - true -> - Expand = lists:flatten(lists:duplicate(N+1, Nodes)); - false -> - Expand = Nodes + true -> + Expand = lists:flatten(lists:duplicate(N + 1, Nodes)); + false -> Expand = Nodes end, - {Tree, _} = - lists:foldl(fun(Elm, {Result, Worklist}) -> - Len = erlang:min(N, length(Worklist)), - {Children, Rest} = lists:split(Len, Worklist), - NewResult = [{Elm, Children} | Result], - {NewResult, Rest} - end, {[], tl(Expand)}, Nodes), + {Tree, _} = lists:foldl(fun (Elm, {Result, Worklist}) -> + Len = erlang:min(N, length(Worklist)), + {Children, Rest} = lists:split(Len, + Worklist), + NewResult = [{Elm, Children} | Result], + {NewResult, Rest} + end, + {[], tl(Expand)}, Nodes), orddict:from_list(Tree). 
orddict_delta(A, B) -> %% Pad both A and B to the same length DummyA = [{Key, '$none'} || {Key, _} <- B], - A2 = orddict:merge(fun(_, Value, _) -> - Value - end, A, DummyA), - + A2 = orddict:merge(fun (_, Value, _) -> Value end, A, + DummyA), DummyB = [{Key, '$none'} || {Key, _} <- A], - B2 = orddict:merge(fun(_, Value, _) -> - Value - end, B, DummyB), - + B2 = orddict:merge(fun (_, Value, _) -> Value end, B, + DummyB), %% Merge and filter out equal values - Merged = orddict:merge(fun(_, AVal, BVal) -> + Merged = orddict:merge(fun (_, AVal, BVal) -> {AVal, BVal} - end, A2, B2), - Diff = orddict:filter(fun(_, {Same, Same}) -> - false; - (_, _) -> - true - end, Merged), + end, + A2, B2), + Diff = orddict:filter(fun (_, {_Same, _Same}) -> false; + (_, _) -> true + end, + Merged), Diff. shuffle(L) -> N = 134217727, %% Largest small integer on 32-bit Erlang - L2 = [{riak_core_rand:uniform(N), E} || E <- L], + L2 = [{rand:uniform(N), E} || E <- L], L3 = [E || {_, E} <- lists:sort(L2)], L3. %% Returns a forced-lowercase architecture for this node --spec get_arch () -> string(). -get_arch () -> string:to_lower(erlang:system_info(system_architecture)). +-spec get_arch() -> string(). + +get_arch() -> + string:to_lower(erlang:system_info(system_architecture)). %% Checks if this node is of a given architecture --spec is_arch (atom()) -> boolean(). -is_arch (linux) -> string:str(get_arch(),"linux") > 0; -is_arch (darwin) -> string:str(get_arch(),"darwin") > 0; -is_arch (sunos) -> string:str(get_arch(),"sunos") > 0; -is_arch (osx) -> is_arch(darwin); -is_arch (solaris) -> is_arch(sunos); -is_arch (Arch) -> throw({unsupported_architecture,Arch}). +-spec is_arch(atom()) -> boolean(). 
+ +is_arch(linux) -> string:str(get_arch(), "linux") > 0; +is_arch(darwin) -> string:str(get_arch(), "darwin") > 0; +is_arch(sunos) -> string:str(get_arch(), "sunos") > 0; +is_arch(osx) -> is_arch(darwin); +is_arch(solaris) -> is_arch(sunos); +is_arch(Arch) -> + throw({unsupported_architecture, Arch}). format_ip_and_port(Ip, Port) when is_list(Ip) -> - lists:flatten(io_lib:format("~s:~p",[Ip,Port])); + lists:flatten(io_lib:format("~s:~p", [Ip, Port])); format_ip_and_port(Ip, Port) when is_tuple(Ip) -> - lists:flatten(io_lib:format("~s:~p",[inet_parse:ntoa(Ip), - Port])). -peername(Socket, Transport) -> - case Transport:peername(Socket) of - {ok, {Ip, Port}} -> - format_ip_and_port(Ip, Port); - {error, Reason} -> - %% just return a string so JSON doesn't blow up - lists:flatten(io_lib:format("error:~p", [Reason])) + lists:flatten(io_lib:format("~s:~p", + [inet_parse:ntoa(Ip), Port])). + +peername(Socket, Module) -> + case Module:peername(Socket) of + {ok, {Ip, Port}} -> format_ip_and_port(Ip, Port); + {error, Reason} -> + %% just return a string so JSON doesn't blow up + lists:flatten(io_lib:format("error:~p", [Reason])) end. -sockname(Socket, Transport) -> - case Transport:sockname(Socket) of - {ok, {Ip, Port}} -> - format_ip_and_port(Ip, Port); - {error, Reason} -> - %% just return a string so JSON doesn't blow up - lists:flatten(io_lib:format("error:~p", [Reason])) +sockname(Socket, Module) -> + case Module:sockname(Socket) of + {ok, {Ip, Port}} -> format_ip_and_port(Ip, Port); + {error, Reason} -> + %% just return a string so JSON doesn't blow up + lists:flatten(io_lib:format("error:~p", [Reason])) end. %% @doc Convert a #riak_core_fold_req_v? record to the cluster's maximum %% supported record version. 
-make_fold_req(#riak_core_fold_req_v1{foldfun=FoldFun, acc0=Acc0}) -> +make_fold_req(#riak_core_fold_req_v1{foldfun = FoldFun, + acc0 = Acc0}) -> make_fold_req(FoldFun, Acc0, false, []); -make_fold_req(?FOLD_REQ{foldfun=FoldFun, acc0=Acc0, - forwardable=Forwardable, opts=Opts}) -> +make_fold_req(#riak_core_fold_req_v2{foldfun = FoldFun, + acc0 = Acc0, forwardable = Forwardable, + opts = Opts}) -> make_fold_req(FoldFun, Acc0, Forwardable, Opts). make_fold_req(FoldFun, Acc0) -> @@ -695,10 +662,11 @@ make_fold_req(FoldFun, Acc0, Forwardable, Opts) -> %% @doc Force a #riak_core_fold_req_v? record to the latest version, %% regardless of cluster support -make_newest_fold_req(#riak_core_fold_req_v1{foldfun=FoldFun, acc0=Acc0}) -> +make_newest_fold_req(#riak_core_fold_req_v1{foldfun = + FoldFun, + acc0 = Acc0}) -> make_fold_reqv(v2, FoldFun, Acc0, false, []); -make_newest_fold_req(?FOLD_REQ{} = F) -> - F. +make_newest_fold_req(#riak_core_fold_req_v2{} = F) -> F. %% @doc Spawn an intermediate proxy process to handle errors during gen_xxx %% calls. @@ -709,145 +677,160 @@ proxy_spawn(Fun) -> MRef = monitor(process, Pid), Pid ! {proxy, MRef}, receive - {proxy_reply, MRef, Result} -> - demonitor(MRef, [flush]), - Result; - {'DOWN', MRef, _, _, Reason} -> - {error, Reason} + {proxy_reply, MRef, Result} -> + demonitor(MRef, [flush]), Result; + {'DOWN', MRef, _, _, Reason} -> {error, Reason} end. - %% @private -make_fold_reqv(v1, FoldFun, Acc0, _Forwardable, _Opts) - when is_function(FoldFun, 3) -> - #riak_core_fold_req_v1{foldfun=FoldFun, acc0=Acc0}; -make_fold_reqv(v2, FoldFun, Acc0, Forwardable, Opts) - when is_function(FoldFun, 3) - andalso (Forwardable == true orelse Forwardable == false) - andalso is_list(Opts) -> - ?FOLD_REQ{foldfun=FoldFun, acc0=Acc0, - forwardable=Forwardable, opts=Opts}. 
+make_fold_reqv(_, FoldFun, Acc0, Forwardable, Opts) + when is_function(FoldFun, 3) andalso + (Forwardable == true orelse Forwardable == false) + andalso is_list(Opts) -> + #riak_core_fold_req_v2{foldfun = FoldFun, acc0 = Acc0, + forwardable = Forwardable, opts = Opts}. %% @private - used with proxy_spawn proxy(Parent, Fun) -> _ = monitor(process, Parent), receive - {proxy, MRef} -> - Result = Fun(), - Parent ! {proxy_reply, MRef, Result}; - {'DOWN', _, _, _, _} -> - ok + {proxy, MRef} -> + Result = Fun(), Parent ! {proxy_reply, MRef, Result}; + {'DOWN', _, _, _, _} -> ok end. --spec enable_job_class(atom(), atom()) -> ok | {error, term()}. +-spec enable_job_class(atom(), atom()) -> ok | + {error, term()}. + %% @doc Enables the specified Application/Operation job class. %% This is the public API for use via RPC. %% WARNING: This function is not suitable for parallel execution with itself %% or its complement disable_job_class/2. enable_job_class(Application, Operation) - when erlang:is_atom(Application) andalso erlang:is_atom(Operation) -> + when erlang:is_atom(Application) andalso + erlang:is_atom(Operation) -> enable_job_class({Application, Operation}); enable_job_class(Application, Operation) -> {error, {badarg, {Application, Operation}}}. --spec disable_job_class(atom(), atom()) -> ok | {error, term()}. +-spec disable_job_class(atom(), atom()) -> ok | + {error, term()}. + %% @doc Disables the specified Application/Operation job class. %% This is the public API for use via RPC. %% WARNING: This function is not suitable for parallel execution with itself %% or its complement enable_job_class/2. disable_job_class(Application, Operation) - when erlang:is_atom(Application) andalso erlang:is_atom(Operation) -> + when erlang:is_atom(Application) andalso + erlang:is_atom(Operation) -> disable_job_class({Application, Operation}); disable_job_class(Application, Operation) -> {error, {badarg, {Application, Operation}}}. 
--spec job_class_enabled(atom(), atom()) -> boolean() | {error, term()}. +-spec job_class_enabled(atom(), atom()) -> boolean() | + {error, term()}. + %% @doc Reports whether the specified Application/Operation job class is enabled. %% This is the public API for use via RPC. job_class_enabled(Application, Operation) - when erlang:is_atom(Application) andalso erlang:is_atom(Operation) -> + when erlang:is_atom(Application) andalso + erlang:is_atom(Operation) -> job_class_enabled({Application, Operation}); job_class_enabled(Application, Operation) -> {error, {badarg, {Application, Operation}}}. --spec enable_job_class(Class :: term()) -> ok | {error, term()}. +-spec enable_job_class(Class :: term()) -> ok | + {error, term()}. + %% @doc Internal API to enable the specified job class. %% WARNING: %% * This function may not remain in this form once the Jobs API is live! %% * Parameter types ARE NOT validated by the same rules as the public API! %% You are STRONGLY advised to use enable_job_class/2. enable_job_class(Class) -> - case application:get_env(riak_core, job_accept_class, undefined) of - [_|_] = EnabledClasses -> - case lists:member(Class, EnabledClasses) of - true -> - ok; - _ -> - application:set_env( - riak_core, job_accept_class, [Class | EnabledClasses]) - end; - _ -> - application:set_env(riak_core, job_accept_class, [Class]) + case application:get_env(riak_core, job_accept_class, + undefined) + of + [_ | _] = EnabledClasses -> + case lists:member(Class, EnabledClasses) of + true -> ok; + _ -> + application:set_env(riak_core, job_accept_class, + [Class | EnabledClasses]) + end; + _ -> + application:set_env(riak_core, job_accept_class, + [Class]) end. --spec disable_job_class(Class :: term()) -> ok | {error, term()}. +-spec disable_job_class(Class :: term()) -> ok | + {error, term()}. + %% @doc Internal API to disable the specified job class. %% WARNING: %% * This function may not remain in this form once the Jobs API is live! 
%% * Parameter types ARE NOT validated by the same rules as the public API! %% You are STRONGLY advised to use disable_job_class/2. disable_job_class(Class) -> - case application:get_env(riak_core, job_accept_class, undefined) of - [_|_] = EnabledClasses -> - case lists:member(Class, EnabledClasses) of - false -> - ok; - _ -> - application:set_env(riak_core, job_accept_class, - lists:delete(Class, EnabledClasses)) - end; - _ -> - ok + case application:get_env(riak_core, job_accept_class, + undefined) + of + [_ | _] = EnabledClasses -> + case lists:member(Class, EnabledClasses) of + false -> ok; + _ -> + application:set_env(riak_core, job_accept_class, + lists:delete(Class, EnabledClasses)) + end; + _ -> ok end. -spec job_class_enabled(Class :: term()) -> boolean(). + %% @doc Internal API to determine whether to accept/reject a job. %% WARNING: %% * This function may not remain in this form once the Jobs API is live! %% * Parameter types ARE NOT validated by the same rules as the public API! %% You are STRONGLY advised to use job_class_enabled/2. job_class_enabled(Class) -> - case application:get_env(riak_core, job_accept_class, undefined) of - undefined -> - true; - [] -> - false; - [_|_] = EnabledClasses -> - lists:member(Class, EnabledClasses); - Other -> - % Don't crash if it's not a list - that should never be the case, - % but since the value *can* be manipulated externally be more - % accommodating. If someone mucks it up, nothing's going to be - % allowed, but give them a chance to catch on instead of crashing. - _ = logger:error( - "riak_core.job_accept_class is not a list: ~p", [Other]), - false + case application:get_env(riak_core, job_accept_class, + undefined) + of + undefined -> true; + [] -> false; + [_ | _] = EnabledClasses -> + lists:member(Class, EnabledClasses); + Other -> + % Don't crash if it's not a list - that should never be the case, + % but since the value *can* be manipulated externally be more + % accommodating. 
If someone mucks it up, nothing's going to be + % allowed, but give them a chance to catch on instead of crashing. + _ = logger:error("riak_core.job_accept_class is not a " + "list: ~p", + [Other]), + false end. --spec job_class_disabled_message(ReturnType :: atom(), Class :: term()) - -> binary() | string(). +-spec job_class_disabled_message(ReturnType :: atom(), + Class :: term()) -> binary() | string(). + %% @doc The error message to be returned to a client for a disabled job class. %% WARNING: %% * This function is likely to be extended to accept a Job as well as a Class %% when the Jobs API is live. job_class_disabled_message(binary, Class) -> - erlang:list_to_binary(job_class_disabled_message(text, Class)); + erlang:list_to_binary(job_class_disabled_message(text, + Class)); job_class_disabled_message(text, Class) -> - lists:flatten(io_lib:format("Operation '~p' is not enabled", [Class])). + lists:flatten(io_lib:format("Operation '~p' is not enabled", + [Class])). + +-spec report_job_request_disposition(Accepted :: + boolean(), + Class :: term(), Mod :: module(), + Func :: atom(), Line :: pos_integer(), + Client :: term()) -> ok | {error, term()}. --spec report_job_request_disposition(Accepted :: boolean(), Class :: term(), - Mod :: module(), Func :: atom(), Line :: pos_integer(), Client :: term()) - -> ok | {error, term()}. %% @doc Report/record the disposition of an async job request. %% %% Logs an appropriate message and reports to whoever needs to know. @@ -865,12 +848,18 @@ job_class_disabled_message(text, Class) -> %% available, Client is an atom representing the protocol through which the %% request was received. 
%% -report_job_request_disposition(true, Class, Mod, Func, Line, Client) -> - logger:debug("Request '~p' accepted from ~p", [Class, Client], - #{pid => erlang:self(), module => Mod, function => Func, line => Line}); -report_job_request_disposition(false, Class, Mod, Func, Line, Client) -> - logger:warning("Request '~p' disabled from ~p", [Class, Client], - #{pid => erlang:self(), module => Mod, function => Func, line => Line}). +report_job_request_disposition(true, Class, Mod, Func, + Line, Client) -> + logger:debug("Request '~p' accepted from ~p", + [Class, Client], + #{pid => erlang:self(), module => Mod, function => Func, + line => Line}); +report_job_request_disposition(false, Class, Mod, Func, + Line, Client) -> + logger:warning("Request '~p' disabled from ~p", + [Class, Client], + #{pid => erlang:self(), module => Mod, function => Func, + line => Line}). %% =================================================================== %% Preflist utility functions @@ -878,81 +867,15 @@ report_job_request_disposition(false, Class, Mod, Func, Line, Client) -> %% @doc Given a bucket/key, determine the associated preflist index_n. -spec get_index_n({binary(), binary()}) -> index_n(). + get_index_n({Bucket, Key}) -> - BucketProps = riak_core_bucket:get_bucket(Bucket), - N = proplists:get_value(n_val, BucketProps), + %% BucketProps = riak_core_bucket:get_bucket(Bucket), + {ok, N} = application:get_env(riak_core, target_n_val), ChashKey = riak_core_util:chash_key({Bucket, Key}), {ok, CHBin} = riak_core_ring_manager:get_chash_bin(), Index = chashbin:responsible_index(ChashKey, CHBin), {Index, N}. -%% @doc Given an index, determine all sibling indices that participate in one -%% or more preflists with the specified index. --spec preflist_siblings(index()) -> [index()]. -preflist_siblings(Index) -> - {ok, Ring} = riak_core_ring_manager:get_my_ring(), - preflist_siblings(Index, Ring). - -%% @doc See {@link preflist_siblings/1}. 
--spec preflist_siblings(index(), riak_core_ring()) -> [index()]. -preflist_siblings(Index, Ring) -> - MaxN = determine_max_n(Ring), - preflist_siblings(Index, MaxN, Ring). - --spec preflist_siblings(index(), pos_integer(), riak_core_ring()) -> [index()]. -preflist_siblings(Index, N, Ring) -> - IndexBin = <>, - PL = riak_core_ring:preflist(IndexBin, Ring), - Indices = [Idx || {Idx, _} <- PL], - RevIndices = lists:reverse(Indices), - {Succ, _} = lists:split(N-1, Indices), - {Pred, _} = lists:split(N-1, tl(RevIndices)), - lists:reverse(Pred) ++ Succ. - --spec responsible_preflists(index()) -> [index_n()]. -responsible_preflists(Index) -> - {ok, Ring} = riak_core_ring_manager:get_my_ring(), - responsible_preflists(Index, Ring). - --spec responsible_preflists(index(), riak_core_ring()) -> [index_n()]. -responsible_preflists(Index, Ring) -> - AllN = determine_all_n(Ring), - responsible_preflists(Index, AllN, Ring). - --spec responsible_preflists(index(), [pos_integer(),...], riak_core_ring()) - -> [index_n()]. -responsible_preflists(Index, AllN, Ring) -> - IndexBin = <>, - PL = riak_core_ring:preflist(IndexBin, Ring), - Indices = [Idx || {Idx, _} <- PL], - RevIndices = lists:reverse(Indices), - lists:flatmap(fun(N) -> - responsible_preflists_n(RevIndices, N) - end, AllN). - --spec responsible_preflists_n([index()], pos_integer()) -> [index_n()]. -responsible_preflists_n(RevIndices, N) -> - {Pred, _} = lists:split(N, RevIndices), - [{Idx, N} || Idx <- lists:reverse(Pred)]. - - --spec determine_max_n(riak_core_ring()) -> pos_integer(). -determine_max_n(Ring) -> - lists:max(determine_all_n(Ring)). - --spec determine_all_n(riak_core_ring()) -> [pos_integer(),...]. 
-determine_all_n(Ring) -> - Buckets = riak_core_ring:get_buckets(Ring), - BucketProps = [riak_core_bucket:get_bucket(Bucket, Ring) || Bucket <- Buckets], - Default = application:get_env(riak_core, default_bucket_props, undefined), - DefaultN = proplists:get_value(n_val, Default), - AllN = lists:foldl(fun(Props, AllN) -> - N = proplists:get_value(n_val, Props), - ordsets:add_element(N, AllN) - end, [DefaultN], BucketProps), - AllN. - - %% =================================================================== %% EUnit tests %% =================================================================== @@ -961,130 +884,101 @@ determine_all_n(Ring) -> moment_test() -> M1 = riak_core_util:moment(), M2 = riak_core_util:moment(), - ?assert(M2 >= M1). + ?assert((M2 >= M1)). clientid_uniqueness_test() -> - ClientIds = [mkclientid('somenode@somehost') || _I <- lists:seq(0, 10000)], - length(ClientIds) =:= length(sets:to_list(sets:from_list(ClientIds))). + ClientIds = [mkclientid(somenode@somehost) + || _I <- lists:seq(0, 10000)], + length(ClientIds) =:= + length(sets:to_list(sets:from_list(ClientIds))). 
build_tree_test() -> - Flat = [1, - 11, 12, - 111, 112, 121, 122, - 1111, 1112, 1121, 1122, 1211, 1212, 1221, 1222], - + Flat = [1, 11, 12, 111, 112, 121, 122, 1111, 1112, 1121, + 1122, 1211, 1212, 1221, 1222], %% 2-ary tree decomposition - ATree = [{1, [ 11, 12]}, - {11, [ 111, 112]}, - {12, [ 121, 122]}, - {111, [1111, 1112]}, - {112, [1121, 1122]}, - {121, [1211, 1212]}, - {122, [1221, 1222]}, - {1111, []}, - {1112, []}, - {1121, []}, - {1122, []}, - {1211, []}, - {1212, []}, - {1221, []}, + ATree = [{1, [11, 12]}, {11, [111, 112]}, + {12, [121, 122]}, {111, [1111, 1112]}, + {112, [1121, 1122]}, {121, [1211, 1212]}, + {122, [1221, 1222]}, {1111, []}, {1112, []}, {1121, []}, + {1122, []}, {1211, []}, {1212, []}, {1221, []}, {1222, []}], - %% 2-ary tree decomposition with cyclic wrap-around - CTree = [{1, [ 11, 12]}, - {11, [ 111, 112]}, - {12, [ 121, 122]}, - {111, [1111, 1112]}, - {112, [1121, 1122]}, - {121, [1211, 1212]}, - {122, [1221, 1222]}, - {1111, [ 1, 11]}, - {1112, [ 12, 111]}, - {1121, [ 112, 121]}, - {1122, [ 122, 1111]}, - {1211, [1112, 1121]}, - {1212, [1122, 1211]}, - {1221, [1212, 1221]}, - {1222, [1222, 1]}], - - ?assertEqual(ATree, build_tree(2, Flat, [])), - ?assertEqual(CTree, build_tree(2, Flat, [cycles])), + CTree = [{1, [11, 12]}, {11, [111, 112]}, + {12, [121, 122]}, {111, [1111, 1112]}, + {112, [1121, 1122]}, {121, [1211, 1212]}, + {122, [1221, 1222]}, {1111, [1, 11]}, {1112, [12, 111]}, + {1121, [112, 121]}, {1122, [122, 1111]}, + {1211, [1112, 1121]}, {1212, [1122, 1211]}, + {1221, [1212, 1221]}, {1222, [1222, 1]}], + ?assertEqual(ATree, (build_tree(2, Flat, []))), + ?assertEqual(CTree, (build_tree(2, Flat, [cycles]))), ok. - counter_loop(N) -> receive - {up, Pid} -> - N2=N+1, - Pid ! {counter_value, N2}, - counter_loop(N2); - down -> - counter_loop(N-1); - exit -> - exit(normal) + {up, Pid} -> + N2 = N + 1, Pid ! {counter_value, N2}, counter_loop(N2); + down -> counter_loop(N - 1); + exit -> exit(normal) end. 
incr_counter(CounterPid) -> CounterPid ! {up, self()}, receive - {counter_value, N} -> N - after - 3000 -> - ?assert(false) + {counter_value, N} -> N after 3000 -> ?assert(false) end. -decr_counter(CounterPid) -> - CounterPid ! down. +decr_counter(CounterPid) -> CounterPid ! down. multi_keydelete_test_() -> - Languages = [{lisp, 1958}, - {ml, 1973}, - {erlang, 1986}, - {haskell, 1990}, - {ocaml, 1996}, - {clojure, 2007}, + Languages = [{lisp, 1958}, {ml, 1973}, {erlang, 1986}, + {haskell, 1990}, {ocaml, 1996}, {clojure, 2007}, {elixir, 2012}], - ?_assertMatch( - [{lisp, _}, {ml, _}, {erlang, _}, {haskell, _}], - multi_keydelete([ocaml, clojure, elixir], Languages)). + ?_assertMatch([{lisp, _}, {ml, _}, {erlang, _}, + {haskell, _}], + (multi_keydelete([ocaml, clojure, elixir], Languages))). compose_test_() -> Upper = fun string:to_upper/1, Reverse = fun lists:reverse/1, - Strip = fun(S) -> string:strip(S, both, $!) end, + Strip = fun (S) -> string:strip(S, both, $!) end, StripReverseUpper = compose([Upper, Reverse, Strip]), - - Increment = fun(N) when is_integer(N) -> N + 1 end, - Double = fun(N) when is_integer(N) -> N * 2 end, - Square = fun(N) when is_integer(N) -> N * N end, - SquareDoubleIncrement = compose([Increment, Double, Square]), - + Increment = fun (N) when is_integer(N) -> N + 1 end, + Double = fun (N) when is_integer(N) -> N * 2 end, + Square = fun (N) when is_integer(N) -> N * N end, + SquareDoubleIncrement = compose([Increment, Double, + Square]), CompatibleTypes = compose(Increment, - fun(X) when is_list(X) -> list_to_integer(X) end), + fun (X) when is_list(X) -> list_to_integer(X) + end), IncompatibleTypes = compose(Increment, - fun(X) when is_binary(X) -> binary_to_list(X) end), - [?_assertEqual("DLROW OLLEH", StripReverseUpper("Hello world!")), - ?_assertEqual(Increment(Double(Square(3))), SquareDoubleIncrement(3)), - ?_assertMatch(4, CompatibleTypes("3")), - ?_assertError(function_clause, IncompatibleTypes(<<"42">>)), - 
?_assertError(function_clause, compose(fun(X, Y) -> {X, Y} end, fun(X) -> X end))]. + fun (X) when is_binary(X) -> binary_to_list(X) + end), + [?_assertEqual("DLROW OLLEH", + (StripReverseUpper("Hello world!"))), + ?_assertEqual((Increment(Double(Square(3)))), + (SquareDoubleIncrement(3))), + ?_assertMatch(4, (CompatibleTypes("3"))), + ?_assertError(function_clause, + (IncompatibleTypes(<<"42">>))), + ?_assertError(function_clause, + (compose(fun (X, Y) -> {X, Y} end, fun (X) -> X end)))]. pmap_test_() -> - Fgood = fun(X) -> 2 * X end, - Fbad = fun(3) -> throw(die_on_3); - (X) -> Fgood(X) + Fgood = fun (X) -> 2 * X end, + Fbad = fun (3) -> throw(die_on_3); + (X) -> Fgood(X) end, - Lin = [1,2,3,4], - Lout = [2,4,6,8], - {setup, - fun() -> error_logger:tty(false) end, - fun(_) -> error_logger:tty(true) end, - [fun() -> + Lin = [1, 2, 3, 4], + Lout = [2, 4, 6, 8], + {setup, fun () -> error_logger:tty(false) end, + fun (_) -> error_logger:tty(true) end, + [fun () -> % Test simple map case - ?assertEqual(Lout, pmap(Fgood, Lin)), + ?assertEqual(Lout, (pmap(Fgood, Lin))), % Verify a crashing process will not stall pmap Parent = self(), - Pid = spawn(fun() -> + Pid = spawn(fun () -> % Caller trapping exits causes stall!! % TODO: Consider pmapping in a spawned proc % process_flag(trap_exit, true), @@ -1094,80 +988,72 @@ pmap_test_() -> end), MonRef = monitor(process, Pid), receive - {'DOWN', MonRef, _, _, _} -> - ok; - no_crash_yo -> - ?assert(pmap_did_not_crash_as_expected) + {'DOWN', MonRef, _, _, _} -> ok; + no_crash_yo -> ?assert(pmap_did_not_crash_as_expected) end - end - ]}. + end]}. 
bounded_pmap_test_() -> - Fun1 = fun(X) -> X+2 end, - Tests = - fun(CountPid) -> - GFun = fun(Max) -> - fun(X) -> - ?assert(incr_counter(CountPid) =< Max), - timer:sleep(1), - decr_counter(CountPid), - Fun1(X) - end - end, - [ - fun() -> - ?assertEqual(lists:seq(Fun1(1), Fun1(N)), - pmap(GFun(MaxP), - lists:seq(1, N), MaxP)) - end || - MaxP <- lists:seq(1,20), - N <- lists:seq(0,10) - ] - end, + Fun1 = fun (X) -> X + 2 end, + Tests = fun (CountPid) -> + GFun = fun (Max) -> + fun (X) -> + ?assert((incr_counter(CountPid) =< + Max)), + timer:sleep(1), + decr_counter(CountPid), + Fun1(X) + end + end, + [fun () -> + ?assertEqual((lists:seq(Fun1(1), Fun1(N))), + (pmap(GFun(MaxP), lists:seq(1, N), + MaxP))) + end + || MaxP <- lists:seq(1, 20), N <- lists:seq(0, 10)] + end, {setup, - fun() -> - Pid = spawn_link(?MODULE, counter_loop, [0]), - monitor(process, Pid), - Pid - end, - fun(Pid) -> - Pid ! exit, - receive - {'DOWN', _Ref, process, Pid, _Info} -> ok - after - 3000 -> - ?debugMsg("pmap counter process did not go down in time"), - ?assert(false) - end, - ok - end, - Tests - }. + fun () -> + Pid = spawn_link(?MODULE, counter_loop, [0]), + monitor(process, Pid), + Pid + end, + fun (Pid) -> + Pid ! exit, + receive + {'DOWN', _Ref, process, Pid, _Info} -> ok + after 3000 -> + ?debugMsg("pmap counter process did not go down " + "in time"), + ?assert(false) + end, + ok + end, + Tests}. proxy_spawn_test() -> - A = proxy_spawn(fun() -> a end), + A = proxy_spawn(fun () -> a end), ?assertEqual(a, A), - B = proxy_spawn(fun() -> exit(killer_fun) end), + B = proxy_spawn(fun () -> exit(killer_fun) end), ?assertEqual({error, killer_fun}, B), - %% Ensure no errant 'DOWN' messages receive - {'DOWN', _, _, _, _}=Msg -> - throw({error, {badmsg, Msg}}); - _ -> - ok - after 1000 -> - ok + {'DOWN', _, _, _, _} = Msg -> + throw({error, {badmsg, Msg}}); + _ -> ok + after 1000 -> ok end. --ifdef(EQC). +-ifdef(PROPER). count_test() -> - ?assert(eqc:quickcheck(prop_count_correct())). 
+ ?assert((proper:quickcheck(prop_count_correct()))). prop_count_correct() -> - ?FORALL(List, list(bool()), - count(fun(E) -> E end, List) =:= length([E || E <- List, E])). + ?FORALL(List, (list(bool())), + (count(fun (E) -> E end, List) =:= + length([E || E <- List, E]))). -endif. %% EQC + -endif. %% TEST diff --git a/src/riak_core_vnode.erl b/src/riak_core_vnode.erl index f1f2a6dd0..6e29c9a7e 100644 --- a/src/riak_core_vnode.erl +++ b/src/riak_core_vnode.erl @@ -16,125 +16,160 @@ %% under the License. %% %% ------------------------------------------------------------------- --module('riak_core_vnode'). --behaviour(gen_fsm). - --compile({nowarn_deprecated_function, - [{gen_fsm, start_link, 3}, - {gen_fsm, send_event, 2}, - {gen_fsm, send_event_after, 2}, - {gen_fsm, sync_send_event, 3}, - {gen_fsm, send_all_state_event, 2}, - {gen_fsm, sync_send_all_state_event, 2}, - {gen_fsm, cancel_timer, 1}]}). +-module(riak_core_vnode). + +-behaviour(gen_fsm_compat). -include("riak_core_vnode.hrl"). --export([start_link/3, - start_link/4, - wait_for_init/1, - send_command/2, - send_command_after/2]). --export([init/1, - started/2, - started/3, - active/2, - active/3, - handle_event/3, - handle_sync_event/4, - handle_info/3, - terminate/3, - code_change/4]). --export([reply/2, - monitor/1]). --export([get_mod_index/1, - get_modstate/1, - set_forwarding/2, - trigger_handoff/2, - trigger_handoff/3, - trigger_delete/1, - core_status/1, - handoff_error/3]). - --include("stacktrace.hrl"). + +-export([start_link/3, start_link/4, wait_for_init/1, + send_command/2, send_command_after/2]). + +-export([init/1, started/2, started/3, active/2, + active/3, handle_event/3, handle_sync_event/4, + handle_info/3, terminate/3, code_change/4]). + +-export([reply/2, monitor/1]). + +-export([get_mod_index/1, get_modstate/1, + set_forwarding/2, trigger_handoff/2, trigger_handoff/3, + trigger_delete/1, core_status/1, handoff_error/3]). 
+ +-export([cast_finish_handoff/1, send_an_event/2, + send_req/2, send_all_proxy_req/2, cancel_handoff/1, + handoff_complete/1, resize_transfer_complete/2, + handoff_data/3, unregistered/1]). -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). --export([test_link/2, - current_state/1]). +-export([test_link/2, current_state/1]). + -endif. -ifdef(PULSE). + -compile(export_all). + -compile({parse_transform, pulse_instrument}). --compile({pulse_replace_module, [{gen_fsm, pulse_gen_fsm}, - {gen_server, pulse_gen_server}]}). + +-compile({pulse_replace_module, + [{gen_fsm_compat, pulse_gen_fsm}, + {gen_server, pulse_gen_server}]}). + -endif. --define(normal_reason(R), - (R == normal orelse R == shutdown orelse - (is_tuple(R) andalso element(1,R) == shutdown))). +-define(NORMAL_REASON(R), + R == normal orelse + R == shutdown orelse + is_tuple(R) andalso element(1, R) == shutdown). -export_type([vnode_opt/0, pool_opt/0]). -type vnode_opt() :: pool_opt(). --type pool_opt() :: {pool, WorkerModule::module(), PoolSize::pos_integer(), WorkerArgs::[term()]}. - --callback init([partition()]) -> - {ok, ModState::term()} | - {ok, ModState::term(), [vnode_opt()]} | - {error, Reason::term()}. - --callback handle_command(Request::term(), Sender::sender(), ModState::term()) -> - continue | - {reply, Reply::term(), NewModState::term()} | - {noreply, NewModState::term()} | - {async, Work::function(), From::sender(), NewModState::term()} | - {stop, Reason::term(), NewModState::term()}. - --callback handle_coverage(Request::term(), keyspaces(), Sender::sender(), ModState::term()) -> - continue | - {reply, Reply::term(), NewModState::term()} | - {noreply, NewModState::term()} | - {async, Work::function(), From::sender(), NewModState::term()} | - {stop, Reason::term(), NewModState::term()}. - --callback handle_exit(pid(), Reason::term(), ModState::term()) -> - {noreply, NewModState::term()} | - {stop, Reason::term(), NewModState::term()}. 
- --callback handoff_starting(handoff_dest(), ModState::term()) -> - {boolean(), NewModState::term()}. - --callback handoff_cancelled(ModState::term()) -> - {ok, NewModState::term()}. - --callback handoff_finished(handoff_dest(), ModState::term()) -> - {ok, NewModState::term()}. - --callback handle_handoff_command(Request::term(), Sender::sender(), ModState::term()) -> - {reply, Reply::term(), NewModState::term()} | - {noreply, NewModState::term()} | - {async, Work::function(), From::sender(), NewModState::term()} | - {forward, NewModState::term()} | - {drop, NewModState::term()} | - {stop, Reason::term(), NewModState::term()}. - --callback handle_handoff_data(binary(), ModState::term()) -> - {reply, ok | {error, Reason::term()}, NewModState::term()}. - --callback encode_handoff_item(Key::term(), Value::term()) -> - corrupted | binary(). - --callback is_empty(ModState::term()) -> - {boolean(), NewModState::term()} | - {false, Size::pos_integer(), NewModState::term()}. - --callback terminate(Reason::term(), ModState::term()) -> - ok. --callback delete(ModState::term()) -> {ok, NewModState::term()}. +-type pool_opt() :: {pool, WorkerModule :: module(), + PoolSize :: pos_integer(), WorkerArgs :: [term()]}. + +-callback init([partition()]) -> {ok, + ModState :: term()} | + {ok, ModState :: term(), [vnode_opt()]} | + {error, Reason :: term()}. + +-callback handle_command(Request :: term(), + Sender :: sender(), ModState :: term()) -> continue | + {reply, + Reply :: + term(), + NewModState :: + term()} | + {noreply, + NewModState :: + term()} | + {async, + Work :: + function(), + From :: + sender(), + NewModState :: + term()} | + {stop, + Reason :: + term(), + NewModState :: + term()}. 
+ +-callback handle_coverage(Request :: term(), + keyspaces(), Sender :: sender(), + ModState :: term()) -> continue | + {reply, Reply :: term(), + NewModState :: term()} | + {noreply, + NewModState :: term()} | + {async, Work :: function(), + From :: sender(), + NewModState :: term()} | + {stop, Reason :: term(), + NewModState :: term()}. + +-callback handle_exit(pid(), Reason :: term(), + ModState :: term()) -> {noreply, + NewModState :: term()} | + {stop, Reason :: term(), + NewModState :: term()}. + +-callback handoff_starting(handoff_dest(), + ModState :: term()) -> {boolean(), + NewModState :: term()}. + +-callback handoff_cancelled(ModState :: term()) -> {ok, + NewModState :: term()}. + +-callback handoff_finished(handoff_dest(), + ModState :: term()) -> {ok, NewModState :: term()}. + +-callback handle_handoff_command(Request :: term(), + Sender :: sender(), + ModState :: term()) -> {reply, Reply :: term(), + NewModState :: + term()} | + {noreply, + NewModState :: + term()} | + {async, + Work :: function(), + From :: sender(), + NewModState :: + term()} | + {forward, + NewModState :: + term()} | + {drop, + NewModState :: + term()} | + {stop, Reason :: term(), + NewModState :: term()}. + +-callback handle_handoff_data(binary(), + ModState :: term()) -> {reply, + ok | + {error, Reason :: term()}, + NewModState :: term()}. + +-callback encode_handoff_item(Key :: term(), + Value :: term()) -> corrupted | binary(). + +-callback is_empty(ModState :: term()) -> {boolean(), + NewModState :: term()} | + {false, Size :: pos_integer(), + NewModState :: term()}. + +-callback terminate(Reason :: term(), + ModState :: term()) -> ok. + +-callback delete(ModState :: term()) -> {ok, + NewModState :: term()}. %% This commands are not executed inside the VNode, instead they are %% part of the vnode_proxy contract. @@ -157,12 +192,11 @@ %% BUT DO NOT call expensive functions from them there is a special hell %% for people doing that! 
(it's called overflowing message queue hell and is %% really nasty!) --callback handle_overload_command(Request::term(), Sender::sender(), - Idx::partition()) -> - ok. +-callback handle_overload_command(Request :: term(), + Sender :: sender(), Idx :: partition()) -> ok. --callback handle_overload_info(Request::term(), Idx::partition()) -> - ok. +-callback handle_overload_info(Request :: term(), + Idx :: partition()) -> ok. %% handle_exit/3 is an optional behaviour callback that can be implemented. %% It will be called in the case that a process that is linked to the vnode @@ -188,125 +222,226 @@ %% -spec handle_info(term(), term()) -> {ok, term()} -define(DEFAULT_TIMEOUT, 60000). + -define(LOCK_RETRY_TIMEOUT, 10000). --record(state, { - index :: partition(), - mod :: module(), - modstate :: term(), - forward :: node() | [{integer(), node()}], - handoff_target=none :: none | {integer(), node()}, - handoff_pid :: pid() | undefined, - handoff_type :: riak_core_handoff_manager:ho_type() | undefined, - pool_pid :: pid() | undefined, - pool_config :: tuple() | undefined, - manager_event_timer :: reference() | undefined, - inactivity_timeout :: non_neg_integer() - }). + +%% ======== +%% API +%% ======== start_link(Mod, Index, Forward) -> start_link(Mod, Index, 0, Forward). -start_link(Mod, Index, InitialInactivityTimeout, Forward) -> - gen_fsm:start_link(?MODULE, - [Mod, Index, InitialInactivityTimeout, Forward], []). +start_link(Mod, Index, InitialInactivityTimeout, + Forward) -> + gen_fsm_compat:start_link(?MODULE, + [Mod, Index, InitialInactivityTimeout, Forward], + []). +%% #1 - State started +wait_for_init(Vnode) -> + gen_fsm_compat:sync_send_event(Vnode, wait_for_init, + infinity). + +%% #2 - %% Send a command message for the vnode module by Pid - %% typically to do some deferred processing after returning yourself send_command(Pid, Request) -> - gen_fsm:send_event(Pid, ?VNODE_REQ{request=Request}). 
- + gen_fsm_compat:send_event(Pid, + #riak_vnode_req_v1{request = Request}). %% Sends a command to the FSM that called it after Time %% has passed. --spec send_command_after(integer(), term()) -> reference(). -send_command_after(Time, Request) -> - gen_fsm:send_event_after(Time, ?VNODE_REQ{request=Request}). +-spec send_command_after(integer(), + term()) -> reference(). +send_command_after(Time, Request) -> + gen_fsm_compat:send_event_after(Time, + #riak_vnode_req_v1{request = Request}). -init([Mod, Index, InitialInactivityTimeout, Forward]) -> +init([Module, Index, InitialInactivityTimeout, + Forward]) -> process_flag(trap_exit, true), - State = #state{index=Index, mod=Mod, forward=Forward, inactivity_timeout=InitialInactivityTimeout}, + State = #state{index = Index, mod = Module, + forward = Forward, + inactivity_timeout = InitialInactivityTimeout}, {ok, started, State, 0}. -started(timeout, State = - #state{inactivity_timeout=InitialInactivityTimeout}) -> +started(timeout, + State = #state{inactivity_timeout = + InitialInactivityTimeout}) -> case do_init(State) of - {ok, State2} -> - {next_state, active, State2, InitialInactivityTimeout}; - {error, Reason} -> - {stop, Reason} + {ok, State2} -> + {next_state, active, State2, InitialInactivityTimeout}; + {error, Reason} -> {stop, Reason} end. -started(wait_for_init, _From, State = - #state{inactivity_timeout=InitialInactivityTimeout}) -> +started(wait_for_init, _From, + State = #state{inactivity_timeout = + InitialInactivityTimeout}) -> case do_init(State) of - {ok, State2} -> - {reply, ok, active, State2, InitialInactivityTimeout}; - {error, Reason} -> - {stop, Reason} + {ok, State2} -> + {reply, ok, active, State2, InitialInactivityTimeout}; + {error, Reason} -> {stop, Reason} end. 
-do_init(State = #state{index=Index, mod=Mod, forward=Forward}) -> - {ModState, Props} = case Mod:init([Index]) of - {ok, MS} -> {MS, []}; - {ok, MS, P} -> {MS, P}; - {error, R} -> {error, R} - end, +do_init(State = #state{index = Index, mod = Module, + forward = Forward}) -> + {ModState, Props} = case Module:init([Index]) of + {ok, MS} -> {MS, []}; + {ok, MS, P} -> {MS, P}; + {error, R} -> {error, R} + end, case {ModState, Props} of - {error, Reason} -> - {error, Reason}; - _ -> - case lists:keyfind(pool, 1, Props) of - {pool, WorkerModule, PoolSize, WorkerArgs}=PoolConfig -> - logger:debug("starting worker pool ~p with size of ~p~n", - [WorkerModule, PoolSize]), - {ok, PoolPid} = riak_core_vnode_worker_pool:start_link(WorkerModule, - PoolSize, - Index, - WorkerArgs, - worker_props); - _ -> - PoolPid = PoolConfig = undefined - end, - riak_core_handoff_manager:remove_exclusion(Mod, Index), - Timeout = application:get_env(riak_core, vnode_inactivity_timeout, ?DEFAULT_TIMEOUT), - Timeout2 = Timeout + riak_core_rand:uniform(Timeout), - State2 = State#state{modstate=ModState, inactivity_timeout=Timeout2, - pool_pid=PoolPid, pool_config=PoolConfig}, - logger:debug("vnode :: ~p/~p :: ~p~n", [Mod, Index, Forward]), - State3 = mod_set_forwarding(Forward, State2), - {ok, State3} + {error, Reason} -> {error, Reason}; + _ -> + PoolConfig = case lists:keyfind(pool, 1, Props) of + {pool, WorkerModule, PoolSize, WorkerArgs} = PoolCfg -> + logger:debug("starting worker pool ~p with size of " + "~p~n", + [WorkerModule, PoolSize]), + {ok, PoolPid} = + riak_core_vnode_worker_pool:start_link(WorkerModule, + PoolSize, + Index, + WorkerArgs, + worker_props), + PoolCfg; + _ -> PoolPid = undefined + end, + riak_core_handoff_manager:remove_exclusion(Module, + Index), + Timeout = application:get_env(riak_core, + vnode_inactivity_timeout, + ?DEFAULT_TIMEOUT), + Timeout2 = Timeout + rand:uniform(Timeout), + State2 = State#state{modstate = ModState, + inactivity_timeout = Timeout2, + pool_pid 
= PoolPid, pool_config = PoolConfig}, + logger:debug("vnode :: ~p/~p :: ~p~n", + [Module, Index, Forward]), + State3 = mod_set_forwarding(Forward, State2), + {ok, State3} end. wait_for_init(Vnode) -> - gen_fsm:sync_send_event(Vnode, wait_for_init, infinity). + gen_fsm_compat:sync_send_event(Vnode, wait_for_init, + infinity). handoff_error(Vnode, Err, Reason) -> - gen_fsm:send_event(Vnode, {handoff_error, Err, Reason}). + gen_fsm_compat:send_event(Vnode, + {handoff_error, Err, Reason}). +%% #4 - get_mod_index(VNode) -> - gen_fsm:sync_send_all_state_event(VNode, get_mod_index). + gen_fsm_compat:sync_send_all_state_event(VNode, + get_mod_index). +%% #5 set_forwarding(VNode, ForwardTo) -> - gen_fsm:send_all_state_event(VNode, {set_forwarding, ForwardTo}). + gen_fsm_compat:send_all_state_event(VNode, + {set_forwarding, ForwardTo}). +%% #6 trigger_handoff(VNode, TargetIdx, TargetNode) -> - gen_fsm:send_all_state_event(VNode, {trigger_handoff, TargetIdx, TargetNode}). + gen_fsm_compat:send_all_state_event(VNode, + {trigger_handoff, TargetIdx, + TargetNode}). +%% #7 trigger_handoff(VNode, TargetNode) -> - gen_fsm:send_all_state_event(VNode, {trigger_handoff, TargetNode}). + gen_fsm_compat:send_all_state_event(VNode, + {trigger_handoff, TargetNode}). +%% #8 trigger_delete(VNode) -> - gen_fsm:send_all_state_event(VNode, trigger_delete). + gen_fsm_compat:send_all_state_event(VNode, + trigger_delete). +%% #9 core_status(VNode) -> - gen_fsm:sync_send_all_state_event(VNode, core_status). + gen_fsm_compat:sync_send_all_state_event(VNode, + core_status). -continue(State) -> - {next_state, active, State, State#state.inactivity_timeout}. +%% #10 +%% Sends a command to the FSM that called it after Time +%% has passed. +-spec send_command_after(integer(), + term()) -> reference(). -continue(State, NewModState) -> - continue(State#state{modstate=NewModState}). +send_command_after(Time, Request) -> + gen_fsm_compat:send_event_after(Time, + #riak_vnode_req_v1{request = Request}). 
+ +%%%%%%% %new APIs +%% #11 - riak_core_vnode_manager - handle_vnode_event +cast_finish_handoff(VNode) -> + gen_fsm_compat:send_all_state_event(VNode, + finish_handoff). + +%% #12 - riak_core_vnode_manager - handle_vnode_event +cancel_handoff(VNode) -> + gen_fsm_compat:send_all_state_event(VNode, + cancel_handoff). + +%% # - riak_core_vnode_master - command2 +%send_req + +%% # - riak_core_vnode_master - handle_cast/handle_call +send_req(VNode, Req) -> + gen_fsm_compat:send_event(VNode, Req). + +%% # - riak_core_vnode_master - handle_call +send_all_proxy_req(VNode, Req) -> + gen_fsm_compat:send_all_state_event(VNode, Req). + +%% #16 - riak:core_handoff_sender - start_fold_ +handoff_complete(VNode) -> + gen_fsm_compat:send_event(VNode, handoff_complete). + +%% #17 - riak:core_handoff_sender - start_fold_ +resize_transfer_complete(VNode, NotSentAcc) -> + gen_fsm_compat:send_event(VNode, + {resize_transfer_complete, NotSentAcc}). + +%% #18 - riak_core_handoff_receiver - process_message +handoff_data(VNode, MsgData, VNodeTimeout) -> + gen_fsm_compat:sync_send_all_state_event(VNode, + {handoff_data, MsgData}, + VNodeTimeout). + +%% #19 - riak_core_vnode_proxy - handle_cast +unregistered(VNode) -> + gen_fsm_compat:send_event(VNode, unregistered). + +%% @doc Send a reply to a vnode request. If +%% the Ref is undefined just send the reply +%% for compatibility with pre-0.12 requestors. +%% If Ref is defined, send it along with the +%% reply. +%% NOTE: We *always* send the reply using unreliable delivery. +%% +-spec reply(sender(), term()) -> any(). 
+ +reply({fsm, undefined, From}, Reply) -> + riak_core_send_msg:send_event_unreliable(From, Reply); +reply({fsm, Ref, From}, Reply) -> + riak_core_send_msg:send_event_unreliable(From, + {Ref, Reply}); +reply({server, undefined, From}, Reply) -> + riak_core_send_msg:reply_unreliable(From, Reply); +reply({server, Ref, From}, Reply) -> + riak_core_send_msg:reply_unreliable(From, {Ref, Reply}); +reply({raw, Ref, From}, Reply) -> + riak_core_send_msg:bang_unreliable(From, {Ref, Reply}); +reply(ignore, _Reply) -> ok. + +%% @doc Set up a monitor for the pid named by a {@type sender()} vnode +%% argument. If `Sender' was the atom `ignore', this function sets up +%% a monitor on `self()' in order to return a valid (if useless) +%% monitor reference. +-spec monitor(Sender :: sender()) -> Monitor :: + reference(). %% Active vnodes operate in three states: normal, handoff, and forwarding. %% @@ -336,243 +471,252 @@ continue(State, NewModState) -> %% transfers with this vnode as the source. During this time requests that can be forwarded %% to a partition for which the transfer has already completed, are forwarded. All other %% requests are passed to handle_handoff_command. 
-forward_or_vnode_command(Sender, Request, State=#state{forward=Forward, - mod=Mod, - index=Index}) -> +forward_or_vnode_command(Sender, Request, + State = #state{forward = Forward, mod = Module, + index = Index}) -> Resizing = is_list(Forward), RequestHash = case Resizing of - true -> - Mod:request_hash(Request); - false -> - undefined - end, + true -> Module:request_hash(Request); + false -> undefined + end, case {Forward, RequestHash} of - %% typical vnode operation, no forwarding set, handle request locally - {undefined, _} -> vnode_command(Sender, Request, State); - - %% implicit forwarding after ownership transfer/hinted handoff - {F, _} when not is_list(F) -> - vnode_forward(implicit, {Index, Forward}, Sender, Request, State), - continue(State); - - %% during resize we can't forward a request w/o request hash, always handle locally - {_, undefined} -> vnode_command(Sender, Request, State); - - %% possible forwarding during ring resizing - {_, _} -> - {ok, R} = riak_core_ring_manager:get_my_ring(), - FutureIndex = riak_core_ring:future_index(RequestHash, Index, R), - vnode_resize_command(Sender, Request, FutureIndex, State) + %% typical vnode operation, no forwarding set, handle request locally + {undefined, _} -> vnode_command(Sender, Request, State); + %% implicit forwarding after ownership transfer/hinted handoff + {F, _} when not is_list(F) -> + vnode_forward(implicit, {Index, Forward}, Sender, + Request, State), + continue(State); + %% during resize we can't forward a request w/o request hash, always handle locally + {_, undefined} -> vnode_command(Sender, Request, State); + %% possible forwarding during ring resizing + {_, _} -> + {ok, R} = riak_core_ring_manager:get_my_ring(), + FutureIndex = riak_core_ring:future_index(RequestHash, + Index, R), + vnode_resize_command(Sender, Request, FutureIndex, + State) end. 
-vnode_command(_Sender, _Request, State=#state{modstate={deleted,_}}) -> +vnode_command(_Sender, _Request, + State = #state{modstate = {deleted, _}}) -> continue(State); -vnode_command(Sender, Request, State=#state{mod=Mod, - modstate=ModState, - pool_pid=Pool}) -> - case catch Mod:handle_command(Request, Sender, ModState) of - {'EXIT', ExitReason} -> - reply(Sender, {vnode_error, ExitReason}), - logger:error("~p command failed ~p", [Mod, ExitReason]), - {stop, ExitReason, State#state{modstate=ModState}}; - continue -> - continue(State, ModState); - {reply, Reply, NewModState} -> - reply(Sender, Reply), - continue(State, NewModState); - {noreply, NewModState} -> - continue(State, NewModState); - {async, Work, From, NewModState} -> - %% dispatch some work to the vnode worker pool - %% the result is sent back to 'From' - riak_core_vnode_worker_pool:handle_work(Pool, Work, From), - continue(State, NewModState); - {stop, Reason, NewModState} -> - {stop, Reason, State#state{modstate=NewModState}} +vnode_command(Sender, Request, + State = #state{mod = Module, modstate = ModState, + pool_pid = Pool}) -> + case catch Module:handle_command(Request, Sender, + ModState) + of + {'EXIT', ExitReason} -> + reply(Sender, {vnode_error, ExitReason}), + logger:error("~p command failed ~p", + [Module, ExitReason]), + {stop, ExitReason, State#state{modstate = ModState}}; + continue -> continue(State, ModState); + {reply, Reply, NewModState} -> + reply(Sender, Reply), continue(State, NewModState); + {noreply, NewModState} -> continue(State, NewModState); + {async, Work, From, NewModState} -> + %% dispatch some work to the vnode worker pool + %% the result is sent back to 'From' + riak_core_vnode_worker_pool:handle_work(Pool, Work, + From), + continue(State, NewModState); + {stop, Reason, NewModState} -> + {stop, Reason, State#state{modstate = NewModState}} end. 
-vnode_coverage(Sender, Request, KeySpaces, State=#state{index=Index, - mod=Mod, - modstate=ModState, - pool_pid=Pool, - forward=Forward}) -> - %% Check if we should forward - case Forward of - undefined -> - Action = Mod:handle_coverage(Request, KeySpaces, Sender, ModState); - %% handle coverage requests locally during ring resize - Forwards when is_list(Forwards) -> - Action = Mod:handle_coverage(Request, KeySpaces, Sender, ModState); - NextOwner -> - logger:debug("Forwarding coverage ~p -> ~p: ~p~n", [node(), NextOwner, Index]), - riak_core_vnode_master:coverage(Request, {Index, NextOwner}, - KeySpaces, Sender, - riak_core_vnode_master:reg_name(Mod)), - Action = continue - end, - case Action of - continue -> - continue(State, ModState); - {reply, Reply, NewModState} -> - reply(Sender, Reply), - continue(State, NewModState); - {noreply, NewModState} -> - continue(State, NewModState); - {async, Work, From, NewModState} -> - %% dispatch some work to the vnode worker pool - %% the result is sent back to 'From' - riak_core_vnode_worker_pool:handle_work(Pool, Work, From), - continue(State, NewModState); - {stop, Reason, NewModState} -> - {stop, Reason, State#state{modstate=NewModState}} - end. +%% ======================== +%% ======== +%% State, Mode, Init, Terminate +%% ======== +%% ======================== +-record(state, + {index :: partition(), mod :: module(), + modstate :: term(), + forward :: node() | [{integer(), node()}], + handoff_target = none :: none | {integer(), node()}, + handoff_pid :: pid() | undefined, + handoff_type :: + riak_core_handoff_manager:ho_type() | undefined, + pool_pid :: pid() | undefined, + pool_config :: tuple() | undefined, + manager_event_timer :: reference() | undefined, + inactivity_timeout :: non_neg_integer()}). 
+ +init([Module, Index, InitialInactivityTimeout, + Forward]) -> + process_flag(trap_exit, true), + State = #state{index = Index, mod = Module, + forward = Forward, + inactivity_timeout = InitialInactivityTimeout}, + {ok, started, State, 0}. -vnode_handoff_command(Sender, Request, ForwardTo, - State=#state{mod=Mod, - modstate=ModState, - handoff_target=HOTarget, - handoff_type=HOType, - pool_pid=Pool}) -> - case Mod:handle_handoff_command(Request, Sender, ModState) of - {reply, Reply, NewModState} -> - reply(Sender, Reply), - continue(State, NewModState); - {noreply, NewModState} -> - continue(State, NewModState); - {async, Work, From, NewModState} -> - %% dispatch some work to the vnode worker pool - %% the result is sent back to 'From' - riak_core_vnode_worker_pool:handle_work(Pool, Work, From), - continue(State, NewModState); - {forward, NewModState} -> - forward_request(HOType, Request, HOTarget, ForwardTo, Sender, State), - continue(State, NewModState); - {forward, NewReq, NewModState} -> - forward_request(HOType, NewReq, HOTarget, ForwardTo, Sender, State), - continue(State, NewModState); - {drop, NewModState} -> - continue(State, NewModState); - {stop, Reason, NewModState} -> - {stop, Reason, State#state{modstate=NewModState}} +terminate(Reason, _StateName, + #state{mod = Module, modstate = ModState, + pool_pid = Pool}) -> + %% Shutdown if the pool is still alive and a normal `Reason' is + %% given - there could be a race on delivery of the unregistered + %% event and successfully shutting down the pool. + try case is_pid(Pool) andalso + is_process_alive(Pool) andalso (?NORMAL_REASON(Reason)) + of + true -> + riak_core_vnode_worker_pool:shutdown_pool(Pool, 60000); + _ -> ok + end + catch + Type:Reason:Stacktrace -> + logger:error("Error while shutting down vnode worker " + "pool ~p:~p trace : ~p", + [Type, Reason, Stacktrace]) + after + case ModState of + %% Handoff completed, Module:delete has been called, now terminate. 
+ {deleted, ModState1} -> + Module:terminate(Reason, ModState1); + _ -> Module:terminate(Reason, ModState) + end end. -%% @private wrap the request for resize forwards, and use the resize -%% target. -forward_request(resize, Request, _HOTarget, ResizeTarget, Sender, State) -> - %% resize op and transfer ongoing - vnode_forward(resize, ResizeTarget, Sender, {resize_forward, Request}, State); -forward_request(undefined, Request, _HOTarget, ResizeTarget, Sender, State) -> - %% resize op ongoing, no resize transfer ongoing, arrive here - %% via forward_or_vnode_command - vnode_forward(resize, ResizeTarget, Sender, {resize_forward, Request}, State); -forward_request(_, Request, HOTarget, _ResizeTarget, Sender, State) -> - %% normal explicit forwarding during owhership transfer - vnode_forward(explicit, HOTarget, Sender, Request, State). - -vnode_forward(Type, ForwardTo, Sender, Request, State) -> - logger:debug("Forwarding (~p) {~p,~p} -> ~p~n", - [Type, State#state.index, node(), ForwardTo]), - riak_core_vnode_master:command_unreliable(ForwardTo, Request, Sender, - riak_core_vnode_master:reg_name(State#state.mod)). +code_change(_OldVsn, StateName, State, _Extra) -> + {ok, StateName, State}. -%% @doc during ring resizing if we have completed a transfer to the index that will -%% handle request in future ring we forward to it. 
Otherwise we delegate -%% to the local vnode like other requests during handoff -vnode_resize_command(Sender, Request, FutureIndex, - State=#state{forward=Forward}) when is_list(Forward) -> - case lists:keyfind(FutureIndex, 1, Forward) of - false -> vnode_command(Sender, Request, State); - {FutureIndex, FutureOwner} -> vnode_handoff_command(Sender, Request, - {FutureIndex, FutureOwner}, - State) +%% ======================== +%% ======== +%% States +%% ======== +%% ======================== + +%% started +%% ======== +started(timeout, + State = #state{inactivity_timeout = + InitialInactivityTimeout}) -> + case do_init(State) of + {ok, State2} -> + {next_state, active, State2, InitialInactivityTimeout}; + {error, Reason} -> {stop, Reason} end. +started(wait_for_init, _From, + State = #state{inactivity_timeout = + InitialInactivityTimeout}) -> + case do_init(State) of + {ok, State2} -> + {reply, ok, active, State2, InitialInactivityTimeout}; + {error, Reason} -> {stop, Reason} + end. -active(timeout, State=#state{mod=Mod, index=Idx}) -> - riak_core_vnode_manager:vnode_event(Mod, Idx, self(), inactive), +%%active +%%%%%%%%%%%% +active(timeout, + State = #state{mod = Module, index = Idx}) -> + riak_core_vnode_manager:vnode_event(Module, Idx, self(), + inactive), continue(State); -active(#riak_coverage_req_v1{keyspaces=KeySpaces, - request=Request, - sender=Sender}, State) -> +active(#riak_coverage_req_v1{keyspaces = KeySpaces, + request = Request, sender = Sender}, + State) -> %% Coverage request handled in handoff and non-handoff. Will be forwarded if set. 
vnode_coverage(Sender, Request, KeySpaces, State); -active(#riak_vnode_req_v1{sender=Sender, request={resize_forward, Request}}, State) -> +active(#riak_vnode_req_v1{sender = Sender, + request = {resize_forward, Request}}, + State) -> vnode_command(Sender, Request, State); -active(#riak_vnode_req_v1{sender=Sender, request=Request}, - State=#state{handoff_target=HT}) when HT =:= none -> +active(#riak_vnode_req_v1{sender = Sender, + request = Request}, + State = #state{handoff_target = HT}) + when HT =:= none -> forward_or_vnode_command(Sender, Request, State); -active(#riak_vnode_req_v1{sender=Sender, request=Request}, - State=#state{handoff_type=resize, - handoff_target={HOIdx,HONode}, - index=Index, - forward=Forward, - mod=Mod}) -> - RequestHash = Mod:request_hash(Request), +active(#riak_vnode_req_v1{sender = Sender, + request = Request}, + State = #state{handoff_type = resize, + handoff_target = {HOIdx, HONode}, index = Index, + forward = Forward, mod = Module}) -> + RequestHash = Module:request_hash(Request), case RequestHash of - %% will never have enough information to forward request so only handle locally - undefined -> vnode_command(Sender, Request, State); - _ -> - {ok, R} = riak_core_ring_manager:get_my_ring(), - FutureIndex = riak_core_ring:future_index(RequestHash, Index, R), - case FutureIndex of - %% request for portion of keyspace currently being transferred - HOIdx -> vnode_handoff_command(Sender, Request, - {HOIdx, HONode}, State); - %% some portions of keyspace already transferred - _Other when is_list(Forward) -> - vnode_resize_command(Sender, Request, FutureIndex, State); - %% some portions of keyspace not already transferred - _Other -> vnode_command(Sender, Request, State) - end + %% will never have enough information to forward request so only handle locally + undefined -> vnode_command(Sender, Request, State); + _ -> + {ok, R} = riak_core_ring_manager:get_my_ring(), + FutureIndex = riak_core_ring:future_index(RequestHash, + Index, R), + 
case FutureIndex of + %% request for portion of keyspace currently being transferred + HOIdx -> + vnode_handoff_command(Sender, Request, {HOIdx, HONode}, + State); + %% some portions of keyspace already transferred + _Other when is_list(Forward) -> + vnode_resize_command(Sender, Request, FutureIndex, + State); + %% some portions of keyspace not already transferred + _Other -> vnode_command(Sender, Request, State) + end end; -active(#riak_vnode_req_v1{sender=Sender, request=Request},State) -> - vnode_handoff_command(Sender, Request, State#state.handoff_target, State); +active(#riak_vnode_req_v1{sender = Sender, + request = Request}, + State) -> + vnode_handoff_command(Sender, Request, + State#state.handoff_target, State); active(handoff_complete, State) -> - State2 = start_manager_event_timer(handoff_complete, State), + State2 = start_manager_event_timer(handoff_complete, + State), continue(State2); -active({resize_transfer_complete, SeenIdxs}, State=#state{mod=Mod, - modstate=ModState, - handoff_target=Target}) -> +active({resize_transfer_complete, SeenIdxs}, + State = #state{mod = Module, modstate = ModState, + handoff_target = Target}) -> case Target of - none -> continue(State); - _ -> - %% TODO: refactor similarties w/ finish_handoff handle_event - {ok, NewModState} = Mod:handoff_finished(Target, ModState), - finish_handoff(SeenIdxs, State#state{modstate=NewModState}) + none -> continue(State); + _ -> + %% TODO: refactor similarties w/ finish_handoff handle_event + {ok, NewModState} = Module:handoff_finished(Target, + ModState), + finish_handoff(SeenIdxs, + State#state{modstate = NewModState}) end; active({handoff_error, _Err, _Reason}, State) -> - State2 = start_manager_event_timer(handoff_error, State), + State2 = start_manager_event_timer(handoff_error, + State), continue(State2); active({send_manager_event, Event}, State) -> State2 = start_manager_event_timer(Event, State), continue(State2); active({trigger_handoff, TargetNode}, State) -> - 
active({trigger_handoff, State#state.index, TargetNode}, State); -active({trigger_handoff, TargetIdx, TargetNode}, State) -> - maybe_handoff(TargetIdx, TargetNode, State); -active(trigger_delete, State=#state{mod=Mod,modstate=ModState,index=Idx}) -> - case mark_delete_complete(Idx, Mod) of - {ok, _NewRing} -> - {ok, NewModState} = Mod:delete(ModState), - logger:debug("~p ~p vnode deleted", [Idx, Mod]); - _ -> NewModState = ModState + active({trigger_handoff, State#state.index, TargetNode}, + State); +active({trigger_handoff, TargetIdx, TargetNode}, + State) -> + maybe_handoff(TargetIdx, TargetNode, State); +active(trigger_delete, + State = #state{mod = Module, modstate = ModState, + index = Idx}) -> + case mark_delete_complete(Idx, Module) of + {ok, _NewRing} -> + {ok, NewModState} = Module:delete(ModState), + logger:debug("~p ~p vnode deleted", [Idx, Module]); + _ -> NewModState = ModState end, maybe_shutdown_pool(State), - riak_core_vnode_manager:unregister_vnode(Idx, Mod), - continue(State#state{modstate={deleted,NewModState}}); -active(unregistered, State=#state{mod=Mod, index=Index}) -> + riak_core_vnode_manager:unregister_vnode(Idx, Module), + continue(State#state{modstate = + {deleted, NewModState}}); +active(unregistered, + State = #state{mod = Module, index = Index}) -> %% Add exclusion so the ring handler will not try to spin this vnode %% up until it receives traffic. - riak_core_handoff_manager:add_exclusion(Mod, Index), + riak_core_handoff_manager:add_exclusion(Module, Index), logger:debug("~p ~p vnode excluded and unregistered.", - [Index, Mod]), - {stop, normal, State#state{handoff_target=none, - handoff_type=undefined, - pool_pid=undefined}}. + [Index, Module]), + {stop, normal, + State#state{handoff_target = none, + handoff_type = undefined, pool_pid = undefined}}. active(_Event, _From, State) -> Reply = ok, - {reply, Reply, active, State, State#state.inactivity_timeout}. + {reply, Reply, active, State, + State#state.inactivity_timeout}. 
%% This code lives in riak_core_vnode rather than riak_core_vnode_manager %% because the ring_trans call is a synchronous call to the ring manager, @@ -580,468 +724,878 @@ active(_Event, _From, State) -> %% manager. Blocking the manager can impact all vnodes. This code is safe %% to execute on multiple parallel vnodes because of the synchronization %% afforded by having all ring changes go through the single ring manager. -mark_handoff_complete(SrcIdx, Target, SeenIdxs, Mod, resize) -> +mark_handoff_complete(SrcIdx, Target, SeenIdxs, Mod, + resize) -> Prev = node(), Source = {SrcIdx, Prev}, - Result = riak_core_ring_manager:ring_trans( - fun(Ring, _) -> - Owner = riak_core_ring:index_owner(Ring,SrcIdx), - Status = riak_core_ring:resize_transfer_status(Ring, Source, - Target, Mod), + TransFun = fun (Ring, _) -> + Owner = riak_core_ring:index_owner(Ring, SrcIdx), + Status = riak_core_ring:resize_transfer_status(Ring, + Source, + Target, + Mod), case {Owner, Status} of - {Prev, awaiting} -> - F = fun(SeenIdx, RingAcc) -> - riak_core_ring:schedule_resize_transfer(RingAcc, - Source, - SeenIdx) - end, - Ring2 = lists:foldl(F, Ring, ordsets:to_list(SeenIdxs)), - Ring3 = riak_core_ring:resize_transfer_complete(Ring2, - Source, - Target, - Mod), - %% local ring optimization (see below) - {set_only, Ring3}; - _ -> - ignore + {Prev, awaiting} -> + F = fun (SeenIdx, RingAcc) -> + riak_core_ring:schedule_resize_transfer(RingAcc, + Source, + SeenIdx) + end, + Ring2 = lists:foldl(F, Ring, + ordsets:to_list(SeenIdxs)), + Ring3 = + riak_core_ring:resize_transfer_complete(Ring2, + Source, + Target, + Mod), + %% local ring optimization (see below) + {set_only, Ring3}; + _ -> ignore end - end, []), + end, + Result = riak_core_ring_manager:ring_trans(TransFun, + []), case Result of - {ok, _NewRing} -> resize; - _ -> continue + {ok, _NewRing} -> resize; + _ -> continue end; mark_handoff_complete(Idx, {Idx, New}, [], Mod, _) -> Prev = node(), - Result = riak_core_ring_manager:ring_trans( 
- fun(Ring, _) -> - Owner = riak_core_ring:index_owner(Ring, Idx), - {_, NextOwner, Status} = riak_core_ring:next_owner(Ring, Idx, Mod), - NewStatus = riak_core_ring:member_status(Ring, New), - - case {Owner, NextOwner, NewStatus, Status} of - {Prev, New, _, awaiting} -> - Ring2 = riak_core_ring:handoff_complete(Ring, Idx, Mod), - %% Optimization. Only alter the local ring without - %% triggering a gossip, thus implicitly coalescing - %% multiple vnode handoff completion events. In the - %% future we should decouple vnode handoff state from - %% the ring structure in order to make gossip independent - %% of ring size. - {set_only, Ring2}; - _ -> - ignore - end - end, []), - + Result = riak_core_ring_manager:ring_trans(fun (Ring, + _) -> + Owner = + riak_core_ring:index_owner(Ring, + Idx), + {_, NextOwner, Status} = + riak_core_ring:next_owner(Ring, + Idx, + Mod), + NewStatus = + riak_core_ring:member_status(Ring, + New), + case {Owner, NextOwner, + NewStatus, Status} + of + {Prev, New, _, + awaiting} -> + Ring2 = + riak_core_ring:handoff_complete(Ring, + Idx, + Mod), + %% Optimization. Only alter the local ring without + %% triggering a gossip, thus implicitly coalescing + %% multiple vnode handoff completion events. In the + %% future we should decouple vnode handoff state from + %% the ring structure in order to make gossip independent + %% of ring size. 
+ {set_only, Ring2}; + _ -> ignore + end + end, + []), case Result of - {ok, NewRing} -> - NewRing = NewRing; - _ -> - {ok, NewRing} = riak_core_ring_manager:get_my_ring() + {ok, NewRing} -> NewRing = NewRing; + _ -> + {ok, NewRing} = riak_core_ring_manager:get_my_ring() end, - Owner = riak_core_ring:index_owner(NewRing, Idx), - {_, NextOwner, Status} = riak_core_ring:next_owner(NewRing, Idx, Mod), + {_, NextOwner, Status} = + riak_core_ring:next_owner(NewRing, Idx, Mod), NewStatus = riak_core_ring:member_status(NewRing, New), - case {Owner, NextOwner, NewStatus, Status} of - {_, _, invalid, _} -> - %% Handing off to invalid node, don't give-up data. - continue; - {Prev, New, _, _} -> - forward; - {Prev, _, _, _} -> - %% Handoff wasn't to node that is scheduled in next, so no change. - continue; - {_, _, _, _} -> - shutdown + {_, _, invalid, _} -> + %% Handing off to invalid node, don't give-up data. + continue; + {Prev, New, _, _} -> forward; + {Prev, _, _, _} -> + %% Handoff wasn't to node that is scheduled in next, so no change. + continue; + {_, _, _, _} -> shutdown end. -finish_handoff(State) -> - finish_handoff([], State). - -finish_handoff(SeenIdxs, State=#state{mod=Mod, - modstate=ModState, - index=Idx, - handoff_target=Target, - handoff_type=HOType}) -> - case mark_handoff_complete(Idx, Target, SeenIdxs, Mod, HOType) of - continue -> - continue(State#state{handoff_target=none,handoff_type=undefined}); - resize -> - CurrentForwarding = resize_forwarding(State), - NewForwarding = [Target | CurrentForwarding], - State2 = mod_set_forwarding(NewForwarding, State), - continue(State2#state{handoff_target=none, - handoff_type=undefined, - forward=NewForwarding}); - Res when Res == forward; Res == shutdown -> - {_, HN} = Target, - %% Have to issue the delete now. Once unregistered the - %% vnode master will spin up a new vnode on demand. - %% Shutdown the async pool beforehand, don't want callbacks - %% running on non-existant data. 
- maybe_shutdown_pool(State), - {ok, NewModState} = Mod:delete(ModState), - logger:debug("~p ~p vnode finished handoff and deleted.", - [Idx, Mod]), - riak_core_vnode_manager:unregister_vnode(Idx, Mod), - logger:debug("vnode hn/fwd :: ~p/~p :: ~p -> ~p~n", - [State#state.mod, State#state.index, State#state.forward, HN]), - State2 = mod_set_forwarding(HN, State), - continue(State2#state{modstate={deleted,NewModState}, % like to fail if used - handoff_target=none, - handoff_type=undefined, - forward=HN}) +finish_handoff(State) -> finish_handoff([], State). + +finish_handoff(SeenIdxs, + State = #state{mod = Module, modstate = ModState, + index = Idx, handoff_target = Target, + handoff_type = HOType}) -> + case mark_handoff_complete(Idx, Target, SeenIdxs, + Module, HOType) + of + continue -> + continue(State#state{handoff_target = none, + handoff_type = undefined}); + resize -> + CurrentForwarding = resize_forwarding(State), + NewForwarding = [Target | CurrentForwarding], + State2 = mod_set_forwarding(NewForwarding, State), + continue(State2#state{handoff_target = none, + handoff_type = undefined, + forward = NewForwarding}); + Res when Res == forward; Res == shutdown -> + {_, HN} = Target, + %% Have to issue the delete now. Once unregistered the + %% vnode master will spin up a new vnode on demand. + %% Shutdown the async pool beforehand, don't want callbacks + %% running on non-existant data. + maybe_shutdown_pool(State), + {ok, NewModState} = Module:delete(ModState), + logger:debug("~p ~p vnode finished handoff and deleted.", + [Idx, Module]), + riak_core_vnode_manager:unregister_vnode(Idx, Module), + logger:debug("vnode hn/fwd :: ~p/~p :: ~p -> ~p~n", + [State#state.mod, State#state.index, + State#state.forward, HN]), + State2 = mod_set_forwarding(HN, State), + continue(State2#state{modstate = + {deleted, + NewModState}, % like to fail if used + handoff_target = none, handoff_type = undefined, + forward = HN}) end. 
-maybe_shutdown_pool(#state{pool_pid=Pool}) -> +maybe_shutdown_pool(#state{pool_pid = Pool}) -> case is_pid(Pool) of - true -> - %% state.pool_pid will be cleaned up by handle_info message. - riak_core_vnode_worker_pool:shutdown_pool(Pool, 60000); - _ -> - ok + true -> + %% state.pool_pid will be cleaned up by handle_info message. + riak_core_vnode_worker_pool:shutdown_pool(Pool, 60000); + _ -> ok end. -resize_forwarding(#state{forward=F}) when is_list(F) -> +resize_forwarding(#state{forward = F}) + when is_list(F) -> F; -resize_forwarding(_) -> - []. +resize_forwarding(_) -> []. mark_delete_complete(Idx, Mod) -> - Result = riak_core_ring_manager:ring_trans( - fun(Ring, _) -> - Type = riak_core_ring:vnode_type(Ring, Idx), - {_, Next, Status} = riak_core_ring:next_owner(Ring, Idx), - case {Type, Next, Status} of - {resized_primary, '$delete', awaiting} -> - Ring3 = riak_core_ring:deletion_complete(Ring, Idx, Mod), - %% Use local ring optimization like mark_handoff_complete - {set_only, Ring3}; - {{fallback, _}, '$delete', awaiting} -> - Ring3 = riak_core_ring:deletion_complete(Ring, Idx, Mod), - %% Use local ring optimization like mark_handoff_complete - {set_only, Ring3}; - _ -> - ignore - end - end, - []), + Result = riak_core_ring_manager:ring_trans(fun (Ring, + _) -> + Type = + riak_core_ring:vnode_type(Ring, + Idx), + {_, Next, Status} = + riak_core_ring:next_owner(Ring, + Idx), + case {Type, Next, Status} + of + {resized_primary, + '$delete', + awaiting} -> + Ring3 = + riak_core_ring:deletion_complete(Ring, + Idx, + Mod), + %% Use local ring optimization like mark_handoff_complete + {set_only, Ring3}; + {{fallback, _}, + '$delete', + awaiting} -> + Ring3 = + riak_core_ring:deletion_complete(Ring, + Idx, + Mod), + %% Use local ring optimization like mark_handoff_complete + {set_only, Ring3}; + _ -> ignore + end + end, + []), Result. 
handle_event({set_forwarding, undefined}, _StateName, - State=#state{modstate={deleted, _ModState}}) -> + State = #state{modstate = {deleted, _ModState}}) -> %% The vnode must forward requests when in the deleted state, therefore %% ignore requests to stop forwarding. continue(State); -handle_event({set_forwarding, ForwardTo}, _StateName, State) -> +handle_event({set_forwarding, ForwardTo}, _StateName, + State) -> logger:debug("vnode fwd :: ~p/~p :: ~p -> ~p~n", - [State#state.mod, State#state.index, State#state.forward, ForwardTo]), + [State#state.mod, State#state.index, + State#state.forward, ForwardTo]), State2 = mod_set_forwarding(ForwardTo, State), - continue(State2#state{forward=ForwardTo}); + continue(State2#state{forward = ForwardTo}); handle_event(finish_handoff, _StateName, - State=#state{modstate={deleted, _ModState}}) -> + State = #state{modstate = {deleted, _ModState}}) -> stop_manager_event_timer(State), - continue(State#state{handoff_target=none}); -handle_event(finish_handoff, _StateName, State=#state{mod=Mod, - modstate=ModState, - handoff_target=Target}) -> + continue(State#state{handoff_target = none}); +handle_event(finish_handoff, _StateName, + State = #state{mod = Module, modstate = ModState, + handoff_target = Target}) -> stop_manager_event_timer(State), case Target of - none -> - continue(State); - _ -> - {ok, NewModState} = Mod:handoff_finished(Target, ModState), - finish_handoff(State#state{modstate=NewModState}) + none -> continue(State); + _ -> + {ok, NewModState} = Module:handoff_finished(Target, + ModState), + finish_handoff(State#state{modstate = NewModState}) end; -handle_event(cancel_handoff, _StateName, State=#state{mod=Mod, - modstate=ModState}) -> +handle_event(cancel_handoff, _StateName, + State = #state{mod = Module, modstate = ModState}) -> %% it would be nice to pass {Err, Reason} to the vnode but the %% API doesn't currently allow for that. 
stop_manager_event_timer(State), case State#state.handoff_target of - none -> - continue(State); - _ -> - {ok, NewModState} = Mod:handoff_cancelled(ModState), - continue(State#state{handoff_target=none, - handoff_type=undefined, - modstate=NewModState}) + none -> continue(State); + _ -> + {ok, NewModState} = Module:handoff_cancelled(ModState), + continue(State#state{handoff_target = none, + handoff_type = undefined, + modstate = NewModState}) end; -handle_event({trigger_handoff, TargetNode}, StateName, State) -> - handle_event({trigger_handoff, State#state.index, TargetNode}, StateName, State); -handle_event({trigger_handoff, _TargetIdx, _TargetNode}, _StateName, - State=#state{modstate={deleted, _ModState}}) -> +handle_event({trigger_handoff, TargetNode}, StateName, + State) -> + handle_event({trigger_handoff, State#state.index, + TargetNode}, + StateName, State); +handle_event({trigger_handoff, _TargetIdx, _TargetNode}, + _StateName, + State = #state{modstate = {deleted, _ModState}}) -> continue(State); -handle_event(R={trigger_handoff, _TargetIdx, _TargetNode}, _StateName, State) -> +handle_event(R = {trigger_handoff, _TargetIdx, + _TargetNode}, + _StateName, State) -> active(R, State); -handle_event(trigger_delete, _StateName, State=#state{modstate={deleted,_}}) -> +handle_event(trigger_delete, _StateName, + State = #state{modstate = {deleted, _}}) -> continue(State); handle_event(trigger_delete, _StateName, State) -> active(trigger_delete, State); -handle_event(R=#riak_vnode_req_v1{}, _StateName, State) -> +handle_event(R = #riak_vnode_req_v1{}, _StateName, + State) -> active(R, State); -handle_event(R=#riak_coverage_req_v1{}, _StateName, State) -> +handle_event(R = #riak_coverage_req_v1{}, _StateName, + State) -> active(R, State). 
+%%handle_sync_event +%%%%%%%%%%%%%%%%%%%% -handle_sync_event(current_state, _From, StateName, State) -> +handle_sync_event(current_state, _From, StateName, + State) -> {reply, {StateName, State}, StateName, State}; handle_sync_event(get_mod_index, _From, StateName, - State=#state{index=Idx,mod=Mod}) -> - {reply, {Mod, Idx}, StateName, State, State#state.inactivity_timeout}; -handle_sync_event({handoff_data,_BinObj}, _From, StateName, - State=#state{modstate={deleted, _ModState}}) -> + State = #state{index = Idx, mod = Mod}) -> + {reply, {Mod, Idx}, StateName, State, + State#state.inactivity_timeout}; +handle_sync_event({handoff_data, _BinObj}, _From, + StateName, + State = #state{modstate = {deleted, _ModState}}) -> {reply, {error, vnode_exiting}, StateName, State, State#state.inactivity_timeout}; -handle_sync_event({handoff_data,BinObj}, _From, StateName, - State=#state{mod=Mod, modstate=ModState}) -> - case Mod:handle_handoff_data(BinObj, ModState) of - {reply, ok, NewModState} -> - {reply, ok, StateName, State#state{modstate=NewModState}, - State#state.inactivity_timeout}; - {reply, {error, Err}, NewModState} -> - logger:error("~p failed to store handoff obj: ~p", [Mod, Err]), - {reply, {error, Err}, StateName, State#state{modstate=NewModState}, - State#state.inactivity_timeout} +handle_sync_event({handoff_data, BinObj}, _From, + StateName, + State = #state{mod = Module, modstate = ModState}) -> + case Module:handle_handoff_data(BinObj, ModState) of + {reply, ok, NewModState} -> + {reply, ok, StateName, + State#state{modstate = NewModState}, + State#state.inactivity_timeout}; + {reply, {error, Err}, NewModState} -> + logger:error("~p failed to store handoff obj: ~p", + [Module, Err]), + {reply, {error, Err}, StateName, + State#state{modstate = NewModState}, + State#state.inactivity_timeout} end; -handle_sync_event(core_status, _From, StateName, State=#state{index=Index, - mod=Mod, - modstate=ModState, - handoff_target=HT, - forward=FN}) -> 
+handle_sync_event(core_status, _From, StateName, + State = #state{index = Index, mod = Module, + modstate = ModState, handoff_target = HT, + forward = FN}) -> Mode = case {FN, HT} of - {undefined, none} -> - active; - {undefined, HT} -> - handoff; - {FN, none} -> - forward; - _ -> - undefined + {undefined, none} -> active; + {undefined, HT} -> handoff; + {FN, none} -> forward; + _ -> undefined end, - Status = [{index, Index}, {mod, Mod}] ++ - case FN of - undefined -> - []; - _ -> - [{forward, FN}] - end++ - case HT of - none -> - []; - _ -> - [{handoff_target, HT}] - end ++ - case ModState of - {deleted, _} -> - [deleted]; - _ -> - [] - end, - {reply, {Mode, Status}, StateName, State, State#state.inactivity_timeout}. - -handle_info({'$vnode_proxy_ping', From, Ref, Msgs}, StateName, State) -> - riak_core_vnode_proxy:cast(From, {vnode_proxy_pong, Ref, Msgs}), - {next_state, StateName, State, State#state.inactivity_timeout}; - -handle_info({'EXIT', Pid, Reason}, - _StateName, - State=#state{mod=Mod, - index=Index, - pool_pid=Pid, - pool_config=PoolConfig}) -> + Status = [{index, Index}, {mod, Module}] ++ + case FN of + undefined -> []; + _ -> [{forward, FN}] + end + ++ + case HT of + none -> []; + _ -> [{handoff_target, HT}] + end + ++ + case ModState of + {deleted, _} -> [deleted]; + _ -> [] + end, + {reply, {Mode, Status}, StateName, State, + State#state.inactivity_timeout}. 
+ +%%handle_info +%%%%%%%%%%%%%% + +handle_info({'$vnode_proxy_ping', From, Ref, Msgs}, + StateName, State) -> + riak_core_vnode_proxy:cast(From, + {vnode_proxy_pong, Ref, Msgs}), + {next_state, StateName, State, + State#state.inactivity_timeout}; +handle_info({'EXIT', Pid, Reason}, _StateName, + State = #state{mod = Module, index = Index, + pool_pid = Pid, pool_config = PoolConfig}) -> case Reason of - Reason when Reason == normal; Reason == shutdown -> - continue(State#state{pool_pid=undefined}); - _ -> - logger:error("~p ~p worker pool crashed ~p\n", [Index, Mod, Reason]), - {pool, WorkerModule, PoolSize, WorkerArgs}=PoolConfig, - logger:debug("starting worker pool ~p with size " - "of ~p for vnode ~p.", - [WorkerModule, PoolSize, Index]), - {ok, NewPoolPid} = - riak_core_vnode_worker_pool:start_link(WorkerModule, - PoolSize, - Index, - WorkerArgs, - worker_props), - continue(State#state{pool_pid=NewPoolPid}) - end; - -handle_info({'DOWN',_Ref,process,_Pid,normal}, _StateName, - State=#state{modstate={deleted, _}}) -> + Reason when Reason == normal; Reason == shutdown -> + continue(State#state{pool_pid = undefined}); + _ -> + logger:error("~p ~p worker pool crashed ~p\n", + [Index, Module, Reason]), + {pool, WorkerModule, PoolSize, WorkerArgs} = PoolConfig, + logger:debug("starting worker pool ~p with size of " + "~p for vnode ~p.", + [WorkerModule, PoolSize, Index]), + {ok, NewPoolPid} = + riak_core_vnode_worker_pool:start_link(WorkerModule, + PoolSize, Index, + WorkerArgs, worker_props), + continue(State#state{pool_pid = NewPoolPid}) + end; +handle_info({'DOWN', _Ref, process, _Pid, normal}, + _StateName, State = #state{modstate = {deleted, _}}) -> %% these messages are produced by riak_kv_vnode's aae tree %% monitors; they are harmless, so don't yell about them. 
also %% only dustbin them in the deleted modstate, because pipe vnodes %% need them in other states continue(State); handle_info(Info, _StateName, - State=#state{mod=Mod,modstate={deleted, _},index=Index}) -> - logger:info("~p ~p ignored handle_info ~p - vnode unregistering\n", - [Index, Mod, Info]), + State = #state{mod = Module, modstate = {deleted, _}, + index = Index}) -> + logger:info("~p ~p ignored handle_info ~p - vnode " + "unregistering\n", + [Index, Module, Info]), continue(State); -handle_info({'EXIT', Pid, Reason}, StateName, State=#state{mod=Mod,modstate=ModState}) -> +handle_info({'EXIT', Pid, Reason}, StateName, + State = #state{mod = Module, modstate = ModState}) -> %% A linked processes has died so use the %% handle_exit callback to allow the vnode %% process to take appropriate action. %% If the function is not implemented default %% to crashing the process. - try - case Mod:handle_exit(Pid, Reason, ModState) of - {noreply,NewModState} -> - {next_state, StateName, State#state{modstate=NewModState}, - State#state.inactivity_timeout}; - {stop, Reason1, NewModState} -> - {stop, Reason1, State#state{modstate=NewModState}} + try case Module:handle_exit(Pid, Reason, ModState) of + {noreply, NewModState} -> + {next_state, StateName, + State#state{modstate = NewModState}, + State#state.inactivity_timeout}; + {stop, Reason1, NewModState} -> + {stop, Reason1, State#state{modstate = NewModState}} end catch - _ErrorType:undef -> - {stop, linked_process_crash, State} + _ErrorType:undef -> {stop, linked_process_crash, State} end; +handle_info(Info, StateName, + State = #state{mod = Module, modstate = ModState}) -> + case erlang:function_exported(Module, handle_info, 2) of + true -> + {ok, NewModState} = Module:handle_info(Info, ModState), + {next_state, StateName, + State#state{modstate = NewModState}, + State#state.inactivity_timeout}; + false -> + {next_state, StateName, State, + State#state.inactivity_timeout} + end. 
-handle_info(Info, StateName, State=#state{mod=Mod,modstate=ModState}) -> - case erlang:function_exported(Mod, handle_info, 2) of - true -> - {ok, NewModState} = Mod:handle_info(Info, ModState), - {next_state, StateName, State#state{modstate=NewModState}, - State#state.inactivity_timeout}; - false -> - {next_state, StateName, State, State#state.inactivity_timeout} +%% ======================== +%% ======== +%% Internal Helper Functions +%% ======== +%% ======================== +do_init(State = #state{index = Index, mod = Module, + forward = Forward}) -> + {ModState, Props} = case Module:init([Index]) of + {ok, MS} -> {MS, []}; + {ok, MS, P} -> {MS, P}; + {error, R} -> {error, R} + end, + case {ModState, Props} of + {error, Reason} -> {error, Reason}; + _ -> + PoolConfig = case lists:keyfind(pool, 1, Props) of + {pool, WorkerModule, PoolSize, WorkerArgs} = PoolCfg -> + logger:debug("starting worker pool ~p with size of " + "~p~n", + [WorkerModule, PoolSize]), + {ok, PoolPid} = + riak_core_vnode_worker_pool:start_link(WorkerModule, + PoolSize, + Index, + WorkerArgs, + worker_props), + PoolCfg; + _ -> PoolPid = undefined + end, + riak_core_handoff_manager:remove_exclusion(Module, + Index), + Timeout = application:get_env(riak_core, + vnode_inactivity_timeout, + ?DEFAULT_TIMEOUT), + Timeout2 = Timeout + rand:uniform(Timeout), + State2 = State#state{modstate = ModState, + inactivity_timeout = Timeout2, + pool_pid = PoolPid, pool_config = PoolConfig}, + logger:debug("vnode :: ~p/~p :: ~p~n", + [Module, Index, Forward]), + State3 = mod_set_forwarding(Forward, State2), + {ok, State3} end. -terminate(Reason, _StateName, #state{mod=Mod, modstate=ModState, - pool_pid=Pool}) -> - %% Shutdown if the pool is still alive and a normal `Reason' is - %% given - there could be a race on delivery of the unregistered - %% event and successfully shutting down the pool. 
- try - case is_pid(Pool) andalso is_process_alive(Pool) andalso ?normal_reason(Reason) of - true -> - riak_core_vnode_worker_pool:shutdown_pool(Pool, 60000); - _ -> - ok - end - catch Type:Reason:Stacktrace -> - logger:error("Error while shutting down vnode worker pool ~p:~p trace : ~p", - [Type, Reason, Stacktrace]) - after - case ModState of - %% Handoff completed, Mod:delete has been called, now terminate. - {deleted, ModState1} -> - Mod:terminate(Reason, ModState1); - _ -> - Mod:terminate(Reason, ModState) - end +continue(State) -> + {next_state, active, State, + State#state.inactivity_timeout}. + +continue(State, NewModState) -> + continue(State#state{modstate = NewModState}). + +%% Active vnodes operate in three states: normal, handoff, and forwarding. +%% +%% In the normal state, vnode commands are passed to handle_command. When +%% a handoff is triggered, handoff_target is set and the vnode +%% is said to be in the handoff state. +%% +%% In the handoff state, vnode commands are passed to handle_handoff_command. +%% However, a vnode may be blocked during handoff (and therefore not servicing +%% commands) if the handoff procedure is blocking (eg. in riak_kv when not +%% using async fold). +%% +%% After handoff, a vnode may move into forwarding state. The forwarding state +%% is a product of the new gossip/membership code and will not occur if the +%% node is running in legacy mode. The forwarding state represents the case +%% where the vnode has already handed its data off to the new owner, but the +%% new owner is not yet listed as the current owner in the ring. This may occur +%% because additional vnodes are still waiting to handoff their data to the +%% new owner, or simply because the ring has yet to converge on the new owner. +%% In the forwarding state, all vnode commands and coverage commands are +%% forwarded to the new owner for processing. 
+%% +%% The above becomes a bit more complicated when the vnode takes part in resizing +%% the ring, since several transfers with a single vnode as the source are necessary +%% to complete the operation. A vnode will remain in the handoff state, for, potentially, +%% more than one transfer and may be in the handoff state despite there being no active +%% transfers with this vnode as the source. During this time requests that can be forwarded +%% to a partition for which the transfer has already completed, are forwarded. All other +%% requests are passed to handle_handoff_command. +forward_or_vnode_command(Sender, Request, + State = #state{forward = Forward, mod = Module, + index = Index}) -> + Resizing = is_list(Forward), + RequestHash = case Resizing of + true -> Module:request_hash(Request); + false -> undefined + end, + case {Forward, RequestHash} of + %% typical vnode operation, no forwarding set, handle request locally + {undefined, _} -> vnode_command(Sender, Request, State); + %% implicit forwarding after ownership transfer/hinted handoff + {F, _} when not is_list(F) -> + vnode_forward(implicit, {Index, Forward}, Sender, + Request, State), + continue(State); + %% during resize we can't forward a request w/o request hash, always handle locally + {_, undefined} -> vnode_command(Sender, Request, State); + %% possible forwarding during ring resizing + {_, _} -> + {ok, R} = riak_core_ring_manager:get_my_ring(), + FutureIndex = riak_core_ring:future_index(RequestHash, + Index, R), + vnode_resize_command(Sender, Request, FutureIndex, + State) end. -code_change(_OldVsn, StateName, State, _Extra) -> - {ok, StateName, State}. 
+vnode_command(_Sender, _Request, + State = #state{modstate = {deleted, _}}) -> + continue(State); +vnode_command(Sender, Request, + State = #state{mod = Module, modstate = ModState, + pool_pid = Pool}) -> + case catch Module:handle_command(Request, Sender, + ModState) + of + {'EXIT', ExitReason} -> + reply(Sender, {vnode_error, ExitReason}), + logger:error("~p command failed ~p", + [Module, ExitReason]), + {stop, ExitReason, State#state{modstate = ModState}}; + continue -> continue(State, ModState); + {reply, Reply, NewModState} -> + reply(Sender, Reply), continue(State, NewModState); + {noreply, NewModState} -> continue(State, NewModState); + {async, Work, From, NewModState} -> + %% dispatch some work to the vnode worker pool + %% the result is sent back to 'From' + riak_core_vnode_worker_pool:handle_work(Pool, Work, + From), + continue(State, NewModState); + {stop, Reason, NewModState} -> + {stop, Reason, State#state{modstate = NewModState}} + end. + +vnode_coverage(Sender, Request, KeySpaces, + State = #state{index = Index, mod = Module, + modstate = ModState, pool_pid = Pool, + forward = Forward}) -> + %% Check if we should forward + case Forward of + undefined -> + Action = Module:handle_coverage(Request, KeySpaces, + Sender, ModState); + %% handle coverage requests locally during ring resize + Forwards when is_list(Forwards) -> + Action = Module:handle_coverage(Request, KeySpaces, + Sender, ModState); + NextOwner -> + logger:debug("Forwarding coverage ~p -> ~p: ~p~n", + [node(), NextOwner, Index]), + riak_core_vnode_master:coverage(Request, + {Index, NextOwner}, KeySpaces, Sender, + riak_core_vnode_master:reg_name(Module)), + Action = continue + end, + case Action of + continue -> continue(State, ModState); + {reply, Reply, NewModState} -> + reply(Sender, Reply), continue(State, NewModState); + {noreply, NewModState} -> continue(State, NewModState); + {async, Work, From, NewModState} -> + %% dispatch some work to the vnode worker pool + %% the result is sent 
back to 'From' + riak_core_vnode_worker_pool:handle_work(Pool, Work, + From), + continue(State, NewModState); + {stop, Reason, NewModState} -> + {stop, Reason, State#state{modstate = NewModState}} + end. + +vnode_handoff_command(Sender, Request, ForwardTo, + State = #state{mod = Module, modstate = ModState, + handoff_target = HOTarget, + handoff_type = HOType, pool_pid = Pool}) -> + case Module:handle_handoff_command(Request, Sender, + ModState) + of + {reply, Reply, NewModState} -> + reply(Sender, Reply), continue(State, NewModState); + {noreply, NewModState} -> continue(State, NewModState); + {async, Work, From, NewModState} -> + %% dispatch some work to the vnode worker pool + %% the result is sent back to 'From' + riak_core_vnode_worker_pool:handle_work(Pool, Work, + From), + continue(State, NewModState); + {forward, NewModState} -> + forward_request(HOType, Request, HOTarget, ForwardTo, + Sender, State), + continue(State, NewModState); + {forward, NewReq, NewModState} -> + forward_request(HOType, NewReq, HOTarget, ForwardTo, + Sender, State), + continue(State, NewModState); + {drop, NewModState} -> continue(State, NewModState); + {stop, Reason, NewModState} -> + {stop, Reason, State#state{modstate = NewModState}} + end. + +%% @private wrap the request for resize forwards, and use the resize +%% target. +forward_request(resize, Request, _HOTarget, + ResizeTarget, Sender, State) -> + %% resize op and transfer ongoing + vnode_forward(resize, ResizeTarget, Sender, + {resize_forward, Request}, State); +forward_request(undefined, Request, _HOTarget, + ResizeTarget, Sender, State) -> + %% resize op ongoing, no resize transfer ongoing, arrive here + %% via forward_or_vnode_command + vnode_forward(resize, ResizeTarget, Sender, + {resize_forward, Request}, State); +forward_request(_, Request, HOTarget, _ResizeTarget, + Sender, State) -> + %% normal explicit forwarding during owhership transfer + vnode_forward(explicit, HOTarget, Sender, Request, + State). 
+ +vnode_forward(Type, ForwardTo, Sender, Request, + State) -> + logger:debug("Forwarding (~p) {~p,~p} -> ~p~n", + [Type, State#state.index, node(), ForwardTo]), + riak_core_vnode_master:command_unreliable(ForwardTo, + Request, Sender, + riak_core_vnode_master:reg_name(State#state.mod)). + +%% @doc during ring resizing if we have completed a transfer to the index that will +%% handle request in future ring we forward to it. Otherwise we delegate +%% to the local vnode like other requests during handoff +vnode_resize_command(Sender, Request, FutureIndex, + State = #state{forward = Forward}) + when is_list(Forward) -> + case lists:keyfind(FutureIndex, 1, Forward) of + false -> vnode_command(Sender, Request, State); + {FutureIndex, FutureOwner} -> + vnode_handoff_command(Sender, Request, + {FutureIndex, FutureOwner}, State) + end. -maybe_handoff(_TargetIdx, _TargetNode, State=#state{modstate={deleted, _}}) -> +%% This code lives in riak_core_vnode rather than riak_core_vnode_manager +%% because the ring_trans call is a synchronous call to the ring manager, +%% and it is better to block an individual vnode rather than the vnode +%% manager. Blocking the manager can impact all vnodes. This code is safe +%% to execute on multiple parallel vnodes because of the synchronization +%% afforded by having all ring changes go through the single ring manager. 
+mark_handoff_complete(SrcIdx, Target, SeenIdxs, Mod, + resize) -> + Prev = node(), + Source = {SrcIdx, Prev}, + TransFun = fun (Ring, _) -> + Owner = riak_core_ring:index_owner(Ring, SrcIdx), + Status = riak_core_ring:resize_transfer_status(Ring, + Source, + Target, + Mod), + case {Owner, Status} of + {Prev, awaiting} -> + F = fun (SeenIdx, RingAcc) -> + riak_core_ring:schedule_resize_transfer(RingAcc, + Source, + SeenIdx) + end, + Ring2 = lists:foldl(F, Ring, + ordsets:to_list(SeenIdxs)), + Ring3 = + riak_core_ring:resize_transfer_complete(Ring2, + Source, + Target, + Mod), + %% local ring optimization (see below) + {set_only, Ring3}; + _ -> ignore + end + end, + Result = riak_core_ring_manager:ring_trans(TransFun, + []), + case Result of + {ok, _NewRing} -> resize; + _ -> continue + end; +mark_handoff_complete(Idx, {Idx, New}, [], Mod, _) -> + Prev = node(), + Result = riak_core_ring_manager:ring_trans(fun (Ring, + _) -> + Owner = + riak_core_ring:index_owner(Ring, + Idx), + {_, NextOwner, Status} = + riak_core_ring:next_owner(Ring, + Idx, + Mod), + NewStatus = + riak_core_ring:member_status(Ring, + New), + case {Owner, NextOwner, + NewStatus, Status} + of + {Prev, New, _, + awaiting} -> + Ring2 = + riak_core_ring:handoff_complete(Ring, + Idx, + Mod), + %% Optimization. Only alter the local ring without + %% triggering a gossip, thus implicitly coalescing + %% multiple vnode handoff completion events. In the + %% future we should decouple vnode handoff state from + %% the ring structure in order to make gossip independent + %% of ring size. 
+ {set_only, Ring2}; + _ -> ignore + end + end, + []), + case Result of + {ok, NewRing} -> NewRing = NewRing; + _ -> + {ok, NewRing} = riak_core_ring_manager:get_my_ring() + end, + Owner = riak_core_ring:index_owner(NewRing, Idx), + {_, NextOwner, Status} = + riak_core_ring:next_owner(NewRing, Idx, Mod), + NewStatus = riak_core_ring:member_status(NewRing, New), + case {Owner, NextOwner, NewStatus, Status} of + {_, _, invalid, _} -> + %% Handing off to invalid node, don't give-up data. + continue; + {Prev, New, _, _} -> forward; + {Prev, _, _, _} -> + %% Handoff wasn't to node that is scheduled in next, so no change. + continue; + {_, _, _, _} -> shutdown + end. + +finish_handoff(State) -> finish_handoff([], State). + +finish_handoff(SeenIdxs, + State = #state{mod = Module, modstate = ModState, + index = Idx, handoff_target = Target, + handoff_type = HOType}) -> + case mark_handoff_complete(Idx, Target, SeenIdxs, + Module, HOType) + of + continue -> + continue(State#state{handoff_target = none, + handoff_type = undefined}); + resize -> + CurrentForwarding = resize_forwarding(State), + NewForwarding = [Target | CurrentForwarding], + State2 = mod_set_forwarding(NewForwarding, State), + continue(State2#state{handoff_target = none, + handoff_type = undefined, + forward = NewForwarding}); + Res when Res == forward; Res == shutdown -> + {_, HN} = Target, + %% Have to issue the delete now. Once unregistered the + %% vnode master will spin up a new vnode on demand. + %% Shutdown the async pool beforehand, don't want callbacks + %% running on non-existant data. 
+ maybe_shutdown_pool(State), + {ok, NewModState} = Module:delete(ModState), + logger:debug("~p ~p vnode finished handoff and deleted.", + [Idx, Module]), + riak_core_vnode_manager:unregister_vnode(Idx, Module), + logger:debug("vnode hn/fwd :: ~p/~p :: ~p -> ~p~n", + [State#state.mod, State#state.index, + State#state.forward, HN]), + State2 = mod_set_forwarding(HN, State), + continue(State2#state{modstate = + {deleted, + NewModState}, % like to fail if used + handoff_target = none, handoff_type = undefined, + forward = HN}) + end. + +maybe_shutdown_pool(#state{pool_pid = Pool}) -> + case is_pid(Pool) of + true -> + %% state.pool_pid will be cleaned up by handle_info message. + riak_core_vnode_worker_pool:shutdown_pool(Pool, 60000); + _ -> ok + end. + +resize_forwarding(#state{forward = F}) + when is_list(F) -> + F; +resize_forwarding(_) -> []. + +mark_delete_complete(Idx, Mod) -> + Result = riak_core_ring_manager:ring_trans(fun (Ring, + _) -> + Type = + riak_core_ring:vnode_type(Ring, + Idx), + {_, Next, Status} = + riak_core_ring:next_owner(Ring, + Idx), + case {Type, Next, Status} + of + {resized_primary, + '$delete', + awaiting} -> + Ring3 = + riak_core_ring:deletion_complete(Ring, + Idx, + Mod), + %% Use local ring optimization like mark_handoff_complete + {set_only, Ring3}; + {{fallback, _}, + '$delete', + awaiting} -> + Ring3 = + riak_core_ring:deletion_complete(Ring, + Idx, + Mod), + %% Use local ring optimization like mark_handoff_complete + {set_only, Ring3}; + _ -> ignore + end + end, + []), + Result. + +maybe_handoff(_TargetIdx, _TargetNode, + State = #state{modstate = {deleted, _}}) -> %% Modstate has been deleted, waiting for unregistered. No handoff. 
continue(State); maybe_handoff(TargetIdx, TargetNode, - State=#state{index=Idx, mod=Mod, modstate=ModState, - handoff_target=CurrentTarget, handoff_pid=HPid}) -> + State = #state{index = Idx, mod = Module, + modstate = ModState, + handoff_target = CurrentTarget, + handoff_pid = HPid}) -> Target = {TargetIdx, TargetNode}, - ExistingHO = is_pid(HPid) andalso is_process_alive(HPid), + ExistingHO = is_pid(HPid) andalso + is_process_alive(HPid), ValidHN = case CurrentTarget of - none -> - true; - Target -> - not ExistingHO; - _ -> - logger:info("~s/~b: handoff request to ~p before " - "finishing handoff to ~p", - [Mod, Idx, Target, CurrentTarget]), - not ExistingHO + none -> true; + Target -> not ExistingHO; + _ -> + logger:info("~s/~b: handoff request to ~p before " + "finishing handoff to ~p", + [Module, Idx, Target, CurrentTarget]), + not ExistingHO end, case ValidHN of - true -> - {ok, R} = riak_core_ring_manager:get_my_ring(), - Resizing = riak_core_ring:is_resizing(R), - Primary = riak_core_ring:is_primary(R, {Idx, node()}), - HOType = case {Resizing, Primary} of - {true, _} -> resize; - {_, true} -> ownership; - {_, false} -> hinted - end, - case Mod:handoff_starting({HOType, Target}, ModState) of - {true, NewModState} -> - start_handoff(HOType, TargetIdx, TargetNode,State#state{modstate=NewModState}); - {false, NewModState} -> - continue(State, NewModState) - end; - false -> - continue(State) + true -> + {ok, R} = riak_core_ring_manager:get_my_ring(), + Resizing = riak_core_ring:is_resizing(R), + Primary = riak_core_ring:is_primary(R, {Idx, node()}), + HOType = case {Resizing, Primary} of + {true, _} -> resize; + {_, true} -> ownership; + {_, false} -> hinted + end, + case Module:handoff_starting({HOType, Target}, ModState) + of + {true, NewModState} -> + start_handoff(HOType, TargetIdx, TargetNode, + State#state{modstate = NewModState}); + {false, NewModState} -> continue(State, NewModState) + end; + false -> continue(State) end. 
start_handoff(HOType, TargetIdx, TargetNode, - State=#state{mod=Mod, modstate=ModState}) -> - case Mod:is_empty(ModState) of - {true, NewModState} -> - finish_handoff(State#state{modstate=NewModState, - handoff_type=HOType, - handoff_target={TargetIdx, TargetNode}}); - {false, Size, NewModState} -> - State2 = State#state{modstate=NewModState}, - NewState = start_outbound(HOType, TargetIdx, TargetNode, [{size, Size}], State2), - continue(NewState); - {false, NewModState} -> - State2 = State#state{modstate=NewModState}, - NewState = start_outbound(HOType, TargetIdx, TargetNode, [], State2), - continue(NewState) + State = #state{mod = Module, modstate = ModState}) -> + case Module:is_empty(ModState) of + {true, NewModState} -> + finish_handoff(State#state{modstate = NewModState, + handoff_type = HOType, + handoff_target = {TargetIdx, TargetNode}}); + {false, Size, NewModState} -> + State2 = State#state{modstate = NewModState}, + NewState = start_outbound(HOType, TargetIdx, TargetNode, + [{size, Size}], State2), + continue(NewState); + {false, NewModState} -> + State2 = State#state{modstate = NewModState}, + NewState = start_outbound(HOType, TargetIdx, TargetNode, + [], State2), + continue(NewState) end. 
- -start_outbound(HOType, TargetIdx, TargetNode, Opts, State=#state{index=Idx,mod=Mod}) -> - case riak_core_handoff_manager:add_outbound(HOType,Mod,Idx,TargetIdx,TargetNode,self(),Opts) of - {ok, Pid} -> - State#state{handoff_pid=Pid, - handoff_type=HOType, - handoff_target={TargetIdx, TargetNode}}; - {error,_Reason} -> - {ok, NewModState} = Mod:handoff_cancelled(State#state.modstate), - State#state{modstate=NewModState} +start_outbound(HOType, TargetIdx, TargetNode, Opts, + State = #state{index = Idx, mod = Module}) -> + case riak_core_handoff_manager:add_outbound(HOType, + Module, Idx, TargetIdx, + TargetNode, self(), Opts) + of + {ok, Pid} -> + State#state{handoff_pid = Pid, handoff_type = HOType, + handoff_target = {TargetIdx, TargetNode}}; + {error, _Reason} -> + {ok, NewModState} = + Module:handoff_cancelled(State#state.modstate), + State#state{modstate = NewModState} end. -%% @doc Send a reply to a vnode request. If -%% the Ref is undefined just send the reply -%% for compatibility with pre-0.12 requestors. -%% If Ref is defined, send it along with the -%% reply. -%% NOTE: We *always* send the reply using unreliable delivery. -%% --spec reply(sender(), term()) -> any(). -reply({fsm, undefined, From}, Reply) -> - riak_core_send_msg:send_event_unreliable(From, Reply); -reply({fsm, Ref, From}, Reply) -> - riak_core_send_msg:send_event_unreliable(From, {Ref, Reply}); -reply({server, undefined, From}, Reply) -> - riak_core_send_msg:reply_unreliable(From, Reply); -reply({server, Ref, From}, Reply) -> - riak_core_send_msg:reply_unreliable(From, {Ref, Reply}); -reply({raw, Ref, From}, Reply) -> - riak_core_send_msg:bang_unreliable(From, {Ref, Reply}); -reply(ignore, _Reply) -> - ok. - -%% @doc Set up a monitor for the pid named by a {@type sender()} vnode -%% argument. If `Sender' was the atom `ignore', this function sets up -%% a monitor on `self()' in order to return a valid (if useless) -%% monitor reference. 
--spec monitor(Sender::sender()) -> Monitor::reference(). -monitor({fsm, _, From}) -> - erlang:monitor(process, From); -monitor({server, _, {Pid, _Ref}}) -> - erlang:monitor(process, Pid); -monitor({raw, _, From}) -> - erlang:monitor(process, From); -monitor(ignore) -> - erlang:monitor(process, self()). - %% Individual vnode processes and the vnode manager are tightly coupled. When %% vnode events occur, the vnode must ensure that the events are forwarded to %% the vnode manager, which will make a state change decision and send an @@ -1050,94 +1604,118 @@ monitor(ignore) -> %% messages sent by the vnode. Therefore, the vnode periodically resends event %% messages until an appropriate message is received back from the vnode %% manager. The event timer functions below implement this logic. -start_manager_event_timer(Event, State=#state{mod=Mod, index=Idx}) -> - riak_core_vnode_manager:vnode_event(Mod, Idx, self(), Event), +start_manager_event_timer(Event, + State = #state{mod = Module, index = Idx}) -> + riak_core_vnode_manager:vnode_event(Module, Idx, self(), + Event), stop_manager_event_timer(State), - T2 = gen_fsm:send_event_after(30000, {send_manager_event, Event}), - State#state{manager_event_timer=T2}. + T2 = gen_fsm_compat:send_event_after(30000, + {send_manager_event, Event}), + State#state{manager_event_timer = T2}. -stop_manager_event_timer(#state{manager_event_timer=undefined}) -> +stop_manager_event_timer(#state{manager_event_timer = + undefined}) -> ok; -stop_manager_event_timer(#state{manager_event_timer=T}) -> - _ = gen_fsm:cancel_timer(T), - ok. +stop_manager_event_timer(#state{manager_event_timer = + T}) -> + _ = gen_fsm_compat:cancel_timer(T), ok. 
-mod_set_forwarding(_Forward, State=#state{modstate={deleted,_}}) -> +mod_set_forwarding(_Forward, + State = #state{modstate = {deleted, _}}) -> State; -mod_set_forwarding(Forward, State=#state{mod=Mod, modstate=ModState}) -> - case lists:member({set_vnode_forwarding, 2}, Mod:module_info(exports)) of - true -> - NewModState = Mod:set_vnode_forwarding(Forward, ModState), - State#state{modstate=NewModState}; - false -> - State +mod_set_forwarding(Forward, + State = #state{mod = Module, modstate = ModState}) -> + case lists:member({set_vnode_forwarding, 2}, + Module:module_info(exports)) + of + true -> + NewModState = Module:set_vnode_forwarding(Forward, + ModState), + State#state{modstate = NewModState}; + false -> State end. %% =================================================================== %% Test API %% =================================================================== +-type state() :: #state{}. + %% @doc Reveal the underlying module state for testing --spec(get_modstate(pid()) -> {atom(), #state{}}). +-spec get_modstate(pid()) -> {atom(), state()}. + get_modstate(Pid) -> - {_StateName, State} = gen_fsm:sync_send_all_state_event(Pid, current_state), + {_StateName, State} = + gen_fsm_compat:sync_send_all_state_event(Pid, + current_state), {State#state.mod, State#state.modstate}. -ifdef(TEST). %% Start the garbage collection server test_link(Mod, Index) -> - gen_fsm:start_link(?MODULE, [Mod, Index, 0, node()], []). + gen_fsm_compat:start_link(?MODULE, + [Mod, Index, 0, node()], []). %% Get the current state of the fsm for testing inspection --spec current_state(pid()) -> {atom(), #state{}} | {error, term()}. +-spec current_state(pid()) -> {atom(), state()} | + {error, term()}. + current_state(Pid) -> - gen_fsm:sync_send_all_state_event(Pid, current_state). + gen_fsm_compat:sync_send_all_state_event(Pid, + current_state). + +wait_for_process_death(Pid) -> + wait_for_process_death(Pid, is_process_alive(Pid)). 
+ +wait_for_process_death(Pid, true) -> + wait_for_process_death(Pid, is_process_alive(Pid)); +wait_for_process_death(_Pid, false) -> ok. + +wait_for_state_update(OriginalStateData, Pid) -> + {_, CurrentStateData} = (?MODULE):current_state(Pid), + wait_for_state_update(OriginalStateData, + CurrentStateData, Pid). + +wait_for_state_update(OriginalStateData, + OriginalStateData, Pid) -> + {_, CurrentStateData} = (?MODULE):current_state(Pid), + wait_for_state_update(OriginalStateData, + CurrentStateData, Pid); +wait_for_state_update(_OriginalState, _StateData, + _Pid) -> + ok. + +%% =================================================================== +%% Test +%% =================================================================== pool_death_test() -> + %% expect error log + error_logger:tty(false), meck:unload(), meck:new(test_vnode, [non_strict, no_link]), - meck:expect(test_vnode, init, fun(_) -> {ok, [], [{pool, test_pool_mod, 1, []}]} end), - meck:expect(test_vnode, terminate, fun(_, _) -> normal end), + meck:expect(test_vnode, init, + fun (_) -> {ok, [], [{pool, test_pool_mod, 1, []}]} + end), + meck:expect(test_vnode, terminate, + fun (_, _) -> normal end), meck:new(test_pool_mod, [non_strict, no_link]), - meck:expect(test_pool_mod, init_worker, fun(_, _, _) -> {ok, []} end), - - %% expect error log - error_logger:tty(false), - - {ok, Pid} = ?MODULE:test_link(test_vnode, 0), - {_, StateData1} = ?MODULE:current_state(Pid), + meck:expect(test_pool_mod, init_worker, + fun (_, _, _) -> {ok, []} end), + {ok, Pid} = riak_core_vnode:test_link(test_vnode, 0), + {_, StateData1} = riak_core_vnode:current_state(Pid), PoolPid1 = StateData1#state.pool_pid, exit(PoolPid1, kill), wait_for_process_death(PoolPid1), - ?assertNot(is_process_alive(PoolPid1)), + ?assertNot((is_process_alive(PoolPid1))), wait_for_state_update(StateData1, Pid), - {_, StateData2} = ?MODULE:current_state(Pid), + {_, StateData2} = riak_core_vnode:current_state(Pid), PoolPid2 = StateData2#state.pool_pid, - 
?assertNot(PoolPid2 =:= undefined), + ?assertNot((PoolPid2 =:= undefined)), exit(Pid, normal), wait_for_process_death(Pid), - - error_logger:tty(false), meck:validate(test_pool_mod), meck:validate(test_vnode). -wait_for_process_death(Pid) -> - wait_for_process_death(Pid, is_process_alive(Pid)). - -wait_for_process_death(Pid, true) -> - wait_for_process_death(Pid, is_process_alive(Pid)); -wait_for_process_death(_Pid, false) -> - ok. - -wait_for_state_update(OriginalStateData, Pid) -> - {_, CurrentStateData} = ?MODULE:current_state(Pid), - wait_for_state_update(OriginalStateData, CurrentStateData, Pid). - -wait_for_state_update(OriginalStateData, OriginalStateData, Pid) -> - {_, CurrentStateData} = ?MODULE:current_state(Pid), - wait_for_state_update(OriginalStateData, CurrentStateData, Pid); -wait_for_state_update(_OriginalState, _StateData, _Pid) -> - ok. - -endif. diff --git a/src/riak_core_vnode_manager.erl b/src/riak_core_vnode_manager.erl index 295e4a5c2..d2098a569 100644 --- a/src/riak_core_vnode_manager.erl +++ b/src/riak_core_vnode_manager.erl @@ -33,75 +33,91 @@ -behaviour(gen_server). -export([start_link/0, stop/0]). --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). --export([all_vnodes/0, all_vnodes/1, all_vnodes_status/0, - force_handoffs/0, repair/3, all_handoffs/0, repair_status/1, xfer_complete/2, + +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). + +-export([all_vnodes/0, all_vnodes/1, + all_vnodes_status/0, force_handoffs/0, repair/3, + all_handoffs/0, repair_status/1, xfer_complete/2, kill_repairs/1]). --export([all_index_pid/1, get_vnode_pid/2, start_vnode/2, - unregister_vnode/2, unregister_vnode/3, vnode_event/4]). + +-export([all_index_pid/1, get_vnode_pid/2, + start_vnode/2, unregister_vnode/2, unregister_vnode/3, + vnode_event/4]). + %% Field debugging -export([get_tab/0]). -record(idxrec, {key, idx, mod, pid, monref}). + -record(monrec, {monref, key}). 
--record(xfer_status, { - status :: pending | complete, - mod_src_target :: {module(), index(), index()} - }). +-record(xfer_status, + {status :: pending | complete, + mod_src_target :: {module(), index(), index()}}). + -type xfer_status() :: #xfer_status{}. -record(repair, - { - mod_partition :: mod_partition(), - filter_mod_fun :: {module(), atom()}, - minus_one_xfer :: xfer_status(), - plus_one_xfer :: xfer_status(), - pairs :: [{index(), node()}] - }). + {mod_partition :: mod_partition(), + filter_mod_fun :: {module(), atom()}, + minus_one_xfer :: xfer_status(), + plus_one_xfer :: xfer_status(), + pairs :: [{index(), node()}]}). + -type repair() :: #repair{}. + -type repairs() :: [repair()]. --record(state, {idxtab, - forwarding :: dict:dict(), - handoff :: dict:dict(), - known_modules :: [term()], - never_started :: [{integer(), term()}], - vnode_start_tokens :: integer(), - last_ring_id :: term(), - repairs :: repairs() - }). +-record(state, + {idxtab, forwarding :: dict:dict(), + handoff :: dict:dict(), known_modules :: [term()], + never_started :: [{integer(), term()}], + vnode_start_tokens :: integer(), + last_ring_id :: term(), repairs :: repairs()}). -include("riak_core_handoff.hrl"). + -include("riak_core_vnode.hrl"). --define(XFER_EQ(A, ModSrcTgt), A#xfer_status.mod_src_target == ModSrcTgt). --define(XFER_COMPLETE(X), X#xfer_status.status == complete). + +-define(XFER_EQ(A, ModSrcTgt), + A#xfer_status.mod_src_target == ModSrcTgt). + +-define(XFER_COMPLETE(X), + X#xfer_status.status == complete). + -define(DEFAULT_OWNERSHIP_TRIGGER, 8). + -define(ETS, ets_vnode_mgr). + -define(DEFAULT_VNODE_ROLLING_START, 16). --define(LONG_TIMEOUT, 120*1000). + +-define(LONG_TIMEOUT, 120 * 1000). %% =================================================================== %% Public API %% =================================================================== start_link() -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). 
+ gen_server:start_link({local, ?MODULE}, ?MODULE, [], + []). -stop() -> - gen_server:cast(?MODULE, stop). +stop() -> gen_server:cast(?MODULE, stop). all_vnodes_status() -> gen_server:call(?MODULE, all_vnodes_status, infinity). %% @doc Repair the given `ModPartition' pair for `Service' using the %% given `FilterModFun' to filter keys. --spec repair(atom(), {module(), partition()}, {module(), atom()}) -> - {ok, Pairs::[{partition(), node()}]} | - {down, Down::[{partition(), node()}]} | - ownership_change_in_progress. -repair(Service, {_Module, Partition}=ModPartition, FilterModFun) -> +-spec repair(atom(), {module(), partition()}, + {module(), atom()}) -> {ok, + Pairs :: [{partition(), node()}]} | + {down, Down :: [{partition(), node()}]} | + ownership_change_in_progress. + +repair(Service, {_Module, Partition} = ModPartition, + FilterModFun) -> %% Fwd the request to the partition owner to guarantee that there %% is only one request per partition. {ok, Ring} = riak_core_ring_manager:get_my_ring(), @@ -110,14 +126,18 @@ repair(Service, {_Module, Partition}=ModPartition, FilterModFun) -> gen_server:call({?MODULE, Owner}, Msg, ?LONG_TIMEOUT). %% @doc Get the status of the repair process for a given `ModPartition'. --spec repair_status(mod_partition()) -> in_progress | not_found. -repair_status({_Module, Partition}=ModPartition) -> +-spec repair_status(mod_partition()) -> in_progress | + not_found. + +repair_status({_Module, Partition} = ModPartition) -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), Owner = riak_core_ring:index_owner(Ring, Partition), - gen_server:call({?MODULE, Owner}, {repair_status, ModPartition}, ?LONG_TIMEOUT). + gen_server:call({?MODULE, Owner}, + {repair_status, ModPartition}, ?LONG_TIMEOUT). %% @doc Get all handoffs known by this manager. --spec all_handoffs() -> list(known_handoff()). +-spec all_handoffs() -> [known_handoff()]. + all_handoffs() -> gen_server:call(?MODULE, all_handoffs, ?LONG_TIMEOUT). 
@@ -125,8 +145,10 @@ all_handoffs() -> %% %% TODO: second arg has specific form but maybe make proplist? -spec xfer_complete(node(), tuple()) -> ok. + xfer_complete(Origin, Xfer) -> - gen_server:call({?MODULE, Origin}, {xfer_complete, Xfer}, ?LONG_TIMEOUT). + gen_server:call({?MODULE, Origin}, + {xfer_complete, Xfer}, ?LONG_TIMEOUT). kill_repairs(Reason) -> gen_server:cast(?MODULE, {kill_repairs, Reason}). @@ -141,19 +163,23 @@ unregister_vnode(Index, VNodeMod) -> unregister_vnode(Index, self(), VNodeMod). unregister_vnode(Index, Pid, VNodeMod) -> - gen_server:cast(?MODULE, {unregister, Index, VNodeMod, Pid}). + gen_server:cast(?MODULE, + {unregister, Index, VNodeMod, Pid}). start_vnode(Index, VNodeMod) -> - gen_server:cast(?MODULE, {Index, VNodeMod, start_vnode}). + gen_server:cast(?MODULE, + {Index, VNodeMod, start_vnode}). -vnode_event(Mod, Idx, Pid, Event) -> - gen_server:cast(?MODULE, {vnode_event, Mod, Idx, Pid, Event}). +vnode_event(Module, Idx, Pid, Event) -> + gen_server:cast(?MODULE, + {vnode_event, Module, Idx, Pid, Event}). get_tab() -> gen_server:call(?MODULE, get_tab, infinity). get_vnode_pid(Index, VNodeMod) -> - gen_server:call(?MODULE, {Index, VNodeMod, get_vnode}, infinity). + gen_server:call(?MODULE, {Index, VNodeMod, get_vnode}, + infinity). %% =================================================================== %% ETS-based API: try to determine response by reading protected ETS @@ -163,49 +189,51 @@ get_vnode_pid(Index, VNodeMod) -> all_vnodes() -> case get_all_vnodes() of - [] -> - %% ETS error could produce empty list, call manager to be sure. - gen_server:call(?MODULE, all_vnodes, infinity); - Result -> - Result + [] -> + %% ETS error could produce empty list, call manager to be sure. + gen_server:call(?MODULE, all_vnodes, infinity); + Result -> Result end. -all_vnodes(Mod) -> - case get_all_vnodes(Mod) of - [] -> - %% ETS error could produce empty list, call manager to be sure. 
- gen_server:call(?MODULE, {all_vnodes, Mod}, infinity); - Result -> - Result +all_vnodes(Module) -> + case get_all_vnodes(Module) of + [] -> + %% ETS error could produce empty list, call manager to be sure. + gen_server:call(?MODULE, {all_vnodes, Module}, + infinity); + Result -> Result end. all_index_pid(VNodeMod) -> case get_all_index_pid(VNodeMod, ets_error) of - ets_error -> - gen_server:call(?MODULE, {all_index_pid, VNodeMod}, infinity); - Result -> - Result + ets_error -> + gen_server:call(?MODULE, {all_index_pid, VNodeMod}, + infinity); + Result -> Result end. %% =================================================================== %% Protected ETS Accessors %% =================================================================== -get_all_index_pid(Mod, Default) -> - try - [list_to_tuple(L) - || L <- ets:match(?ETS, {idxrec, '_', '$1', Mod, '$2', '_'})] +get_all_index_pid(Module, Default) -> + try [list_to_tuple(L) + || L + <- ets:match(?ETS, + {idxrec, '_', '$1', Module, '$2', '_'})] catch - _:_ -> - Default + _:_ -> Default end. get_all_vnodes() -> - Mods = [Mod || {_App, Mod} <- riak_core:vnode_modules()], - get_all_vnodes(Mods). - -get_all_vnodes(Mods) when is_list(Mods) -> - lists:flatmap(fun(Mod) -> get_all_vnodes(Mod) end, Mods); + Modules = [Module + || {_App, Module} <- riak_core:vnode_modules()], + get_all_vnodes(Modules). + +get_all_vnodes(Modules) when is_list(Modules) -> + lists:flatmap(fun (Module) -> get_all_vnodes(Module) + end, + Modules); get_all_vnodes(Mod) -> IdxPids = get_all_index_pid(Mod, []), [{Mod, Idx, Pid} || {Idx, Pid} <- IdxPids]. 
@@ -216,14 +244,17 @@ get_all_vnodes(Mod) -> %% @private init(_State) -> - {ok, Ring, CHBin} = riak_core_ring_manager:get_raw_ring_chashbin(), + {ok, Ring, CHBin} = + riak_core_ring_manager:get_raw_ring_chashbin(), Mods = [Mod || {_, Mod} <- riak_core:vnode_modules()], - State = #state{forwarding=dict:new(), handoff=dict:new(), - known_modules=[], never_started=[], vnode_start_tokens=0, - repairs=[]}, + State = #state{forwarding = dict:new(), + handoff = dict:new(), known_modules = [], + never_started = [], vnode_start_tokens = 0, + repairs = []}, State2 = find_vnodes(State), AllVNodes = get_all_vnodes(Mods), - State3 = update_forwarding(AllVNodes, Mods, Ring, State2), + State3 = update_forwarding(AllVNodes, Mods, Ring, + State2), State4 = update_handoff(AllVNodes, Ring, CHBin, State3), schedule_management_timer(), {ok, State4}. @@ -233,77 +264,76 @@ find_vnodes(State) -> %% Get the current list of vnodes running in the supervisor. We use this %% to rebuild our ETS table for routing messages to the appropriate %% vnode. - VnodePids = [Pid || {_, Pid, worker, _} - <- supervisor:which_children(riak_core_vnode_sup), - is_pid(Pid) andalso is_process_alive(Pid)], - IdxTable = ets:new(?ETS, [{keypos, 2}, named_table, protected]), - + VnodePids = [Pid + || {_, Pid, worker, _} + <- supervisor:which_children(riak_core_vnode_sup), + is_pid(Pid) andalso is_process_alive(Pid)], + IdxTable = ets:new(?ETS, + [{keypos, 2}, named_table, protected]), %% If the vnode manager is being restarted, scan the existing %% vnode children and work out which module and index they are %% responsible for. During startup it is possible that these %% vnodes may be shutting down as we check them if there are %% several types of vnodes active. 
- PidIdxs = lists:flatten( - [try - [{Pid, riak_core_vnode:get_mod_index(Pid)}] - catch - _:_Err -> - [] - end || Pid <- VnodePids]), - + PidIdxs = lists:flatten([try [{Pid, + riak_core_vnode:get_mod_index(Pid)}] + catch + _:_Err -> [] + end + || Pid <- VnodePids]), %% Populate the ETS table with processes running this VNodeMod (filtered %% in the list comprehension) - F = fun(Pid, Idx, Mod) -> + F = fun (Pid, Idx, Module) -> Mref = erlang:monitor(process, Pid), - #idxrec { key = {Idx,Mod}, idx = Idx, mod = Mod, pid = Pid, - monref = Mref } + #idxrec{key = {Idx, Module}, idx = Idx, mod = Module, + pid = Pid, monref = Mref} end, - IdxRecs = [F(Pid, Idx, Mod) || {Pid, {Mod, Idx}} <- PidIdxs], - MonRecs = [#monrec{monref=Mref, key=Key} - || #idxrec{key=Key, monref=Mref} <- IdxRecs], + IdxRecs = [F(Pid, Idx, Mod) + || {Pid, {Mod, Idx}} <- PidIdxs], + MonRecs = [#monrec{monref = Mref, key = Key} + || #idxrec{key = Key, monref = Mref} <- IdxRecs], true = ets:insert_new(IdxTable, IdxRecs ++ MonRecs), - State#state{idxtab=IdxTable}. + State#state{idxtab = IdxTable}. 
%% @private handle_call(all_vnodes_status, _From, State) -> Reply = get_all_vnodes_status(State), {reply, Reply, State}; handle_call(all_vnodes, _From, State) -> - Reply = get_all_vnodes(), - {reply, Reply, State}; + Reply = get_all_vnodes(), {reply, Reply, State}; handle_call({all_vnodes, Mod}, _From, State) -> - Reply = get_all_vnodes(Mod), - {reply, Reply, State}; + Reply = get_all_vnodes(Mod), {reply, Reply, State}; handle_call({all_index_pid, Mod}, _From, State) -> Reply = get_all_index_pid(Mod, []), {reply, Reply, State}; -handle_call({Partition, Mod, get_vnode}, _From, State) -> +handle_call({Partition, Mod, get_vnode}, _From, + State) -> Pid = get_vnode(Partition, Mod, State), {reply, {ok, Pid}, State}; handle_call(get_tab, _From, State) -> {reply, ets:tab2list(State#state.idxtab), State}; - -handle_call({repair, Service, {Mod,Partition}=ModPartition, FilterModFun}, - _From, #state{repairs=Repairs}=State) -> +handle_call({repair, Service, + {Mod, Partition} = ModPartition, FilterModFun}, + _From, #state{repairs = Repairs} = State) -> case get_repair(ModPartition, Repairs) of - none -> - maybe_create_repair(Partition, Service, ModPartition, FilterModFun, Mod, Repairs, State); - Repair -> - Pairs = Repair#repair.pairs, - {reply, {ok, Pairs}, State} + none -> + maybe_create_repair(Partition, Service, ModPartition, + FilterModFun, Mod, Repairs, State); + Repair -> + Pairs = Repair#repair.pairs, {reply, {ok, Pairs}, State} end; - -handle_call(all_handoffs, _From, State=#state{repairs=Repairs, handoff=HO}) -> - Handoffs=dict:to_list(HO) ++ transform_repair_records(Repairs), +handle_call(all_handoffs, _From, + State = #state{repairs = Repairs, handoff = HO}) -> + Handoffs = dict:to_list(HO) ++ + transform_repair_records(Repairs), {reply, Handoffs, State}; - -handle_call({repair_status, ModPartition}, _From, State) -> +handle_call({repair_status, ModPartition}, _From, + State) -> Repairs = State#state.repairs, case get_repair(ModPartition, Repairs) of - none -> 
{reply, not_found, State}; - #repair{} -> {reply, in_progress, State} + none -> {reply, not_found, State}; + #repair{} -> {reply, in_progress, State} end; - %% NOTE: The `xfer_complete' logic assumes two things: %% %% 1. The `xfer_complete' msg will always be sent to the owner @@ -315,80 +345,90 @@ handle_call({xfer_complete, ModSrcTgt}, _From, State) -> {Mod, _, Partition} = ModSrcTgt, ModPartition = {Mod, Partition}, case get_repair(ModPartition, Repairs) of - none -> - logger:error("Received xfer_complete for non-existing repair: ~p", - [ModPartition]), - {reply, ok, State}; - #repair{minus_one_xfer=MOX, plus_one_xfer=POX}=R -> - R2 = if ?XFER_EQ(MOX, ModSrcTgt) -> - MOX2 = MOX#xfer_status{status=complete}, - R#repair{minus_one_xfer=MOX2}; - ?XFER_EQ(POX, ModSrcTgt) -> - POX2 = POX#xfer_status{status=complete}, - R#repair{plus_one_xfer=POX2}; - true -> - logger:error("Received xfer_complete for " - "non-existing xfer: ~p", [ModSrcTgt]) - end, - - case {?XFER_COMPLETE(R2#repair.minus_one_xfer), - ?XFER_COMPLETE(R2#repair.plus_one_xfer)} of - {true, true} -> - {reply, ok, State#state{repairs=remove_repair(R2, Repairs)}}; - _ -> - {reply, ok, State#state{repairs=replace_repair(R2, Repairs)}} - end + none -> + logger:error("Received xfer_complete for non-existing " + "repair: ~p", + [ModPartition]), + {reply, ok, State}; + #repair{minus_one_xfer = MOX, plus_one_xfer = POX} = + R -> + R2 = if ?XFER_EQ(MOX, ModSrcTgt) -> + MOX2 = MOX#xfer_status{status = complete}, + R#repair{minus_one_xfer = MOX2}; + ?XFER_EQ(POX, ModSrcTgt) -> + POX2 = POX#xfer_status{status = complete}, + R#repair{plus_one_xfer = POX2}; + true -> + logger:error("Received xfer_complete for non-existing " + "xfer: ~p", + [ModSrcTgt]) + end, + case {?XFER_COMPLETE((R2#repair.minus_one_xfer)), + ?XFER_COMPLETE((R2#repair.plus_one_xfer))} + of + {true, true} -> + {reply, ok, + State#state{repairs = remove_repair(R2, Repairs)}}; + _ -> + {reply, ok, + State#state{repairs = replace_repair(R2, Repairs)}} + 
end end; - -handle_call(_, _From, State) -> - {reply, ok, State}. +handle_call(_, _From, State) -> {reply, ok, State}. transform_repair_records(Repairs) -> %% World's ugliest pattern match, simplest logic: matching %% module/node values in the `pairs' field against %% `minus_one_xfer' and `plus_one_xfer' - lists:flatten(lists:map(fun(#repair{pairs=[{M1SrcIdx, Mnode}, _FixPartition, {P1SrcIdx, Pnode}], - minus_one_xfer=#xfer_status{mod_src_target={M1Mod, M1SrcIdx, _M1DstIdx}}, - plus_one_xfer=#xfer_status{mod_src_target={P1Mod, P1SrcIdx, _P1DstIdx}}}) -> - [{{M1Mod, M1SrcIdx}, {repair, inbound, Mnode}}, - {{P1Mod, P1SrcIdx}, {repair, inbound, Pnode}}] + lists:flatten(lists:map(fun (#repair{pairs = + [{M1SrcIdx, Mnode}, _FixPartition, + {P1SrcIdx, Pnode}], + minus_one_xfer = + #xfer_status{mod_src_target = + {M1Mod, M1SrcIdx, + _M1DstIdx}}, + plus_one_xfer = + #xfer_status{mod_src_target = + {P1Mod, P1SrcIdx, + _P1DstIdx}}}) -> + [{{M1Mod, M1SrcIdx}, + {repair, inbound, Mnode}}, + {{P1Mod, P1SrcIdx}, + {repair, inbound, Pnode}}] end, Repairs)). 
-maybe_create_repair(Partition, Service, ModPartition, FilterModFun, Mod, Repairs, State) -> +maybe_create_repair(Partition, Service, ModPartition, + FilterModFun, Mod, Repairs, State) -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), case riak_core_ring:pending_changes(Ring) of - [] -> - UpNodes = riak_core_node_watcher:nodes(Service), - Pairs = repair_pairs(Ring, Partition), - case check_up(Pairs, UpNodes) of - true -> - create_repair(Pairs, ModPartition, FilterModFun, Mod, Partition, Repairs, State); - {false, Down} -> - {reply, {down, Down}, State} - end; - _ -> - {reply, ownership_change_in_progress, State} + [] -> + UpNodes = riak_core_node_watcher:nodes(Service), + Pairs = repair_pairs(Ring, Partition), + case check_up(Pairs, UpNodes) of + true -> + create_repair(Pairs, ModPartition, FilterModFun, Mod, + Partition, Repairs, State); + {false, Down} -> {reply, {down, Down}, State} + end; + _ -> {reply, ownership_change_in_progress, State} end. -create_repair(Pairs, ModPartition, FilterModFun, Mod, Partition, Repairs, State) -> +create_repair(Pairs, ModPartition, FilterModFun, Mod, + Partition, Repairs, State) -> {MOP, _} = MinusOne = get_minus_one(Pairs), {POP, _} = PlusOne = get_plus_one(Pairs), - riak_core_handoff_manager:xfer(MinusOne, - ModPartition, + riak_core_handoff_manager:xfer(MinusOne, ModPartition, FilterModFun), - riak_core_handoff_manager:xfer(PlusOne, - ModPartition, + riak_core_handoff_manager:xfer(PlusOne, ModPartition, FilterModFun), MOXStatus = #xfer_status{status = pending, mod_src_target = {Mod, MOP, Partition}}, POXStatus = #xfer_status{status = pending, mod_src_target = {Mod, POP, Partition}}, Repair = #repair{mod_partition = ModPartition, - filter_mod_fun = FilterModFun, - pairs = Pairs, - minus_one_xfer = MOXStatus, - plus_one_xfer = POXStatus}, + filter_mod_fun = FilterModFun, pairs = Pairs, + minus_one_xfer = MOXStatus, plus_one_xfer = POXStatus}, Repairs2 = Repairs ++ [Repair], State2 = State#state{repairs = Repairs2}, 
logger:debug("add repair ~p", [ModPartition]), @@ -396,168 +436,157 @@ create_repair(Pairs, ModPartition, FilterModFun, Mod, Partition, Repairs, State) %% @private handle_cast({Partition, Mod, start_vnode}, State) -> - _ = get_vnode(Partition, Mod, State), - {noreply, State}; -handle_cast({unregister, Index, Mod, Pid}, #state{idxtab=T} = State) -> + _ = get_vnode(Partition, Mod, State), {noreply, State}; +handle_cast({unregister, Index, Mod, Pid}, + #state{idxtab = T} = State) -> %% Update forwarding state to ensure vnode is not restarted in %% incorrect forwarding state if next request arrives before next %% ring event. {ok, Ring} = riak_core_ring_manager:get_my_ring(), State2 = update_forwarding({Mod, Index}, Ring, State), - ets:match_delete(T, {idxrec, {Index, Mod}, Index, Mod, Pid, '_'}), + ets:match_delete(T, + {idxrec, {Index, Mod}, Index, Mod, Pid, '_'}), _ = unregister_vnode_stats(Mod, Index), riak_core_vnode_proxy:unregister_vnode(Mod, Index, Pid), {noreply, State2}; -handle_cast({vnode_event, Mod, Idx, Pid, Event}, State) -> +handle_cast({vnode_event, Mod, Idx, Pid, Event}, + State) -> handle_vnode_event(Event, Mod, Idx, Pid, State); handle_cast(force_handoffs, State) -> AllVNodes = get_all_vnodes(), - {ok, Ring, CHBin} = riak_core_ring_manager:get_raw_ring_chashbin(), + {ok, Ring, CHBin} = + riak_core_ring_manager:get_raw_ring_chashbin(), State2 = update_handoff(AllVNodes, Ring, CHBin, State), - _ = [maybe_trigger_handoff(Mod, Idx, Pid, State2) || {Mod, Idx, Pid} <- AllVNodes], - {noreply, State2}; - handle_cast(maybe_start_vnodes, State) -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), State2 = maybe_start_vnodes(Ring, State), {noreply, State2}; - handle_cast({kill_repairs, Reason}, State) -> logger:warning("Killing all repairs: ~p", [Reason]), kill_repairs(State#state.repairs, Reason), - {noreply, State#state{repairs=[]}}; - -handle_cast(_, State) -> - {noreply, State}. 
+ {noreply, State#state{repairs = []}}; +handle_cast(_, State) -> {noreply, State}. handle_info(management_tick, State0) -> schedule_management_timer(), RingID = riak_core_ring_manager:get_ring_id(), - {ok, Ring, CHBin} = riak_core_ring_manager:get_raw_ring_chashbin(), + {ok, Ring, CHBin} = + riak_core_ring_manager:get_raw_ring_chashbin(), State = maybe_ring_changed(RingID, Ring, CHBin, State0), Mods = [Mod || {_, Mod} <- riak_core:vnode_modules()], AllVNodes = get_all_vnodes(Mods), State2 = update_handoff(AllVNodes, Ring, CHBin, State), Transfers = riak_core_ring:pending_changes(Ring), - %% Kill/cancel any repairs during ownership changes - State3 = - case Transfers of - [] -> - State2; - _ -> - Repairs = State#state.repairs, - kill_repairs(Repairs, ownership_change), - trigger_ownership_handoff(Transfers, Mods, Ring, State2), - State2#state{repairs=[]} - end, - - State4 = State3#state{vnode_start_tokens = ?DEFAULT_VNODE_ROLLING_START}, + State3 = case Transfers of + [] -> State2; + _ -> + Repairs = State#state.repairs, + kill_repairs(Repairs, ownership_change), + trigger_ownership_handoff(Transfers, Mods, Ring, + State2), + State2#state{repairs = []} + end, + State4 = State3#state{vnode_start_tokens = + ?DEFAULT_VNODE_ROLLING_START}, State5 = maybe_start_vnodes(Ring, State4), - Repairs2 = check_repairs(State4#state.repairs), - {noreply, State5#state{repairs=Repairs2}}; - + {noreply, State5#state{repairs = Repairs2}}; handle_info({'DOWN', MonRef, process, _P, _I}, State) -> - delmon(MonRef, State), - {noreply, State}. + delmon(MonRef, State), {noreply, State}. 
%% @private handle_vnode_event(inactive, Mod, Idx, Pid, State) -> maybe_trigger_handoff(Mod, Idx, Pid, State), {noreply, State}; -handle_vnode_event(handoff_complete, Mod, Idx, Pid, State) -> +handle_vnode_event(handoff_complete, Mod, Idx, Pid, + State) -> NewHO = dict:erase({Mod, Idx}, State#state.handoff), - gen_fsm:send_all_state_event(Pid, finish_handoff), - {noreply, State#state{handoff=NewHO}}; -handle_vnode_event(handoff_error, Mod, Idx, Pid, State) -> + riak_core_vnode:cast_finish_handoff(Pid), + {noreply, State#state{handoff = NewHO}}; +handle_vnode_event(handoff_error, Mod, Idx, Pid, + State) -> NewHO = dict:erase({Mod, Idx}, State#state.handoff), - gen_fsm:send_all_state_event(Pid, cancel_handoff), - {noreply, State#state{handoff=NewHO}}. + riak_core_vnode:cancel_handoff(Pid), + {noreply, State#state{handoff = NewHO}}. %% @private -terminate(_Reason, _State) -> - ok. +terminate(_Reason, _State) -> ok. %% @private -code_change(_OldVsn, State, _Extra) -> - {ok, State}. - +code_change(_OldVsn, State, _Extra) -> {ok, State}. %% =================================================================== %% Internal functions %% =================================================================== -maybe_ring_changed(RingID, Ring, CHBin, State=#state{last_ring_id=LastID}) -> +maybe_ring_changed(RingID, Ring, CHBin, + State = #state{last_ring_id = LastID}) -> case RingID of - LastID -> - maybe_ensure_vnodes_started(Ring), - State; - _ -> - ensure_vnodes_started(Ring), - State2 = ring_changed(Ring, CHBin, State), - State2#state{last_ring_id=RingID} + LastID -> maybe_ensure_vnodes_started(Ring), State; + _ -> + ensure_vnodes_started(Ring), + State2 = ring_changed(Ring, CHBin, State), + State2#state{last_ring_id = RingID} end. 
ring_changed(Ring, CHBin, State) -> %% Update vnode forwarding state AllVNodes = get_all_vnodes(), Mods = [Mod || {_, Mod} <- riak_core:vnode_modules()], - State2 = update_forwarding(AllVNodes, Mods, Ring, State), - + State2 = update_forwarding(AllVNodes, Mods, Ring, + State), %% Update handoff state State3 = update_handoff(AllVNodes, Ring, CHBin, State2), - %% Trigger ownership transfers. Transfers = riak_core_ring:pending_changes(Ring), - trigger_ownership_handoff(Transfers, Mods, Ring, State3), + trigger_ownership_handoff(Transfers, Mods, Ring, + State3), State3. maybe_ensure_vnodes_started(Ring) -> ExitingStates = [leaving, exiting, invalid], Status = riak_core_ring:member_status(Ring, node()), case lists:member(Status, ExitingStates) of - true -> - ensure_vnodes_started(Ring), - ok; - _ -> - ok + true -> ensure_vnodes_started(Ring), ok; + _ -> ok end. ensure_vnodes_started(Ring) -> - spawn(fun() -> - try - riak_core_ring_handler:ensure_vnodes_started(Ring) + spawn(fun () -> + try riak_core_ring_handler:ensure_vnodes_started(Ring) catch Type:Reason:Stacktrace -> - logger:error("~p", [{Type, Reason, Stacktrace}]) + logger:error("~p", [{Type, Reason, Stacktrace}]) end end). schedule_management_timer() -> ManagementTick = application:get_env(riak_core, - vnode_management_timer, - 10000), - erlang:send_after(ManagementTick, ?MODULE, management_tick). + vnode_management_timer, 10000), + erlang:send_after(ManagementTick, ?MODULE, + management_tick). 
-trigger_ownership_handoff(Transfers, Mods, Ring, State) -> +trigger_ownership_handoff(Transfers, Mods, Ring, + State) -> IsResizing = riak_core_ring:is_resizing(Ring), - Throttle = limit_ownership_handoff(Transfers, IsResizing), - Awaiting = [{Mod, Idx} || {Idx, Node, _, CMods, S} <- Throttle, - Mod <- Mods, - S =:= awaiting, - Node =:= node(), - not lists:member(Mod, CMods)], - _ = [maybe_trigger_handoff(Mod, Idx, State) || {Mod, Idx} <- Awaiting], + Throttle = limit_ownership_handoff(Transfers, + IsResizing), + Awaiting = [{Mod, Idx} + || {Idx, Node, _, CMods, S} <- Throttle, Mod <- Mods, + S =:= awaiting, Node =:= node(), + not lists:member(Mod, CMods)], + _ = [maybe_trigger_handoff(Mod, Idx, State) + || {Mod, Idx} <- Awaiting], ok. limit_ownership_handoff(Transfers, IsResizing) -> Limit = application:get_env(riak_core, - forced_ownership_handoff, - ?DEFAULT_OWNERSHIP_TRIGGER), + forced_ownership_handoff, + ?DEFAULT_OWNERSHIP_TRIGGER), limit_ownership_handoff(Limit, Transfers, IsResizing). limit_ownership_handoff(Limit, Transfers, false) -> @@ -566,192 +595,210 @@ limit_ownership_handoff(Limit, Transfers, true) -> %% if we are resizing: filter out completed resize operations, %% since they remain in the list until all are complete. then %% treat transfers as normal - Filtered = [Transfer || {_,_,_,_,Status}=Transfer <- Transfers, - Status =:= awaiting], + Filtered = [Transfer + || {_, _, _, _, Status} = Transfer <- Transfers, + Status =:= awaiting], limit_ownership_handoff(Limit, Filtered, false). %% @private -idx2vnode(Idx, Mod, _State=#state{idxtab=T}) -> +idx2vnode(Idx, Mod, _State = #state{idxtab = T}) -> case ets:lookup(T, {Idx, Mod}) of - [I] -> I#idxrec.pid; - [] -> no_match + [I] -> I#idxrec.pid; + [] -> no_match end. 
%% @private -delmon(MonRef, _State=#state{idxtab=T}) -> +delmon(MonRef, _State = #state{idxtab = T}) -> case ets:lookup(T, MonRef) of - [#monrec{key= {Index, Mod} = Key}] -> - _ = unregister_vnode_stats(Mod, Index), - ets:match_delete(T, {idxrec, Key, '_', '_', '_', MonRef}), - ets:delete(T, MonRef); - [] -> - ets:match_delete(T, {idxrec, '_', '_', '_', '_', MonRef}) + [#monrec{key = {Index, Mod} = Key}] -> + _ = unregister_vnode_stats(Mod, Index), + ets:match_delete(T, + {idxrec, Key, '_', '_', '_', MonRef}), + ets:delete(T, MonRef); + [] -> + ets:match_delete(T, + {idxrec, '_', '_', '_', '_', MonRef}) end. %% @private -add_vnode_rec(I, _State=#state{idxtab=T}) -> ets:insert(T,I). +add_vnode_rec(I, _State = #state{idxtab = T}) -> + ets:insert(T, I). %% @private -get_vnode(Idx, Mod, State) when not is_list(Idx) -> - [Result] = get_vnode([Idx], Mod, State), - Result; -get_vnode(IdxList, Mod, State) -> - Initial = - [case idx2vnode(Idx, Mod, State) of - no_match -> Idx; - Pid -> {Idx, Pid} - end - || Idx <- IdxList], - {NotStarted, Started} = lists:partition(fun erlang:is_integer/1, Initial), - StartFun = - fun(Idx) -> - ForwardTo = get_forward(Mod, Idx, State), - logger:debug("Will start VNode for partition ~p", [Idx]), - {ok, Pid} = - riak_core_vnode_sup:start_vnode(Mod, Idx, ForwardTo), - register_vnode_stats(Mod, Idx, Pid), - logger:debug("Started VNode, waiting for initialization to complete ~p, ~p ", [Pid, Idx]), - ok = riak_core_vnode:wait_for_init(Pid), - logger:debug("VNode initialization ready ~p, ~p", [Pid, Idx]), - {Idx, Pid} - end, - Pairs = Started ++ riak_core_util:pmap(StartFun, NotStarted, ?DEFAULT_VNODE_ROLLING_START), +get_vnode(Idx, Module, State) when not is_list(Idx) -> + [Result] = get_vnode([Idx], Module, State), Result; +get_vnode(IdxList, Module, State) -> + Initial = [case idx2vnode(Idx, Module, State) of + no_match -> Idx; + Pid -> {Idx, Pid} + end + || Idx <- IdxList], + {NotStarted, Started} = + lists:partition(fun erlang:is_integer/1, 
Initial), + StartFun = fun (Idx) -> + ForwardTo = get_forward(Module, Idx, State), + logger:debug("Will start VNode for partition ~p", + [Idx]), + {ok, Pid} = riak_core_vnode_sup:start_vnode(Module, Idx, + ForwardTo), + register_vnode_stats(Module, Idx, Pid), + logger:debug("Started VNode, waiting for initialization " + "to\n complete " + "~p, ~p ", + [Pid, Idx]), + ok = riak_core_vnode:wait_for_init(Pid), + logger:debug("VNode initialization ready ~p, ~p", + [Pid, Idx]), + {Idx, Pid} + end, + Pairs = Started ++ + riak_core_util:pmap(StartFun, NotStarted, + ?DEFAULT_VNODE_ROLLING_START), %% Return Pids in same order as input PairsDict = dict:from_list(Pairs), _ = [begin - Pid = dict:fetch(Idx, PairsDict), - MonRef = erlang:monitor(process, Pid), - IdxRec = #idxrec{key={Idx,Mod},idx=Idx,mod=Mod,pid=Pid, - monref=MonRef}, - MonRec = #monrec{monref=MonRef, key={Idx,Mod}}, - add_vnode_rec([IdxRec, MonRec], State) - end || Idx <- NotStarted], - [ dict:fetch(Idx, PairsDict) || Idx <- IdxList]. - + Pid = dict:fetch(Idx, PairsDict), + MonRef = erlang:monitor(process, Pid), + IdxRec = #idxrec{key = {Idx, Module}, idx = Idx, + mod = Module, pid = Pid, monref = MonRef}, + MonRec = #monrec{monref = MonRef, key = {Idx, Module}}, + add_vnode_rec([IdxRec, MonRec], State) + end + || Idx <- NotStarted], + [dict:fetch(Idx, PairsDict) || Idx <- IdxList]. -get_forward(Mod, Idx, #state{forwarding=Fwd}) -> +get_forward(Mod, Idx, #state{forwarding = Fwd}) -> case dict:find({Mod, Idx}, Fwd) of - {ok, ForwardTo} -> - ForwardTo; - _ -> - undefined + {ok, ForwardTo} -> ForwardTo; + _ -> undefined end. 
check_forward(Ring, Mod, Index) -> Node = node(), case riak_core_ring:next_owner(Ring, Index, Mod) of - {Node, '$resize', _} -> - Complete = riak_core_ring:complete_resize_transfers(Ring, {Index, Node}, Mod), - {{Mod, Index}, Complete}; - {Node, '$delete', _} -> - {{Mod, Index}, undefined}; - {Node, NextOwner, complete} -> - {{Mod, Index}, NextOwner}; - _ -> - {{Mod, Index}, undefined} + {Node, '$resize', _} -> + Complete = + riak_core_ring:complete_resize_transfers(Ring, + {Index, Node}, Mod), + {{Mod, Index}, Complete}; + {Node, '$delete', _} -> {{Mod, Index}, undefined}; + {Node, NextOwner, complete} -> + {{Mod, Index}, NextOwner}; + _ -> {{Mod, Index}, undefined} end. -check_forward_precomputed(Completed, Mod, Index, Node, Ring) -> +check_forward_precomputed(Completed, Mod, Index, Node, + Ring) -> case dict:find({Mod, Index}, Completed) of - {ok, '$resize'} -> - Complete = riak_core_ring:complete_resize_transfers(Ring, {Index, Node}, Mod), - {{Mod, Index}, Complete}; - {ok, '$delete'} -> - {{Mod, Index}, undefined}; - {ok, NextOwner} -> - {{Mod, Index}, NextOwner}; - _ -> - {{Mod, Index}, undefined} + {ok, '$resize'} -> + Complete = + riak_core_ring:complete_resize_transfers(Ring, + {Index, Node}, Mod), + {{Mod, Index}, Complete}; + {ok, '$delete'} -> {{Mod, Index}, undefined}; + {ok, NextOwner} -> {{Mod, Index}, NextOwner}; + _ -> {{Mod, Index}, undefined} end. compute_forwarding(Mods, Ring) -> Node = node(), CL = [{{Mod, Idx}, NextOwner} || Mod <- Mods, - {Idx, Owner, NextOwner} <- riak_core_ring:completed_next_owners(Mod, Ring), + {Idx, Owner, NextOwner} + <- riak_core_ring:completed_next_owners(Mod, Ring), Owner =:= Node], Completed = dict:from_list(CL), - Forwarding = [check_forward_precomputed(Completed, Mod, I, N, Ring) + Forwarding = [check_forward_precomputed(Completed, Mod, + I, N, Ring) || {I, N} <- riak_core_ring:all_owners(Ring), Mod <- Mods], dict:from_list(Forwarding). 
update_forwarding(AllVNodes, Mods, Ring, - State=#state{forwarding=Forwarding}) -> + State = #state{forwarding = Forwarding}) -> NewForwarding = compute_forwarding(Mods, Ring), - %% Inform vnodes that have changed forwarding status - VNodes = dict:from_list([{{Mod, Idx}, Pid} || {Mod, Idx, Pid} <- AllVNodes]), - Diff = dict:filter(fun(K,V) -> + VNodes = dict:from_list([{{Mod, Idx}, Pid} + || {Mod, Idx, Pid} <- AllVNodes]), + Diff = dict:filter(fun (K, V) -> dict:find(K, Forwarding) /= {ok, V} - end, NewForwarding), - dict:fold(fun({Mod, Idx}, ForwardTo, _) -> - change_forward(VNodes, Mod, Idx, ForwardTo), - ok - end, ok, Diff), - - State#state{forwarding=NewForwarding}. + end, + NewForwarding), + dict:fold(fun ({Mod, Idx}, ForwardTo, _) -> + change_forward(VNodes, Mod, Idx, ForwardTo), ok + end, + ok, Diff), + State#state{forwarding = NewForwarding}. -update_forwarding({Mod, Idx}, Ring, State=#state{forwarding=Forwarding}) -> +update_forwarding({Mod, Idx}, Ring, + State = #state{forwarding = Forwarding}) -> {_, ForwardTo} = check_forward(Ring, Mod, Idx), - NewForwarding = dict:store({Mod, Idx}, ForwardTo, Forwarding), - State#state{forwarding=NewForwarding}. + NewForwarding = dict:store({Mod, Idx}, ForwardTo, + Forwarding), + State#state{forwarding = NewForwarding}. change_forward(VNodes, Mod, Idx, ForwardTo) -> case dict:find({Mod, Idx}, VNodes) of - error -> - ok; - {ok, Pid} -> - riak_core_vnode:set_forwarding(Pid, ForwardTo), - ok + error -> ok; + {ok, Pid} -> + riak_core_vnode:set_forwarding(Pid, ForwardTo), ok end. 
update_handoff(AllVNodes, Ring, CHBin, State) -> case riak_core_ring:ring_ready(Ring) of - false -> - State; - true -> - NewHO = lists:flatten([case should_handoff(Ring, CHBin, Mod, Idx) of - false -> - []; - {true, primary, TargetNode} -> - [{{Mod, Idx}, {ownership, outbound, TargetNode}}]; - {true, {fallback, _Node}, TargetNode} -> - [{{Mod, Idx}, {hinted, outbound, TargetNode}}]; - {true, '$resize'=Action} -> - [{{Mod, Idx}, {resize, outbound, Action}}]; - {true, '$delete'=Action} -> - [{{Mod, Idx}, {delete, local, Action}}] - end || {Mod, Idx, _Pid} <- AllVNodes]), - State#state{handoff=dict:from_list(NewHO)} + false -> State; + true -> + NewHO = lists:flatten([case should_handoff(Ring, CHBin, + Mod, Idx) + of + false -> []; + {true, primary, TargetNode} -> + [{{Mod, Idx}, + {ownership, outbound, TargetNode}}]; + {true, {fallback, _Node}, TargetNode} -> + [{{Mod, Idx}, + {hinted, outbound, TargetNode}}]; + {true, '$resize' = Action} -> + [{{Mod, Idx}, + {resize, outbound, Action}}]; + {true, '$delete' = Action} -> + [{{Mod, Idx}, {delete, local, Action}}] + end + || {Mod, Idx, _Pid} <- AllVNodes]), + State#state{handoff = dict:from_list(NewHO)} end. 
should_handoff(Ring, _CHBin, Mod, Idx) -> - {_, NextOwner, _} = riak_core_ring:next_owner(Ring, Idx), + {_, NextOwner, _} = riak_core_ring:next_owner(Ring, + Idx), Type = riak_core_ring:vnode_type(Ring, Idx), Ready = riak_core_ring:ring_ready(Ring), IsResizing = riak_core_ring:is_resizing(Ring), - case determine_handoff_target(Type, NextOwner, Ready, IsResizing) of - undefined -> - false; - Action when Action =:= '$resize' - orelse Action =:= '$delete' -> - {true, Action}; - TargetNode -> - case app_for_vnode_module(Mod) of - undefined -> false; - {ok, App} -> - case lists:member(TargetNode, - riak_core_node_watcher:nodes(App)) of - false -> false; - true -> {true, Type, TargetNode} - end - end + case determine_handoff_target(Type, NextOwner, Ready, + IsResizing) + of + undefined -> false; + Action + when Action =:= '$resize' orelse Action =:= '$delete' -> + {true, Action}; + TargetNode -> + case app_for_vnode_module(Mod) of + undefined -> false; + {ok, App} -> + case lists:member(TargetNode, + riak_core_node_watcher:nodes(App)) + of + false -> false; + true -> {true, Type, TargetNode} + end + end end. -determine_handoff_target(Type, NextOwner, RingReady, IsResize) -> +determine_handoff_target(Type, NextOwner, RingReady, + IsResize) -> Me = node(), - determine_handoff_target(Type, NextOwner, RingReady, IsResize, NextOwner =:= Me). + determine_handoff_target(Type, NextOwner, RingReady, + IsResize, NextOwner =:= Me). determine_handoff_target(primary, _, _, _, true) -> %% Never hand off to myself as a primary @@ -759,126 +806,134 @@ determine_handoff_target(primary, _, _, _, true) -> determine_handoff_target(primary, undefined, _, _, _) -> %% No ring change indicated for this partition undefined; -determine_handoff_target(primary, NextOwner, true, _, _) -> +determine_handoff_target(primary, NextOwner, true, _, + _) -> %% Primary, ring is ready, go. 
This may be a node or a `$resize' %% action NextOwner; determine_handoff_target(primary, _, false, _, _) -> %% Ring isn't ready, no matter what, don't do a primary handoff undefined; -determine_handoff_target({fallback, _Target}, '$delete'=Action, _, _, _) -> +determine_handoff_target({fallback, _Target}, + '$delete' = Action, _, _, _) -> %% partitions moved during resize and scheduled for deletion, indexes %% that exist in both the original and resized ring that were moved appear %% as fallbacks. Action; -determine_handoff_target(resized_primary, '$delete'=Action, _, _, _) -> +determine_handoff_target(resized_primary, + '$delete' = Action, _, _, _) -> %% partitions that no longer exist after the ring has been resized (shrunk) %% scheduled for deletion Action; -determine_handoff_target(resized_primary, _, _, false, _) -> +determine_handoff_target(resized_primary, _, _, false, + _) -> %% partitions that would have existed in a ring whose expansion was aborted %% and are still running need to be cleaned up after and shutdown '$delete'; -determine_handoff_target({fallback, For}, undefined, _, _, _) -> +determine_handoff_target({fallback, For}, undefined, _, + _, _) -> %% Fallback vnode target is primary (hinted handoff). `For' can %% technically be a `$resize' action but unclear it ever would be For; -determine_handoff_target(_, _, _, _, _) -> - undefined. - +determine_handoff_target(_, _, _, _, _) -> undefined. app_for_vnode_module(Mod) when is_atom(Mod) -> case application:get_env(riak_core, vnode_modules) of - {ok, Mods} -> - case lists:keysearch(Mod, 2, Mods) of - {value, {App, Mod}} -> - {ok, App}; - false -> - undefined - end; - undefined -> undefined + {ok, Mods} -> + case lists:keysearch(Mod, 2, Mods) of + {value, {App, Mod}} -> {ok, App}; + false -> undefined + end; + undefined -> undefined end. maybe_trigger_handoff(Mod, Idx, State) -> Pid = get_vnode(Idx, Mod, State), maybe_trigger_handoff(Mod, Idx, Pid, State). 
-maybe_trigger_handoff(Mod, Idx, Pid, _State=#state{handoff=HO}) -> +maybe_trigger_handoff(Mod, Idx, Pid, + _State = #state{handoff = HO}) -> case dict:find({Mod, Idx}, HO) of - {ok, {resize, _Direction, '$resize'}} -> - {ok, Ring} = riak_core_ring_manager:get_my_ring(), - case riak_core_ring:awaiting_resize_transfer(Ring, {Idx, node()}, Mod) of - undefined -> ok; - {TargetIdx, TargetNode} -> - riak_core_vnode:trigger_handoff(Pid, TargetIdx, TargetNode) - end; - {ok, {delete, local, '$delete'}} -> - riak_core_vnode:trigger_delete(Pid); - {ok, {_Type, _Direction, TargetNode}} -> - riak_core_vnode:trigger_handoff(Pid, TargetNode), - ok; - error -> - ok + {ok, {resize, _Direction, '$resize'}} -> + {ok, Ring} = riak_core_ring_manager:get_my_ring(), + case riak_core_ring:awaiting_resize_transfer(Ring, + {Idx, node()}, Mod) + of + undefined -> ok; + {TargetIdx, TargetNode} -> + riak_core_vnode:trigger_handoff(Pid, TargetIdx, + TargetNode) + end; + {ok, {delete, local, '$delete'}} -> + riak_core_vnode:trigger_delete(Pid); + {ok, {_Type, _Direction, TargetNode}} -> + riak_core_vnode:trigger_handoff(Pid, TargetNode), ok; + error -> ok end. 
-get_all_vnodes_status(#state{forwarding=Forwarding, handoff=HO}) -> +get_all_vnodes_status(#state{forwarding = Forwarding, + handoff = HO}) -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), Owners = riak_core_ring:all_owners(Ring), VNodes = get_all_vnodes(), - Mods = [Mod || {_App, Mod} <- riak_core:vnode_modules()], - + Mods = [Mod + || {_App, Mod} <- riak_core:vnode_modules()], ThisNode = node(), Types = [case Owner of - ThisNode -> - {{Mod, Idx}, {type, primary}}; - _ -> - {{Mod, Idx}, {type, secondary}} - end || {Idx, Owner} <- Owners, - Mod <- Mods], + ThisNode -> {{Mod, Idx}, {type, primary}}; + _ -> {{Mod, Idx}, {type, secondary}} + end + || {Idx, Owner} <- Owners, Mod <- Mods], Types2 = lists:keysort(1, Types), - Pids = [{{Mod, Idx}, {pid, Pid}} || {Mod, Idx, Pid} <- VNodes], + Pids = [{{Mod, Idx}, {pid, Pid}} + || {Mod, Idx, Pid} <- VNodes], Pids2 = lists:keysort(1, Pids), Forwarding1 = lists:sort(dict:to_list(Forwarding)), - Forwarding2 = [{MI, {forwarding, Node}} || {MI,Node} <- Forwarding1, - Node /= undefined], + Forwarding2 = [{MI, {forwarding, Node}} + || {MI, Node} <- Forwarding1, Node /= undefined], Handoff1 = lists:sort(dict:to_list(HO)), - Handoff2 = [{MI, {should_handoff, Node}} || - {MI,{_Type, _Direction, Node}} <- Handoff1], - - MergeFn = fun(_, V1, V2) when is_list(V1) and is_list(V2) -> + Handoff2 = [{MI, {should_handoff, Node}} + || {MI, {_Type, _Direction, Node}} <- Handoff1], + MergeFn = fun (_, V1, V2) + when is_list(V1) and is_list(V2) -> V1 ++ V2; - (_, V1, V2) when is_list(V1) -> - V1 ++ [V2]; - (_, V1, V2) -> - [V1, V2] + (_, V1, V2) when is_list(V1) -> V1 ++ [V2]; + (_, V1, V2) -> [V1, V2] end, - Status = lists:foldl(fun(B, A) -> + Status = lists:foldl(fun (B, A) -> orddict:merge(MergeFn, A, B) - end, Types2, [Pids2, Forwarding2, Handoff2]), + end, + Types2, [Pids2, Forwarding2, Handoff2]), Status. 
-update_never_started(Ring, State=#state{known_modules=KnownMods}) -> - UnknownMods = [Mod || {_App, Mod} <- riak_core:vnode_modules(), - not lists:member(Mod, KnownMods)], +update_never_started(Ring, + State = #state{known_modules = KnownMods}) -> + UnknownMods = [Mod + || {_App, Mod} <- riak_core:vnode_modules(), + not lists:member(Mod, KnownMods)], case UnknownMods of - [] -> - State; - _ -> - Indices = [Idx || {Idx, _} <- riak_core_ring:all_owners(Ring)], - lists:foldl(fun(Mod, StateAcc) -> - update_never_started(Mod, Indices, StateAcc) - end, State, UnknownMods) + [] -> State; + _ -> + Indices = [Idx + || {Idx, _} <- riak_core_ring:all_owners(Ring)], + lists:foldl(fun (Mod, StateAcc) -> + update_never_started(Mod, Indices, StateAcc) + end, + State, UnknownMods) end. update_never_started(Mod, Indices, State) -> IdxPids = get_all_index_pid(Mod, []), AlreadyStarted = [Idx || {Idx, _Pid} <- IdxPids], - NeverStarted = ordsets:subtract(ordsets:from_list(Indices), - ordsets:from_list(AlreadyStarted)), + NeverStarted = + ordsets:subtract(ordsets:from_list(Indices), + ordsets:from_list(AlreadyStarted)), NeverStarted2 = [{Idx, Mod} || Idx <- NeverStarted], - NeverStarted3 = NeverStarted2 ++ State#state.never_started, + NeverStarted3 = NeverStarted2 ++ + State#state.never_started, KnownModules = [Mod | State#state.known_modules], - State#state{known_modules=KnownModules, never_started=NeverStarted3}. + State#state{known_modules = KnownModules, + never_started = NeverStarted3}. maybe_start_vnodes(Ring, State) -> case riak_core_ring:check_lastgasp(Ring) of @@ -890,127 +945,147 @@ maybe_start_vnodes(Ring, State) -> State3 end. 
-maybe_start_vnodes(State=#state{vnode_start_tokens=Tokens, - never_started=NeverStarted}) -> +maybe_start_vnodes(State = #state{vnode_start_tokens = + Tokens, + never_started = NeverStarted}) -> case {Tokens, NeverStarted} of - {0, _} -> - State; - {_, []} -> - State; - {_, [{Idx, Mod} | NeverStarted2]} -> - _ = get_vnode(Idx, Mod, State), - gen_server:cast(?MODULE, maybe_start_vnodes), - State#state{vnode_start_tokens=Tokens-1, - never_started=NeverStarted2} + {0, _} -> State; + {_, []} -> State; + {_, [{Idx, Mod} | NeverStarted2]} -> + _ = get_vnode(Idx, Mod, State), + gen_server:cast(?MODULE, maybe_start_vnodes), + State#state{vnode_start_tokens = Tokens - 1, + never_started = NeverStarted2} end. --spec check_repairs(repairs()) -> Repairs2::repairs(). +-spec check_repairs(repairs()) -> Repairs2 :: repairs(). + check_repairs(Repairs) -> - Check = - fun(R=#repair{minus_one_xfer=MOX, plus_one_xfer=POX}, Repairs2) -> - Pairs = R#repair.pairs, - MO = get_minus_one(Pairs), - PO = get_plus_one(Pairs), - MOX2 = maybe_retry(R, MO, MOX), - POX2 = maybe_retry(R, PO, POX), - - if ?XFER_COMPLETE(MOX2) andalso ?XFER_COMPLETE(POX2) -> - Repairs2; - true -> - R2 = R#repair{minus_one_xfer=MOX2, plus_one_xfer=POX2}, - [R2|Repairs2] - end - end, + Check = fun (R = #repair{minus_one_xfer = MOX, + plus_one_xfer = POX}, + Repairs2) -> + Pairs = R#repair.pairs, + MO = get_minus_one(Pairs), + PO = get_plus_one(Pairs), + MOX2 = maybe_retry(R, MO, MOX), + POX2 = maybe_retry(R, PO, POX), + if (?XFER_COMPLETE(MOX2)) andalso + (?XFER_COMPLETE(POX2)) -> + Repairs2; + true -> + R2 = R#repair{minus_one_xfer = MOX2, + plus_one_xfer = POX2}, + [R2 | Repairs2] + end + end, lists:reverse(lists:foldl(Check, [], Repairs)). %% TODO: get all this repair, xfer status and Src business figured out. --spec maybe_retry(repair(), tuple(), xfer_status()) -> Xfer2::xfer_status(). -maybe_retry(R, {SrcPartition, _}=Src, Xfer) -> +-spec maybe_retry(repair(), tuple(), + xfer_status()) -> Xfer2 :: xfer_status(). 
+ +maybe_retry(R, {SrcPartition, _} = Src, Xfer) -> case Xfer#xfer_status.status of - complete -> - Xfer; - pending -> - {Mod, _, Partition} = Xfer#xfer_status.mod_src_target, - FilterModFun = R#repair.filter_mod_fun, - - riak_core_handoff_manager:xfer(Src, {Mod, Partition}, FilterModFun), - #xfer_status{status=pending, - mod_src_target={Mod, SrcPartition, Partition}} + complete -> Xfer; + pending -> + {Mod, _, Partition} = Xfer#xfer_status.mod_src_target, + FilterModFun = R#repair.filter_mod_fun, + riak_core_handoff_manager:xfer(Src, {Mod, Partition}, + FilterModFun), + #xfer_status{status = pending, + mod_src_target = {Mod, SrcPartition, Partition}} end. %% @private %% %% @doc Verify that all nodes are up involved in the repair. --spec check_up([{non_neg_integer(), node()}], [node()]) -> - true | {false, Down::[{non_neg_integer(), node()}]}. +-spec check_up([{non_neg_integer(), node()}], + [node()]) -> true | + {false, Down :: [{non_neg_integer(), node()}]}. + check_up(Pairs, UpNodes) -> - Down = [Pair || {_Partition, Owner}=Pair <- Pairs, - not lists:member(Owner, UpNodes)], + Down = [Pair + || {_Partition, Owner} = Pair <- Pairs, + not lists:member(Owner, UpNodes)], case Down of - [] -> true; - _ -> {false, Down} + [] -> true; + _ -> {false, Down} end. %% @private %% %% @doc Get the three `{Partition, Owner}' pairs involved in a repair %% operation for the given `Ring' and `Partition'. --spec repair_pairs(riak_core_ring:riak_core_ring(), non_neg_integer()) -> - [{Partition::non_neg_integer(), Owner::node()}]. +-spec repair_pairs(riak_core_ring:riak_core_ring(), + non_neg_integer()) -> [{Partition :: non_neg_integer(), + Owner :: node()}]. 
-get_plus_one([_, _, PlusOne]) ->
-    PlusOne.
+-spec get_plus_one([{index(), node()}]) -> {index(),
+                                            node()}.
+
+get_plus_one([_, _, PlusOne]) -> PlusOne.
 
 %% @private
 %%
 %% @doc Kill all outbound and inbound xfers related to `Repairs'
 %%      targeting this node with `Reason'.
 -spec kill_repairs([repair()], term()) -> ok.
+
 kill_repairs(Repairs, Reason) ->
     _ = [kill_repair(Repair, Reason) || Repair <- Repairs],
     ok.
@@ -1018,30 +1093,27 @@ kill_repairs(Repairs, Reason) ->
 kill_repair(Repair, Reason) ->
     {Mod, Partition} = Repair#repair.mod_partition,
     Pairs = Repair#repair.pairs,
-    {_,MOOwner} = get_minus_one(Pairs),
-    {_,POOwner} = get_minus_one(Pairs),
+    {_, MOOwner} = get_minus_one(Pairs),
+    {_, POOwner} = get_plus_one(Pairs),
     MOX = Repair#repair.minus_one_xfer,
     POX = Repair#repair.plus_one_xfer,
     MOModSrcTarget = MOX#xfer_status.mod_src_target,
     POModSrcTarget = POX#xfer_status.mod_src_target,
     %% Kill the remote senders
     riak_core_handoff_manager:kill_xfer(MOOwner,
-                                        MOModSrcTarget,
-                                        Reason),
+                                        MOModSrcTarget, Reason),
     riak_core_handoff_manager:kill_xfer(POOwner,
-                                        POModSrcTarget,
-                                        Reason),
+                                        POModSrcTarget, Reason),
     %% Kill the local receivers
     riak_core_handoff_manager:kill_xfer(node(),
-                                        {Mod, undefined, Partition},
-                                        Reason).
+                                        {Mod, undefined, Partition}, Reason).
 
 register_vnode_stats(_Mod, _Index, _Pid) ->
-    %% STATS
+    %% STATS
     %riak_core_stat:register_vnode_stats(Mod, Index, Pid).
-    ok.
+    ok.
 
 unregister_vnode_stats(_Mod, _Index) ->
-    %% STATS
+    %% STATS
     %riak_core_stat:unregister_vnode_stats(Mod, Index).
-    ok.
+    ok.
diff --git a/src/riak_core_vnode_master.erl b/src/riak_core_vnode_master.erl
index 352d02886..ea307c34c 100644
--- a/src/riak_core_vnode_master.erl
+++ b/src/riak_core_vnode_master.erl
@@ -23,46 +23,43 @@
 %% @doc dispatch to vnodes
 -module(riak_core_vnode_master).
+
 -include("riak_core_vnode.hrl").
+
 -behaviour(gen_server).
--compile({nowarn_deprecated_function,
-          [{gen_fsm, send_event, 2},
-           {gen_fsm, send_all_state_event, 2}]}).
- --export([start_link/1, start_link/2, start_link/3, get_vnode_pid/2, - start_vnode/2, - command/3, command/4, - command_unreliable/3, command_unreliable/4, - sync_command/3, sync_command/4, - coverage/5, - command_return_vnode/4, +-export([start_link/1, get_vnode_pid/2, start_vnode/2, + command/3, command/4, command_unreliable/3, + command_unreliable/4, sync_command/3, sync_command/4, + coverage/5, command_return_vnode/4, sync_spawn_command/3, make_request/3, make_coverage_request/4, all_nodes/1, reg_name/1]). --export([init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). --record(state, {idxtab, sup_name, vnode_mod, legacy}). --define(LONG_TIMEOUT, 120*1000). +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). + +-record(state, {idxtab, sup_name, vnode_mod}). + +-define(LONG_TIMEOUT, 120 * 1000). + +-type riak_vnode_req_v1() :: #riak_vnode_req_v1{}. + +-type riak_coverage_req_v1() :: #riak_coverage_req_v1{}. + +make_name(VNodeMod, Suffix) -> + list_to_atom(atom_to_list(VNodeMod) ++ Suffix). -make_name(VNodeMod,Suffix) -> list_to_atom(atom_to_list(VNodeMod)++Suffix). -reg_name(VNodeMod) -> make_name(VNodeMod, "_master"). +reg_name(VNodeMod) -> make_name(VNodeMod, "_master"). %% Given atom 'riak_kv_vnode_master', return 'riak_kv_vnode'. vmaster_to_vmod(VMaster) -> L = atom_to_list(VMaster), - list_to_atom(lists:sublist(L,length(L)-7)). + list_to_atom(lists:sublist(L, length(L) - 7)). start_link(VNodeMod) -> - start_link(VNodeMod, undefined). - -start_link(VNodeMod, LegacyMod) -> - start_link(VNodeMod, LegacyMod, undefined). - -start_link(VNodeMod, LegacyMod, Service) -> RegName = reg_name(VNodeMod), gen_server:start_link({local, RegName}, ?MODULE, - [Service,VNodeMod,LegacyMod,RegName], []). + [VNodeMod, RegName], []). start_vnode(Index, VNodeMod) -> riak_core_vnode_manager:start_vnode(Index, VNodeMod). 
@@ -79,95 +76,109 @@ command_unreliable(Preflist, Msg, VMaster) -> command(PrefListOrCmd, Msg, Sender, VMaster) -> command2(PrefListOrCmd, Msg, Sender, VMaster, normal). -command_unreliable(PrefListOrCmd, Msg, Sender, VMaster) -> - command2(PrefListOrCmd, Msg, Sender, VMaster, unreliable). +command_unreliable(PrefListOrCmd, Msg, Sender, + VMaster) -> + command2(PrefListOrCmd, Msg, Sender, VMaster, + unreliable). %% Send the command to the preflist given with responses going to Sender -command2([], _Msg, _Sender, _VMaster, _How) -> - ok; - -command2([{Index, Pid}|Rest], Msg, Sender, VMaster, How=normal) - when is_pid(Pid) -> - gen_fsm:send_event(Pid, make_request(Msg, Sender, Index)), +command2([], _Msg, _Sender, _VMaster, _How) -> ok; +command2([{Index, Pid} | Rest], Msg, Sender, VMaster, + How = normal) + when is_pid(Pid) -> + Request = make_request(Msg, Sender, Index), + riak_core_vnode:send_req(Pid, Request), command2(Rest, Msg, Sender, VMaster, How); - -command2([{Index, Pid}|Rest], Msg, Sender, VMaster, How=unreliable) - when is_pid(Pid) -> - riak_core_send_msg:send_event_unreliable(Pid, make_request(Msg, Sender, - Index)), +command2([{Index, Pid} | Rest], Msg, Sender, VMaster, + How = unreliable) + when is_pid(Pid) -> + riak_core_send_msg:send_event_unreliable(Pid, + make_request(Msg, Sender, Index)), command2(Rest, Msg, Sender, VMaster, How); -command2([{Index,Node}|Rest], Msg, Sender, VMaster, How) -> - proxy_cast({VMaster, Node}, make_request(Msg, Sender, Index), How), +command2([{Index, Node} | Rest], Msg, Sender, VMaster, + How) -> + proxy_cast({VMaster, Node}, + make_request(Msg, Sender, Index), How), command2(Rest, Msg, Sender, VMaster, How); - -command2(DestTuple, Msg, Sender, VMaster, How) when is_tuple(DestTuple) -> +command2(DestTuple, Msg, Sender, VMaster, How) + when is_tuple(DestTuple) -> %% Final case, tuple = single destination ... so make a list and %% resubmit to this function. command2([DestTuple], Msg, Sender, VMaster, How). 
%% Send a command to a covering set of vnodes -coverage(Msg, CoverageVNodes, Keyspaces, {Type, Ref, From}, VMaster) - when is_list(CoverageVNodes) -> +coverage(Msg, CoverageVNodes, Keyspaces, + {Type, Ref, From}, VMaster) + when is_list(CoverageVNodes) -> [proxy_cast({VMaster, Node}, - make_coverage_request(Msg, - Keyspaces, + make_coverage_request(Msg, Keyspaces, {Type, {Ref, {Index, Node}}, From}, - Index)) || - {Index, Node} <- CoverageVNodes]; -coverage(Msg, {Index, Node}, Keyspaces, Sender, VMaster) -> + Index)) + || {Index, Node} <- CoverageVNodes]; +coverage(Msg, {Index, Node}, Keyspaces, Sender, + VMaster) -> proxy_cast({VMaster, Node}, make_coverage_request(Msg, Keyspaces, Sender, Index)). - + %% Send the command to an individual Index/Node combination, but also %% return the pid for the vnode handling the request, as `{ok, VnodePid}'. -command_return_vnode({Index,Node}, Msg, Sender, VMaster) -> +command_return_vnode({Index, Node}, Msg, Sender, + VMaster) -> Req = make_request(Msg, Sender, Index), Mod = vmaster_to_vmod(VMaster), - riak_core_vnode_proxy:command_return_vnode({Mod,Index,Node}, Req). + riak_core_vnode_proxy:command_return_vnode({Mod, Index, + Node}, + Req). %% Send a synchronous command to an individual Index/Node combination. %% Will not return until the vnode has returned sync_command(IndexNode, Msg, VMaster) -> sync_command(IndexNode, Msg, VMaster, ?LONG_TIMEOUT). -sync_command({Index,Node}, Msg, VMaster, Timeout) -> +sync_command({Index, Node}, Msg, VMaster, Timeout) -> %% Issue the call to the master, it will update the Sender with %% the From for handle_call so that the {reply} return gets %% sent here. 
- Request = make_request(Msg, {server, undefined, undefined}, Index), - case gen_server:call({VMaster, Node}, Request, Timeout) of - {vnode_error, {Error, _Args}} -> error(Error); - {vnode_error, Error} -> error(Error); - Else -> Else + Request = make_request(Msg, + {server, undefined, undefined}, Index), + case gen_server:call({VMaster, Node}, Request, Timeout) + of + {vnode_error, {Error, _Args}} -> error(Error); + {vnode_error, Error} -> error(Error); + Else -> Else end. %% Send a synchronous spawned command to an individual Index/Node combination. %% Will not return until the vnode has returned, but the vnode_master will %% continue to handle requests. -sync_spawn_command({Index,Node}, Msg, VMaster) -> - Request = make_request(Msg, {server, undefined, undefined}, Index), - case gen_server:call({VMaster, Node}, {spawn, Request}, infinity) of - {vnode_error, {Error, _Args}} -> error(Error); - {vnode_error, Error} -> error(Error); - Else -> Else +sync_spawn_command({Index, Node}, Msg, VMaster) -> + Request = make_request(Msg, + {server, undefined, undefined}, Index), + case gen_server:call({VMaster, Node}, {spawn, Request}, + infinity) + of + {vnode_error, {Error, _Args}} -> error(Error); + {vnode_error, Error} -> error(Error); + Else -> Else end. - %% Make a request record - exported for use by legacy modules --spec make_request(vnode_req(), sender(), partition()) -> #riak_vnode_req_v1{}. +-spec make_request(vnode_req(), sender(), + partition()) -> riak_vnode_req_v1(). + make_request(Request, Sender, Index) -> - #riak_vnode_req_v1{ - index=Index, - sender=Sender, - request=Request}. + #riak_vnode_req_v1{index = Index, sender = Sender, + request = Request}. %% Make a request record - exported for use by legacy modules --spec make_coverage_request(vnode_req(), keyspaces(), sender(), partition()) -> #riak_coverage_req_v1{}. 
-make_coverage_request(Request, KeySpaces, Sender, Index) -> - #riak_coverage_req_v1{index=Index, - keyspaces=KeySpaces, - sender=Sender, - request=Request}. +-spec make_coverage_request(vnode_req(), keyspaces(), + sender(), partition()) -> riak_coverage_req_v1(). + +make_coverage_request(Request, KeySpaces, Sender, + Index) -> + #riak_coverage_req_v1{index = Index, + keyspaces = KeySpaces, sender = Sender, + request = Request}. %% Request a list of Pids for all vnodes %% @deprecated @@ -179,91 +190,86 @@ all_nodes(VNodeMod) -> [Pid || {_Mod, _Idx, Pid} <- VNodes]. %% @private -init([Service, VNodeMod, LegacyMod, _RegName]) -> - gen_server:cast(self(), {wait_for_service, Service}), - {ok, #state{idxtab=undefined, - vnode_mod=VNodeMod, - legacy=LegacyMod}}. +init([VNodeMod, _RegName]) -> + {ok, #state{idxtab = undefined, vnode_mod = VNodeMod}}. -proxy_cast(Who, Req) -> - proxy_cast(Who, Req, normal). +proxy_cast(Who, Req) -> proxy_cast(Who, Req, normal). proxy_cast({VMaster, Node}, Req, How) -> do_proxy_cast({VMaster, Node}, Req, How). -do_proxy_cast({VMaster, Node}, Req=?VNODE_REQ{index=Idx}, How) -> +do_proxy_cast({VMaster, Node}, + Req = #riak_vnode_req_v1{index = Idx}, How) -> Mod = vmaster_to_vmod(VMaster), Proxy = riak_core_vnode_proxy:reg_name(Mod, Idx, Node), send_an_event(Proxy, Req, How), ok; -do_proxy_cast({VMaster, Node}, Req=?COVERAGE_REQ{index=Idx}, How) -> +do_proxy_cast({VMaster, Node}, + Req = #riak_coverage_req_v1{index = Idx}, How) -> Mod = vmaster_to_vmod(VMaster), Proxy = riak_core_vnode_proxy:reg_name(Mod, Idx, Node), send_an_event(Proxy, Req, How), ok. send_an_event(Dest, Event, normal) -> - gen_fsm:send_event(Dest, Event); + riak_core_vnode:send_req(Dest, Event); send_an_event(Dest, Event, unreliable) -> riak_core_send_msg:send_event_unreliable(Dest, Event). 
handle_cast({wait_for_service, Service}, State) -> case Service of - undefined -> - ok; - _ -> - logger:debug("Waiting for service: ~p", [Service]), - riak_core:wait_for_service(Service) + undefined -> ok; + _ -> + logger:debug("Waiting for service: ~p", [Service]), + riak_core:wait_for_service(Service) end, {noreply, State}; -handle_cast(Req=?VNODE_REQ{index=Idx}, State=#state{vnode_mod=Mod}) -> +handle_cast(Req = #riak_vnode_req_v1{index = Idx}, + State = #state{vnode_mod = Mod}) -> Proxy = riak_core_vnode_proxy:reg_name(Mod, Idx), - gen_fsm:send_event(Proxy, Req), + riak_core_vnode:send_req(Proxy, Req), {noreply, State}; -handle_cast(Req=?COVERAGE_REQ{index=Idx}, State=#state{vnode_mod=Mod}) -> +handle_cast(Req = #riak_coverage_req_v1{index = Idx}, + State = #state{vnode_mod = Mod}) -> Proxy = riak_core_vnode_proxy:reg_name(Mod, Idx), - gen_fsm:send_event(Proxy, Req), - {noreply, State}; -handle_cast(Other, State=#state{legacy=Legacy}) when Legacy =/= undefined -> - case catch Legacy:rewrite_cast(Other) of - {ok, ?VNODE_REQ{}=Req} -> - handle_cast(Req, State); - _ -> - {noreply, State} - end. + riak_core_vnode:send_req(Proxy, Req), + {noreply, State}. 
-handle_call({return_vnode, Req=?VNODE_REQ{index=Idx}}, _From, - State=#state{vnode_mod=Mod}) -> +handle_call({return_vnode, + Req = #riak_vnode_req_v1{index = Idx}}, + _From, State = #state{vnode_mod = Mod}) -> {ok, Pid} = - riak_core_vnode_proxy:command_return_vnode({Mod,Idx,node()}, Req), + riak_core_vnode_proxy:command_return_vnode({Mod, Idx, + node()}, + Req), {reply, {ok, Pid}, State}; -handle_call(Req=?VNODE_REQ{index=Idx, sender={server, undefined, undefined}}, - From, State=#state{vnode_mod=Mod}) -> +handle_call(Req = #riak_vnode_req_v1{index = Idx, + sender = {server, undefined, undefined}}, + From, State = #state{vnode_mod = Mod}) -> Proxy = riak_core_vnode_proxy:reg_name(Mod, Idx), - gen_fsm:send_event(Proxy, Req?VNODE_REQ{sender={server, undefined, From}}), + riak_core_vnode:send_req(Proxy, + Req#riak_vnode_req_v1{sender = + {server, undefined, + From}}), {noreply, State}; handle_call({spawn, - Req=?VNODE_REQ{index=Idx, sender={server, undefined, undefined}}}, - From, State=#state{vnode_mod=Mod}) -> + Req = #riak_vnode_req_v1{index = Idx, + sender = {server, undefined, undefined}}}, + From, State = #state{vnode_mod = Mod}) -> Proxy = riak_core_vnode_proxy:reg_name(Mod, Idx), Sender = {server, undefined, From}, - spawn_link( - fun() -> gen_fsm:send_all_state_event(Proxy, Req?VNODE_REQ{sender=Sender}) end), - {noreply, State}; -handle_call(Other, From, State=#state{legacy=Legacy}) when Legacy =/= undefined -> - case catch Legacy:rewrite_call(Other, From) of - {ok, ?VNODE_REQ{}=Req} -> - handle_call(Req, From, State); - _ -> - {noreply, State} - end. - -handle_info(_Info, State) -> + spawn_link(fun () -> + riak_core_vnode:send_all_proxy_req(Proxy, + Req#riak_vnode_req_v1{sender + = + Sender}) + end), {noreply, State}. +handle_info(_Info, State) -> {noreply, State}. + %% @private -terminate(_Reason, _State) -> - ok. +terminate(_Reason, _State) -> ok. %% @private -code_change(_OldVsn, State, _Extra) -> {ok, State}. 
+code_change(_OldVsn, State, _Extra) -> {ok, State}. diff --git a/src/riak_core_vnode_proxy.erl b/src/riak_core_vnode_proxy.erl index 571707b95..76dc8b7e6 100644 --- a/src/riak_core_vnode_proxy.erl +++ b/src/riak_core_vnode_proxy.erl @@ -17,9 +17,13 @@ %% %% ------------------------------------------------------------------- -module(riak_core_vnode_proxy). --export([start_link/2, init/1, reg_name/2, reg_name/3, call/2, call/3, cast/2, - unregister_vnode/3, command_return_vnode/2, overloaded/1]). --export([system_continue/3, system_terminate/4, system_code_change/4]). + +-export([start_link/2, init/1, reg_name/2, reg_name/3, + call/2, call/3, cast/2, unregister_vnode/3, + command_return_vnode/2, overloaded/1]). + +-export([system_continue/3, system_terminate/4, + system_code_change/4]). -include("riak_core_vnode.hrl"). @@ -27,28 +31,31 @@ [{gen_fsm, send_event, 2}]}). -ifdef(TEST). + -include_lib("eunit/include/eunit.hrl"). + -endif. --record(state, {mod :: atom(), - index :: partition(), - vnode_pid :: pid() | undefined, - vnode_mref :: reference() | undefined, - check_mailbox :: non_neg_integer(), - check_threshold :: pos_integer() | undefined, - check_counter :: non_neg_integer(), - check_interval :: pos_integer(), - check_request_interval :: non_neg_integer(), - check_request :: undefined | sent | ignore - }). +-record(state, + {mod :: atom(), index :: partition(), + vnode_pid :: pid() | undefined, + vnode_mref :: reference() | undefined, + check_mailbox :: non_neg_integer(), + check_threshold :: pos_integer() | undefined, + check_counter :: non_neg_integer(), + check_interval :: pos_integer(), + check_request_interval :: non_neg_integer(), + check_request :: undefined | sent | ignore}). -define(DEFAULT_CHECK_INTERVAL, 5000). + -define(DEFAULT_OVERLOAD_THRESHOLD, 10000). 
-reg_name(Mod, Index) -> - ModBin = atom_to_binary(Mod, latin1), +reg_name(Module, Index) -> + ModBin = atom_to_binary(Module, latin1), IdxBin = list_to_binary(integer_to_list(Index)), - AllBin = <<$p,$r,$o,$x,$y,$_, ModBin/binary, $_, IdxBin/binary>>, + AllBin = <<$p, $r, $o, $x, $y, $_, ModBin/binary, $_, + IdxBin/binary>>, binary_to_atom(AllBin, latin1). reg_name(Mod, Index, Node) -> @@ -56,79 +63,78 @@ reg_name(Mod, Index, Node) -> start_link(Mod, Index) -> RegName = reg_name(Mod, Index), - proc_lib:start_link(?MODULE, init, [[self(), RegName, Mod, Index]]). + proc_lib:start_link(?MODULE, init, + [[self(), RegName, Mod, Index]]). -init([Parent, RegName, Mod, Index]) -> +init([Parent, RegName, Module, Index]) -> erlang:register(RegName, self()), proc_lib:init_ack(Parent, {ok, self()}), - Interval = application:get_env(riak_core, - vnode_check_interval, - ?DEFAULT_CHECK_INTERVAL), + vnode_check_interval, + ?DEFAULT_CHECK_INTERVAL), RequestInterval = application:get_env(riak_core, - vnode_check_request_interval, - Interval div 2), + vnode_check_request_interval, + Interval div 2), Threshold = application:get_env(riak_core, - vnode_overload_threshold, - ?DEFAULT_OVERLOAD_THRESHOLD), - - SafeInterval = - case (Threshold == undefined) orelse (Interval < Threshold) of - true -> - Interval; - false -> - logger:warning("Setting riak_core/vnode_check_interval to ~b", - [Threshold div 2]), - Threshold div 2 - end, - SafeRequestInterval = - case RequestInterval < SafeInterval of - true -> - RequestInterval; - false -> - logger:warning("Setting riak_core/vnode_check_request_interval " - "to ~b", [SafeInterval div 2]), - SafeInterval div 2 - end, - - State = #state{mod=Mod, - index=Index, - check_mailbox=0, - check_counter=0, - check_threshold=Threshold, - check_interval=SafeInterval, - check_request_interval=SafeRequestInterval}, + vnode_overload_threshold, + ?DEFAULT_OVERLOAD_THRESHOLD), + SafeInterval = case Threshold == undefined orelse + Interval < Threshold + of + true 
-> Interval; + false -> + logger:warning("Setting riak_core/vnode_check_interval " + "to ~b", + [Threshold div 2]), + Threshold div 2 + end, + SafeRequestInterval = case RequestInterval < + SafeInterval + of + true -> RequestInterval; + false -> + logger:warning("Setting riak_core/vnode_check_request_interva" + "l to ~b", + [SafeInterval div 2]), + SafeInterval div 2 + end, + State = #state{mod = Module, index = Index, + check_mailbox = 0, check_counter = 0, + check_threshold = Threshold, + check_interval = SafeInterval, + check_request_interval = SafeRequestInterval}, loop(Parent, State). unregister_vnode(Mod, Index, Pid) -> cast(reg_name(Mod, Index), {unregister_vnode, Pid}). --spec command_return_vnode({atom(), non_neg_integer(), atom()}, term()) -> - {ok, pid()} | {error, term()}. -command_return_vnode({Mod,Index,Node}, Req) -> +-spec command_return_vnode({atom(), non_neg_integer(), + atom()}, + term()) -> {ok, pid()} | {error, term()}. + +command_return_vnode({Mod, Index, Node}, Req) -> call(reg_name(Mod, Index, Node), {return_vnode, Req}). %% Return true if the next proxied message will return overload overloaded({Mod, Index, Node}) -> call(reg_name(Mod, Index, Node), overloaded); -overloaded(Pid) -> - call(Pid, overloaded). +overloaded(Pid) -> call(Pid, overloaded). call(Name, Msg) -> - call_reply(catch gen:call(Name, '$vnode_proxy_call', Msg)). + call_reply(catch gen:call(Name, '$vnode_proxy_call', + Msg)). call(Name, Msg, Timeout) -> - call_reply(catch gen:call(Name, '$vnode_proxy_call', Msg, Timeout)). + call_reply(catch gen:call(Name, '$vnode_proxy_call', + Msg, Timeout)). -spec call_reply({atom(), term()}) -> term(). -call_reply({ok, Res}) -> - Res; -call_reply({'EXIT', Reason}) -> - {error, Reason}. + +call_reply({ok, Res}) -> Res; +call_reply({'EXIT', Reason}) -> {error, Reason}. cast(Name, Msg) -> - catch erlang:send(Name, {'$vnode_proxy_cast', Msg}), - ok. + catch erlang:send(Name, {'$vnode_proxy_cast', Msg}), ok. 
system_continue(Parent, _, State) -> loop(Parent, State). @@ -136,75 +142,72 @@ system_continue(Parent, _, State) -> system_terminate(Reason, _Parent, _, _State) -> exit(Reason). -system_code_change(State, _, _, _) -> - {ok, State}. +system_code_change(State, _, _, _) -> {ok, State}. %% @private loop(Parent, State) -> receive - {'$vnode_proxy_call', From, Msg} -> - {reply, Reply, NewState} = handle_call(Msg, From, State), - {_, Reply} = gen:reply(From, Reply), - loop(Parent, NewState); - {'$vnode_proxy_cast', Msg} -> - {noreply, NewState} = handle_cast(Msg, State), - loop(Parent, NewState); - {'DOWN', _Mref, process, _Pid, _} -> - NewState = forget_vnode(State), - loop(Parent, NewState); - {system, From, Msg} -> - sys:handle_system_msg(Msg, From, Parent, ?MODULE, [], State); - Msg -> - {noreply, NewState} = handle_proxy(Msg, State), - loop(Parent, NewState) + {'$vnode_proxy_call', From, Msg} -> + {reply, Reply, NewState} = handle_call(Msg, From, + State), + {_, Reply} = gen:reply(From, Reply), + loop(Parent, NewState); + {'$vnode_proxy_cast', Msg} -> + {noreply, NewState} = handle_cast(Msg, State), + loop(Parent, NewState); + {'DOWN', _Mref, process, _Pid, _} -> + NewState = forget_vnode(State), loop(Parent, NewState); + {system, From, Msg} -> + sys:handle_system_msg(Msg, From, Parent, ?MODULE, [], + State); + Msg -> + {noreply, NewState} = handle_proxy(Msg, State), + loop(Parent, NewState) end. %% @private handle_call({return_vnode, Req}, _From, State) -> {Pid, NewState} = get_vnode_pid(State), - gen_fsm:send_event(Pid, Req), + riak_core_vnode:send_req(Pid, Req), {reply, {ok, Pid}, NewState}; -handle_call(overloaded, _From, State=#state{check_mailbox=Mailbox, - check_threshold=Threshold}) -> - Result = (Mailbox > Threshold), - {reply, Result, State}; -handle_call(_Msg, _From, State) -> - {reply, ok, State}. 
+handle_call(overloaded, _From, + State = #state{check_mailbox = Mailbox, + check_threshold = Threshold}) -> + Result = Mailbox > Threshold, {reply, Result, State}; +handle_call(_Msg, _From, State) -> {reply, ok, State}. %% @private handle_cast({unregister_vnode, Pid}, State) -> %% The pid may not match the vnode_pid in the state, but we must send the %% unregister event anyway -- the vnode manager requires it. - gen_fsm:send_event(Pid, unregistered), + riak_core_vnode:unregistered(Pid), catch demonitor(State#state.vnode_mref, [flush]), NewState = forget_vnode(State), {noreply, NewState}; -handle_cast({vnode_proxy_pong, Ref, Msgs}, State=#state{check_request=RequestState, - check_mailbox=Mailbox}) -> +handle_cast({vnode_proxy_pong, Ref, Msgs}, + State = #state{check_request = RequestState, + check_mailbox = Mailbox}) -> NewState = case Ref of - RequestState -> - State#state{check_mailbox=Mailbox - Msgs, - check_request=undefined, - check_counter=0}; - _ -> - State + RequestState -> + State#state{check_mailbox = Mailbox - Msgs, + check_request = undefined, check_counter = 0}; + _ -> State end, {noreply, NewState}; - -handle_cast(_Msg, State) -> - {noreply, State}. +handle_cast(_Msg, State) -> {noreply, State}. %% @private -handle_proxy(Msg, State=#state{check_threshold=undefined}) -> +handle_proxy(Msg, + State = #state{check_threshold = undefined}) -> {Pid, NewState} = get_vnode_pid(State), Pid ! 
Msg, {noreply, NewState}; -handle_proxy(Msg, State=#state{check_counter=Counter, - check_mailbox=Mailbox, - check_interval=Interval, - check_request_interval=RequestInterval, - check_request=RequestState, - check_threshold=Threshold}) -> +handle_proxy(Msg, + State = #state{check_counter = Counter, + check_mailbox = Mailbox, check_interval = Interval, + check_request_interval = RequestInterval, + check_request = RequestState, + check_threshold = Threshold}) -> %% %% NOTE: This function is intentionally written as it is for performance %% reasons -- the vnode proxy is on the critical path of Riak and @@ -222,225 +225,214 @@ handle_proxy(Msg, State=#state{check_counter=Counter, %% ensure unnecessary work is not being performed needlessly. %% case State#state.vnode_pid of - undefined -> - {Pid, State2} = get_vnode_pid(State); - KnownPid -> - Pid = KnownPid, - State2 = State + undefined -> {Pid, State2} = get_vnode_pid(State); + KnownPid -> Pid = KnownPid, State2 = State end, - Mailbox2 = case Mailbox =< Threshold of - true -> - Pid ! Msg, - Mailbox + 1; - false -> - handle_overload(Msg, State), - Mailbox + true -> Pid ! Msg, Mailbox + 1; + false -> handle_overload(Msg, State), Mailbox end, - Counter2 = Counter + 1, case Counter2 of - RequestInterval -> - %% Ping the vnode in hopes that we get a pong back before hitting - %% the hard query interval and triggering an expensive process_info - %% call. A successful pong from the vnode means that all messages - %% sent before the ping have already been handled and therefore - %% we can adjust our mailbox estimate accordingly. - case RequestState of - undefined -> - RequestState2 = send_proxy_ping(Pid, Mailbox2); - _ -> - RequestState2 = RequestState - end, - Mailbox3 = Mailbox2, - Counter3 = Counter2; - Interval -> - %% Time to directly check the mailbox size. This operation may - %% be extremely expensive. 
If the vnode is currently active, - %% the proxy will be descheduled until the vnode finishes - %% execution and becomes descheduled itself. - {_, L} = - erlang:process_info(Pid, message_queue_len), - Counter3 = 0, - Mailbox3 = L + 1, - %% Send a new proxy ping so that if the new length is above the - %% threshold then the proxy will detect the work is completed, - %% rather than being stuck in overload state until the interval - %% counts are reached. - RequestState2 = send_proxy_ping(Pid, Mailbox3); - _ -> - Mailbox3 = Mailbox2, - Counter3 = Counter2, - RequestState2 = RequestState + RequestInterval -> + %% Ping the vnode in hopes that we get a pong back before hitting + %% the hard query interval and triggering an expensive process_info + %% call. A successful pong from the vnode means that all messages + %% sent before the ping have already been handled and therefore + %% we can adjust our mailbox estimate accordingly. + case RequestState of + undefined -> + RequestState2 = send_proxy_ping(Pid, Mailbox2); + _ -> RequestState2 = RequestState + end, + Mailbox3 = Mailbox2, + Counter3 = Counter2; + Interval -> + %% Time to directly check the mailbox size. This operation may + %% be extremely expensive. If the vnode is currently active, + %% the proxy will be descheduled until the vnode finishes + %% execution and becomes descheduled itself. + {_, L} = erlang:process_info(Pid, message_queue_len), + Counter3 = 0, + Mailbox3 = L + 1, + %% Send a new proxy ping so that if the new length is above the + %% threshold then the proxy will detect the work is completed, + %% rather than being stuck in overload state until the interval + %% counts are reached. + RequestState2 = send_proxy_ping(Pid, Mailbox3); + _ -> + Mailbox3 = Mailbox2, + Counter3 = Counter2, + RequestState2 = RequestState end, - {noreply, State2#state{check_counter=Counter3, - check_mailbox=Mailbox3, - check_request=RequestState2}}. 
+ {noreply, + State2#state{check_counter = Counter3, + check_mailbox = Mailbox3, + check_request = RequestState2}}. -handle_overload(Msg, #state{mod=Mod, index=Index}) -> +handle_overload(Msg, + #state{mod = Module, index = Index}) -> %% STATS %riak_core_stat:update(dropped_vnode_requests), case Msg of - {'$gen_event', ?VNODE_REQ{sender=Sender, request=Request}} -> - catch(Mod:handle_overload_command(Request, Sender, Index)); - {'$gen_all_state_event', ?VNODE_REQ{sender=Sender, request=Request}} -> - catch(Mod:handle_overload_command(Request, Sender, Index)); - {'$gen_event', ?COVERAGE_REQ{sender=Sender, request=Request}} -> - catch(Mod:handle_overload_command(Request, Sender, Index)); - _ -> - catch(Mod:handle_overload_info(Msg, Index)) + {'$gen_event', + #riak_vnode_req_v1{sender = Sender, + request = Request}} -> + catch Module:handle_overload_command(Request, Sender, + Index); + {'$gen_all_state_event', + #riak_vnode_req_v1{sender = Sender, + request = Request}} -> + catch Module:handle_overload_command(Request, Sender, + Index); + {'$gen_event', + #riak_coverage_req_v1{sender = Sender, + request = Request}} -> + catch Module:handle_overload_command(Request, Sender, + Index); + _ -> catch Module:handle_overload_info(Msg, Index) end. %% @private forget_vnode(State) -> - State#state{vnode_pid=undefined, - vnode_mref=undefined, - check_mailbox=0, - check_counter=0, - check_request=undefined}. + State#state{vnode_pid = undefined, + vnode_mref = undefined, check_mailbox = 0, + check_counter = 0, check_request = undefined}. 
%% @private -get_vnode_pid(State=#state{mod=Mod, index=Index, vnode_pid=undefined}) -> - {ok, Pid} = riak_core_vnode_manager:get_vnode_pid(Index, Mod), +get_vnode_pid(State = #state{mod = Module, + index = Index, vnode_pid = undefined}) -> + {ok, Pid} = riak_core_vnode_manager:get_vnode_pid(Index, + Module), Mref = erlang:monitor(process, Pid), - NewState = State#state{vnode_pid=Pid, vnode_mref=Mref}, + NewState = State#state{vnode_pid = Pid, + vnode_mref = Mref}, {Pid, NewState}; -get_vnode_pid(State=#state{vnode_pid=Pid}) -> +get_vnode_pid(State = #state{vnode_pid = Pid}) -> {Pid, State}. %% @private send_proxy_ping(Pid, MailboxSizeAfterPing) -> Ref = make_ref(), - Pid ! {'$vnode_proxy_ping', self(), Ref, MailboxSizeAfterPing}, + Pid ! + {'$vnode_proxy_ping', self(), Ref, + MailboxSizeAfterPing}, Ref. -ifdef(TEST). update_msg_counter() -> Count = case erlang:get(count) of - undefined -> 0; - Val -> Val - end, - put(count, Count+1). + undefined -> 0; + Val -> Val + end, + put(count, Count + 1). fake_loop() -> receive - block -> - fake_loop_block(); - slow -> - fake_loop_slow(); - {get_count, Pid} -> - Pid ! {count, erlang:get(count)}, - fake_loop(); - %% Original tests do not expect replies - the - %% results below expect the pings to be counted - %% towards messages received. If you ever wanted - %% to re-instance, uncomment below. - %% {'$vnode_proxy_ping', ReplyTo, Ref, Msgs} -> - %% ReplyTo ! {Ref, Msgs}, - %% fake_loop(); - _Msg -> - update_msg_counter(), - fake_loop() + block -> fake_loop_block(); + slow -> fake_loop_slow(); + {get_count, Pid} -> + Pid ! {count, erlang:get(count)}, fake_loop(); + %% Original tests do not expect replies - the + %% results below expect the pings to be counted + %% towards messages received. If you ever wanted + %% to re-instance, uncomment below. + %% {'$vnode_proxy_ping', ReplyTo, Ref, Msgs} -> + %% ReplyTo ! {Ref, Msgs}, + %% fake_loop(); + _Msg -> update_msg_counter(), fake_loop() end. 
fake_loop_slow() -> timer:sleep(100), receive - _Msg -> - update_msg_counter(), - fake_loop_slow() + _Msg -> update_msg_counter(), fake_loop_slow() end. -fake_loop_block() -> - receive - unblock -> - fake_loop() - end. +fake_loop_block() -> receive unblock -> fake_loop() end. overload_test_() -> - {timeout, 900, {foreach, - fun() -> - VnodePid = spawn(fun fake_loop/0), - meck:unload(), - meck:new(riak_core_vnode_manager, [passthrough]), - meck:expect(riak_core_vnode_manager, get_vnode_pid, - fun(_Index, fakemod) -> {ok, VnodePid}; - (Index, Mod) -> meck:passthrough([Index, Mod]) - end), - meck:new(fakemod, [non_strict]), - meck:expect(fakemod, handle_overload_info, fun(hello, _Idx) -> - ok - end), - - {ok, ProxyPid} = riak_core_vnode_proxy:start_link(fakemod, 0), - unlink(ProxyPid), - {VnodePid, ProxyPid} - end, - fun({VnodePid, ProxyPid}) -> - unlink(VnodePid), - unlink(ProxyPid), - exit(VnodePid, kill), - exit(ProxyPid, kill) - end, - [ - fun({_VnodePid, ProxyPid}) -> - {"should not discard in normal operation", timeout, 60, - fun() -> - ToSend = ?DEFAULT_OVERLOAD_THRESHOLD, - [ProxyPid ! hello || _ <- lists:seq(1, ToSend)], - - %% synchronize on the proxy and the mailbox - {ok, ok} = gen:call(ProxyPid, '$vnode_proxy_call', sync, infinity), - ProxyPid ! 
{get_count, self()}, - receive - {count, Count} -> - %% First will hit the request check interval, - %% then will check message queue every interval - %% (no new ping will be resubmitted after the first - %% as the request will already have a reference) - PingReqs = 1 + % for first request intarval - ToSend div ?DEFAULT_CHECK_INTERVAL, - ?assertEqual(ToSend+PingReqs, Count) - end - end - } + {timeout, 900, + {foreach, + fun () -> + VnodePid = spawn(fun fake_loop/0), + meck:unload(), + meck:new(riak_core_vnode_manager, [passthrough]), + meck:expect(riak_core_vnode_manager, get_vnode_pid, + fun (_Index, fakemod) -> {ok, VnodePid}; + (Index, Mod) -> meck:passthrough([Index, Mod]) + end), + meck:new(fakemod, [non_strict]), + meck:expect(fakemod, handle_overload_info, + fun (hello, _Idx) -> ok end), + {ok, ProxyPid} = + riak_core_vnode_proxy:start_link(fakemod, 0), + unlink(ProxyPid), + {VnodePid, ProxyPid} end, - fun({VnodePid, ProxyPid}) -> - {"should discard during overflow", timeout, 60, - fun() -> - VnodePid ! block, - [ProxyPid ! hello || _ <- lists:seq(1, 50000)], - %% synchronize on the mailbox - no-op that hits msg catchall - Reply = gen:call(ProxyPid, '$vnode_proxy_call', sync, infinity), - ?assertEqual({ok, ok}, Reply), - VnodePid ! unblock, - VnodePid ! {get_count, self()}, - receive - {count, Count} -> - %% Threshold + 10 unanswered vnode_proxy_ping - ?assertEqual(?DEFAULT_OVERLOAD_THRESHOLD + 10, Count) - end - end - } + fun ({VnodePid, ProxyPid}) -> + unlink(VnodePid), + unlink(ProxyPid), + exit(VnodePid, kill), + exit(ProxyPid, kill) end, - fun({VnodePid, ProxyPid}) -> - {"should tolerate slow vnodes", timeout, 60, - fun() -> - VnodePid ! slow, - [ProxyPid ! 
hello || _ <- lists:seq(1, 50000)], - %% synchronize on the mailbox - no-op that hits msg catchall - Reply = gen:call(ProxyPid, '$vnode_proxy_call', sync, infinity), - ?assertEqual({ok, ok}, Reply), - %% check that the outstanding message count is - %% reasonable - {message_queue_len, L} = - erlang:process_info(VnodePid, message_queue_len), - %% Threshold + 2 unanswered vnode_proxy_ping (one - %% for first ping, second after process_info check) - ?assert(L =< (?DEFAULT_OVERLOAD_THRESHOLD + 2)) - end - } - end - ]}}. + [fun ({_VnodePid, ProxyPid}) -> + {"should not discard in normal operation", timeout, 60, + fun () -> + ToSend = (?DEFAULT_OVERLOAD_THRESHOLD), + [ProxyPid ! hello || _ <- lists:seq(1, ToSend)], + %% synchronize on the proxy and the mailbox + {ok, ok} = gen:call(ProxyPid, '$vnode_proxy_call', sync, + infinity), + ProxyPid ! {get_count, self()}, + receive + {count, Count} -> + %% First will hit the request check interval, + %% then will check message queue every interval + %% (no new ping will be resubmitted after the first + %% as the request will already have a reference) + PingReqs = 1 + + % for first request intarval + ToSend div (?DEFAULT_CHECK_INTERVAL), + ?assertEqual((ToSend + PingReqs), Count) + end + end} + end, + fun ({VnodePid, ProxyPid}) -> + {"should discard during overflow", timeout, 60, + fun () -> + VnodePid ! block, + [ProxyPid ! hello || _ <- lists:seq(1, 50000)], + %% synchronize on the mailbox - no-op that hits msg catchall + Reply = gen:call(ProxyPid, '$vnode_proxy_call', sync, + infinity), + ?assertEqual({ok, ok}, Reply), + VnodePid ! unblock, + VnodePid ! {get_count, self()}, + receive + {count, Count} -> + %% Threshold + 10 unanswered vnode_proxy_ping + ?assertEqual(((?DEFAULT_OVERLOAD_THRESHOLD) + 10), + Count) + end + end} + end, + fun ({VnodePid, ProxyPid}) -> + {"should tolerate slow vnodes", timeout, 60, + fun () -> + VnodePid ! slow, + [ProxyPid ! 
hello || _ <- lists:seq(1, 50000)], + %% synchronize on the mailbox - no-op that hits msg catchall + Reply = gen:call(ProxyPid, '$vnode_proxy_call', sync, + infinity), + ?assertEqual({ok, ok}, Reply), + %% check that the outstanding message count is reasonable + {message_queue_len, L} = erlang:process_info(VnodePid, + message_queue_len), + %% Threshold + (at most) 10 unanswered vnode_proxy_ping + ?assert((L =< (?DEFAULT_OVERLOAD_THRESHOLD) + 10)) + end} + end]}}. + -endif. diff --git a/src/riak_core_vnode_proxy_sup.erl b/src/riak_core_vnode_proxy_sup.erl index ab842eb22..2478769a6 100644 --- a/src/riak_core_vnode_proxy_sup.erl +++ b/src/riak_core_vnode_proxy_sup.erl @@ -17,8 +17,11 @@ %% %% ------------------------------------------------------------------- -module(riak_core_vnode_proxy_sup). + -behaviour(supervisor). + -export([start_link/0, init/1]). + -export([start_proxy/2, stop_proxy/2, start_proxies/1]). start_link() -> @@ -29,15 +32,15 @@ init([]) -> %% modules. Ensures restart of proxies after a crash of this supervisor. Indices = get_indices(), VMods = riak_core:vnode_modules(), - Proxies = [proxy_ref(Mod, Index) || {_, Mod} <- VMods, - Index <- Indices], + Proxies = [proxy_ref(Mod, Index) + || {_, Mod} <- VMods, Index <- Indices], {ok, {{one_for_one, 5, 10}, Proxies}}. start_proxy(Mod, Index) -> Ref = proxy_ref(Mod, Index), Pid = case supervisor:start_child(?MODULE, Ref) of - {ok, Child} -> Child; - {error, {already_started, Child}} -> Child + {ok, Child} -> Child; + {error, {already_started, Child}} -> Child end, Pid. @@ -54,7 +57,8 @@ start_proxies(Mod) -> %% @private proxy_ref(Mod, Index) -> - {{Mod, Index}, {riak_core_vnode_proxy, start_link, [Mod, Index]}, + {{Mod, Index}, + {riak_core_vnode_proxy, start_link, [Mod, Index]}, permanent, 5000, worker, [riak_core_vnode_proxy]}. 
%% @private diff --git a/src/riak_core_vnode_sup.erl b/src/riak_core_vnode_sup.erl index 661a930f8..29637df14 100644 --- a/src/riak_core_vnode_sup.erl +++ b/src/riak_core_vnode_sup.erl @@ -23,12 +23,17 @@ %% @doc supervise riak_vnode processes -module(riak_core_vnode_sup). + -behaviour(supervisor). + -export([start_link/0, init/1]). + -export([start_vnode/3]). -start_vnode(Mod, Index, ForwardTo) when is_integer(Index) -> - supervisor:start_child(?MODULE, [Mod, Index, ForwardTo]). +start_vnode(Mod, Index, ForwardTo) + when is_integer(Index) -> + supervisor:start_child(?MODULE, + [Mod, Index, ForwardTo]). start_link() -> %% This simple_one_for_one supervisor can do a controlled shutdown. @@ -39,8 +44,7 @@ start_link() -> %% @private init([]) -> - {ok, - {{simple_one_for_one, 10, 10}, - [{undefined, - {riak_core_vnode, start_link, []}, - temporary, 300000, worker, dynamic}]}}. + {ok, + {{simple_one_for_one, 10, 10}, + [{undefined, {riak_core_vnode, start_link, []}, + temporary, 300000, worker, dynamic}]}}. diff --git a/src/riak_core_vnode_worker.erl b/src/riak_core_vnode_worker.erl index de3cf4221..293a40605 100644 --- a/src/riak_core_vnode_worker.erl +++ b/src/riak_core_vnode_worker.erl @@ -23,65 +23,71 @@ -include("riak_core_vnode.hrl"). % gen_server callbacks --export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). +-export([init/1, handle_call/3, handle_cast/2, + handle_info/2, terminate/2, code_change/3]). % API -export([start_link/1, handle_work/3, handle_work/4]). -type mod_state() :: term(). --record(state, { - module :: atom(), - modstate :: mod_state() -}). +-record(state, + {module :: atom(), modstate :: mod_state()}). --callback init_worker(partition(), Args :: term(), Props :: [{atom(), term()}]) -> {ok, mod_state()}. --callback handle_work(Work :: term(), sender(), mod_state()) -> - {reply, Reply :: term(), mod_state()} | - {noreply, mod_state()}. 
+-callback init_worker(partition(), Args :: term(), + Props :: [{atom(), term()}]) -> {ok, mod_state()}. -start_link(Args) -> - WorkerMod = proplists:get_value(worker_callback_mod, Args), - [VNodeIndex, WorkerArgs, WorkerProps, Caller] = proplists:get_value(worker_args, Args), - gen_server:start_link(?MODULE, [WorkerMod, VNodeIndex, WorkerArgs, WorkerProps, Caller], []). +-callback handle_work(Work :: term(), sender(), + mod_state()) -> {reply, Reply :: term(), mod_state()} | + {noreply, mod_state()}. +start_link(Args) -> + WorkerMod = proplists:get_value(worker_callback_mod, + Args), + [VNodeIndex, WorkerArgs, WorkerProps, Caller] = + proplists:get_value(worker_args, Args), + gen_server:start_link(?MODULE, + [WorkerMod, VNodeIndex, WorkerArgs, WorkerProps, + Caller], + []). handle_work(Worker, Work, From) -> handle_work(Worker, Work, From, self()). - handle_work(Worker, Work, From, Caller) -> gen_server:cast(Worker, {work, Work, From, Caller}). - -init([Module, VNodeIndex, WorkerArgs, WorkerProps, Caller]) -> - {ok, WorkerState} = Module:init_worker(VNodeIndex, WorkerArgs, WorkerProps), +init([Module, VNodeIndex, WorkerArgs, WorkerProps, + Caller]) -> + {ok, WorkerState} = Module:init_worker(VNodeIndex, + WorkerArgs, WorkerProps), %% let the pool queue manager know there might be a worker to checkout riak_core_vnode_worker_pool:worker_started(Caller), {ok, #state{module = Module, modstate = WorkerState}}. - handle_call(Event, _From, State) -> - logger:debug("Vnode worker received synchronous event: ~p.", [Event]), + logger:debug("Vnode worker received synchronous event: " + "~p.", + [Event]), {reply, ok, State}. 
- handle_cast({work, Work, WorkFrom, Caller}, - #state{module = Mod, modstate = ModState} = State) -> - NewModState = case Mod:handle_work(Work, WorkFrom, ModState) of - {reply, Reply, NS} -> - riak_core_vnode:reply(WorkFrom, Reply), - NS; - {noreply, NS} -> - NS + #state{module = Module, modstate = ModState} = State) -> + NewModState = case Module:handle_work(Work, WorkFrom, + ModState) + of + {reply, Reply, NS} -> + riak_core_vnode:reply(WorkFrom, Reply), NS; + {noreply, NS} -> NS end, %% check the worker back into the pool - riak_core_vnode_worker_pool:checkin_worker(Caller, self()), + riak_core_vnode_worker_pool:checkin_worker(Caller, + self()), {noreply, State#state{modstate = NewModState}}; - handle_cast(_Event, State) -> {noreply, State}. - handle_info(_Info, State) -> {noreply, State}. + terminate(_Reason, _State) -> ok. + code_change(_OldVsn, State, _Extra) -> {ok, State}. diff --git a/src/riak_core_vnode_worker_pool.erl b/src/riak_core_vnode_worker_pool.erl index 7b8134f86..b750c929f 100644 --- a/src/riak_core_vnode_worker_pool.erl +++ b/src/riak_core_vnode_worker_pool.erl @@ -39,259 +39,323 @@ %% confuse (or cause a race) with this module's checkout management. -module(riak_core_vnode_worker_pool). --behaviour(gen_fsm_compat). - -%% gen_fsm_compat callbacks --export([init/1, handle_event/3, handle_sync_event/4, handle_info/3, - terminate/3, code_change/4]). - -%% gen_fsm_compat states --export([ready/2, queueing/2, ready/3, queueing/3, shutdown/2, shutdown/3]). +-behaviour(gen_statem). %% API --export([start_link/6, start_link/5, stop/2, shutdown_pool/2, handle_work/3, worker_started/1, checkin_worker/2]). - --ifdef(PULSE). --compile(export_all). --compile({parse_transform, pulse_instrument}). --compile({pulse_replace_module, [{gen_fsm_compat, pulse_gen_fsm}]}). --endif. 
- --record(state, { - queue :: queue:queue() | list(), - pool :: pid(), - monitors = [] :: list(), - queue_strategy = fifo :: fifo | filo, - shutdown :: undefined | {pid(), reference()} - }). +-export([start_link/5, start_link/6, stop/2, + shutdown_pool/2, handle_work/3, worker_started/1, + checkin_worker/2]). --type pool_opt() :: - {strategy, fifo | filo}. +%% gen_statem callbacks +-export([init/1, terminate/3, code_change/4, + callback_mode/0]). +%% gen_statem states +-export([ready/3, queue/3, shutdown/3]). -start_link(WorkerMod, PoolSize, VNodeIndex, WorkerArgs, WorkerProps) -> - start_link(WorkerMod, PoolSize, VNodeIndex, WorkerArgs, WorkerProps, []). +%% ======== +%% API +%% ======== --spec start_link(atom(), pos_integer(), pos_integer(), term(), term(), - [pool_opt()]) -> - {ok, pid()}. +start_link(WorkerMod, PoolSize, VNodeIndex, WorkerArgs, + WorkerProps) -> + start_link(WorkerMod, PoolSize, VNodeIndex, WorkerArgs, + WorkerProps, []). -start_link(WorkerMod, PoolSize, VNodeIndex, WorkerArgs, WorkerProps, Opts) -> - gen_fsm_compat:start_link(?MODULE, [WorkerMod, PoolSize, VNodeIndex, WorkerArgs, - WorkerProps, Opts], []). +start_link(WorkerMod, PoolSize, VNodeIndex, WorkerArgs, + WorkerProps, Opts) -> + gen_statem:start_link(?MODULE, + [WorkerMod, PoolSize, VNodeIndex, WorkerArgs, + WorkerProps, Opts], + []). +% #1 cast handle_work(Pid, Work, From) -> - gen_fsm_compat:send_event(Pid, {work, Work, From}). - -stop(Pid, Reason) -> - gen_fsm_compat:sync_send_all_state_event(Pid, {stop, Reason}). + gen_statem:cast(Pid, {work, Work, From}). +% #2 cast worker_started(Pid) -> - gen_fsm_compat:send_all_state_event(Pid, worker_start). + gen_statem:cast(Pid, worker_start). +% #3 cast checkin_worker(Pid, WorkerPid) -> - gen_fsm_compat:send_all_state_event(Pid, {checkin, WorkerPid}). + gen_statem:cast(Pid, {checkin, WorkerPid}). + +% #4 call +stop(Pid, Reason) -> + gen_statem:stop(Pid, Reason, infinity). 
+% #5 call %% wait for all the workers to finish any current work +-spec shutdown_pool(pid(), integer()) -> ok | + {error, vnode_shutdown}. + shutdown_pool(Pid, Wait) -> - gen_fsm_compat:sync_send_all_state_event(Pid, {shutdown, Wait}, infinity). - -init([WorkerMod, PoolSize, VNodeIndex, WorkerArgs, WorkerProps, Opts]) -> - {ok, Pid} = poolboy:start_link([{worker_module, riak_core_vnode_worker}, - {worker_args, [VNodeIndex, WorkerArgs, WorkerProps, self()]}, - {worker_callback_mod, WorkerMod}, - {size, PoolSize}, {max_overflow, 0}]), - DfltStrategy = application:get_env(riak_core, queue_worker_strategy, fifo), - State = case proplists:get_value(strategy, Opts, DfltStrategy) of - fifo -> - #state{ - pool = Pid, - queue = queue:new(), - queue_strategy = fifo - }; - filo -> - #state{ - pool = Pid, - queue = [], - queue_strategy = filo - } - end, + gen_statem:call(Pid, {shutdown, Wait}, infinity). + +%% ======================== +%% ======== +%% State, Mode, Init, Terminate +%% ======== +%% ======================== + +-record(state, + {queue :: queue:queue() | list(), pool :: pid(), + monitors = [] :: list(), + queue_strategy = fifo :: fifo | filo, + shutdown :: undefined | {pid(), reference()}}). + +callback_mode() -> [state_functions, state_enter]. + +init([WorkerMod, PoolSize, VNodeIndex, WorkerArgs, + WorkerProps, Opts]) -> + {ok, Pid} = poolboy:start_link([{worker_module, + riak_core_vnode_worker}, + {worker_args, + [VNodeIndex, WorkerArgs, WorkerProps, + self()]}, + {worker_callback_mod, WorkerMod}, + {size, PoolSize}, {max_overflow, 0}]), + DefaultStrategy = application:get_env(riak_core, + queue_worker_strategy, fifo), + State = case proplists:get_value(strategy, Opts, + DefaultStrategy) + of + fifo -> + #state{pool = Pid, queue = queue:new(), + queue_strategy = fifo}; + filo -> + #state{pool = Pid, queue = [], queue_strategy = filo} + end, {ok, ready, State}. -ready(_Event, _From, State) -> - {reply, ok, ready, State}. 
- -ready({work, Work, From} = Msg, #state{pool=Pool, monitors=Monitors} = State) -> - case poolboy:checkout(Pool, false) of - full -> - {next_state, queueing, in(Msg, State)}; - Pid when is_pid(Pid) -> - NewMonitors = monitor_worker(Pid, From, Work, Monitors), - riak_core_vnode_worker:handle_work(Pid, Work, From), - {next_state, ready, State#state{monitors=NewMonitors}} - end; -ready(_Event, State) -> - {next_state, ready, State}. - -queueing(_Event, _From, State) -> - {reply, ok, queueing, State}. +% #4 call +terminate(_Reason, _StateName, #state{pool = Pool}) -> + %% stop poolboy + poolboy:stop(Pool), + ok. -queueing({work, _Work, _From} = Msg, State) -> - {next_state, queueing, in(Msg, State)}; -queueing(_Event, State) -> - {next_state, queueing, State}. +code_change(_OldVsn, StateName, State, _Extra) -> + {ok, StateName, State}. -shutdown(_Event, _From, State) -> - {reply, ok, shutdown, State}. +%% ======================== +%% ======== +%% States +%% ======== +%% ======================== -shutdown({work, _Work, From}, State) -> - %% tell the process requesting work that we're shutting down - riak_core_vnode:reply(From, {error, vnode_shutdown}), - {next_state, shutdown, State}; -shutdown(_Event, State) -> - {next_state, shutdown, State}. +%% ready +%% ======== -handle_event({checkin, Pid}, shutdown, #state{pool=Pool, monitors=Monitors0} = State) -> - Monitors = demonitor_worker(Pid, Monitors0), - poolboy:checkin(Pool, Pid), - case Monitors of - [] -> %% work all done, time to exit! 
- {stop, shutdown, State}; - _ -> - {next_state, shutdown, State#state{monitors=Monitors}} - end; -handle_event({checkin, Worker}, _, #state{pool = Pool, monitors=Monitors} = State) -> - case out(State) of - {{value, {work, Work, From}}, Rem} -> - %% there is outstanding work to do - instead of checking - %% the worker back in, just hand it more work to do - NewMonitors = monitor_worker(Worker, From, Work, Monitors), - riak_core_vnode_worker:handle_work(Worker, Work, From), - {next_state, queueing, State#state{queue=Rem, - monitors=NewMonitors}}; - {empty, Empty} -> - NewMonitors = demonitor_worker(Worker, Monitors), - poolboy:checkin(Pool, Worker), - {next_state, ready, State#state{queue=Empty, monitors=NewMonitors}} - end; -handle_event(worker_start, StateName, #state{pool=Pool, monitors=Monitors}=State) -> - %% a new worker just started - if we have work pending, try to do it - case out(State) of - {{value, {work, Work, From}}, Rem} -> - case poolboy:checkout(Pool, false) of - full -> - {next_state, queueing, State}; - Pid when is_pid(Pid) -> - NewMonitors = monitor_worker(Pid, From, Work, Monitors), - riak_core_vnode_worker:handle_work(Pid, Work, From), - {next_state, queueing, State#state{queue=Rem, monitors=NewMonitors}} - end; - {empty, _} -> - %% StateName might be either 'ready' or 'shutdown' - {next_state, StateName, State} +%% enter +ready(enter, _, State) -> {keep_state, State}; +%% #1 +ready(cast, {work, Work, From} = Msg, + #state{pool = Pool, monitors = Monitors} = State) -> + case poolboy:checkout(Pool, false) of + full -> {next_state, queue, in(Msg, State)}; + Pid when is_pid(Pid) -> + NewMonitors = monitor_worker(Pid, From, Work, Monitors), + riak_core_vnode_worker:handle_work(Pid, Work, From), + {next_state, ready, State#state{monitors = NewMonitors}} end; -handle_event(_Event, StateName, State) -> - {next_state, StateName, State}. 
- -handle_sync_event({stop, Reason}, _From, _StateName, State) -> - {stop, Reason, ok, State}; - -handle_sync_event({shutdown, Time}, From, _StateName, - #state{monitors=Monitors} = State) -> +%% #2 +ready(cast, worker_start, State) -> + worker_started(State, ready); +%% #3 +ready(cast, {checkin, WorkerPid}, State) -> + checkin(State, WorkerPid); +%% #5 +ready({call, From}, {shutdown, Wait}, State) -> + %% change to shutdown state with a state_timeout of 'Wait' ms, force after timeout expires + {next_state, shutdown, State#state{shutdown = From}, + [{state_timeout, Wait, force_shutdown}]}; +%% info EXIT signal of erlang:monitor(process, Worker) +ready(info, {'DOWN', _Ref, _Type, Pid, Info}, State) -> + {ok, NewState} = exit_worker(State, Pid, Info), + {keep_state, NewState}. + +%% queueing +%% ======== + +%% enter +queue(enter, _, State) -> {keep_state, State}; +queue(cast, {work, _Work, _From} = Msg, State) -> + {next_state, queue, in(Msg, State)}; +%% #2 +queue(cast, worker_start, State) -> + worker_started(State, queue); +%% #3 +queue(cast, {checkin, WorkerPid}, State) -> + checkin(State, WorkerPid); +%% #5 +queue({call, From}, {shutdown, Wait}, State) -> + %% change to shutdown state with a state_timeout of 'Wait' ms, force after timeout expires + {next_state, shutdown, State#state{shutdown = From}, + [{state_timeout, Wait, force_shutdown}]}; +%% info EXIT signal of erlang:monitor(process, Worker) +queue(info, {'DOWN', _Ref, _Type, Pid, Info}, State) -> + {ok, NewState} = exit_worker(State, Pid, Info), + {keep_state, NewState}. 
+ +%% shutdown +%% ======== + +%% enter +shutdown(enter, _, + #state{monitors = Monitors, shutdown = From} = State) -> discard_queued_work(State), case Monitors of - [] -> - {stop, shutdown, ok, State}; - _ -> - case Time of - infinity -> - ok; - _ when is_integer(Time) -> - erlang:send_after(Time, self(), shutdown), - ok - end, - {next_state, shutdown, State#state{shutdown=From, queue=new(State)}} - end; -handle_sync_event(_Event, _From, StateName, State) -> - {reply, {error, unknown_message}, StateName, State}. - -handle_info({'DOWN', _Ref, _, Pid, Info}, StateName, #state{monitors=Monitors} = State) -> - %% remove the listing for the dead worker - case lists:keyfind(Pid, 1, Monitors) of - {Pid, _, From, Work} -> - riak_core_vnode:reply(From, {error, {worker_crash, Info, Work}}), - NewMonitors = lists:keydelete(Pid, 1, Monitors), - %% trigger to do more work will be 'worker_start' message - %% when poolboy replaces this worker (if not a 'checkin' - %% or 'handle_work') - {next_state, StateName, State#state{monitors=NewMonitors}}; - false -> - {next_state, StateName, State} + [] -> {stop_and_reply, shutdown, [{reply, From, ok}]}; + _ -> {keep_state, State#state{queue = new(State)}} end; -handle_info(shutdown, shutdown, #state{monitors=Monitors} = State) -> +%% force shutdown timeout +shutdown(state_timeout, _, + #state{monitors = Monitors, shutdown = FromOrigin}) -> %% we've waited too long to shutdown, time to force the issue. - _ = [riak_core_vnode:reply(From, {error, vnode_shutdown}) || - {_, _, From, _} <- Monitors], - {stop, shutdown, State}; -handle_info(_Info, StateName, State) -> - {next_state, StateName, State}. - -terminate(_Reason, _StateName, #state{pool=Pool}) -> - %% stop poolboy - poolboy:stop(Pool), - ok. - -code_change(_OldVsn, StateName, State, _Extra) -> - {ok, StateName, State}. 
+ _ = [riak_core_vnode:reply(From, + {error, vnode_shutdown}) + || {_, _, From, _} <- Monitors], + {stop_and_reply, shutdown, + [{reply, FromOrigin, {error, vnode_shutdown}}]}; +%% #1 +shutdown(cast, {work, _Work, From}, State) -> + riak_core_vnode:reply(From, {error, vnode_shutdown}), + {keep_state, State}; +%% #2 +shutdown(cast, worker_start, State) -> + worker_started(State, shutdown); +%% #3 +shutdown(cast, {checkin, Pid}, + #state{pool = Pool, monitors = Monitors0, + shutdown = From} = + State) -> + Monitors = demonitor_worker(Pid, Monitors0), + poolboy:checkin(Pool, Pid), + case Monitors of + [] -> %% work all done, time to exit! + {stop_and_reply, shutdown, [{reply, From, ok}]}; + _ -> {keep_state, State#state{monitors = Monitors}} + end; +%% #5 +shutdown({call, From}, {shutdown, _Wait}, State) -> + %% duplicate shutdown call + {keep_state, State, + [{reply, From, {error, vnode_shutdown}}]}; +%% info EXIT signal of erlang:monitor(process, Worker) +shutdown(info, {'DOWN', _Ref, _, Pid, Info}, State) -> + {ok, NewState} = exit_worker(State, Pid, Info), + {keep_state, NewState}. + +%% ======================== +%% ======== +%% Internal Helper Functions +%% ======== +%% ======================== %% Keep track of which worker we pair with what work/from and monitor the %% worker. Only active workers are tracked monitor_worker(Worker, From, Work, Monitors) -> case lists:keyfind(Worker, 1, Monitors) of - {Worker, Ref, _OldFrom, _OldWork} -> - %% reuse old monitor and just update the from & work - lists:keyreplace(Worker, 1, Monitors, {Worker, Ref, From, Work}); - false -> - Ref = erlang:monitor(process, Worker), - [{Worker, Ref, From, Work} | Monitors] + {Worker, Ref, _OldFrom, _OldWork} -> + %% reuse old monitor and just update the from & work + lists:keyreplace(Worker, 1, Monitors, + {Worker, Ref, From, Work}); + false -> + Ref = erlang:monitor(process, Worker), + [{Worker, Ref, From, Work} | Monitors] end. 
demonitor_worker(Worker, Monitors) -> case lists:keyfind(Worker, 1, Monitors) of - {Worker, Ref, _From, _Work} -> - erlang:demonitor(Ref), - lists:keydelete(Worker, 1, Monitors); - false -> - %% not monitored? - Monitors + {Worker, Ref, _From, _Work} -> + erlang:demonitor(Ref), + lists:keydelete(Worker, 1, Monitors); + false -> + %% not monitored? + Monitors end. discard_queued_work(State) -> case out(State) of - {{value, {work, _Work, From}}, Rem} -> - riak_core_vnode:reply(From, {error, vnode_shutdown}), - discard_queued_work(State#state{queue = Rem}); - {empty, _Empty} -> - ok + {{value, {work, _Work, From}}, Rem} -> + riak_core_vnode:reply(From, {error, vnode_shutdown}), + discard_queued_work(State#state{queue = Rem}); + {empty, _Empty} -> ok end. - -in(Msg, State = #state{queue_strategy = fifo, queue = Q}) -> - State#state{queue=queue:in(Msg, Q)}; - -in(Msg, State = #state{queue_strategy = filo, queue = Q}) -> - State#state{queue=[Msg | Q]}. +in(Msg, + State = #state{queue_strategy = fifo, queue = Q}) -> + State#state{queue = queue:in(Msg, Q)}; +in(Msg, + State = #state{queue_strategy = filo, queue = Q}) -> + State#state{queue = [Msg | Q]}. out(#state{queue_strategy = fifo, queue = Q}) -> queue:out(Q); - out(#state{queue_strategy = filo, queue = []}) -> {empty, []}; out(#state{queue_strategy = filo, queue = [Msg | Q]}) -> {{value, Msg}, Q}. -new(#state{queue_strategy = fifo}) -> - queue:new(); -new(#state{queue_strategy = filo}) -> - []. +new(#state{queue_strategy = fifo}) -> queue:new(); +new(#state{queue_strategy = filo}) -> []. 
+worker_started(#state{pool = Pool, + monitors = Monitors} = + State, + StateName) -> + %% a new worker just started - if we have work pending, try to do it + case out(State) of + {{value, {work, Work, From}}, Rem} -> + case poolboy:checkout(Pool, false) of + full -> {next_state, queue, State}; + Pid when is_pid(Pid) -> + NewMonitors = monitor_worker(Pid, From, Work, Monitors), + riak_core_vnode_worker:handle_work(Pid, Work, From), + {next_state, queue, + State#state{queue = Rem, monitors = NewMonitors}} + end; + {empty, _} -> + {next_state, + %% If we are in state queueing with nothing in the queue, + %% move to the ready state so that the next incoming job + %% checks out the new worker from poolboy. + if StateName == queue -> ready; + true -> StateName + end, + State} + end. + +checkin(#state{pool = Pool, monitors = Monitors} = + State, + Worker) -> + case out(State) of + {{value, {work, Work, From}}, Rem} -> + %% there is outstanding work to do - instead of checking + %% the worker back in, just hand it more work to do + NewMonitors = monitor_worker(Worker, From, Work, + Monitors), + riak_core_vnode_worker:handle_work(Worker, Work, From), + {next_state, queue, + State#state{queue = Rem, monitors = NewMonitors}}; + {empty, Empty} -> + NewMonitors = demonitor_worker(Worker, Monitors), + poolboy:checkin(Pool, Worker), + {next_state, ready, + State#state{queue = Empty, monitors = NewMonitors}} + end. + +exit_worker(#state{monitors = Monitors} = State, Pid, + Info) -> + %% remove the listing for the dead worker + case lists:keyfind(Pid, 1, Monitors) of + {Pid, _, From, Work} -> + riak_core_vnode:reply(From, + {error, {worker_crash, Info, Work}}), + NewMonitors = lists:keydelete(Pid, 1, Monitors), + %% trigger to do more work will be 'worker_start' message + %% when poolboy replaces this worker (if not a 'checkin' or 'handle_work') + {ok, State#state{monitors = NewMonitors}}; + false -> {ok, State} + end. 
diff --git a/src/vclock.erl b/src/vclock.erl index ab329c546..75a3a6ee1 100644 --- a/src/vclock.erl +++ b/src/vclock.erl @@ -31,62 +31,59 @@ -module(vclock). --export([fresh/0, - fresh/2, - descends/2, - dominates/2, - descends_dot/2, - pure_dot/1, - merge/1, - get_counter/2, - get_timestamp/2, - get_dot/2, - valid_dot/1, - increment/2, - increment/3, - all_nodes/1, - equal/2, - prune/3, - timestamp/0, - last_modified/1]). +-export([fresh/0, fresh/2, descends/2, dominates/2, + descends_dot/2, pure_dot/1, merge/1, get_counter/2, + get_timestamp/2, get_dot/2, valid_dot/1, increment/2, + increment/3, all_nodes/1, equal/2, prune/3, + timestamp/0]). -ifdef(TEST). + -include_lib("eunit/include/eunit.hrl"). + -endif. --export_type([vclock/0, timestamp/0, vclock_node/0, dot/0, pure_dot/0]). +-export_type([vclock/0, timestamp/0, vclock_node/0, + dot/0, pure_dot/0]). -type vclock() :: [dot()]. --type dot() :: {vclock_node(), {counter(), timestamp()}}. + +-type dot() :: {vclock_node(), + {counter(), timestamp()}}. + -type pure_dot() :: {vclock_node(), counter()}. % Nodes can have any term() as a name, but they must differ from each other. --type vclock_node() :: term(). --type counter() :: integer(). --type timestamp() :: integer(). +-type vclock_node() :: term(). + +-type counter() :: integer(). + +-type timestamp() :: integer(). % @doc Create a brand new vclock. -spec fresh() -> vclock(). -fresh() -> - []. + +fresh() -> []. -spec fresh(vclock_node(), counter()) -> vclock(). -fresh(Node, Count) -> - [{Node, {Count, timestamp()}}]. -% @doc Return true if Va is a direct descendant of Vb, else false -- remember, a vclock is its own descendant! --spec descends(Va :: vclock(), Vb :: vclock()) -> boolean(). +fresh(Node, Count) -> [{Node, {Count, timestamp()}}]. + +%% @doc Return true if Va is a direct descendant of Vb, +%% else false -- remember, a vclock is its own descendant! +-spec descends(Va :: vclock(), + Vb :: vclock()) -> boolean(). 
+ descends(_, []) -> % all vclocks descend from the empty vclock true; descends(Va, Vb) -> - [{NodeB, {CtrB, _T}}|RestB] = Vb, + [{NodeB, {CtrB, _T}} | RestB] = Vb, case lists:keyfind(NodeB, 1, Va) of - false -> - false; - {_, {CtrA, _TSA}} -> - (CtrA >= CtrB) andalso descends(Va,RestB) - end. + false -> false; + {_, {CtrA, _TSA}} -> + CtrA >= CtrB andalso descends(Va, RestB) + end. %% @doc does the given `vclock()' descend from the given `dot()'. The %% `dot()' can be any vclock entry returned from @@ -99,13 +96,13 @@ descends(Va, Vb) -> %% @see get_entry/3 %% @see dominates/2 -spec descends_dot(vclock(), dot()) -> boolean(). -descends_dot(Vclock, Dot) -> - descends(Vclock, [Dot]). + +descends_dot(Vclock, Dot) -> descends(Vclock, [Dot]). %% @doc in some cases the dot without timestamp data is needed. -spec pure_dot(dot()) -> pure_dot(). -pure_dot({N, {C, _TS}}) -> - {N, C}. + +pure_dot({N, {C, _TS}}) -> {N, C}. %% @doc true if `A' strictly dominates `B'. Note: ignores %% timestamps. In Riak it is possible to have vclocks that are @@ -116,6 +113,7 @@ pure_dot({N, {C, _TS}}) -> %% not go there.) %% -spec dominates(vclock(), vclock()) -> boolean(). + dominates(A, B) -> %% In a sane world if two vclocks descend each other they MUST be %% equal. In riak they can descend each other and have different @@ -130,45 +128,53 @@ dominates(A, B) -> % @doc Combine all VClocks in the input list into their least possible % common descendant. -spec merge(VClocks :: [vclock()]) -> vclock(). -merge([]) -> []; + +merge([]) -> []; merge([SingleVclock]) -> SingleVclock; -merge([First|Rest]) -> merge(Rest, lists:keysort(1, First)). +merge([First | Rest]) -> + merge(Rest, lists:keysort(1, First)). merge([], NClock) -> NClock; -merge([AClock|VClocks],NClock) -> - merge(VClocks, merge(lists:keysort(1, AClock), NClock, [])). +merge([AClock | VClocks], NClock) -> + merge(VClocks, + merge(lists:keysort(1, AClock), NClock, [])). 
merge([], [], AccClock) -> lists:reverse(AccClock); -merge([], Left, AccClock) -> lists:reverse(AccClock, Left); -merge(Left, [], AccClock) -> lists:reverse(AccClock, Left); -merge(V=[{Node1,{Ctr1,TS1}=CT1}=NCT1|VClock], - N=[{Node2,{Ctr2,TS2}=CT2}=NCT2|NClock], AccClock) -> - if Node1 < Node2 -> - merge(VClock, N, [NCT1|AccClock]); - Node1 > Node2 -> - merge(V, NClock, [NCT2|AccClock]); +merge([], Left, AccClock) -> + lists:reverse(AccClock, Left); +merge(Left, [], AccClock) -> + lists:reverse(AccClock, Left); +merge(V = [{Node1, {Ctr1, TS1} = CT1} = NCT1 | VClock], + N = [{Node2, {Ctr2, TS2} = CT2} = NCT2 | NClock], + AccClock) -> + if Node1 < Node2 -> merge(VClock, N, [NCT1 | AccClock]); + Node1 > Node2 -> merge(V, NClock, [NCT2 | AccClock]); true -> - ({_Ctr,_TS} = CT) = if Ctr1 > Ctr2 -> CT1; + ({_Ctr, _TS} = CT) = if Ctr1 > Ctr2 -> CT1; Ctr1 < Ctr2 -> CT2; - true -> {Ctr1, erlang:max(TS1,TS2)} + true -> {Ctr1, erlang:max(TS1, TS2)} end, - merge(VClock, NClock, [{Node1,CT}|AccClock]) + merge(VClock, NClock, [{Node1, CT} | AccClock]) end. % @doc Get the counter value in VClock set from Node. --spec get_counter(Node :: vclock_node(), VClock :: vclock()) -> counter(). +-spec get_counter(Node :: vclock_node(), + VClock :: vclock()) -> counter(). + get_counter(Node, VClock) -> case lists:keyfind(Node, 1, VClock) of - {_, {Ctr, _TS}} -> Ctr; - false -> 0 + {_, {Ctr, _TS}} -> Ctr; + false -> 0 end. % @doc Get the timestamp value in a VClock set from Node. --spec get_timestamp(Node :: vclock_node(), VClock :: vclock()) -> timestamp() | undefined. +-spec get_timestamp(Node :: vclock_node(), + VClock :: vclock()) -> timestamp() | undefined. + get_timestamp(Node, VClock) -> case lists:keyfind(Node, 1, VClock) of - {_, {_Ctr, TS}} -> TS; - false -> undefined + {_, {_Ctr, TS}} -> TS; + false -> undefined end. 
% @doc Get the last timestamp from a clock in a friendly format @@ -180,95 +186,113 @@ last_modified(VClock) -> calendar:gregorian_seconds_to_datetime(lists:last(lists:sort(TSL))). % @doc Get the entry `dot()' for `vclock_node()' from `vclock()'. --spec get_dot(Node :: vclock_node(), VClock :: vclock()) -> {ok, dot()} | undefined. +-spec get_dot(Node :: vclock_node(), + VClock :: vclock()) -> {ok, dot()} | undefined. + get_dot(Node, VClock) -> case lists:keyfind(Node, 1, VClock) of - false -> undefined; - Entry -> {ok, Entry} + false -> undefined; + Entry -> {ok, Entry} end. %% @doc is the given argument a valid dot, or entry? -spec valid_dot(dot()) -> boolean(). -valid_dot({_, {Cnt, TS}}) when is_integer(Cnt), is_integer(TS) -> + +valid_dot({_, {Cnt, TS}}) + when is_integer(Cnt), is_integer(TS) -> true; -valid_dot(_) -> - false. +valid_dot(_) -> false. % @doc Increment VClock at Node. --spec increment(Node :: vclock_node(), VClock :: vclock()) -> vclock(). +-spec increment(Node :: vclock_node(), + VClock :: vclock()) -> vclock(). + increment(Node, VClock) -> increment(Node, timestamp(), VClock). % @doc Increment VClock at Node. --spec increment(Node :: vclock_node(), IncTs :: timestamp(), - VClock :: vclock()) -> vclock(). -increment(Node, IncTs, VClock) -> - {{_Ctr, _TS}=C1,NewV} = case lists:keytake(Node, 1, VClock) of - false -> - {{1, IncTs}, VClock}; - {value, {_N, {C, _T}}, ModV} -> - {{C + 1, IncTs}, ModV} - end, - [{Node,C1}|NewV]. +-spec increment(Node :: vclock_node(), + IncTs :: timestamp(), VClock :: vclock()) -> vclock(). +increment(Node, IncTs, VClock) -> + {{_Ctr, _TS} = C1, NewV} = case lists:keytake(Node, 1, + VClock) + of + false -> {{1, IncTs}, VClock}; + {value, {_N, {C, _T}}, ModV} -> + {{C + 1, IncTs}, ModV} + end, + [{Node, C1} | NewV]. % @doc Return the list of all nodes that have ever incremented VClock. -spec all_nodes(VClock :: vclock()) -> [vclock_node()]. -all_nodes(VClock) -> - [X || {X,{_,_}} <- VClock]. 
--define(DAYS_FROM_GREGORIAN_BASE_TO_EPOCH, (1970*365+478)). +all_nodes(VClock) -> [X || {X, {_, _}} <- VClock]. + +-define(DAYS_FROM_GREGORIAN_BASE_TO_EPOCH, + 1970 * 365 + 478). + -define(SECONDS_FROM_GREGORIAN_BASE_TO_EPOCH, - (?DAYS_FROM_GREGORIAN_BASE_TO_EPOCH * 24*60*60) - %% == calendar:datetime_to_gregorian_seconds({{1970,1,1},{0,0,0}}) - ). + (?DAYS_FROM_GREGORIAN_BASE_TO_EPOCH) * 24 * 60 * + 60). %% == calendar:datetime_to_gregorian_seconds({{1970,1,1},{0,0,0}}) % @doc Return a timestamp for a vector clock -spec timestamp() -> timestamp(). + timestamp() -> %% Same as calendar:datetime_to_gregorian_seconds(erlang:universaltime()), %% but significantly faster. {MegaSeconds, Seconds, _} = os:timestamp(), - ?SECONDS_FROM_GREGORIAN_BASE_TO_EPOCH + MegaSeconds*1000000 + Seconds. + (?SECONDS_FROM_GREGORIAN_BASE_TO_EPOCH) + + MegaSeconds * 1000000 + + Seconds. % @doc Compares two VClocks for equality. --spec equal(VClockA :: vclock(), VClockB :: vclock()) -> boolean(). -equal(VA,VB) -> - lists:sort(VA) =:= lists:sort(VB). +-spec equal(VClockA :: vclock(), + VClockB :: vclock()) -> boolean(). + +equal(VA, VB) -> lists:sort(VA) =:= lists:sort(VB). % @doc Possibly shrink the size of a vclock, depending on current age and size. --spec prune(V::vclock(), Now::integer(), BucketProps::term()) -> vclock(). -prune(V,Now,BucketProps) -> +-spec prune(V :: vclock(), Now :: integer(), + BucketProps :: term()) -> vclock(). + +prune(V, Now, BucketProps) -> %% This sort need to be deterministic, to avoid spurious merge conflicts later. %% We achieve this by using the node ID as secondary key. - SortV = lists:sort(fun({N1,{_,T1}},{N2,{_,T2}}) -> {T1,N1} < {T2,N2} end, V), - prune_vclock1(SortV,Now,BucketProps). + SortV = lists:sort(fun ({N1, {_, T1}}, {N2, {_, T2}}) -> + {T1, N1} < {T2, N2} + end, + V), + prune_vclock1(SortV, Now, BucketProps). 
+ % @private -prune_vclock1(V,Now,BProps) -> +prune_vclock1(V, Now, BProps) -> case length(V) =< get_property(small_vclock, BProps) of - true -> V; - false -> - {_,{_,HeadTime}} = hd(V), - case (Now - HeadTime) < get_property(young_vclock,BProps) of - true -> V; - false -> prune_vclock1(V,Now,BProps,HeadTime) - end + true -> V; + false -> + {_, {_, HeadTime}} = hd(V), + case Now - HeadTime < get_property(young_vclock, BProps) + of + true -> V; + false -> prune_vclock1(V, Now, BProps, HeadTime) + end end. + % @private -prune_vclock1(V,Now,BProps,HeadTime) -> +prune_vclock1(V, Now, BProps, HeadTime) -> % has a precondition that V is longer than small and older than young - case (length(V) > get_property(big_vclock,BProps)) orelse - ((Now - HeadTime) > get_property(old_vclock,BProps)) of - true -> prune_vclock1(tl(V),Now,BProps); - false -> V + case length(V) > get_property(big_vclock, BProps) orelse + Now - HeadTime > get_property(old_vclock, BProps) + of + true -> prune_vclock1(tl(V), Now, BProps); + false -> V end. get_property(Key, PairList) -> case lists:keyfind(Key, 1, PairList) of - {_Key, Value} -> - Value; - false -> - undefined + {_Key, Value} -> Value; + false -> undefined end. %% =================================================================== @@ -282,9 +306,9 @@ example_test() -> B = vclock:fresh(), A1 = vclock:increment(a, A), B1 = vclock:increment(b, B), - true = vclock:descends(A1,A), - true = vclock:descends(B1,B), - false = vclock:descends(A1,B1), + true = vclock:descends(A1, A), + true = vclock:descends(B1, B), + false = vclock:descends(A1, B1), A2 = vclock:increment(a, A1), C = vclock:merge([A2, B1]), C1 = vclock:increment(c, C), @@ -299,68 +323,66 @@ prune_small_test() -> Now = riak_core_util:moment(), OldTime = Now - 32000000, SmallVC = [{<<"1">>, {1, OldTime}}, - {<<"2">>, {2, OldTime}}, - {<<"3">>, {3, OldTime}}], - Props = [{small_vclock,4}], - ?assertEqual(lists:sort(SmallVC), lists:sort(prune(SmallVC, Now, Props))). 
+ {<<"2">>, {2, OldTime}}, {<<"3">>, {3, OldTime}}], + Props = [{small_vclock, 4}], + ?assertEqual((lists:sort(SmallVC)), + (lists:sort(prune(SmallVC, Now, Props)))). prune_young_test() -> % vclock with all entries younger than young_vclock will be untouched Now = riak_core_util:moment(), NewTime = Now - 1, - VC = [{<<"1">>, {1, NewTime}}, - {<<"2">>, {2, NewTime}}, + VC = [{<<"1">>, {1, NewTime}}, {<<"2">>, {2, NewTime}}, {<<"3">>, {3, NewTime}}], - Props = [{small_vclock,1},{young_vclock,1000}], - ?assertEqual(lists:sort(VC), lists:sort(prune(VC, Now, Props))). + Props = [{small_vclock, 1}, {young_vclock, 1000}], + ?assertEqual((lists:sort(VC)), + (lists:sort(prune(VC, Now, Props)))). prune_big_test() -> % vclock not preserved by small or young will be pruned down to % no larger than big_vclock entries Now = riak_core_util:moment(), NewTime = Now - 1000, - VC = [{<<"1">>, {1, NewTime}}, - {<<"2">>, {2, NewTime}}, + VC = [{<<"1">>, {1, NewTime}}, {<<"2">>, {2, NewTime}}, {<<"3">>, {3, NewTime}}], - Props = [{small_vclock,1},{young_vclock,1}, - {big_vclock,2},{old_vclock,100000}], - ?assert(length(prune(VC, Now, Props)) =:= 2). + Props = [{small_vclock, 1}, {young_vclock, 1}, + {big_vclock, 2}, {old_vclock, 100000}], + ?assert((length(prune(VC, Now, Props)) =:= 2)). prune_old_test() -> % vclock not preserved by small or young will be pruned down to % no larger than big_vclock and no entries more than old_vclock ago Now = riak_core_util:moment(), NewTime = Now - 1000, - OldTime = Now - 100000, - VC = [{<<"1">>, {1, NewTime}}, - {<<"2">>, {2, OldTime}}, + OldTime = Now - 100000, + VC = [{<<"1">>, {1, NewTime}}, {<<"2">>, {2, OldTime}}, {<<"3">>, {3, OldTime}}], - Props = [{small_vclock,1},{young_vclock,1}, - {big_vclock,2},{old_vclock,10000}], - ?assert(length(prune(VC, Now, Props)) =:= 1). + Props = [{small_vclock, 1}, {young_vclock, 1}, + {big_vclock, 2}, {old_vclock, 10000}], + ?assert((length(prune(VC, Now, Props)) =:= 1)). 
prune_order_test() -> % vclock with two nodes of the same timestamp will be pruned down % to the same node Now = riak_core_util:moment(), - OldTime = Now - 100000, + OldTime = Now - 100000, VC1 = [{<<"1">>, {1, OldTime}}, {<<"2">>, {2, OldTime}}], VC2 = lists:reverse(VC1), - Props = [{small_vclock,1},{young_vclock,1}, - {big_vclock,2},{old_vclock,10000}], - ?assertEqual(prune(VC1, Now, Props), prune(VC2, Now, Props)). + Props = [{small_vclock, 1}, {young_vclock, 1}, + {big_vclock, 2}, {old_vclock, 10000}], + ?assertEqual((prune(VC1, Now, Props)), + (prune(VC2, Now, Props))). accessor_test() -> - VC = [{<<"1">>, {1, 1}}, - {<<"2">>, {2, 2}}], - ?assertEqual(1, get_counter(<<"1">>, VC)), - ?assertEqual(1, get_timestamp(<<"1">>, VC)), - ?assertEqual(2, get_counter(<<"2">>, VC)), - ?assertEqual(2, get_timestamp(<<"2">>, VC)), - ?assertEqual(0, get_counter(<<"3">>, VC)), - ?assertEqual(undefined, get_timestamp(<<"3">>, VC)), - ?assertEqual([<<"1">>, <<"2">>], all_nodes(VC)). + VC = [{<<"1">>, {1, 1}}, {<<"2">>, {2, 2}}], + ?assertEqual(1, (get_counter(<<"1">>, VC))), + ?assertEqual(1, (get_timestamp(<<"1">>, VC))), + ?assertEqual(2, (get_counter(<<"2">>, VC))), + ?assertEqual(2, (get_timestamp(<<"2">>, VC))), + ?assertEqual(0, (get_counter(<<"3">>, VC))), + ?assertEqual(undefined, (get_timestamp(<<"3">>, VC))), + ?assertEqual([<<"1">>, <<"2">>], (all_nodes(VC))). last_modified_test() -> DT1 = {{1972, 5, 6}, {16, 13, 0}}, @@ -373,50 +395,53 @@ last_modified_test() -> ?assertMatch(DT2, last_modified(VC1)). merge_test() -> - VC1 = [{<<"1">>, {1, 1}}, - {<<"2">>, {2, 2}}, + VC1 = [{<<"1">>, {1, 1}}, {<<"2">>, {2, 2}}, {<<"4">>, {4, 4}}], - VC2 = [{<<"3">>, {3, 3}}, - {<<"4">>, {3, 3}}], - ?assertEqual([], merge(vclock:fresh())), - ?assertEqual([{<<"1">>,{1,1}},{<<"2">>,{2,2}},{<<"3">>,{3,3}},{<<"4">>,{4,4}}], - merge([VC1, VC2])). 
+ VC2 = [{<<"3">>, {3, 3}}, {<<"4">>, {3, 3}}], + ?assertEqual([], (merge(vclock:fresh()))), + ?assertEqual([{<<"1">>, {1, 1}}, {<<"2">>, {2, 2}}, + {<<"3">>, {3, 3}}, {<<"4">>, {4, 4}}], + (merge([VC1, VC2]))). merge_less_left_test() -> VC1 = [{<<"5">>, {5, 5}}], VC2 = [{<<"6">>, {6, 6}}, {<<"7">>, {7, 7}}], - ?assertEqual([{<<"5">>, {5, 5}},{<<"6">>, {6, 6}}, {<<"7">>, {7, 7}}], - vclock:merge([VC1, VC2])). + ?assertEqual([{<<"5">>, {5, 5}}, {<<"6">>, {6, 6}}, + {<<"7">>, {7, 7}}], + (vclock:merge([VC1, VC2]))). merge_less_right_test() -> VC1 = [{<<"6">>, {6, 6}}, {<<"7">>, {7, 7}}], VC2 = [{<<"5">>, {5, 5}}], - ?assertEqual([{<<"5">>, {5, 5}},{<<"6">>, {6, 6}}, {<<"7">>, {7, 7}}], - vclock:merge([VC1, VC2])). + ?assertEqual([{<<"5">>, {5, 5}}, {<<"6">>, {6, 6}}, + {<<"7">>, {7, 7}}], + (vclock:merge([VC1, VC2]))). merge_same_id_test() -> - VC1 = [{<<"1">>, {1, 2}},{<<"2">>,{1,4}}], - VC2 = [{<<"1">>, {1, 3}},{<<"3">>,{1,5}}], - ?assertEqual([{<<"1">>, {1, 3}},{<<"2">>,{1,4}},{<<"3">>,{1,5}}], - vclock:merge([VC1, VC2])). + VC1 = [{<<"1">>, {1, 2}}, {<<"2">>, {1, 4}}], + VC2 = [{<<"1">>, {1, 3}}, {<<"3">>, {1, 5}}], + ?assertEqual([{<<"1">>, {1, 3}}, {<<"2">>, {1, 4}}, + {<<"3">>, {1, 5}}], + (vclock:merge([VC1, VC2]))). get_entry_test() -> VC = vclock:fresh(), - VC1 = increment(a, increment(c, increment(b, increment(a, VC)))), - ?assertMatch({ok, {a, {2, _}}}, get_dot(a, VC1)), - ?assertMatch({ok, {b, {1, _}}}, get_dot(b, VC1)), - ?assertMatch({ok, {c, {1, _}}}, get_dot(c, VC1)), - ?assertEqual(undefined, get_dot(d, VC1)). + VC1 = increment(a, + increment(c, increment(b, increment(a, VC)))), + ?assertMatch({ok, {a, {2, _}}}, (get_dot(a, VC1))), + ?assertMatch({ok, {b, {1, _}}}, (get_dot(b, VC1))), + ?assertMatch({ok, {c, {1, _}}}, (get_dot(c, VC1))), + ?assertEqual(undefined, (get_dot(d, VC1))). 
valid_entry_test() -> VC = vclock:fresh(), VC1 = increment(c, increment(b, increment(a, VC))), [begin - {ok, E} = get_dot(Actor, VC1), - ?assert(valid_dot(E)) - end || Actor <- [a, b, c]], - ?assertNot(valid_dot(undefined)), - ?assertNot(valid_dot("huffle-puff")), - ?assertNot(valid_dot([])). + {ok, E} = get_dot(Actor, VC1), ?assert((valid_dot(E))) + end + || Actor <- [a, b, c]], + ?assertNot((valid_dot(undefined))), + ?assertNot((valid_dot("huffle-puff"))), + ?assertNot((valid_dot([]))). -endif. diff --git a/test/eqc/13node_12node_ring.eqc b/test/eqc/13node_12node_ring.eqc deleted file mode 100644 index b5bd40061..000000000 Binary files a/test/eqc/13node_12node_ring.eqc and /dev/null differ diff --git a/test/eqc/169_group_join.eqc b/test/eqc/169_group_join.eqc deleted file mode 100644 index 0b5b3d43d..000000000 Binary files a/test/eqc/169_group_join.eqc and /dev/null differ diff --git a/test/eqc/648_unbalanced_singly.eqc b/test/eqc/648_unbalanced_singly.eqc deleted file mode 100644 index ebffd1970..000000000 Binary files a/test/eqc/648_unbalanced_singly.eqc and /dev/null differ diff --git a/test/eqc/bprops_eqc.erl b/test/eqc/bprops_eqc.erl deleted file mode 100644 index 63fa45b21..000000000 --- a/test/eqc/bprops_eqc.erl +++ /dev/null @@ -1,256 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. 
See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- --module(bprops_eqc). - -%% -%% This module defines a collection of EQC state_m commands, for -%% testing the riak_core_bucket module. In order to understand this -%% test, you should understand EQC generally, and the EQC state machine -%% testing framework and callback conventions. -%% -%% TODO This module currently tests a limited subset of the -%% riak_core_bucket module and makes little attempt to -%% do negative testing around malformed inputs, etc. -%% More attention needs to be spent on these tests! -%% - --ifdef(EQC). --include_lib("eqc/include/eqc.hrl"). --include_lib("eqc/include/eqc_statem.hrl"). --include_lib("eunit/include/eunit.hrl"). - --compile(export_all). - --type bucket_name() :: binary(). --type orddict() :: orddict:orddict(). - --define(NAMES, [<<0>>, <<1>>, <<2>>, <<3>>]). --define(BPROP_KEYS, [foo, bar, tapas]). --define(DEFAULT_BPROPS, [{n_val, 3}]). --define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). - - -%% -%% The state_m "Model". This invariant represents what properties -%% should be in which buckets between state transitions. -%% --record(state, { - buckets = orddict:new() :: orddict() -}). - -%% -%% Eunit entrypoints -%% - -bprops_test_() -> { - timeout, 60, - ?_test(?assert( - eqc:quickcheck(?QC_OUT(eqc:testing_time(50, prop_buckets()))))) - }. - -%% -%% top level drivers (for testing by hand, typically) -%% - -run() -> - run(100). - -run(N) -> - eqc:quickcheck(eqc:numtests(N, prop_buckets())). - -rerun() -> - eqc:check(eqc_statem:show_states(prop_buckets())). - -cover() -> - cover(100). - -cover(N) -> - cover:compile_beam(riak_core_bucket), - eqc:quickcheck(eqc:numtests(N, prop_buckets())), - cover:analyse_to_file(riak_core_bucket, [html]). 
- - -%% -%% eqc_statem initial model -%% - --spec initial_state() -> eqc_statem:symbolic_state(). -initial_state() -> - #state{}. - -%% -%% set_bucket command -%% - -set_bucket_args(_S) -> - [bucket_name(), bucket_props()]. - -set_bucket(Bucket, BProps) -> - riak_core_bucket:set_bucket(Bucket, BProps). - -set_bucket_post(#state{buckets=Buckets}, [Bucket, _BProps], Res) -> - case {Res, orddict:find(Bucket, Buckets)} of - %% first time bucket has been set - {ok, error} -> - true; - %% bucket has been set before - {ok, {ok, _OldBProps}} -> - true; - %% anything other than ok is a failure - %% TODO revisit, e.g., generate invalid inputs to force an error - _ -> - false - end. - -set_bucket_next(#state{buckets=Buckets} = S, _Res, [Bucket, BProps]) -> - %% - %% Get any previously defined properties from the model - %% - OldBProps = - case orddict:find(Bucket, Buckets) of - {ok, Props} -> Props; - error -> orddict:from_list(?DEFAULT_BPROPS) - end, - S#state{ - buckets = orddict:store( - Bucket, - %% add defaults and the bucket name; remove any duplicates - %% bprops takes precedence over defaults, and name is always set - %% to bucket - expected_properties( - Bucket, OldBProps, BProps - ), - Buckets - ) - }. - --spec expected_properties(bucket_name(), orddict(), orddict()) -> orddict(). -expected_properties(Bucket, OldProps, NewProps) -> - Props = riak_core_bucket_props:merge(NewProps, OldProps), - orddict:store(name, Bucket, Props). - -%% -%% get_bucket command -%% - -get_bucket_args(_S) -> - [bucket_name()]. - -get_bucket(Bucket) -> - riak_core_bucket:get_bucket(Bucket). - -get_bucket_post(#state{buckets=Buckets}, [Bucket], Res) -> - BPropsFind = orddict:find(Bucket, Buckets), - case {Res, BPropsFind} of - {error, _} -> - eq(Res, error); - {_, {ok, BProps}} -> - eq( - orddict:from_list(Res), - orddict:from_list(BProps) - ); - {_, error} -> - eq( - orddict:from_list(Res), - orddict:from_list(?DEFAULT_BPROPS ++ [{name, Bucket}]) - ) - end. 
- -%% -%% all_n command -%% - -all_n_args(_) -> []. - -all_n() -> - {ok, Ring} = riak_core_ring_manager:get_my_ring(), - riak_core_bucket:all_n(Ring). - -all_n_post(#state{buckets=Buckets}, [], Res) -> - AllNVals = orddict:fold( - fun(_Bucket, BProps, Accum) -> - {ok, NVal} = orddict:find(n_val, BProps), - [NVal | Accum] - end, - [], - Buckets - ) ++ [proplists:get_value(n_val, ?DEFAULT_BPROPS)], - eq(ordsets:from_list(Res), ordsets:from_list(AllNVals)). - - -%% TODO Add more commands here - -%% -%% generators -%% - -bucket_name() -> - eqc_gen:elements(?NAMES). - -bucket_props() -> - eqc_gen:list(bucket_prop()). - -bucket_prop() -> - eqc_gen:oneof( - [ - {n_val, pos_integer()}, - {bucket_prop_name(), bucket_prop_value()} - ] - ). - -pos_integer() -> - ?LET(N, eqc_gen:nat(), N + 1). - -bucket_prop_name() -> - eqc_gen:elements(?BPROP_KEYS). - -bucket_prop_value() -> - eqc_gen:bool(). - - -%% -%% eqc properties -%% - -prop_buckets() -> - ?FORALL(Cmds, commands(?MODULE), - aggregate(command_names(Cmds), - ?TRAPEXIT( - begin - {H, S, Res} = - bucket_eqc_utils:per_test_setup(?DEFAULT_BPROPS, - fun() -> - run_commands(?MODULE, Cmds) - end), - pretty_commands( - ?MODULE, Cmds, - {H, S, Res}, - aggregate( - command_names(Cmds), - Res == ok - ) - ) - end - ) - ) - ). - --endif. diff --git a/test/eqc/bucket_eqc_utils.erl b/test/eqc/bucket_eqc_utils.erl deleted file mode 100644 index 3a3d6459c..000000000 --- a/test/eqc/bucket_eqc_utils.erl +++ /dev/null @@ -1,49 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% Copyright (c) 2007-2016 Basho Technologies, Inc. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. 
You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- - --module(bucket_eqc_utils). - -%% API --export([per_test_setup/2]). - - -per_test_setup(DefaultBucketProps, TestFun) -> - try - os:cmd("rm -rf ./meta_temp"), - riak_core_test_util:stop_pid(whereis(riak_core_ring_events)), - riak_core_test_util:stop_pid(whereis(riak_core_ring_manager)), - application:set_env(riak_core, claimant_tick, 4294967295), - application:set_env(riak_core, cluster_name, "eqc_test"), - application:set_env(riak_core, default_bucket_props, DefaultBucketProps), - {ok, RingEvents} = riak_core_ring_events:start_link(), - {ok, RingMgr} = riak_core_ring_manager:start_link(test), - {ok, Claimant} = riak_core_claimant:start_link(), - - Results = TestFun(), - - riak_core_test_util:stop_pid(Claimant), - unlink(RingMgr), - riak_core_ring_manager:stop(), - riak_core_test_util:stop_pid(RingEvents), - Results - after - os:cmd("rm -rf ./meta_temp"), - meck:unload() - end. 
diff --git a/test/eqc/claim-statem-leaving-nodes-still-claim.eqc b/test/eqc/claim-statem-leaving-nodes-still-claim.eqc deleted file mode 100644 index bcb0ca894..000000000 Binary files a/test/eqc/claim-statem-leaving-nodes-still-claim.eqc and /dev/null differ diff --git a/test/eqc/claim_32_5_unbalanced.eqc b/test/eqc/claim_32_5_unbalanced.eqc deleted file mode 100644 index daa44854e..000000000 Binary files a/test/eqc/claim_32_5_unbalanced.eqc and /dev/null differ diff --git a/test/eqc/riak_core_claim_statem.erl b/test/eqc/riak_core_claim_statem.erl deleted file mode 100644 index fd3b9f3c6..000000000 --- a/test/eqc/riak_core_claim_statem.erl +++ /dev/null @@ -1,265 +0,0 @@ -%%% @author Russell Brown -%%% @copyright (C) 2017, Russell Brown -%%% @doc -%%% -%%% @end -%%% Created : 5 Jun 2017 by Russell Brown - --module(riak_core_claim_statem). - --ifdef(EQC). --include_lib("eqc/include/eqc.hrl"). -%%-include_lib("eqc/include/eqc_statem.hrl"). - -include_lib("eqc/include/eqc_fsm.hrl"). --include_lib("eunit/include/eunit.hrl"). - --define(CLAIMANT, node_0). - --compile(export_all). - - -%% -- State ------------------------------------------------------------------ --record(state, - { - ring_size, - nodes=[?CLAIMANT] :: [atom()], %% nodes that have been added - node_counter=1 :: non_neg_integer(), %% to aid with naming nodes - ring = undefined, - committed_nodes = [] - }). - -%% @doc run the statem with a ring of size `math:pow(`N', 2)'. --spec with_ring_size(pos_integer()) -> eqc_statem:symbolic_state(). -with_ring_size(N) -> - RingSize = trunc(math:pow(2, N)), - #state{ring_size=RingSize, ring=riak_core_ring:fresh(RingSize, ?CLAIMANT)}. - -%% @doc Returns the state in which each test case starts. (Unless a different -%% initial state is supplied explicitly to, e.g. commands/2.) --spec initial_state_data() -> eqc_statem:symbolic_state(). -initial_state_data() -> - #state{}. - -initial_state(_S) -> - starting. - -starting() -> - [{planning, add_node}]. 
- -planning() -> - [{planning, add_node}, - {planning, leave_node}, - {claiming, claim}]. - -claiming() -> - [{planning, add_node}, - {planning, leave_node}]. - -%% -- Operations ------------------------------------------------------------- - -%% --- Operation: add_node --- -%% @doc add_node_pre/1 - Precondition for generation -add_node_pre(_From, _To, S=#state{nodes=Nodes}) when - (S#state.ring_size div length(Nodes)) =< 3 -> - false; -add_node_pre(_From, _To, _) -> - true. - -%% @doc add_node_args - Argument generator --spec add_node_args(From, To, S) -> eqc_gen:gen([term()]) - when From :: eqc_fsm:state_name(), - To :: eqc_fsm:state_name(), - S :: eqc_statem:symbolic_state(). -add_node_args(_From, _To, #state{node_counter=NC, ring=Ring}) -> - %% TODO consider re-adding removed nodes - [list_to_atom("node_" ++ integer_to_list(NC)), - Ring]. - -%% @doc add_node - The actual operation -add_node(NodeName, Ring) -> - R = riak_core_ring:add_member(?CLAIMANT, Ring, NodeName), - R. - -%% @doc add_node_next - Next state function --spec add_node_next(_From, _To, S, Var, Args) -> NewS - when S :: eqc_statem:symbolic_state() | eqc_state:dynamic_state(), - Var :: eqc_statem:var() | term(), - Args :: [term()], - NewS :: eqc_statem:symbolic_state() | eqc_state:dynamic_state(). -add_node_next(_From, _To, S=#state{node_counter=NC, nodes=Nodes}, Ring, [Node, _RingIn]) -> - S#state{ring=Ring, node_counter=NC+1, nodes=[Node | Nodes]}. - -%% @doc add_node_post - Postcondition for add_node --spec add_node_post(_From, _To, S, Args, Res) -> true | term() - when S :: eqc_state:dynamic_state(), - Args :: [term()], - Res :: term(). -add_node_post(_Frim, _To, _S, [NodeName, _Ring], NextRing) -> - lists:member(NodeName, riak_core_ring:members(NextRing, [joining])). - -%% --- Operation: leave_node --- -%% @doc leave_node_pre/1 - Precondition for generation -leave_node_pre(_From, _To, #state{nodes=Nodes}) when length(Nodes) < 5 -> - false; -leave_node_pre(_From, _To, _) -> - true. 
- -%% @doc leave_node_args - Argument generator --spec leave_node_args(From, To, S) -> eqc_gen:gen([term()]) - when From :: eqc_fsm:state_name(), - To :: eqc_fsm:state_name(), - S :: eqc_statem:symbolic_state(). -leave_node_args(_From, _To, #state{nodes=Nodes, ring=Ring}) -> - %% TODO consider re-leaveing leaved nodes - [elements(Nodes), - Ring]. - -leave_node_pre(_From, _To, #state{nodes=Nodes}, [Node, _Ring]) -> - lists:member(Node, Nodes); -leave_node_pre(_, _, _, _) -> - false. - -%% @doc leave_node - The actual operation -leave_node(NodeName, Ring) -> - R = riak_core_ring:leave_member(?CLAIMANT, Ring, NodeName), - R. - -%% @doc leave_node_next - Next state function --spec leave_node_next(_From, _To, S, Var, Args) -> NewS - when S :: eqc_statem:symbolic_state() | eqc_state:dynamic_state(), - Var :: eqc_statem:var() | term(), - Args :: [term()], - NewS :: eqc_statem:symbolic_state() | eqc_state:dynamic_state(). -leave_node_next(_From, _To, S=#state{committed_nodes=Committed, nodes=Nodes}, Ring, [Node, _RingIn]) -> - S#state{ring=Ring, committed_nodes=lists:delete(Node , Committed), nodes=lists:delete(Node, Nodes)}. - -%% @doc leave_node_post - Postcondition for leave_node --spec leave_node_post(_From, _To, S, Args, Res) -> true | term() - when S :: eqc_state:dynamic_state(), - Args :: [term()], - Res :: term(). -leave_node_post(_Frim, _To, _S, [NodeName, _Ring], NextRing) -> - lists:member(NodeName, riak_core_ring:members(NextRing, [leaving])). - -%% --- Operation: claim --- - -%% @doc claim_pre/3 - Precondition for generation --spec claim_pre(_From, _To, S :: eqc_statem:symbolic_state()) -> boolean(). -claim_pre(_From, _To, #state{ring=undefined}) -> - false; -claim_pre(_From, _To, _S) -> - true. - -%% @doc claim_args - Argument generator --spec claim_args(_From, _To, S :: eqc_statem:symbolic_state()) -> eqc_gen:gen([term()]). -claim_args(_From, _To, #state{ring=Ring}) -> - [Ring]. 
- -%% @doc claim - The actual operation -claim(Ring) -> - R =riak_core_claim:claim(Ring, {riak_core_claim, wants_claim_v2}, {riak_core_claim, choose_claim_v2}), - R. - -%% @doc claim_next - Next state function --spec claim_next(_From, _To, S, Var, Args) -> NewS - when S :: eqc_statem:symbolic_state() | eqc_state:dynamic_state(), - Var :: eqc_statem:var() | term(), - Args :: [term()], - NewS :: eqc_statem:symbolic_state() | eqc_state:dynamic_state(). -claim_next(_From, _To, S=#state{nodes=Nodes}, NewRing, [_OldRing]) -> - S#state{ring=NewRing, committed_nodes=Nodes}. - -%% @doc claim_post - Postcondition for claim --spec claim_post(_From, _To, S, Args, Res) -> true | term() - when S :: eqc_state:dynamic_state(), - Args :: [term()], - Res :: term(). -claim_post(_From, _To, #state{nodes=Nodes}, [_Ring], _NewRing) when length(Nodes) < 4 -> - true; -claim_post(_From, _To, _S, [_Ring], NewRing) -> - Nval = 3, - TNval = 4, - Preflists = riak_core_ring:all_preflists(NewRing, Nval), - ImperfectPLs = orddict:to_list( - lists:foldl(fun(PL,Acc) -> - PLNodes = lists:usort([N || {_,N} <- PL]), - case length(PLNodes) of - Nval -> - Acc; - _ -> - ordsets:add_element(PL, Acc) - end - end, [], Preflists)), - - case {riak_core_claim:meets_target_n(NewRing, TNval), - ImperfectPLs, - riak_core_claim:balanced_ring(ring_size(NewRing), - node_count(NewRing), NewRing)} of - {{true, []}, [], true} -> - true; - {X, Y, Z} -> - {ring_size(NewRing), node_count(NewRing), - {{meets_target_n, X}, - {perfect_pls, Y}, - {balanced_ring, Z}}} - end. - -%% -- Property --------------------------------------------------------------- -%% @doc Optional callback, Invariant, checked for each visited state -%% during test execution. -%% -spec invariant(S :: eqc_statem:dynamic_state()) -> boolean(). -%% invariant(_S) -> -%% true. - -%% @doc Default generated property --spec prop_claim(eqc_statem:symbolic_state()) -> eqc:property(). 
-prop_claim(InitialState) -> - ?FORALL(Cmds, commands(?MODULE, {starting, InitialState}), - begin - {H, {_FinalStateName, S}, Res} = run_commands(?MODULE, Cmds), - Ring = S#state.ring, - pretty_commands(?MODULE, Cmds, {H, S, Res}, - aggregate(command_names(Cmds), - measure(ring_size, ring_size(Ring), - measure(node_count, node_count(Ring), - Res == ok)))) - end). - -ring_size(undefined) -> - 0; -ring_size(Ring) -> - riak_core_ring:num_partitions(Ring). - -node_count(undefined) -> - 0; -node_count(Ring) -> - length(riak_core_ring:members(Ring, [joining, valid])). - -weight(_From, _To, add_node, _Args) -> - 5; -weight(_, _, _, _) -> - 1. - -%% eunit stuff --define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). - -claim_test() -> - eqc:quickcheck(?QC_OUT(prop_claim(with_ring_size(5)))). - -eqc_check(File, Prop) -> - {ok, Bytes} = file:read_file(File), - CE = binary_to_term(Bytes), - eqc:check(Prop, CE). - -%% Helpers -transfer_ring(Ring) -> - Owners = riak_core_ring:all_owners(Ring), - RFinal = lists:foldl(fun({Idx, Owner}, Racc) -> - riak_core_ring:transfer_node(Idx, Owner, Racc) end, - Ring, Owners), - RFinal. - - - --endif. diff --git a/eqc/chash_eqc.erl b/test/pqc/chash_eqc.erl similarity index 90% rename from eqc/chash_eqc.erl rename to test/pqc/chash_eqc.erl index 58e884988..da9f4e034 100644 --- a/eqc/chash_eqc.erl +++ b/test/pqc/chash_eqc.erl @@ -24,20 +24,19 @@ -module(chash_eqc). --ifdef(EQC). --include_lib("eqc/include/eqc.hrl"). +-ifdef(PROPER). +-include_lib("proper/include/proper.hrl"). -include_lib("eunit/include/eunit.hrl"). -define(NOTEST, true). -define(NOASSERT, true). --define(TEST_ITERATIONS, 50). +-define(TEST_ITERATIONS, 5000). -define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). + proper:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). -define(RINGTOP, trunc(math:pow(2,160)-1)). % SHA-1 space --export([check/0, - test/0, +-export([test/0, test/1]). 
%%====================================================================
@@ -55,8 +54,7 @@ eqc_test_() ->
      {timeout, 60000, % timeout is in msec
       %% Indicate the number of test iterations for each property here
       ?_assertEqual(true,
-                    quickcheck(numtests(?TEST_ITERATIONS,
-                                        ?QC_OUT(prop_chash_next_index()))))
+                    proper:quickcheck(?QC_OUT(prop_chash_next_index()),[{numtests,?TEST_ITERATIONS}]))
      }
     ]
    }
@@ -139,9 +137,9 @@ test() ->
     test(100).
 
 test(N) ->
-    quickcheck(numtests(N, prop_chash_next_index())).
+    proper:quickcheck(numtests(N, prop_chash_next_index())).
 
-check() ->
-    check(prop_chash_next_index(), current_counterexample()).
+% check() ->
+%     check(prop_chash_next_index(), current_counterexample()).
 
-endif. % EQC
diff --git a/eqc/core_vnode_eqc.erl b/test/pqc/core_vnode_eqc.erl
similarity index 96%
rename from eqc/core_vnode_eqc.erl
rename to test/pqc/core_vnode_eqc.erl
index 66e68f93f..e5d87c590 100644
--- a/eqc/core_vnode_eqc.erl
+++ b/test/pqc/core_vnode_eqc.erl
@@ -23,9 +23,9 @@
 %% @doc QuickCheck tests for riak_core_vnode code
 -module(core_vnode_eqc).
--ifdef(EQC).
--include_lib("eqc/include/eqc.hrl").
--include_lib("eqc/include/eqc_fsm.hrl").
+-ifdef(TEST).
+-ifdef(PROPER).
+-include_lib("proper/include/proper.hrl").
 -include_lib("eunit/include/eunit.hrl").
 -include_lib("riak_core/include/riak_core_vnode.hrl").
 -compile([export_all]).
@@ -37,7 +37,7 @@
         end).
 
 -define(QC_OUT(P),
-        eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)).
+        proper:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)).
 
 -record(qcst, {started,
                counters, % Dict of counters for each index
@@ -60,7 +60,7 @@ simple_test_() ->
                  ok
          end,
          {timeout, 600,
-          ?_assertEqual(true, quickcheck(?QC_OUT(numtests(100, prop_simple()))))}}.
+          ?_assertEqual(true, proper:quickcheck(?QC_OUT(numtests(100, prop_simple()))))}}.
 
 setup_simple() ->
     %% call `meck:unload' here because there are other tests that have
@@ -88,7 +88,7 @@ setup_simple() ->
     OldVars. 
test(N) -> - quickcheck(numtests(N, prop_simple())). + proper:quickcheck(numtests(N, prop_simple())). eqc_setup() -> OldVars = setup_simple(), @@ -101,10 +101,10 @@ eqc_setup() -> prop_simple() -> ?SETUP(fun eqc_setup/0, - ?FORALL(Cmds, commands(?MODULE, {setup, initial_state_data()}), + ?FORALL(Cmds, proper_fsm:commands(?MODULE, {setup, initial_state_data()}), aggregate(command_names(Cmds), begin - {H,{_SN,S},Res} = run_commands(?MODULE, Cmds), + {H,{_SN,S},Res} = proper_fsm:run_commands(?MODULE, Cmds), timer:sleep(500), %% Adjust this to make shutdown sensitive stuff pass/fail %% Do a sync operation on all the started vnodes %% to ensure any of the noreply commands have executed before @@ -213,10 +213,12 @@ next_state_data(_From,_To,S,_R,_C) -> setup(S) -> [{setup, {call,?MODULE,enable_async,[gen_async_pool()]}}, - {stopped, {call,?MODULE,prepare,[S#qcst.async_size]}}]. + {stopped, {call,?MODULE,prepare,[S#qcst.async_size]}} + ]. stopped(S) -> - [{running, {call,?MODULE,start_vnode,[index(S)]}}]. + [{running, {call,?MODULE,start_vnode,[index(S)]}} + ]. running(S) -> [ @@ -450,3 +452,4 @@ filter_work(Work, Pid) -> end, Work). -endif. +-endif. \ No newline at end of file diff --git a/eqc/new_cluster_membership_model_eqc.erl b/test/pqc/new_cluster_membership_model_eqc.erl similarity index 98% rename from eqc/new_cluster_membership_model_eqc.erl rename to test/pqc/new_cluster_membership_model_eqc.erl index 5655a7ef4..f48cc7585 100644 --- a/eqc/new_cluster_membership_model_eqc.erl +++ b/test/pqc/new_cluster_membership_model_eqc.erl @@ -1,15 +1,14 @@ -module(new_cluster_membership_model_eqc). -ifdef(MODEL). --ifdef(EQC). --include_lib("eqc/include/eqc.hrl"). --include_lib("eqc/include/eqc_statem.hrl"). +-ifdef(PROPER). +-include_lib("proper/include/proper.hrl"). -include_lib("eunit/include/eunit.hrl"). -compile(export_all). -define(TEST_ITERATIONS, 3000). -define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). 
+ proper:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). -define(OUT(S,A),ok). %%-define(OUT(S,A),io:format(S,A)). @@ -27,7 +26,7 @@ vclock :: vclock:vclock(), % for this chstate object, entries are % {Node, Ctr} chring :: chash:chash(), % chash ring of {IndexAsInt, Node} mappings - meta :: dict(), % dict of cluster-wide other data (primarily + meta :: dict:dict(), % dict of cluster-wide other data (primarily % bucket N-value, etc) clustername :: {node(), term()}, @@ -50,7 +49,7 @@ %% Global test state -record(state, { - nstates :: dict(), + nstates :: dict:dict(), ring_size :: integer(), members :: [{node(), {member_status(), vclock:vclock()}}], primary :: [integer()], @@ -63,7 +62,7 @@ active_handoffs :: [{integer(), integer(), integer()}], seed :: {integer(), integer(), integer()}, old_seed :: {integer(), integer(), integer()}, - split :: dict() + split :: dict:dict() }). eqc_test_() -> @@ -74,8 +73,8 @@ eqc_test_() -> [{inorder, [manual_test_list(), %% Run the quickcheck tests - {timeout, 60000, % timeout is in msec - ?_assertEqual(true, catch quickcheck(numtests(?TEST_ITERATIONS, ?QC_OUT(prop_join()))))} + {timeout, 60000000, % timeout is in msec + ?_assertEqual(true, catch proper:quickcheck(numtests(?TEST_ITERATIONS, ?QC_OUT(prop_join()))))} ]} ] } @@ -83,7 +82,7 @@ eqc_test_() -> }. eqc() -> - quickcheck(numtests(?TEST_ITERATIONS, ?QC_OUT(prop_join()))), + proper:quickcheck(numtests(?TEST_ITERATIONS, ?QC_OUT(prop_join()))), ok. setup() -> @@ -93,7 +92,7 @@ cleanup(_) -> ok. prop_join() -> - ?FORALL(Cmds, more_commands(100, commands(?MODULE)), + ?FORALL(Cmds, commands(?MODULE), ?TRAPEXIT( ( begin @@ -276,7 +275,8 @@ initial_state() -> g_initial_nodes() -> Nodes = lists:seq(0, ?MAX_NODES-1), - ?LET(L, shuffle(Nodes), lists:split(?INITIAL_CLUSTER_SIZE, L)). + ?LET(L, Nodes, %shuffle(Nodes) + lists:split(?INITIAL_CLUSTER_SIZE, L)). 
g_idx(State) -> Indices = [Idx || {Idx, _} <- chash:nodes(chash:fresh(State#state.ring_size, undefined))], @@ -287,7 +287,7 @@ g_gossip(State, Gossip) -> [{Node, get_nstate(State, Node)}, OtherNode, OtherCS]. g_random_ring(State) -> - shuffle(lists:seq(0, State#state.ring_size-1)). + lists:seq(0, State#state.ring_size-1).%shuffle( g_posint() -> ?SUCHTHAT(X, largeint(), X > 0). @@ -1386,19 +1386,19 @@ ring_ready(CState0) -> end. seed_random(State) -> - OldSeed = random:seed(State#state.seed), + OldSeed = rand:seed(State#state.seed), State#state{old_seed=OldSeed}. save_random(State=#state{old_seed=undefined}) -> - Seed = random:seed(), + Seed = rand:seed(), State#state{seed=Seed}; save_random(State=#state{old_seed=OldSeed}) -> - Seed = random:seed(OldSeed), + Seed = rand:seed(OldSeed), State#state{seed=Seed}. save_random() -> - Seed = random:seed(), - random:seed(Seed), + Seed = rand:seed(), + rand:seed(Seed), Seed. ring_changed(State, _RRing, {Node, _NState}, CState0) -> @@ -1620,7 +1620,7 @@ handle_down_nodes(CState, Next) -> case (OwnerLeaving and NextDown) of true -> Active = riak_core_ring:active_members(CState) -- [O], - RNode = lists:nth(random:uniform(length(Active)), + RNode = lists:nth(rand:uniform(length(Active)), Active), {Idx, O, RNode, Mods, Status}; _ -> @@ -1743,7 +1743,7 @@ attempt_simple_transfer(Ring, [{P, Exit}|Rest], TargetN, Exit, Idx, Last) -> target_n_fail; Qualifiers -> %% these nodes don't violate target_n forward - Chosen = lists:nth(random:uniform(length(Qualifiers)), + Chosen = lists:nth(rand:uniform(length(Qualifiers)), Qualifiers), %% choose one, and do the rest of the ring attempt_simple_transfer( diff --git a/test/eqc/node_watcher_qc.erl b/test/pqc/node_watcher_qc.erl similarity index 98% rename from test/eqc/node_watcher_qc.erl rename to test/pqc/node_watcher_qc.erl index 69fd4952f..3de840947 100644 --- a/test/eqc/node_watcher_qc.erl +++ b/test/pqc/node_watcher_qc.erl @@ -21,10 +21,9 @@ %% 
------------------------------------------------------------------- -module(node_watcher_qc). --ifdef(EQC). +-ifdef(PROPER). --include_lib("eqc/include/eqc.hrl"). --include_lib("eqc/include/eqc_statem.hrl"). +-include_lib("proper/include/proper.hrl"). -include_lib("eunit/include/eunit.hrl"). -compile(export_all). @@ -36,12 +35,15 @@ peers = []}). -define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). + proper:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). -define(ORDSET(L), ordsets:from_list(L)). +%TODO +% qc_test_() -> +% {timeout, 1500, +% ?_assert(proper:quickcheck(?QC_OUT(prop_main()),[{numtests, 5000}])) +% }. -qc_test_() -> - {timeout, 120, fun() -> ?assert(eqc:quickcheck(?QC_OUT(prop_main()))) end}. prop_main() -> ?SETUP( @@ -102,7 +104,7 @@ ensure_started(Mod) -> %% ==================================================================== -%% eqc_statem callbacks +%%proper_statem callbacks %% ==================================================================== initial_state() -> @@ -114,8 +116,8 @@ command(S) -> {call, ?MODULE, local_service_up, [g_service()]}, {call, ?MODULE, local_service_down, [g_service()]}, {call, ?MODULE, local_service_kill, [g_service(), S]}, - {call, ?MODULE, local_node_up, []}, - {call, ?MODULE, local_node_down, []}, + %{call, ?MODULE, local_node_up, []}, + %{call, ?MODULE, local_node_down, []}, {call, ?MODULE, remote_service_up, [g_node(), g_services()]}, {call, ?MODULE, remote_service_down, [g_node()]}, {call, ?MODULE, remote_service_down_disterl, [g_node()]}, diff --git a/test/pqc/riak_core_claim_qc.erl b/test/pqc/riak_core_claim_qc.erl new file mode 100644 index 000000000..ccd39794b --- /dev/null +++ b/test/pqc/riak_core_claim_qc.erl @@ -0,0 +1,323 @@ +-module(riak_core_claim_qc). +-ifdef(TEST). +-ifdef(PROPER). + +-compile(export_all). +-export([prop_claim_ensures_unique_nodes/1, prop_wants/0, prop_wants_counts/0, eqc_check/2]). +-include_lib("proper/include/proper.hrl"). 
+-include_lib("eunit/include/eunit.hrl"). + +-define(QC_OUT(P), + proper:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). + +-define(POW_2(N), trunc(math:pow(2, N))). + +eqc_check(File, Prop) -> + {ok, Bytes} = file:read_file(File), + CE = binary_to_term(Bytes), + proper:check(Prop, CE). + +test_nodes(Count) -> + [node() | [list_to_atom(lists:concat(["n_", N])) || N <- lists:seq(1, Count-1)]]. + +test_nodes(Count, StartNode) -> + [list_to_atom(lists:concat(["n_", N])) || N <- lists:seq(StartNode, StartNode + Count)]. + +property_claim_ensures_unique_nodes_v2_test_() -> + Prop = ?QC_OUT(prop_claim_ensures_unique_nodes(choose_claim_v2)), + {timeout, 120, fun() -> ?_assert(proper:quickcheck(Prop, [{numtests, 5000}])) end}. + +property_claim_ensures_unique_nodes_adding_groups_v2_test_() -> + Prop = ?QC_OUT(prop_claim_ensures_unique_nodes_adding_groups(choose_claim_v2)), + {timeout, 120, fun() -> ?_assert(proper:quickcheck(Prop, [{numtests, 5000}])) end}. + +property_claim_ensures_unique_nodes_adding_singly_v2_test_() -> + Prop = ?QC_OUT(prop_claim_ensures_unique_nodes_adding_singly(choose_claim_v2)), + {timeout, 120, fun() -> ?_assert(proper:quickcheck(Prop, [{numtests, 5000}])) end}. + +prop_claim_ensures_unique_nodes(ChooseFun) -> + %% NOTE: We know that this doesn't work for the case of {_, 3}. 
+ %% NOTE2: uses undocumented "double_shrink", is expensive, but should get + %% around those case where we shrink to a non-minimal case because + %% some intermediate combinations of ring_size/node have no violations + ?FORALL({PartsPow, NodeCount}, {choose(4, 9), choose(4, 15)}, + begin + Nval = 3, + TNval = Nval + 1, + _Params = [{target_n_val, TNval}], + + Partitions = ?POW_2(PartsPow), + [Node0 | RestNodes] = test_nodes(NodeCount), + + R0 = riak_core_ring:fresh(Partitions, Node0), + RAdded = lists:foldl(fun(Node, Racc) -> + riak_core_ring:add_member(Node0, Racc, Node) + end, R0, RestNodes), + + Rfinal = riak_core_claim:claim(RAdded, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}), + + Preflists = riak_core_ring:all_preflists(Rfinal, Nval), + ImperfectPLs = orddict:to_list( + lists:foldl(fun(PL, Acc) -> + PLNodes = lists:usort([N || {_, N} <- PL]), + case length(PLNodes) of + Nval -> + Acc; + _ -> + ordsets:add_element(PL, Acc) + end + end, [], Preflists)), + + ?WHENFAIL( + begin + io:format(user, "{Partitions, Nodes} {~p, ~p}~n", + [Partitions, NodeCount]), + io:format(user, "Owners: ~p~n", + [riak_core_ring:all_owners(Rfinal)]) + end, + conjunction([{meets_target_n, + equals({true, []}, + riak_core_claim:meets_target_n(Rfinal, TNval))}, + {perfect_preflists, equals([], ImperfectPLs)}, + {balanced_ring, balanced_ring(Partitions, NodeCount, Rfinal)}])) + end). + + +prop_claim_ensures_unique_nodes_adding_groups(ChooseFun) -> + %% NOTE: We know that this doesn't work for the case of {_, 3}. 
+ %% NOTE2: uses undocumented "double_shrink", is expensive, but should get + %% around those case where we shrink to a non-minimal case because + %% some intermediate combinations of ring_size/node have no violations + ?FORALL({PartsPow, BaseNodes, AddedNodes}, + {choose(4, 9), choose(2, 10), choose(2, 5)}, + begin + Nval = 3, + TNval = Nval + 1, + _Params = [{target_n_val, TNval}], + + Partitions = ?POW_2(PartsPow), + [Node0 | RestNodes] = test_nodes(BaseNodes), + AddNodes = test_nodes(AddedNodes-1, BaseNodes), + NodeCount = BaseNodes + AddedNodes, + %% io:format("Base: ~p~n",[[Node0 | RestNodes]]), + %% io:format("Added: ~p~n",[AddNodes]), + + R0 = riak_core_ring:fresh(Partitions, Node0), + RBase = lists:foldl(fun(Node, Racc) -> + riak_core_ring:add_member(Node0, Racc, Node) + end, R0, RestNodes), + + Rinterim = riak_core_claim:claim(RBase, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}), + RAdded = lists:foldl(fun(Node, Racc) -> + riak_core_ring:add_member(Node0, Racc, Node) + end, Rinterim, AddNodes), + + Rfinal = riak_core_claim:claim(RAdded, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}), + + Preflists = riak_core_ring:all_preflists(Rfinal, Nval), + ImperfectPLs = orddict:to_list( + lists:foldl(fun(PL, Acc) -> + PLNodes = lists:usort([N || {_, N} <- PL]), + case length(PLNodes) of + Nval -> + Acc; + _ -> + ordsets:add_element(PL, Acc) + end + end, [], Preflists)), + + ?WHENFAIL( + begin + io:format(user, "{Partitions, Nodes} {~p, ~p}~n", + [Partitions, NodeCount]), + io:format(user, "Owners: ~p~n", + [riak_core_ring:all_owners(Rfinal)]) + end, + conjunction([{meets_target_n, + equals({true, []}, + riak_core_claim:meets_target_n(Rfinal, TNval))}, + {perfect_preflists, equals([], ImperfectPLs)}, + {balanced_ring, balanced_ring(Partitions, NodeCount, Rfinal)}])) + end). + + +prop_claim_ensures_unique_nodes_adding_singly(ChooseFun) -> + %% NOTE: We know that this doesn't work for the case of {_, 3}. 
+ %% NOTE2: uses undocumented "double_shrink", is expensive, but should get + %% around those case where we shrink to a non-minimal case because + %% some intermediate combinations of ring_size/node have no violations + ?FORALL({PartsPow, NodeCount}, {choose(4, 9), choose(4, 15)}, + begin + Nval = 3, + TNval = Nval + 1, + Params = [{target_n_val, TNval}], + + Partitions = ?POW_2(PartsPow), + [Node0 | RestNodes] = test_nodes(NodeCount), + + R0 = riak_core_ring:fresh(Partitions, Node0), + Rfinal = lists:foldl(fun(Node, Racc) -> + Racc0 = riak_core_ring:add_member(Node0, Racc, Node), + %% TODO which is it? Claim or ChooseFun?? + %%claim(Racc0, {?MODULE, wants_claim_v2}, + %% {?MODULE, ChooseFun}) + ?MODULE:ChooseFun(Racc0, Node, Params) + end, R0, RestNodes), + Preflists = riak_core_ring:all_preflists(Rfinal, Nval), + ImperfectPLs = orddict:to_list( + lists:foldl(fun(PL, Acc) -> + PLNodes = lists:usort([N || {_, N} <- PL]), + case length(PLNodes) of + Nval -> + Acc; + _ -> + ordsets:add_element(PL, Acc) + end + end, [], Preflists)), + + ?WHENFAIL( + begin + io:format(user, "{Partitions, Nodes} {~p, ~p}~n", + [Partitions, NodeCount]), + io:format(user, "Owners: ~p~n", + [riak_core_ring:all_owners(Rfinal)]) + end, + conjunction([{meets_target_n, + equals({true, []}, + riak_core_claim:meets_target_n(Rfinal, TNval))}, + {perfect_preflists, equals([], ImperfectPLs)}, + {balanced_ring, balanced_ring(Partitions, NodeCount, Rfinal)}])) + end). + + + +%% @private check that no node claims more than it should +-spec balanced_ring(RingSize::integer(), NodeCount::integer(), + riak_core_ring:riak_core_ring()) -> + boolean(). 
+balanced_ring(RingSize, NodeCount, Ring) -> + TargetClaim = riak_core_claim:ceiling(RingSize / NodeCount), + MinClaim = RingSize div NodeCount, + AllOwners0 = riak_core_ring:all_owners(Ring), + AllOwners = lists:keysort(2, AllOwners0), + {BalancedMax, AccFinal} = lists:foldl(fun({_Part, Node}, {_Balanced, [{Node, Cnt} | Acc]}) + when Cnt >= TargetClaim -> + {false, [{Node, Cnt+1} | Acc]}; + ({_Part, Node}, {Balanced, [{Node, Cnt} | Acc]}) -> + {Balanced, [{Node, Cnt+1} | Acc]}; + ({_Part, NewNode}, {Balanced, Acc}) -> + {Balanced, [{NewNode, 1} | Acc]} + end, + {true, []}, + AllOwners), + BalancedMin = lists:all(fun({_Node, Cnt}) -> Cnt >= MinClaim end, AccFinal), + case BalancedMax andalso BalancedMin of + true -> + true; + false -> + {TargetClaim, MinClaim, lists:sort(AccFinal)} + end. + + +wants_counts_test() -> + {timeout, 120, + ?assert(proper:quickcheck(?QC_OUT((prop_wants_counts())), [{numtests, 5000}]))}. + +prop_wants_counts() -> + ?FORALL({S, Q}, {large_pos(100), large_pos(100000)}, + begin + Wants = riak_core_claim:wants_counts(S, Q), + conjunction([{len, S == length(Wants)}, + {sum, Q == lists:sum(Wants)}]) + end). + +wants_test() -> + {timeout, 120, + ?_assert(proper:quickcheck(?QC_OUT(prop_wants()), [{numtests, 5000}]))}. 
+ +prop_wants() -> + ?FORALL({NodeStatus, Q}, + {?SUCHTHAT(L, non_empty(list(elements([leaving, joining]))), + lists:member(joining, L)), + ?LET(X, choose(1, 16), trunc(math:pow(2, X)))}, + begin + R0 = riak_core_ring:fresh(Q, tnode(1)), + {_, R2, Active} = + lists:foldl( + fun(S, {I, R1, A1}) -> + N = tnode(I), + case S of + joining -> + {I+1, riak_core_ring:add_member(N, R1, N), [N|A1]}; + _ -> + {I+1, riak_core_ring:leave_member(N, R1, N), A1} + end + end, {1, R0, []}, NodeStatus), + Wants = riak_core_claim:wants(R2), + + %% Check any non-claiming nodes are set to 0 + %% Check all nodes are present + {ActiveWants, InactiveWants} = + lists:partition(fun({N, _W}) -> lists:member(N, Active) end, Wants), + + ActiveSum = lists:sum([W || {_, W} <- ActiveWants]), + InactiveSum = lists:sum([W || {_, W} <- InactiveWants]), + ?WHENFAIL( + begin + io:format(user, "NodeStatus: ~p\n", [NodeStatus]), + io:format(user, "Active: ~p\n", [Active]), + io:format(user, "Q: ~p\n", [Q]), + io:format(user, "Wants: ~p\n", [Wants]), + io:format(user, "ActiveWants: ~p\n", [ActiveWants]), + io:format(user, "InactiveWants: ~p\n", [InactiveWants]) + end, + conjunction([{wants, length(Wants) == length(NodeStatus)}, + {active, Q == ActiveSum}, + {inactive, 0 == InactiveSum}])) + end). + +%% Large positive integer between 1 and Max +large_pos(Max) -> + ?LET(X, largeint(), 1 + (abs(X) rem Max)). + + +tnode(I) -> + list_to_atom("n" ++ integer_to_list(I)). 
+ +%% Check that no node gained more than it wanted to take +%% Check that none of the nodes took more partitions than allowed +%% Check that no nodes violate target N +check_deltas(Exchanges, Before, After, Q, TN) -> + conjunction( + lists:flatten( + [begin + Gave = length(OIdxs1 -- OIdxs2), % in original and not new + Took = length(OIdxs2 -- OIdxs1), + V1 = count_violations(OIdxs1, Q, TN), + V2 = count_violations(OIdxs2, Q, TN), + [{{give, Node, Gave, Give}, Gave =< Give}, + {{take, Node, Took, Take}, Took =< Take}, + {{valid, Node, V1, V2}, + V2 == 0 orelse + V1 > 0 orelse % check no violations if there were not before + OIdxs1 == []}] % or the node held no indices so violation was impossible + end || {{Node, Give, Take, _CIdxs}, {Node, _Want1, OIdxs1}, {Node, _Want2, OIdxs2}} <- + lists:zip3(lists:sort(Exchanges), lists:sort(Before), lists:sort(After))])). + +count_violations([], _Q, _TN) -> + 0; +count_violations(Idxs, Q, TN) -> + SOIdxs = lists:sort(Idxs), + {_, Violations} = lists:foldl( + fun(This, {Last, Vs}) -> + case Last - This >= TN of + true -> + {This, Vs}; + _ -> + {This, Vs + 1} + end + end, {Q + hd(SOIdxs), 0}, lists:reverse(SOIdxs)), + Violations. + +-endif. % EQC +-endif. % TEST diff --git a/test/pqc/riak_core_claim_statem.erl b/test/pqc/riak_core_claim_statem.erl new file mode 100644 index 000000000..38d84ffa7 --- /dev/null +++ b/test/pqc/riak_core_claim_statem.erl @@ -0,0 +1,270 @@ +%%% @author Russell Brown +%%% @copyright (C) 2017, Russell Brown +%%% @doc +%%% +%%% @end +%%% Created : 5 Jun 2017 by Russell Brown + +-module(riak_core_claim_statem). + +-ifdef(PROPER). +-include_lib("proper/include/proper.hrl"). +-include_lib("eunit/include/eunit.hrl"). + +-define(CLAIMANT, node_0). + +-compile(export_all). + +%Entry Eunit +claim_test_()-> + {timeout, 120, + ?_assert(proper:quickcheck(prop_claim(with_ring_size(5)),[{numtests, 5000}] ))}. 
+ +%% -- State ------------------------------------------------------------------ +-record(state, + { + ring_size, + nodes=[?CLAIMANT] :: [atom()], %% nodes that have been added + node_counter=1 :: non_neg_integer(), %% to aid with naming nodes + ring = undefined, + committed_nodes = [] + }). + +%% @doc run the statem with a ring of size `math:pow(`N', 2)'. +-spec with_ring_size(pos_integer()) -> proper_statem:symbolic_state(). +with_ring_size(N) -> + RingSize = trunc(math:pow(2, N)), + #state{ring_size=RingSize, ring=riak_core_ring:fresh(RingSize, ?CLAIMANT)}. + +%% @doc Returns the state in which each test case starts. (Unless a different +%% initial state is supplied explicitly to, e.g. commands/2.) +-spec initial_state_data() -> proper_statem:symbolic_state(). +initial_state_data() -> + #state{}. + +initial_state(_S) -> + starting. + +starting(S) -> + [{planning, {call, ?MODULE, add_node, add_node_args(S)}}]. + +planning(S) -> + [{planning, {call, ?MODULE, add_node, add_node_args(S)}}, + {planning, {call, ?MODULE, leave_node, leave_node_args(S)}}%, + %{claiming, {call, ?MODULE, claim, claim_args(S)}} %TODO + ]. + +claiming(S) -> + [{planning, {call, ?MODULE, add_node, add_node_args(S)}}, + {planning, {call, ?MODULE, leave_node, leave_node_args(S)}}]. 
+ +%% -- Operations ------------------------------------------------------------- + +%% --- Operation: preconditions --- +%% @doc add_node_pre/1 - Precondition for generation +%add_node_pre(_From, _To, S=#state{nodes=Nodes}) +precondition(_F,_T,S=#state{nodes=Nodes}, {call, _, add_node, _}) when + (S#state.ring_size div length(Nodes)) =< 3 -> + false; +%add_node_pre(_From, _To, _) -> +precondition(_F,_T,_S, {call, _, add_node, _}) -> + true; +%% @doc leave_node_pre/1 - Precondition for generation +%leave_node_pre(_From, _To, #state{nodes=Nodes}) when length(Nodes) < 5 -> +precondition(_F,_T,#state{nodes=Nodes},{call,_,leave_node,_}) when length(Nodes) < 5 -> + false; +%leave_node_pre(_From, _To, _) -> +precondition(_F,_T,_S, {call,_,leave_node,_})-> + true; +%leave_node_pre(_From, _To, #state{nodes=Nodes}, [Node, _Ring]) -> +precondition(_F,_T,#state{nodes=Nodes},{call,_,leave_node,[Node, _Ring]}) -> + lists:member(Node, Nodes); +precondition(_F,_T,_S,{call,_,leave_node,_}) -> + false; +% @doc claim_pre/3 - Precondition for generation +%-spec claim_pre(_From, _To, S :: proper:symbolic_state()) -> boolean(). +%claim_pre(_From, _To, #state{ring=undefined}) -> +precondition(_F,_T,#state{ring=undefined},{call, _, claim,_}) -> + false; +%claim_pre(_From, _To, _S) -> +precondition(_F,_T,_S,{call,_ , claim, _}) -> + true. + +%% --- Operation: Next state --- +%% @doc add_node_next - Next state function +% -spec add_node_next(_From, _To, S, Var, Args) -> NewS +% when S :: proper:symbolic_state() | proper:dynamic_state(), +% Var :: proper:var() | term(), +% Args :: [term()], +% NewS :: proper:symbolic_state() | proper:dynamic_state(). 
+%add_node_next(_From, _To, S=#state{node_counter=NC, nodes=Nodes}, Ring, [Node, _RingIn]) -> +next_state_data(_F,_T,S=#state{node_counter=NC, nodes=Nodes}, Ring, + {call,_, add_node, [Node, _RingIn]}) -> + S#state{ring=Ring, node_counter=NC+1, nodes=[Node | Nodes]}; + +%% @doc leave_node_next - Next state function +% -spec leave_node_next(_From, _To, S, Var, Args) -> NewS +% when S :: proper:symbolic_state() | proper:dynamic_state(), +% Var :: proper:var() | term(), +% Args :: [term()], +% NewS :: proper:symbolic_state() | proper:dynamic_state(). +next_state_data(_F,_T,S=#state{committed_nodes=Committed, nodes=Nodes},Ring,{call, _, leave_node, [Node, _RingIn]}) -> + S#state{ring=Ring, committed_nodes=lists:delete(Node , Committed), nodes=lists:delete(Node, Nodes)}; + +%% @doc claim_next - Next state function +% -spec claim_next(_From, _To, S, Var, Args) -> NewS +% when S :: proper:symbolic_state() | proper:dynamic_state(), +% Var :: proper:var() | term(), +% Args :: [term()], +% NewS :: proper:symbolic_state() | proper:dynamic_state(). +%claim_next(_From, _To, S=#state{nodes=Nodes}, NewRing, [_OldRing]) -> +next_state_data(_F,_T,S=#state{nodes=Nodes}, NewRing, {call, _, claim, [_OldRing]}) -> + S#state{ring=NewRing, committed_nodes=Nodes}. + +%% --- Operation: postconditions --- +%% @doc add_node_post - Postcondition for add_node +% -spec add_node_post(_From, _To, S, Args, Res) -> true | term() +% when S :: proper:dynamic_state(), +% Args :: [term()], +% Res :: term(). +%add_node_post(_Frim, _To, _S, [NodeName, _Ring], NextRing) -> +postcondition(_F,_T,_S,{call,_, add_node, [NodeName, _Ring]}, NextRing) -> + lists:member(NodeName, riak_core_ring:members(NextRing, [joining])); + +%% @doc leave_node_post - Postcondition for leave_node +% -spec leave_node_post(_From, _To, S, Args, Res) -> true | term() +% when S :: proper:dynamic_state(), +% Args :: [term()], +% Res :: term(). 
+postcondition(_F,_T,_S,{call,_,leave_node,[NodeName, _Ring]}, NextRing) -> + lists:member(NodeName, riak_core_ring:members(NextRing, [leaving])); + +%% @doc claim_post - Postcondition for claim +% -spec claim_post(_From, _To, S, Args, Res) -> true | term() +% when S :: proper:dynamic_state(), +% Args :: [term()], +% Res :: term(). +%claim_post(_From, _To, #state{nodes=Nodes}, [_Ring], _NewRing) when length(Nodes) < 4 -> +postcondition(_F,_T,#state{nodes=Nodes}, {call, _, claim, [_Ring]}, _NewRing) when length(Nodes) < 4 -> + true; +%claim_post(_From, _To, _S, [_Ring], NewRing) -> +postcondition(_F,_T,_S, {call, _, claim, [_Ring]}, NewRing)-> + Nval = 3, + TNval = 4, + Preflists = riak_core_ring:all_preflists(NewRing, Nval), + ImperfectPLs = orddict:to_list( + lists:foldl(fun(PL,Acc) -> + PLNodes = lists:usort([N || {_,N} <- PL]), + case length(PLNodes) of + Nval -> + Acc; + _ -> + ordsets:add_element(PL, Acc) + end + end, [], Preflists)), + + case {riak_core_claim:meets_target_n(NewRing, TNval), + ImperfectPLs, + riak_core_claim:balanced_ring(ring_size(NewRing), + node_count(NewRing), NewRing)} of + {{true, []}, [], true} -> + true; + {X, Y, Z} -> + {ring_size(NewRing), node_count(NewRing), + {{meets_target_n, X}, + {perfect_pls, Y}, + {balanced_ring, Z}}} + end. + +%% --- Operation: main functions --- +%% @doc add_node_args - Argument generator +% -spec add_node_args(From, To, S) -> proper:gen([term()]) +% when From :: proper:state_name(), +% To :: proper:state_name(), +% S :: proper:symbolic_state(). +add_node_args(#state{node_counter=NC, ring=Ring}) -> + %% TODO consider re-adding removed nodes + %io:fwrite("~n", NC), + [list_to_atom("node_" ++ integer_to_list(NC)), Ring]. + +%% @doc add_node - The actual operation +add_node(NodeName, Ring) -> + R = riak_core_ring:add_member(?CLAIMANT, Ring, NodeName), + R. 
+ +%% --- Operation: leave_node --- +%% @doc leave_node_args - Argument generator +% -spec leave_node_args(From, To, S) -> proper:gen([term()]) +% when From :: proper:state_name(), +% To :: proper:state_name(), +% S :: proper:symbolic_state(). +leave_node_args(#state{nodes=Nodes, ring=Ring}) -> + %% TODO consider re-leaveing leaved nodes + [elements(Nodes), Ring]. +%% @doc leave_node - The actual operation +leave_node(NodeName, Ring) -> + R = riak_core_ring:leave_member(?CLAIMANT, Ring, NodeName), + R. +%% --- Operation: claim --- +%% @doc claim_args - Argument generator +%-spec claim_args(S :: proper:symbolic_state()) -> proper:gen([term()]). +%claim_args(_From, _To, #state{ring=Ring}) -> +claim_args(#state{ring=Ring}) -> + [Ring]. +%% @doc claim - The actual operation +claim(Ring) -> + R =riak_core_claim:claim(Ring, {riak_core_claim, wants_claim_v2}, {riak_core_claim, choose_claim_v2}), + R. + +%% -- Property --------------------------------------------------------------- +%% @doc Optional callback, Invariant, checked for each visited state +%% during test execution. + +%% @doc Default generated property +-spec prop_claim(proper_statem:symbolic_state()) -> proper:property(). +prop_claim(InitialState) -> + ?FORALL(Cmds, proper_fsm:commands(?MODULE, {starting, InitialState}), + begin + {_H, {_FinalStateName, S}, Res} = proper_fsm:run_commands(?MODULE, Cmds), + Ring = S#state.ring, + aggregate(command_names(Cmds), + measure(ring_size, ring_size(Ring), + measure(node_count, node_count(Ring), + Res == ok))) + end). + +ring_size(undefined) -> + 0; +ring_size(Ring) -> + riak_core_ring:num_partitions(Ring). + +node_count(undefined) -> + 0; +node_count(Ring) -> + length(riak_core_ring:members(Ring, [joining, valid])). + +weight(_From, _To, add_node, _Args) -> + 5; +weight(_, _, _, _) -> + 1. + +%% eunit stuff +-define(QC_OUT(P), + proper:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). 
+ +eqc_check(File, Prop) -> + {ok, Bytes} = file:read_file(File), + CE = binary_to_term(Bytes), + proper:check(Prop, CE). + +%% Helpers +transfer_ring(Ring) -> + Owners = riak_core_ring:all_owners(Ring), + RFinal = lists:foldl(fun({Idx, Owner}, Racc) -> + riak_core_ring:transfer_node(Idx, Owner, Racc) end, + Ring, Owners), + RFinal. + + + +-endif. diff --git a/test/pqc/riak_core_claim_util_qc.erl b/test/pqc/riak_core_claim_util_qc.erl new file mode 100644 index 000000000..d58c495bc --- /dev/null +++ b/test/pqc/riak_core_claim_util_qc.erl @@ -0,0 +1,50 @@ +-module(riak_core_claim_util_qc). +-ifdef(TEST). +-ifdef(PROPER). +-include_lib("proper/include/proper.hrl"). +-include_lib("eunit/include/eunit.hrl"). +%-compile(export_all). + +property_adjacency_summary_test_() -> + {timeout, 120, + ?_test(proper:quickcheck(prop_adjacency_summary(), [{numtest, 5000}]))}. + +longer_list(K, G) -> + ?SIZED(Size, proper_types:resize(trunc(K*Size), list(proper_types:resize(Size, G)))). + +%% Compare directly constructing the adjacency matrix against +%% one using prepend/fixup. +prop_adjacency_summary() -> + ?FORALL({OwnersSeed, S}, + {non_empty(longer_list(40, proper_types:largeint())), + ?LET(X, proper_types:int(), 1 + abs(X))}, + begin + Owners = [list_to_atom("n" ++ integer_to_list(1 + (abs(I) rem S))) + || I <- OwnersSeed], + AM = riak_core_claim_util:adjacency_matrix(Owners), + AS = riak_core_claim_util:summarize_am(AM), + + {Owners2, _DAM2, FixDAM2} = build(Owners), + AS2 = riak_core_claim_util:summarize_am(dict:to_list(FixDAM2)), + ?WHENFAIL( + begin + io:format(user, "S=~p\nOwners =~p\n", [S, Owners]), + io:format(user, "=== AM ===\n~p\n", [AM]), + io:format(user, "=== FixAM2 ===\n~p\n", [dict:to_list(FixDAM2)]), + io:format(user, "=== AS2 ===\n~p\n", [AS2]) + end, + proper:conjunction([{owners, Owners == Owners2}, + {am2, lists:sort(AS)== lists:sort(AS2)}])) + end). + +build(Owners) -> + build(lists:usort(Owners), lists:reverse(Owners), [], dict:new()). 
+ +build(_M, [], Owners, DAM) -> + {Owners, DAM, riak_core_claim_util:fixup_dam(Owners, DAM)}; +build(M, [N|Rest], Owners, DAM) -> + {Owners1, DAM1} = riak_core_claim_util:prepend(M, N, Owners, DAM), + build(M, Rest, Owners1, DAM1). + +-endif. +-endif. diff --git a/eqc/riak_core_ring_eqc.erl b/test/pqc/riak_core_ring_eqc.erl similarity index 95% rename from eqc/riak_core_ring_eqc.erl rename to test/pqc/riak_core_ring_eqc.erl index bf45d96d2..1187e9d8e 100644 --- a/eqc/riak_core_ring_eqc.erl +++ b/test/pqc/riak_core_ring_eqc.erl @@ -21,15 +21,15 @@ -module(riak_core_ring_eqc). --ifdef(EQC). +-ifdef(PROPER). -export([prop_future_index/0]). --include_lib("eqc/include/eqc.hrl"). +-include_lib("proper/include/proper.hrl"). -include_lib("eunit/include/eunit.hrl"). -define(TEST_ITERATIONS, 10000). -define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). + proper:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). eqc_test_() -> @@ -43,7 +43,7 @@ eqc_test_() -> {timeout, 60000, % timeout is in msec %% Indicate the number of test iterations for each property here ?_assertEqual(true, - quickcheck(numtests(?TEST_ITERATIONS, + proper:quickcheck(numtests(?TEST_ITERATIONS, ?QC_OUT(prop_future_index())))) }]}]}]}. diff --git a/test/pqc/riak_core_ring_util_qc.erl b/test/pqc/riak_core_ring_util_qc.erl new file mode 100644 index 000000000..0a7f48220 --- /dev/null +++ b/test/pqc/riak_core_ring_util_qc.erl @@ -0,0 +1,141 @@ +-module(riak_core_ring_util_qc). +-ifdef(TEST). +-ifdef(PROPER). + +-compile(export_all). +-export([prop_ids_are_boundaries/0, + prop_reverse/0, + prop_monotonic/0, + prop_only_boundaries/0]). +-include_lib("proper/include/proper.hrl"). +-include_lib("eunit/include/eunit.hrl"). + +-define(QC_OUT(P), + proper:on_output(fun(Str, Args) -> + io:format(user, Str, Args) end, P)). +-define(TEST_TIME_SECS, 5). + +-define(HASHMAX, 1 bsl 160 - 1). +-define(RINGSIZEEXPMAX, 11). 
+-define(RINGSIZE(X), (1 bsl X)).%% We'll generate powers of 2 with choose() + %% and convert that to a ring size with this macro +-define(PARTITIONSIZE(X), ((1 bsl 160) div (X))). + +ids_are_boundaries_test_() -> + {timeout, ?TEST_TIME_SECS+5, [?_assert(test_ids_are_boundaries() =:= true)]}. + +test_ids_are_boundaries() -> + test_ids_are_boundaries(?TEST_TIME_SECS). +%TODO check time sec +test_ids_are_boundaries(_TestTimeSecs) -> + proper:quickcheck(?QC_OUT(prop_ids_are_boundaries()), [{numtests, 5000}]). + +reverse_test_() -> + {timeout, ?TEST_TIME_SECS+5, [?_assert(test_reverse() =:= true)]}. + +test_reverse() -> + test_reverse(?TEST_TIME_SECS). +%TODO check time sec +test_reverse(_TestTimeSecs) -> + proper:quickcheck(prop_reverse(), [{numtests, 5000}]). + + +monotonic_test_() -> + {timeout, ?TEST_TIME_SECS+5, [?_assert(test_monotonic() =:= true)]}. + +test_monotonic() -> + test_monotonic(?TEST_TIME_SECS). + +test_monotonic(_TestTimeSecs) -> + proper:quickcheck(?QC_OUT(prop_monotonic()), [{numtests, 5000}]). + + +%% `prop_only_boundaries' should run a little longer: not quite as +%% fast, need to scan a larger portion of hash space to establish +%% correctness +only_boundaries_test_() -> + {timeout, ?TEST_TIME_SECS+15, [?_assert(test_only_boundaries() =:= true)]}. + +test_only_boundaries() -> + test_only_boundaries(?TEST_TIME_SECS+10). + +test_only_boundaries(_TestTimeSecs) -> + proper:quickcheck(prop_only_boundaries(), [{numtests, 5000}]). + +%% Partition IDs should map to hash values which are partition boundaries +prop_ids_are_boundaries() -> + ?FORALL(RingPower, choose(2, ?RINGSIZEEXPMAX), + ?FORALL(PartitionId, choose(0, ?RINGSIZE(RingPower) - 1), + begin + RingSize = ?RINGSIZE(RingPower), + BoundaryHash = + riak_core_ring_util:partition_id_to_hash(PartitionId, + RingSize), + equals(true, + riak_core_ring_util:hash_is_partition_boundary(BoundaryHash, + RingSize)) + end + )). 
+ +%% Partition IDs should map to hash values which map back to the same partition IDs +prop_reverse() -> + ?FORALL(RingPower, choose(2, ?RINGSIZEEXPMAX), + ?FORALL(PartitionId, choose(0, ?RINGSIZE(RingPower) - 1), + begin + RingSize = ?RINGSIZE(RingPower), + BoundaryHash = + riak_core_ring_util:partition_id_to_hash(PartitionId, + RingSize), + equals(PartitionId, + riak_core_ring_util:hash_to_partition_id( + BoundaryHash, RingSize)) + end + )). + +%% For any given hash value, any larger hash value maps to a partition +%% ID of greater or equal value. +prop_monotonic() -> + ?FORALL(RingPower, choose(2, ?RINGSIZEEXPMAX), + ?FORALL(HashValue, choose(0, ?HASHMAX - 1), + ?FORALL(GreaterHash, choose(HashValue + 1, ?HASHMAX), + begin + RingSize = ?RINGSIZE(RingPower), + LowerPartition = + riak_core_ring_util:hash_to_partition_id(HashValue, + RingSize), + GreaterPartition = + riak_core_ring_util:hash_to_partition_id(GreaterHash, + RingSize), + LowerPartition =< GreaterPartition + end + ))). + +%% Hash values which are listed in the ring structure are boundary +%% values +ring_to_set({_RingSize, PropList}) -> + ordsets:from_list(lists:map(fun({Hash, dummy}) -> Hash end, PropList)). + +find_near_boundaries(RingSize, PartitionSize) -> + ?LET({Id, Offset}, {choose(1, RingSize-1), choose(-(RingSize*2), (RingSize*2))}, + Id * PartitionSize + Offset). + +prop_only_boundaries() -> + ?FORALL(RingPower, choose(2, ?RINGSIZEEXPMAX), + ?FORALL({HashValue, BoundarySet}, + {frequency([ + {5, choose(0, ?HASHMAX)}, + {2, find_near_boundaries(?RINGSIZE(RingPower), + ?PARTITIONSIZE(?RINGSIZE(RingPower)))}]), + ring_to_set(chash:fresh(?RINGSIZE(RingPower), dummy))}, + begin + RingSize = ?RINGSIZE(RingPower), + HashIsInRing = ordsets:is_element(HashValue, BoundarySet), + HashIsPartitionBoundary = + riak_core_ring_util:hash_is_partition_boundary(HashValue, + RingSize), + equals(HashIsPartitionBoundary, HashIsInRing) + end + )). + +-endif. +-endif. 
\ No newline at end of file diff --git a/test/eqc/vclock_qc.erl b/test/pqc/vclock_qc.erl similarity index 90% rename from test/eqc/vclock_qc.erl rename to test/pqc/vclock_qc.erl index 0d31b0460..7a6918803 100644 --- a/test/eqc/vclock_qc.erl +++ b/test/pqc/vclock_qc.erl @@ -1,30 +1,25 @@ -module(vclock_qc). --ifdef(EQC). - --include_lib("eqc/include/eqc.hrl"). --include_lib("eqc/include/eqc_statem.hrl"). +-ifdef(PROPER). +-include_lib("proper/include/proper.hrl"). -include_lib("eunit/include/eunit.hrl"). -compile(export_all). -define(ACTOR_IDS, [a,b,c,d,e]). -define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). + proper:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). -record(state, {vclocks = []}). -define(TEST_TIME, 20). -eqc_test_() -> +proper_test_() -> {timeout, - 60, - ?_assert(quickcheck(eqc:testing_time(?TEST_TIME, more_commands(10,?QC_OUT(prop_vclock())))))}. - + 120, + ?_assert(proper:quickcheck(prop_vclock(), [{numtests, 5000}]))}. test() -> - quickcheck(eqc:testing_time(?TEST_TIME, more_commands(10, prop_vclock()))). + proper:quickcheck(more_commands(10, prop_vclock())). -test(Time) -> - quickcheck(eqc:testing_time(Time, more_commands(10, prop_vclock()))). %% Initialize the state @@ -100,11 +95,12 @@ prop_vclock() -> ?FORALL(Cmds,commands(?MODULE), begin put(timestamp, 1), - {H,S,Res} = run_commands(?MODULE,Cmds), - aggregate([ length(V) || {_,V} <- S#state.vclocks ], + {_H,S,Res} = run_commands(?MODULE, Cmds), + aggregate([ length(V) || {_,V} <- S#state.vclocks], aggregate(command_names(Cmds), collect({num_vclocks_div_10, length(S#state.vclocks) div 10}, - pretty_commands(?MODULE,Cmds, {H,S,Res}, Res == ok)))) + Res == ok + ))) end). 
gen_actor_id() -> diff --git a/test/eqc/worker_pool_pulse.erl b/test/pqc/worker_pool_pulse.erl similarity index 100% rename from test/eqc/worker_pool_pulse.erl rename to test/pqc/worker_pool_pulse.erl diff --git a/test/riak_core_base64url_tests.erl b/test/riak_core_base64url_tests.erl deleted file mode 100644 index b4ec62522..000000000 --- a/test/riak_core_base64url_tests.erl +++ /dev/null @@ -1,42 +0,0 @@ -%% ------------------------------------------------------------------- -%% Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% ------------------------------------------------------------------- --module(riak_core_base64url_tests). - --include_lib("eunit/include/eunit.hrl"). - --define(URL, "http://example.com/foo?query=thing"). - -string_to_string_test() -> - Encoded = riak_core_base64url:encode_to_string(?URL), - Decoded = riak_core_base64url:decode_to_string(Encoded), - ?assertEqual(?URL, Decoded). - -string_to_binary_test() -> - Encoded = riak_core_base64url:encode(?URL), - Decoded = riak_core_base64url:decode(Encoded), - ?assertEqual(<>, Decoded). - -binary_to_binary_test() -> - Encoded = riak_core_base64url:encode(<>), - Decoded = riak_core_base64url:decode(Encoded), - ?assertEqual(<>, Decoded). 
- -binary_to_string_test() -> - Encoded = riak_core_base64url:encode_to_string(<>), - Decoded = riak_core_base64url:decode_to_string(Encoded), - ?assertEqual(?URL, Decoded). diff --git a/test/sync_command_test.erl b/test/sync_command_test.erl index 82d503856..f8be289aa 100644 --- a/test/sync_command_test.erl +++ b/test/sync_command_test.erl @@ -93,5 +93,4 @@ stop_servers(_Pid) -> riak_core_test_util:stop_pid(riak_core_vnode_manager), riak_core_test_util:stop_pid(riak_core_ring_events), riak_core_test_util:stop_pid(riak_core_vnode_sup), - riak_core_test_util:stop_pid(riak_core_ring_manager), - application:stop(goldrush). + riak_core_test_util:stop_pid(riak_core_ring_manager). diff --git a/test/worker_pool_test.erl b/test/worker_pool_test.erl index c85202aaa..eafa37182 100644 --- a/test/worker_pool_test.erl +++ b/test/worker_pool_test.erl @@ -20,23 +20,23 @@ -module(worker_pool_test). -behaviour(riak_core_vnode_worker). --include_lib("eunit/include/eunit.hrl"). -export([init_worker/3, handle_work/3]). -init_worker(_VnodeIndex, Noreply, _WorkerProps) -> - {ok, Noreply}. +init_worker(_VnodeIndex, DoReply, _WorkerProps) -> + {ok, DoReply}. -handle_work(Work, From, true = State) -> +handle_work(Work, _From, false = DoReply) -> Work(), - riak_core_vnode:reply(From, ok), - {noreply, State}; -handle_work(Work, _From, false = State) -> + {noreply, DoReply}; +handle_work(Work, _From, true = DoReply) -> Work(), - {reply, ok, State}. + {reply, ok, DoReply}. -ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). + receive_result(N) -> receive {N, ok} when N rem 2 /= 0 -> @@ -48,38 +48,130 @@ receive_result(N) -> timeout end. -simple_worker_pool() -> - {ok, Pool} = riak_core_vnode_worker_pool:start_link(?MODULE, 3, 10, false, []), + +deadlock_test() -> + {ok, Pool} = riak_core_vnode_worker_pool:start_link(?MODULE, 1, 10, true, []), + + CoordinatorLoop = spawn_link(fun () -> + Worker1 = receive {worker_init, W} -> W end, + Root = receive {wait_for_worker, W2} -> W2 end, + Root ! 
continue, + + Root = receive {die_in_the_pool, W3} -> W3 end, + Worker1 ! continue, + receive {ready_to_crash} -> ok end, + + % let the worker actually crash + timer:sleep(50), + Root ! continue, + + receive finish_test -> ok end, + Root ! finish_test + end), + + riak_core_vnode_worker_pool:handle_work(Pool, + fun() -> + CoordinatorLoop ! {worker_init, self()}, + receive continue -> ok end + end, + {raw, 1, self()}), + + CoordinatorLoop ! {wait_for_worker, self()}, + receive continue -> ok end, + + riak_core_vnode_worker_pool:handle_work(Pool, + fun() -> CoordinatorLoop ! {ready_to_crash}, erlang:error(-1) end, + {raw, 1, self()}), + + % now we have to wait a bit + % because handle_work is a cast and there is no way to check when it's in the queue + timer:sleep(50), + + CoordinatorLoop ! {die_in_the_pool, self()}, + receive continue -> ok end, + + % this should not cause a deadlock + riak_core_vnode_worker_pool:handle_work(Pool, fun() -> CoordinatorLoop ! finish_test end, {raw, 1, self()}), + receive finish_test -> ok end, + + unlink(Pool), + ok = riak_core_vnode_worker_pool:stop(Pool, normal), + ok = wait_for_process_death(Pool). + + +simple_reply_worker_pool() -> + {ok, Pool} = riak_core_vnode_worker_pool:start_link(?MODULE, 3, 10, true, []), [ riak_core_vnode_worker_pool:handle_work(Pool, fun() -> - timer:sleep(100), + timer:sleep(10), 1/(N rem 2) end, {raw, N, self()}) || N <- lists:seq(1, 10)], - timer:sleep(1200), + timer:sleep(200), - %% make sure we got all the expected responses + %% make sure we got all replies [ ?assertEqual(true, receive_result(N)) || N <- lists:seq(1, 10)], unlink(Pool), - riak_core_vnode_worker_pool:stop(Pool, normal). + ok = riak_core_vnode_worker_pool:stop(Pool, normal), + ok = wait_for_process_death(Pool). 
simple_noreply_worker_pool() -> - {ok, Pool} = riak_core_vnode_worker_pool:start_link(?MODULE, 3, 10, true, []), + {ok, Pool} = riak_core_vnode_worker_pool:start_link(?MODULE, 3, 10, false, []), [ riak_core_vnode_worker_pool:handle_work(Pool, fun() -> - timer:sleep(100), + timer:sleep(10), 1/(N rem 2) end, {raw, N, self()}) || N <- lists:seq(1, 10)], - timer:sleep(1200), + timer:sleep(200), - %% make sure we got all the expected responses + %% make sure that the non-crashing work calls receive timeouts + [ ?assertEqual(timeout, receive_result(N)) || N <- lists:seq(1, 10), N rem 2 == 1], + [ ?assertEqual(true, receive_result(N)) || N <- lists:seq(1, 10), N rem 2 == 0], - [ ?assertEqual(true, receive_result(N)) || N <- lists:seq(1, 10)], unlink(Pool), - riak_core_vnode_worker_pool:stop(Pool, normal). + ok = riak_core_vnode_worker_pool:stop(Pool, normal), + ok = wait_for_process_death(Pool). + +shutdown_pool_empty_success() -> + {ok, Pool} = riak_core_vnode_worker_pool:start_link(?MODULE, 3, 10, false, []), + unlink(Pool), + ok = riak_core_vnode_worker_pool:shutdown_pool(Pool, 100), + ok = wait_for_process_death(Pool), + ok. + +shutdown_pool_worker_finish_success() -> + {ok, Pool} = riak_core_vnode_worker_pool:start_link(?MODULE, 3, 10, false, []), + riak_core_vnode_worker_pool:handle_work(Pool, fun() -> timer:sleep(50) end, {raw, 1, self()}), + unlink(Pool), + ok = riak_core_vnode_worker_pool:shutdown_pool(Pool, 100), + ok = wait_for_process_death(Pool), + ok. + +shutdown_pool_force_timeout() -> + {ok, Pool} = riak_core_vnode_worker_pool:start_link(?MODULE, 3, 10, false, []), + riak_core_vnode_worker_pool:handle_work(Pool, fun() -> timer:sleep(100) end, {raw, 1, self()}), + unlink(Pool), + {error, vnode_shutdown} = riak_core_vnode_worker_pool:shutdown_pool(Pool, 50), + ok = wait_for_process_death(Pool), + ok. 
+ +shutdown_pool_duplicate_calls() -> + {ok, Pool} = riak_core_vnode_worker_pool:start_link(?MODULE, 3, 10, false, []), + riak_core_vnode_worker_pool:handle_work(Pool, fun() -> timer:sleep(100) end, {raw, 1, self()}), + unlink(Pool), + + %% request shutdown a bit later a second time + spawn_link(fun() -> + timer:sleep(30), + {error, vnode_shutdown} = riak_core_vnode_worker_pool:shutdown_pool(Pool, 50) + end), + + {error, vnode_shutdown} = riak_core_vnode_worker_pool:shutdown_pool(Pool, 50), + ok = wait_for_process_death(Pool), + ok. pool_test_() -> @@ -87,9 +179,22 @@ pool_test_() -> fun() -> error_logger:tty(false) end, fun(_) -> error_logger:tty(true) end, [ - fun simple_worker_pool/0, - fun simple_noreply_worker_pool/0 - ] + fun simple_reply_worker_pool/0, + fun simple_noreply_worker_pool/0, + fun shutdown_pool_empty_success/0, + fun shutdown_pool_worker_finish_success/0, + fun shutdown_pool_force_timeout/0, + fun shutdown_pool_duplicate_calls/0, + fun deadlock_test/0 + ] }. +wait_for_process_death(Pid) -> + wait_for_process_death(Pid, is_process_alive(Pid)). + +wait_for_process_death(Pid, true) -> + wait_for_process_death(Pid, is_process_alive(Pid)); +wait_for_process_death(_Pid, false) -> + ok. + -endif. diff --git a/tools.mk b/tools.mk deleted file mode 100644 index 72638c65d..000000000 --- a/tools.mk +++ /dev/null @@ -1,16 +0,0 @@ -REBAR ?= ./rebar3 - -.PHONY: test docs xref dialyzer \ - cleanplt - -test: compile - ${REBAR} eunit - -docs: - ${REBAR} doc - -xref: compile - ${REBAR} xref - -dialyzer: - ${REBAR} dialyzer