diff options
| author | Luke Bakken <luke@bakken.io> | 2021-12-14 17:35:14 -0800 |
|---|---|---|
| committer | Michael Klishin <michael@clojurewerkz.org> | 2021-12-26 04:31:59 +0300 |
| commit | 6200887f84728e3324037365ba027f55769aa41e (patch) | |
| tree | 90205a3cbb1ff7d637332d2342e898fe046e0783 | |
| parent | 2d58ce7c5f1d7b4137f98fa5dc48e0a9a9884722 (diff) | |
| download | rabbitmq-server-git-6200887f84728e3324037365ba027f55769aa41e.tar.gz | |
Disk monitor improvements
Related to VESC-1015
* Remove `infinity` timeouts
* Improve free disk space retrieval on win32
Run commands with a timeout
This PR fixes an issue I observed while reproducing VESC-1015 on Windows
10. Within an hour or so of running a 3-node cluster that has health
checks being run against it, one or more nodes' memory use would spike.
I would see that the rabbit_disk_monitor process is stuck executing
os:cmd to retrieve free disk space information. Thus, all
gen_server:call calls to the process would never return, especially
since they used an infinity timeout.
Do something with timeout
Fix unit_disk_monitor_mocks_SUITE
| -rw-r--r-- | deps/rabbit/src/rabbit_disk_monitor.erl | 112 |
1 files changed, 75 insertions, 37 deletions
diff --git a/deps/rabbit/src/rabbit_disk_monitor.erl b/deps/rabbit/src/rabbit_disk_monitor.erl index 76edecf5d8..648da9d442 100644 --- a/deps/rabbit/src/rabbit_disk_monitor.erl +++ b/deps/rabbit/src/rabbit_disk_monitor.erl @@ -75,42 +75,42 @@ -spec get_disk_free_limit() -> integer(). get_disk_free_limit() -> - gen_server:call(?MODULE, get_disk_free_limit, infinity). + gen_server:call(?MODULE, get_disk_free_limit). -spec set_disk_free_limit(disk_free_limit()) -> 'ok'. set_disk_free_limit(Limit) -> - gen_server:call(?MODULE, {set_disk_free_limit, Limit}, infinity). + gen_server:call(?MODULE, {set_disk_free_limit, Limit}). -spec get_min_check_interval() -> integer(). get_min_check_interval() -> - gen_server:call(?MODULE, get_min_check_interval, infinity). + gen_server:call(?MODULE, get_min_check_interval). -spec set_min_check_interval(integer()) -> 'ok'. set_min_check_interval(Interval) -> - gen_server:call(?MODULE, {set_min_check_interval, Interval}, infinity). + gen_server:call(?MODULE, {set_min_check_interval, Interval}). -spec get_max_check_interval() -> integer(). get_max_check_interval() -> - gen_server:call(?MODULE, get_max_check_interval, infinity). + gen_server:call(?MODULE, get_max_check_interval). -spec set_max_check_interval(integer()) -> 'ok'. set_max_check_interval(Interval) -> - gen_server:call(?MODULE, {set_max_check_interval, Interval}, infinity). + gen_server:call(?MODULE, {set_max_check_interval, Interval}). -spec get_disk_free() -> (integer() | 'unknown'). get_disk_free() -> - gen_server:call(?MODULE, get_disk_free, infinity). + gen_server:call(?MODULE, get_disk_free). -spec set_enabled(string()) -> 'ok'. set_enabled(Enabled) -> - gen_server:call(?MODULE, {set_enabled, Enabled}, infinity). + gen_server:call(?MODULE, {set_enabled, Enabled}). %%---------------------------------------------------------------------------- %% gen_server callbacks @@ -227,33 +227,19 @@ get_disk_free(Dir) -> get_disk_free(Dir, {unix, Sun}) when Sun =:= sunos; Sun =:= sunos4; Sun =:= solaris -> Df = os:find_executable("df"), - parse_free_unix(rabbit_misc:os_cmd(Df ++ " -k " ++ Dir)); + parse_free_unix(run_cmd(Df ++ " -k " ++ Dir)); get_disk_free(Dir, {unix, _}) -> Df = os:find_executable("df"), - parse_free_unix(rabbit_misc:os_cmd(Df ++ " -kP " ++ Dir)); + parse_free_unix(run_cmd(Df ++ " -kP " ++ Dir)); get_disk_free(Dir, {win32, _}) -> - %% On Windows, the Win32 API enforces a limit of 260 characters - %% (MAX_PATH). If we call `dir` with a path longer than that, it - %% fails with "File not found". Starting with Windows 10 version - %% 1607, this limit was removed, but the administrator has to - %% configure that. - %% - %% NTFS supports paths up to 32767 characters. Therefore, paths - %% longer than 260 characters exist but they are "inaccessible" to - %% `dir`. - %% - %% A workaround is to tell the Win32 API to not parse a path and - %% just pass it raw to the underlying filesystem. To do this, the - %% path must be prepended with "\\?\". That's what we do here. - %% - %% However, the underlying filesystem may not support forward - %% slashes transparently, as the Win32 API does. Therefore, we - %% convert all forward slashes to backslashes. - %% - %% See the following page to learn more about this: - %% https://ss64.com/nt/syntax-filenames.html - RawDir = "\\\\?\\" ++ string:replace(Dir, "/", "\\", all), - parse_free_win32(rabbit_misc:os_cmd("dir /-C /W \"" ++ RawDir ++ "\"")). + case win32_get_disk_free_fsutil(Dir) of + {ok, Free0} -> Free0; + error -> + case win32_get_disk_free_pwsh(Dir) of + {ok, Free1} -> Free1; + _ -> exit(could_not_determine_disk_free) + end + end. parse_free_unix(Str) -> case string:tokens(Str, "\n") of @@ -264,11 +250,46 @@ parse_free_unix(Str) -> _ -> exit({unparseable, Str}) end. -parse_free_win32(CommandResult) -> - LastLine = lists:last(string:tokens(CommandResult, "\r\n")), - {match, [Free]} = re:run(lists:reverse(LastLine), "(\\d+)", - [{capture, all_but_first, list}]), - list_to_integer(lists:reverse(Free)). +win32_get_disk_free_fsutil(Dir) -> + % Dir: + % "c:/Users/username/AppData/Roaming/RabbitMQ/db/rabbit2@username-z01-mnesia" + Drive = string:slice(Dir, 0, 2), + + % Drive: c: + FsutilCmd = "fsutil.exe volume diskfree " ++ Drive, + + % C:\windows\system32>fsutil volume diskfree c: + % Total free bytes : 812,733,878,272 (756.9 GB) + % Total bytes : 1,013,310,287,872 (943.7 GB) + % Total quota free bytes : 812,733,878,272 (756.9 GB) + case run_cmd(FsutilCmd) of + {error, timeout} -> + error; + FsutilResult -> + case string:slice(FsutilResult, 0, 5) of + "Error" -> + error; + "Total" -> + FirstLine = hd(string:tokens(FsutilResult, "\r\n")), + {match, [FreeStr]} = re:run(FirstLine, "(\\d+,?)+", [{capture, first, list}]), + {ok, list_to_integer(lists:flatten(string:tokens(FreeStr, ",")))} + end + end. + + +win32_get_disk_free_pwsh(Dir) -> + % Dir: + % "c:/Users/username/AppData/Roaming/RabbitMQ/db/rabbit2@username-z01-mnesia" + Drive = string:slice(Dir, 0, 1), + PoshCmd = "powershell.exe -NoLogo -NoProfile -NonInteractive -Command (Get-PSDrive " ++ Drive ++ ").Free", + case run_cmd(PoshCmd) of + {error, timeout} -> + error; + PoshResultStr -> + % Note: remove \r\n + PoshResult = string:slice(PoshResultStr, 0, length(PoshResultStr) - 2), + {ok, list_to_integer(PoshResult)} + end. interpret_limit({mem_relative, Relative}) when is_number(Relative) -> @@ -318,3 +339,20 @@ enable(#state{dir = Dir, interval = Interval, limit = Limit, retries = Retries} erlang:send_after(Interval, self(), try_enable), State#state{enabled = false} end. + +run_cmd(Cmd) -> + Pid = self(), + Ref = make_ref(), + CmdFun = fun() -> + CmdResult = rabbit_misc:os_cmd(Cmd), + Pid ! {Pid, Ref, CmdResult} + end, + CmdPid = spawn(CmdFun), + receive + {Pid, Ref, CmdResult} -> + CmdResult + after 5000 -> + exit(CmdPid, kill), + rabbit_log:error("Command timed out: '~s'", [Cmd]), + {error, timeout} + end. |
