jamtur01's picture
Upload folder using huggingface_hub
9c6594c verified
"""Reliably launch and connect to backend server process (wandb service).
Backend server process can be connected to using tcp sockets transport.
"""
import datetime
import os
import pathlib
import platform
import shutil
import subprocess
import sys
import tempfile
import time
from typing import TYPE_CHECKING, Any, Dict, Optional
from wandb import _sentry
from wandb.env import (
core_debug,
dcgm_profiling_enabled,
error_reporting_enabled,
is_require_legacy_service,
)
from wandb.errors import Error, WandbCoreNotAvailableError
from wandb.errors.term import termlog, termwarn
from wandb.util import get_core_path, get_module
from . import _startup_debug, port_file
if TYPE_CHECKING:
from wandb.sdk.wandb_settings import Settings
class ServiceStartProcessError(Error):
"""Raised when a known error occurs when launching wandb service."""
class ServiceStartTimeoutError(Error):
"""Raised when service start times out."""
class ServiceStartPortError(Error):
"""Raised when service start fails to find a port."""
class _Service:
_settings: "Settings"
_sock_port: Optional[int]
_internal_proc: Optional[subprocess.Popen]
_startup_debug_enabled: bool
def __init__(
self,
settings: "Settings",
) -> None:
self._settings = settings
self._stub = None
self._sock_port = None
self._internal_proc = None
self._startup_debug_enabled = _startup_debug.is_enabled()
_sentry.configure_scope(tags=dict(settings), process_context="service")
def _startup_debug_print(self, message: str) -> None:
if not self._startup_debug_enabled:
return
_startup_debug.print_message(message)
def _wait_for_ports(
self, fname: str, proc: Optional[subprocess.Popen] = None
) -> None:
"""Wait for the service to write the port file and then read it.
Args:
fname: The path to the port file.
proc: The process to wait for.
Raises:
ServiceStartTimeoutError: If the service takes too long to start.
ServiceStartPortError: If the service writes an invalid port file or unable to read it.
ServiceStartProcessError: If the service process exits unexpectedly.
"""
time_max = time.monotonic() + self._settings.x_service_wait
while time.monotonic() < time_max:
if proc and proc.poll():
# process finished
# define these variables for sentry context grab:
# command = proc.args
# sys_executable = sys.executable
# which_python = shutil.which("python3")
# proc_out = proc.stdout.read()
# proc_err = proc.stderr.read()
context = dict(
command=proc.args,
sys_executable=sys.executable,
which_python=shutil.which("python3"),
proc_out=proc.stdout.read() if proc.stdout else "",
proc_err=proc.stderr.read() if proc.stderr else "",
)
raise ServiceStartProcessError(
f"The wandb service process exited with {proc.returncode}. "
"Ensure that `sys.executable` is a valid python interpreter. "
"You can override it with the `_executable` setting "
"or with the `WANDB_X_EXECUTABLE` environment variable."
f"\n{context}",
context=context,
)
if not os.path.isfile(fname):
time.sleep(0.2)
continue
try:
pf = port_file.PortFile()
pf.read(fname)
if not pf.is_valid:
time.sleep(0.2)
continue
self._sock_port = pf.sock_port
except Exception as e:
# todo: point at the docs. this could be due to a number of reasons,
# for example, being unable to write to the port file etc.
raise ServiceStartPortError(
f"Failed to allocate port for wandb service: {e}."
)
return
raise ServiceStartTimeoutError(
"Timed out waiting for wandb service to start after "
f"{self._settings.x_service_wait} seconds. "
"Try increasing the timeout with the `_service_wait` setting."
)
def _launch_server(self) -> None:
"""Launch server and set ports."""
# References for starting processes
# - https://github.com/wandb/wandb/blob/archive/old-cli/wandb/__init__.py
# - https://stackoverflow.com/questions/1196074/how-to-start-a-background-process-in-python
self._startup_debug_print("launch")
kwargs: Dict[str, Any] = dict(close_fds=True)
# flags to handle keyboard interrupt signal that is causing a hang
if platform.system() == "Windows":
kwargs.update(creationflags=subprocess.CREATE_NEW_PROCESS_GROUP) # type: ignore [attr-defined]
else:
kwargs.update(start_new_session=True)
pid = str(os.getpid())
with tempfile.TemporaryDirectory() as tmpdir:
fname = os.path.join(tmpdir, f"port-{pid}.txt")
executable = self._settings.x_executable
exec_cmd_list = [executable, "-m"]
service_args = []
if not is_require_legacy_service():
try:
core_path = get_core_path()
except WandbCoreNotAvailableError as e:
_sentry.reraise(e)
service_args.extend([core_path])
if not error_reporting_enabled():
service_args.append("--no-observability")
if core_debug(default="False"):
service_args.extend(["--log-level", "-4"])
if dcgm_profiling_enabled():
service_args.append("--enable-dcgm-profiling")
exec_cmd_list = []
else:
service_args.extend(["wandb", "service", "--debug"])
termwarn(
"Using legacy-service, which is deprecated. If this is"
" unintentional, you can fix it by ensuring you do not call"
" `wandb.require('legacy-service')` and do not set the"
" WANDB_X_REQUIRE_LEGACY_SERVICE environment"
" variable."
)
service_args += [
"--port-filename",
fname,
"--pid",
pid,
]
if os.environ.get("WANDB_SERVICE_PROFILE") == "memray":
_ = get_module(
"memray",
required=(
"wandb service memory profiling requires memray, "
"install with `pip install memray`"
),
)
time_tag = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
output_file = f"wandb_service.memray.{time_tag}.bin"
cli_executable = (
pathlib.Path(__file__).parent.parent.parent.parent
/ "tools"
/ "cli.py"
)
exec_cmd_list = [
executable,
"-m",
"memray",
"run",
"-o",
output_file,
]
service_args[0] = str(cli_executable)
termlog(
f"wandb service memory profiling enabled, output file: {output_file}"
)
termlog(
f"Convert to flamegraph with: `python -m memray flamegraph {output_file}`"
)
try:
internal_proc = subprocess.Popen(
exec_cmd_list + service_args, # type: ignore[arg-type]
env=os.environ,
**kwargs,
)
except Exception as e:
_sentry.reraise(e)
self._startup_debug_print("wait_ports")
try:
self._wait_for_ports(fname, proc=internal_proc)
except Exception as e:
_sentry.reraise(e)
self._startup_debug_print("wait_ports_done")
self._internal_proc = internal_proc
self._startup_debug_print("launch_done")
def start(self) -> None:
self._launch_server()
@property
def sock_port(self) -> Optional[int]:
return self._sock_port
def join(self) -> int:
ret = 0
if self._internal_proc:
ret = self._internal_proc.wait()
return ret