File size: 8,910 Bytes
9c6594c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
"""Reliably launch and connect to backend server process (wandb service).

Backend server process can be connected to using tcp sockets transport.
"""

import datetime
import os
import pathlib
import platform
import shutil
import subprocess
import sys
import tempfile
import time
from typing import TYPE_CHECKING, Any, Dict, Optional

from wandb import _sentry
from wandb.env import (
    core_debug,
    dcgm_profiling_enabled,
    error_reporting_enabled,
    is_require_legacy_service,
)
from wandb.errors import Error, WandbCoreNotAvailableError
from wandb.errors.term import termlog, termwarn
from wandb.util import get_core_path, get_module

from . import _startup_debug, port_file

if TYPE_CHECKING:
    from wandb.sdk.wandb_settings import Settings


class ServiceStartProcessError(Error):
    """Raised when a known error occurs when launching wandb service."""


class ServiceStartTimeoutError(Error):
    """Raised when service start times out."""


class ServiceStartPortError(Error):
    """Raised when service start fails to find a port."""


class _Service:
    _settings: "Settings"
    _sock_port: Optional[int]
    _internal_proc: Optional[subprocess.Popen]
    _startup_debug_enabled: bool

    def __init__(
        self,
        settings: "Settings",
    ) -> None:
        self._settings = settings
        self._stub = None
        self._sock_port = None
        self._internal_proc = None
        self._startup_debug_enabled = _startup_debug.is_enabled()

        _sentry.configure_scope(tags=dict(settings), process_context="service")

    def _startup_debug_print(self, message: str) -> None:
        if not self._startup_debug_enabled:
            return
        _startup_debug.print_message(message)

    def _wait_for_ports(
        self, fname: str, proc: Optional[subprocess.Popen] = None
    ) -> None:
        """Wait for the service to write the port file and then read it.

        Args:
            fname: The path to the port file.
            proc: The process to wait for.

        Raises:
            ServiceStartTimeoutError: If the service takes too long to start.
            ServiceStartPortError: If the service writes an invalid port file or unable to read it.
            ServiceStartProcessError: If the service process exits unexpectedly.

        """
        time_max = time.monotonic() + self._settings.x_service_wait
        while time.monotonic() < time_max:
            if proc and proc.poll():
                # process finished
                # define these variables for sentry context grab:
                # command = proc.args
                # sys_executable = sys.executable
                # which_python = shutil.which("python3")
                # proc_out = proc.stdout.read()
                # proc_err = proc.stderr.read()
                context = dict(
                    command=proc.args,
                    sys_executable=sys.executable,
                    which_python=shutil.which("python3"),
                    proc_out=proc.stdout.read() if proc.stdout else "",
                    proc_err=proc.stderr.read() if proc.stderr else "",
                )
                raise ServiceStartProcessError(
                    f"The wandb service process exited with {proc.returncode}. "
                    "Ensure that `sys.executable` is a valid python interpreter. "
                    "You can override it with the `_executable` setting "
                    "or with the `WANDB_X_EXECUTABLE` environment variable."
                    f"\n{context}",
                    context=context,
                )
            if not os.path.isfile(fname):
                time.sleep(0.2)
                continue
            try:
                pf = port_file.PortFile()
                pf.read(fname)
                if not pf.is_valid:
                    time.sleep(0.2)
                    continue
                self._sock_port = pf.sock_port
            except Exception as e:
                # todo: point at the docs. this could be due to a number of reasons,
                #  for example, being unable to write to the port file etc.
                raise ServiceStartPortError(
                    f"Failed to allocate port for wandb service: {e}."
                )
            return
        raise ServiceStartTimeoutError(
            "Timed out waiting for wandb service to start after "
            f"{self._settings.x_service_wait} seconds. "
            "Try increasing the timeout with the `_service_wait` setting."
        )

    def _launch_server(self) -> None:
        """Launch server and set ports."""
        # References for starting processes
        # - https://github.com/wandb/wandb/blob/archive/old-cli/wandb/__init__.py
        # - https://stackoverflow.com/questions/1196074/how-to-start-a-background-process-in-python
        self._startup_debug_print("launch")

        kwargs: Dict[str, Any] = dict(close_fds=True)
        # flags to handle keyboard interrupt signal that is causing a hang
        if platform.system() == "Windows":
            kwargs.update(creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)  # type: ignore [attr-defined]
        else:
            kwargs.update(start_new_session=True)

        pid = str(os.getpid())

        with tempfile.TemporaryDirectory() as tmpdir:
            fname = os.path.join(tmpdir, f"port-{pid}.txt")

            executable = self._settings.x_executable
            exec_cmd_list = [executable, "-m"]

            service_args = []

            if not is_require_legacy_service():
                try:
                    core_path = get_core_path()
                except WandbCoreNotAvailableError as e:
                    _sentry.reraise(e)

                service_args.extend([core_path])

                if not error_reporting_enabled():
                    service_args.append("--no-observability")

                if core_debug(default="False"):
                    service_args.extend(["--log-level", "-4"])

                if dcgm_profiling_enabled():
                    service_args.append("--enable-dcgm-profiling")

                exec_cmd_list = []
            else:
                service_args.extend(["wandb", "service", "--debug"])
                termwarn(
                    "Using legacy-service, which is deprecated. If this is"
                    " unintentional, you can fix it by ensuring you do not call"
                    " `wandb.require('legacy-service')` and do not set the"
                    " WANDB_X_REQUIRE_LEGACY_SERVICE environment"
                    " variable."
                )

            service_args += [
                "--port-filename",
                fname,
                "--pid",
                pid,
            ]

            if os.environ.get("WANDB_SERVICE_PROFILE") == "memray":
                _ = get_module(
                    "memray",
                    required=(
                        "wandb service memory profiling requires memray, "
                        "install with `pip install memray`"
                    ),
                )

                time_tag = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
                output_file = f"wandb_service.memray.{time_tag}.bin"
                cli_executable = (
                    pathlib.Path(__file__).parent.parent.parent.parent
                    / "tools"
                    / "cli.py"
                )
                exec_cmd_list = [
                    executable,
                    "-m",
                    "memray",
                    "run",
                    "-o",
                    output_file,
                ]
                service_args[0] = str(cli_executable)
                termlog(
                    f"wandb service memory profiling enabled, output file: {output_file}"
                )
                termlog(
                    f"Convert to flamegraph with: `python -m memray flamegraph {output_file}`"
                )

            try:
                internal_proc = subprocess.Popen(
                    exec_cmd_list + service_args,  # type: ignore[arg-type]
                    env=os.environ,
                    **kwargs,
                )
            except Exception as e:
                _sentry.reraise(e)

            self._startup_debug_print("wait_ports")
            try:
                self._wait_for_ports(fname, proc=internal_proc)
            except Exception as e:
                _sentry.reraise(e)
            self._startup_debug_print("wait_ports_done")
            self._internal_proc = internal_proc
        self._startup_debug_print("launch_done")

    def start(self) -> None:
        self._launch_server()

    @property
    def sock_port(self) -> Optional[int]:
        return self._sock_port

    def join(self) -> int:
        ret = 0
        if self._internal_proc:
            ret = self._internal_proc.wait()
        return ret