vllm.config.profiler

ProfilerKind module-attribute

ProfilerKind = Literal['torch', 'cuda']
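
A minimal sketch of how this alias constrains values (assuming it is importable from vllm.config.profiler, per the source path below):

from vllm.config.profiler import ProfilerKind

kind: ProfilerKind = "torch"  # accepted values: 'torch' or 'cuda'
kind = "nsight"               # rejected by a type checker such as mypy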

logger module-attribute

logger = init_logger(__name__)

ProfilerConfig

Dataclass which contains profiler config for the engine.

Source code in vllm/config/profiler.py
@config
@dataclass
class ProfilerConfig:
    """Dataclass which contains profiler config for the engine."""

    profiler: ProfilerKind | None = None
    """Which profiler to use. Defaults to None. Options are:

    - 'torch': Use PyTorch profiler.\n
    - 'cuda': Use CUDA profiler."""

    torch_profiler_dir: str = ""
    """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
    worker's traces (CPU & GPU) will be saved under this directory. Note that
    it must be an absolute path."""

    torch_profiler_with_stack: bool = True
    """If `True`, enables stack tracing in the torch profiler. Enabled by default."""

    torch_profiler_with_flops: bool = False
    """If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""

    torch_profiler_use_gzip: bool = True
    """If `True`, saves torch profiler traces in gzip format. Enabled by default"""

    torch_profiler_dump_cuda_time_total: bool = True
    """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""

    torch_profiler_record_shapes: bool = False
    """If `True`, records tensor shapes in the torch profiler. Disabled by default."""

    torch_profiler_with_memory: bool = False
    """If `True`, enables memory profiling in the torch profiler.
    Disabled by default."""

    ignore_frontend: bool = False
    """If `True`, disables the front-end profiling of AsyncLLM when using the 
    'torch' profiler. This is needed to reduce overhead when using delay/limit options,
    since the front-end profiling does not track iterations and will capture the
    entire range.
    """

    delay_iterations: int = Field(default=0, ge=0)
    """Number of engine iterations to skip before starting profiling.
    Defaults to 0, meaning profiling starts immediately after receiving /start_profile.
    """

    max_iterations: int = Field(default=0, ge=0)
    """Maximum number of engine iterations to profile after starting profiling.
    Defaults to 0, meaning no limit.
    """

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> str | None:
        """Get field from env var if set, with deprecation warning."""

        if envs.is_set(env_var_name):
            value = getattr(envs, env_var_name)
            logger.warning_once(
                "Using %s environment variable is deprecated and will be removed in "
                "v0.14.0 or v1.0.0, whichever is soonest. Please use "
                "--profiler-config.%s command line argument or "
                "ProfilerConfig(%s=...) config field instead.",
                env_var_name,
                field_name,
                field_name,
            )
            return value
        return None

    def _set_from_env_if_set(
        self,
        field_name: str,
        env_var_name: str,
        to_bool: bool = True,
        to_int: bool = False,
    ) -> None:
        """Set field from env var if set, with deprecation warning."""
        value = self._get_from_env_if_set(field_name, env_var_name)
        if value is not None:
            if to_bool:
                value = value == "1"
            if to_int:
                value = int(value)
            setattr(self, field_name, value)

    @model_validator(mode="after")
    def _validate_profiler_config(self) -> Self:
        maybe_use_cuda_profiler = self._get_from_env_if_set(
            "profiler", "VLLM_TORCH_CUDA_PROFILE"
        )
        if maybe_use_cuda_profiler is not None:
            self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
        else:
            self._set_from_env_if_set(
                "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
            )
            if self.torch_profiler_dir:
                self.profiler = "torch"
                self._set_from_env_if_set(
                    "torch_profiler_record_shapes",
                    "VLLM_TORCH_PROFILER_RECORD_SHAPES",
                )
                self._set_from_env_if_set(
                    "torch_profiler_with_memory",
                    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
                )
                self._set_from_env_if_set(
                    "torch_profiler_with_stack",
                    "VLLM_TORCH_PROFILER_WITH_STACK",
                )
                self._set_from_env_if_set(
                    "torch_profiler_with_flops",
                    "VLLM_TORCH_PROFILER_WITH_FLOPS",
                )
                self._set_from_env_if_set(
                    "ignore_frontend",
                    "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
                )
                self._set_from_env_if_set(
                    "torch_profiler_use_gzip",
                    "VLLM_TORCH_PROFILER_USE_GZIP",
                )
                self._set_from_env_if_set(
                    "torch_profiler_dump_cuda_time_total",
                    "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
                )

        self._set_from_env_if_set(
            "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True
        )
        self._set_from_env_if_set(
            "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True
        )

        has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
        if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
            logger.warning_once(
                "Using 'torch' profiler with delay_iterations or max_iterations "
                "while ignore_frontend is False may result in high overhead."
            )

        profiler_dir = self.torch_profiler_dir
        if profiler_dir and self.profiler != "torch":
            raise ValueError(
                "torch_profiler_dir is only applicable when profiler is set to 'torch'"
            )
        if self.profiler == "torch" and not profiler_dir:
            raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")

        if profiler_dir:
            is_gs_path = (
                profiler_dir.startswith("gs://")
                and profiler_dir[5:]
                and profiler_dir[5] != "/"
            )
            if not is_gs_path:
                self.torch_profiler_dir = os.path.abspath(
                    os.path.expanduser(profiler_dir)
                )

        return self
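
A minimal construction sketch based on the fields above (the trace directory and iteration counts are illustrative placeholders):

from vllm.config.profiler import ProfilerConfig

cfg = ProfilerConfig(
    profiler="torch",
    torch_profiler_dir="/tmp/vllm_traces",  # normalized to an absolute path
    delay_iterations=5,    # skip 5 engine steps after /start_profile
    max_iterations=20,     # then profile at most 20 steps
    ignore_frontend=True,  # skip AsyncLLM front-end traces to reduce overhead
)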

delay_iterations class-attribute instance-attribute

delay_iterations: int = Field(default=0, ge=0)

Number of engine iterations to skip before starting profiling. Defaults to 0, meaning profiling starts immediately after receiving /start_profile.

ignore_frontend class-attribute instance-attribute

ignore_frontend: bool = False

If True, disables the front-end profiling of AsyncLLM when using the 'torch' profiler. This is needed to reduce overhead when using delay/limit options, since the front-end profiling does not track iterations and will capture the entire range.

max_iterations class-attribute instance-attribute

max_iterations: int = Field(default=0, ge=0)

Maximum number of engine iterations to profile after starting profiling. Defaults to 0, meaning no limit.

profiler class-attribute instance-attribute

profiler: ProfilerKind | None = None

Which profiler to use. Defaults to None. Options are:

  • 'torch': Use PyTorch profiler.

  • 'cuda': Use CUDA profiler.

torch_profiler_dir class-attribute instance-attribute

torch_profiler_dir: str = ''

Directory to save torch profiler traces. Both AsyncLLM's CPU traces and worker's traces (CPU & GPU) will be saved under this directory. Note that it must be an absolute path.

torch_profiler_dump_cuda_time_total class-attribute instance-attribute

torch_profiler_dump_cuda_time_total: bool = True

If True, dumps total CUDA time in torch profiler traces. Enabled by default.

torch_profiler_record_shapes class-attribute instance-attribute

torch_profiler_record_shapes: bool = False

If True, records tensor shapes in the torch profiler. Disabled by default.

torch_profiler_use_gzip class-attribute instance-attribute

torch_profiler_use_gzip: bool = True

If True, saves torch profiler traces in gzip format. Enabled by default.

torch_profiler_with_flops class-attribute instance-attribute

torch_profiler_with_flops: bool = False

If True, enables FLOPS counting in the torch profiler. Disabled by default.

torch_profiler_with_memory class-attribute instance-attribute

torch_profiler_with_memory: bool = False

If True, enables memory profiling in the torch profiler. Disabled by default.

torch_profiler_with_stack class-attribute instance-attribute

torch_profiler_with_stack: bool = True

If True, enables stack tracing in the torch profiler. Enabled by default.

_get_from_env_if_set

_get_from_env_if_set(
    field_name: str, env_var_name: str
) -> str | None

Get field from env var if set, with deprecation warning.

Source code in vllm/config/profiler.py
def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> str | None:
    """Get field from env var if set, with deprecation warning."""

    if envs.is_set(env_var_name):
        value = getattr(envs, env_var_name)
        logger.warning_once(
            "Using %s environment variable is deprecated and will be removed in "
            "v0.14.0 or v1.0.0, whichever is soonest. Please use "
            "--profiler-config.%s command line argument or "
            "ProfilerConfig(%s=...) config field instead.",
            env_var_name,
            field_name,
            field_name,
        )
        return value
    return None
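
The fallback pattern can be mimicked with plain os.environ. A simplified stand-in for the helper above (using os.environ and print rather than vllm.envs and the vLLM logger, both assumptions for this sketch):

import os

def get_from_env_if_set(field_name: str, env_var_name: str) -> str | None:
    """Return the env var's value (with a deprecation notice) or None."""
    if env_var_name in os.environ:
        print(f"{env_var_name} is deprecated; use "
              f"--profiler-config.{field_name} instead.")
        return os.environ[env_var_name]
    return None

os.environ["VLLM_TORCH_PROFILER_DIR"] = "/tmp/traces"
assert get_from_env_if_set(
    "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR") == "/tmp/traces"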

_set_from_env_if_set

_set_from_env_if_set(
    field_name: str,
    env_var_name: str,
    to_bool: bool = True,
    to_int: bool = False,
) -> None

Set field from env var if set, with deprecation warning.

Source code in vllm/config/profiler.py
def _set_from_env_if_set(
    self,
    field_name: str,
    env_var_name: str,
    to_bool: bool = True,
    to_int: bool = False,
) -> None:
    """Set field from env var if set, with deprecation warning."""
    value = self._get_from_env_if_set(field_name, env_var_name)
    if value is not None:
        if to_bool:
            value = value == "1"
        if to_int:
            value = int(value)
        setattr(self, field_name, value)
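
Note that the boolean conversion compares against the literal string "1", so other truthy-looking values are treated as False. A quick illustration:

def env_str_to_bool(value: str) -> bool:
    # Mirrors the `value == "1"` check in _set_from_env_if_set.
    return value == "1"

assert env_str_to_bool("1") is True
assert env_str_to_bool("0") is False
assert env_str_to_bool("true") is False  # anything other than "1" is False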

_validate_profiler_config

_validate_profiler_config() -> Self

Resolve deprecated environment variable overrides, then validate and normalize the profiler settings.

Source code in vllm/config/profiler.py
@model_validator(mode="after")
def _validate_profiler_config(self) -> Self:
    maybe_use_cuda_profiler = self._get_from_env_if_set(
        "profiler", "VLLM_TORCH_CUDA_PROFILE"
    )
    if maybe_use_cuda_profiler is not None:
        self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
    else:
        self._set_from_env_if_set(
            "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
        )
        if self.torch_profiler_dir:
            self.profiler = "torch"
            self._set_from_env_if_set(
                "torch_profiler_record_shapes",
                "VLLM_TORCH_PROFILER_RECORD_SHAPES",
            )
            self._set_from_env_if_set(
                "torch_profiler_with_memory",
                "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
            )
            self._set_from_env_if_set(
                "torch_profiler_with_stack",
                "VLLM_TORCH_PROFILER_WITH_STACK",
            )
            self._set_from_env_if_set(
                "torch_profiler_with_flops",
                "VLLM_TORCH_PROFILER_WITH_FLOPS",
            )
            self._set_from_env_if_set(
                "ignore_frontend",
                "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
            )
            self._set_from_env_if_set(
                "torch_profiler_use_gzip",
                "VLLM_TORCH_PROFILER_USE_GZIP",
            )
            self._set_from_env_if_set(
                "torch_profiler_dump_cuda_time_total",
                "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
            )

    self._set_from_env_if_set(
        "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True
    )
    self._set_from_env_if_set(
        "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True
    )

    has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
    if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
        logger.warning_once(
            "Using 'torch' profiler with delay_iterations or max_iterations "
            "while ignore_frontend is False may result in high overhead."
        )

    profiler_dir = self.torch_profiler_dir
    if profiler_dir and self.profiler != "torch":
        raise ValueError(
            "torch_profiler_dir is only applicable when profiler is set to 'torch'"
        )
    if self.profiler == "torch" and not profiler_dir:
        raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")

    if profiler_dir:
        is_gs_path = (
            profiler_dir.startswith("gs://")
            and profiler_dir[5:]
            and profiler_dir[5] != "/"
        )
        if not is_gs_path:
            self.torch_profiler_dir = os.path.abspath(
                os.path.expanduser(profiler_dir)
            )

    return self
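
The cross-field checks and path normalization are observable at construction time (assuming validation runs on construction, as the @model_validator decorator implies):

from vllm.config.profiler import ProfilerConfig

# '~' and relative paths are expanded to absolute paths...
cfg = ProfilerConfig(profiler="torch", torch_profiler_dir="~/traces")
print(cfg.torch_profiler_dir)   # e.g. /home/user/traces

# ...while gs://<bucket> paths are left untouched.
cfg = ProfilerConfig(profiler="torch", torch_profiler_dir="gs://bucket/traces")
print(cfg.torch_profiler_dir)   # gs://bucket/traces

# Either of these fails validation with the ValueError messages above:
#   ProfilerConfig(torch_profiler_dir="/tmp/traces")  # dir without profiler='torch'
#   ProfilerConfig(profiler="torch")                  # 'torch' without a dir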

compute_hash

compute_hash() -> str

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/profiler.py
def compute_hash(self) -> str:
    """
    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    Provide a hash that uniquely identifies all the configs
    that affect the structure of the computation
    graph from input ids/embeddings to the final hidden states,
    excluding anything before input ids/embeddings and after
    the final hidden states.
    """
    # no factors to consider.
    # this config will not affect the computation graph.
    factors: list[Any] = []
    hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
    return hash_str
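
Because the factors list is empty, every instance hashes identically: profiler settings are deliberately excluded from compilation-cache keys. A quick check:

from vllm.config.profiler import ProfilerConfig

a = ProfilerConfig()
b = ProfilerConfig(profiler="cuda", delay_iterations=10)
assert a.compute_hash() == b.compute_hash()  # profiler options never vary the hash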