Public API¶

The stable surface that user code imports and calls. Everything on this page is covered by TraceML's compatibility contract across v0.x minor releases.

Decorators¶

traceml.decorators.trace_step ¶

trace_step(model: Module)

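Define a single training step boundary. Despite living under traceml.decorators, trace_step is a context manager: wrap the code for one step in `with trace_step(model): ...`.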

Source code in src/traceml/sdk/instrumentation.py

@contextmanager
def trace_step(model: nn.Module):
    """
    Mark the boundary of one training step.

    Intended to be used as ``with trace_step(model): ...``.

    When TraceML is disabled the body simply runs and nothing else happens.
    Otherwise the body runs inside the ``_traceml_internal:step_time`` timed
    region and the forward/backward auto-timers. On exit, in this order:

    1. the session step counter is advanced, but only if the body finished
       without raising;
    2. the step memory tracker is asked to record;
    3. step events are flushed for the session's current step.

    Exceptions raised by the tracker's ``reset``/``record`` calls or by the
    flush are passed to ``_log_instrumentation_error`` instead of being
    re-raised.
    """
    if _traceml_disabled():
        yield
        return

    session = get_trace_session_state()
    tracker = StepMemoryTracker(model)

    def _best_effort(label, action):
        # Instrumentation bookkeeping is best-effort: log and carry on.
        try:
            action()
        except Exception as exc:
            _log_instrumentation_error(label, exc)

    # Lambdas keep attribute lookups inside the guarded call as well.
    _best_effort("reset failed", lambda: tracker.reset())

    body_finished = False
    try:
        step_timer = timed_region(
            "_traceml_internal:step_time", scope="step", use_gpu=False
        )
        with step_timer:
            with forward_auto_timer(), backward_auto_timer():
                if _should_auto_install_optimizer_timing():
                    ensure_optimizer_timing_installed()
                yield
                # Only reached when the caller's body did not raise.
                body_finished = True
    finally:
        if body_finished:
            session.advance_step()

        _best_effort("record failed", lambda: tracker.record())
        _best_effort(
            "flush failed",
            lambda: flush_step_events(model, session.step),
        )

traceml.decorators.trace_model_instance ¶

trace_model_instance(model: Module, sample_layer_memory: bool = True, trace_layer_forward_memory: bool = True, trace_layer_backward_memory: bool = True, trace_layer_forward_time: bool = True, trace_layer_backward_time: bool = True, trace_execution: bool = True, include_names: Optional[List[str]] = None, exclude_names: Optional[List[str]] = None, leaf_only: bool = True) -> None

Manually trace a PyTorch model instance.

This is primarily used by the deep profile and integration layers for model-level hook attachment. It is independent of the automatic patch policy configured by traceml.init(...). The call does nothing when TraceML is disabled or when the active profile is not "deep".

Source code in src/traceml/sdk/instrumentation.py

def trace_model_instance(
    model: nn.Module,
    sample_layer_memory: bool = True,
    trace_layer_forward_memory: bool = True,
    trace_layer_backward_memory: bool = True,
    trace_layer_forward_time: bool = True,
    trace_layer_backward_time: bool = True,
    trace_execution: bool = True,
    include_names: Optional[List[str]] = None,
    exclude_names: Optional[List[str]] = None,
    leaf_only: bool = True,
) -> None:
    """
    Manually trace a PyTorch model instance.

    Mainly used by the deep profile and the integration layers to attach
    model-level hooks. It does not depend on the automatic patch policy
    configured by `traceml.init(...)`.

    The call returns immediately when TraceML is disabled or the active
    profile is not ``"deep"``. Any exception raised while tracing
    (including the ``TypeError`` for a non-``nn.Module`` argument) is
    reported through ``_log_instrumentation_error`` rather than propagated.

    Args:
        model: The module to instrument.
        sample_layer_memory: Collect per-layer parameter memory once and put
            it on ``model_queue``; also stores the layer filter on the model
            as ``_traceml_include_names`` / ``_traceml_exclude_names`` /
            ``_traceml_leaf_only``.
        trace_layer_forward_memory: Attach forward memory hooks.
        trace_layer_backward_memory: Attach backward memory hooks.
        trace_layer_forward_time: Attach forward timing hooks.
        trace_layer_backward_time: Attach backward timing hooks.
        trace_execution: Attach execution entry hooks.
        include_names: Layer filter forwarded to the layer hook attachers.
        exclude_names: Layer filter forwarded to the layer hook attachers.
        leaf_only: Layer filter forwarded to the layer hook attachers.
    """
    if _traceml_disabled():
        return
    if _traceml_profile() != "deep":
        return

    # Every per-layer attacher receives the same filter arguments.
    layer_filter = {
        "include_names": include_names,
        "exclude_names": exclude_names,
        "leaf_only": leaf_only,
    }

    try:
        if not isinstance(model, nn.Module):
            raise TypeError("trace_model_instance expects an nn.Module.")

        if sample_layer_memory:
            model._traceml_include_names = include_names
            model._traceml_exclude_names = exclude_names
            model._traceml_leaf_only = leaf_only
            model_queue.put(collect_layer_parameter_memory(model))

        # (enabled?, attacher) pairs, applied in this fixed order.
        hook_plan = (
            (trace_layer_forward_memory, attach_layer_forward_memory_hooks),
            (trace_layer_backward_memory, attach_layer_backward_memory_hooks),
            (trace_layer_forward_time, attach_layer_forward_time_hooks),
            (trace_layer_backward_time, attach_layer_backward_time_hooks),
        )
        for enabled, attach in hook_plan:
            if enabled:
                attach(model, **layer_filter)

        if trace_execution:
            attach_execution_entry_hooks(model)

    except Exception as exc:
        _log_instrumentation_error("Failed to trace model instance", exc)

Hugging Face integration¶

traceml.integrations.huggingface.TraceMLTrainer ¶

TraceMLTrainer(*args, traceml_enabled: bool = True, traceml_kwargs: Optional[Dict[str, Any]] = None, **kwargs)

Bases: Trainer if HAS_TRANSFORMERS else object

A subclass of Hugging Face's Trainer that automatically integrates TraceML.

This class wraps the training_step with the trace_step context manager to capture step-level metrics (timing, memory, etc.).

Source code in src/traceml/integrations/huggingface.py

def __init__(
    self,
    *args,
    traceml_enabled: bool = True,
    traceml_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
):
    """
    Construct the trainer and remember the TraceML settings.

    Args:
        *args: Positional arguments forwarded to the parent constructor.
        traceml_enabled: Stored as ``self.traceml_enabled``.
        traceml_kwargs: Stored as ``self.traceml_kwargs``; may be ``None``.
        **kwargs: Keyword arguments forwarded to the parent constructor.

    Raises:
        ImportError: If ``HAS_TRANSFORMERS`` is false.
    """
    if not HAS_TRANSFORMERS:
        install_hint = (
            "TraceMLTrainer requires 'transformers' to be installed. "
            "Please run `pip install transformers`."
        )
        raise ImportError(install_hint)

    super().__init__(*args, **kwargs)

    # No hooks are attached here: the flag starts out False and the
    # model-level tracing options are only stored for later use.
    self._traceml_hooks_attached = False
    self.traceml_kwargs = traceml_kwargs
    self.traceml_enabled = traceml_enabled

training_step ¶

training_step(model, inputs, *args, **kwargs) -> Any

Overridden training step to include TraceML instrumentation.

Source code in src/traceml/integrations/huggingface.py

def training_step(self, model, inputs, *args, **kwargs) -> Any:
    """
    Overridden training step to include TraceML instrumentation.

    When tracing is bypassed (``TRACEML_DISABLED`` is truthy or
    ``self.traceml_enabled`` is false) this delegates straight to the parent
    ``training_step``. Otherwise the parent step runs inside
    ``trace_step(model)``. If ``self.traceml_kwargs`` is set, model-level
    hooks are attached first, on the first step and again whenever a
    different model object is passed in.

    Args:
        model: The model passed in by the training loop.
        inputs: The batch passed in by the training loop.
        *args: Extra positional arguments forwarded to the parent method.
        **kwargs: Extra keyword arguments forwarded to the parent method.

    Returns:
        Whatever the parent ``training_step`` returns.
    """
    # BYPASS LOGIC:
    # If the user launched the script with `--disable-traceml` (setting TRACEML_DISABLED="1")
    # or if `traceml_enabled` is explicitly False, we short-circuit immediately.
    # This completely skips any hook attachments, memory tracking, and timing regions
    if TRACEML_DISABLED or not self.traceml_enabled:
        return super().training_step(model, inputs, *args, **kwargs)

    # From here on tracing is known to be enabled, so no further check of
    # `self.traceml_enabled` is needed.

    # Lazily attach hooks on the first step to ensure we catch the
    # final wrapped/moved model (e.g. DDP, Accelerator). A change in the
    # model object's identity triggers a re-attach.
    needs_attach = self.traceml_kwargs is not None and (
        not self._traceml_hooks_attached
        or id(model) != getattr(self, "_attached_model_id", None)
    )
    if needs_attach:
        try:
            trace_model_instance(model, **self.traceml_kwargs)
            self._attached_model_id = id(model)
            self._traceml_hooks_attached = True
            logger.info(
                "[TraceML] Deep-Dive model tracing initialized (lazy)."
            )
        except Exception as e:
            # Hook attachment is best-effort; the step itself still runs.
            logger.error(
                f"[TraceML] Failed to initialize model tracing: {e}"
            )

    with trace_step(model):
        return super().training_step(model, inputs, *args, **kwargs)

PyTorch Lightning integration¶

traceml.integrations.lightning.TraceMLCallback ¶

TraceMLCallback()

Bases: Callback

Official TraceML Callback for PyTorch Lightning.

Captures full step time (forward + backward + optimizer) as well as individual phase timings. Safely handles gradient accumulation by treating each micro-batch as a step, providing 0-duration optimizer events on accumulating steps to preserve dashboard step alignment.

Source code in src/traceml/integrations/lightning.py

def __init__(self):
    """
    Initialise the callback with empty per-step state.

    Every context handle and the memory tracker start as ``None``; the
    optimizer-step flag starts as ``False``.
    """
    super().__init__()

    # Optimizer-step flag and memory tracker slot.
    self._opt_step_occurred = False
    self._mem_tracker = None

    # Slots for the step / forward / backward / optimizer contexts.
    # NOTE(review): presumably filled in by the callback hooks — confirm.
    self._optimizer_ctx = None
    self._backward_ctx = None
    self._forward_ctx = None
    self._traceml_step_ctx = None

CLI¶

TraceML ships with a CLI entry point installed as traceml.

traceml watch <script>    # run script with live terminal dashboard
traceml run <script>      # run script with minimal instrumentation
traceml deep <script>     # run with full instrumentation (step + memory + layer)

See traceml --help for the full set of options.