Autograd Backward: Reverse-Mode Gradients

medium · autograd, backpropagation, topological-sort

Extend the Value class to compute gradients via reverse-mode autodiff.

What you are building

You will implement:

  • Value.backward() with reverse topological traversal
  • Activation ops: tanh, relu, exp, log

Methods to implement

1) Value.backward(self)

Compute gradients for all nodes reachable from self.

Requirements:

  • Build a topological ordering of nodes via DFS
  • Set self.grad = 1.0 (dL/dL)
  • Traverse nodes in reverse topological order and call node._backward()
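
Putting these requirements together, here is a minimal sketch of backward(), assuming each Value keeps its inputs in a _prev collection and carries a per-node _backward closure (micrograd-style names; adapt to your own attributes):

def backward(self):
    # 1. Build a topological ordering of all reachable nodes via DFS.
    topo, visited = [], set()

    def build(v):
        if v not in visited:
            visited.add(v)
            for child in v._prev:   # _prev: the node's inputs (assumed attribute)
                build(child)
            topo.append(v)

    build(self)

    # 2. Seed the output gradient: dL/dL = 1.
    self.grad = 1.0

    # 3. Walk the graph in reverse topological order, propagating gradients.
    for node in reversed(topo):
        node._backward()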

2) Value.tanh(self)

  • tanh(x) = (e^(2x) - 1) / (e^(2x) + 1)
  • Derivative: 1 - tanh(x)^2
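
A sketch of tanh as a Value method. It uses math.tanh, which is numerically equivalent to the formula above, and assumes a Value(data, children) constructor plus a .data attribute (both assumptions; match your own signatures):

import math  # module-level import

def tanh(self):
    t = math.tanh(self.data)    # forward value
    out = Value(t, (self,))     # assumed constructor: (data, children)

    def _backward():
        # d/dx tanh(x) = 1 - tanh(x)^2; chain rule multiplies by out.grad
        self.grad += (1.0 - t * t) * out.grad

    out._backward = _backward
    return out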

3) Value.relu(self)

  • relu(x) = max(0, x)
  • Derivative: 1 if x > 0, else 0
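
A sketch of relu under the same assumed constructor:

def relu(self):
    out = Value(self.data if self.data > 0 else 0.0, (self,))

    def _backward():
        # Gradient flows through only where the input was positive.
        self.grad += (1.0 if self.data > 0 else 0.0) * out.grad

    out._backward = _backward
    return out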

4) Value.exp(self)

  • exp(x) = e^x
  • Derivative: exp(x)
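
A sketch of exp; note that the derivative e^x is exactly the forward result, so out.data can be reused:

import math

def exp(self):
    out = Value(math.exp(self.data), (self,))

    def _backward():
        # d/dx e^x = e^x, which is the forward value out.data
        self.grad += out.data * out.grad

    out._backward = _backward
    return out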

5) Value.log(self)

  • log(x) = ln(x)
  • Derivative: 1/x
  • Raise ValueError if x <= 0
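
A sketch of log with the domain check up front, again under the assumed constructor:

import math

def log(self):
    if self.data <= 0:
        raise ValueError("log requires a positive input")
    out = Value(math.log(self.data), (self,))

    def _backward():
        # d/dx ln(x) = 1/x
        self.grad += (1.0 / self.data) * out.grad

    out._backward = _backward
    return out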

Example

a = Value(2.0)
b = Value(-3.0)
c = Value(10.0)

f = (a * b + c) ** 2
f.backward()

assert a.grad == -24.0
assert b.grad == 16.0
assert c.grad == 8.0

Hints

  • Use += for gradient accumulation; a node that feeds several downstream nodes receives a contribution from each path.
  • Gradients accumulate across calls, so zero them manually before running backward() repeatedly.
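
For example (illustrative; assumes addition on Value is already implemented):

a = Value(3.0)
b = a + a              # a reaches b through two paths
b.backward()
assert a.grad == 2.0   # with +=, both paths contribute 1.0 each

# Reset before running backward again, otherwise grads keep accumulating
for v in (a, b):
    v.grad = 0.0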