Self-Attention: The Core of Transformers

hard · attention, transformers, self-attention


Implement scaled dot-product self-attention using scalar Value operations and list-based vectors.

What you are building

1) softmax(scores: List[Value]) -> List[Value]

  • Subtract max for numerical stability
  • Return probabilities that sum to 1
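
A minimal sketch, assuming a micrograd-style Value that supports arithmetic with plain floats, exposes a .data attribute, and provides an exp() method (add one if yours does not):

def softmax(scores):
    # Shift by the max raw score (a plain float) for numerical stability;
    # subtracting a constant changes neither the result nor the gradients.
    m = max(s.data for s in scores)
    exps = [(s - m).exp() for s in scores]
    total = exps[0]
    for e in exps[1:]:
        total = total + e
    return [e / total for e in exps]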

2) attention_scores(Q, K, d_k) -> List[List[Value]]

  • Q, K: lists of length T, each vector length d_k
  • scores[i][j] = dot(Q[i], K[j]) / sqrt(d_k)
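
One possible implementation; it assumes Python's built-in sum works on Value objects (i.e. Value defines __radd__ so the integer 0 start value is absorbed), as the dot-product hint below suggests:

import math

def attention_scores(Q, K, d_k):
    scale = 1.0 / math.sqrt(d_k)          # the "scaled" in scaled dot-product
    scores = []
    for q in Q:                           # row i: query position
        row = []
        for k in K:                       # column j: key position
            dot = sum(qi * ki for qi, ki in zip(q, k))
            row.append(dot * scale)
        scores.append(row)
    return scores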

3) causal_mask(seq_len: int) -> List[List[float]]

  • mask[i][j] = 0.0 if j <= i, else -1e9
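
The mask is plain floats, so no Value machinery is needed; a straightforward version:

def causal_mask(seq_len):
    # 0.0 where position j is visible from position i (j <= i),
    # a large negative number where j lies in the future (j > i).
    return [[0.0 if j <= i else -1e9 for j in range(seq_len)]
            for i in range(seq_len)]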

4) apply_attention(weights, V) -> List[List[Value]]

  • weights: (T, T) probabilities
  • V: (T, d_v) values
  • Output: (T, d_v) weighted sums
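
A sketch of the weighted sum, again assuming sum() works on Values:

def apply_attention(weights, V):
    # Output row i, dimension d is the sum over j of weights[i][j] * V[j][d].
    T, d_v = len(V), len(V[0])
    return [[sum(weights[i][j] * V[j][d] for j in range(T))
             for d in range(d_v)]
            for i in range(T)]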

5) SelfAttention(Module)

class SelfAttention(Module):
    def __init__(self, embed_dim: int):
        # W_q, W_k, W_v, W_o: four separate Linear(embed_dim, embed_dim) projections
        pass

    def forward(self, x: List[List[Value]], causal: bool = True) -> List[List[Value]]:
        # x: (T, embed_dim)
        # returns: (T, embed_dim)
        pass
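
One way the skeleton might be filled in, wiring the helpers above together. This sketch assumes Module provides a __call__ that dispatches to forward (so the usage example below works) and that Linear(in_features, out_features) is callable on a list of Values and returns a list of Values; adjust to your framework's actual API.

class SelfAttention(Module):
    def __init__(self, embed_dim: int):
        self.embed_dim = embed_dim
        self.W_q = Linear(embed_dim, embed_dim)
        self.W_k = Linear(embed_dim, embed_dim)
        self.W_v = Linear(embed_dim, embed_dim)
        self.W_o = Linear(embed_dim, embed_dim)

    def forward(self, x, causal=True):
        T = len(x)
        Q = [self.W_q(t) for t in x]                # (T, embed_dim)
        K = [self.W_k(t) for t in x]
        V = [self.W_v(t) for t in x]
        scores = attention_scores(Q, K, self.embed_dim)
        if causal:
            mask = causal_mask(T)
            # Adding float mask entries works because Value.__add__ wraps plain numbers.
            scores = [[scores[i][j] + mask[i][j] for j in range(T)]
                      for i in range(T)]
        weights = [softmax(row) for row in scores]  # row-wise softmax over (T, T)
        attended = apply_attention(weights, V)      # (T, embed_dim)
        return [self.W_o(row) for row in attended]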

Notes

  • Use Value ops for all computations so gradients flow.
  • The causal mask can be added as a float; Value.__add__ handles it.
  • Softmax should be computed row-wise on the (T, T) score matrix.

Example

attn = SelfAttention(embed_dim=4)
x = [[Value(0.1) for _ in range(4)] for _ in range(3)]
out = attn(x, causal=True)
assert len(out) == 3 and len(out[0]) == 4

Hints

  • Use sum(w * v for w, v in zip(vec1, vec2)) for dot products.
  • Add the causal mask to the scores before softmax; the large negative entries drive the probability mass on future positions to effectively zero after softmax.