Bigram Language Model

medium · language-models, softmax, training

Build a trainable bigram language model. This is the smallest end-to-end LM: it predicts the next token using only the current token.

Model

A bigram LM stores logits for each token pair:

  • Parameters: W of shape (V, V)
  • Given token t, logits are W[t]

You can implement this with a single embedding table where embedding_dim = vocab_size.
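
For intuition, here is a tiny illustration with plain floats and a made-up vocabulary of V = 3 tokens; the numbers are arbitrary and only show how a row lookup produces next-token logits.

# Hypothetical 3x3 logit table (V = 3); values are arbitrary.
W = [
    [ 0.2, -1.0,  0.5],   # logits for the token after token 0
    [ 1.3,  0.0, -0.7],   # logits for the token after token 1
    [-0.4,  0.9,  0.1],   # logits for the token after token 2
]
t = 1
logits = W[t]             # [1.3, 0.0, -0.7]: unnormalized scores for the next token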

Tasks

1) softmax(scores)

  • Accepts a list of Value objects and returns a list of probabilities (also Value objects)
  • Must be numerically stable (subtract max)
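
A minimal sketch, assuming a micrograd-style Value that supports arithmetic with plain floats, right-addition (so sum() works), and provides .data and .exp():

def softmax(scores):
    # Subtract the max of the raw data (a constant float) before exponentiating;
    # this does not change the result but prevents overflow in exp().
    m = max(s.data for s in scores)
    exps = [(s - m).exp() for s in scores]
    total = sum(exps)
    return [e / total for e in exps]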

2) cross_entropy_loss(logits, targets)

  • logits: a list of length T, where each entry is a list of V Value logits
  • targets: a list of T integer token IDs
  • Return the mean negative log-probability of the correct targets
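
One possible sketch, assuming the Value class also provides a .log() method:

def cross_entropy_loss(logits, targets):
    losses = []
    for row, t in zip(logits, targets):
        probs = softmax(row)               # distribution over the vocabulary
        losses.append(-probs[t].log())     # negative log-likelihood of the target
    return sum(losses) / len(losses)       # mean over the T positions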

3) BigramLM(Module)

class BigramLM(Module):
    def __init__(self, vocab_size: int):
        pass

    def forward(self, token_ids: List[int]) -> List[List[Value]]:
        pass
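
A rough reference shape for the class, assuming the micrograd-style Value/Module API (parameters() returned as a flat list) and small random initialization; treat it as a sketch, not the required implementation.

import random
from typing import List

class BigramLM(Module):
    def __init__(self, vocab_size: int):
        self.vocab_size = vocab_size
        # W[i][j] is the logit for "token j follows token i".
        self.W = [[Value(random.uniform(-0.1, 0.1)) for _ in range(vocab_size)]
                  for _ in range(vocab_size)]

    def parameters(self):
        # Flatten the V x V table into a single list of trainable Values.
        return [p for row in self.W for p in row]

    def forward(self, token_ids: List[int]) -> List[List[Value]]:
        # The logits at each position are just the row selected by that token.
        return [self.W[t] for t in token_ids]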

4) train_step(model, x, y, lr)

  • Forward -> loss -> backward -> SGD update
  • Return loss.data
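
A sketch of the step, assuming Module exposes zero_grad() and parameters() and that each Value carries .data and .grad:

def train_step(model, x, y, lr):
    logits = model.forward(x)              # forward pass over the sequence
    loss = cross_entropy_loss(logits, y)
    model.zero_grad()                      # clear stale gradients
    loss.backward()                        # backprop through softmax and the row lookup
    for p in model.parameters():
        p.data -= lr * p.grad              # plain SGD update
    return loss.data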

5) generate(model, start_ids, max_new_tokens, temperature=1.0)

  • Autoregressively sample next tokens
  • Use the logits from the last position; divide them by temperature before applying softmax
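
One way to write it, assuming the functions sketched above; random.choices does the categorical sampling:

import random

def generate(model, start_ids, max_new_tokens, temperature=1.0):
    ids = list(start_ids)                      # keep the prompt in the output
    for _ in range(max_new_tokens):
        logits = model.forward(ids)[-1]        # logits at the last position
        scaled = [l / temperature for l in logits]
        probs = [p.data for p in softmax(scaled)]
        next_id = random.choices(range(len(probs)), weights=probs)[0]
        ids.append(next_id)
    return ids

Since a bigram model conditions only on the current token, forwarding just [ids[-1]] each step would be equivalent and cheaper.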

Notes

  • Use Value operations so gradients flow to parameters.
  • generate should return a list of token IDs including the prompt.