new examples folder

Kye Gomez 2026-04-22 13:00:57 -04:00
parent 963e11277d
commit 227dbb1532
5 changed files with 109 additions and 80 deletions

examples/moda_example.py Normal file

@@ -0,0 +1,73 @@
import torch
from open_mythos.moda import MoDAConfig, MoDAModel
# ---------------------------------------------------------------------------
# Smoke test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    torch.manual_seed(42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Tiny config: 4 layers, 8 routed experts, top-2
    cfg = MoDAConfig(
        vocab_size=512,
        d_model=128,
        n_layers=4,
        n_heads_q=4,
        n_heads_kv=2,
        head_dim=32,
        max_seq_len=64,
        # MoE: 2 shared + 8 routed, activate top-2
        # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256
        n_shared_experts=2,
        n_routed_experts=8,
        n_activated_experts=2,
        expert_hidden_dim=64,
        moe_balance_alpha=0.01,
        moe_score_func="softmax",
    )
    model = MoDAModel(cfg).to(device)
    print(f"Parameters: {model.num_parameters():,}")
    print(model)

    B, T = 2, 32
    input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device)
    labels = torch.randint(0, cfg.vocab_size, (B, T), device=device)
    logits, loss = model(input_ids, labels)
    assert logits.shape == (B, T, cfg.vocab_size)
    print(f"Logits shape : {logits.shape}")
    print(f"Loss (LM + balance): {loss.item():.4f}")
    loss.backward()

    # Verify gradients
    last_writes = {
        f"blocks.{cfg.n_layers - 1}.k_write.weight",
        f"blocks.{cfg.n_layers - 1}.v_write.weight",
    }
    missing = [
        name
        for name, p in model.named_parameters()
        if p.grad is None and name not in last_writes
    ]
    if missing:
        print(f"WARNING — unexpected missing gradients: {missing}")
    else:
        print("All parameters received gradients (excluding last-block writes).")

    # Spot-check: MoE gate weights must receive gradients (through balance loss P_i)
    gate0_grad = model.blocks[0].moe.gate.weight.grad
    assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!"
    print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}")

    # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads
    k0_grad = model.blocks[0].k_write.weight.grad
    assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!"
    print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}")

    print("Smoke test passed.")


@@ -1061,74 +1061,3 @@ class MoDAModel(nn.Module):
f"(top-{c.n_activated_experts}), " f"(top-{c.n_activated_experts}), "
f"params={self.num_parameters():,}" f"params={self.num_parameters():,}"
) )
# # ---------------------------------------------------------------------------
# # Smoke test
# # ---------------------------------------------------------------------------
# if __name__ == "__main__":
# torch.manual_seed(42)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Device: {device}")
# # Tiny config: 4 layers, 8 routed experts, top-2
# cfg = MoDAConfig(
# vocab_size=512,
# d_model=128,
# n_layers=4,
# n_heads_q=4,
# n_heads_kv=2,
# head_dim=32,
# max_seq_len=64,
# # MoE: 2 shared + 8 routed, activate top-2
# # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256
# n_shared_experts=2,
# n_routed_experts=8,
# n_activated_experts=2,
# expert_hidden_dim=64,
# moe_balance_alpha=0.01,
# moe_score_func="softmax",
# )
# model = MoDAModel(cfg).to(device)
# print(f"Parameters: {model.num_parameters():,}")
# print(model)
# B, T = 2, 32
# input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device)
# labels = torch.randint(0, cfg.vocab_size, (B, T), device=device)
# logits, loss = model(input_ids, labels)
# assert logits.shape == (B, T, cfg.vocab_size)
# print(f"Logits shape : {logits.shape}")
# print(f"Loss (LM + balance): {loss.item():.4f}")
# loss.backward()
# # Verify gradients
# last_writes = {
# f"blocks.{cfg.n_layers - 1}.k_write.weight",
# f"blocks.{cfg.n_layers - 1}.v_write.weight",
# }
# missing = [
# name
# for name, p in model.named_parameters()
# if p.grad is None and name not in last_writes
# ]
# if missing:
# print(f"WARNING — unexpected missing gradients: {missing}")
# else:
# print("All parameters received gradients (excluding last-block writes).")
# # Spot-check: MoE gate weights must receive gradients (through balance loss P_i)
# gate0_grad = model.blocks[0].moe.gate.weight.grad
# assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!"
# print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}")
# # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads
# k0_grad = model.blocks[0].k_write.weight.grad
# assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!"
# print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}")
# print("Smoke test passed.")


@@ -1,16 +1,43 @@
 #!/usr/bin/env python3
 """
-Side-by-side training of OpenMythos vs. a vanilla GQA transformer on a small
-HuggingFace dataset (wikitext-2 by default).
-Both models share the same tiny config and see the exact same batches in the
-same order, so per-step loss + throughput are directly comparable. The baseline
-is a dense GQA + SwiGLU stack whose unique-layer depth matches the recurrent
-block's unique-parameter depth (prelude + 1 + coda), so parameter counts land
-in the same ballpark.
-
-    python training/small_benchmark.py
-    python training/small_benchmark.py --steps 500 --device cuda
+Side-by-side training + benchmark of OpenMythos vs. a vanilla transformer on a
+small HuggingFace dataset (TinyStories by default, streamed).
+Both models share the same tiny MLA config and see the exact same batches in
+the same order, so per-step train loss and throughput are directly comparable.
+The baseline is a dense stack of the same TransformerBlock primitive with
+`use_moe=False`; its unique-layer depth matches the recurrent block's
+unique-parameter depth (prelude + 1 + coda), so total parameter counts land in
+the same ballpark. The attention kernel is shared (MLA in both models), so any
+measured delta reflects the looped recurrent-depth architecture rather than
+kernel differences.
+
+What the script measures
+------------------------
+1. Per-step training loss + tokens/sec for both models, fed identical batches.
+2. Periodic held-out eval loss on a separate dataset split (--eval-every).
+3. Depth-extrapolation sweep at the end: OpenMythos is trained at
+   cfg.max_loop_iters, then evaluated at n_loops in --depth-sweep
+   (default 1,2,4,8,16). This is the experiment the recurrent-depth
+   architecture is designed to win: eval loss should keep dropping past
+   the trained depth if depth extrapolation is working.
+4. Summary table with initial/final/avg train loss, wall-clock, avg tok/s,
+   and sec/step for both models.
+
+Defaults are tuned for a laptop CPU run in reasonable time; pass --device cuda
+and bump --steps / --batch-size / --seq-len for a real comparison.
+
+    # Default CPU smoke run (TinyStories, 1k steps, batch 32, seq 256)
+    python tests/small_benchmark.py
+
+    # Heavier GPU run
+    python tests/small_benchmark.py --steps 5000 --batch-size 64 --seq-len 512 --device cuda
+
+    # Wikitext instead of TinyStories
+    python tests/small_benchmark.py --dataset wikitext --dataset-config wikitext-2-raw-v1
+
+    # Aggressive depth extrapolation sweep
+    python tests/small_benchmark.py --depth-sweep 1,2,4,8,16,32
 """
 from __future__ import annotations
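
For reference, the depth-extrapolation sweep described in point 3 of the new docstring reduces to re-running eval at several loop counts with one set of trained weights. A minimal sketch of that loop, assuming the looped model's forward accepts an n_loops override and that eval_batches yields (input_ids, labels) pairs from the held-out split (both names are hypothetical, for illustration only):

import torch

@torch.no_grad()
def depth_sweep(model, eval_batches, depths=(1, 2, 4, 8, 16)):
    """Evaluate the same trained weights at several recurrent depths."""
    model.eval()
    results = {}
    for n_loops in depths:
        total, count = 0.0, 0
        for input_ids, labels in eval_batches:
            # Assumption: forward accepts an inference-time n_loops override.
            _, loss = model(input_ids, labels, n_loops=n_loops)
            total += loss.item()
            count += 1
        results[n_loops] = total / count
    return results

If depth extrapolation is working as the docstring claims, the returned eval losses should keep improving at depths beyond the trained cfg.max_loop_iters.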