new examples folder

Kye Gomez 2026-04-22 13:00:57 -04:00
parent 963e11277d
commit 227dbb1532
5 changed files with 109 additions and 80 deletions

examples/moda_example.py Normal file

@@ -0,0 +1,73 @@
import torch
from open_mythos.moda import MoDAConfig, MoDAModel
# ---------------------------------------------------------------------------
# Smoke test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    torch.manual_seed(42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Tiny config: 4 layers, 8 routed experts, top-2
    cfg = MoDAConfig(
        vocab_size=512,
        d_model=128,
        n_layers=4,
        n_heads_q=4,
        n_heads_kv=2,
        head_dim=32,
        max_seq_len=64,
        # MoE: 2 shared + 8 routed, activate top-2
        # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256
        n_shared_experts=2,
        n_routed_experts=8,
        n_activated_experts=2,
        expert_hidden_dim=64,
        moe_balance_alpha=0.01,
        moe_score_func="softmax",
    )
    model = MoDAModel(cfg).to(device)
    print(f"Parameters: {model.num_parameters():,}")
    print(model)

    B, T = 2, 32
    input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device)
    labels = torch.randint(0, cfg.vocab_size, (B, T), device=device)
    logits, loss = model(input_ids, labels)
    assert logits.shape == (B, T, cfg.vocab_size)
    print(f"Logits shape : {logits.shape}")
    print(f"Loss (LM + balance): {loss.item():.4f}")
    loss.backward()

    # Verify gradients
    last_writes = {
        f"blocks.{cfg.n_layers - 1}.k_write.weight",
        f"blocks.{cfg.n_layers - 1}.v_write.weight",
    }
    missing = [
        name
        for name, p in model.named_parameters()
        if p.grad is None and name not in last_writes
    ]
    if missing:
        print(f"WARNING — unexpected missing gradients: {missing}")
    else:
        print("All parameters received gradients (excluding last-block writes).")

    # Spot-check: MoE gate weights must receive gradients (through balance loss P_i)
    gate0_grad = model.blocks[0].moe.gate.weight.grad
    assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!"
    print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}")

    # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads
    k0_grad = model.blocks[0].k_write.weight.grad
    assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!"
    print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}")

    print("Smoke test passed.")


@@ -1061,74 +1061,3 @@ class MoDAModel(nn.Module):
f"(top-{c.n_activated_experts}), " f"(top-{c.n_activated_experts}), "
f"params={self.num_parameters():,}" f"params={self.num_parameters():,}"
) )
# # ---------------------------------------------------------------------------
# # Smoke test
# # ---------------------------------------------------------------------------
# if __name__ == "__main__":
# torch.manual_seed(42)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Device: {device}")
# # Tiny config: 4 layers, 8 routed experts, top-2
# cfg = MoDAConfig(
# vocab_size=512,
# d_model=128,
# n_layers=4,
# n_heads_q=4,
# n_heads_kv=2,
# head_dim=32,
# max_seq_len=64,
# # MoE: 2 shared + 8 routed, activate top-2
# # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256
# n_shared_experts=2,
# n_routed_experts=8,
# n_activated_experts=2,
# expert_hidden_dim=64,
# moe_balance_alpha=0.01,
# moe_score_func="softmax",
# )
# model = MoDAModel(cfg).to(device)
# print(f"Parameters: {model.num_parameters():,}")
# print(model)
# B, T = 2, 32
# input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device)
# labels = torch.randint(0, cfg.vocab_size, (B, T), device=device)
# logits, loss = model(input_ids, labels)
# assert logits.shape == (B, T, cfg.vocab_size)
# print(f"Logits shape : {logits.shape}")
# print(f"Loss (LM + balance): {loss.item():.4f}")
# loss.backward()
# # Verify gradients
# last_writes = {
# f"blocks.{cfg.n_layers - 1}.k_write.weight",
# f"blocks.{cfg.n_layers - 1}.v_write.weight",
# }
# missing = [
# name
# for name, p in model.named_parameters()
# if p.grad is None and name not in last_writes
# ]
# if missing:
# print(f"WARNING — unexpected missing gradients: {missing}")
# else:
# print("All parameters received gradients (excluding last-block writes).")
# # Spot-check: MoE gate weights must receive gradients (through balance loss P_i)
# gate0_grad = model.blocks[0].moe.gate.weight.grad
# assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!"
# print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}")
# # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads
# k0_grad = model.blocks[0].k_write.weight.grad
# assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!"
# print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}")
# print("Smoke test passed.")


@@ -1,16 +1,43 @@
 #!/usr/bin/env python3
 """
-Side-by-side training of OpenMythos vs. a vanilla GQA transformer on a small
-HuggingFace dataset (wikitext-2 by default).
-Both models share the same tiny config and see the exact same batches in the
-same order, so per-step loss + throughput are directly comparable. The baseline
-is a dense GQA + SwiGLU stack whose unique-layer depth matches the recurrent
-block's unique-parameter depth (prelude + 1 + coda), so parameter counts land
-in the same ballpark.
-
-    python training/small_benchmark.py
-    python training/small_benchmark.py --steps 500 --device cuda
+Side-by-side training + benchmark of OpenMythos vs. a vanilla transformer on a
+small HuggingFace dataset (TinyStories by default, streamed).
+Both models share the same tiny MLA config and see the exact same batches in
+the same order, so per-step train loss and throughput are directly comparable.
+The baseline is a dense stack of the same TransformerBlock primitive with
+`use_moe=False`; its unique-layer depth matches the recurrent block's
+unique-parameter depth (prelude + 1 + coda), so total parameter counts land in
+the same ballpark. The attention kernel is shared (MLA in both models), so any
+measured delta reflects the looped recurrent-depth architecture rather than
+kernel differences.
+
+What the script measures
+------------------------
+1. Per-step training loss + tokens/sec for both models, fed identical batches.
+2. Periodic held-out eval loss on a separate dataset split (--eval-every).
+3. Depth-extrapolation sweep at the end: OpenMythos is trained at
+   cfg.max_loop_iters, then evaluated at n_loops in --depth-sweep
+   (default 1,2,4,8,16). This is the experiment the recurrent-depth
+   architecture is designed to win: eval loss should keep dropping past
+   the trained depth if depth extrapolation is working.
+4. Summary table with initial/final/avg train loss, wall-clock, avg tok/s,
+   and sec/step for both models.
+
+Defaults are tuned for a laptop CPU run in reasonable time; pass --device cuda
+and bump --steps / --batch-size / --seq-len for a real comparison.
+
+    # Default CPU smoke run (TinyStories, 1k steps, batch 32, seq 256)
+    python tests/small_benchmark.py
+
+    # Heavier GPU run
+    python tests/small_benchmark.py --steps 5000 --batch-size 64 --seq-len 512 --device cuda
+
+    # Wikitext instead of TinyStories
+    python tests/small_benchmark.py --dataset wikitext --dataset-config wikitext-2-raw-v1
+
+    # Aggressive depth extrapolation sweep
+    python tests/small_benchmark.py --depth-sweep 1,2,4,8,16,32
 """
 from __future__ import annotations
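
For reference, the depth-extrapolation sweep described in point 3 of the new docstring reduces to re-running eval at several loop counts with one set of trained weights. A minimal sketch of that loop, assuming the looped model's forward accepts an n_loops override and that eval_batches yields (input_ids, labels) pairs from the held-out split (both names are hypothetical, for illustration only):

import torch

@torch.no_grad()
def depth_sweep(model, eval_batches, depths=(1, 2, 4, 8, 16)):
    """Evaluate the same trained weights at several recurrent depths."""
    model.eval()
    results = {}
    for n_loops in depths:
        total, count = 0.0, 0
        for input_ids, labels in eval_batches:
            # Assumption: forward accepts an inference-time n_loops override.
            _, loss = model(input_ids, labels, n_loops=n_loops)
            total += loss.item()
            count += 1
        results[n_loops] = total / count
    return results

If depth extrapolation is working as the docstring claims, the returned eval losses should keep improving at depths beyond the trained cfg.max_loop_iters.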