From 227dbb153266cd154f3d335f18c6b80947879f14 Mon Sep 17 00:00:00 2001
From: Kye Gomez
Date: Wed, 22 Apr 2026 13:00:57 -0400
Subject: [PATCH] new examples folder

---
 examples/moda_example.py                | 73 +++++++++++++++++++++++++
 {tests => examples}/variants_example.py |  0
 open_mythos/moda.py                     | 71 ------------------------
 tests/small_benchmark.py                | 45 ++++++++++++---
 test_main.py => tests/test_main.py      |  0
 5 files changed, 109 insertions(+), 80 deletions(-)
 create mode 100644 examples/moda_example.py
 rename {tests => examples}/variants_example.py (100%)
 rename test_main.py => tests/test_main.py (100%)

diff --git a/examples/moda_example.py b/examples/moda_example.py
new file mode 100644
index 0000000..bffc92e
--- /dev/null
+++ b/examples/moda_example.py
@@ -0,0 +1,73 @@
+import torch
+from open_mythos.moda import MoDAConfig, MoDAModel
+
+
+# ---------------------------------------------------------------------------
+# Smoke test
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    torch.manual_seed(42)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Device: {device}")
+
+    # Tiny config: 4 layers, 8 routed experts, top-2
+    cfg = MoDAConfig(
+        vocab_size=512,
+        d_model=128,
+        n_layers=4,
+        n_heads_q=4,
+        n_heads_kv=2,
+        head_dim=32,
+        max_seq_len=64,
+        # MoE: 2 shared + 8 routed, activate top-2
+        # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256
+        n_shared_experts=2,
+        n_routed_experts=8,
+        n_activated_experts=2,
+        expert_hidden_dim=64,
+        moe_balance_alpha=0.01,
+        moe_score_func="softmax",
+    )
+
+    model = MoDAModel(cfg).to(device)
+    print(f"Parameters: {model.num_parameters():,}")
+    print(model)
+
+    B, T = 2, 32
+    input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device)
+    labels = torch.randint(0, cfg.vocab_size, (B, T), device=device)
+
+    logits, loss = model(input_ids, labels)
+    assert logits.shape == (B, T, cfg.vocab_size)
+    print(f"Logits shape : {logits.shape}")
+    print(f"Loss (LM + balance): {loss.item():.4f}")
+
+    loss.backward()
+
+    # Verify gradients
+    last_writes = {
+        f"blocks.{cfg.n_layers - 1}.k_write.weight",
+        f"blocks.{cfg.n_layers - 1}.v_write.weight",
+    }
+    missing = [
+        name
+        for name, p in model.named_parameters()
+        if p.grad is None and name not in last_writes
+    ]
+    if missing:
+        print(f"WARNING — unexpected missing gradients: {missing}")
+    else:
+        print("All parameters received gradients (excluding last-block writes).")
+
+    # Spot-check: MoE gate weights must receive gradients (through balance loss P_i)
+    gate0_grad = model.blocks[0].moe.gate.weight.grad
+    assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!"
+    print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}")
+
+    # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads
+    k0_grad = model.blocks[0].k_write.weight.grad
+    assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!"
+ print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}") + + print("Smoke test passed.") diff --git a/tests/variants_example.py b/examples/variants_example.py similarity index 100% rename from tests/variants_example.py rename to examples/variants_example.py diff --git a/open_mythos/moda.py b/open_mythos/moda.py index e662d61..94f6af5 100644 --- a/open_mythos/moda.py +++ b/open_mythos/moda.py @@ -1061,74 +1061,3 @@ class MoDAModel(nn.Module): f"(top-{c.n_activated_experts}), " f"params={self.num_parameters():,}" ) - - -# # --------------------------------------------------------------------------- -# # Smoke test -# # --------------------------------------------------------------------------- - -# if __name__ == "__main__": -# torch.manual_seed(42) -# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -# print(f"Device: {device}") - -# # Tiny config: 4 layers, 8 routed experts, top-2 -# cfg = MoDAConfig( -# vocab_size=512, -# d_model=128, -# n_layers=4, -# n_heads_q=4, -# n_heads_kv=2, -# head_dim=32, -# max_seq_len=64, -# # MoE: 2 shared + 8 routed, activate top-2 -# # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256 -# n_shared_experts=2, -# n_routed_experts=8, -# n_activated_experts=2, -# expert_hidden_dim=64, -# moe_balance_alpha=0.01, -# moe_score_func="softmax", -# ) - -# model = MoDAModel(cfg).to(device) -# print(f"Parameters: {model.num_parameters():,}") -# print(model) - -# B, T = 2, 32 -# input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device) -# labels = torch.randint(0, cfg.vocab_size, (B, T), device=device) - -# logits, loss = model(input_ids, labels) -# assert logits.shape == (B, T, cfg.vocab_size) -# print(f"Logits shape : {logits.shape}") -# print(f"Loss (LM + balance): {loss.item():.4f}") - -# loss.backward() - -# # Verify gradients -# last_writes = { -# f"blocks.{cfg.n_layers - 1}.k_write.weight", -# f"blocks.{cfg.n_layers - 1}.v_write.weight", -# } -# missing = [ -# name -# for name, p in model.named_parameters() -# if p.grad is None and name not in last_writes -# ] -# if missing: -# print(f"WARNING — unexpected missing gradients: {missing}") -# else: -# print("All parameters received gradients (excluding last-block writes).") - -# # Spot-check: MoE gate weights must receive gradients (through balance loss P_i) -# gate0_grad = model.blocks[0].moe.gate.weight.grad -# assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!" -# print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}") - -# # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads -# k0_grad = model.blocks[0].k_write.weight.grad -# assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!" -# print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}") - -# print("Smoke test passed.") diff --git a/tests/small_benchmark.py b/tests/small_benchmark.py index c4e7df7..6289a86 100644 --- a/tests/small_benchmark.py +++ b/tests/small_benchmark.py @@ -1,16 +1,43 @@ #!/usr/bin/env python3 """ -Side-by-side training of OpenMythos vs. a vanilla GQA transformer on a small -HuggingFace dataset (wikitext-2 by default). +Side-by-side training + benchmark of OpenMythos vs. a vanilla transformer on a +small HuggingFace dataset (TinyStories by default, streamed). -Both models share the same tiny config and see the exact same batches in the -same order, so per-step loss + throughput are directly comparable. 
-is a dense GQA + SwiGLU stack whose unique-layer depth matches the recurrent
-block's unique-parameter depth (prelude + 1 + coda), so parameter counts land
-in the same ballpark.
+Both models share the same tiny MLA config and see the exact same batches in
+the same order, so per-step train loss and throughput are directly comparable.
+The baseline is a dense stack of the same TransformerBlock primitive with
+`use_moe=False`; its unique-layer depth matches the recurrent block's
+unique-parameter depth (prelude + 1 + coda), so total parameter counts land in
+the same ballpark. Attention kernel is shared (MLA in both models), so any
+measured delta reflects the looped recurrent-depth architecture rather than
+kernel differences.
 
-    python training/small_benchmark.py
-    python training/small_benchmark.py --steps 500 --device cuda
+What the script measures
+------------------------
+1. Per-step training loss + tokens/sec for both models, fed identical batches.
+2. Periodic held-out eval loss on a separate dataset split (--eval-every).
+3. Depth-extrapolation sweep at the end: OpenMythos is trained at
+   cfg.max_loop_iters, then evaluated at n_loops in --depth-sweep
+   (default 1,2,4,8,16). This is the experiment the recurrent-depth
+   architecture is designed to win — eval loss should keep dropping past
+   the trained depth if depth extrapolation is working.
+4. Summary table with initial/final/avg train loss, wall-clock, avg tok/s,
+   and sec/step for both models.
+
+Defaults are tuned for a laptop CPU run in reasonable time; pass --device cuda
+and bump --steps / --batch-size / --seq-len for a real comparison.
+
+    # Default CPU smoke run (TinyStories, 1k steps, batch 32, seq 256)
+    python tests/small_benchmark.py
+
+    # Heavier GPU run
+    python tests/small_benchmark.py --steps 5000 --batch-size 64 --seq-len 512 --device cuda
+
+    # Wikitext instead of TinyStories
+    python tests/small_benchmark.py --dataset wikitext --dataset-config wikitext-2-raw-v1
+
+    # Aggressive depth extrapolation sweep
+    python tests/small_benchmark.py --depth-sweep 1,2,4,8,16,32
 """
 
 from __future__ import annotations
diff --git a/test_main.py b/tests/test_main.py
similarity index 100%
rename from test_main.py
rename to tests/test_main.py
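
Note on the depth-extrapolation sweep described in the new tests/small_benchmark.py
docstring (item 3): the sketch below is illustrative only and is not part of the
patch. It assumes the recurrent-depth model's forward accepts a loop-count override
(spelled `n_loops` here as a hypothetical keyword; the real argument name in
open_mythos may differ), that `eval_batches` yields (input_ids, labels) pairs, and
that the model returns (logits, loss) as in the smoke test above. The helper name
`depth_sweep_eval` is also hypothetical.

    # Hedged sketch: evaluate one trained model at several loop depths.
    import torch

    @torch.no_grad()
    def depth_sweep_eval(model, eval_batches, depths=(1, 2, 4, 8, 16)):
        """Return {n_loops: mean eval loss} for a model trained at a fixed depth."""
        model.eval()
        results = {}
        for n_loops in depths:
            total, steps = 0.0, 0
            for input_ids, labels in eval_batches:
                # Same (logits, loss) return convention as the smoke test above;
                # n_loops is a hypothetical per-call depth override.
                _, loss = model(input_ids, labels, n_loops=n_loops)
                total += loss.item()
                steps += 1
            results[n_loops] = total / max(steps, 1)
        model.train()
        return results

If depth extrapolation works as the docstring describes, results[k] should keep
falling for k beyond the cfg.max_loop_iters value used during training.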