From 227dbb153266cd154f3d335f18c6b80947879f14 Mon Sep 17 00:00:00 2001
From: Kye Gomez
Date: Wed, 22 Apr 2026 13:00:57 -0400
Subject: [PATCH] new examples folder

---
 examples/moda_example.py                | 73 +++++++++++++++++++++++++
 {tests => examples}/variants_example.py |  0
 open_mythos/moda.py                     | 71 ------------------------
 tests/small_benchmark.py                | 45 ++++++++++++---
 test_main.py => tests/test_main.py      |  0
 5 files changed, 109 insertions(+), 80 deletions(-)
 create mode 100644 examples/moda_example.py
 rename {tests => examples}/variants_example.py (100%)
 rename test_main.py => tests/test_main.py (100%)

diff --git a/examples/moda_example.py b/examples/moda_example.py
new file mode 100644
index 0000000..bffc92e
--- /dev/null
+++ b/examples/moda_example.py
@@ -0,0 +1,73 @@
+import torch
+from open_mythos.moda import MoDAConfig, MoDAModel
+
+
+# ---------------------------------------------------------------------------
+# Smoke test
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    torch.manual_seed(42)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Device: {device}")
+
+    # Tiny config: 4 layers, 8 routed experts, top-2
+    cfg = MoDAConfig(
+        vocab_size=512,
+        d_model=128,
+        n_layers=4,
+        n_heads_q=4,
+        n_heads_kv=2,
+        head_dim=32,
+        max_seq_len=64,
+        # MoE: 2 shared + 8 routed, activate top-2
+        # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256
+        n_shared_experts=2,
+        n_routed_experts=8,
+        n_activated_experts=2,
+        expert_hidden_dim=64,
+        moe_balance_alpha=0.01,
+        moe_score_func="softmax",
+    )
+
+    model = MoDAModel(cfg).to(device)
+    print(f"Parameters: {model.num_parameters():,}")
+    print(model)
+
+    B, T = 2, 32
+    input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device)
+    labels = torch.randint(0, cfg.vocab_size, (B, T), device=device)
+
+    logits, loss = model(input_ids, labels)
+    assert logits.shape == (B, T, cfg.vocab_size)
+    print(f"Logits shape : {logits.shape}")
+    print(f"Loss (LM + balance): {loss.item():.4f}")
+
+    loss.backward()
+
+    # Verify gradients
+    last_writes = {
+        f"blocks.{cfg.n_layers - 1}.k_write.weight",
+        f"blocks.{cfg.n_layers - 1}.v_write.weight",
+    }
+    missing = [
+        name
+        for name, p in model.named_parameters()
+        if p.grad is None and name not in last_writes
+    ]
+    if missing:
+        print(f"WARNING — unexpected missing gradients: {missing}")
+    else:
+        print("All parameters received gradients (excluding last-block writes).")
+
+    # Spot-check: MoE gate weights must receive gradients (through balance loss P_i)
+    gate0_grad = model.blocks[0].moe.gate.weight.grad
+    assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!"
+    print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}")
+
+    # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads
+    k0_grad = model.blocks[0].k_write.weight.grad
+    assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!"
+ print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}") + + print("Smoke test passed.") diff --git a/tests/variants_example.py b/examples/variants_example.py similarity index 100% rename from tests/variants_example.py rename to examples/variants_example.py diff --git a/open_mythos/moda.py b/open_mythos/moda.py index e662d61..94f6af5 100644 --- a/open_mythos/moda.py +++ b/open_mythos/moda.py @@ -1061,74 +1061,3 @@ class MoDAModel(nn.Module): f"(top-{c.n_activated_experts}), " f"params={self.num_parameters():,}" ) - - -# # --------------------------------------------------------------------------- -# # Smoke test -# # --------------------------------------------------------------------------- - -# if __name__ == "__main__": -# torch.manual_seed(42) -# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -# print(f"Device: {device}") - -# # Tiny config: 4 layers, 8 routed experts, top-2 -# cfg = MoDAConfig( -# vocab_size=512, -# d_model=128, -# n_layers=4, -# n_heads_q=4, -# n_heads_kv=2, -# head_dim=32, -# max_seq_len=64, -# # MoE: 2 shared + 8 routed, activate top-2 -# # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256 -# n_shared_experts=2, -# n_routed_experts=8, -# n_activated_experts=2, -# expert_hidden_dim=64, -# moe_balance_alpha=0.01, -# moe_score_func="softmax", -# ) - -# model = MoDAModel(cfg).to(device) -# print(f"Parameters: {model.num_parameters():,}") -# print(model) - -# B, T = 2, 32 -# input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device) -# labels = torch.randint(0, cfg.vocab_size, (B, T), device=device) - -# logits, loss = model(input_ids, labels) -# assert logits.shape == (B, T, cfg.vocab_size) -# print(f"Logits shape : {logits.shape}") -# print(f"Loss (LM + balance): {loss.item():.4f}") - -# loss.backward() - -# # Verify gradients -# last_writes = { -# f"blocks.{cfg.n_layers - 1}.k_write.weight", -# f"blocks.{cfg.n_layers - 1}.v_write.weight", -# } -# missing = [ -# name -# for name, p in model.named_parameters() -# if p.grad is None and name not in last_writes -# ] -# if missing: -# print(f"WARNING — unexpected missing gradients: {missing}") -# else: -# print("All parameters received gradients (excluding last-block writes).") - -# # Spot-check: MoE gate weights must receive gradients (through balance loss P_i) -# gate0_grad = model.blocks[0].moe.gate.weight.grad -# assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!" -# print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}") - -# # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads -# k0_grad = model.blocks[0].k_write.weight.grad -# assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!" -# print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}") - -# print("Smoke test passed.") diff --git a/tests/small_benchmark.py b/tests/small_benchmark.py index c4e7df7..6289a86 100644 --- a/tests/small_benchmark.py +++ b/tests/small_benchmark.py @@ -1,16 +1,43 @@ #!/usr/bin/env python3 """ -Side-by-side training of OpenMythos vs. a vanilla GQA transformer on a small -HuggingFace dataset (wikitext-2 by default). +Side-by-side training + benchmark of OpenMythos vs. a vanilla transformer on a +small HuggingFace dataset (TinyStories by default, streamed). -Both models share the same tiny config and see the exact same batches in the -same order, so per-step loss + throughput are directly comparable. 
-is a dense GQA + SwiGLU stack whose unique-layer depth matches the recurrent
-block's unique-parameter depth (prelude + 1 + coda), so parameter counts land
-in the same ballpark.
+Both models share the same tiny MLA config and see the exact same batches in
+the same order, so per-step train loss and throughput are directly comparable.
+The baseline is a dense stack of the same TransformerBlock primitive with
+`use_moe=False`; its unique-layer depth matches the recurrent block's
+unique-parameter depth (prelude + 1 + coda), so total parameter counts land in
+the same ballpark. Attention kernel is shared (MLA in both models), so any
+measured delta reflects the looped recurrent-depth architecture rather than
+kernel differences.
 
-    python training/small_benchmark.py
-    python training/small_benchmark.py --steps 500 --device cuda
+What the script measures
+------------------------
+1. Per-step training loss + tokens/sec for both models, fed identical batches.
+2. Periodic held-out eval loss on a separate dataset split (--eval-every).
+3. Depth-extrapolation sweep at the end: OpenMythos is trained at
+   cfg.max_loop_iters, then evaluated at n_loops in --depth-sweep
+   (default 1,2,4,8,16). This is the experiment the recurrent-depth
+   architecture is designed to win — eval loss should keep dropping past
+   the trained depth if depth extrapolation is working.
+4. Summary table with initial/final/avg train loss, wall-clock, avg tok/s,
+   and sec/step for both models.
+
+Defaults are tuned for a laptop CPU run in reasonable time; pass --device cuda
+and bump --steps / --batch-size / --seq-len for a real comparison.
+
+    # Default CPU smoke run (TinyStories, 1k steps, batch 32, seq 256)
+    python tests/small_benchmark.py
+
+    # Heavier GPU run
+    python tests/small_benchmark.py --steps 5000 --batch-size 64 --seq-len 512 --device cuda
+
+    # Wikitext instead of TinyStories
+    python tests/small_benchmark.py --dataset wikitext --dataset-config wikitext-2-raw-v1
+
+    # Aggressive depth extrapolation sweep
+    python tests/small_benchmark.py --depth-sweep 1,2,4,8,16,32
 """
 
 from __future__ import annotations
diff --git a/test_main.py b/tests/test_main.py
similarity index 100%
rename from test_main.py
rename to tests/test_main.py
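
Note on the depth-extrapolation sweep described in the new tests/small_benchmark.py
docstring (item 3): the sketch below is illustrative only and is not part of the
patch. It assumes the recurrent-depth model's forward accepts a loop-count override
(spelled `n_loops` here as a hypothetical keyword; the real argument name in
open_mythos may differ), that `eval_batches` yields (input_ids, labels) pairs, and
that the model returns (logits, loss) as in the smoke test above. The helper name
`depth_sweep_eval` is also hypothetical.

    # Hedged sketch: evaluate one trained model at several loop depths.
    import torch

    @torch.no_grad()
    def depth_sweep_eval(model, eval_batches, depths=(1, 2, 4, 8, 16)):
        """Return {n_loops: mean eval loss} for a model trained at a fixed depth."""
        model.eval()
        results = {}
        for n_loops in depths:
            total, steps = 0.0, 0
            for input_ids, labels in eval_batches:
                # Same (logits, loss) return convention as the smoke test above;
                # n_loops is a hypothetical per-call depth override.
                _, loss = model(input_ids, labels, n_loops=n_loops)
                total += loss.item()
                steps += 1
            results[n_loops] = total / max(steps, 1)
        model.train()
        return results

If depth extrapolation works as the docstring describes, results[k] should keep
falling for k beyond the cfg.max_loop_iters value used during training.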