Mirror of https://github.com/kyegomez/OpenMythos.git (synced 2026-05-02 17:43:27 +02:00)
new examples folder
This commit is contained in:
parent 963e11277d
commit 227dbb1532
examples/moda_example.py (new file, 73 lines)
@@ -0,0 +1,73 @@
import torch

from open_mythos.moda import MoDAConfig, MoDAModel


# ---------------------------------------------------------------------------
# Smoke test
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    torch.manual_seed(42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Tiny config: 4 layers, 8 routed experts, top-2
    cfg = MoDAConfig(
        vocab_size=512,
        d_model=128,
        n_layers=4,
        n_heads_q=4,
        n_heads_kv=2,
        head_dim=32,
        max_seq_len=64,
        # MoE: 2 shared + 8 routed, activate top-2
        # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256
        n_shared_experts=2,
        n_routed_experts=8,
        n_activated_experts=2,
        expert_hidden_dim=64,
        moe_balance_alpha=0.01,
        moe_score_func="softmax",
    )

    model = MoDAModel(cfg).to(device)
    print(f"Parameters: {model.num_parameters():,}")
    print(model)

    B, T = 2, 32
    input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device)
    labels = torch.randint(0, cfg.vocab_size, (B, T), device=device)

    logits, loss = model(input_ids, labels)
    assert logits.shape == (B, T, cfg.vocab_size)
    print(f"Logits shape : {logits.shape}")
    print(f"Loss (LM + balance): {loss.item():.4f}")

    loss.backward()

    # Verify gradients
    last_writes = {
        f"blocks.{cfg.n_layers - 1}.k_write.weight",
        f"blocks.{cfg.n_layers - 1}.v_write.weight",
    }
    missing = [
        name
        for name, p in model.named_parameters()
        if p.grad is None and name not in last_writes
    ]
    if missing:
        print(f"WARNING — unexpected missing gradients: {missing}")
    else:
        print("All parameters received gradients (excluding last-block writes).")

    # Spot-check: MoE gate weights must receive gradients (through balance loss P_i)
    gate0_grad = model.blocks[0].moe.gate.weight.grad
    assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!"
    print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}")

    # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads
    k0_grad = model.blocks[0].k_write.weight.grad
    assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!"
    print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}")

    print("Smoke test passed.")
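The config above pairs 2 shared experts with top-2 routing over 8 routed experts, each with hidden dim 64; the comment's arithmetic counts the 2 shared plus the 2 activated routed experts, so each token sees (2+2)*64 = 256 activated hidden units, in the same ballpark as a dense SwiGLU FFN with hidden size ~256. The smoke test also asserts that the gate receives gradients through the balance loss P_i. As a rough reference for what such a top-k gate plus auxiliary load-balancing term commonly looks like (a generic sketch, not necessarily how open_mythos.moda implements it; the function name `router_with_balance_loss`, the softmax scoring, and the f_i * P_i formulation are assumptions, with `alpha` playing the role of moe_balance_alpha):

import torch
import torch.nn.functional as F

def router_with_balance_loss(gate: torch.nn.Linear, x: torch.Tensor,
                             top_k: int = 2, alpha: float = 0.01):
    """Generic top-k softmax router with an auxiliary load-balancing loss.

    x: (tokens, d_model). Returns (top-k weights, top-k expert ids, aux loss).
    """
    scores = F.softmax(gate(x), dim=-1)            # (tokens, n_experts): per-token gate probs
    topk_w, topk_idx = scores.topk(top_k, dim=-1)  # experts chosen for each token

    n_experts = scores.shape[-1]
    # f_i: fraction of routed slots assigned to expert i; P_i: mean gate probability of expert i.
    f = F.one_hot(topk_idx, n_experts).float().sum(dim=(0, 1)) / (x.shape[0] * top_k)
    P = scores.mean(dim=0)
    aux = alpha * n_experts * (f * P).sum()        # minimized when routing is balanced
    return topk_w, topk_idx, aux

In a full MoE layer the top-k weights would also mix the routed experts' outputs; only the pieces relevant to the gradient check (the gate and the auxiliary loss through P_i) are shown here.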
@@ -1061,74 +1061,3 @@ class MoDAModel(nn.Module):
            f"(top-{c.n_activated_experts}), "
            f"params={self.num_parameters():,}"
        )


# # ---------------------------------------------------------------------------
# # Smoke test
# # ---------------------------------------------------------------------------

# if __name__ == "__main__":
#     torch.manual_seed(42)
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     print(f"Device: {device}")

#     # Tiny config: 4 layers, 8 routed experts, top-2
#     cfg = MoDAConfig(
#         vocab_size=512,
#         d_model=128,
#         n_layers=4,
#         n_heads_q=4,
#         n_heads_kv=2,
#         head_dim=32,
#         max_seq_len=64,
#         # MoE: 2 shared + 8 routed, activate top-2
#         # (2+2)*64 = 256 ≈ equivalent to dense SwiGLU hidden~256
#         n_shared_experts=2,
#         n_routed_experts=8,
#         n_activated_experts=2,
#         expert_hidden_dim=64,
#         moe_balance_alpha=0.01,
#         moe_score_func="softmax",
#     )

#     model = MoDAModel(cfg).to(device)
#     print(f"Parameters: {model.num_parameters():,}")
#     print(model)

#     B, T = 2, 32
#     input_ids = torch.randint(0, cfg.vocab_size, (B, T), device=device)
#     labels = torch.randint(0, cfg.vocab_size, (B, T), device=device)

#     logits, loss = model(input_ids, labels)
#     assert logits.shape == (B, T, cfg.vocab_size)
#     print(f"Logits shape : {logits.shape}")
#     print(f"Loss (LM + balance): {loss.item():.4f}")

#     loss.backward()

#     # Verify gradients
#     last_writes = {
#         f"blocks.{cfg.n_layers - 1}.k_write.weight",
#         f"blocks.{cfg.n_layers - 1}.v_write.weight",
#     }
#     missing = [
#         name
#         for name, p in model.named_parameters()
#         if p.grad is None and name not in last_writes
#     ]
#     if missing:
#         print(f"WARNING — unexpected missing gradients: {missing}")
#     else:
#         print("All parameters received gradients (excluding last-block writes).")

#     # Spot-check: MoE gate weights must receive gradients (through balance loss P_i)
#     gate0_grad = model.blocks[0].moe.gate.weight.grad
#     assert gate0_grad is not None, "blocks[0].moe.gate.weight has no gradient!"
#     print(f"blocks[0].moe.gate.weight grad norm : {gate0_grad.norm().item():.6f}")

#     # Spot-check: depth write projections gradient flows from layer ≥ 1 depth reads
#     k0_grad = model.blocks[0].k_write.weight.grad
#     assert k0_grad is not None, "blocks[0].k_write.weight has no gradient!"
#     print(f"blocks[0].k_write.weight grad norm : {k0_grad.norm().item():.6f}")

#     print("Smoke test passed.")
@@ -1,16 +1,43 @@
 #!/usr/bin/env python3
 """
-Side-by-side training of OpenMythos vs. a vanilla GQA transformer on a small
-HuggingFace dataset (wikitext-2 by default).
+Side-by-side training + benchmark of OpenMythos vs. a vanilla transformer on a
+small HuggingFace dataset (TinyStories by default, streamed).

-Both models share the same tiny config and see the exact same batches in the
-same order, so per-step loss + throughput are directly comparable. The baseline
-is a dense GQA + SwiGLU stack whose unique-layer depth matches the recurrent
-block's unique-parameter depth (prelude + 1 + coda), so parameter counts land
-in the same ballpark.
+Both models share the same tiny MLA config and see the exact same batches in
+the same order, so per-step train loss and throughput are directly comparable.
+The baseline is a dense stack of the same TransformerBlock primitive with
+`use_moe=False`; its unique-layer depth matches the recurrent block's
+unique-parameter depth (prelude + 1 + coda), so total parameter counts land in
+the same ballpark. Attention kernel is shared (MLA in both models), so any
+measured delta reflects the looped recurrent-depth architecture rather than
+kernel differences.

-    python training/small_benchmark.py
-    python training/small_benchmark.py --steps 500 --device cuda
+What the script measures
+------------------------
+1. Per-step training loss + tokens/sec for both models, fed identical batches.
+2. Periodic held-out eval loss on a separate dataset split (--eval-every).
+3. Depth-extrapolation sweep at the end: OpenMythos is trained at
+   cfg.max_loop_iters, then evaluated at n_loops in --depth-sweep
+   (default 1,2,4,8,16). This is the experiment the recurrent-depth
+   architecture is designed to win — eval loss should keep dropping past
+   the trained depth if depth extrapolation is working.
+4. Summary table with initial/final/avg train loss, wall-clock, avg tok/s,
+   and sec/step for both models.
+
+Defaults are tuned for a laptop CPU run in reasonable time; pass --device cuda
+and bump --steps / --batch-size / --seq-len for a real comparison.
+
+    # Default CPU smoke run (TinyStories, 1k steps, batch 32, seq 256)
+    python tests/small_benchmark.py
+
+    # Heavier GPU run
+    python tests/small_benchmark.py --steps 5000 --batch-size 64 --seq-len 512 --device cuda
+
+    # Wikitext instead of TinyStories
+    python tests/small_benchmark.py --dataset wikitext --dataset-config wikitext-2-raw-v1
+
+    # Aggressive depth extrapolation sweep
+    python tests/small_benchmark.py --depth-sweep 1,2,4,8,16,32
 """

 from __future__ import annotations
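The depth-extrapolation sweep in point 3 of the new docstring amounts to evaluating one trained model at several loop counts on fixed held-out batches and comparing losses. A minimal sketch, assuming hypothetically that the model's forward accepts an `n_loops` override and returns `(logits, loss)` as in the smoke test above (the helper name `depth_sweep` and the exact signature used by small_benchmark.py are assumptions):

import torch

@torch.no_grad()
def depth_sweep(model, eval_batches, depths=(1, 2, 4, 8, 16), device="cpu"):
    """Evaluate one trained recurrent-depth model at several loop counts."""
    model.eval()
    results = {}
    for n_loops in depths:
        total, count = 0.0, 0
        for input_ids, labels in eval_batches:
            # Assumed signature: forward(input_ids, labels, n_loops=...) -> (logits, loss)
            _, loss = model(input_ids.to(device), labels.to(device), n_loops=n_loops)
            total += loss.item()
            count += 1
        results[n_loops] = total / max(count, 1)
        print(f"n_loops={n_loops:>3d}  eval loss={results[n_loops]:.4f}")
    return results

If depth extrapolation is working as the docstring describes, the reported eval loss should keep improving at loop counts beyond the depth used during training.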