From 1c54259fa8e61005a7c692df1385015cc9d33572 Mon Sep 17 00:00:00 2001
From: Kye Gomez
Date: Sun, 19 Apr 2026 15:04:03 -0400
Subject: [PATCH] format

---
 open_mythos/variants.py | 191 +++++++++++++++++++++++++++++++---------
 1 file changed, 148 insertions(+), 43 deletions(-)

diff --git a/open_mythos/variants.py b/open_mythos/variants.py
index 2ffb9ec..8422b4e 100644
--- a/open_mythos/variants.py
+++ b/open_mythos/variants.py
@@ -1,64 +1,138 @@
 from open_mythos.main import MythosConfig
 
-# fmt: off
 
 def mythos_1b() -> MythosConfig:
     """1B parameter config. Small research/fine-tuning model.
     dim=2048, 16 experts, 16 loop iters, 4k context."""
     return MythosConfig(
-        vocab_size=32000, dim=2048, n_heads=16, n_kv_heads=4,
-        max_seq_len=4096, max_loop_iters=16, prelude_layers=2, coda_layers=2,
-        attn_type="mla", kv_lora_rank=256, q_lora_rank=512,
-        qk_rope_head_dim=32, qk_nope_head_dim=64, v_head_dim=64,
-        n_experts=16, n_shared_experts=2, n_experts_per_tok=2, expert_dim=256,
-        act_threshold=0.99, rope_theta=500000.0, lora_rank=8,
+        vocab_size=32000,
+        dim=2048,
+        n_heads=16,
+        n_kv_heads=4,
+        max_seq_len=4096,
+        max_loop_iters=16,
+        prelude_layers=2,
+        coda_layers=2,
+        attn_type="mla",
+        kv_lora_rank=256,
+        q_lora_rank=512,
+        qk_rope_head_dim=32,
+        qk_nope_head_dim=64,
+        v_head_dim=64,
+        n_experts=16,
+        n_shared_experts=2,
+        n_experts_per_tok=2,
+        expert_dim=256,
+        act_threshold=0.99,
+        rope_theta=500000.0,
+        lora_rank=8,
     )
 
 def mythos_3b() -> MythosConfig:
     """3B parameter config. Compact inference model.
     dim=3072, 32 experts, 16 loop iters, 4k context."""
     return MythosConfig(
-        vocab_size=32000, dim=3072, n_heads=24, n_kv_heads=6,
-        max_seq_len=4096, max_loop_iters=16, prelude_layers=2, coda_layers=2,
-        attn_type="mla", kv_lora_rank=384, q_lora_rank=768,
-        qk_rope_head_dim=32, qk_nope_head_dim=96, v_head_dim=96,
-        n_experts=32, n_shared_experts=2, n_experts_per_tok=2, expert_dim=384,
-        act_threshold=0.99, rope_theta=500000.0, lora_rank=8,
+        vocab_size=32000,
+        dim=3072,
+        n_heads=24,
+        n_kv_heads=6,
+        max_seq_len=4096,
+        max_loop_iters=16,
+        prelude_layers=2,
+        coda_layers=2,
+        attn_type="mla",
+        kv_lora_rank=384,
+        q_lora_rank=768,
+        qk_rope_head_dim=32,
+        qk_nope_head_dim=96,
+        v_head_dim=96,
+        n_experts=32,
+        n_shared_experts=2,
+        n_experts_per_tok=2,
+        expert_dim=384,
+        act_threshold=0.99,
+        rope_theta=500000.0,
+        lora_rank=8,
     )
 
 def mythos_10b() -> MythosConfig:
     """10B parameter config. Mid-scale general model.
     dim=4096, 64 experts, 24 loop iters, 8k context."""
     return MythosConfig(
-        vocab_size=32000, dim=4096, n_heads=32, n_kv_heads=8,
-        max_seq_len=8192, max_loop_iters=24, prelude_layers=2, coda_layers=2,
-        attn_type="mla", kv_lora_rank=512, q_lora_rank=1024,
-        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
-        n_experts=64, n_shared_experts=2, n_experts_per_tok=4, expert_dim=512,
-        act_threshold=0.99, rope_theta=500000.0, lora_rank=16,
+        vocab_size=32000,
+        dim=4096,
+        n_heads=32,
+        n_kv_heads=8,
+        max_seq_len=8192,
+        max_loop_iters=24,
+        prelude_layers=2,
+        coda_layers=2,
+        attn_type="mla",
+        kv_lora_rank=512,
+        q_lora_rank=1024,
+        qk_rope_head_dim=64,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        n_experts=64,
+        n_shared_experts=2,
+        n_experts_per_tok=4,
+        expert_dim=512,
+        act_threshold=0.99,
+        rope_theta=500000.0,
+        lora_rank=16,
     )
 
 def mythos_50b() -> MythosConfig:
     """50B parameter config. Large reasoning model.
     dim=6144, 128 experts, 32 loop iters, 8k context."""
     return MythosConfig(
-        vocab_size=32000, dim=6144, n_heads=48, n_kv_heads=8,
-        max_seq_len=8192, max_loop_iters=32, prelude_layers=3, coda_layers=3,
-        attn_type="mla", kv_lora_rank=512, q_lora_rank=1536,
-        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
-        n_experts=128, n_shared_experts=4, n_experts_per_tok=4, expert_dim=768,
-        act_threshold=0.99, rope_theta=500000.0, lora_rank=32,
+        vocab_size=32000,
+        dim=6144,
+        n_heads=48,
+        n_kv_heads=8,
+        max_seq_len=8192,
+        max_loop_iters=32,
+        prelude_layers=3,
+        coda_layers=3,
+        attn_type="mla",
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        n_experts=128,
+        n_shared_experts=4,
+        n_experts_per_tok=4,
+        expert_dim=768,
+        act_threshold=0.99,
+        rope_theta=500000.0,
+        lora_rank=32,
     )
 
 def mythos_100b() -> MythosConfig:
     """100B parameter config. Frontier-class model.
     dim=8192, 160 experts, 32 loop iters, 1M context, 128k output."""
     return MythosConfig(
-        vocab_size=32000, dim=8192, n_heads=64, n_kv_heads=8,
-        max_seq_len=1000000, max_loop_iters=32, prelude_layers=4, coda_layers=4,
-        attn_type="mla", kv_lora_rank=512, q_lora_rank=2048,
-        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
-        n_experts=160, n_shared_experts=4, n_experts_per_tok=8, expert_dim=1024,
-        act_threshold=0.99, rope_theta=1000000.0, lora_rank=64,
+        vocab_size=32000,
+        dim=8192,
+        n_heads=64,
+        n_kv_heads=8,
+        max_seq_len=1000000,
+        max_loop_iters=32,
+        prelude_layers=4,
+        coda_layers=4,
+        attn_type="mla",
+        kv_lora_rank=512,
+        q_lora_rank=2048,
+        qk_rope_head_dim=64,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        n_experts=160,
+        n_shared_experts=4,
+        n_experts_per_tok=8,
+        expert_dim=1024,
+        act_threshold=0.99,
+        rope_theta=1000000.0,
+        lora_rank=64,
         max_output_tokens=131072,
     )
@@ -66,12 +140,27 @@ def mythos_100b() -> MythosConfig:
 def mythos_500b() -> MythosConfig:
     """500B parameter config. Ultra-scale MoE model.
     dim=12288, 256 experts, 48 loop iters, 1M context, 128k output."""
     return MythosConfig(
-        vocab_size=100000, dim=12288, n_heads=96, n_kv_heads=16,
-        max_seq_len=1000000, max_loop_iters=48, prelude_layers=4, coda_layers=4,
-        attn_type="mla", kv_lora_rank=1024, q_lora_rank=3072,
-        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
-        n_experts=256, n_shared_experts=8, n_experts_per_tok=8, expert_dim=1536,
-        act_threshold=0.99, rope_theta=1000000.0, lora_rank=128,
+        vocab_size=100000,
+        dim=12288,
+        n_heads=96,
+        n_kv_heads=16,
+        max_seq_len=1000000,
+        max_loop_iters=48,
+        prelude_layers=4,
+        coda_layers=4,
+        attn_type="mla",
+        kv_lora_rank=1024,
+        q_lora_rank=3072,
+        qk_rope_head_dim=64,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        n_experts=256,
+        n_shared_experts=8,
+        n_experts_per_tok=8,
+        expert_dim=1536,
+        act_threshold=0.99,
+        rope_theta=1000000.0,
+        lora_rank=128,
         max_output_tokens=131072,
     )
@@ -79,13 +168,29 @@ def mythos_500b() -> MythosConfig:
 def mythos_1t() -> MythosConfig:
     """1T parameter config. Maximum scale.
     dim=16384, 512 experts, 64 loop iters, 1M context, 128k output."""
     return MythosConfig(
-        vocab_size=100000, dim=16384, n_heads=128, n_kv_heads=16,
-        max_seq_len=1000000, max_loop_iters=64, prelude_layers=6, coda_layers=6,
-        attn_type="mla", kv_lora_rank=1024, q_lora_rank=4096,
-        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
-        n_experts=512, n_shared_experts=8, n_experts_per_tok=8, expert_dim=2048,
-        act_threshold=0.99, rope_theta=2000000.0, lora_rank=256,
+        vocab_size=100000,
+        dim=16384,
+        n_heads=128,
+        n_kv_heads=16,
+        max_seq_len=1000000,
+        max_loop_iters=64,
+        prelude_layers=6,
+        coda_layers=6,
+        attn_type="mla",
+        kv_lora_rank=1024,
+        q_lora_rank=4096,
+        qk_rope_head_dim=64,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        n_experts=512,
+        n_shared_experts=8,
+        n_experts_per_tok=8,
+        expert_dim=2048,
+        act_threshold=0.99,
+        rope_theta=2000000.0,
+        lora_rank=256,
         max_output_tokens=131072,
     )
 
+# fmt: on
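
Reviewer note, not part of the patch: since this commit is formatting-only, the quickest sanity check is that every variant still constructs and that no field value drifted during the reflow. A minimal sketch follows, assuming MythosConfig keeps its keyword arguments accessible as attributes (e.g. a dataclass-style config); that assumption is not established by this diff.

    # Smoke test: build each variant and spot-check a few fields against
    # the values in the docstrings above.
    # Assumes attribute access on MythosConfig (not shown in this patch).
    from open_mythos.variants import (
        mythos_1b, mythos_3b, mythos_10b, mythos_50b,
        mythos_100b, mythos_500b, mythos_1t,
    )

    for build in (mythos_1b, mythos_3b, mythos_10b, mythos_50b,
                  mythos_100b, mythos_500b, mythos_1t):
        cfg = build()
        print(build.__name__, cfg.dim, cfg.n_experts, cfg.max_seq_len)

If the printed values match the dim/expert/context figures quoted in each docstring, the reformat was value-neutral.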