This commit is contained in:
Kye Gomez 2026-04-19 15:04:03 -04:00
parent 9cce1c1401
commit 1c54259fa8

View File

@ -1,64 +1,138 @@
from open_mythos.main import MythosConfig from open_mythos.main import MythosConfig
# fmt: off
def mythos_1b() -> MythosConfig:
    """Build the 1B-parameter preset: a small research / fine-tuning model.

    Headline values: dim=2048, 16 experts, 16 loop iterations, 4k context.

    Returns:
        A ``MythosConfig`` constructed from the keyword values listed below.
    """
    settings = {
        # Vocabulary size, model width and head counts.
        "vocab_size": 32000,
        "dim": 2048,
        "n_heads": 16,
        "n_kv_heads": 4,
        # Context length, loop-iteration cap and prelude/coda layer counts.
        "max_seq_len": 4096,
        "max_loop_iters": 16,
        "prelude_layers": 2,
        "coda_layers": 2,
        # Attention type with its rank and per-head dimension values.
        "attn_type": "mla",
        "kv_lora_rank": 256,
        "q_lora_rank": 512,
        "qk_rope_head_dim": 32,
        "qk_nope_head_dim": 64,
        "v_head_dim": 64,
        # Expert counts and expert width.
        "n_experts": 16,
        "n_shared_experts": 2,
        "n_experts_per_tok": 2,
        "expert_dim": 256,
        # Remaining scalar settings.
        "act_threshold": 0.99,
        "rope_theta": 500000.0,
        "lora_rank": 8,
    }
    return MythosConfig(**settings)
def mythos_3b() -> MythosConfig:
    """Build the 3B-parameter preset: a compact inference model.

    Headline values: dim=3072, 32 experts, 16 loop iterations, 4k context.

    Returns:
        A ``MythosConfig`` constructed from the keyword values listed below.
    """
    settings = {
        # Vocabulary size, model width and head counts.
        "vocab_size": 32000,
        "dim": 3072,
        "n_heads": 24,
        "n_kv_heads": 6,
        # Context length, loop-iteration cap and prelude/coda layer counts.
        "max_seq_len": 4096,
        "max_loop_iters": 16,
        "prelude_layers": 2,
        "coda_layers": 2,
        # Attention type with its rank and per-head dimension values.
        "attn_type": "mla",
        "kv_lora_rank": 384,
        "q_lora_rank": 768,
        "qk_rope_head_dim": 32,
        "qk_nope_head_dim": 96,
        "v_head_dim": 96,
        # Expert counts and expert width.
        "n_experts": 32,
        "n_shared_experts": 2,
        "n_experts_per_tok": 2,
        "expert_dim": 384,
        # Remaining scalar settings.
        "act_threshold": 0.99,
        "rope_theta": 500000.0,
        "lora_rank": 8,
    }
    return MythosConfig(**settings)
def mythos_10b() -> MythosConfig:
    """Build the 10B-parameter preset: a mid-scale general model.

    Headline values: dim=4096, 64 experts, 24 loop iterations, 8k context.

    Returns:
        A ``MythosConfig`` constructed from the keyword values listed below.
    """
    settings = {
        # Vocabulary size, model width and head counts.
        "vocab_size": 32000,
        "dim": 4096,
        "n_heads": 32,
        "n_kv_heads": 8,
        # Context length, loop-iteration cap and prelude/coda layer counts.
        "max_seq_len": 8192,
        "max_loop_iters": 24,
        "prelude_layers": 2,
        "coda_layers": 2,
        # Attention type with its rank and per-head dimension values.
        "attn_type": "mla",
        "kv_lora_rank": 512,
        "q_lora_rank": 1024,
        "qk_rope_head_dim": 64,
        "qk_nope_head_dim": 128,
        "v_head_dim": 128,
        # Expert counts and expert width.
        "n_experts": 64,
        "n_shared_experts": 2,
        "n_experts_per_tok": 4,
        "expert_dim": 512,
        # Remaining scalar settings.
        "act_threshold": 0.99,
        "rope_theta": 500000.0,
        "lora_rank": 16,
    }
    return MythosConfig(**settings)
def mythos_50b() -> MythosConfig:
    """Build the 50B-parameter preset: a large reasoning model.

    Headline values: dim=6144, 128 experts, 32 loop iterations, 8k context.

    Returns:
        A ``MythosConfig`` constructed from the keyword values listed below.
    """
    settings = {
        # Vocabulary size, model width and head counts.
        "vocab_size": 32000,
        "dim": 6144,
        "n_heads": 48,
        "n_kv_heads": 8,
        # Context length, loop-iteration cap and prelude/coda layer counts.
        "max_seq_len": 8192,
        "max_loop_iters": 32,
        "prelude_layers": 3,
        "coda_layers": 3,
        # Attention type with its rank and per-head dimension values.
        "attn_type": "mla",
        "kv_lora_rank": 512,
        "q_lora_rank": 1536,
        "qk_rope_head_dim": 64,
        "qk_nope_head_dim": 128,
        "v_head_dim": 128,
        # Expert counts and expert width.
        "n_experts": 128,
        "n_shared_experts": 4,
        "n_experts_per_tok": 4,
        "expert_dim": 768,
        # Remaining scalar settings.
        "act_threshold": 0.99,
        "rope_theta": 500000.0,
        "lora_rank": 32,
    }
    return MythosConfig(**settings)
def mythos_100b() -> MythosConfig:
    """Build the 100B-parameter preset: a frontier-class model.

    Headline values: dim=8192, 160 experts, 32 loop iterations, 1M context,
    128k output.

    Returns:
        A ``MythosConfig`` constructed from the keyword values listed below.
    """
    settings = {
        # Vocabulary size, model width and head counts.
        "vocab_size": 32000,
        "dim": 8192,
        "n_heads": 64,
        "n_kv_heads": 8,
        # Context length, loop-iteration cap and prelude/coda layer counts.
        "max_seq_len": 1000000,
        "max_loop_iters": 32,
        "prelude_layers": 4,
        "coda_layers": 4,
        # Attention type with its rank and per-head dimension values.
        "attn_type": "mla",
        "kv_lora_rank": 512,
        "q_lora_rank": 2048,
        "qk_rope_head_dim": 64,
        "qk_nope_head_dim": 128,
        "v_head_dim": 128,
        # Expert counts and expert width.
        "n_experts": 160,
        "n_shared_experts": 4,
        "n_experts_per_tok": 8,
        "expert_dim": 1024,
        # Remaining scalar settings.
        "act_threshold": 0.99,
        "rope_theta": 1000000.0,
        "lora_rank": 64,
        # Output-length cap (the "128k output" from the summary above).
        "max_output_tokens": 131072,
    }
    return MythosConfig(**settings)
@ -66,12 +140,27 @@ def mythos_100b() -> MythosConfig:
def mythos_500b() -> MythosConfig:
    """Build the 500B-parameter preset: an ultra-scale MoE model.

    Headline values: dim=12288, 256 experts, 48 loop iterations, 1M context,
    128k output.

    Returns:
        A ``MythosConfig`` constructed from the keyword values listed below.
    """
    settings = {
        # Vocabulary size, model width and head counts.
        "vocab_size": 100000,
        "dim": 12288,
        "n_heads": 96,
        "n_kv_heads": 16,
        # Context length, loop-iteration cap and prelude/coda layer counts.
        "max_seq_len": 1000000,
        "max_loop_iters": 48,
        "prelude_layers": 4,
        "coda_layers": 4,
        # Attention type with its rank and per-head dimension values.
        "attn_type": "mla",
        "kv_lora_rank": 1024,
        "q_lora_rank": 3072,
        "qk_rope_head_dim": 64,
        "qk_nope_head_dim": 128,
        "v_head_dim": 128,
        # Expert counts and expert width.
        "n_experts": 256,
        "n_shared_experts": 8,
        "n_experts_per_tok": 8,
        "expert_dim": 1536,
        # Remaining scalar settings.
        "act_threshold": 0.99,
        "rope_theta": 1000000.0,
        "lora_rank": 128,
        # Output-length cap (the "128k output" from the summary above).
        "max_output_tokens": 131072,
    }
    return MythosConfig(**settings)
@ -79,13 +168,29 @@ def mythos_500b() -> MythosConfig:
def mythos_1t() -> MythosConfig:
    """Build the 1T-parameter preset: the maximum-scale configuration.

    Headline values: dim=16384, 512 experts, 64 loop iterations, 1M context,
    128k output.

    Returns:
        A ``MythosConfig`` constructed from the keyword values listed below.
    """
    settings = {
        # Vocabulary size, model width and head counts.
        "vocab_size": 100000,
        "dim": 16384,
        "n_heads": 128,
        "n_kv_heads": 16,
        # Context length, loop-iteration cap and prelude/coda layer counts.
        "max_seq_len": 1000000,
        "max_loop_iters": 64,
        "prelude_layers": 6,
        "coda_layers": 6,
        # Attention type with its rank and per-head dimension values.
        "attn_type": "mla",
        "kv_lora_rank": 1024,
        "q_lora_rank": 4096,
        "qk_rope_head_dim": 64,
        "qk_nope_head_dim": 128,
        "v_head_dim": 128,
        # Expert counts and expert width.
        "n_experts": 512,
        "n_shared_experts": 8,
        "n_experts_per_tok": 8,
        "expert_dim": 2048,
        # Remaining scalar settings.
        "act_threshold": 0.99,
        "rope_theta": 2000000.0,
        "lora_rank": 256,
        # Output-length cap (the "128k output" from the summary above).
        "max_output_tokens": 131072,
    }
    return MythosConfig(**settings)
# fmt: on # fmt: on