mirror of
https://github.com/kyegomez/OpenMythos.git
synced 2026-05-02 17:43:27 +02:00
197 lines
5.0 KiB
Python
197 lines
5.0 KiB
Python
from open_mythos.main import MythosConfig
|
|
|
|
|
|
def mythos_1b() -> MythosConfig:
|
|
"""1B parameter config. Small research/fine-tuning model. dim=2048, 16 experts, 16 loop iters, 4k context."""
|
|
return MythosConfig(
|
|
vocab_size=32000,
|
|
dim=2048,
|
|
n_heads=16,
|
|
n_kv_heads=4,
|
|
max_seq_len=4096,
|
|
max_loop_iters=16,
|
|
prelude_layers=2,
|
|
coda_layers=2,
|
|
attn_type="mla",
|
|
kv_lora_rank=256,
|
|
q_lora_rank=512,
|
|
qk_rope_head_dim=32,
|
|
qk_nope_head_dim=64,
|
|
v_head_dim=64,
|
|
n_experts=16,
|
|
n_shared_experts=2,
|
|
n_experts_per_tok=2,
|
|
expert_dim=256,
|
|
act_threshold=0.99,
|
|
rope_theta=500000.0,
|
|
lora_rank=8,
|
|
)
|
|
|
|
|
|
def mythos_3b() -> MythosConfig:
|
|
"""3B parameter config. Compact inference model. dim=3072, 32 experts, 16 loop iters, 4k context."""
|
|
return MythosConfig(
|
|
vocab_size=32000,
|
|
dim=3072,
|
|
n_heads=24,
|
|
n_kv_heads=6,
|
|
max_seq_len=4096,
|
|
max_loop_iters=16,
|
|
prelude_layers=2,
|
|
coda_layers=2,
|
|
attn_type="mla",
|
|
kv_lora_rank=384,
|
|
q_lora_rank=768,
|
|
qk_rope_head_dim=32,
|
|
qk_nope_head_dim=96,
|
|
v_head_dim=96,
|
|
n_experts=32,
|
|
n_shared_experts=2,
|
|
n_experts_per_tok=2,
|
|
expert_dim=384,
|
|
act_threshold=0.99,
|
|
rope_theta=500000.0,
|
|
lora_rank=8,
|
|
)
|
|
|
|
|
|
def mythos_10b() -> MythosConfig:
|
|
"""10B parameter config. Mid-scale general model. dim=4096, 64 experts, 24 loop iters, 8k context."""
|
|
return MythosConfig(
|
|
vocab_size=32000,
|
|
dim=4096,
|
|
n_heads=32,
|
|
n_kv_heads=8,
|
|
max_seq_len=8192,
|
|
max_loop_iters=24,
|
|
prelude_layers=2,
|
|
coda_layers=2,
|
|
attn_type="mla",
|
|
kv_lora_rank=512,
|
|
q_lora_rank=1024,
|
|
qk_rope_head_dim=64,
|
|
qk_nope_head_dim=128,
|
|
v_head_dim=128,
|
|
n_experts=64,
|
|
n_shared_experts=2,
|
|
n_experts_per_tok=4,
|
|
expert_dim=512,
|
|
act_threshold=0.99,
|
|
rope_theta=500000.0,
|
|
lora_rank=16,
|
|
)
|
|
|
|
|
|
def mythos_50b() -> MythosConfig:
|
|
"""50B parameter config. Large reasoning model. dim=6144, 128 experts, 32 loop iters, 8k context."""
|
|
return MythosConfig(
|
|
vocab_size=32000,
|
|
dim=6144,
|
|
n_heads=48,
|
|
n_kv_heads=8,
|
|
max_seq_len=8192,
|
|
max_loop_iters=32,
|
|
prelude_layers=3,
|
|
coda_layers=3,
|
|
attn_type="mla",
|
|
kv_lora_rank=512,
|
|
q_lora_rank=1536,
|
|
qk_rope_head_dim=64,
|
|
qk_nope_head_dim=128,
|
|
v_head_dim=128,
|
|
n_experts=128,
|
|
n_shared_experts=4,
|
|
n_experts_per_tok=4,
|
|
expert_dim=768,
|
|
act_threshold=0.99,
|
|
rope_theta=500000.0,
|
|
lora_rank=32,
|
|
)
|
|
|
|
|
|
def mythos_100b() -> MythosConfig:
|
|
"""100B parameter config. Frontier-class model. dim=8192, 160 experts, 32 loop iters, 1M context, 128k output."""
|
|
return MythosConfig(
|
|
vocab_size=32000,
|
|
dim=8192,
|
|
n_heads=64,
|
|
n_kv_heads=8,
|
|
max_seq_len=1000000,
|
|
max_loop_iters=32,
|
|
prelude_layers=4,
|
|
coda_layers=4,
|
|
attn_type="mla",
|
|
kv_lora_rank=512,
|
|
q_lora_rank=2048,
|
|
qk_rope_head_dim=64,
|
|
qk_nope_head_dim=128,
|
|
v_head_dim=128,
|
|
n_experts=160,
|
|
n_shared_experts=4,
|
|
n_experts_per_tok=8,
|
|
expert_dim=1024,
|
|
act_threshold=0.99,
|
|
rope_theta=1000000.0,
|
|
lora_rank=64,
|
|
max_output_tokens=131072,
|
|
)
|
|
|
|
|
|
def mythos_500b() -> MythosConfig:
|
|
"""500B parameter config. Ultra-scale MoE model. dim=12288, 256 experts, 48 loop iters, 1M context, 128k output."""
|
|
return MythosConfig(
|
|
vocab_size=100000,
|
|
dim=12288,
|
|
n_heads=96,
|
|
n_kv_heads=16,
|
|
max_seq_len=1000000,
|
|
max_loop_iters=48,
|
|
prelude_layers=4,
|
|
coda_layers=4,
|
|
attn_type="mla",
|
|
kv_lora_rank=1024,
|
|
q_lora_rank=3072,
|
|
qk_rope_head_dim=64,
|
|
qk_nope_head_dim=128,
|
|
v_head_dim=128,
|
|
n_experts=256,
|
|
n_shared_experts=8,
|
|
n_experts_per_tok=8,
|
|
expert_dim=1536,
|
|
act_threshold=0.99,
|
|
rope_theta=1000000.0,
|
|
lora_rank=128,
|
|
max_output_tokens=131072,
|
|
)
|
|
|
|
|
|
def mythos_1t() -> MythosConfig:
|
|
"""1T parameter config. Maximum scale. dim=16384, 512 experts, 64 loop iters, 1M context, 128k output."""
|
|
return MythosConfig(
|
|
vocab_size=100000,
|
|
dim=16384,
|
|
n_heads=128,
|
|
n_kv_heads=16,
|
|
max_seq_len=1000000,
|
|
max_loop_iters=64,
|
|
prelude_layers=6,
|
|
coda_layers=6,
|
|
attn_type="mla",
|
|
kv_lora_rank=1024,
|
|
q_lora_rank=4096,
|
|
qk_rope_head_dim=64,
|
|
qk_nope_head_dim=128,
|
|
v_head_dim=128,
|
|
n_experts=512,
|
|
n_shared_experts=8,
|
|
n_experts_per_tok=8,
|
|
expert_dim=2048,
|
|
act_threshold=0.99,
|
|
rope_theta=2000000.0,
|
|
lora_rank=256,
|
|
max_output_tokens=131072,
|
|
)
|
|
|
|
|
|
# fmt: on
|