mirror of https://github.com/kyegomez/OpenMythos.git
synced 2026-05-02 17:43:27 +02:00

variants

This commit is contained in:
parent 299a0cdb0a
commit 9cce1c1401
@@ -15,6 +15,15 @@ from open_mythos.main import (
     apply_rope,
     loop_index_embedding,
 )
+from open_mythos.variants import (
+    mythos_1b,
+    mythos_3b,
+    mythos_10b,
+    mythos_50b,
+    mythos_100b,
+    mythos_500b,
+    mythos_1t,
+)
 
 __all__ = [
     "MythosConfig",
@@ -32,4 +41,11 @@ __all__ = [
     "precompute_rope_freqs",
     "apply_rope",
     "loop_index_embedding",
+    "mythos_1b",
+    "mythos_3b",
+    "mythos_10b",
+    "mythos_50b",
+    "mythos_100b",
+    "mythos_500b",
+    "mythos_1t",
 ]
@@ -78,6 +78,8 @@ class MythosConfig:
     rope_theta: float = 500000.0
     # LoRA depth adaptation
     lora_rank: int = 16
+    # Maximum tokens to generate per forward pass
+    max_output_tokens: int = 4096
 
 
 # ---------------------------------------------------------------------------
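Taken together, these hunks re-export the new variant constructors at the package root and give MythosConfig a bounded generation budget. A minimal usage sketch of the resulting surface, assuming MythosConfig behaves as a standard dataclass with the defaults shown (the annotated fields suggest this, but the diff does not prove it):

    from open_mythos import mythos_1b, mythos_100b  # re-exported via __all__ above

    small = mythos_1b()    # leaves max_output_tokens at its 4096 default
    large = mythos_100b()  # explicitly sets max_output_tokens=131072

    print(small.max_output_tokens, large.max_output_tokens)  # 4096 131072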
open_mythos/variants.py (new file, 91 lines)
@@ -0,0 +1,91 @@
+from open_mythos.main import MythosConfig
+
+# fmt: off
+
+def mythos_1b() -> MythosConfig:
+    """1B parameter config. Small research/fine-tuning model. dim=2048, 16 experts, 16 loop iters, 4k context."""
+    return MythosConfig(
+        vocab_size=32000, dim=2048, n_heads=16, n_kv_heads=4,
+        max_seq_len=4096, max_loop_iters=16, prelude_layers=2, coda_layers=2,
+        attn_type="mla", kv_lora_rank=256, q_lora_rank=512,
+        qk_rope_head_dim=32, qk_nope_head_dim=64, v_head_dim=64,
+        n_experts=16, n_shared_experts=2, n_experts_per_tok=2, expert_dim=256,
+        act_threshold=0.99, rope_theta=500000.0, lora_rank=8,
+    )
+
+
+def mythos_3b() -> MythosConfig:
+    """3B parameter config. Compact inference model. dim=3072, 32 experts, 16 loop iters, 4k context."""
+    return MythosConfig(
+        vocab_size=32000, dim=3072, n_heads=24, n_kv_heads=6,
+        max_seq_len=4096, max_loop_iters=16, prelude_layers=2, coda_layers=2,
+        attn_type="mla", kv_lora_rank=384, q_lora_rank=768,
+        qk_rope_head_dim=32, qk_nope_head_dim=96, v_head_dim=96,
+        n_experts=32, n_shared_experts=2, n_experts_per_tok=2, expert_dim=384,
+        act_threshold=0.99, rope_theta=500000.0, lora_rank=8,
+    )
+
+
+def mythos_10b() -> MythosConfig:
+    """10B parameter config. Mid-scale general model. dim=4096, 64 experts, 24 loop iters, 8k context."""
+    return MythosConfig(
+        vocab_size=32000, dim=4096, n_heads=32, n_kv_heads=8,
+        max_seq_len=8192, max_loop_iters=24, prelude_layers=2, coda_layers=2,
+        attn_type="mla", kv_lora_rank=512, q_lora_rank=1024,
+        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
+        n_experts=64, n_shared_experts=2, n_experts_per_tok=4, expert_dim=512,
+        act_threshold=0.99, rope_theta=500000.0, lora_rank=16,
+    )
+
+
+def mythos_50b() -> MythosConfig:
+    """50B parameter config. Large reasoning model. dim=6144, 128 experts, 32 loop iters, 8k context."""
+    return MythosConfig(
+        vocab_size=32000, dim=6144, n_heads=48, n_kv_heads=8,
+        max_seq_len=8192, max_loop_iters=32, prelude_layers=3, coda_layers=3,
+        attn_type="mla", kv_lora_rank=512, q_lora_rank=1536,
+        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
+        n_experts=128, n_shared_experts=4, n_experts_per_tok=4, expert_dim=768,
+        act_threshold=0.99, rope_theta=500000.0, lora_rank=32,
+    )
+
+
+def mythos_100b() -> MythosConfig:
+    """100B parameter config. Frontier-class model. dim=8192, 160 experts, 32 loop iters, 1M context, 128k output."""
+    return MythosConfig(
+        vocab_size=32000, dim=8192, n_heads=64, n_kv_heads=8,
+        max_seq_len=1000000, max_loop_iters=32, prelude_layers=4, coda_layers=4,
+        attn_type="mla", kv_lora_rank=512, q_lora_rank=2048,
+        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
+        n_experts=160, n_shared_experts=4, n_experts_per_tok=8, expert_dim=1024,
+        act_threshold=0.99, rope_theta=1000000.0, lora_rank=64,
+        max_output_tokens=131072,
+    )
+
+
+def mythos_500b() -> MythosConfig:
+    """500B parameter config. Ultra-scale MoE model. dim=12288, 256 experts, 48 loop iters, 1M context, 128k output."""
+    return MythosConfig(
+        vocab_size=100000, dim=12288, n_heads=96, n_kv_heads=16,
+        max_seq_len=1000000, max_loop_iters=48, prelude_layers=4, coda_layers=4,
+        attn_type="mla", kv_lora_rank=1024, q_lora_rank=3072,
+        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
+        n_experts=256, n_shared_experts=8, n_experts_per_tok=8, expert_dim=1536,
+        act_threshold=0.99, rope_theta=1000000.0, lora_rank=128,
+        max_output_tokens=131072,
+    )
+
+
+def mythos_1t() -> MythosConfig:
+    """1T parameter config. Maximum scale. dim=16384, 512 experts, 64 loop iters, 1M context, 128k output."""
+    return MythosConfig(
+        vocab_size=100000, dim=16384, n_heads=128, n_kv_heads=16,
+        max_seq_len=1000000, max_loop_iters=64, prelude_layers=6, coda_layers=6,
+        attn_type="mla", kv_lora_rank=1024, q_lora_rank=4096,
+        qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128,
+        n_experts=512, n_shared_experts=8, n_experts_per_tok=8, expert_dim=2048,
+        act_threshold=0.99, rope_theta=2000000.0, lora_rank=256,
+        max_output_tokens=131072,
+    )
+
+# fmt: on
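Because each variant is just a preset MythosConfig, a custom size can be derived from an existing recipe rather than written from scratch. A hedged sketch, assuming MythosConfig is a plain dataclass (dataclasses.replace requires that; nothing in this diff confirms it):

    from dataclasses import replace  # assumes MythosConfig is a dataclass

    from open_mythos.variants import mythos_10b

    # Start from the 10B recipe and stretch the context window; all other
    # fields (MLA ranks, expert counts, ACT threshold) carry over unchanged.
    cfg = replace(mythos_10b(), max_seq_len=32768, rope_theta=1000000.0)

Raising rope_theta together with max_seq_len follows the pattern of the presets themselves, which move from 5e5 at 4k-8k context to 1e6-2e6 at 1M context.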