mirror of https://github.com/kyegomez/OpenMythos.git (synced 2026-05-02 17:43:27 +02:00)
[feat][dropout][add dropout to config attn and residuals]
[docs][datasets][add recommended training datasets doc]
[docs][readme-datasets][link datasets doc in readme]

This commit is contained in:
parent 0623ceb960
commit 5cfef742b5
@@ -111,6 +111,7 @@ print(f"Parameters: {total:,}")

| Page | Description |
|---|---|
| [`docs/open_mythos.md`](docs/open_mythos.md) | Full API reference for the `OpenMythos` class — constructor, `forward`, `generate`, all sub-modules, configuration reference, and usage examples |
| [`docs/datasets.md`](docs/datasets.md) | Recommended training datasets with token budget guidance per model size |

---
docs/datasets.md (new file, 43 lines)

@@ -0,0 +1,43 @@

# Recommended Training Datasets

| Dataset | HuggingFace | Tokens | License | Use |
|---|---|---|---|---|
| FineWeb-Edu | [HuggingFaceFW/fineweb-edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu) | 1.3T | Apache 2.0 | Primary pretraining |
| OpenHermes 2.5 | [teknium/OpenHermes-2.5](https://huggingface.co/datasets/teknium/OpenHermes-2.5) | ~1M samples | Apache 2.0 | Instruction tuning (~5% mix) |
| OpenWebMath | [open-web-math/open-web-math](https://huggingface.co/datasets/open-web-math/open-web-math) | 14.7B | ODC-By | Math/reasoning boost |

---

## Primary Pretraining

### FineWeb-Edu

- **HuggingFace:** `HuggingFaceFW/fineweb-edu`
- **Size:** 1.3T tokens
- **License:** Apache 2.0
- **Why:** Web text filtered for educational quality. Outperforms The Pile, C4, and RefinedWeb on downstream benchmarks. Already deduplicated and cleaned.
- **Start with:** `sample-10BT` to validate your pipeline, then `sample-100BT` or the full corpus for a serious run (a loading sketch follows below).
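
As suggested above, here is a minimal sketch of streaming `sample-10BT` with the HuggingFace `datasets` library. The GPT-2 tokenizer and the 2048-token truncation are illustrative placeholders, not choices this repo makes.

```python
# Minimal sketch: stream FineWeb-Edu's sample-10BT config for a pipeline smoke test.
# Tokenizer and sequence length below are illustrative, not repo defaults.
from datasets import load_dataset
from transformers import AutoTokenizer

ds = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    name="sample-10BT",   # swap for "sample-100BT" or the full config for a real run
    split="train",
    streaming=True,       # avoids downloading the whole dump up front
)

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer

def tokenize(example):
    return tok(example["text"], truncation=True, max_length=2048)

ds = ds.map(tokenize)
print(next(iter(ds)).keys())  # sanity-check one streamed, tokenized example
```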

## Supplementary

### OpenHermes 2.5

- **HuggingFace:** `teknium/OpenHermes-2.5`
- **Size:** ~1M instruction samples
- **License:** Apache 2.0
- **Why:** High-quality instruction-following data. Mix in ~5% by token count on top of FineWeb-Edu to improve instruction following without degrading general capability (see the mixing sketch below).
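
One rough way to realize the ~5% mix is probabilistic sampling between the two streams. The sketch below hand-rolls it with a weighted choice (`datasets.interleave_datasets` with `probabilities=[0.95, 0.05]` does a similar job); sampling is per example rather than per token, so treat the ratio as an approximation of the token-count target. The `conversations` field name is an assumption about the OpenHermes schema.

```python
# Illustrative sketch (not repository code): mix OpenHermes 2.5 into FineWeb-Edu
# at roughly 5% by drawing each next example from a weighted coin flip.
import random
from datasets import load_dataset

web = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT",
                   split="train", streaming=True)
chat = load_dataset("teknium/OpenHermes-2.5", split="train", streaming=True)

def mix(streams, weights, seed=0):
    """Yield examples from `streams`, picking a source by `weights` at each step."""
    rng = random.Random(seed)
    iters = [iter(s) for s in streams]
    while iters:
        src = rng.choices(range(len(iters)), weights=weights)[0]
        try:
            yield next(iters[src])
        except StopIteration:
            # Drop an exhausted source and keep sampling from the rest.
            del iters[src], weights[src]

for example in mix([web, chat], weights=[0.95, 0.05]):
    ...  # tokenize and pack into training batches here
    break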

### OpenWebMath

- **HuggingFace:** `open-web-math/open-web-math`
- **Size:** ~14.7B tokens
- **License:** ODC-By
- **Why:** Math-focused web text. Add it if you want stronger quantitative and symbolic reasoning. Particularly useful for the 10B+ variants, where reasoning depth matters.

## Token Budget Recommendations

| Variant | Chinchilla-optimal | Recommended (looped) |
|---|---|---|
| 1B | ~20B tokens | ~10–15B tokens |
| 3B | ~60B tokens | ~30–40B tokens |
| 10B | ~200B tokens | ~100–150B tokens |
| 50B+ | ~1T+ tokens | ~500B+ tokens |

The looped architecture is more sample-efficient than a standard transformer — it reaches the same validation loss with fewer tokens because it converges faster. The "Recommended (looped)" column reflects this; it is based on the Tiny Shakespeare result, where OpenMythos reached equivalent loss ~2.5× faster than nanoGPT.
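
These figures follow the usual Chinchilla rule of thumb of roughly 20 training tokens per parameter, with the looped column at roughly half of that. A quick sanity check of the table under those two assumed ratios:

```python
# Back-of-the-envelope check of the token-budget table above.
# Chinchilla-optimal ~ 20 tokens/parameter; the 0.5 looped factor is an
# assumption read off the table, not a measured constant.
def token_budget(params_billions: float) -> tuple[float, float]:
    chinchilla = 20 * params_billions   # billions of tokens
    looped = 0.5 * chinchilla           # roughly half, per the table
    return chinchilla, looped

for size in (1, 3, 10, 50):
    chin, loop = token_budget(size)
    print(f"{size}B params -> ~{chin:.0f}B tokens (Chinchilla), ~{loop:.0f}B (looped)")
```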

@@ -76,6 +76,8 @@ class MythosConfig:
    lora_rank: int = 16
    # Maximum tokens to generate per forward pass
    max_output_tokens: int = 4096
+   # Dropout (set 0.0 to disable; 0.1 is standard for pretraining)
+   dropout: float = 0.0


# ---------------------------------------------------------------------------
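
For reference, a hedged usage sketch for the new field. The import path and the assumption that the model constructor takes a `MythosConfig` are mine; they are not shown in this hunk.

```python
# Sketch (assumes the other MythosConfig fields have usable defaults):
# turn dropout on for training; leave it at 0.0 for inference-only use.
from open_mythos import MythosConfig, OpenMythos  # import path is an assumption

cfg = MythosConfig(dropout=0.1)  # 0.1 is the value the new comment calls standard
model = OpenMythos(cfg)

model.train()  # nn.Dropout layers are active: attention weights and residuals are dropped
model.eval()   # nn.Dropout becomes the identity, so generation is unaffected
```

Generation code should call `model.eval()` so both the attention and residual dropouts become no-ops.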

@@ -194,6 +196,7 @@ class GQAttention(nn.Module):
        self.wk = nn.Linear(cfg.dim, cfg.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(cfg.dim, cfg.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(cfg.n_heads * self.head_dim, cfg.dim, bias=False)
+       self.attn_drop = nn.Dropout(cfg.dropout)

    def forward(
        self,

@@ -240,7 +243,7 @@ class GQAttention(nn.Module):
        attn = torch.matmul(q, k.transpose(-2, -1)) * scale
        if mask is not None:
            attn = attn + mask
-       attn = F.softmax(attn, dim=-1)
+       attn = self.attn_drop(F.softmax(attn, dim=-1))
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(B, T, -1)
        return self.wo(out)
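
For context, a standalone sketch (plain tensors, not the module above) of where this dropout sits in scaled dot-product attention: it acts on the post-softmax weights, so during training each query randomly ignores some keys while the value projection stays untouched.

```python
# Illustrative only: dropout is applied to the normalized attention weights,
# after softmax and before they are used to mix the value vectors.
import torch
import torch.nn as nn
import torch.nn.functional as F

def sdpa_with_dropout(q, k, v, dropout: nn.Dropout, mask=None):
    scale = q.shape[-1] ** -0.5
    attn = torch.matmul(q, k.transpose(-2, -1)) * scale
    if mask is not None:
        attn = attn + mask
    attn = dropout(F.softmax(attn, dim=-1))  # zeroes weights only in train mode
    return torch.matmul(attn, v)

q = k = v = torch.randn(1, 4, 8, 16)  # (batch, heads, seq, head_dim)
drop = nn.Dropout(0.1)
drop.train()
out = sdpa_with_dropout(q, k, v, drop)
```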

@@ -315,6 +318,7 @@ class MLAttention(nn.Module):
        )

        self.wo = nn.Linear(cfg.n_heads * cfg.v_head_dim, cfg.dim, bias=False)
+       self.attn_drop = nn.Dropout(cfg.dropout)

    def forward(
        self,

@@ -381,7 +385,7 @@ class MLAttention(nn.Module):
        attn = torch.matmul(q, k.transpose(-2, -1)) * scale
        if mask is not None:
            attn = attn + mask
-       attn = F.softmax(attn, dim=-1)
+       attn = self.attn_drop(F.softmax(attn, dim=-1))
        out = torch.matmul(attn, v)  # (B, H, T, v_dim)
        out = out.transpose(1, 2).contiguous().view(B, T, -1)
        return self.wo(out)

@@ -608,6 +612,7 @@ class TransformerBlock(nn.Module):
        self.ffn_norm = RMSNorm(cfg.dim)
        self.attn = MLAttention(cfg) if cfg.attn_type == "mla" else GQAttention(cfg)
        self.ffn = MoEFFN(cfg) if use_moe else Expert(cfg.dim, cfg.dim * 4 // 3)
+       self.resid_drop = nn.Dropout(cfg.dropout)

    def forward(
        self,

@@ -628,8 +633,8 @@ class TransformerBlock(nn.Module):
        Returns:
            Output tensor of shape (B, T, dim)
        """
-       x = x + self.attn(self.attn_norm(x), freqs_cis, mask, kv_cache, cache_key)
-       x = x + self.ffn(self.ffn_norm(x))
+       x = x + self.resid_drop(self.attn(self.attn_norm(x), freqs_cis, mask, kv_cache, cache_key))
+       x = x + self.resid_drop(self.ffn(self.ffn_norm(x)))
        return x
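
Because both new dropouts are plain `nn.Dropout` modules, they only fire in training mode. A quick check of the train/eval distinction, independent of the model classes above:

```python
# nn.Dropout is the identity in eval mode, so the new attention and residual
# dropouts change nothing at generation time as long as model.eval() is called.
import torch
import torch.nn as nn

drop = nn.Dropout(p=0.1)
x = torch.ones(5)

drop.train()
print(drop(x))  # some entries zeroed, the rest scaled by 1 / (1 - 0.1)

drop.eval()
print(drop(x))  # identical to x
```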