torch>=2.1.0
transformers>=4.40.0
datasets>=2.18.0
pytest>=7.0.0
# optional — enables Flash Attention 2 in GQAttention (requires CUDA + build tools)
# flash-attn>=2.8.3