{"id":588,"slug":"zyphra--zyda-2","name":"Zyda-2","author":"Zyphra","description":"\n\t\n\t\t\n\t\tZyda-2\n\t\n\n\n\nZyda-2 is a 5 trillion token language modeling dataset created by collecting open, high-quality datasets, combining them, and applying cross-deduplication and model-based quality filtering. Zyda-2 comprises diverse sources of web data, highly educational content, math, code, and scientific papers.\nTo construct Zyda-2, we took the best open-source datasets available: Zyda, FineWeb, DCLM, and Dolma. Models trained on Zyda-2 significantly outperform identical models trained on the… See the full description on the dataset page: https://huggingface.co/datasets/Zyphra/Zyda-2.","tags":"[\"Task_categories:text-Generation\",\"Language:en\",\"Size_categories:n>1T\"]","license":null,"framework":null,"parameters":null,"downloads":138634,"likes":98,"verified":0,"created_at":"2026-06-23 18:23:35","updated_at":"2026-06-29 14:23:35","source_url":"https://huggingface.co/datasets/Zyphra/Zyda-2","source_platform":"huggingface","hf_repo_id":"Zyphra/Zyda-2","ollama_name":"","category":"dataset","latest_version":"v1.0.0","version_count":1,"signature_count":1,"risk_level":null,"risk_score":null,"versions":[{"id":587,"model_id":588,"version":"v1.0.0","manifest_hash":"3309073e9e8ed364827153516469e1cb418e9d6d3484a6d282f3c9b526349183","file_count":0,"total_size":0,"r2_manifest_key":"manifests/datasets/zyphra--zyda-2/v1.0.0.json","created_at":"2026-06-23 18:23:35"}],"files":[],"signatures":[{"id":1121,"version_id":587,"signer_did":"did:quantamrkt:registry:shield-v1","algorithm":"ML-DSA-65","signature_hex":"0ddb6e075e30b5ac0ab675a777363a484576377c127a2210e23057f89e06bb55","attestation_type":"registry","signed_at":"2026-06-23 18:23:35"}],"hndl":null}