{"id":507,"slug":"jobs-git--zyda-2","name":"Zyda-2","author":"jobs-git","description":"\n\t\n\t\t\n\t\tZyda-2\n\t\n\n\n\nZyda-2 is a 5 trillion token language modeling dataset created by collecting open and high quality datasets and combining them and cross-deduplication and model-based quality filtering. Zyda-2 comprises diverse sources of web data, highly educational content, math, code, and scientific papers.\nTo construct Zyda-2, we took the best open-source datasets available: Zyda, FineWeb, DCLM, and Dolma. Models trained on Zyda-2 significantly outperform identical models trained on the… See the full description on the dataset page: https://huggingface.co/datasets/jobs-git/Zyda-2.","tags":"[\"Task_categories:text-Generation\",\"Language:en\",\"Size_categories:n>1T\"]","license":null,"framework":null,"parameters":null,"downloads":118897,"likes":1,"verified":0,"created_at":"2026-05-01 11:33:26","updated_at":"2026-05-06 04:06:05","source_url":"https://huggingface.co/datasets/jobs-git/Zyda-2","source_platform":"huggingface","hf_repo_id":"jobs-git/Zyda-2","ollama_name":"","category":"dataset","latest_version":"v1.0.0","version_count":1,"signature_count":1,"risk_level":null,"risk_score":null,"versions":[{"id":506,"model_id":507,"version":"v1.0.0","manifest_hash":"387de7d87e61dd8cc1a363f3d3070eb158232ba803227d27f15edfdb4c625374","file_count":0,"total_size":0,"r2_manifest_key":"manifests/datasets/jobs-git--zyda-2/v1.0.0.json","created_at":"2026-05-01 11:33:26"}],"files":[],"signatures":[{"id":1031,"version_id":506,"signer_did":"did:quantamrkt:registry:shield-v1","algorithm":"ML-DSA-65","signature_hex":"333229b73e94e49a13b45587d638bb45a7df08a4a30e566a978978981278ae05","attestation_type":"registry","signed_at":"2026-05-01 11:33:26"}],"hndl":null}