{"id":515,"slug":"huggingfacefw--fineweb-2","name":"fineweb-2","author":"HuggingFaceFW","description":"\n\t\n\t\t\n\t\t🥂 FineWeb2\n\t\n\n\n    \n\n\n\nA sparkling update with 1000s of languages\n\n\n\t\n\t\t\n\t\tWhat is it?\n\t\n\nThis is the second iteration of the popular 🍷 FineWeb dataset, bringing high quality pretraining data to over 1000 🗣️ languages.\nThe 🥂 FineWeb2 dataset is fully reproducible, available under the permissive ODC-By 1.0 license and extensively validated through hundreds of ablation experiments.\nIn particular, on the set of 9 diverse languages we used to guide our processing decisions, 🥂 FineWeb2… See the full description on the dataset page: https://huggingface.co/datasets/HuggingFaceFW/fineweb-2.","tags":"[\"Task_categories:text-Generation\",\"Language:aai\",\"Language:aak\",\"Language:aau\",\"Language:aaz\",\"Language:aba\"]","license":null,"framework":null,"parameters":null,"downloads":121214,"likes":792,"verified":0,"created_at":"2026-05-02 11:08:12","updated_at":"2026-05-08 00:07:22","source_url":"https://huggingface.co/datasets/HuggingFaceFW/fineweb-2","source_platform":"huggingface","hf_repo_id":"HuggingFaceFW/fineweb-2","ollama_name":"","category":"dataset","latest_version":"v1.0.0","version_count":1,"signature_count":1,"risk_level":null,"risk_score":null,"versions":[{"id":514,"model_id":515,"version":"v1.0.0","manifest_hash":"c3c779056d011559d03a367b11cf6e17b439e0e28f6f6b1ed32ff2654d5ca59e","file_count":0,"total_size":0,"r2_manifest_key":"manifests/datasets/huggingfacefw--fineweb-2/v1.0.0.json","created_at":"2026-05-02 11:08:12"}],"files":[],"signatures":[{"id":1039,"version_id":514,"signer_did":"did:quantamrkt:registry:shield-v1","algorithm":"ML-DSA-65","signature_hex":"e2fbadbe5a57a4ba7cd15db70bcdce283de78c0695f8d51ea4f04022146d0d60","attestation_type":"registry","signed_at":"2026-05-02 11:08:12"}],"hndl":null}