{"id":237,"slug":"epfml--fineweb-hq","name":"FineWeb-HQ","author":"epfml","description":"\n\t\n\t\t\n\t\tFineWeb-HQ\n\t\n\n\n\t\n\t\t\n\t\tDataset Summary\n\t\n\nFineWeb-HQ is a high-quality, model-filtered pretraining dataset derived as a subset of FineWeb. FineWeb-HQ was created by selecting the top 10% of FineWeb documents based on a deep learning classifier trained to identify structured and knowledge-rich samples. This classifier uses XLM-RoBERTa embeddings to score documents.\nTo validate our approach, we pretrained 1B-parameter LLM models with a Llama-like architecture across multiple languages and… See the full description on the dataset page: https://huggingface.co/datasets/epfml/FineWeb-HQ.","tags":"[\"Task_categories:text-Generation\",\"Language:en\",\"Size_categories:1B<n<10B\",\"Format:parquet\",\"Modality:tabular\",\"Modality:text\"]","license":null,"framework":null,"parameters":null,"downloads":166753,"likes":7,"verified":0,"created_at":"2026-04-20 18:22:11","updated_at":"2026-04-21 00:00:23","source_url":"https://huggingface.co/datasets/epfml/FineWeb-HQ","source_platform":"huggingface","hf_repo_id":"epfml/FineWeb-HQ","ollama_name":"","category":"dataset","latest_version":"v1.0.0","version_count":1,"signature_count":1,"risk_level":null,"risk_score":null,"versions":[{"id":236,"model_id":237,"version":"v1.0.0","manifest_hash":"1957abc24c421b79703c68e215b73024aab93003ae2fe51d24068d6b1b1d46e0","file_count":0,"total_size":0,"r2_manifest_key":"manifests/datasets/epfml--fineweb-hq/v1.0.0.json","created_at":"2026-04-20 18:22:11"}],"files":[],"signatures":[{"id":598,"version_id":236,"signer_did":"did:quantamrkt:registry:shield-v1","algorithm":"ML-DSA-65","signature_hex":"3683041c49a8988203d0303894efd9bdcd498b785a776cc7374c812e410d4629","attestation_type":"registry","signed_at":"2026-04-20 18:22:11"}],"hndl":null}