{"id":589,"slug":"jobs-git--hplt2.0_cleaned","name":"HPLT2.0_cleaned","author":"jobs-git","description":"This is a large-scale collection of web-crawled documents in 191 world languages, produced by the HPLT project. \nThe source of the data is mostly Internet Archive with some additions from Common Crawl.\nFor a detailed description of the dataset, please refer to https://hplt-project.org/datasets/v2.0\nThe Cleaned variant of HPLT Datasets v2.0\nThis is the cleaned variant of the HPLT Datasets v2.0 converted to the Parquet format semi-automatically when being uploaded here. \nThe original JSONL files… See the full description on the dataset page: https://huggingface.co/datasets/jobs-git/HPLT2.0_cleaned.","tags":"[\"Task_categories:fill-Mask\",\"Task_categories:text-Generation\",\"Task_ids:language-Modeling\",\"Multilinguality:multilingual\",\"Language:ace\",\"Language:af\"]","license":null,"framework":null,"parameters":null,"downloads":153463,"likes":0,"verified":0,"created_at":"2026-06-23 18:23:35","updated_at":"2026-06-29 14:23:35","source_url":"https://huggingface.co/datasets/jobs-git/HPLT2.0_cleaned","source_platform":"huggingface","hf_repo_id":"jobs-git/HPLT2.0_cleaned","ollama_name":"","category":"dataset","latest_version":"v1.0.0","version_count":1,"signature_count":1,"risk_level":null,"risk_score":null,"versions":[{"id":588,"model_id":589,"version":"v1.0.0","manifest_hash":"d03d54ccdc342831cddd26a429811e9b759d19ec2001e77667b291fac75c6194","file_count":0,"total_size":0,"r2_manifest_key":"manifests/datasets/jobs-git--hplt2.0_cleaned/v1.0.0.json","created_at":"2026-06-23 18:23:35"}],"files":[],"signatures":[{"id":1122,"version_id":588,"signer_did":"did:quantamrkt:registry:shield-v1","algorithm":"ML-DSA-65","signature_hex":"0b62a7b6a92facebff9e54c1677324f87c4f0ecd9dc25885e4dea88f730236d5","attestation_type":"registry","signed_at":"2026-06-23 18:23:35"}],"hndl":null}