{"id":691,"slug":"codeparrot--github-code-clean","name":"github-code-clean","author":"codeparrot","description":"The GitHub Code clean dataset is a more filtered version of the codeparrot/github-code dataset. It consists of 115M code files from GitHub in 32 programming languages with 60 extensions, totaling almost 1TB of text data.","tags":"[\"Size_categories:10M<n<100M\",\"Modality:text\",\"Library:datasets\",\"Library:mlcroissant\"]","license":null,"framework":null,"parameters":null,"downloads":117130,"likes":142,"verified":0,"created_at":"2026-07-04 10:23:26","updated_at":"2026-07-04 13:23:31","source_url":"https://huggingface.co/datasets/codeparrot/github-code-clean","source_platform":"huggingface","hf_repo_id":"codeparrot/github-code-clean","ollama_name":"","category":"dataset","latest_version":"v1.0.0","version_count":1,"signature_count":1,"risk_level":null,"risk_score":null,"versions":[{"id":690,"model_id":691,"version":"v1.0.0","manifest_hash":"295af62852a38555427608409cea6fa9c1001c68d5f11b1c8e0b49d56005fe54","file_count":0,"total_size":0,"r2_manifest_key":"manifests/datasets/codeparrot--github-code-clean/v1.0.0.json","created_at":"2026-07-04 10:23:26"}],"files":[],"signatures":[{"id":1252,"version_id":690,"signer_did":"did:quantamrkt:registry:shield-v1","algorithm":"ML-DSA-65","signature_hex":"cbc8ad553ac9c00a7aa525123ffb8ee2179e07da41983ac1e8f2424d21b37e4c","attestation_type":"registry","signed_at":"2026-07-04 10:23:26"}],"hndl":null}