{"id":533,"slug":"mlfoundations--mint-1t-html","name":"MINT-1T-HTML","author":"mlfoundations","description":"\n  🍃 MINT-1T:Scaling Open-Source Multimodal Data by 10x: A Multimodal Dataset with One Trillion Tokens\n\n\n🍃 MINT-1T is an open-source Multimodal INTerleaved dataset with 1 trillion text tokens and 3.4 billion images, a 10x scale-up from existing open-source datasets. Additionally, we include previously untapped sources such as PDFs and ArXiv papers. 🍃 MINT-1T is designed to facilitate research in multimodal pretraining. 🍃 MINT-1T is created by a team from the University of Washington in… See the full description on the dataset page: https://huggingface.co/datasets/mlfoundations/MINT-1T-HTML.","tags":"[\"Task_categories:image-To-Text\",\"Task_categories:text-Generation\",\"Language:en\",\"Size_categories:100M<n<1B\",\"Format:parquet\",\"Modality:text\"]","license":null,"framework":null,"parameters":null,"downloads":138820,"likes":94,"verified":0,"created_at":"2026-05-06 14:47:30","updated_at":"2026-05-08 14:17:39","source_url":"https://huggingface.co/datasets/mlfoundations/MINT-1T-HTML","source_platform":"huggingface","hf_repo_id":"mlfoundations/MINT-1T-HTML","ollama_name":"","category":"dataset","latest_version":"v1.0.0","version_count":1,"signature_count":1,"risk_level":null,"risk_score":null,"versions":[{"id":532,"model_id":533,"version":"v1.0.0","manifest_hash":"6eb15dfc1aad62a1abd20ae7f7137d30095a127559a81523b45a3573ebaf3027","file_count":0,"total_size":0,"r2_manifest_key":"manifests/datasets/mlfoundations--mint-1t-html/v1.0.0.json","created_at":"2026-05-06 14:47:30"}],"files":[],"signatures":[{"id":1057,"version_id":532,"signer_did":"did:quantamrkt:registry:shield-v1","algorithm":"ML-DSA-65","signature_hex":"ec03be4cdf505d864aadc31701cf0aaec3cfe0a9f296fe73e744ea9cad72c0ba","attestation_type":"registry","signed_at":"2026-05-06 14:47:30"}],"hndl":null}