inference/generate.py
import os
import json
from argparse import ArgumentParser
from typing import List

import torch
import torch.distributed as dist
from transformers import AutoTokenizer
from safetensors.torch import load_model

from model import Transformer, ModelArgs


def sample(logits, temperature: float = 1.0):
    """
    Samples a token from the logits using temperature scaling.

    Args:
        logits (torch.Tensor): The logits tensor for token predictions.
        temperature (float, optional): Temperature for scaling logits. Defaults to 1.0.

    Returns:
        torch.Tensor: The sampled token.
    """
    logits = logits / max(temperature, 1e-5)
    probs = torch.softmax(logits, dim=-1, dtype=torch.float32)
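    # Dividing the probabilities by i.i.d. Exponential(1) noise and taking the
    # argmax selects index i with probability probs[i] (an exponential-races
    # variant of the Gumbel-max trick), so no explicit multinomial sampler is
    # needed.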
    return probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1)


@torch.inference_mode()
def generate(
    model: Transformer,
    prompt_tokens: List[List[int]],
    max_new_tokens: int,
    eos_id: int,
    temperature: float = 1.0
) -> List[List[int]]:
    """
    Generates new tokens based on the given prompt tokens using the specified model.

    Args:
        model (Transformer): The transformer model used for token generation.
        prompt_tokens (List[List[int]]): A list of lists containing the prompt tokens for each sequence.
        max_new_tokens (int): The maximum number of new tokens to generate.
        eos_id (int): The end-of-sequence token ID.
        temperature (float, optional): The temperature value for sampling. Defaults to 1.0.

    Returns:
        List[List[int]]: A list of lists containing the generated tokens for each sequence.
    """
    prompt_lens = [len(t) for t in prompt_tokens]
    assert max(prompt_lens) <= model.max_seq_len, f"Prompt length exceeds model maximum sequence length (max_seq_len={model.max_seq_len})"
    total_len = min(model.max_seq_len, max_new_tokens + max(prompt_lens))
    tokens = torch.full((len(prompt_tokens), total_len), -1, dtype=torch.long, device="cuda")
    for i, t in enumerate(prompt_tokens):
        tokens[i, :len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
    prev_pos = 0
    finished = torch.tensor([False] * len(prompt_tokens), device="cuda")
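    # Positions still holding the -1 padding are to be generated; True entries
    # of `prompt_mask` mark positions whose tokens came from the prompt.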
    prompt_mask = tokens != -1
    for cur_pos in range(min(prompt_lens), total_len):
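        # Incremental decoding: only the tokens produced since the last step
        # are fed forward; `prev_pos` tells the model where this slice starts
        # (assuming the Transformer caches keys/values across calls).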
        logits = model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
        if temperature > 0:
            next_token = sample(logits, temperature)
        else:
            next_token = logits.argmax(dim=-1)
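        # While a position is still inside a (longer) prompt, keep the given
        # token rather than the sampled one, so sequences with ragged prompt
        # lengths stay aligned within the batch.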
        next_token = torch.where(prompt_mask[:, cur_pos], tokens[:, cur_pos], next_token)
        tokens[:, cur_pos] = next_token
        finished |= torch.logical_and(~prompt_mask[:, cur_pos], next_token == eos_id)
        prev_pos = cur_pos
        if finished.all():
            break
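    # Strip the prompt prefix from each row and truncate at the first EOS.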
    completion_tokens = []
    for i, toks in enumerate(tokens.tolist()):
        toks = toks[prompt_lens[i]:prompt_lens[i]+max_new_tokens]
        if eos_id in toks:
            toks = toks[:toks.index(eos_id)]
        completion_tokens.append(toks)
    return completion_tokens


def main(
    ckpt_path: str,
    config: str,
    input_file: str = "",
    interactive: bool = True,
    max_new_tokens: int = 100,
    temperature: float = 1.0,
) -> None:
    """
    Main function to load the model and perform interactive or batch text generation.

    Args:
        ckpt_path (str): Path to the model checkpoint directory.
        config (str): Path to the model configuration file.
        input_file (str, optional): Path to a file containing input prompts. Defaults to "".
        interactive (bool, optional): Whether to run in interactive mode. Defaults to True.
        max_new_tokens (int, optional): Maximum number of new tokens to generate. Defaults to 100.
        temperature (float, optional): Temperature for sampling. Defaults to 1.0.
    """
    world_size = int(os.getenv("WORLD_SIZE", "1"))
    rank = int(os.getenv("RANK", "0"))
    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    if world_size > 1:
        dist.init_process_group("nccl")
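    # Suppress printing on all ranks except rank 0 so output from multiple
    # processes does not interleave on the console.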
    global print
    if rank != 0:
        print = lambda *_, **__: None
    torch.cuda.set_device(local_rank)
    torch.set_default_dtype(torch.bfloat16)
    torch.set_num_threads(8)
    torch.manual_seed(33377335)
    with open(config) as f:
        args = ModelArgs(**json.load(f))
    print(args)
    with torch.device("cuda"):
        model = Transformer(args)
    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
    print("Loading model ...")
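    # Checkpoints are sharded per model-parallel rank: each process loads its
    # own `model{rank}-mp{world_size}.safetensors` file.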
    load_model(model, os.path.join(ckpt_path, f"model{rank}-mp{world_size}.safetensors"))
    print("I'm DeepSeek 👋")

    if interactive:
        messages = []
        while True:
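            # Rank 0 reads from stdin; in multi-GPU runs it broadcasts the
            # prompt so every rank enters generate() with the same input.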
            if world_size == 1:
                prompt = input(">>> ")
            elif rank == 0:
                prompt = input(">>> ")
                objects = [prompt]
                dist.broadcast_object_list(objects, 0)
            else:
                objects = [None]
                dist.broadcast_object_list(objects, 0)
                prompt = objects[0]
            if prompt == "/exit":
                break
            elif prompt == "/clear":
                messages.clear()
                continue
            messages.append({"role": "user", "content": prompt})
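            # The whole running conversation is re-templated each turn, so the
            # model re-reads the full chat history on every generate() call.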
            prompt_tokens = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
            completion_tokens = generate(model, [prompt_tokens], max_new_tokens, tokenizer.eos_token_id, temperature)
            completion = tokenizer.decode(completion_tokens[0], skip_special_tokens=True)
            print(completion)
            messages.append({"role": "assistant", "content": completion})
    else:
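        # Batch mode: prompts in the input file are separated by blank lines.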
        with open(input_file) as f:
            prompts = f.read().split("\n\n")
        assert len(prompts) <= args.max_batch_size, f"Number of prompts exceeds maximum batch size ({args.max_batch_size})"
        prompt_tokens = [tokenizer.apply_chat_template([{"role": "user", "content": prompt}], add_generation_prompt=True) for prompt in prompts]
        completion_tokens = generate(model, prompt_tokens, max_new_tokens, tokenizer.eos_token_id, temperature)
        completions = tokenizer.batch_decode(completion_tokens, skip_special_tokens=True)
        for prompt, completion in zip(prompts, completions):
            print("Prompt:", prompt)
            print("Completion:", completion)
            print()

    if world_size > 1:
        dist.destroy_process_group()


if __name__ == "__main__":
    """
    Command-line interface for distributed text generation.

    Arguments:
        --ckpt-path (str): Path to the model checkpoint directory.
        --config (str): Path to the model configuration file.
        --input-file (str, optional): File containing prompts for batch processing.
        --interactive (bool, optional): Enable interactive mode for generating text.
        --max-new-tokens (int, optional): Maximum number of new tokens to generate. Defaults to 200.
        --temperature (float, optional): Temperature for sampling. Defaults to 0.6.

    Raises:
        AssertionError: If neither input-file nor interactive mode is specified.
    """
    parser = ArgumentParser()
    parser.add_argument("--ckpt-path", type=str, required=True)
    parser.add_argument("--config", type=str, required=True)
    parser.add_argument("--input-file", type=str, default="")
    parser.add_argument("--interactive", action="store_true")
    parser.add_argument("--max-new-tokens", type=int, default=200)
    parser.add_argument("--temperature", type=float, default=0.6)
    args = parser.parse_args()
    assert args.input_file or args.interactive, "Either input-file or interactive mode must be specified"
    main(args.ckpt_path, args.config, args.input_file, args.interactive, args.max_new_tokens, args.temperature)