refactor: Update pyproject.toml to include pytorch-gpu dependency and its source URL
Files changed:
- bad_gpt.py +32 -18
- dataset.py +2 -1
- poetry.lock +0 -0
- pyproject.toml +7 -1
bad_gpt.py CHANGED

@@ -55,37 +55,43 @@ class BadGPTModel(nn.Module):
 
     # Given a 2d matrix of dimensions token and sentence
     # generate new tokens in the next sentence
-    def generate(self, …
-        for …
+    def generate(self, ctx: torch.Tensor, max_new_tokens: int):
+        for index in range(max_new_tokens):
             # Log progress so I don't go insane
-            if …
-            logger.debug(f'Iteration {…
+            if index % 16 == 0:
+                logger.debug(f'Iteration {index} of {max_new_tokens}')
             # Crop out the last block_size tokens
-            …
-            logits = self(…
+            cropped_ctx = ctx[:, -self.block_size:]
+            logits = self(cropped_ctx)
             # Logits has dimensions token, sentence, token_list
             # We want to make a new sentence, so only look at the last sentence
             logits = logits[:, -1, :]
             # Get possible next tokens and select one
             probabilities = F.softmax(logits, dim=-1)
-            …
+            ctx_next = torch.multinomial(probabilities, num_samples=1)
             # Add the new token to the end of the tensor
-            …
-        return …
+            ctx = torch.cat((ctx, ctx_next), dim=1)
+        return ctx
 
 
 @torch.no_grad()
-def estimate_loss(…
+def estimate_loss(gpt: BadGPTModel, batcher: Batcher, eval_interval: int, device: Literal['cuda', 'cpu'] = 'cuda'):
     out = {}
-    …
+    gpt.eval()
     for split in ['train', 'val']:
         losses = torch.zeros(eval_interval)
-        for …
-            …
-            logits …
-            …
+        for epoch in range(eval_interval):
+            train, answer = batcher.get_batch(split='train')
+            logits = gpt.forward(train)
+            # Reformat prediction and answer so each entry can be compared
+            batch, block, vocab = logits.shape
+            logits = logits.view(batch * block, vocab)
+            answer = answer.view(batch * block)
+            # Compare entropy of predicted tokens to actual
+            loss = F.cross_entropy(logits, answer).item()
+            losses[epoch] = loss
         out[split] = losses.mean()
-    …
+    gpt.train()
     return out
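The rewritten generate is a standard autoregressive sampling loop: crop the running context to the last block_size tokens, run the model, softmax only the final position, draw one token with torch.multinomial, append it, and repeat. A minimal runnable sketch of the same pattern (TinyModel below is a hypothetical stand-in for BadGPTModel, not code from this repo):

import torch
import torch.nn.functional as F

# Hypothetical stand-in for BadGPTModel: maps (batch, block) token ids
# to (batch, block, vocab) logits.
class TinyModel(torch.nn.Module):
    def __init__(self, vocab_size=64, block_size=8):
        super().__init__()
        self.block_size = block_size
        self.embed = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, ctx):
        return self.embed(ctx)

    @torch.no_grad()
    def generate(self, ctx, max_new_tokens):
        for _ in range(max_new_tokens):
            cropped = ctx[:, -self.block_size:]            # crop to block_size
            logits = self(cropped)[:, -1, :]               # last position only
            probs = F.softmax(logits, dim=-1)              # distribution over vocab
            nxt = torch.multinomial(probs, num_samples=1)  # sample one token id
            ctx = torch.cat((ctx, nxt), dim=1)             # append and repeat
        return ctx

model = TinyModel()
out = model.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=5)
print(out.shape)  # torch.Size([1, 6])

Sampling from the softmax rather than taking the argmax is what keeps repeated generations from collapsing into the single most likely continuation.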
@@ -123,8 +129,14 @@ class BadGPTTrainer():
             logger.debug(
                 f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
         context_stack, answer_stack = self.batcher.get_batch(split='train')
-        …
+        logits = self.model(context_stack.to(
             self.device), answer_stack.to(self.device))
+        batch, block, vocab = logits.shape
+        # Reformat logits and val so each entry can be compared
+        logits = logits.view(batch * block, vocab).to(self.device)
+        answer_stack = answer_stack.view(batch * block).to(self.device)
+        # Compare predicted tokens to actual
+        loss = F.cross_entropy(logits, answer_stack)
         self.optimizer.zero_grad(set_to_none=True)
         loss.backward()
         self.optimizer.step()
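Both the trainer step and estimate_loss flatten the model output before the loss because F.cross_entropy expects (N, C) scores against (N,) integer class indices. A standalone shape check with random stand-in data:

import torch
import torch.nn.functional as F

batch, block, vocab = 4, 8, 32
logits = torch.randn(batch, block, vocab)         # stand-in model output
answer = torch.randint(0, vocab, (batch, block))  # stand-in target token ids

# Fold batch and block into one dimension so every position is scored
# as an independent classification over the vocabulary.
loss = F.cross_entropy(logits.view(batch * block, vocab),
                       answer.view(batch * block))
print(loss)  # scalar; about ln(32) ≈ 3.47 for uniform random logits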
@@ -167,6 +179,8 @@ class BadGPT():
             learning_rate=lr
         )
         self._trainer.train()
+        # set to eval phase since we're only taking user input from here on
+        self._model.eval()
 
     def generate(self, prompt: str, response_size: int):
         start_ids = encode(prompt)
@@ -174,5 +188,5 @@
         # add batch dimension. it's just 1 batch, but we still need it cuz tensors
         context = context[None, ...]
         encoded = self._model.generate(
-            …
+            ctx=context, max_new_tokens=response_size)[0]
         return decode(encoded.tolist())
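A note on the new self._model.eval() call: eval() and train() only flip module modes such as dropout; they do not disable gradient tracking, which is what the @torch.no_grad() decorators handle. A quick demonstration with a bare Dropout module:

import torch

drop = torch.nn.Dropout(p=0.5)
x = torch.ones(4)

drop.eval()     # inference mode: dropout is a no-op
print(drop(x))  # tensor([1., 1., 1., 1.])

drop.train()    # training mode: entries zeroed, survivors scaled by 1/(1-p)
print(drop(x))  # e.g. tensor([2., 0., 2., 2.])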
dataset.py CHANGED

@@ -27,6 +27,7 @@ class Batcher():
     def __init__(self, device: Literal['cuda', 'cpu'], batch_size: int, block_size: int):
         self.device = device
         self.batch_size = batch_size
+        self.block_size = block_size
         from dataset import make_dataset
         train_data = make_dataset('train')
         val_data = make_dataset('validation')
@@ -41,5 +42,5 @@ class Batcher():
         context_stack = torch.stack(
             [data[i:i+self.block_size] for i in random_indexes]).to(self.device)
         answer_stack = torch.stack(
-            [data[i+1:i+self.block_size+1] for i in random_indexes])
+            [data[i+1:i+self.block_size+1] for i in random_indexes]).to(self.device)
         return context_stack, answer_stack
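The two stacks differ by a one-token shift, so answer_stack[b, t] is the token the model should predict after seeing context_stack[b, :t+1]. A toy illustration (data, i, and block_size mirror the diff; the values are made up):

import torch

data = torch.arange(10)  # stand-in for the tokenized dataset
block_size = 4
i = 2                    # one sampled start index

context = data[i:i+block_size]     # tensor([2, 3, 4, 5])
answer = data[i+1:i+block_size+1]  # tensor([3, 4, 5, 6])
# answer[t] is the target for the prefix context[:t+1]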
poetry.lock CHANGED
The diff for this file is too large to render; see the raw diff.
pyproject.toml CHANGED

@@ -8,7 +8,7 @@ package-mode = false
 
 [tool.poetry.dependencies]
 python = "^3.10"
-torch = "^2.3.0"
+torch = { version = "^2.3.0", source = "pytorch-gpu" }
 numpy = "^1.26.4"
 datasets = "^2.19.0"
 tiktoken = "^0.6.0"
@@ -17,6 +17,12 @@ tiktoken = "^0.6.0"
 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.29.4"
 
+
+[[tool.poetry.source]]
+name = "pytorch-gpu"
+url = "https://download.pytorch.org/whl/cu118"
+priority = "supplemental"
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
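With priority = "supplemental", Poetry keeps PyPI as the default index and consults the PyTorch index in addition to it, while pinning source = "pytorch-gpu" on the torch dependency restricts that one package to the CUDA 11.8 wheel index. In effect, torch resolves from download.pytorch.org and every other dependency keeps coming from PyPI. After a change like this the lockfile has to be regenerated, which is presumably where the large poetry.lock diff above comes from:

poetry lock
poetry install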