shamashel committed on
Commit
bb4e416
·
1 Parent(s): b4d4e2a

refactor: Update pyproject.toml to include pytorch-gpu dependency and its source URL

Browse files
Files changed (4) hide show
  1. bad_gpt.py +32 -18
  2. dataset.py +2 -1
  3. poetry.lock +0 -0
  4. pyproject.toml +7 -1
bad_gpt.py CHANGED
@@ -55,37 +55,43 @@ class BadGPTModel(nn.Module):
55
 
56
  # Given a 2d matrix of dimensions token and sentence
57
  # generate new tokens in the next sentence
58
- def generate(self, idx: torch.Tensor, max_new_tokens: int):
59
- for _ in range(max_new_tokens):
60
  # Log progress so I don't go insane
61
- if _ % 16 == 0:
62
- logger.debug(f'Iteration {_} of {max_new_tokens}')
63
  # Crop out the last block_size tokens
64
- cropped_idx = idx[:, -self.block_size:]
65
- logits = self(cropped_idx)
66
  # Logits has dimensions token, sentence, token_list
67
  # We want to make a new sentence, so only look at the last sentence
68
  logits = logits[:, -1, :]
69
  # Get possible next tokens and select one
70
  probabilities = F.softmax(logits, dim=-1)
71
- idx_next = torch.multinomial(probabilities, num_samples=1)
72
  # Add the new token to the end of the tensor
73
- idx = torch.cat((idx, idx_next), dim=1)
74
- return idx
75
 
76
 
77
  @torch.no_grad()
78
- def estimate_loss(model: nn.Module, batcher: Batcher, eval_interval: int, device: Literal['cuda', 'cpu'] = 'cuda'):
79
  out = {}
80
- model.eval() # set to eval phase
81
  for split in ['train', 'val']:
82
  losses = torch.zeros(eval_interval)
83
- for k in range(eval_interval):
84
- x, y = batcher.get_batch(split=split)
85
- logits, loss = model(x.to(device), y.to(device))
86
- losses[k] = loss.item()
 
 
 
 
 
 
87
  out[split] = losses.mean()
88
- model.train() # set back to training phase
89
  return out
90
 
91
 
@@ -123,8 +129,14 @@ class BadGPTTrainer():
123
  logger.debug(
124
  f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
125
  context_stack, answer_stack = self.batcher.get_batch(split='train')
126
- _, loss = self.model(context_stack.to(
127
  self.device), answer_stack.to(self.device))
 
 
 
 
 
 
128
  self.optimizer.zero_grad(set_to_none=True)
129
  loss.backward()
130
  self.optimizer.step()
@@ -167,6 +179,8 @@ class BadGPT():
167
  learning_rate=lr
168
  )
169
  self._trainer.train()
 
 
170
 
171
  def generate(self, prompt: str, response_size: int):
172
  start_ids = encode(prompt)
@@ -174,5 +188,5 @@ class BadGPT():
174
  # add batch dimension. it's just 1 batch, but we still need it cuz tensors
175
  context = context[None, ...]
176
  encoded = self._model.generate(
177
- idx=context, max_new_tokens=response_size)[0]
178
  return decode(encoded.tolist())
 
55
 
56
  # Given a 2d matrix of dimensions token and sentence
57
  # generate new tokens in the next sentence
58
+ def generate(self, ctx: torch.Tensor, max_new_tokens: int):
59
+ for index in range(max_new_tokens):
60
  # Log progress so I don't go insane
61
+ if index % 16 == 0:
62
+ logger.debug(f'Iteration {index} of {max_new_tokens}')
63
  # Crop out the last block_size tokens
64
+ cropped_ctx = ctx[:, -self.block_size:]
65
+ logits = self(cropped_ctx)
66
  # Logits has dimensions token, sentence, token_list
67
  # We want to make a new sentence, so only look at the last sentence
68
  logits = logits[:, -1, :]
69
  # Get possible next tokens and select one
70
  probabilities = F.softmax(logits, dim=-1)
71
+ ctx_next = torch.multinomial(probabilities, num_samples=1)
72
  # Add the new token to the end of the tensor
73
+ ctx = torch.cat((ctx, ctx_next), dim=1)
74
+ return ctx
75
 
76
 
77
  @torch.no_grad()
78
+ def estimate_loss(gpt: BadGPTModel, batcher: Batcher, eval_interval: int, device: Literal['cuda', 'cpu'] = 'cuda'):
79
  out = {}
80
+ gpt.eval()
81
  for split in ['train', 'val']:
82
  losses = torch.zeros(eval_interval)
83
+ for epoch in range(eval_interval):
84
+ train, answer = batcher.get_batch(split='train')
85
+ logits = gpt.forward(train)
86
+ # Reformat prediction and answer so each entry can be compared
87
+ batch, block, vocab = logits.shape
88
+ logits = logits.view(batch * block, vocab)
89
+ answer = answer.view(batch * block)
90
+ # Compare entropy of predicted tokens to actual
91
+ loss = F.cross_entropy(logits, answer).item()
92
+ losses[epoch] = loss
93
  out[split] = losses.mean()
94
+ gpt.train()
95
  return out
96
 
97
 
 
129
  logger.debug(
130
  f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
131
  context_stack, answer_stack = self.batcher.get_batch(split='train')
132
+ logits = self.model(context_stack.to(
133
  self.device), answer_stack.to(self.device))
134
+ batch, block, vocab = logits.shape
135
+ # Reformat logits and val so each entry can be compared
136
+ logits = logits.view(batch * block, vocab).to(self.device)
137
+ answer_stack = answer_stack.view(batch * block).to(self.device)
138
+ # Compare predicted tokens to actual
139
+ loss = F.cross_entropy(logits, answer_stack)
140
  self.optimizer.zero_grad(set_to_none=True)
141
  loss.backward()
142
  self.optimizer.step()
 
179
  learning_rate=lr
180
  )
181
  self._trainer.train()
182
+ # set to eval phase since we're only taking user input from here on
183
+ self._model.eval()
184
 
185
  def generate(self, prompt: str, response_size: int):
186
  start_ids = encode(prompt)
 
188
  # add batch dimension. it's just 1 batch, but we still need it cuz tensors
189
  context = context[None, ...]
190
  encoded = self._model.generate(
191
+ ctx=context, max_new_tokens=response_size)[0]
192
  return decode(encoded.tolist())
dataset.py CHANGED
@@ -27,6 +27,7 @@ class Batcher():
27
  def __init__(self, device: Literal['cuda', 'cpu'], batch_size: int, block_size: int):
28
  self.device = device
29
  self.batch_size = batch_size
 
30
  from dataset import make_dataset
31
  train_data = make_dataset('train')
32
  val_data = make_dataset('validation')
@@ -41,5 +42,5 @@ class Batcher():
41
  context_stack = torch.stack(
42
  [data[i:i+self.block_size] for i in random_indexes]).to(self.device)
43
  answer_stack = torch.stack(
44
- [data[i+1:i+self.block_size+1] for i in random_indexes])
45
  return context_stack, answer_stack
 
27
  def __init__(self, device: Literal['cuda', 'cpu'], batch_size: int, block_size: int):
28
  self.device = device
29
  self.batch_size = batch_size
30
+ self.block_size = block_size
31
  from dataset import make_dataset
32
  train_data = make_dataset('train')
33
  val_data = make_dataset('validation')
 
42
  context_stack = torch.stack(
43
  [data[i:i+self.block_size] for i in random_indexes]).to(self.device)
44
  answer_stack = torch.stack(
45
+ [data[i+1:i+self.block_size+1] for i in random_indexes]).to(self.device)
46
  return context_stack, answer_stack
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -8,7 +8,7 @@ package-mode = false
8
 
9
  [tool.poetry.dependencies]
10
  python = "^3.10"
11
- torch = "^2.3.0"
12
  numpy = "^1.26.4"
13
  datasets = "^2.19.0"
14
  tiktoken = "^0.6.0"
@@ -17,6 +17,12 @@ tiktoken = "^0.6.0"
17
  [tool.poetry.group.dev.dependencies]
18
  ipykernel = "^6.29.4"
19
 
 
 
 
 
 
 
20
  [build-system]
21
  requires = ["poetry-core"]
22
  build-backend = "poetry.core.masonry.api"
 
8
 
9
  [tool.poetry.dependencies]
10
  python = "^3.10"
11
+ torch = { version = "^2.3.0", source = "pytorch-gpu" }
12
  numpy = "^1.26.4"
13
  datasets = "^2.19.0"
14
  tiktoken = "^0.6.0"
 
17
  [tool.poetry.group.dev.dependencies]
18
  ipykernel = "^6.29.4"
19
 
20
+
21
+ [[tool.poetry.source]]
22
+ name = "pytorch-gpu"
23
+ url = "https://download.pytorch.org/whl/cu118"
24
+ priority = "supplemental"
25
+
26
  [build-system]
27
  requires = ["poetry-core"]
28
  build-backend = "poetry.core.masonry.api"