winglian committed on
Commit
a363604
·
unverified ·
1 Parent(s): 501958b

better handling and logging of empty sharegpt turns (#603)

Browse files
src/axolotl/prompt_tokenizers.py CHANGED
@@ -358,10 +358,12 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
358
  ):
359
  if isinstance(part, tuple):
360
  if part[0] == "USER:":
361
- part = part[0] + part[1] if not user_token else part[1]
362
  # this is still the user query, we should
 
 
363
  res = self._tokenize(
364
- part.strip(),
365
  add_eos_token=False,
366
  strip_bos_token=True,
367
  )
@@ -371,10 +373,12 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
371
  labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
372
  elif part[0] == "ASSISTANT:":
373
  # TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
374
- part = part[0] + part[1] if not assistant_token else part[1]
375
- # this should be the assistent response, should end with an eos token
 
 
376
  res = self._tokenize(
377
- part.strip(),
378
  add_eos_token=True,
379
  strip_bos_token=True,
380
  )
@@ -409,22 +413,31 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
409
  raise InvalidDataException(str(err)) from err
410
 
411
  def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
412
- result = self.tokenizer(
413
- prompt,
414
- truncation=True,
415
- max_length=self.sequence_len,
416
- padding=False,
417
- return_tensors=None,
418
- )
 
 
 
 
419
  if (
420
- result["input_ids"][-1] != self.tokenizer.eos_token_id
 
421
  and len(result["input_ids"]) < self.sequence_len
422
  and add_eos_token
423
  ):
424
  result["input_ids"].append(self.tokenizer.eos_token_id)
425
  result["attention_mask"].append(1)
426
 
427
- if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
 
 
 
 
428
  result["input_ids"] = result["input_ids"][1:]
429
  result["attention_mask"] = result["attention_mask"][1:]
430
 
 
358
  ):
359
  if isinstance(part, tuple):
360
  if part[0] == "USER:":
361
+ turn = part[0] + part[1] if not user_token else part[1]
362
  # this is still the user query, we should
363
+ if not part[1].strip():
364
+ LOG.warning(f"user turn has empty text: {prompt}")
365
  res = self._tokenize(
366
+ turn.strip(),
367
  add_eos_token=False,
368
  strip_bos_token=True,
369
  )
 
373
  labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
374
  elif part[0] == "ASSISTANT:":
375
  # TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
376
+ turn = part[0] + part[1] if not assistant_token else part[1]
377
+ # this should be the assistant response, should end with an eos token
378
+ if not part[1].strip():
379
+ LOG.warning(f"assistant turn has empty text: {prompt}")
380
  res = self._tokenize(
381
+ turn.strip(),
382
  add_eos_token=True,
383
  strip_bos_token=True,
384
  )
 
413
  raise InvalidDataException(str(err)) from err
414
 
415
  def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
416
+ if not prompt.strip():
417
+ LOG.warning("Empty text requested for tokenization.")
418
+ result = BatchEncoding(data={"input_ids": [], "attention_mask": []})
419
+ else:
420
+ result = self.tokenizer(
421
+ prompt,
422
+ truncation=True,
423
+ max_length=self.sequence_len,
424
+ padding=False,
425
+ return_tensors=None,
426
+ )
427
  if (
428
+ len(result["input_ids"]) > 0
429
+ and result["input_ids"][-1] != self.tokenizer.eos_token_id
430
  and len(result["input_ids"]) < self.sequence_len
431
  and add_eos_token
432
  ):
433
  result["input_ids"].append(self.tokenizer.eos_token_id)
434
  result["attention_mask"].append(1)
435
 
436
+ if (
437
+ len(result["input_ids"]) > 0
438
+ and result["input_ids"][0] == self.tokenizer.bos_token_id
439
+ and strip_bos_token
440
+ ):
441
  result["input_ids"] = result["input_ids"][1:]
442
  result["attention_mask"] = result["attention_mask"][1:]
443
 
tests/fixtures/conversation.missingturns.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "conversations": [
3
+ {
4
+ "from": "human",
5
+ "value": "how could i implement a minesweeper algorithm that utilises algebraic topology to solve boards?"
6
+ },
7
+ {
8
+ "from": "gpt",
9
+ "value": ""
10
+ },
11
+ {
12
+ "from": "human",
13
+ "value": "could you give me the gist of how it could though?"
14
+ },
15
+ {
16
+ "from": "gpt",
17
+ "value": "Sure. The basic idea would be to use algebraic topology to analyze the structure of the Minesweeper game board and determine the locations of mines based on certain topological invariants. This could involve representing the game board as a topological space and using topological invariants such as homotopy groups and Betti numbers to infer information about the location of mines on the board. However, as I mentioned before, implementing this would require a deep understanding of both algebraic topology and the game of Minesweeper, so it's not something that can be explained in a simple answer."
18
+ },
19
+ {
20
+ "from": "human",
21
+ "value": "give me a specific example"
22
+ },
23
+ {
24
+ "from": "gpt",
25
+ "value": "As a specific example, suppose we have a Minesweeper game board with the following configuration:\n```\n 1 1 1 1 1\n 1 ? ? ? 1\n 1 ? ? ? 1\n 1 ? ? ? 1\n 1 1 1 1 1\n```\nIn this case, the game board can be represented as a topological space, where each square on the board corresponds to a point in the space. The mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space. Using algebraic topology, we can analyze the structure of this space and determine the locations of the mines based on certain topological invariants.\n\nFor instance, one way to do this is to compute the homotopy groups of the space. Homotopy groups are a way of describing the holes in a space, and they can be used to infer information about the locations of the mines on the Minesweeper board. In the case of the board above, we can compute the first homotopy group of the space, which describes the number and \"shape\" of the holes in the space. In this case, the first homotopy group would tell us that there are four holes in the space, arranged in a square shape.\n\nBased on this information, we can infer that the mines in the game are located in the four corners of the board, as shown below:\n```\n 1 1 1 1 1\n 1 M ? ? 1\n 1 ? ? ? 1\n 1 ? ? M 1\n 1 1 1 1 1\n```\nOf course, this is just one example of how algebraic topology could be used to solve Minesweeper. There are many other possible approaches, and the specific details of how to implement them would depend on the specific game board and the topological invariants being used. As I mentioned before, implementing a solution like this would require a deep understanding of both algebraic topology and the game of Minesweeper."
26
+ },
27
+ {
28
+ "from": "human",
29
+ "value": "assuming i was someone with a deep understanding of both, could you explain the same concept to me with that assumption?"
30
+ },
31
+ {
32
+ "from": "gpt",
33
+ "value": "Sure. Assuming you have a deep understanding of both algebraic topology and the game of Minesweeper, here is how you might use algebraic topology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Use topological invariants to infer information about the locations of the mines on the board. There are many possible topological invariants that could be used for this, such as homotopy groups, Betti numbers, or persistent homology.\n3. Use the information inferred from the topological invariants to determine the locations of the mines on the board. For example, if the first homotopy group of the space indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using algebraic topology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the topological invariants being used. However, this should give you a general idea of how algebraic topology can be used to solve Minesweeper."
34
+ },
35
+ {
36
+ "from": "human",
37
+ "value": "how would i do it with persistent homology or any of the other topological invariants?"
38
+ },
39
+ {
40
+ "from": "gpt",
41
+ "value": "To use persistent homology or any of the other topological invariants to solve a Minesweeper game board, you would follow the same basic steps as described above, but with some differences in the specific details.\n\nFor instance, here is how you might use persistent homology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Compute the persistent homology of the space. Persistent homology is a way of describing the holes in a space over time, and it can be used to infer information about the locations of the mines on the Minesweeper board.\n3. Use the information from the persistent homology to determine the locations of the mines on the board. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using persistent homology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the specific details of the persistent homology computation. However, this should give you a general idea of how persistent homology can be used to solve Minesweeper."
42
+ },
43
+ {
44
+ "from": "human",
45
+ "value": "how could i implement this in python?"
46
+ },
47
+ {
48
+ "from": "gpt",
49
+ "value": "To implement a Minesweeper solver that uses algebraic topology in Python, you would need to do the following:\n\n1. Install a Python package for algebraic topology. There are several options available, such as Gudhi, Dionysus, or scikit-tda. These packages provide algorithms and data structures for computing topological invariants, such as homotopy groups, Betti numbers, or persistent homology.\n2. Represent the Minesweeper game board as a topological space. This could be done using a data structure provided by the algebraic topology package, such as a simplicial complex or a Vietoris-Rips complex. Each square on the board would correspond to a point in the space, and the mines on the board would be represented as \"holes\" in the space.\n3. Compute the topological invariants of the space. This could be done using algorithms provided by the algebraic topology package. For example, you could compute the persistent homology of the space using the persistent homology algorithm.\n4. Use the information from the topological invariants to determine the locations of the mines on the board. This could be done using a combination of mathematical reasoning and programming. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then you could use a piece of code to infer that the mines are located in the four corners of the board.\n5. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you could use a piece of code to automatically click on the safe squares and clear the board.\n\nOf course, this is just one possible approach to implementing a Minesweeper solver that uses algebraic topology in Python. There may be other ways to do it, depending on the specific details of the implementation. However, this should give you a general idea of how to get started with such a project."
50
+ }
51
+ ]
52
+ }
tests/test_prompt_tokenizers.py CHANGED
@@ -3,7 +3,9 @@ import json
3
  import logging
4
  import unittest
5
  from pathlib import Path
 
6
 
 
7
  from transformers import AutoTokenizer, LlamaTokenizer
8
 
9
  from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
@@ -29,6 +31,12 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
29
  Test class for prompt tokenization strategies.
30
  """
31
 
 
 
 
 
 
 
32
  def setUp(self) -> None:
33
  # pylint: disable=duplicate-code
34
  self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
@@ -64,6 +72,24 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
64
  self.assertEqual(len(example[fields]), len(tokenized_conversation[fields]))
65
  self.assertEqual(example[fields], tokenized_conversation[fields])
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def test_no_sys_prompt(self):
68
  """
69
  tests the interface between the user and assistant parts
 
3
  import logging
4
  import unittest
5
  from pathlib import Path
6
+ from typing import Optional
7
 
8
+ import pytest
9
  from transformers import AutoTokenizer, LlamaTokenizer
10
 
11
  from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
 
31
  Test class for prompt tokenization strategies.
32
  """
33
 
34
+ _caplog: Optional[pytest.LogCaptureFixture] = None
35
+
36
+ @pytest.fixture(autouse=True)
37
+ def inject_fixtures(self, caplog):
38
+ self._caplog = caplog
39
+
40
  def setUp(self) -> None:
41
  # pylint: disable=duplicate-code
42
  self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
 
72
  self.assertEqual(len(example[fields]), len(tokenized_conversation[fields]))
73
  self.assertEqual(example[fields], tokenized_conversation[fields])
74
 
75
+ def test_sharegpt_warnings_integration(self):
76
+ with open(
77
+ Path(__file__).parent / "fixtures/conversation.missingturns.json",
78
+ encoding="utf-8",
79
+ ) as fin:
80
+ data = fin.read()
81
+ conversation = json.loads(data)
82
+ prompter = ShareGPTPrompter("chat")
83
+ strat = ShareGPTPromptTokenizingStrategy(
84
+ prompter,
85
+ self.tokenizer,
86
+ False,
87
+ 2048,
88
+ )
89
+ with self._caplog.at_level(logging.WARNING):
90
+ strat.tokenize_prompt(conversation)
91
+ assert "assistant turn has empty text" in self._caplog.records[1].message
92
+
93
  def test_no_sys_prompt(self):
94
  """
95
  tests the interface between the user and assistant parts