christopher committed on
Commit
3a67ca3
·
1 Parent(s): 1bc48d2

Initial commit

Browse files
Files changed (2) hide show
  1. README.md +0 -2
  2. tokens_per_byte.py +7 -7
README.md CHANGED
@@ -1,7 +1,5 @@
1
  ---
2
  title: Tokens per Byte
3
- datasets:
4
- -
5
  tags:
6
  - evaluate
7
  - measurement
 
1
  ---
2
  title: Tokens per Byte
 
 
3
  tags:
4
  - evaluate
5
  - measurement
tokens_per_byte.py CHANGED
@@ -71,8 +71,7 @@ class TokensperByte(evaluate.Measurement):
71
  inputs_description=_KWARGS_DESCRIPTION,
72
  # This defines the format of each prediction and reference
73
  features=datasets.Features({
74
- 'predictions': datasets.Value('int64'),
75
- 'references': datasets.Value('int64'),
76
  }),
77
  # Homepage of the module for documentation
78
  homepage="http://module.homepage",
@@ -86,10 +85,11 @@ class TokensperByte(evaluate.Measurement):
86
  # TODO: Download external resources if needed
87
  pass
88
 
89
- def _compute(self, predictions, references):
90
  """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
 
93
  return {
94
- "accuracy": accuracy,
95
- }
 
71
  inputs_description=_KWARGS_DESCRIPTION,
72
  # This defines the format of each prediction and reference
73
  features=datasets.Features({
74
+ 'text':datasets.Value("string"),
 
75
  }),
76
  # Homepage of the module for documentation
77
  homepage="http://module.homepage",
 
85
  # TODO: Download external resources if needed
86
  pass
87
 
88
+ def _compute(self, text, tokenizer):
89
  """Returns the scores"""
90
+ num_tokens = sum(tokenizer(text, return_length=True, return_attention_mask=False, add_special_tokens=False, return_token_type_ids=False)["length"])
91
+ num_bytes = sum([len(s.encode('utf-8')) for s in text])
92
+
93
  return {
94
+ "tokens_per_byte": num_tokens / num_bytes,
95
+ }