lvwerra (HF staff) committed
Commit 4292831 · 1 Parent(s): e31a688

Update Space (evaluate main: d781f85c)

Files changed (1):
text_duplicates.py (+18 −10)
text_duplicates.py CHANGED
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import evaluate
-import datasets
-from collections import Counter
 import hashlib
+from collections import Counter
+
+import datasets
+
+import evaluate
+
 
 logger = evaluate.logging.get_logger(__name__)
 
@@ -47,10 +50,13 @@ Examples:
 
 # TODO: Add BibTeX citation
 _CITATION = ""
+
+
 def get_hash(example):
     """Get the hash of a string"""
     return hashlib.md5(example.strip().encode("utf-8")).hexdigest()
 
+
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class TextDuplicates(evaluate.Measurement):
     """This measurement returns the duplicate strings contained in the input(s)."""
@@ -64,19 +70,21 @@ class TextDuplicates(evaluate.Measurement):
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'data': datasets.Value('string'),
-            })
+            features=datasets.Features(
+                {
+                    "data": datasets.Value("string"),
+                }
+            ),
         )
 
-    def _compute(self, data, list_duplicates = False):
+    def _compute(self, data, list_duplicates=False):
         """Returns the duplicates contained in the input data and the number of times they are repeated."""
         if list_duplicates == True:
             logger.warning("This functionality can be memory-intensive for large datasets!")
             n_dedup = len(set([get_hash(d) for d in data]))
             c = Counter(data)
             duplicates = {k: v for k, v in c.items() if v > 1}
-            return {"duplicate_fraction": 1 - (n_dedup/len(data)), "duplicates_list": duplicates}
+            return {"duplicate_fraction": 1 - (n_dedup / len(data)), "duplicates_list": duplicates}
         else:
-            n_dedup = len(set([get_hash(d) for d in data]))
-            return {"duplicate_fraction": 1 - (n_dedup/len(data))}
+            n_dedup = len(set([get_hash(d) for d in data]))
+            return {"duplicate_fraction": 1 - (n_dedup / len(data))}
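For reference, a minimal usage sketch of the measurement this file implements, assuming the standard evaluate loading API (the printed outputs are illustrative, worked out from the _compute logic above):

import evaluate

# Load the measurement module.
duplicates = evaluate.load("text_duplicates", module_type="measurement")

data = ["hello", "world", "hello", "hello"]

# Default: only the fraction of duplicated entries is returned
# (4 items, 2 unique MD5 hashes -> 1 - 2/4 = 0.5).
results = duplicates.compute(data=data)
print(results)  # {'duplicate_fraction': 0.5}

# list_duplicates=True also returns each duplicated string with its count;
# as the module's logger warns, this can be memory-intensive for large datasets.
results = duplicates.compute(data=data, list_duplicates=True)
print(results)  # {'duplicate_fraction': 0.5, 'duplicates_list': {'hello': 3}}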