lvwerra HF staff committed on
Commit
ac0977e
·
1 Parent(s): 1fb8130

Update Space (evaluate main: f4aba41f)

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -1
  2. text_duplicates.py +3 -3
requirements.txt CHANGED
@@ -1 +1 @@
1
- git+https://github.com/huggingface/evaluate.git@ce94aee21a54915d2f045b5a779d0927e3d1eacd
 
1
+ git+https://github.com/huggingface/evaluate.git@f4aba41fdabe7f42cf6c7dcd5bfab6dd83adfd30
text_duplicates.py CHANGED
@@ -32,7 +32,7 @@ Args:
32
 
33
  Returns:
34
  `duplicate_fraction` (`float`) : the fraction of strings that are duplicated.
35
- `duplicates_list` (`dict`) (optional) : a dictionary containing tuples with the duplicate strings and the number of times they are repeated.
36
 
37
  Examples:
38
  >>> data = ["hello sun","hello moon", "hello sun"]
@@ -45,7 +45,7 @@ Examples:
45
  >>> duplicates = evaluate.load("text_duplicates")
46
  >>> results = duplicates.compute(data=data, list_duplicates=True)
47
  >>> print(results)
48
- {'duplicate_fraction': 0.33333333333333337, 'duplicates_list': {'hello sun': 2}}
49
  """
50
 
51
  # TODO: Add BibTeX citation
@@ -84,7 +84,7 @@ class TextDuplicates(evaluate.Measurement):
84
  n_dedup = len(set([get_hash(d) for d in data]))
85
  c = Counter(data)
86
  duplicates = {k: v for k, v in c.items() if v > 1}
87
- return {"duplicate_fraction": 1 - (n_dedup / len(data)), "duplicates_list": duplicates}
88
  else:
89
  n_dedup = len(set([get_hash(d) for d in data]))
90
  return {"duplicate_fraction": 1 - (n_dedup / len(data))}
 
32
 
33
  Returns:
34
  `duplicate_fraction` (`float`) : the fraction of strings that are duplicated.
35
+ `duplicates_dict` (`dict`) (optional) : a dictionary mapping each duplicated string to the number of times it is repeated.
36
 
37
  Examples:
38
  >>> data = ["hello sun","hello moon", "hello sun"]
 
45
  >>> duplicates = evaluate.load("text_duplicates")
46
  >>> results = duplicates.compute(data=data, list_duplicates=True)
47
  >>> print(results)
48
+ {'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}}
49
  """
50
 
51
  # TODO: Add BibTeX citation
 
84
  n_dedup = len(set([get_hash(d) for d in data]))
85
  c = Counter(data)
86
  duplicates = {k: v for k, v in c.items() if v > 1}
87
+ return {"duplicate_fraction": 1 - (n_dedup / len(data)), "duplicates_dict": duplicates}
88
  else:
89
  n_dedup = len(set([get_hash(d) for d in data]))
90
  return {"duplicate_fraction": 1 - (n_dedup / len(data))}