Update Space (evaluate main: f4aba41f)
Browse files
- requirements.txt +1 -1
- text_duplicates.py +3 -3
requirements.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
git+https://github.com/huggingface/evaluate.git@
|
|
|
1 |
+
git+https://github.com/huggingface/evaluate.git@f4aba41fdabe7f42cf6c7dcd5bfab6dd83adfd30
|
text_duplicates.py
CHANGED
@@ -32,7 +32,7 @@ Args:
|
|
32 |
|
33 |
Returns:
|
34 |
`duplicate_fraction` (`float`) : the fraction of strings that are duplicated.
|
35 |
-
`
|
36 |
|
37 |
Examples:
|
38 |
>>> data = ["hello sun","hello moon", "hello sun"]
|
@@ -45,7 +45,7 @@ Examples:
|
|
45 |
>>> duplicates = evaluate.load("text_duplicates")
|
46 |
>>> results = duplicates.compute(data=data, list_duplicates=True)
|
47 |
>>> print(results)
|
48 |
-
{'duplicate_fraction': 0.33333333333333337, '
|
49 |
"""
|
50 |
|
51 |
# TODO: Add BibTeX citation
|
@@ -84,7 +84,7 @@ class TextDuplicates(evaluate.Measurement):
|
|
84 |
n_dedup = len(set([get_hash(d) for d in data]))
|
85 |
c = Counter(data)
|
86 |
duplicates = {k: v for k, v in c.items() if v > 1}
|
87 |
-
return {"duplicate_fraction": 1 - (n_dedup / len(data)), "
|
88 |
else:
|
89 |
n_dedup = len(set([get_hash(d) for d in data]))
|
90 |
return {"duplicate_fraction": 1 - (n_dedup / len(data))}
|
|
|
32 |
|
33 |
Returns:
|
34 |
`duplicate_fraction` (`float`) : the fraction of strings that are duplicated.
|
35 |
+
`duplicates_dict` (`dict`) (optional) : a dictionary mapping each duplicated string to the number of times it appears in the data.
|
36 |
|
37 |
Examples:
|
38 |
>>> data = ["hello sun","hello moon", "hello sun"]
|
|
|
45 |
>>> duplicates = evaluate.load("text_duplicates")
|
46 |
>>> results = duplicates.compute(data=data, list_duplicates=True)
|
47 |
>>> print(results)
|
48 |
+
{'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}}
|
49 |
"""
|
50 |
|
51 |
# TODO: Add BibTeX citation
|
|
|
84 |
n_dedup = len(set([get_hash(d) for d in data]))
|
85 |
c = Counter(data)
|
86 |
duplicates = {k: v for k, v in c.items() if v > 1}
|
87 |
+
return {"duplicate_fraction": 1 - (n_dedup / len(data)), "duplicates_dict": duplicates}
|
88 |
else:
|
89 |
n_dedup = len(set([get_hash(d) for d in data]))
|
90 |
return {"duplicate_fraction": 1 - (n_dedup / len(data))}
|