lvwerra HF staff committed on
Commit
ac8143c
·
1 Parent(s): 73d6ee6

Update Space (evaluate main: 1a95c8c2)

Browse files
Files changed (4) hide show
  1. README.md +71 -6
  2. app.py +6 -0
  3. requirements.txt +2 -0
  4. text_duplicates.py +82 -0
README.md CHANGED
@@ -1,12 +1,77 @@
1
  ---
2
- title: Text_duplicates
3
- emoji: 📈
4
- colorFrom: red
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 3.0.6
8
  app_file: app.py
9
  pinned: false
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Text Duplicates
3
+ emoji: 🤗
4
+ colorFrom: green
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 3.0.2
8
  app_file: app.py
9
  pinned: false
10
+ tags:
11
+ - evaluate
12
+ - measurement
13
  ---
14
 
15
+ # Measurement Card for Text Duplicates
16
+
17
+ ## Measurement Description
18
+
19
+ The `text_duplicates` measurement returns the fraction of duplicated strings in the input data.
20
+
21
+ ## How to Use
22
+
23
+ This measurement requires a list of strings as input:
24
+
25
+ ```python
26
+ >>> data = ["hello sun","hello moon", "hello sun"]
27
+ >>> duplicates = evaluate.load("text_duplicates")
28
+ >>> results = duplicates.compute(data=data)
29
+ ```
30
+
31
+ ### Inputs
32
+ - **data** (list of `str`): The input list of strings for which the duplicates are calculated.
33
+
34
+ ### Output Values
35
+ - **duplicate_fraction** (`float`): the fraction of duplicates in the input string(s).
36
+ - **duplicates_list** (`dict`): (optional) a dictionary mapping each duplicated string to the number of times it occurs.
37
+
38
+ By default, this measurement outputs a dictionary containing the fraction of duplicates in the input string(s) (`duplicate_fraction`):
39
+
40
+ ```python
41
+ {'duplicate_fraction': 0.33333333333333337}
42
+ ```
43
+
44
+ With the `list_duplicates=True` option, this measurement will also output a dictionary mapping each duplicated string to the number of times it occurs.
45
+
46
+ ```python
47
+ {'duplicate_fraction': 0.33333333333333337, 'duplicates_list': {'hello sun': 2}}
48
+ ```
49
+
50
+ Warning: the `list_duplicates=True` option can be memory-intensive for large datasets.
51
+
52
+ ### Examples
53
+
54
+ Example with no duplicates
55
+
56
+ ```python
57
+ >>> data = ["foo", "bar", "foobar"]
58
+ >>> duplicates = evaluate.load("text_duplicates")
59
+ >>> results = duplicates.compute(data=data)
60
+ >>> print(results)
61
+ {'duplicate_fraction': 0.0}
62
+ ```
63
+
64
+ Example with multiple duplicates and `list_duplicates=True`:
65
+ ```python
66
+ >>> data = ["hello sun", "goodbye moon", "hello sun", "foo bar", "foo bar"]
67
+ >>> duplicates = evaluate.load("text_duplicates")
68
+ >>> results = duplicates.compute(data=data, list_duplicates=True)
69
+ >>> print(results)
70
+ {'duplicate_fraction': 0.4, 'duplicates_list': {'hello sun': 2, 'foo bar': 2}}
71
+ ```
72
+
73
+ ## Citation(s)
74
+
75
+
76
+ ## Further References
77
+ - [`hashlib` library](https://docs.python.org/3/library/hashlib.html)
app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import evaluate
2
+ from evaluate.utils import launch_gradio_widget
3
+
4
+
5
# Load the `text_duplicates` module, explicitly requesting the "measurement"
# module type.
module = evaluate.load("text_duplicates", type="measurement")
# Hand the loaded module to evaluate's helper that launches the Gradio widget
# for this Space.
launch_gradio_widget(module)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ git+https://github.com/huggingface/evaluate.git@main
2
+ datasets~=2.0
text_duplicates.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import evaluate
16
+ import datasets
17
+ from collections import Counter
18
+ import hashlib
19
+
20
# Module-level logger obtained through evaluate's logging helpers.
logger = evaluate.logging.get_logger(__name__)

# Short, human-readable summary of the measurement.
_DESCRIPTION = """
Returns the fraction of duplicate strings in the input and, optionally, the duplicate strings with their counts.
"""

# Argument/return documentation. The examples are kept verbatim so the
# documented outputs stay in sync with what `compute` returns.
_KWARGS_DESCRIPTION = """
Args:
    `data`: a list of `str` to be checked for duplicates.
    `list_duplicates` (`bool`, defaults to `False`): whether to also return the duplicated strings and their counts.

Returns:
    `duplicate_fraction` (`float`) : the fraction of strings that are duplicated.
    `duplicates_list` (`dict`) (optional) : a dictionary mapping each duplicated string to the number of times it occurs.

Examples:
    >>> data = ["hello sun","hello moon", "hello sun"]
    >>> duplicates = evaluate.load("text_duplicates")
    >>> results = duplicates.compute(data=data)
    >>> print(results)
    {'duplicate_fraction': 0.33333333333333337}

    >>> data = ["hello sun","hello moon", "hello sun"]
    >>> duplicates = evaluate.load("text_duplicates")
    >>> results = duplicates.compute(data=data, list_duplicates=True)
    >>> print(results)
    {'duplicate_fraction': 0.33333333333333337, 'duplicates_list': {'hello sun': 2}}
"""

# TODO: Add BibTeX citation
_CITATION = ""
50
def get_hash(example):
    """Return the hexadecimal MD5 digest of a string.

    Leading and trailing whitespace is removed before hashing, so strings
    that differ only in surrounding whitespace produce the same digest.
    """
    normalized = example.strip()
    digest = hashlib.md5(normalized.encode("utf-8"))
    return digest.hexdigest()
53
+
54
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TextDuplicates(evaluate.EvaluationModule):
    """Measurement reporting how much of the input consists of duplicated strings.

    Strings are compared after removing leading/trailing whitespace. The
    result always contains `duplicate_fraction`; the duplicated strings and
    their counts are added on request.
    """

    def _info(self):
        """Return the metadata (type, description, input schema) for this module."""
        return evaluate.EvaluationModuleInfo(
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Each input example is a single string under the `data` key.
            features=datasets.Features({
                'data': datasets.Value('string'),
            })
        )

    def _compute(self, data, list_duplicates=False):
        """Compute the duplicate fraction and, optionally, the duplicated strings.

        Args:
            data: sized collection of strings to inspect.
            list_duplicates: when truthy, also return the duplicated strings
                with their occurrence counts (may be memory-intensive).

        Returns:
            A dict with `duplicate_fraction` (`float`, `0.0` for empty input)
            and, when `list_duplicates` is truthy, `duplicates_list`: a dict
            mapping each duplicated string (surrounding whitespace removed)
            to the number of times it occurs.
        """
        if list_duplicates:
            logger.warning("This functionality can be memory-intensive for large datasets!")

        n_total = len(data)
        if n_total == 0:
            # Nothing to compare; avoid dividing by zero below.
            duplicate_fraction = 0.0
        else:
            n_dedup = len({get_hash(d) for d in data})
            # Kept in the `1 - ratio` form so the floating-point results match
            # the documented example outputs exactly.
            duplicate_fraction = 1 - (n_dedup / n_total)

        results = {"duplicate_fraction": duplicate_fraction}
        if list_duplicates:
            # Count on whitespace-stripped strings, the same normalization the
            # hash-based fraction uses, so both outputs agree on what counts
            # as a duplicate.
            counts = Counter(d.strip() for d in data)
            results["duplicates_list"] = {k: v for k, v in counts.items() if v > 1}
        return results