Avijit Ghosh committed · 87e696f
Parent(s): 2ee7c27

Added more fields
- app.py +26 -3
- configs/crowspairs.yaml +1 -0
- configs/honest.yaml +1 -0
- configs/ieat.yaml +1 -0
- configs/imagedataleak.yaml +1 -0
- configs/measuringforgetting.yaml +2 -1
- configs/notmyvoice.yaml +1 -0
- configs/palms.yaml +2 -1
- configs/safelatentdiff.yaml +2 -1
- configs/stablebias.yaml +1 -0
- configs/tango.yaml +2 -1
- configs/videodiversemisinfo.yaml +1 -0
- configs/weat.yaml +8 -22
app.py
CHANGED
@@ -70,11 +70,15 @@ def showmodal(evt: gr.SelectData):
     modal = Modal(visible=False)
     titlemd = gr.Markdown("",visible=False)
     authormd = gr.Markdown("",visible=False)
+    affiliationmd = gr.Markdown("",visible=False)
     tagsmd = gr.Markdown("",visible=False)
     abstractmd = gr.Markdown("",visible=False)
+    whatisbeingmd = gr.Markdown("",visible=False)
+    methodmd = gr.Markdown("",visible=False)
     considerationsmd = gr.Markdown("",visible=False)
     modelsmd = gr.Markdown("",visible=False)
     datasetmd = gr.Markdown("",visible=False)
+    metricsmd = gr.Markdown("",visible=False)
     gallery = gr.Gallery([],visible=False)
     if evt.index[1] == 4:
         modal = Modal(visible=True)
@@ -92,26 +96,42 @@ def showmodal(evt: gr.SelectData):
         modelstr = '### Applicable Models: '+''.join(['<span class="tag">'+model+'</span> ' for model in models])
         modelsmd = gr.Markdown(modelstr, visible=True)

+
         titlemd = gr.Markdown('# ['+itemdic['Link']+']('+itemdic['URL']+')',visible=True)

         if pd.notnull(itemdic['Authors']):
             authormd = gr.Markdown('## '+itemdic['Authors'],visible=True)
+
+        if pd.notnull(itemdic['Affiliations']):
+            affiliationmd = gr.Markdown('<strong>Affiliations: </strong>'+ itemdic['Affiliations'],visible=True)

         if pd.notnull(itemdic['Abstract']):
             abstractmd = gr.Markdown(itemdic['Abstract'],visible=True)

+        if pd.notnull(itemdic['What it is evaluating']):
+            whatisbeingmd = gr.Markdown('<strong>Concept being evaluated: </strong>'+ itemdic['What it is evaluating'],visible=True)
+
+        if pd.notnull(itemdic['Methodology']):
+            methodmd = gr.Markdown('<strong>Method of Evaluation: </strong>'+ itemdic['Methodology'],visible=True)
+
         if pd.notnull(itemdic['Considerations']):
             considerationsmd = gr.Markdown('<strong>Considerations: </strong>'+ itemdic['Considerations'],visible=True)

         if pd.notnull(itemdic['Datasets']):
             datasetmd = gr.Markdown('#### [Dataset]('+itemdic['Datasets']+')',visible=True)

+        metrics = itemdic['Metrics']
+        if isinstance(metrics, list):
+            if len(metrics) > 0:
+                metricstr = '### Metrics: '+''.join(['<span class="tag">'+metric+'</span> ' for metric in metrics])
+                metricsmd = gr.Markdown(metricstr, visible=True)
+
         screenshots = itemdic['Screenshots']
         if isinstance(screenshots, list):
             if len(screenshots) > 0:
                 gallery = gr.Gallery(screenshots, visible=True, height=500, object_fit="scale-down", interactive=False, show_share_button=False)

-    return [modal, titlemd, authormd, tagsmd, abstractmd, considerationsmd, modelsmd, datasetmd, gallery]
+    return [modal, titlemd, authormd, affiliationmd, tagsmd, abstractmd, whatisbeingmd, methodmd, considerationsmd, modelsmd, datasetmd, metricsmd, gallery]

 with gr.Blocks(title = "Social Impact Measurement V2", css=custom_css, theme=gr.themes.Base()) as demo: #theme=gr.themes.Soft(),
     # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
@@ -160,11 +180,13 @@ The following categories are high-level, non-exhaustive, and present a synthesis
     with Modal(visible=False) as modal:
         titlemd = gr.Markdown(visible=False)
         authormd = gr.Markdown(visible=False)
+        affiliationmd = gr.Markdown(visible=False)
         tagsmd = gr.Markdown(visible=False)
         abstractmd = gr.Markdown(visible=False)
         gr.Markdown("""## Construct Validity<br>
 ### How well it measures the concept it was designed to evaluate""", visible=True)
-
+        whatisbeingmd = gr.Markdown(visible=False)
+        methodmd = gr.Markdown(visible=False)
         considerationsmd = gr.Markdown(visible=False)
         gr.Markdown("""## Resources<br>
 ### What you need to do this evaluation""", visible=True)
@@ -172,8 +194,9 @@ The following categories are high-level, non-exhaustive, and present a synthesis
         datasetmd = gr.Markdown(visible=False)
         gr.Markdown("""## Results<br>
 ### Available evaluation results""", visible=True)
+        metricsmd = gr.Markdown(visible=False)
         gallery = gr.Gallery(visible=False)
-    table_filtered.select(showmodal, None, [modal, titlemd, authormd, tagsmd, abstractmd, considerationsmd, modelsmd, datasetmd, gallery])
+    table_filtered.select(showmodal, None, [modal, titlemd, authormd, affiliationmd, tagsmd, abstractmd, whatisbeingmd, methodmd, considerationsmd, modelsmd, datasetmd, metricsmd, gallery])
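Both hunks follow one pattern: each new modal field starts as a hidden gr.Markdown placeholder and is populated and made visible only when the selected row carries that field. Scalar fields are guarded with pd.notnull, list fields (Metrics, Screenshots) with isinstance/len, since pd.notnull applied to a list returns an elementwise array rather than a single boolean. A minimal sketch of the two guard styles, with hypothetical itemdic values used purely for illustration:

```python
import pandas as pd

# Hypothetical row; real rows come from the parsed config YAMLs.
itemdic = {
    "Affiliations": "Princeton University, University of Bath",  # scalar field
    "Methodology": float("nan"),                                 # missing scalar
    "Metrics": ["Cosine Similarity", "Effect Size"],             # list field
}

# Scalar guard: pd.notnull is False for NaN, so missing fields stay hidden.
if pd.notnull(itemdic["Affiliations"]):
    print("<strong>Affiliations: </strong>" + itemdic["Affiliations"])

# List guard: pd.notnull on a list would return an elementwise array,
# so list-valued fields are checked with isinstance/len instead.
metrics = itemdic["Metrics"]
if isinstance(metrics, list) and len(metrics) > 0:
    print("### Metrics: " + "".join('<span class="tag">' + m + "</span> " for m in metrics))
```

Note that the list returned by showmodal and the outputs list passed to table_filtered.select must stay in the same order, which is why the two lines change in lockstep in this commit.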
configs/crowspairs.yaml
CHANGED
@@ -17,3 +17,4 @@ Suggested Evaluation: Crow-S Pairs
 Level: Dataset
 URL: https://arxiv.org/abs/2010.00133
 What it is evaluating: Protected class stereotypes
+Metrics: .nan
configs/honest.yaml
CHANGED
@@ -14,3 +14,4 @@ Suggested Evaluation: 'HONEST: Measuring Hurtful Sentence Completion in Language
 Level: Output
 URL: https://aclanthology.org/2021.naacl-main.191.pdf
 What it is evaluating: Protected class stereotypes and hurtful language
+Metrics: .nan
configs/ieat.yaml
CHANGED
@@ -15,3 +15,4 @@ Suggested Evaluation: Image Embedding Association Test (iEAT)
 Level: Model
 URL: https://dl.acm.org/doi/abs/10.1145/3442188.3445932
 What it is evaluating: Embedding associations
+Metrics: .nan
configs/imagedataleak.yaml
CHANGED
@@ -13,3 +13,4 @@ Suggested Evaluation: Dataset leakage and model leakage
 Level: Dataset
 URL: https://arxiv.org/abs/1811.08489
 What it is evaluating: Gender and label bias
+Metrics: .nan
configs/measuringforgetting.yaml
CHANGED
@@ -16,4 +16,5 @@ Screenshots:
 Suggested Evaluation: Measuring forgetting of training examples
 Level: Model
 URL: https://arxiv.org/pdf/2207.00099.pdf
-What it is evaluating: Measure whether models forget training examples over time, over different types of models (image, audio, text) and how order of training affects privacy attacks
+What it is evaluating: Measure whether models forget training examples over time, over different types of models (image, audio, text) and how order of training affects privacy attacks
+Metrics: .nan
configs/notmyvoice.yaml
CHANGED
@@ -14,3 +14,4 @@ Suggested Evaluation: Not My Voice! A Taxonomy of Ethical and Safety Harms of Sp
 Level: Taxonomy
 URL: https://arxiv.org/pdf/2402.01708.pdf
 What it is evaluating: Lists harms of audio/speech generators
+Metrics: .nan
configs/palms.yaml
CHANGED
@@ -11,4 +11,5 @@ Screenshots: .nan
 Suggested Evaluation: Human and Toxicity Evals of Cultural Value Categories
 Level: Output
 URL: http://arxiv.org/abs/2106.10328
-What it is evaluating: Adherence to defined norms for a set of cultural categories
+What it is evaluating: Adherence to defined norms for a set of cultural categories
+Metrics: .nan
configs/safelatentdiff.yaml
CHANGED
@@ -14,4 +14,5 @@ Screenshots:
 Suggested Evaluation: Evaluating text-to-image models for safety
 Level: Output
 URL: https://arxiv.org/pdf/2211.05105.pdf
-What it is evaluating: Generating images for diverse set of prompts (novel I2P benchmark) and investigating how often e.g. violent/nude images will be generated. There is a distinction between implicit and explicit safety, i.e. unsafe results with “normal” prompts.
+What it is evaluating: Generating images for diverse set of prompts (novel I2P benchmark) and investigating how often e.g. violent/nude images will be generated. There is a distinction between implicit and explicit safety, i.e. unsafe results with “normal” prompts.
+Metrics: .nan
configs/stablebias.yaml
CHANGED
@@ -12,3 +12,4 @@ Suggested Evaluation: Characterizing the variation in generated images
 Level: Output
 URL: https://arxiv.org/abs/2303.11408
 What it is evaluating: .nan
+Metrics: .nan
configs/tango.yaml
CHANGED
@@ -16,4 +16,5 @@ Screenshots:
 Suggested Evaluation: Human and Toxicity Evals of Cultural Value Categories
 Level: Output
 URL: http://arxiv.org/abs/2106.10328
-What it is evaluating: Bias measurement for trans and nonbinary community via measuring gender non-affirmative language, specifically 1) misgendering 2), negative responses to gender disclosure
+What it is evaluating: Bias measurement for trans and nonbinary community via measuring gender non-affirmative language, specifically 1) misgendering 2), negative responses to gender disclosure
+Metrics: .nan
configs/videodiversemisinfo.yaml
CHANGED
@@ -14,3 +14,4 @@ Level: Output
 URL: https://arxiv.org/abs/2210.10026
 What it is evaluating: Human led evaluations of deepfakes to understand susceptibility
 and representational harms (including political violence)
+Metrics: .nan
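Every config gains the same placeholder. The plain scalar .nan is YAML's spelling of a float NaN, so the pd.notnull guard in app.py treats the field as absent while the column still exists for every row. A quick check of this behavior, assuming PyYAML:

```python
import math
import yaml

# Mirrors the tail of configs/crowspairs.yaml after this commit.
doc = yaml.safe_load("""
Level: Dataset
URL: https://arxiv.org/abs/2010.00133
What it is evaluating: Protected class stereotypes
Metrics: .nan
""")

# .nan parses as a float NaN, not the string ".nan", so pd.notnull()
# (and math.isnan here) report the field as missing.
assert isinstance(doc["Metrics"], float) and math.isnan(doc["Metrics"])
```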
configs/weat.yaml
CHANGED
@@ -1,25 +1,6 @@
-Abstract: "Artificial intelligence and machine learning are in a
-
-
-  characterizes many human institutions. Here we show for the first time that human-like\
-  \ semantic biases result from the\napplication of standard machine learning to ordinary\
-  \ language\u2014the same sort of language humans are exposed to every\nday. We replicate\
-  \ a spectrum of standard human biases as exposed by the Implicit Association Test\
-  \ and other well-known\npsychological studies. We replicate these using a widely\
-  \ used, purely statistical machine-learning model\u2014namely, the GloVe\nword embedding\u2014\
-  trained on a corpus of text from the Web. Our results indicate that language itself\
-  \ contains recoverable and\naccurate imprints of our historic biases, whether these\
-  \ are morally neutral as towards insects or flowers, problematic as towards\nrace\
-  \ or gender, or even simply veridical, reflecting the status quo for the distribution\
-  \ of gender with respect to careers or first\nnames. These regularities are captured\
-  \ by machine learning along with the rest of semantics. In addition to our empirical\n\
-  findings concerning language, we also contribute new methods for evaluating bias\
-  \ in text, the Word Embedding Association\nTest (WEAT) and the Word Embedding Factual\
-  \ Association Test (WEFAT). Our results have implications not only for AI and\n\
-  machine learning, but also for the fields of psychology, sociology, and human ethics,\
-  \ since they raise the possibility that mere\nexposure to everyday language can\
-  \ account for the biases we replicate here."
-Applicable Models: .nan
+Abstract: "Artificial intelligence and machine learning are currently undergoing rapid growth. Concerns persist regarding their potential to perpetuate biases inherent in human language. This study demonstrates that standard machine learning applied to everyday language reproduces a range of human biases, from implicit associations to societal norms. Using the GloVe word embedding model trained on web text, the research reveals that language itself contains historical biases, whether neutral or problematic. New evaluation methods, WEAT and WEFAT, are introduced. These findings have broad implications for AI, psychology, sociology, and ethics, suggesting that biases may stem from everyday linguistic exposure."
+Applicable Models:
+- GloVe (Opensource access)
 Authors: Aylin Caliskan, Joanna J. Bryson, and Arvind Narayanan
 Considerations: Although based in human associations, general societal attitudes do
 not always represent subgroups of people and cultures.
@@ -40,3 +21,8 @@ Level: Model
 URL: https://researchportal.bath.ac.uk/en/publications/semantics-derived-automatically-from-language-corpora-necessarily
 What it is evaluating: Associations and word embeddings based on Implicit Associations
 Test (IAT)
+Metrics:
+- Cosine Similarity
+- Effect Size
+Affiliations: Princeton University, University of Bath
+Methodology: Effect sizes between two sets of target words (e.g., programmer, engineer, scientist, ... and nurse, teacher, librarian, ...) and two sets of attribute words (e.g., man, male, ... and woman, female ...) are calculated using cosine similarity of the embeddings, with the null hypothesis that an unbaised model would have no difference betwewen the sets.