Avijit Ghosh committed on
Commit
87e696f
·
1 Parent(s): 2ee7c27

Added more fields

Browse files
app.py CHANGED
@@ -70,11 +70,15 @@ def showmodal(evt: gr.SelectData):
70
  modal = Modal(visible=False)
71
  titlemd = gr.Markdown("",visible=False)
72
  authormd = gr.Markdown("",visible=False)
 
73
  tagsmd = gr.Markdown("",visible=False)
74
  abstractmd = gr.Markdown("",visible=False)
 
 
75
  considerationsmd = gr.Markdown("",visible=False)
76
  modelsmd = gr.Markdown("",visible=False)
77
  datasetmd = gr.Markdown("",visible=False)
 
78
  gallery = gr.Gallery([],visible=False)
79
  if evt.index[1] == 4:
80
  modal = Modal(visible=True)
@@ -92,26 +96,42 @@ def showmodal(evt: gr.SelectData):
92
  modelstr = '### Applicable Models: '+''.join(['<span class="tag">'+model+'</span> ' for model in models])
93
  modelsmd = gr.Markdown(modelstr, visible=True)
94
 
 
95
  titlemd = gr.Markdown('# ['+itemdic['Link']+']('+itemdic['URL']+')',visible=True)
96
 
97
  if pd.notnull(itemdic['Authors']):
98
  authormd = gr.Markdown('## '+itemdic['Authors'],visible=True)
 
 
 
99
 
100
  if pd.notnull(itemdic['Abstract']):
101
  abstractmd = gr.Markdown(itemdic['Abstract'],visible=True)
102
 
 
 
 
 
 
 
103
  if pd.notnull(itemdic['Considerations']):
104
  considerationsmd = gr.Markdown('<strong>Considerations: </strong>'+ itemdic['Considerations'],visible=True)
105
 
106
  if pd.notnull(itemdic['Datasets']):
107
  datasetmd = gr.Markdown('#### [Dataset]('+itemdic['Datasets']+')',visible=True)
108
 
 
 
 
 
 
 
109
  screenshots = itemdic['Screenshots']
110
  if isinstance(screenshots, list):
111
  if len(screenshots) > 0:
112
  gallery = gr.Gallery(screenshots, visible=True, height=500, object_fit="scale-down", interactive=False, show_share_button=False)
113
 
114
- return [modal, titlemd, authormd, tagsmd, abstractmd, considerationsmd, modelsmd, datasetmd, gallery]
115
 
116
  with gr.Blocks(title = "Social Impact Measurement V2", css=custom_css, theme=gr.themes.Base()) as demo: #theme=gr.themes.Soft(),
117
  # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
@@ -160,11 +180,13 @@ The following categories are high-level, non-exhaustive, and present a synthesis
160
  with Modal(visible=False) as modal:
161
  titlemd = gr.Markdown(visible=False)
162
  authormd = gr.Markdown(visible=False)
 
163
  tagsmd = gr.Markdown(visible=False)
164
  abstractmd = gr.Markdown(visible=False)
165
  gr.Markdown("""## Construct Validity<br>
166
  ### How well it measures the concept it was designed to evaluate""", visible=True)
167
- # gr.Markdown("### What it is evaluating", visible=True)
 
168
  considerationsmd = gr.Markdown(visible=False)
169
  gr.Markdown("""## Resources<br>
170
  ### What you need to do this evaluation""", visible=True)
@@ -172,8 +194,9 @@ The following categories are high-level, non-exhaustive, and present a synthesis
172
  datasetmd = gr.Markdown(visible=False)
173
  gr.Markdown("""## Results<br>
174
  ### Available evaluation results""", visible=True)
 
175
  gallery = gr.Gallery(visible=False)
176
- table_filtered.select(showmodal, None, [modal, titlemd, authormd, tagsmd, abstractmd, considerationsmd, modelsmd, datasetmd, gallery])
177
 
178
 
179
 
 
70
  modal = Modal(visible=False)
71
  titlemd = gr.Markdown("",visible=False)
72
  authormd = gr.Markdown("",visible=False)
73
+ affiliationmd = gr.Markdown("",visible=False)
74
  tagsmd = gr.Markdown("",visible=False)
75
  abstractmd = gr.Markdown("",visible=False)
76
+ whatisbeingmd = gr.Markdown("",visible=False)
77
+ methodmd = gr.Markdown("",visible=False)
78
  considerationsmd = gr.Markdown("",visible=False)
79
  modelsmd = gr.Markdown("",visible=False)
80
  datasetmd = gr.Markdown("",visible=False)
81
+ metricsmd = gr.Markdown("",visible=False)
82
  gallery = gr.Gallery([],visible=False)
83
  if evt.index[1] == 4:
84
  modal = Modal(visible=True)
 
96
  modelstr = '### Applicable Models: '+''.join(['<span class="tag">'+model+'</span> ' for model in models])
97
  modelsmd = gr.Markdown(modelstr, visible=True)
98
 
99
+
100
  titlemd = gr.Markdown('# ['+itemdic['Link']+']('+itemdic['URL']+')',visible=True)
101
 
102
  if pd.notnull(itemdic['Authors']):
103
  authormd = gr.Markdown('## '+itemdic['Authors'],visible=True)
104
+
105
+ if pd.notnull(itemdic['Affiliations']):
106
+ affiliationmd = gr.Markdown('<strong>Affiliations: </strong>'+ itemdic['Affiliations'],visible=True)
107
 
108
  if pd.notnull(itemdic['Abstract']):
109
  abstractmd = gr.Markdown(itemdic['Abstract'],visible=True)
110
 
111
+ if pd.notnull(itemdic['What it is evaluating']):
112
+ whatisbeingmd = gr.Markdown('<strong>Concept being evaluated: </strong>'+ itemdic['What it is evaluating'],visible=True)
113
+
114
+ if pd.notnull(itemdic['Methodology']):
115
+ methodmd = gr.Markdown('<strong>Method of Evaluation: </strong>'+ itemdic['Methodology'],visible=True)
116
+
117
  if pd.notnull(itemdic['Considerations']):
118
  considerationsmd = gr.Markdown('<strong>Considerations: </strong>'+ itemdic['Considerations'],visible=True)
119
 
120
  if pd.notnull(itemdic['Datasets']):
121
  datasetmd = gr.Markdown('#### [Dataset]('+itemdic['Datasets']+')',visible=True)
122
 
123
+ metrics = itemdic['Metrics']
124
+ if isinstance(metrics, list):
125
+ if len(metrics) > 0:
126
+ metricstr = '### Metrics: '+''.join(['<span class="tag">'+metric+'</span> ' for metric in metrics])
127
+ metricsmd = gr.Markdown(metricstr, visible=True)
128
+
129
  screenshots = itemdic['Screenshots']
130
  if isinstance(screenshots, list):
131
  if len(screenshots) > 0:
132
  gallery = gr.Gallery(screenshots, visible=True, height=500, object_fit="scale-down", interactive=False, show_share_button=False)
133
 
134
+ return [modal, titlemd, authormd, affiliationmd, tagsmd, abstractmd, whatisbeingmd, methodmd, considerationsmd, modelsmd, datasetmd, metricsmd, gallery]
135
 
136
  with gr.Blocks(title = "Social Impact Measurement V2", css=custom_css, theme=gr.themes.Base()) as demo: #theme=gr.themes.Soft(),
137
  # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
 
180
  with Modal(visible=False) as modal:
181
  titlemd = gr.Markdown(visible=False)
182
  authormd = gr.Markdown(visible=False)
183
+ affiliationmd = gr.Markdown(visible=False)
184
  tagsmd = gr.Markdown(visible=False)
185
  abstractmd = gr.Markdown(visible=False)
186
  gr.Markdown("""## Construct Validity<br>
187
  ### How well it measures the concept it was designed to evaluate""", visible=True)
188
+ whatisbeingmd = gr.Markdown(visible=False)
189
+ methodmd = gr.Markdown(visible=False)
190
  considerationsmd = gr.Markdown(visible=False)
191
  gr.Markdown("""## Resources<br>
192
  ### What you need to do this evaluation""", visible=True)
 
194
  datasetmd = gr.Markdown(visible=False)
195
  gr.Markdown("""## Results<br>
196
  ### Available evaluation results""", visible=True)
197
+ metricsmd = gr.Markdown(visible=False)
198
  gallery = gr.Gallery(visible=False)
199
+ table_filtered.select(showmodal, None, [modal, titlemd, authormd, affiliationmd, tagsmd, abstractmd, whatisbeingmd, methodmd, considerationsmd, modelsmd, datasetmd, metricsmd, gallery])
200
 
201
 
202
 
configs/crowspairs.yaml CHANGED
@@ -17,3 +17,4 @@ Suggested Evaluation: Crow-S Pairs
17
  Level: Dataset
18
  URL: https://arxiv.org/abs/2010.00133
19
  What it is evaluating: Protected class stereotypes
 
 
17
  Level: Dataset
18
  URL: https://arxiv.org/abs/2010.00133
19
  What it is evaluating: Protected class stereotypes
20
+ Metrics: .nan
configs/honest.yaml CHANGED
@@ -14,3 +14,4 @@ Suggested Evaluation: 'HONEST: Measuring Hurtful Sentence Completion in Language
14
  Level: Output
15
  URL: https://aclanthology.org/2021.naacl-main.191.pdf
16
  What it is evaluating: Protected class stereotypes and hurtful language
 
 
14
  Level: Output
15
  URL: https://aclanthology.org/2021.naacl-main.191.pdf
16
  What it is evaluating: Protected class stereotypes and hurtful language
17
+ Metrics: .nan
configs/ieat.yaml CHANGED
@@ -15,3 +15,4 @@ Suggested Evaluation: Image Embedding Association Test (iEAT)
15
  Level: Model
16
  URL: https://dl.acm.org/doi/abs/10.1145/3442188.3445932
17
  What it is evaluating: Embedding associations
 
 
15
  Level: Model
16
  URL: https://dl.acm.org/doi/abs/10.1145/3442188.3445932
17
  What it is evaluating: Embedding associations
18
+ Metrics: .nan
configs/imagedataleak.yaml CHANGED
@@ -13,3 +13,4 @@ Suggested Evaluation: Dataset leakage and model leakage
13
  Level: Dataset
14
  URL: https://arxiv.org/abs/1811.08489
15
  What it is evaluating: Gender and label bias
 
 
13
  Level: Dataset
14
  URL: https://arxiv.org/abs/1811.08489
15
  What it is evaluating: Gender and label bias
16
+ Metrics: .nan
configs/measuringforgetting.yaml CHANGED
@@ -16,4 +16,5 @@ Screenshots:
16
  Suggested Evaluation: Measuring forgetting of training examples
17
  Level: Model
18
  URL: https://arxiv.org/pdf/2207.00099.pdf
19
- What it is evaluating: Measure whether models forget training examples over time, over different types of models (image, audio, text) and how order of training affects privacy attacks
 
 
16
  Suggested Evaluation: Measuring forgetting of training examples
17
  Level: Model
18
  URL: https://arxiv.org/pdf/2207.00099.pdf
19
+ What it is evaluating: Measure whether models forget training examples over time, over different types of models (image, audio, text) and how order of training affects privacy attacks
20
+ Metrics: .nan
configs/notmyvoice.yaml CHANGED
@@ -14,3 +14,4 @@ Suggested Evaluation: Not My Voice! A Taxonomy of Ethical and Safety Harms of Sp
14
  Level: Taxonomy
15
  URL: https://arxiv.org/pdf/2402.01708.pdf
16
  What it is evaluating: Lists harms of audio/speech generators
 
 
14
  Level: Taxonomy
15
  URL: https://arxiv.org/pdf/2402.01708.pdf
16
  What it is evaluating: Lists harms of audio/speech generators
17
+ Metrics: .nan
configs/palms.yaml CHANGED
@@ -11,4 +11,5 @@ Screenshots: .nan
11
  Suggested Evaluation: Human and Toxicity Evals of Cultural Value Categories
12
  Level: Output
13
  URL: http://arxiv.org/abs/2106.10328
14
- What it is evaluating: Adherence to defined norms for a set of cultural categories
 
 
11
  Suggested Evaluation: Human and Toxicity Evals of Cultural Value Categories
12
  Level: Output
13
  URL: http://arxiv.org/abs/2106.10328
14
+ What it is evaluating: Adherence to defined norms for a set of cultural categories
15
+ Metrics: .nan
configs/safelatentdiff.yaml CHANGED
@@ -14,4 +14,5 @@ Screenshots:
14
  Suggested Evaluation: Evaluating text-to-image models for safety
15
  Level: Output
16
  URL: https://arxiv.org/pdf/2211.05105.pdf
17
- What it is evaluating: Generating images for diverse set of prompts (novel I2P benchmark) and investigating how often e.g. violent/nude images will be generated. There is a distinction between implicit and explicit safety, i.e. unsafe results with “normal” prompts.
 
 
14
  Suggested Evaluation: Evaluating text-to-image models for safety
15
  Level: Output
16
  URL: https://arxiv.org/pdf/2211.05105.pdf
17
+ What it is evaluating: Generating images for diverse set of prompts (novel I2P benchmark) and investigating how often e.g. violent/nude images will be generated. There is a distinction between implicit and explicit safety, i.e. unsafe results with “normal” prompts.
18
+ Metrics: .nan
configs/stablebias.yaml CHANGED
@@ -12,3 +12,4 @@ Suggested Evaluation: Characterizing the variation in generated images
12
  Level: Output
13
  URL: https://arxiv.org/abs/2303.11408
14
  What it is evaluating: .nan
 
 
12
  Level: Output
13
  URL: https://arxiv.org/abs/2303.11408
14
  What it is evaluating: .nan
15
+ Metrics: .nan
configs/tango.yaml CHANGED
@@ -16,4 +16,5 @@ Screenshots:
16
  Suggested Evaluation: Human and Toxicity Evals of Cultural Value Categories
17
  Level: Output
18
  URL: http://arxiv.org/abs/2106.10328
19
- What it is evaluating: Bias measurement for trans and nonbinary community via measuring gender non-affirmative language, specifically 1) misgendering 2), negative responses to gender disclosure
 
 
16
  Suggested Evaluation: Human and Toxicity Evals of Cultural Value Categories
17
  Level: Output
18
  URL: http://arxiv.org/abs/2106.10328
19
+ What it is evaluating: Bias measurement for trans and nonbinary community via measuring gender non-affirmative language, specifically 1) misgendering 2), negative responses to gender disclosure
20
+ Metrics: .nan
configs/videodiversemisinfo.yaml CHANGED
@@ -14,3 +14,4 @@ Level: Output
14
  URL: https://arxiv.org/abs/2210.10026
15
  What it is evaluating: Human led evaluations of deepfakes to understand susceptibility
16
  and representational harms (including political violence)
 
 
14
  URL: https://arxiv.org/abs/2210.10026
15
  What it is evaluating: Human led evaluations of deepfakes to understand susceptibility
16
  and representational harms (including political violence)
17
+ Metrics: .nan
configs/weat.yaml CHANGED
@@ -1,25 +1,6 @@
1
- Abstract: "Artificial intelligence and machine learning are in a period of astounding\
2
- \ growth. However, there are concerns that these\ntechnologies may be used, either\
3
- \ with or without intention, to perpetuate the prejudice and unfairness that unfortunately\n\
4
- characterizes many human institutions. Here we show for the first time that human-like\
5
- \ semantic biases result from the\napplication of standard machine learning to ordinary\
6
- \ language\u2014the same sort of language humans are exposed to every\nday. We replicate\
7
- \ a spectrum of standard human biases as exposed by the Implicit Association Test\
8
- \ and other well-known\npsychological studies. We replicate these using a widely\
9
- \ used, purely statistical machine-learning model\u2014namely, the GloVe\nword embedding\u2014\
10
- trained on a corpus of text from the Web. Our results indicate that language itself\
11
- \ contains recoverable and\naccurate imprints of our historic biases, whether these\
12
- \ are morally neutral as towards insects or flowers, problematic as towards\nrace\
13
- \ or gender, or even simply veridical, reflecting the status quo for the distribution\
14
- \ of gender with respect to careers or first\nnames. These regularities are captured\
15
- \ by machine learning along with the rest of semantics. In addition to our empirical\n\
16
- findings concerning language, we also contribute new methods for evaluating bias\
17
- \ in text, the Word Embedding Association\nTest (WEAT) and the Word Embedding Factual\
18
- \ Association Test (WEFAT). Our results have implications not only for AI and\n\
19
- machine learning, but also for the fields of psychology, sociology, and human ethics,\
20
- \ since they raise the possibility that mere\nexposure to everyday language can\
21
- \ account for the biases we replicate here."
22
- Applicable Models: .nan
23
  Authors: Aylin Caliskan, Joanna J. Bryson, and Arvind Narayanan
24
  Considerations: Although based in human associations, general societal attitudes do
25
  not always represent subgroups of people and cultures.
@@ -40,3 +21,8 @@ Level: Model
40
  URL: https://researchportal.bath.ac.uk/en/publications/semantics-derived-automatically-from-language-corpora-necessarily
41
  What it is evaluating: Associations and word embeddings based on Implicit Associations
42
  Test (IAT)
 
 
 
 
 
 
1
+ Abstract: "Artificial intelligence and machine learning are currently undergoing rapid growth. Concerns persist regarding their potential to perpetuate biases inherent in human language. This study demonstrates that standard machine learning applied to everyday language reproduces a range of human biases, from implicit associations to societal norms. Using the GloVe word embedding model trained on web text, the research reveals that language itself contains historical biases, whether neutral or problematic. New evaluation methods, WEAT and WEFAT, are introduced. These findings have broad implications for AI, psychology, sociology, and ethics, suggesting that biases may stem from everyday linguistic exposure."
2
+ Applicable Models:
3
+ - GloVe (Opensource access)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  Authors: Aylin Caliskan, Joanna J. Bryson, and Arvind Narayanan
5
  Considerations: Although based in human associations, general societal attitudes do
6
  not always represent subgroups of people and cultures.
 
21
  URL: https://researchportal.bath.ac.uk/en/publications/semantics-derived-automatically-from-language-corpora-necessarily
22
  What it is evaluating: Associations and word embeddings based on Implicit Associations
23
  Test (IAT)
24
+ Metrics:
25
+ - Cosine Similarity
26
+ - Effect Size
27
+ Affiliations: Princeton University, University of Bath
28
+ Methodology: Effect sizes between two sets of target words (e.g., programmer, engineer, scientist, ... and nurse, teacher, librarian, ...) and two sets of attribute words (e.g., man, male, ... and woman, female ...) are calculated using cosine similarity of the embeddings, with the null hypothesis that an unbiased model would have no difference between the sets.