Spaces:

emilylearning
/

causing_gender_pronouns_two

Runtime error

App Files Files Community

Emily McMilin committed on May 17, 2022

Commit

bae4168

1 Parent(s): 38542cb

adding baseline to plots and some clean up

Browse files

Files changed (1) hide show

app.py +47 -30

app.py CHANGED Viewed

@@ -1,3 +1,6 @@
 from typing import Optional
 import gradio as gr
 import torch
@@ -16,41 +19,35 @@ BASE = 'BERT_base'
 # Play with me, consts
 SUBREDDIT_CONDITIONING_VARIABLES = ["none", "subreddit"]
-WIKIBIO_CONDITIONING_VARIABLES = ['none', 'birth_date',  'birth_place'] # EMILY!!
 BERT_LIKE_MODELS = ["bert", "distilbert"]
-## Internal constants
 GENDER_OPTIONS = ['female', 'male']
 DECIMAL_PLACES = 1
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-MAX_TOKEN_LENGTH = 32
-NON_LOSS_TOKEN_ID = -100
 # Picked ints that will pop out visually during debug
 NON_GENDERED_TOKEN_ID = 30
 LABEL_DICT = {GENDER_OPTIONS[0]: 9, GENDER_OPTIONS[1]: -9}
 CLASSES = list(LABEL_DICT.keys())
-MULTITOKEN_WOMAN_WORD = 'policewoman'
-MULTITOKEN_MAN_WORD = 'spiderman'
 # Wikibio consts
 START_YEAR = 1800
 STOP_YEAR = 1999
 SPLIT_KEY = "DATE"
 # Reddit consts
 # List of randomly selected subreddits (tending towards those with seemingly more
 # gender-neutral words), in order of increasing self-identified female participation.
-# See http://bburky.com/subredditgenderratios/ , Minimum subreddit size: 100000
-# Update: 400000
 SUBREDDITS = [
     "GlobalOffensive",
     "pcmasterrace",
@@ -263,12 +260,16 @@ def get_tokenized_text_with_metadata(input_text, indie_vars, dataset, male_gende
         if len(text_portions) == 1:
             text_portions = ['Born in ', f" {text_portions[0]}"]
     tokenized_w_metadata = {'ids': [], 'atten_mask': [], 'toks': [], 'labels': []}
     for indie_var in indie_vars:
         if dataset == WIKIBIO:
             target_text = f"{indie_var}".join(text_portions)
         else:
             target_text = f"r/{indie_var}: {input_text}"
         tokenized_sample = tokenize_and_append_metadata(
@@ -302,10 +303,22 @@ def get_avg_prob_from_pipeline_outputs(mask_filled_text, gendered_token_ids, num
     ]
     return round(sum(pronoun_preds) / num_preds * 100, DECIMAL_PLACES)
-def get_figure(results, dataset, gender, indie_var_name):
     fig, ax = plt.subplots()
-    ax.plot(results)
     if dataset == REDDIT:
         ax.set_xlabel("Subreddit prepended to input text")
@@ -322,6 +335,7 @@ def predict_gender_pronouns(
     dataset,
     bert_like_models,
     normalizing,
     input_text,
 ):
     """Run inference on input_text for each model type, returning df and plots of percentage
@@ -330,15 +344,14 @@ def predict_gender_pronouns(
     male_gendered_token_ids, female_gendered_token_ids = get_gendered_token_ids(tokenizer)
     if dataset == REDDIT:
-        indie_vars = SUBREDDITS
         conditioning_variables = SUBREDDIT_CONDITIONING_VARIABLES
         indie_var_name = 'subreddit'
     else:
-        indie_vars = np.linspace(START_YEAR, STOP_YEAR, 20).astype(int)
         conditioning_variables = WIKIBIO_CONDITIONING_VARIABLES
         indie_var_name = 'date'
     tokenized = get_tokenized_text_with_metadata(
         input_text,
         indie_vars,
@@ -424,16 +437,15 @@ def predict_gender_pronouns(
         female_dfs.append(pd.DataFrame({prefix : female_pronoun_preds}))
         male_dfs.append(pd.DataFrame({prefix : male_pronoun_preds}))
-    # To display to user as an example
-    toks = tokenized["toks"][0]
-    target_text_w_masks = ' '.join(toks[1:-1])
     # Plots / dataframe for display to users
     female_results = pd.concat(female_dfs, axis=1).set_index(indie_var_name)
     male_results = pd.concat(male_dfs, axis=1).set_index(indie_var_name)
-    female_fig = get_figure(female_results, dataset, "female", indie_var_name)
-    male_fig = get_figure(male_results, dataset, "male", indie_var_name)
     female_results.reset_index(inplace=True)  # Gradio Dataframe doesn't 'see' index?
     male_results.reset_index(inplace=True)  # Gradio Dataframe doesn't 'see' index?
@@ -446,7 +458,6 @@ def predict_gender_pronouns(
     )
 gr.Interface(
     fn=predict_gender_pronouns,
     inputs=[
@@ -469,6 +480,12 @@ gr.Interface(
             default = "True",
             type="index",
         ),
         gr.inputs.Textbox(
             lines=5,
             label="Input Text: Sentence about a single person using some gendered pronouns to refer to them.",

 from typing import Optional
 import gradio as gr
 import torch
 # Play with me, consts
 SUBREDDIT_CONDITIONING_VARIABLES = ["none", "subreddit"]
+WIKIBIO_CONDITIONING_VARIABLES = ['none', 'birth_date']
 BERT_LIKE_MODELS = ["bert", "distilbert"]
+MAX_TOKEN_LENGTH = 32
+# Internal markers for rendering
+BASELINE_MARKER = 'baseline'
+REDDIT_BASELINE_TEXT = ' '
+WIKIBIO_BASELINE_TEXT = 'date'
+## Internal constants from training
 GENDER_OPTIONS = ['female', 'male']
 DECIMAL_PLACES = 1
+MULTITOKEN_WOMAN_WORD = 'policewoman'
+MULTITOKEN_MAN_WORD = 'spiderman'
 # Picked ints that will pop out visually during debug
 NON_GENDERED_TOKEN_ID = 30
 LABEL_DICT = {GENDER_OPTIONS[0]: 9, GENDER_OPTIONS[1]: -9}
 CLASSES = list(LABEL_DICT.keys())
+NON_LOSS_TOKEN_ID = -100
 # Wikibio consts
 START_YEAR = 1800
 STOP_YEAR = 1999
 SPLIT_KEY = "DATE"
 # Reddit consts
 # List of randomly selected subreddits (tending towards those with seemingly more
 # gender-neutral words), in order of increasing self-identified female participation.
+# See http://bburky.com/subredditgenderratios/ , Minimum subreddit size: 400000
 SUBREDDITS = [
     "GlobalOffensive",
     "pcmasterrace",
         if len(text_portions) == 1:
             text_portions = ['Born in ', f" {text_portions[0]}"]
     tokenized_w_metadata = {'ids': [], 'atten_mask': [], 'toks': [], 'labels': []}
     for indie_var in indie_vars:
         if dataset == WIKIBIO:
+            if indie_var == BASELINE_MARKER:
+                indie_var = WIKIBIO_BASELINE_TEXT
             target_text = f"{indie_var}".join(text_portions)
         else:
+            if indie_var == BASELINE_MARKER:
+                indie_var = REDDIT_BASELINE_TEXT
             target_text = f"r/{indie_var}: {input_text}"
         tokenized_sample = tokenize_and_append_metadata(
     ]
     return round(sum(pronoun_preds) / num_preds * 100, DECIMAL_PLACES)
+def get_figure(results, dataset, gender, indie_var_name, include_baseline=True):
+    colors = ['b', 'g', 'c', 'm', 'y', 'r', 'k']  # assert no
+    # Grab then remove baselines from df
+    baseline = results.loc[BASELINE_MARKER]
+    results.drop(index=BASELINE_MARKER, axis=1, inplace=True)
     fig, ax = plt.subplots()
+    for i, col in enumerate(results.columns):
+        ax.plot(results[col],  color=colors[i])#, color=colors)
+    if include_baseline == True:
+        for i, (name, value) in enumerate(baseline.items()):
+            if name == indie_var_name:
+                continue
+            ax.axhline(value, ls='--', color=colors[i])
     if dataset == REDDIT:
         ax.set_xlabel("Subreddit prepended to input text")
     dataset,
     bert_like_models,
     normalizing,
+    include_baseline,
     input_text,
 ):
     """Run inference on input_text for each model type, returning df and plots of percentage
     male_gendered_token_ids, female_gendered_token_ids = get_gendered_token_ids(tokenizer)
     if dataset == REDDIT:
+        indie_vars = [BASELINE_MARKER] + SUBREDDITS
         conditioning_variables = SUBREDDIT_CONDITIONING_VARIABLES
         indie_var_name = 'subreddit'
     else:
+        indie_vars =  [BASELINE_MARKER] + np.linspace(START_YEAR, STOP_YEAR, 20).astype(int).tolist()
         conditioning_variables = WIKIBIO_CONDITIONING_VARIABLES
         indie_var_name = 'date'
     tokenized = get_tokenized_text_with_metadata(
         input_text,
         indie_vars,
         female_dfs.append(pd.DataFrame({prefix : female_pronoun_preds}))
         male_dfs.append(pd.DataFrame({prefix : male_pronoun_preds}))
+    # Pick a sample to display to user as an example
+    toks = tokenized["toks"][3]
+    target_text_w_masks = ' '.join(toks[1:-1]) # Removing [CLS] and [SEP]
     # Plots / dataframe for display to users
     female_results = pd.concat(female_dfs, axis=1).set_index(indie_var_name)
     male_results = pd.concat(male_dfs, axis=1).set_index(indie_var_name)
+    female_fig = get_figure(female_results, dataset, "female", indie_var_name, include_baseline)
+    male_fig = get_figure(male_results, dataset, "male", indie_var_name, include_baseline)
     female_results.reset_index(inplace=True)  # Gradio Dataframe doesn't 'see' index?
     male_results.reset_index(inplace=True)  # Gradio Dataframe doesn't 'see' index?
     )
 gr.Interface(
     fn=predict_gender_pronouns,
     inputs=[
             default = "True",
             type="index",
         ),
+        gr.inputs.Dropdown(
+            ["False", "True"],
+            label="Include baseline predictions (dashed-lines)?",
+            default = "True",
+            type="index",
+        ),
         gr.inputs.Textbox(
             lines=5,
             label="Input Text: Sentence about a single person using some gendered pronouns to refer to them.",