File size: 7,362 Bytes
323a0f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4785c60
323a0f2
4785c60
323a0f2
4785c60
323a0f2
4785c60
323a0f2
 
4785c60
323a0f2
4785c60
323a0f2
4785c60
323a0f2
4785c60
323a0f2
4785c60
323a0f2
 
 
 
 
4785c60
323a0f2
4785c60
323a0f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5ab1ee
 
 
 
 
 
 
 
 
 
 
 
 
4785c60
 
c5ab1ee
4785c60
c5ab1ee
4785c60
 
 
 
 
c5ab1ee
4785c60
c5ab1ee
4785c60
9e7a8dc
 
 
c5ab1ee
 
 
323a0f2
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import gradio as gr
import os

from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base model plus a LoRA adapter fine-tuned for content-policy classification.
base_model_name = "google/gemma-7b"
# Earlier adapter checkpoints, kept for experiment history:
#adapter_model_name = "samidh/cope-g2b-2c-hs-skr-s1.5.9-sx-sk-s5.d25"
#adapter_model_name = "samidh/cope-g2b-2c-hs-skr-s1.5.9-sx-sk-s1.5.l1e4-e10-d25"
#adapter_model_name = "samidh/cope-g2b-2c-hs-s1.f5.9.l5e5-e10-d25-r8"
#adapter_model_name = "samidh/cope-g2b-2c-hs.s1.5.9-sx.s1.5.9o-hr.s5-sh.s5.l1e4-e10-d25-r8"
#adapter_model_name = "samidh/cope-ap-g2b-2c-hs.s1.5.9-sx.s1.5.9o-vl.s1.5.9-hr.s5-sh.s5.l1e4-e5-d25-r8"
#adapter_model_name = "samidh/cope-g2b-2c-hs.s1.5pcf.9.l5e5-e10-d25-r8"
#adapter_model_name = "samidh/cope-g7bq-2c-hs.s1.5fpc.9-sx.s1.5.9o-VL.s1.5.9-HR.s5-SH.s5-l1e4-e5-d25-r8"
adapter_model_name = "samidh/cope-g7bq-2c-hs.s1.5fpc.9-sx.s1.5.9o-VL.s1.5.9-HR.s5-SH.s5-l5e5-e3-d25-r8"

# Load the base model, attach the LoRA adapter, then fold the adapter weights
# into the base model so inference runs without the PEFT indirection.
model = AutoModelForCausalLM.from_pretrained(base_model_name, token=os.environ['HF_TOKEN'])
model = PeftModel.from_pretrained(model, adapter_model_name, token=os.environ['HF_TOKEN'])
# BUG FIX: merge_and_unload() RETURNS the merged model; the original code
# discarded the return value, leaving `model` bound to the un-merged
# PeftModel wrapper instead of the merged base model.
model = model.merge_and_unload()

# The tokenizer comes from the base model; the adapter does not change it.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Prompt template sent to the model. `{policy}` and `{content}` are filled in
# by predict(); the model is expected to answer with a single token, "0" or "1",
# immediately after the "ANSWER" header.
PROMPT = """
INSTRUCTIONS
============
Examine the given POLICY and determine if the given CONTENT meets the criteria for ANY of the LABELS. Answer "1" if yes, and "0" if no.

POLICY
======

{policy}


CONTENT
=======

{content}


ANSWER
======

"""

# Default policy pre-filled in the UI's "Policy" textbox: a hate-speech
# classification policy with explicit Includes/Excludes sections. Users can
# replace it with their own policy text. NOTE: this is runtime prompt data;
# its wording (including the doubled quotes) must not be edited casually.
DEFAULT_POLICY = """# Policy

## Overview

This policy guides the classification of content related to expressions that may be sensitive or controversial, aiming to foster a community where open dialogue is encouraged while maintaining respect for all individuals.

## Definition of Terms

- Protected Classes: Individuals or groups of individuals defined by sharing a race, ethnicity, religion, disability, sex, gender identity, or sexual orientation. The scope is narrowed to focus on characteristics commonly recognized in anti-discrimination laws. Religions themselves, as concepts ARE NOT protected - only their members are.
- Non-Protected Classes: Groups defined by other characteristics, such as age, socioeconomic status, and education, which are typically excluded partially or entirely from anti-discrimination laws.

## Interpretation of Language

## Definition of Labels

### (HS): Hate Speech

#### Includes

- Explicit Violence Advocacy: Direct and clear threats of or praise for violence or harm against individuals or groups based on their membership in a particular protected class. Stating an intention to defend against aggression DOES NOT qualify
- Intentional Dehumanization: Statements that clearly depict individuals or groups as inherently ""other"", alien, animalistic, unintelligent, immoral, unclean, or less-than-fully-human based on their membership in a particular protected class in a way that justifies harm or discrimination.
- Targeted Use of Derogatory Slurs: Targeting another person or group of people using a one-word name for a particular protected class that has an inherent negative connotation (e.g. Nigger, Kike, Cunt, Retard). Multi-word terms are never slurs.
- Explicit Discrimination Advocacy: Direct and clear calls for exclusion, segregation, or discrimination against individuals or groups based on their membership in a particular protected class, with a clear intent to promote inequality.
- Direct Hateful Insults: Content that directly addresses another person or group of people the second person (e.g. ""You over there"") and insults them based on their membership in a particular protected class

#### Excludes

- Artistic and Educational Content: Expressions intended for artistic, educational, or documentary purposes that discuss sensitive topics but do not advocate for violence or discrimination against individuals or groups based on their membership in a particular protected class.
- Political and Social Commentary: Commentary on political issues, social issues, and political ideologies that does not directly incite violence or discrimination against individuals or groups based on their membership in a particular protected class.
- Rebutting Hateful Language: Content that rebuts, condemns, questions, criticizes, or mocks a different person's hateful language or ideas OR that insults the person advocating those hateful
- Quoting Hateful Language: Content in which the author quotes someone else's hateful language or ideas while discussing, explaining, or neutrally factually presenting those ideas.
- Describing Sectarian Violence: Content that describes, but does not endorse or praise, violent physical injury against a specifically named race, ethnicity, nationality, sexual orientation, or religious community by another specifically named race, ethnicity, nationality, sexual orientation, or religious community
"""

# Default content pre-filled in the UI's "Content" textbox.
DEFAULT_CONTENT = "LLMs steal our jobs."

def predict(content, policy):
    """Classify `content` against `policy` with the merged CoPE model.

    Fills the PROMPT template, generates exactly one new token, and reads the
    last character of the decoded output (prompt + generated token) as the
    label digit.

    Args:
        content: The text to be classified.
        policy: The policy text the content is judged against.

    Returns:
        A human-readable label string: 'NON-Violating (0)' when the model
        answers 0, 'VIOLATING (d)' for any other digit d, and 'UNKNOWN (c)'
        when the model emits a non-digit character c.
    """
    input_text = PROMPT.format(policy=policy, content=content)
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(inputs, max_new_tokens=1)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    label = decoded_output[-1]
    # Robustness fix: the model may occasionally emit a non-digit token; the
    # original int() call raised ValueError here and crashed the Gradio
    # handler. Report it instead of raising.
    if not label.isdigit():
        return f'UNKNOWN ({label})'
    if int(label) == 0:
        return f'NON-Violating ({label})'
    else:
        return f'VIOLATING ({label})'


# Build the Gradio UI. Component creation order inside the Blocks context
# defines the on-screen layout, so statement order here is significant.
with gr.Blocks() as iface:
    gr.Markdown("# CoPE Alpha Preview")
    gr.Markdown("See if the given content violates your given policy.")
    
    # Side-by-side inputs: content to classify and the policy to judge it by.
    with gr.Row():
        content_input = gr.Textbox(label="Content", lines=2, value=DEFAULT_CONTENT)
        policy_input = gr.Textbox(label="Policy", lines=10, value=DEFAULT_POLICY)
    
    submit_btn = gr.Button("Submit")
    output = gr.Label(label="Label")
    
    # Static help/about text rendered below the controls.
    gr.Markdown("""
    ## About CoPE
    
    CoPE (the COntent Policy Evaluation engine) is a small language model capable of accurate content policy labeling. This is a **preview** of our alpha release and is strictly for **research** purposes. This should **NOT** be used for any production use cases.
    
    ## How to Use
                                                                                                                                                                                                                                                                       
    1. Enter your content in the "Content" box.
    2. Specify your policy in the "Policy" box.
    3. Click "Submit" to see the results.

    **Note**: Inference times are **very slow** (30-45 seconds) since this is built on dev infra and not yet optimized for live systems. Please be patient while testing!
    
    ## More Info

    - [Give us feedback](https://forms.gle/BHpt6BpH2utaf4ez9) to help us improve
    - [Read our FAQ](https://docs.google.com/document/d/1Cp3GJ5k2I-xWZ4GK9WI7Xv8TpKdHmjJ3E9RbzP5Cc_Y/edit) to learn more about CoPE
    - [Join our mailing list](https://forms.gle/PCABrZdhTuXE9w9ZA) to keep in touch
    """)
    
    # Wire the button: predict(content, policy) -> label shown in `output`.
    submit_btn.click(predict, inputs=[content_input, policy_input], outputs=output)

# Launch the app (blocking call; serves the UI).
iface.launch()