brichett committed
Commit 2e46e9b · verified · 1 Parent(s): 40c3a36

Fix bug in handling violation_context from gradio_server

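For context on the fix: the `violation_context` coming from gradio_server arrives wrapped under a single `detect_glorification` key (presumably the name of the upstream detection node), so the flat `.get()` lookups in `enforcement_prompt` all fell through to their defaults. A minimal sketch of the two payload shapes, with illustrative field values (only the keys are taken from the diff below):

# Shape enforcement_prompt previously assumed (flat):
flat_context = {
    "entity_name": "Example Group",  # illustrative value
    "entity_info": {"Summary": "An example entry", "Policy Category": "Terrorism"},
    "extremism_detected": True,
    "aspect_sentiment": "Positive",
}

# Shape gradio_server actually sends (wrapped); the new code unwraps it:
wrapped_context = {"detect_glorification": flat_context}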
src/enforcement_module/policy_enforcement_decider.py CHANGED
@@ -1,59 +1,64 @@
- from types import ModuleType
- import math
- from langchain.llms import OpenAI
- import requests
- import requests.models
-
-
- import requests
- from hamilton.function_modifiers import config
-
-
-
-
- def enforcement_prompt(user_input: str, violation_context: dict) -> str:
-     """
-     Generates the prompt to be sent to the LLM for determining the appropriate enforcement action.
-     """
-     dio_name = violation_context.get("entity_name", "None")
-     dio_details = violation_context.get("entity_info", {}).get("Summary", "Unknown")
-     dio_category = violation_context.get("entity_info", {}).get("Policy Category", "Unknown")
-     radicalization_flag = violation_context.get("extremism_detected", False)
-     dio_sentiment = violation_context.get("aspect_sentiment", "None")
-
-     prompt = (
-
-         f"Based on the following content and context, determine the appropriate enforcement action:\n\n"
-         f"Content: '{user_input}'\n"
-         f"Radicalization detected: {radicalization_flag}\n"
-         f"DIO Mentioned: {dio_name} (Category: {dio_category})\n"
-         f"System's Classification of Content Sentiment Towards DIO: {dio_sentiment}"
-         # f"{user_info}\n\n"
-         "Policy Summary: Any entity mentioned in the 'DIO Mentioned' field is from the Dangerous Individuals & Organizations (DIO) blacklist. "
-         "If this entity is mentioned in the user content in a supportive, glorifying, or representative manner, it constitutes a violation of our platform's community standards. "
-         "Our platform does not allow content that glorifies, supports, or represents individuals or organizations involved in violence, hate, or terrorism. "
-         "These entities include terrorist groups, hate groups, violent non-state actors, and others who advocate or engage in violent activities. "
-         "Any content that endorses or supports these entities or their actions is a policy violation. "
-         "Users may discuss these topics neutrally or in a condemnatory manner, but any positive portrayal is prohibited. "
-         "Based on this policy, choose one of the following enforcement actions: Warning, Limited Visibility, Temporary Suspension, Permanent Ban. "
-         "Provide a brief explanation for your choice."
-     )
-     return prompt
-
- def get_enforcement_decision(enforcement_prompt: str, mistral_public_url: str) -> dict:
-     """
-     Sends the enforcement prompt to the Mistral model server and retrieves the enforcement decision.
-     """
-     input_text = {
-         "context": enforcement_prompt,
-         "question": "What is the appropriate enforcement action?"
-     }
-
-     response = requests.post(f'{mistral_public_url}/mistral-inference', json=input_text, stream=False)
-
-     return {
-         "enforcement_action": response.text.strip(),
-         "prompt": enforcement_prompt
-     }
-
-
+ from types import ModuleType
+ import math
+ from langchain.llms import OpenAI
+ import requests
+ import requests.models
+
+
+ import requests
+ from hamilton.function_modifiers import config
+
+
+
+
+ def enforcement_prompt(user_input: str, violation_context: dict) -> str:
+     """
+     Generates the prompt to be sent to the LLM for determining the appropriate enforcement action.
+     """
+
+     print(f"Received enforcement user_input: {user_input}")
+     print(f"Received enforcement violation_context: {violation_context}")
+     if len(violation_context.keys()) == 1 and "detect_glorification" in violation_context.keys():
+         violation_context = violation_context['detect_glorification']
+     dio_name = violation_context.get("entity_name", "None")
+     dio_details = violation_context.get("entity_info", {}).get("Summary", "Unknown")
+     dio_category = violation_context.get("entity_info", {}).get("Policy Category", "Unknown")
+     radicalization_flag = violation_context.get("extremism_detected", False)
+     dio_sentiment = violation_context.get("aspect_sentiment", "None")
+
+     prompt = (
+
+         f"Based on the following content and context, determine the appropriate enforcement action:\n\n"
+         f"Content: '{user_input}'\n"
+         f"Radicalization detected: {radicalization_flag}\n"
+         f"DIO Mentioned: {dio_name} (Category: {dio_category})\n"
+         f"System's Classification of Content Sentiment Towards DIO: {dio_sentiment}"
+         # f"{user_info}\n\n"
+         "Policy Summary: Any entity mentioned in the 'DIO Mentioned' field is from the Dangerous Individuals & Organizations (DIO) blacklist. "
+         "If this entity is mentioned in the user content in a supportive, glorifying, or representative manner, it constitutes a violation of our platform's community standards. "
+         "Our platform does not allow content that glorifies, supports, or represents individuals or organizations involved in violence, hate, or terrorism. "
+         "These entities include terrorist groups, hate groups, violent non-state actors, and others who advocate or engage in violent activities. "
+         "Any content that endorses or supports these entities or their actions is a policy violation. "
+         "Users may discuss these topics neutrally or in a condemnatory manner, but any positive portrayal is prohibited. "
+         "Based on this policy, choose one of the following enforcement actions: Warning, Limited Visibility, Temporary Suspension, Permanent Ban. "
+         "Provide a brief explanation for your choice."
+     )
+     return prompt
+
+ def get_enforcement_decision(enforcement_prompt: str, mistral_public_url: str) -> dict:
+     """
+     Sends the enforcement prompt to the Mistral model server and retrieves the enforcement decision.
+     """
+     input_text = {
+         "context": enforcement_prompt,
+         "question": "What is the appropriate enforcement action?"
+     }
+
+     response = requests.post(f'{mistral_public_url}/mistral-inference', json=input_text, stream=False)
+
+     return {
+         "enforcement_action": response.text.strip(),
+         "prompt": enforcement_prompt
+     }
+
+
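A hypothetical end-to-end sketch of the two functions after this change; the server URL and all payload values are placeholders, not taken from this repo:

if __name__ == "__main__":
    sample_context = {
        "detect_glorification": {  # wrapped shape, as sent by gradio_server
            "entity_name": "Example Group",
            "entity_info": {"Summary": "An illustrative entry", "Policy Category": "Hate"},
            "extremism_detected": True,
            "aspect_sentiment": "Supportive",
        }
    }
    prompt = enforcement_prompt("sample user post", sample_context)  # builds the LLM prompt
    decision = get_enforcement_decision(prompt, "https://mistral.example.com")  # placeholder URL
    print(decision["enforcement_action"])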