"""Demo UI to show different levels of LLM security."""

import streamlit as st
from streamlit_extras.stylable_container import stylable_container
import pandas as pd

import llm
import config

# Levels whose model response is collected in full and withheld when a
# level-specific check finds the secret in it (see `_output_leaks_secret`).
OUTPUT_FILTERED_LEVELS = (4, 5, 6)

# Number of hints offered per level; also the number of "opend_hint_*" flags.
NUM_HINTS = 3


def _init_level_state(name):
    """Create the session-state entries of level *name* if they are missing.

    Existing values are left untouched so that counters and flags survive
    Streamlit reruns.
    """
    defaults = {
        f"solved_{name}": False,
        f"prompt_try_count_{name}": 0,
        f"secret_guess_count_{name}": 0,
    }
    for hint_idx in range(NUM_HINTS):
        # NOTE: "opend" is a typo, but it is the established state key.
        defaults[f"opend_hint_{name}_{hint_idx}"] = False

    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _mark_hint_opened(name, hint_idx):
    """Record that hint *hint_idx* of level *name* was opened before solving.

    Once set, the flag stays set; opening a hint after the level has been
    solved does not set it.
    """
    if not st.session_state[f"solved_{name}"]:
        st.session_state[f"opend_hint_{name}_{hint_idx}"] = True


def _output_leaks_secret(level_num, secret, output):
    """Return a truthy value if the level's output check finds the secret.

    Level 4 uses a case-insensitive substring test, level 5 delegates to
    `llm.is_subsequence`, level 6 delegates to `llm.run_judge`. Any other
    level has no output check.
    """
    if level_num == 4:
        return secret.lower() in output.lower()
    if level_num == 5:
        return llm.is_subsequence(output, secret)
    if level_num == 6:
        return llm.run_judge(level_num, {"secret": secret, "message": output})
    return False


def _show_base_prompt(container, prompts):
    """Write every prompt part in *prompts* into *container*.

    Each key is shown as a humanised caption, each value as a code block.
    """
    for key, val in prompts.items():
        descr = key.replace("_", " ").capitalize()
        container.write(f"*{descr}:*")
        container.code(val, language=None)


# Must be the first Streamlit command of the script.
st.set_page_config(
    page_title="LLM security demo",
    page_icon="images/LEG.png",
    layout="wide",
    initial_sidebar_state="expanded",
)

if "key" not in st.session_state:
    st.session_state["key"] = "value"

st.logo("images/ML6_logo.png")
st.title("🕵️ LLM security demo")
st.info(
    "You are a secret agent meeting your informant in a bar. Convince him to give you his secret! But be prepared, with every new level the informant will be more cautious."
)

num_levels = list(range(7))

# One secret per level. Every entry must be unique: a repeated word would let
# a player pass a harder level with the secret extracted from an easier one.
secrets = [
    "Perspicacity",
    "Obfuscation",
    "Sesquipedalian",
    "Synecdoche",
    "Defenestration",
    "Pulchritudinous",
    "Ineffable",
]
level_names = [f"Level {i}" for i in num_levels]
level_emojis = ["😴", "🙂", "😏", "😥", "🥵", "💀", "💀"]
level_tabs = st.tabs(level_names)

# CSS handed to every hint's `stylable_container`.
hint_css = """
{
    background-color: rgba(110, 110, 110, 0.1);
    padding: 16px;
    border-radius: 0.5rem;
}
code {
    white-space: pre-wrap !important;
}
"""

for level_num, level_tab, name, emoji, secret in zip(
    num_levels, level_tabs, level_names, level_emojis, secrets
):
    _init_level_state(name)

    deployment_name = name.replace(" ", "-").lower()

    with level_tab:
        header_col1, header_col2 = st.columns(2, gap="medium")
        header_col1.subheader(f"{emoji} {name}")
        header_col2.subheader("Need help ...")

        col1, col2 = st.columns(2, gap="medium")

        # Left column: prompt playground and secret guess.
        with col1:
            with st.container(height=600, border=False):
                with st.container(border=True):
                    txt = st.text_area(
                        "Provide your prompt here:",
                        key=f"txt_{name}",
                        label_visibility="visible",
                        height=200,
                        placeholder="Your prompt",
                    )
                    btn_submit_prompt = st.button(
                        "Send prompt", key=f"submit_prompt_{name}"
                    )

                    # Show response
                    if len(txt) > 0 and btn_submit_prompt:
                        st.session_state[f"prompt_try_count_{name}"] += 1
                        with st.container(border=True):
                            st.write("Response:")
                            if level_num == 3 and llm.run_judge(
                                level_num, {"user_input": txt}
                            ):
                                # Level 3 screens the *input* before the
                                # secret-bearing prompt is ever executed.
                                st.error("Malicious user input detected. Try again.")
                            elif level_num in OUTPUT_FILTERED_LEVELS:
                                # These levels screen the *output*, so the
                                # stream has to be consumed before display.
                                output = "".join(
                                    llm.stream_request(deployment_name, secret, txt)
                                )
                                if _output_leaks_secret(level_num, secret, output):
                                    st.error(
                                        "Secret detected in model output. Try again."
                                    )
                                else:
                                    st.write(output)
                            else:
                                st.write_stream(
                                    llm.stream_request(deployment_name, secret, txt)
                                )

                with st.container(border=True):
                    secret_guess = st.text_input(
                        "What is the secret?",
                        key=f"guess_{name}",
                        placeholder="Your guess",
                    )
                    btn_submit_guess = st.button(
                        "Submit guess", key=f"submit_guess_{name}"
                    )
                    if btn_submit_guess:
                        st.session_state[f"secret_guess_count_{name}"] += 1
                        if secret_guess.lower() == secret.lower():
                            st.success("You found the secret!")
                            st.session_state[f"solved_{name}"] = True
                        else:
                            st.error("Wrong guess. Try again.")

        # Right column: the three hints.
        with col2:
            with st.container(border=True, height=600):
                st.info(
                    "There are three levels of hints available to you. But be careful, if you open a hint before solving the secret, it will show up in your record.",
                    icon="ℹ️",
                )

                hint_1_cont = stylable_container("hint_1_container", hint_css)
                hint1 = hint_1_cont.checkbox(
                    "Hint 1 - **Description of security strategy**",
                    key=f"hint1_checkbox_{name}",
                )
                if hint1:
                    _mark_hint_opened(name, 0)
                    hint_1_cont.write(config.LEVEL_DESCRIPTIONS[level_num]["info"])

                hint_2_cont = stylable_container("hint_2_container", hint_css)
                hint2 = hint_2_cont.checkbox(
                    "Hint 2 - **Code execution**", key=f"hint2_checkbox_{name}"
                )
                if hint2:
                    _mark_hint_opened(name, 1)

                    # Only pass the user's text along when a prompt was
                    # actually submitted in this run.
                    user_input_holder = (
                        txt if len(txt) > 0 and btn_submit_prompt else None
                    )
                    prompts = llm.get_full_prompt(
                        deployment_name, user_input=user_input_holder
                    )

                    if level_num == 3:
                        special_prompt = llm.get_full_prompt(
                            llm.special_checks[3], user_input=txt
                        )
                        hint_2_cont.write(
                            "Step 1: A **LLM judge** reviews the user input and determines if it is malicious or not."
                        )
                        hint_2_cont.write("**LLM judge prompt:**")
                        for val in special_prompt.values():
                            hint_2_cont.code(val, language=None)
                        hint_2_cont.write(
                            "Step 2: If the user input is not classified as malicious, the prompt containing the actual secret is executed and the response is shown."
                        )
                        hint_2_cont.write("**Actual prompt:**")
                        _show_base_prompt(hint_2_cont, prompts)
                    elif level_num == 4:
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed:"
                        )
                        _show_base_prompt(hint_2_cont, prompts)
                        hint_2_cont.write(
                            "Step 2: In the response text of the LLM call, the secret is seach for with a simple python expression `secret.lower() in output.lower()`. If it contains the secret, the output will not be shown."
                        )
                    elif level_num == 5:
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed:"
                        )
                        _show_base_prompt(hint_2_cont, prompts)
                        hint_2_cont.write(
                            "Step 2: In the response text of the LLM call, the secret is seach for with a python function 'is_subsequence' which looks for substrings. If it contains the secret, the output will not be shown."
                        )
                        # NOTE(review): bare expression kept as-is; presumably
                        # it relies on Streamlit "magic" to render the helper
                        # in the app -- confirm before removing.
                        llm.is_subsequence
                    else:
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed and the response is shown:"
                        )
                        _show_base_prompt(hint_2_cont, prompts)

                hint_3_cont = stylable_container("hint_3_container", hint_css)
                hint3 = hint_3_cont.checkbox(
                    "Hint 3 - **Example solution**",
                    key=f"hint3_checkbox_{name}",
                )
                if hint3:
                    _mark_hint_opened(name, 2)
                    hint_3_cont.code(
                        config.LEVEL_DESCRIPTIONS[level_num]["solution"],
                        language=None,
                    )
                    hint_3_cont.info("*May not allways work")

with st.expander("🏆 Record", expanded=True):
    # Build one table row per level from the session state.
    table_data = []
    for idx, level_name in enumerate(level_names):
        solved = st.session_state[f"solved_{level_name}"]
        hint_marks = [
            "❌" if st.session_state[f"opend_hint_{level_name}_{hint_idx}"] else "-"
            for hint_idx in range(NUM_HINTS)
        ]
        table_data.append(
            [
                idx,
                st.session_state[f"prompt_try_count_{level_name}"],
                st.session_state[f"secret_guess_count_{level_name}"],
                *hint_marks,
                "✅" if solved else "❌",
                # The secret is only revealed once the level is solved.
                secrets[idx] if solved else "...",
            ]
        )

    # show as pandas dataframe
    st.table(
        pd.DataFrame(
            table_data,
            columns=[
                "Level",
                "Prompt tries",
                "Secret guesses",
                "Used hint 1",
                "Used hint 2",
                "Used hint 3",
                "Solved",
                "Secret",
            ],
            index=level_emojis,
        )
    )

# TODOS:
# - add more levels
# - use Gemini-Pro-Flash for supervisor LLM
# - show the actual workflow of the safeguard (what gets executed)
# - story telling --> new field, hard to be 100 percent safe
# - use LLM judge to look for secret in model output
# - show which safeguards were used in 'Record' table
# - funny: always return "I am sorry I cannot do that."
# - switch to azure deployment