# VARCO_Arena / app.py
# (Hugging Face Space file header: uploaded by sonsus, commit a3108f3 verified, 17.5 kB)
# import shutil
import os
import select
import subprocess
import sys
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import *
import streamlit as st
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from varco_arena.varco_arena_core.prompts import load_prompt
from view_utils import (
default_page_setting,
escape_markdown,
set_nav_bar,
show_linebreak_in_md,
)
VA_ROOT = Path(os.environ.get("VARCO_ARENA_RESULT_PATH", "./user_submit"))
USR_SUB = VA_ROOT.parts[-1]
import shutil
import pandas as pd
import analysis_utils as au
from view_utils import visualization
class DataCache:
    """Tiny in-memory key/value store for processed result payloads.

    Survives for as long as the instance does (it is kept in
    ``st.session_state``), independently of files on disk.
    """

    def __init__(self):
        # key -> stored payload dict
        self.cache = {}

    def store(self, key: str, data: dict):
        """Save ``data`` under ``key``, overwriting any previous entry."""
        self.cache[key] = data

    def get(self, key: str) -> Optional[dict]:
        """Return the payload stored under ``key``, or ``None`` if absent."""
        return self.cache.get(key)
# Initialize the cache in session state if it doesn't exist
# (st.session_state persists across Streamlit reruns within one browser
# session, so this DataCache survives widget interactions).
if "data_cache" not in st.session_state:
    st.session_state.data_cache = DataCache()
def purge_user_sub_data(data_path_to_purge: Optional[Union[Path, str]] = None) -> None:
    """Recursively delete a user-submission directory, if one was given.

    Args:
        data_path_to_purge: Directory to remove. When ``None`` — or when the
            path no longer exists on disk — nothing is deleted.
    """
    if data_path_to_purge is None:
        print("nothing to purge")
        return
    path = Path(data_path_to_purge)
    if not path.exists():
        # Several call sites purge VA_ROOT (load_and_cache_data, the UI purge
        # button, error handlers); guard against the FileNotFoundError that
        # shutil.rmtree would raise on a second call.
        print(f"nothing to purge at {path}")
        return
    shutil.rmtree(path)
    print(f"purged {str(data_path_to_purge)}")
@st.cache_data
def load_and_cache_data(result_file_path: Optional[str] = None) -> Tuple[Dict, Dict]:
    """
    Load data from file, cache it in memory, then remove the file.
    Returns cached data on subsequent calls.

    Args:
        result_file_path: Path to the result JSON file

    Returns:
        Tuple of (all_result_dict, df_dict)
    """
    # Check if we already have cached data for this path.
    # NOTE: this session-level DataCache is a second cache layer on top of
    # @st.cache_data — it survives the file deletion performed below, so
    # re-renders keep working after the on-disk results are purged.
    if result_file_path:
        cache_key = str(Path(result_file_path))
        cached_data = st.session_state.data_cache.get(cache_key)
        if cached_data:
            return cached_data["all_result_dict"], cached_data["df_dict"]

    # Initialize empty dicts
    all_result_dict = {}
    df_dict = {}

    if result_file_path is not None:
        try:
            result_file_path = Path(result_file_path)
            # Read and process data
            df = pd.read_json(result_file_path)
            # Drop bulky / unneeded columns when present.
            for col in ["tstamp", "logs"]:
                if col in df.columns:
                    df.drop(columns=[col], inplace=True)

            df = au.index_test_scenario(df)

            fig_dict_per_task = {}
            df_dict_per_task = {}

            # Process overall data
            fig_dict_per_task["Overall"] = visualization(df, is_overall=True)
            df_dict_per_task["Overall"] = df

            # Process per-task data
            for task in df["task"].unique():
                df_task = df[df["task"] == task]
                fig_dict_per_task[task] = visualization(df_task, is_overall=False)
                df_dict_per_task[task] = df_task

            # Create key from path components
            # (assumes a .../<exp_name>/<prm_name>/result.json layout — the
            # layout produced by run_varco_arena; verify if paths change)
            prm_name = result_file_path.parts[-2]
            exp_name = result_file_path.parts[-3]
            key = f"{exp_name}/{prm_name}"

            all_result_dict[key] = fig_dict_per_task
            df_dict[key] = df_dict_per_task

            # Store in cache before removing file
            cache_data = {"all_result_dict": all_result_dict, "df_dict": df_dict}
            st.session_state.data_cache.store(str(result_file_path), cache_data)

            # Remove user experiment directory
            purge_user_sub_data(data_path_to_purge=VA_ROOT)
        except Exception as e:
            st.error(f"Error processing data: {str(e)}")
            return {}, {}

    return all_result_dict, df_dict
def upload_files(uploaded_files) -> Optional[Path]:
    """Persist user-uploaded jsonl files under a timestamped directory.

    Args:
        uploaded_files: Iterable of Streamlit UploadedFile objects
            (must expose ``.name`` and ``.getbuffer()``).

    Returns:
        Resolved path of the directory holding the saved files, or ``None``
        when the upload is empty/invalid. Callers gate later steps on
        ``st.session_state.upfiles_dir is None``, so returning ``None`` here
        (instead of a path to an empty or just-purged directory, as the old
        code did) makes those checks actually work.
    """
    # prep directory for user submission
    user_sub_root = VA_ROOT
    if user_sub_root.exists():
        if not user_sub_root.is_dir():
            raise ValueError(
                f"{user_sub_root} file exists and is not a directory. Consider renaming it."
            )
    else:
        user_sub_root.mkdir(parents=True)

    # KST timestamp keeps separate submissions in separate directories.
    KST = timezone(timedelta(hours=9))
    tstamp = datetime.now(KST)
    tstr = tstamp.strftime("%m-%d_%H:%M:%S")
    files_dir = Path("./" + str(user_sub_root / tstr))
    files_dir.mkdir(parents=True, exist_ok=True)

    uploaded_files = list(uploaded_files)
    if not uploaded_files:
        st.warning("❌ No files to upload. Please drag/drop or browse files to upload.")
        # Fix: do not hand back a path to an empty directory.
        return None
    if len(uploaded_files) < 2:
        st.error("❌ You need at least 2 jsonlines files to properly run VA.")
        purge_user_sub_data(data_path_to_purge=VA_ROOT)
        # Fix: the directory was just purged; its path must not leak out.
        return None

    # properly uploaded
    for file in uploaded_files:
        # Save each file into the timestamped server directory.
        file_path = files_dir / file.name
        with open(file_path, "wb") as f:
            f.write(file.getbuffer())

    jslfiles = list(files_dir.glob("*.jsonl"))
    st.success(f"βœ… Successfully uploaded {len(jslfiles)} jsonl files.")
    return files_dir.resolve()
def run_varco_arena(
    price_estimation: bool = False,
    # upload_dir: Union[str, Path] = None,
    promptname: str = None,
    exp_name: str = None,
    api_key: Optional[str] = None,
    evaluation_model: str = "gpt-4o-mini",
    update_interval: float = 1.0,
):
    """Launch ``varco_arena/main.py`` as a subprocess and stream its stdout.

    Args:
        price_estimation: When True, pass ``-c`` so main.py only estimates cost.
        promptname: Evaluation prompt name forwarded via ``-p``.
        exp_name: Optional experiment subdirectory appended to the output dir.
        api_key: OpenAI API key forwarded via ``-k``.
        evaluation_model: Judge model name forwarded via ``-e``.
        update_interval: Minimum seconds between terminal-widget refreshes.

    Returns:
        (outdir, return_code) — output directory Path and subprocess exit code.
    """
    ptn = f"{str(st.session_state.upfiles_dir)}"
    outdir = Path(ptn)
    if exp_name:
        outdir = outdir / exp_name

    command = (
        f"python varco_arena/main.py -i {ptn} -o {outdir} -k {api_key} "
        f"-p {promptname} -e {evaluation_model} -j 64"
    )
    if price_estimation:
        command = f"{command} -c"
    else:
        # Auto-confirm main.py's interactive prompt by piping `yes` into it.
        # (Prefixing is safer than the old substring replace of "python",
        # which would also rewrite any path containing that word.)
        command = f"yes | {command}"

    # NOTE(security): the API key is embedded in a shell command line, which is
    # visible in the host's process table. Mask it in anything logged/displayed.
    display_command = command.replace(f"-k {api_key}", "-k ***") if api_key else command
    print(display_command)
    api_key = None  # drop the local reference immediately

    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        stdin=subprocess.PIPE,
        text=True,
        bufsize=1,
        shell=True,
    )
    # Set stdout to non-blocking mode so readline() never stalls the UI loop.
    os.set_blocking(process.stdout.fileno(), False)

    last_update_time = time.time()
    terminal_output = st.empty()
    full_output = f"{display_command}\n"
    # Fix: initialize to_show — the old code referenced it after the loop even
    # when the process exited before a single line was read (UnboundLocalError).
    to_show = full_output
    while True:
        # Check if we have output to read
        if select.select([process.stdout], [], [], 0)[0]:
            output = process.stdout.readline()
            if output:
                full_output += output
                if price_estimation:
                    to_show = full_output
                    terminal_output.code(to_show, language="bash")
                else:
                    # Throttle widget updates; show only the tail of the log.
                    current_time = time.time()
                    if current_time - last_update_time > update_interval:
                        lines = full_output.split("\n")
                        if len(lines) < 5:
                            to_show = full_output
                        else:
                            to_show = "\n".join(["...\n..\n.\n"] + lines[-5:])
                        terminal_output.code(to_show, language="bash")
                        last_update_time = current_time
                print(output)
        time.sleep(0.1)

        # Check if the process has finished
        if process.poll() is not None:
            # Read any remaining output
            remaining_output = process.stdout.read()
            if remaining_output:
                lines = remaining_output.split("\n")
                if len(lines) > 10:
                    to_show += "\n".join(["\n...\n..\n.\n"] + lines[-10:])
                else:
                    to_show += remaining_output
                terminal_output.code(to_show, language="bash")
                print(remaining_output)
            break

    return_code = process.poll()
    return outdir, return_code
def main():
    """Render the VARCO Arena page: upload files -> estimate cost -> run -> report."""
    # init lang
    st.session_state["korean"] = st.session_state.get("korean", False)

    sidebar_placeholder = default_page_setting()
    set_nav_bar(
        False, sidebar_placeholder=sidebar_placeholder, toggle_hashstr="app_init"
    )

    st.title("βš”οΈ VARCO ARENA βš”οΈ")
    # Intro blurb, localized by the sidebar language toggle.
    if st.session_state.korean:
        st.write(
            """**λ°”λ₯΄μ½” μ•„λ ˆλ‚˜λŠ” ν…ŒμŠ€νŠΈμ…‹ λͺ…λ Ήμ–΄λ³„λ‘œ 비ꡐ할 λͺ¨λΈ(생성문)의 ν† λ„ˆλ¨ΌνŠΈλ₯Ό μˆ˜ν–‰ν•˜κ³  결과듀을 μ’…ν•©ν•˜μ—¬ λͺ¨λΈλ“€μ˜ μˆœμœ„λ₯Ό λ§€κΈ°λŠ” λ²€μΉ˜λ§ˆν‚Ή μ‹œμŠ€ν…œμž…λ‹ˆλ‹€. 이것은 reference 아웃풋과 λΉ„κ΅ν•˜μ—¬ 승λ₯ μ„ λ§€κΈ°λŠ” 방법보닀 μ •ν™•ν•˜λ©° 더 μ €λ ΄ν•©λ‹ˆλ‹€.**
λͺ¨λ²”λ‹΅μ•ˆμ„ ν•„μš”λ‘œ ν•˜μ§€ μ•ŠμœΌλ―€λ‘œ μ»€μŠ€ν…€ ν…ŒμŠ€νŠΈμ…‹ (50+ ν–‰) 을 ν™œμš©ν•˜λŠ” 경우 νŽΈλ¦¬ν•œ λ²€μΉ˜λ§ˆν‚Ήμ΄ κ°€λŠ₯ν•©λ‹ˆλ‹€."""
        )
    else:
        st.write(
            """**VARCO Arena is an LLM benchmarking system that compares model responses across customized test scenarios (recommend >50 prompts) without requiring reference answers.**
VARCO Arena conducts tournaments between models to be compared for each test set command, ranking models accurately at an affordable price. This is more accurate and cost-effective than rating win rates by comparing against reference outputs."""
        )
    st.divider()

    # --- Step 1: file upload -------------------------------------------------
    # Set up the file uploader
    if st.session_state.korean:
        st.markdown("### 1. λͺ¨λΈ 좜λ ₯파일 μ—…λ‘œλ“œ")
    else:
        st.markdown("### 1. Upload LLM responses")
    uploaded_files = st.file_uploader(
        "Drag and Drop jsonlines files (.jsonl)", accept_multiple_files=True
    )
    if st.session_state.korean:
        st.info("μ—…λ‘œλ“œ ν•˜μ‹  νŒŒμΌμ€ μžλ™μœΌλ‘œ μ‚­μ œλ˜λ©° μˆ˜μ§‘λ˜κ±°λ‚˜ μ‚¬μš©λ˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.\n- [μž…λ ₯ μ˜ˆμ‹œ 파일 (*.jsonl)](https://huggingface.co/spaces/NCSOFT/VARCO_Arena/tree/main/varco_arena/rsc/inputs_for_dbg/dbg_llmbar_brief_inputs)")
    else:
        st.info(
            "Your uploads will be removed automatically, not being collected nor reused for any purpose.\n- [Example input files (*.jsonl)](https://huggingface.co/spaces/NCSOFT/VARCO_Arena/tree/main/varco_arena/rsc/inputs_for_dbg/dbg_llmbar_brief_inputs)"
        )

    # upload state
    if "upfiles_dir" not in st.session_state:
        st.session_state.upfiles_dir = None
    if st.button("μ—…λ‘œλ“œν•˜κΈ°" if st.session_state.korean else "Upload Files"):
        st.session_state.upfiles_dir = upload_files(uploaded_files)
    if st.button("μ—…λ‘œλ“œν•œ 파일 μ§€μš°κΈ°" if st.session_state.korean else "Purge my uploads"):
        # Purge button: forget the upload and delete everything under VA_ROOT.
        st.session_state.upfiles_dir = None
        if VA_ROOT.is_dir():
            shutil.rmtree(VA_ROOT)
            st.success(
                "βœ… μ—…λ‘œλ“œν•œ νŒŒμΌμ„ μ„œλ²„μ—μ„œ μ§€μ› μŠ΅λ‹ˆλ‹€"
                if st.session_state.korean
                else "βœ… Removed your uploads from the server successfully"
            )
        else:
            st.error(
                "❌ μ§€μšΈ 파일이 μ—†μŠ΅λ‹ˆλ‹€"
                if st.session_state.korean
                else "❌ You have nothing uploaded"
            )

    # Input-format guide, localized.
    if st.session_state.korean:
        with st.expander("❓❔ 무엇을 μ—…λ‘œλ“œ ν•˜λ‚˜μš”β“β”"):
            st.info(open("guide_mds/input_jsonls_kr.md", encoding="UTF8").read())
    else:
        with st.expander("❓❔ What should I upload ❓❔"):
            st.info(open("guide_mds/input_jsonls_en.md", encoding="UTF8").read())

    # --- Step 2: cost estimation ---------------------------------------------
    # Form for cost estimation
    with st.form("cost_estimation_form"):
        if st.session_state.korean:
            st.write("### 2. 가격 μ‚°μ •")
        else:
            st.write("### 2. Cost Estimation")
        eval_model = st.selectbox(
            "Select Judge",
            open("eval_models_list.txt", encoding="UTF8").read().split("\n"),
        )
        promptname = st.selectbox(
            "Select Evalutaion Prompt",
            open("eval_prompt_list.txt", encoding="UTF8").read().split("\n"),
        )
        if st.session_state.korean:
            st.markdown("*`llmbar`μ™Έ λ‹€λ₯Έ ν”„λ‘¬ν”„νŠΈλŠ” μΆ©λΆ„νžˆ κ²€μ¦λœ ν”„λ‘¬ν”„νŠΈλŠ” μ•„λ‹™λ‹ˆλ‹€. (λ™μž‘μ€ 함)")
        else:
            st.markdown(
                "*Eval prompts other than `llmbar` is working example, not the optimal ones."
            )
        # USR_SUB is the reserved VA_ROOT directory name; a prompt with the
        # same name would collide with the submission-path layout.
        if promptname == USR_SUB:
            raise ValueError(
                f"{USR_SUB=} is preserved name for the system. Consider another naming for the prompt or consider changing {VA_ROOT=} (USR_SUB == VA_ROOT.parts[-1])."
            )
        estimate_button = st.form_submit_button("Calculate Cost!")

    # Preview of the judge prompt for the currently selected promptname.
    with st.expander(
        "LLM Judge에 ν™œμš©λ˜λŠ” ν”„λ‘¬ν”„νŠΈ (`Calculate Cost!` ν΄λ¦­μ‹œ κ°±μ‹ )"
        if st.session_state.korean
        else "**Evaluation Prompt for LLM Judge (will refresh after `Calculate Cost!` clicked)**"
    ):
        prompt = load_prompt(promptname, task="-")
        # Fill the template with placeholder tokens so the raw slots show.
        kwargs = dict(
            inst="{inst}",
            src="{src}",
            out_a="{out_a}",
            out_b="{out_b}",
            task="-",
        )
        if promptname == "translation_pair":
            # translation_pair additionally templates language names.
            kwargs["source_lang"] = "{source_lang}"
            kwargs["target_lang"] = "{target_lang}"
        prompt_cmpl = prompt.complete_prompt(**kwargs)
        st.markdown(f"### Evaluation Prompt: {promptname}")
        for msg in prompt_cmpl:
            st.markdown(f"**{msg['role']}**")
            st.info(show_linebreak_in_md(escape_markdown(msg["content"])))

    if estimate_button:
        if st.session_state.get("upfiles_dir") is None:
            st.error(
                "❌ Requirements: You have to upload jsonlines files first to proceed"
            )
        else:
            st.markdown("##### Estimated Cost")
            # Dry run (-c): no real API calls, so dummy credentials suffice.
            dummy_api_key = "dummy"
            dummy_exp_name = "dummy"
            result_file_path, return_code = run_varco_arena(
                # upload_dir=st.session_state.upfiles_dir,
                promptname=promptname,
                api_key=dummy_api_key,
                exp_name=dummy_exp_name,
                price_estimation=True,
                evaluation_model=eval_model,
            )
            if return_code:
                st.error(
                    "❌ RuntimeError: An error occurred during cost estimation. **Restart from file upload!**"
                )
                purge_user_sub_data(data_path_to_purge=VA_ROOT)
            else:
                st.success("βœ… Cost estimation completed successfully")
                # Gate that enables the "Run Arena!" button below.
                st.session_state.cost_estimated = True

    # --- Step 3: actual run --------------------------------------------------
    # Form for actual run
    with st.form("run_arena_form"):
        if st.session_state.korean:
            st.write("### 3. Varco Arena κ΅¬λ™ν•˜κΈ°")
        else:
            st.write("### 3. Run Varco Arena")
        api_key = st.text_input("Enter your OpenAI API Key", type="password")

        # demo exp name fixated
        KST = timezone(timedelta(hours=9))
        tstamp = datetime.now(KST)
        tstr = tstamp.strftime("%m-%d_%H:%M:%S")
        exp_name = f"{tstr}_KST_submit"

        if st.session_state.korean:
            st.write("**주의**:`Ctrl+C` λ²„νŠΌμ€ κ΅¬ν˜„λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. ꡬ동 μ „ μˆ™κ³ ν•΄μ£Όμ„Έμš”.")
        else:
            st.write("**Caution: `Ctrl+C` button hasn't been implemented.**")
        run_button = st.form_submit_button(
            "πŸ”₯ Run Arena!",
            disabled=(not st.session_state.get("cost_estimated", False))
            or "result_file_path"
            in st.session_state.keys(),  # run already performed once
        )

    if run_button:
        # Lock the navigation bar while the (long) run is in progress.
        set_nav_bar(
            True,
            sidebar_placeholder=sidebar_placeholder,
            toggle_hashstr="app_during_run",
        )
        if st.session_state.get("upfiles_dir") is None:
            st.error(
                "❌ Requirements: You have to upload jsonlines files first to proceed"
            )
        elif not api_key:
            st.error("❌ Requirements: OpenAI key required to run VA.")
        else:
            result_file_path, return_code = run_varco_arena(
                # upload_dir=st.session_state.upfiles_dir,
                promptname=promptname,
                api_key=api_key,
                exp_name=exp_name,
                price_estimation=False,
                evaluation_model=eval_model,
            )
            if return_code:
                st.error(
                    "❌ RuntimeError: An error occurred during Varco Arena run. Check the file and **restart from file upload!**"
                )
                purge_user_sub_data(data_path_to_purge=VA_ROOT)
            else:
                st.success("βœ… Varco Arena run completed successfully")
                # Remember the newest result.json produced by the run.
                st.session_state.result_file_path = list(
                    result_file_path.glob("**/result.json")
                )[-1]
        # Re-enable navigation once the run (or its error handling) is done.
        set_nav_bar(
            False, sidebar_placeholder=sidebar_placeholder, toggle_hashstr="app_run_done"
        )

    # Load + visualize results (also populates the session DataCache).
    if st.session_state.get("result_file_path", None) is not None:
        print(f"{st.session_state.get('result_file_path', None)=}")
        load_and_cache_data(result_file_path=str(st.session_state.result_file_path))
# Script entry point (Streamlit executes this module top-to-bottom on each run).
if __name__ == "__main__":
    main()