diff --git "a/dev.ipynb" "b/dev.ipynb"
--- "a/dev.ipynb"
+++ "b/dev.ipynb"
@@ -2,9 +2,18 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 47,
+ "execution_count": 1,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/andrewreed/Documents/success_projects/closed-vs-open-arena-elo/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
"source": [
"import os\n",
"import pickle\n",
@@ -22,67 +31,60 @@
},
{
"cell_type": "code",
- "execution_count": 72,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
- "fs = HfFileSystem()\n",
+ "from typing import Literal\n",
"\n",
"\n",
- "def extract_date(filename):\n",
- " return filename.split(\"/\")[-1].split(\".\")[0].split(\"_\")[-1]\n",
+ "def download_latest_data_from_space(\n",
+ " repo_id: str, file_type: Literal[\"pkl\", \"csv\"]\n",
+ ") -> str:\n",
+ " \"\"\"\n",
+ " Downloads the latest data file of the specified file type from the given repository space.\n",
"\n",
+ " Args:\n",
+ " repo_id (str): The ID of the repository space.\n",
+ " file_type (Literal[\"pkl\", \"csv\"]): The type of the data file to download. Must be either \"pkl\" or \"csv\".\n",
"\n",
- "ELO_DATA_FILES = \"spaces/lmsys/chatbot-arena-leaderboard/*.pkl\"\n",
- "elo_files = fs.glob(ELO_DATA_FILES)\n",
- "latest_elo_file = sorted(elo_files, key=extract_date, reverse=True)[0]\n",
+ " Returns:\n",
+ " str: The local file path of the downloaded data file.\n",
+ " \"\"\"\n",
"\n",
- "LEADERBOARD_DATA_FILES = \"spaces/lmsys/chatbot-arena-leaderboard/*.csv\"\n",
- "leaderboard_files = fs.glob(LEADERBOARD_DATA_FILES)\n",
- "latest_leaderboard_file = sorted(leaderboard_files, key=extract_date, reverse=True)[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 73,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('leaderboard_table_20240426.csv', 'elo_results_20240426.pkl')"
- ]
- },
- "execution_count": 73,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "latest_leaderboard_file.split(\"/\")[-1], latest_elo_file.split(\"/\")[-1]"
+ " def extract_date(filename):\n",
+ " return filename.split(\"/\")[-1].split(\".\")[0].split(\"_\")[-1]\n",
+ "\n",
+ " fs = HfFileSystem()\n",
+ " data_file_path = f\"spaces/{repo_id}/*.{file_type}\"\n",
+ " files = fs.glob(data_file_path)\n",
+ " latest_file = sorted(files, key=extract_date, reverse=True)[0]\n",
+ "\n",
+ " latest_filepath_local = hf_hub_download(\n",
+ " repo_id=repo_id,\n",
+ " filename=latest_file.split(\"/\")[-1],\n",
+ " repo_type=\"space\",\n",
+ " )\n",
+ " return latest_filepath_local"
]
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
- "latest_elo_file_local = hf_hub_download(\n",
- " repo_id=\"lmsys/chatbot-arena-leaderboard\",\n",
- " filename=latest_elo_file.split(\"/\")[-1],\n",
- " repo_type=\"space\",\n",
+ "latest_leaderboard_file_local = download_latest_data_from_space(\n",
+ " repo_id=\"lmsys/chatbot-arena-leaderboard\", file_type=\"csv\"\n",
")\n",
- "latest_leaderboard_file_local = hf_hub_download(\n",
- " repo_id=\"lmsys/chatbot-arena-leaderboard\",\n",
- " filename=latest_leaderboard_file.split(\"/\")[-1],\n",
- " repo_type=\"space\",\n",
+ "latest_elo_file_local = download_latest_data_from_space(\n",
+ " repo_id=\"lmsys/chatbot-arena-leaderboard\", file_type=\"pkl\"\n",
")"
]
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -122,7 +124,7 @@
},
{
"cell_type": "code",
- "execution_count": 77,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -131,7 +133,7 @@
"dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])"
]
},
- "execution_count": 77,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -142,7 +144,7 @@
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -177,48 +179,48 @@
"
\n",
" \n",
" RWKV-4-Raven-14B | \n",
- " 927.710294 | \n",
- " 27.143015 | \n",
- " 935.717850 | \n",
- " 916.546369 | \n",
+ " 928.451251 | \n",
+ " 26.146415 | \n",
+ " 937.017097 | \n",
+ " 919.444359 | \n",
" 5129 | \n",
- " 81 | \n",
+ " 82 | \n",
"
\n",
" \n",
" alpaca-13b | \n",
- " 907.324482 | \n",
- " 20.736682 | \n",
- " 915.536856 | \n",
- " 899.330070 | \n",
+ " 908.084359 | \n",
+ " 18.598539 | \n",
+ " 915.348707 | \n",
+ " 900.602847 | \n",
" 6111 | \n",
- " 85 | \n",
+ " 86 | \n",
"
\n",
" \n",
" bard-jan-24-gemini-pro | \n",
- " 1208.505408 | \n",
- " 6.679087 | \n",
- " 1213.291358 | \n",
- " 1203.926901 | \n",
- " 12388 | \n",
+ " 1208.712877 | \n",
+ " 7.975296 | \n",
+ " 1213.331583 | \n",
+ " 1203.004139 | \n",
+ " 12387 | \n",
" 6 | \n",
"
\n",
" \n",
" chatglm-6b | \n",
- " 886.107553 | \n",
- " 17.110417 | \n",
- " 894.034333 | \n",
- " 878.094776 | \n",
+ " 886.873429 | \n",
+ " 19.813751 | \n",
+ " 894.785321 | \n",
+ " 878.677878 | \n",
" 5195 | \n",
- " 86 | \n",
+ " 87 | \n",
"
\n",
" \n",
" chatglm2-6b | \n",
- " 932.678460 | \n",
- " 33.530570 | \n",
- " 943.455598 | \n",
- " 921.346322 | \n",
+ " 933.337288 | \n",
+ " 33.939472 | \n",
+ " 944.493496 | \n",
+ " 921.470740 | \n",
" 2880 | \n",
- " 81 | \n",
+ " 82 | \n",
"
\n",
" \n",
" ... | \n",
@@ -231,85 +233,85 @@
"
\n",
" \n",
" wizardlm-70b | \n",
- " 1107.992552 | \n",
- " 9.385887 | \n",
- " 1114.218223 | \n",
- " 1102.655575 | \n",
- " 8868 | \n",
+ " 1108.552744 | \n",
+ " 8.988005 | \n",
+ " 1114.390689 | \n",
+ " 1102.745236 | \n",
+ " 8867 | \n",
" 29 | \n",
"
\n",
" \n",
" yi-34b-chat | \n",
- " 1109.722447 | \n",
- " 8.596908 | \n",
- " 1115.182579 | \n",
- " 1103.991095 | \n",
- " 12252 | \n",
+ " 1111.132640 | \n",
+ " 7.801741 | \n",
+ " 1115.356993 | \n",
+ " 1105.658254 | \n",
+ " 13177 | \n",
" 29 | \n",
"
\n",
" \n",
" zephyr-7b-alpha | \n",
- " 1042.108710 | \n",
- " 43.900714 | \n",
- " 1052.991768 | \n",
- " 1027.160917 | \n",
+ " 1043.084267 | \n",
+ " 45.472021 | \n",
+ " 1054.269954 | \n",
+ " 1027.602171 | \n",
" 1901 | \n",
- " 58 | \n",
+ " 57 | \n",
"
\n",
" \n",
" zephyr-7b-beta | \n",
- " 1053.655680 | \n",
- " 10.297607 | \n",
- " 1059.923254 | \n",
- " 1047.601629 | \n",
+ " 1054.416300 | \n",
+ " 11.094606 | \n",
+ " 1060.265072 | \n",
+ " 1047.790509 | \n",
" 11924 | \n",
- " 54 | \n",
+ " 55 | \n",
"
\n",
" \n",
" zephyr-orpo-141b-A35b-v0.1 | \n",
- " 1124.677515 | \n",
- " 22.288515 | \n",
- " 1132.728887 | \n",
- " 1113.848432 | \n",
- " 4276 | \n",
+ " 1128.816337 | \n",
+ " 16.964385 | \n",
+ " 1134.862680 | \n",
+ " 1119.183571 | \n",
+ " 5207 | \n",
" 22 | \n",
"
\n",
" \n",
"\n",
- "91 rows × 6 columns
\n",
+ "92 rows × 6 columns
\n",
""
],
"text/plain": [
" rating variance rating_q975 rating_q025 \\\n",
- "RWKV-4-Raven-14B 927.710294 27.143015 935.717850 916.546369 \n",
- "alpaca-13b 907.324482 20.736682 915.536856 899.330070 \n",
- "bard-jan-24-gemini-pro 1208.505408 6.679087 1213.291358 1203.926901 \n",
- "chatglm-6b 886.107553 17.110417 894.034333 878.094776 \n",
- "chatglm2-6b 932.678460 33.530570 943.455598 921.346322 \n",
+ "RWKV-4-Raven-14B 928.451251 26.146415 937.017097 919.444359 \n",
+ "alpaca-13b 908.084359 18.598539 915.348707 900.602847 \n",
+ "bard-jan-24-gemini-pro 1208.712877 7.975296 1213.331583 1203.004139 \n",
+ "chatglm-6b 886.873429 19.813751 894.785321 878.677878 \n",
+ "chatglm2-6b 933.337288 33.939472 944.493496 921.470740 \n",
"... ... ... ... ... \n",
- "wizardlm-70b 1107.992552 9.385887 1114.218223 1102.655575 \n",
- "yi-34b-chat 1109.722447 8.596908 1115.182579 1103.991095 \n",
- "zephyr-7b-alpha 1042.108710 43.900714 1052.991768 1027.160917 \n",
- "zephyr-7b-beta 1053.655680 10.297607 1059.923254 1047.601629 \n",
- "zephyr-orpo-141b-A35b-v0.1 1124.677515 22.288515 1132.728887 1113.848432 \n",
+ "wizardlm-70b 1108.552744 8.988005 1114.390689 1102.745236 \n",
+ "yi-34b-chat 1111.132640 7.801741 1115.356993 1105.658254 \n",
+ "zephyr-7b-alpha 1043.084267 45.472021 1054.269954 1027.602171 \n",
+ "zephyr-7b-beta 1054.416300 11.094606 1060.265072 1047.790509 \n",
+ "zephyr-orpo-141b-A35b-v0.1 1128.816337 16.964385 1134.862680 1119.183571 \n",
"\n",
" num_battles final_ranking \n",
- "RWKV-4-Raven-14B 5129 81 \n",
- "alpaca-13b 6111 85 \n",
- "bard-jan-24-gemini-pro 12388 6 \n",
- "chatglm-6b 5195 86 \n",
- "chatglm2-6b 2880 81 \n",
+ "RWKV-4-Raven-14B 5129 82 \n",
+ "alpaca-13b 6111 86 \n",
+ "bard-jan-24-gemini-pro 12387 6 \n",
+ "chatglm-6b 5195 87 \n",
+ "chatglm2-6b 2880 82 \n",
"... ... ... \n",
- "wizardlm-70b 8868 29 \n",
- "yi-34b-chat 12252 29 \n",
- "zephyr-7b-alpha 1901 58 \n",
- "zephyr-7b-beta 11924 54 \n",
- "zephyr-orpo-141b-A35b-v0.1 4276 22 \n",
+ "wizardlm-70b 8867 29 \n",
+ "yi-34b-chat 13177 29 \n",
+ "zephyr-7b-alpha 1901 57 \n",
+ "zephyr-7b-beta 11924 55 \n",
+ "zephyr-orpo-141b-A35b-v0.1 5207 22 \n",
"\n",
- "[91 rows x 6 columns]"
+ "[92 rows x 6 columns]"
]
},
- "execution_count": 78,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -320,7 +322,7 @@
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -330,7 +332,7 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -432,17 +434,6 @@
" ... | \n",
" \n",
" \n",
- " 100 | \n",
- " mixtral-8x22b-instruct-v0.1 | \n",
- " Mixtral-8x22b-Instruct-v0.1 | \n",
- " - | \n",
- " 0.778 | \n",
- " 2024/4 | \n",
- " Apache 2.0 | \n",
- " Mistral | \n",
- " https://mistral.ai/news/mixtral-8x22b/ | \n",
- "
\n",
- " \n",
" 101 | \n",
" llama-3-70b-instruct | \n",
" Llama-3-70b-Instruct | \n",
@@ -486,9 +477,20 @@
" Microsoft | \n",
" https://azure.microsoft.com/en-us/blog/introdu... | \n",
"
\n",
+ " \n",
+ " 105 | \n",
+ " snowflake-arctic-instruct | \n",
+ " Snowflake Arctic Instruct | \n",
+ " - | \n",
+ " 0.673 | \n",
+ " 2024/4 | \n",
+ " Apache 2.0 | \n",
+ " Snowflake | \n",
+ " https://www.snowflake.com/blog/arctic-open-eff... | \n",
+ "
\n",
" \n",
"\n",
- "105 rows × 8 columns
\n",
+ "106 rows × 8 columns
\n",
""
],
"text/plain": [
@@ -499,11 +501,11 @@
"3 tulu-30b Tulu-30B \n",
"4 guanaco-65b Guanaco-65B \n",
".. ... ... \n",
- "100 mixtral-8x22b-instruct-v0.1 Mixtral-8x22b-Instruct-v0.1 \n",
"101 llama-3-70b-instruct Llama-3-70b-Instruct \n",
"102 llama-3-8b-instruct Llama-3-8b-Instruct \n",
"103 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n",
"104 phi-3-mini-128k-instruct Phi-3-Mini-128k-Instruct \n",
+ "105 snowflake-arctic-instruct Snowflake Arctic Instruct \n",
"\n",
" MT-bench (score) MMLU Knowledge cutoff date License \\\n",
"0 7.01 0.587 2023/6 Non-commercial \n",
@@ -512,11 +514,11 @@
"3 6.43 0.581 2023/6 Non-commercial \n",
"4 6.41 0.621 2023/5 Non-commercial \n",
".. ... ... ... ... \n",
- "100 - 0.778 2024/4 Apache 2.0 \n",
"101 - 0.820 2023/12 Llama 3 Community \n",
"102 - 0.684 2023/3 Llama 3 Community \n",
"103 - 0.819 2023/11 Proprietary \n",
"104 - 0.681 2023/10 MIT \n",
+ "105 - 0.673 2024/4 Apache 2.0 \n",
"\n",
" Organization Link \n",
"0 Microsoft https://huggingface.co/WizardLM/WizardLM-30B-V1.0 \n",
@@ -525,16 +527,16 @@
"3 AllenAI/UW https://huggingface.co/allenai/tulu-30b \n",
"4 UW https://huggingface.co/timdettmers/guanaco-65b... \n",
".. ... ... \n",
- "100 Mistral https://mistral.ai/news/mixtral-8x22b/ \n",
"101 Meta https://llama.meta.com/llama3/ \n",
"102 Meta https://llama.meta.com/llama3/ \n",
"103 Google https://blog.google/technology/ai/google-gemin... \n",
"104 Microsoft https://azure.microsoft.com/en-us/blog/introdu... \n",
+ "105 Snowflake https://www.snowflake.com/blog/arctic-open-eff... \n",
"\n",
- "[105 rows x 8 columns]"
+ "[106 rows x 8 columns]"
]
},
- "execution_count": 80,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -545,7 +547,7 @@
},
{
"cell_type": "code",
- "execution_count": 82,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -554,7 +556,7 @@
"dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])"
]
},
- "execution_count": 82,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -565,7 +567,7 @@
},
{
"cell_type": "code",
- "execution_count": 86,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -581,7 +583,7 @@
},
{
"cell_type": "code",
- "execution_count": 101,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -624,11 +626,11 @@
" \n",
" \n",
" 0 | \n",
- " 1257.399407 | \n",
- " 4.283316 | \n",
- " 1261.676224 | \n",
- " 1254.003626 | \n",
- " 30562 | \n",
+ " 1258.815279 | \n",
+ " 3.258132 | \n",
+ " 1262.796713 | \n",
+ " 1256.000508 | \n",
+ " 35931 | \n",
" 1 | \n",
" gpt-4-turbo-2024-04-09 | \n",
" GPT-4-Turbo-2024-04-09 | \n",
@@ -641,12 +643,12 @@
"
\n",
" \n",
" 1 | \n",
- " 1253.025095 | \n",
- " 2.069534 | \n",
- " 1256.111392 | \n",
- " 1250.435207 | \n",
- " 69871 | \n",
- " 1 | \n",
+ " 1252.684886 | \n",
+ " 1.799233 | \n",
+ " 1254.748391 | \n",
+ " 1249.873417 | \n",
+ " 73547 | \n",
+ " 2 | \n",
" gpt-4-1106-preview | \n",
" GPT-4-1106-preview | \n",
" 9.32 | \n",
@@ -658,11 +660,11 @@
"
\n",
" \n",
" 2 | \n",
- " 1251.114220 | \n",
- " 1.862842 | \n",
- " 1253.629093 | \n",
- " 1248.362042 | \n",
- " 75684 | \n",
+ " 1250.926206 | \n",
+ " 2.018201 | \n",
+ " 1253.851885 | \n",
+ " 1248.166034 | \n",
+ " 80997 | \n",
" 2 | \n",
" claude-3-opus-20240229 | \n",
" Claude 3 Opus | \n",
@@ -675,11 +677,11 @@
"
\n",
" \n",
" 3 | \n",
- " 1247.662508 | \n",
- " 3.263747 | \n",
- " 1251.582645 | \n",
- " 1244.380454 | \n",
- " 33723 | \n",
+ " 1249.618395 | \n",
+ " 3.233129 | \n",
+ " 1252.956497 | \n",
+ " 1246.247080 | \n",
+ " 39482 | \n",
" 2 | \n",
" gemini-1.5-pro-api-0409-preview | \n",
" Gemini 1.5 Pro API-0409-Preview | \n",
@@ -692,12 +694,12 @@
"
\n",
" \n",
" 4 | \n",
- " 1247.277052 | \n",
- " 1.923014 | \n",
- " 1249.489411 | \n",
- " 1244.340257 | \n",
- " 61924 | \n",
- " 3 | \n",
+ " 1246.777591 | \n",
+ " 1.942477 | \n",
+ " 1249.979712 | \n",
+ " 1244.305362 | \n",
+ " 67354 | \n",
+ " 2 | \n",
" gpt-4-0125-preview | \n",
" GPT-4-0125-preview | \n",
" - | \n",
@@ -708,308 +710,138 @@
" https://openai.com/blog/new-models-and-develop... | \n",
"
\n",
" \n",
- " 5 | \n",
- " 1208.505408 | \n",
- " 6.679087 | \n",
- " 1213.291358 | \n",
- " 1203.926901 | \n",
- " 12388 | \n",
- " 6 | \n",
- " bard-jan-24-gemini-pro | \n",
- " Bard (Gemini Pro) | \n",
- " - | \n",
- " - | \n",
- " Online | \n",
- " Proprietary | \n",
- " Google | \n",
- " https://bard.google.com/ | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
"
\n",
" \n",
- " 6 | \n",
- " 1207.497541 | \n",
- " 4.109466 | \n",
- " 1211.720734 | \n",
- " 1203.322762 | \n",
- " 27298 | \n",
- " 6 | \n",
- " llama-3-70b-instruct | \n",
- " Llama-3-70b-Instruct | \n",
- " - | \n",
- " 0.820 | \n",
- " 2023/12 | \n",
- " Llama 3 Community | \n",
- " Meta | \n",
- " https://llama.meta.com/llama3/ | \n",
+ " 87 | \n",
+ " 886.873429 | \n",
+ " 19.813751 | \n",
+ " 894.785321 | \n",
+ " 878.677878 | \n",
+ " 5195 | \n",
+ " 87 | \n",
+ " chatglm-6b | \n",
+ " ChatGLM-6B | \n",
+ " 4.50 | \n",
+ " 0.361 | \n",
+ " 2023/3 | \n",
+ " Non-commercial | \n",
+ " Tsinghua | \n",
+ " https://huggingface.co/THUDM/chatglm-6b | \n",
"
\n",
" \n",
- " 7 | \n",
- " 1201.671254 | \n",
- " 2.525563 | \n",
- " 1204.862512 | \n",
- " 1198.658822 | \n",
- " 75418 | \n",
- " 6 | \n",
- " claude-3-sonnet-20240229 | \n",
- " Claude 3 Sonnet | \n",
- " - | \n",
- " 0.790 | \n",
- " 2023/8 | \n",
- " Proprietary | \n",
- " Anthropic | \n",
- " https://www.anthropic.com/news/claude-3-family | \n",
+ " 88 | \n",
+ " 876.929108 | \n",
+ " 27.115855 | \n",
+ " 887.355529 | \n",
+ " 866.860534 | \n",
+ " 4521 | \n",
+ " 88 | \n",
+ " fastchat-t5-3b | \n",
+ " FastChat-T5-3B | \n",
+ " 3.04 | \n",
+ " 0.477 | \n",
+ " 2023/4 | \n",
+ " Apache 2.0 | \n",
+ " LMSYS | \n",
+ " https://huggingface.co/lmsys/fastchat-t5-3b-v1.0 | \n",
"
\n",
" \n",
- " 8 | \n",
- " 1191.684542 | \n",
- " 3.459717 | \n",
- " 1195.080256 | \n",
- " 1188.222382 | \n",
- " 41262 | \n",
- " 9 | \n",
- " command-r-plus | \n",
- " Command R+ | \n",
- " - | \n",
- " - | \n",
- " 2024/3 | \n",
- " CC-BY-NC-4.0 | \n",
- " Cohere | \n",
- " https://txt.cohere.com/command-r-plus-microsof... | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " 1188.987389 | \n",
- " 3.124792 | \n",
- " 1193.335535 | \n",
- " 1185.935928 | \n",
- " 48390 | \n",
- " 9 | \n",
- " gpt-4-0314 | \n",
- " GPT-4-0314 | \n",
- " 8.96 | \n",
- " 0.864 | \n",
- " 2021/9 | \n",
- " Proprietary | \n",
- " OpenAI | \n",
- " https://openai.com/research/gpt-4 | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " 1180.606870 | \n",
- " 3.097542 | \n",
- " 1183.825403 | \n",
- " 1177.255203 | \n",
- " 66065 | \n",
- " 11 | \n",
- " claude-3-haiku-20240307 | \n",
- " Claude 3 Haiku | \n",
- " - | \n",
- " 0.752 | \n",
- " 2023/8 | \n",
- " Proprietary | \n",
- " Anthropic | \n",
- " https://www.anthropic.com/news/claude-3-family | \n",
+ " 89 | \n",
+ " 848.932568 | \n",
+ " 36.961459 | \n",
+ " 859.103936 | \n",
+ " 837.364341 | \n",
+ " 3461 | \n",
+ " 90 | \n",
+ " stablelm-tuned-alpha-7b | \n",
+ " StableLM-Tuned-Alpha-7B | \n",
+ " 2.75 | \n",
+ " 0.244 | \n",
+ " 2023/4 | \n",
+ " CC-BY-NC-SA-4.0 | \n",
+ " Stability AI | \n",
+ " https://huggingface.co/stabilityai/stablelm-tu... | \n",
"
\n",
" \n",
- " 11 | \n",
- " 1164.896561 | \n",
- " 2.585577 | \n",
- " 1167.595696 | \n",
- " 1161.727454 | \n",
- " 67038 | \n",
- " 12 | \n",
- " gpt-4-0613 | \n",
- " GPT-4-0613 | \n",
- " 9.18 | \n",
- " - | \n",
- " 2021/9 | \n",
- " Proprietary | \n",
- " OpenAI | \n",
- " https://platform.openai.com/docs/models/gpt-4-... | \n",
+ " 90 | \n",
+ " 826.647332 | \n",
+ " 30.156414 | \n",
+ " 837.335988 | \n",
+ " 816.370788 | \n",
+ " 3666 | \n",
+ " 91 | \n",
+ " dolly-v2-12b | \n",
+ " Dolly-V2-12B | \n",
+ " 3.28 | \n",
+ " 0.257 | \n",
+ " 2023/4 | \n",
+ " MIT | \n",
+ " Databricks | \n",
+ " https://huggingface.co/databricks/dolly-v2-12b | \n",
"
\n",
" \n",
- " 12 | \n",
- " 1157.638992 | \n",
- " 2.541320 | \n",
- " 1160.496116 | \n",
- " 1154.927748 | \n",
- " 44120 | \n",
- " 13 | \n",
- " mistral-large-2402 | \n",
- " Mistral-Large-2402 | \n",
- " - | \n",
- " 0.812 | \n",
- " - | \n",
- " Proprietary | \n",
- " Mistral | \n",
- " https://mistral.ai/news/mistral-large/ | \n",
- "
\n",
- " \n",
- " 13 | \n",
- " 1153.464280 | \n",
- " 3.631512 | \n",
- " 1157.068850 | \n",
- " 1150.178903 | \n",
- " 32999 | \n",
- " 13 | \n",
- " qwen1.5-72b-chat | \n",
- " Qwen1.5-72B-Chat | \n",
- " 8.61 | \n",
- " 0.775 | \n",
- " 2024/2 | \n",
- " Qianwen LICENSE | \n",
- " Alibaba | \n",
- " https://qwenlm.github.io/blog/qwen1.5/ | \n",
- "
\n",
- " \n",
- " 14 | \n",
- " 1150.918473 | \n",
- " 9.062217 | \n",
- " 1155.969721 | \n",
- " 1145.229885 | \n",
- " 8622 | \n",
- " 13 | \n",
- " reka-flash-21b-20240226-online | \n",
- " Reka-Flash-21B-online | \n",
- " - | \n",
- " - | \n",
- " Online | \n",
- " Proprietary | \n",
- " Reka AI | \n",
- " https://docs.reka.ai/http-api.html#generation | \n",
- "
\n",
- " \n",
- " 15 | \n",
- " 1150.244313 | \n",
- " 5.551373 | \n",
- " 1154.745214 | \n",
- " 1145.496466 | \n",
- " 21768 | \n",
- " 14 | \n",
- " claude-1 | \n",
- " Claude-1 | \n",
- " 7.90 | \n",
- " 0.770 | \n",
- " - | \n",
- " Proprietary | \n",
- " Anthropic | \n",
- " https://www.anthropic.com/index/introducing-cl... | \n",
- "
\n",
- " \n",
- " 16 | \n",
- " 1149.267578 | \n",
- " 11.452272 | \n",
- " 1154.290155 | \n",
- " 1141.931621 | \n",
- " 9059 | \n",
- " 14 | \n",
- " reka-flash-21b-20240226 | \n",
- " Reka-Flash-21B | \n",
- " - | \n",
- " 0.735 | \n",
- " 2023/11 | \n",
- " Proprietary | \n",
- " Reka AI | \n",
- " https://www.reka.ai/news/reka-flash-efficient-... | \n",
- "
\n",
- " \n",
- " 17 | \n",
- " 1148.072155 | \n",
- " 3.071222 | \n",
- " 1151.980865 | \n",
- " 1144.992044 | \n",
- " 37413 | \n",
- " 14 | \n",
- " command-r | \n",
- " Command R | \n",
- " - | \n",
- " - | \n",
- " 2024/3 | \n",
- " CC-BY-NC-4.0 | \n",
- " Cohere | \n",
- " https://txt.cohere.com/command-r | \n",
- "
\n",
- " \n",
- " 18 | \n",
- " 1147.668325 | \n",
- " 3.542229 | \n",
- " 1150.726489 | \n",
- " 1143.868385 | \n",
- " 32738 | \n",
- " 14 | \n",
- " mistral-medium | \n",
- " Mistral Medium | \n",
- " 8.61 | \n",
- " 0.753 | \n",
- " - | \n",
- " Proprietary | \n",
- " Mistral | \n",
- " https://mistral.ai/news/la-plateforme/ | \n",
- "
\n",
- " \n",
- " 19 | \n",
- " 1147.473989 | \n",
- " 5.789710 | \n",
- " 1151.989352 | \n",
- " 1143.322918 | \n",
- " 17214 | \n",
- " 14 | \n",
- " mixtral-8x22b-instruct-v0.1 | \n",
- " Mixtral-8x22b-Instruct-v0.1 | \n",
- " - | \n",
- " 0.778 | \n",
- " 2024/4 | \n",
- " Apache 2.0 | \n",
- " Mistral | \n",
- " https://mistral.ai/news/mixtral-8x22b/ | \n",
+ " 91 | \n",
+ " 804.356329 | \n",
+ " 44.756983 | \n",
+ " 815.161492 | \n",
+ " 790.879536 | \n",
+ " 2538 | \n",
+ " 92 | \n",
+ " llama-13b | \n",
+ " LLaMA-13B | \n",
+ " 2.61 | \n",
+ " 0.470 | \n",
+ " 2023/2 | \n",
+ " Non-commercial | \n",
+ " Meta | \n",
+ " https://arxiv.org/abs/2302.13971 | \n",
"
\n",
" \n",
"\n",
+ "92 rows × 14 columns
\n",
""
],
"text/plain": [
" rating variance rating_q975 rating_q025 num_battles \\\n",
- "0 1257.399407 4.283316 1261.676224 1254.003626 30562 \n",
- "1 1253.025095 2.069534 1256.111392 1250.435207 69871 \n",
- "2 1251.114220 1.862842 1253.629093 1248.362042 75684 \n",
- "3 1247.662508 3.263747 1251.582645 1244.380454 33723 \n",
- "4 1247.277052 1.923014 1249.489411 1244.340257 61924 \n",
- "5 1208.505408 6.679087 1213.291358 1203.926901 12388 \n",
- "6 1207.497541 4.109466 1211.720734 1203.322762 27298 \n",
- "7 1201.671254 2.525563 1204.862512 1198.658822 75418 \n",
- "8 1191.684542 3.459717 1195.080256 1188.222382 41262 \n",
- "9 1188.987389 3.124792 1193.335535 1185.935928 48390 \n",
- "10 1180.606870 3.097542 1183.825403 1177.255203 66065 \n",
- "11 1164.896561 2.585577 1167.595696 1161.727454 67038 \n",
- "12 1157.638992 2.541320 1160.496116 1154.927748 44120 \n",
- "13 1153.464280 3.631512 1157.068850 1150.178903 32999 \n",
- "14 1150.918473 9.062217 1155.969721 1145.229885 8622 \n",
- "15 1150.244313 5.551373 1154.745214 1145.496466 21768 \n",
- "16 1149.267578 11.452272 1154.290155 1141.931621 9059 \n",
- "17 1148.072155 3.071222 1151.980865 1144.992044 37413 \n",
- "18 1147.668325 3.542229 1150.726489 1143.868385 32738 \n",
- "19 1147.473989 5.789710 1151.989352 1143.322918 17214 \n",
+ "0 1258.815279 3.258132 1262.796713 1256.000508 35931 \n",
+ "1 1252.684886 1.799233 1254.748391 1249.873417 73547 \n",
+ "2 1250.926206 2.018201 1253.851885 1248.166034 80997 \n",
+ "3 1249.618395 3.233129 1252.956497 1246.247080 39482 \n",
+ "4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n",
+ ".. ... ... ... ... ... \n",
+ "87 886.873429 19.813751 894.785321 878.677878 5195 \n",
+ "88 876.929108 27.115855 887.355529 866.860534 4521 \n",
+ "89 848.932568 36.961459 859.103936 837.364341 3461 \n",
+ "90 826.647332 30.156414 837.335988 816.370788 3666 \n",
+ "91 804.356329 44.756983 815.161492 790.879536 2538 \n",
"\n",
" final_ranking key \\\n",
"0 1 gpt-4-turbo-2024-04-09 \n",
- "1 1 gpt-4-1106-preview \n",
+ "1 2 gpt-4-1106-preview \n",
"2 2 claude-3-opus-20240229 \n",
"3 2 gemini-1.5-pro-api-0409-preview \n",
- "4 3 gpt-4-0125-preview \n",
- "5 6 bard-jan-24-gemini-pro \n",
- "6 6 llama-3-70b-instruct \n",
- "7 6 claude-3-sonnet-20240229 \n",
- "8 9 command-r-plus \n",
- "9 9 gpt-4-0314 \n",
- "10 11 claude-3-haiku-20240307 \n",
- "11 12 gpt-4-0613 \n",
- "12 13 mistral-large-2402 \n",
- "13 13 qwen1.5-72b-chat \n",
- "14 13 reka-flash-21b-20240226-online \n",
- "15 14 claude-1 \n",
- "16 14 reka-flash-21b-20240226 \n",
- "17 14 command-r \n",
- "18 14 mistral-medium \n",
- "19 14 mixtral-8x22b-instruct-v0.1 \n",
+ "4 2 gpt-4-0125-preview \n",
+ ".. ... ... \n",
+ "87 87 chatglm-6b \n",
+ "88 88 fastchat-t5-3b \n",
+ "89 90 stablelm-tuned-alpha-7b \n",
+ "90 91 dolly-v2-12b \n",
+ "91 92 llama-13b \n",
"\n",
" Model MT-bench (score) MMLU \\\n",
"0 GPT-4-Turbo-2024-04-09 - - \n",
@@ -1017,43 +849,25 @@
"2 Claude 3 Opus - 0.868 \n",
"3 Gemini 1.5 Pro API-0409-Preview - 0.819 \n",
"4 GPT-4-0125-preview - - \n",
- "5 Bard (Gemini Pro) - - \n",
- "6 Llama-3-70b-Instruct - 0.820 \n",
- "7 Claude 3 Sonnet - 0.790 \n",
- "8 Command R+ - - \n",
- "9 GPT-4-0314 8.96 0.864 \n",
- "10 Claude 3 Haiku - 0.752 \n",
- "11 GPT-4-0613 9.18 - \n",
- "12 Mistral-Large-2402 - 0.812 \n",
- "13 Qwen1.5-72B-Chat 8.61 0.775 \n",
- "14 Reka-Flash-21B-online - - \n",
- "15 Claude-1 7.90 0.770 \n",
- "16 Reka-Flash-21B - 0.735 \n",
- "17 Command R - - \n",
- "18 Mistral Medium 8.61 0.753 \n",
- "19 Mixtral-8x22b-Instruct-v0.1 - 0.778 \n",
+ ".. ... ... ... \n",
+ "87 ChatGLM-6B 4.50 0.361 \n",
+ "88 FastChat-T5-3B 3.04 0.477 \n",
+ "89 StableLM-Tuned-Alpha-7B 2.75 0.244 \n",
+ "90 Dolly-V2-12B 3.28 0.257 \n",
+ "91 LLaMA-13B 2.61 0.470 \n",
"\n",
- " Knowledge cutoff date License Organization \\\n",
- "0 2023/12 Proprietary OpenAI \n",
- "1 2023/4 Proprietary OpenAI \n",
- "2 2023/8 Proprietary Anthropic \n",
- "3 2023/11 Proprietary Google \n",
- "4 2023/12 Proprietary OpenAI \n",
- "5 Online Proprietary Google \n",
- "6 2023/12 Llama 3 Community Meta \n",
- "7 2023/8 Proprietary Anthropic \n",
- "8 2024/3 CC-BY-NC-4.0 Cohere \n",
- "9 2021/9 Proprietary OpenAI \n",
- "10 2023/8 Proprietary Anthropic \n",
- "11 2021/9 Proprietary OpenAI \n",
- "12 - Proprietary Mistral \n",
- "13 2024/2 Qianwen LICENSE Alibaba \n",
- "14 Online Proprietary Reka AI \n",
- "15 - Proprietary Anthropic \n",
- "16 2023/11 Proprietary Reka AI \n",
- "17 2024/3 CC-BY-NC-4.0 Cohere \n",
- "18 - Proprietary Mistral \n",
- "19 2024/4 Apache 2.0 Mistral \n",
+ " Knowledge cutoff date License Organization \\\n",
+ "0 2023/12 Proprietary OpenAI \n",
+ "1 2023/4 Proprietary OpenAI \n",
+ "2 2023/8 Proprietary Anthropic \n",
+ "3 2023/11 Proprietary Google \n",
+ "4 2023/12 Proprietary OpenAI \n",
+ ".. ... ... ... \n",
+ "87 2023/3 Non-commercial Tsinghua \n",
+ "88 2023/4 Apache 2.0 LMSYS \n",
+ "89 2023/4 CC-BY-NC-SA-4.0 Stability AI \n",
+ "90 2023/4 MIT Databricks \n",
+ "91 2023/2 Non-commercial Meta \n",
"\n",
" Link \n",
"0 https://platform.openai.com/docs/models/gpt-4-... \n",
@@ -1061,30 +875,23 @@
"2 https://www.anthropic.com/news/claude-3-family \n",
"3 https://blog.google/technology/ai/google-gemin... \n",
"4 https://openai.com/blog/new-models-and-develop... \n",
- "5 https://bard.google.com/ \n",
- "6 https://llama.meta.com/llama3/ \n",
- "7 https://www.anthropic.com/news/claude-3-family \n",
- "8 https://txt.cohere.com/command-r-plus-microsof... \n",
- "9 https://openai.com/research/gpt-4 \n",
- "10 https://www.anthropic.com/news/claude-3-family \n",
- "11 https://platform.openai.com/docs/models/gpt-4-... \n",
- "12 https://mistral.ai/news/mistral-large/ \n",
- "13 https://qwenlm.github.io/blog/qwen1.5/ \n",
- "14 https://docs.reka.ai/http-api.html#generation \n",
- "15 https://www.anthropic.com/index/introducing-cl... \n",
- "16 https://www.reka.ai/news/reka-flash-efficient-... \n",
- "17 https://txt.cohere.com/command-r \n",
- "18 https://mistral.ai/news/la-plateforme/ \n",
- "19 https://mistral.ai/news/mixtral-8x22b/ "
+ ".. ... \n",
+ "87 https://huggingface.co/THUDM/chatglm-6b \n",
+ "88 https://huggingface.co/lmsys/fastchat-t5-3b-v1.0 \n",
+ "89 https://huggingface.co/stabilityai/stablelm-tu... \n",
+ "90 https://huggingface.co/databricks/dolly-v2-12b \n",
+ "91 https://arxiv.org/abs/2302.13971 \n",
+ "\n",
+ "[92 rows x 14 columns]"
]
},
- "execution_count": 101,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "merged_dfs[\"Overall\"][:20]"
+ "merged_dfs[\"Overall\"]"
]
},
{
@@ -1096,7 +903,7 @@
},
{
"cell_type": "code",
- "execution_count": 113,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -1106,213 +913,2653 @@
},
{
"cell_type": "code",
- "execution_count": 120,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
- "t.to_json(\"release_date_mapping.json\", orient=\"records\", lines=True)"
+ "release_date_mapping = pd.read_json(\"release_date_mapping.json\", orient=\"records\")"
]
},
{
"cell_type": "code",
- "execution_count": 119,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key | \n",
+ " Model | \n",
+ " Release Date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " gpt-4-turbo-2024-04-09 | \n",
+ " GPT-4-Turbo-2024-04-09 | \n",
+ " 2024-04-09 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " gpt-4-1106-preview | \n",
+ " GPT-4-1106-preview | \n",
+ " 2023-11-06 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " claude-3-opus-20240229 | \n",
+ " Claude 3 Opus | \n",
+ " 2024-02-29 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " gemini-1.5-pro-api-0409-preview | \n",
+ " Gemini 1.5 Pro API-0409-Preview | \n",
+ " 2024-04-09 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " gpt-4-0125-preview | \n",
+ " GPT-4-0125-preview | \n",
+ " 2024-01-25 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 86 | \n",
+ " chatglm-6b | \n",
+ " ChatGLM-6B | \n",
+ " 2023-03-13 | \n",
+ "
\n",
+ " \n",
+ " 87 | \n",
+ " fastchat-t5-3b | \n",
+ " FastChat-T5-3B | \n",
+ " 2023-04-27 | \n",
+ "
\n",
+ " \n",
+ " 88 | \n",
+ " stablelm-tuned-alpha-7b | \n",
+ " StableLM-Tuned-Alpha-7B | \n",
+ " 2023-04-19 | \n",
+ "
\n",
+ " \n",
+ " 89 | \n",
+ " dolly-v2-12b | \n",
+ " Dolly-V2-12B | \n",
+ " 2023-04-12 | \n",
+ "
\n",
+ " \n",
+ " 90 | \n",
+ " llama-13b | \n",
+ " LLaMA-13B | \n",
+ " 2023-02-27 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
91 rows × 3 columns
\n",
+ "
"
+ ],
"text/plain": [
- "[{'key': 'gpt-4-turbo-2024-04-09',\n",
- " 'Model': 'GPT-4-Turbo-2024-04-09',\n",
- " 'Release Date': ''},\n",
- " {'key': 'gpt-4-1106-preview',\n",
- " 'Model': 'GPT-4-1106-preview',\n",
- " 'Release Date': ''},\n",
- " {'key': 'claude-3-opus-20240229',\n",
- " 'Model': 'Claude 3 Opus',\n",
- " 'Release Date': ''},\n",
- " {'key': 'gemini-1.5-pro-api-0409-preview',\n",
- " 'Model': 'Gemini 1.5 Pro API-0409-Preview',\n",
- " 'Release Date': ''},\n",
- " {'key': 'gpt-4-0125-preview',\n",
- " 'Model': 'GPT-4-0125-preview',\n",
- " 'Release Date': ''},\n",
- " {'key': 'bard-jan-24-gemini-pro',\n",
- " 'Model': 'Bard (Gemini Pro)',\n",
- " 'Release Date': ''},\n",
- " {'key': 'llama-3-70b-instruct',\n",
- " 'Model': 'Llama-3-70b-Instruct',\n",
- " 'Release Date': ''},\n",
- " {'key': 'claude-3-sonnet-20240229',\n",
- " 'Model': 'Claude 3 Sonnet',\n",
- " 'Release Date': ''},\n",
- " {'key': 'command-r-plus', 'Model': 'Command R+', 'Release Date': ''},\n",
- " {'key': 'gpt-4-0314', 'Model': 'GPT-4-0314', 'Release Date': ''},\n",
- " {'key': 'claude-3-haiku-20240307',\n",
- " 'Model': 'Claude 3 Haiku',\n",
- " 'Release Date': ''},\n",
- " {'key': 'gpt-4-0613', 'Model': 'GPT-4-0613', 'Release Date': ''},\n",
- " {'key': 'mistral-large-2402',\n",
- " 'Model': 'Mistral-Large-2402',\n",
- " 'Release Date': ''},\n",
- " {'key': 'qwen1.5-72b-chat', 'Model': 'Qwen1.5-72B-Chat', 'Release Date': ''},\n",
- " {'key': 'reka-flash-21b-20240226-online',\n",
- " 'Model': 'Reka-Flash-21B-online',\n",
- " 'Release Date': ''},\n",
- " {'key': 'claude-1', 'Model': 'Claude-1', 'Release Date': ''},\n",
- " {'key': 'reka-flash-21b-20240226',\n",
- " 'Model': 'Reka-Flash-21B',\n",
- " 'Release Date': ''},\n",
- " {'key': 'command-r', 'Model': 'Command R', 'Release Date': ''},\n",
- " {'key': 'mistral-medium', 'Model': 'Mistral Medium', 'Release Date': ''},\n",
- " {'key': 'mixtral-8x22b-instruct-v0.1',\n",
- " 'Model': 'Mixtral-8x22b-Instruct-v0.1',\n",
- " 'Release Date': ''},\n",
- " {'key': 'llama-3-8b-instruct',\n",
- " 'Model': 'Llama-3-8b-Instruct',\n",
- " 'Release Date': ''},\n",
- " {'key': 'gemini-pro-dev-api',\n",
- " 'Model': 'Gemini Pro (Dev API)',\n",
- " 'Release Date': ''},\n",
- " {'key': 'qwen1.5-32b-chat', 'Model': 'Qwen1.5-32B-Chat', 'Release Date': ''},\n",
- " {'key': 'claude-2.0', 'Model': 'Claude-2.0', 'Release Date': ''},\n",
- " {'key': 'mistral-next', 'Model': 'Mistral-Next', 'Release Date': ''},\n",
- " {'key': 'zephyr-orpo-141b-A35b-v0.1',\n",
- " 'Model': 'Zephyr-ORPO-141b-A35b-v0.1',\n",
- " 'Release Date': ''},\n",
- " {'key': 'gpt-3.5-turbo-0613',\n",
- " 'Model': 'GPT-3.5-Turbo-0613',\n",
- " 'Release Date': ''},\n",
- " {'key': 'claude-2.1', 'Model': 'Claude-2.1', 'Release Date': ''},\n",
- " {'key': 'qwen1.5-14b-chat', 'Model': 'Qwen1.5-14B-Chat', 'Release Date': ''},\n",
- " {'key': 'starling-lm-7b-beta',\n",
- " 'Model': 'Starling-LM-7B-beta',\n",
- " 'Release Date': ''},\n",
- " {'key': 'gemini-pro', 'Model': 'Gemini Pro', 'Release Date': ''},\n",
- " {'key': 'mixtral-8x7b-instruct-v0.1',\n",
- " 'Model': 'Mixtral-8x7b-Instruct-v0.1',\n",
- " 'Release Date': ''},\n",
- " {'key': 'claude-instant-1', 'Model': 'Claude-Instant-1', 'Release Date': ''},\n",
- " {'key': 'yi-34b-chat', 'Model': 'Yi-34B-Chat', 'Release Date': ''},\n",
- " {'key': 'gpt-3.5-turbo-0314',\n",
- " 'Model': 'GPT-3.5-Turbo-0314',\n",
- " 'Release Date': ''},\n",
- " {'key': 'wizardlm-70b', 'Model': 'WizardLM-70B-v1.0', 'Release Date': ''},\n",
- " {'key': 'gpt-3.5-turbo-0125',\n",
- " 'Model': 'GPT-3.5-Turbo-0125',\n",
- " 'Release Date': ''},\n",
- " {'key': 'tulu-2-dpo-70b', 'Model': 'Tulu-2-DPO-70B', 'Release Date': ''},\n",
- " {'key': 'dbrx-instruct-preview',\n",
- " 'Model': 'DBRX-Instruct-Preview',\n",
- " 'Release Date': ''},\n",
- " {'key': 'openchat-3.5-0106',\n",
- " 'Model': 'OpenChat-3.5-0106',\n",
- " 'Release Date': ''},\n",
- " {'key': 'vicuna-33b', 'Model': 'Vicuna-33B', 'Release Date': ''},\n",
- " {'key': 'starling-lm-7b-alpha',\n",
- " 'Model': 'Starling-LM-7B-alpha',\n",
- " 'Release Date': ''},\n",
- " {'key': 'llama-2-70b-chat', 'Model': 'Llama-2-70b-chat', 'Release Date': ''},\n",
- " {'key': 'nous-hermes-2-mixtral-8x7b-dpo',\n",
- " 'Model': 'Nous-Hermes-2-Mixtral-8x7B-DPO',\n",
- " 'Release Date': ''},\n",
- " {'key': 'gemma-1.1-7b-it', 'Model': 'Gemma-1.1-7B-it', 'Release Date': ''},\n",
- " {'key': 'llama2-70b-steerlm-chat',\n",
- " 'Model': 'NV-Llama2-70B-SteerLM-Chat',\n",
- " 'Release Date': ''},\n",
- " {'key': 'deepseek-llm-67b-chat',\n",
- " 'Model': 'DeepSeek-LLM-67B-Chat',\n",
- " 'Release Date': ''},\n",
- " {'key': 'openhermes-2.5-mistral-7b',\n",
- " 'Model': 'OpenHermes-2.5-Mistral-7b',\n",
- " 'Release Date': ''},\n",
- " {'key': 'openchat-3.5', 'Model': 'OpenChat-3.5', 'Release Date': ''},\n",
- " {'key': 'pplx-70b-online', 'Model': 'pplx-70b-online', 'Release Date': ''},\n",
- " {'key': 'mistral-7b-instruct-v0.2',\n",
- " 'Model': 'Mistral-7B-Instruct-v0.2',\n",
- " 'Release Date': ''},\n",
- " {'key': 'qwen1.5-7b-chat', 'Model': 'Qwen1.5-7B-Chat', 'Release Date': ''},\n",
- " {'key': 'gpt-3.5-turbo-1106',\n",
- " 'Model': 'GPT-3.5-Turbo-1106',\n",
- " 'Release Date': ''},\n",
- " {'key': 'dolphin-2.2.1-mistral-7b',\n",
- " 'Model': 'Dolphin-2.2.1-Mistral-7B',\n",
- " 'Release Date': ''},\n",
- " {'key': 'solar-10.7b-instruct-v1.0',\n",
- " 'Model': 'SOLAR-10.7B-Instruct-v1.0',\n",
- " 'Release Date': ''},\n",
- " {'key': 'phi-3-mini-128k-instruct',\n",
- " 'Model': 'Phi-3-Mini-128k-Instruct',\n",
- " 'Release Date': ''},\n",
- " {'key': 'wizardlm-13b', 'Model': 'WizardLM-13b-v1.2', 'Release Date': ''},\n",
- " {'key': 'llama-2-13b-chat', 'Model': 'Llama-2-13b-chat', 'Release Date': ''},\n",
- " {'key': 'zephyr-7b-beta', 'Model': 'Zephyr-7b-beta', 'Release Date': ''},\n",
- " {'key': 'codellama-70b-instruct',\n",
- " 'Model': 'CodeLlama-70B-instruct',\n",
- " 'Release Date': ''},\n",
- " {'key': 'mpt-30b-chat', 'Model': 'MPT-30B-chat', 'Release Date': ''},\n",
- " {'key': 'vicuna-13b', 'Model': 'Vicuna-13B', 'Release Date': ''},\n",
- " {'key': 'codellama-34b-instruct',\n",
- " 'Model': 'CodeLlama-34B-instruct',\n",
- " 'Release Date': ''},\n",
- " {'key': 'gemma-7b-it', 'Model': 'Gemma-7B-it', 'Release Date': ''},\n",
- " {'key': 'pplx-7b-online', 'Model': 'pplx-7b-online', 'Release Date': ''},\n",
- " {'key': 'zephyr-7b-alpha', 'Model': 'Zephyr-7b-alpha', 'Release Date': ''},\n",
- " {'key': 'llama-2-7b-chat', 'Model': 'Llama-2-7b-chat', 'Release Date': ''},\n",
- " {'key': 'qwen-14b-chat', 'Model': 'Qwen-14B-Chat', 'Release Date': ''},\n",
- " {'key': 'falcon-180b-chat', 'Model': 'falcon-180b-chat', 'Release Date': ''},\n",
- " {'key': 'guanaco-33b', 'Model': 'Guanaco-33B', 'Release Date': ''},\n",
- " {'key': 'stripedhyena-nous-7b',\n",
- " 'Model': 'StripedHyena-Nous-7B',\n",
- " 'Release Date': ''},\n",
- " {'key': 'olmo-7b-instruct', 'Model': 'OLMo-7B-instruct', 'Release Date': ''},\n",
- " {'key': 'gemma-1.1-2b-it', 'Model': 'Gemma-1.1-2B-it', 'Release Date': ''},\n",
- " {'key': 'mistral-7b-instruct',\n",
- " 'Model': 'Mistral-7B-Instruct-v0.1',\n",
- " 'Release Date': ''},\n",
- " {'key': 'palm-2', 'Model': 'PaLM-Chat-Bison-001', 'Release Date': ''},\n",
- " {'key': 'vicuna-7b', 'Model': 'Vicuna-7B', 'Release Date': ''},\n",
- " {'key': 'qwen1.5-4b-chat', 'Model': 'Qwen1.5-4B-Chat', 'Release Date': ''},\n",
- " {'key': 'gemma-2b-it', 'Model': 'Gemma-2B-it', 'Release Date': ''},\n",
- " {'key': 'koala-13b', 'Model': 'Koala-13B', 'Release Date': ''},\n",
- " {'key': 'chatglm3-6b', 'Model': 'ChatGLM3-6B', 'Release Date': ''},\n",
- " {'key': 'gpt4all-13b-snoozy',\n",
- " 'Model': 'GPT4All-13B-Snoozy',\n",
- " 'Release Date': ''},\n",
- " {'key': 'chatglm2-6b', 'Model': 'ChatGLM2-6B', 'Release Date': ''},\n",
- " {'key': 'mpt-7b-chat', 'Model': 'MPT-7B-Chat', 'Release Date': ''},\n",
- " {'key': 'RWKV-4-Raven-14B', 'Model': 'RWKV-4-Raven-14B', 'Release Date': ''},\n",
- " {'key': 'alpaca-13b', 'Model': 'Alpaca-13B', 'Release Date': ''},\n",
- " {'key': 'oasst-pythia-12b',\n",
- " 'Model': 'OpenAssistant-Pythia-12B',\n",
- " 'Release Date': ''},\n",
- " {'key': 'chatglm-6b', 'Model': 'ChatGLM-6B', 'Release Date': ''},\n",
- " {'key': 'fastchat-t5-3b', 'Model': 'FastChat-T5-3B', 'Release Date': ''},\n",
- " {'key': 'stablelm-tuned-alpha-7b',\n",
- " 'Model': 'StableLM-Tuned-Alpha-7B',\n",
- " 'Release Date': ''},\n",
- " {'key': 'dolly-v2-12b', 'Model': 'Dolly-V2-12B', 'Release Date': ''},\n",
- " {'key': 'llama-13b', 'Model': 'LLaMA-13B', 'Release Date': ''}]"
+ " key Model \\\n",
+ "0 gpt-4-turbo-2024-04-09 GPT-4-Turbo-2024-04-09 \n",
+ "1 gpt-4-1106-preview GPT-4-1106-preview \n",
+ "2 claude-3-opus-20240229 Claude 3 Opus \n",
+ "3 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n",
+ "4 gpt-4-0125-preview GPT-4-0125-preview \n",
+ ".. ... ... \n",
+ "86 chatglm-6b ChatGLM-6B \n",
+ "87 fastchat-t5-3b FastChat-T5-3B \n",
+ "88 stablelm-tuned-alpha-7b StableLM-Tuned-Alpha-7B \n",
+ "89 dolly-v2-12b Dolly-V2-12B \n",
+ "90 llama-13b LLaMA-13B \n",
+ "\n",
+ " Release Date \n",
+ "0 2024-04-09 \n",
+ "1 2023-11-06 \n",
+ "2 2024-02-29 \n",
+ "3 2024-04-09 \n",
+ "4 2024-01-25 \n",
+ ".. ... \n",
+ "86 2023-03-13 \n",
+ "87 2023-04-27 \n",
+ "88 2023-04-19 \n",
+ "89 2023-04-12 \n",
+ "90 2023-02-27 \n",
+ "\n",
+ "[91 rows x 3 columns]"
]
},
- "execution_count": 119,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "t.to_dict(orient=\"records\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Build plot"
+ "release_date_mapping"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 15,
"metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key | \n",
+ " Release Date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " gpt-4-turbo-2024-04-09 | \n",
+ " 2024-04-09 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " gpt-4-1106-preview | \n",
+ " 2023-11-06 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " claude-3-opus-20240229 | \n",
+ " 2024-02-29 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " gemini-1.5-pro-api-0409-preview | \n",
+ " 2024-04-09 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " gpt-4-0125-preview | \n",
+ " 2024-01-25 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 86 | \n",
+ " chatglm-6b | \n",
+ " 2023-03-13 | \n",
+ "
\n",
+ " \n",
+ " 87 | \n",
+ " fastchat-t5-3b | \n",
+ " 2023-04-27 | \n",
+ "
\n",
+ " \n",
+ " 88 | \n",
+ " stablelm-tuned-alpha-7b | \n",
+ " 2023-04-19 | \n",
+ "
\n",
+ " \n",
+ " 89 | \n",
+ " dolly-v2-12b | \n",
+ " 2023-04-12 | \n",
+ "
\n",
+ " \n",
+ " 90 | \n",
+ " llama-13b | \n",
+ " 2023-02-27 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
91 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " key Release Date\n",
+ "0 gpt-4-turbo-2024-04-09 2024-04-09\n",
+ "1 gpt-4-1106-preview 2023-11-06\n",
+ "2 claude-3-opus-20240229 2024-02-29\n",
+ "3 gemini-1.5-pro-api-0409-preview 2024-04-09\n",
+ "4 gpt-4-0125-preview 2024-01-25\n",
+ ".. ... ...\n",
+ "86 chatglm-6b 2023-03-13\n",
+ "87 fastchat-t5-3b 2023-04-27\n",
+ "88 stablelm-tuned-alpha-7b 2023-04-19\n",
+ "89 dolly-v2-12b 2023-04-12\n",
+ "90 llama-13b 2023-02-27\n",
+ "\n",
+ "[91 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "release_date_mapping[[\"key\", \"Release Date\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Add release dates into the merged data, one frame per leaderboard category.\n",
+ "# NOTE(review): pd.merge defaults to an inner join, so any model whose key is\n",
+ "# missing from release_date_mapping is dropped silently - confirm this is intended.\n",
+ "release_dates = release_date_mapping[[\"key\", \"Release Date\"]]\n",
+ "for k, v in merged_dfs.items():\n",
+ "    # Drop a \"Release Date\" column left by a previous run of this cell; otherwise\n",
+ "    # re-running would produce \"Release Date_x\" / \"Release Date_y\" columns.\n",
+ "    merged_dfs[k] = pd.merge(\n",
+ "        v.drop(columns=\"Release Date\", errors=\"ignore\"), release_dates, on=\"key\"\n",
+ "    )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',\n",
+ " 'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',\n",
+ " 'Knowledge cutoff date', 'License', 'Organization', 'Link',\n",
+ " 'Release Date'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_dfs[\"Overall\"].columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def format_data(df):\n",
+ " df[\"License\"] = df[\"License\"].apply(\n",
+ " lambda x: \"Proprietary LLM\" if x in PROPRIETARY_LICENSES else \"Open LLM\"\n",
+ " )\n",
+ " df[\"Release Date\"] = pd.to_datetime(df[\"Release Date\"])\n",
+ " df[\"Month-Year\"] = df[\"Release Date\"].dt.to_period(\"M\")\n",
+ " df[\"rating\"] = df[\"rating\"].round()\n",
+ " return df.reset_index(drop=True)\n",
+ "\n",
+ "\n",
+ "merged_dfs2 = {k: format_data(v) for k, v in merged_dfs.items()}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "5\n",
+ "5\n",
+ "5\n",
+ "5\n",
+ "5\n",
+ "5\n",
+ "5\n",
+ "5\n",
+ "5\n"
+ ]
+ }
+ ],
+ "source": [
+ "# For every category, print the size of the largest (Release Date, License) group,\n",
+ "# followed by a blank line.\n",
+ "# NOTE(review): this groups by the exact release date; if a per-month figure is\n",
+ "# wanted, group by \"Month-Year\" instead - confirm.\n",
+ "for k, df in merged_dfs2.items():\n",
+ "    print(int(df.groupby([\"Release Date\", \"License\"])[\"rating\"].size().max()))\n",
+ "    print()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Build plot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t = {\n",
+ " \"Overall\": {\n",
+ " \"min_elo_score\": 804.0,\n",
+ " \"max_elo_score\": 1259.0,\n",
+ " \"upper_models_per_month\": 5,\n",
+ " },\n",
+ " \"Coding\": {\n",
+ " \"min_elo_score\": 672.0,\n",
+ " \"max_elo_score\": 1270.0,\n",
+ " \"upper_models_per_month\": 5,\n",
+ " },\n",
+ " \"Longer Query\": {\n",
+ " \"min_elo_score\": 796.0,\n",
+ " \"max_elo_score\": 1273.0,\n",
+ " \"upper_models_per_month\": 5,\n",
+ " },\n",
+ " \"English\": {\n",
+ " \"min_elo_score\": 783.0,\n",
+ " \"max_elo_score\": 1246.0,\n",
+ " \"upper_models_per_month\": 5,\n",
+ " },\n",
+ " \"Chinese\": {\n",
+ " \"min_elo_score\": 753.0,\n",
+ " \"max_elo_score\": 1325.0,\n",
+ " \"upper_models_per_month\": 5,\n",
+ " },\n",
+ " \"French\": {\n",
+ " \"min_elo_score\": 694.0,\n",
+ " \"max_elo_score\": 1268.0,\n",
+ " \"upper_models_per_month\": 5,\n",
+ " },\n",
+ " \"Exclude Ties\": {\n",
+ " \"min_elo_score\": 654.0,\n",
+ " \"max_elo_score\": 1334.0,\n",
+ " \"upper_models_per_month\": 5,\n",
+ " },\n",
+ " \"Exclude Short Query (< 5 tokens)\": {\n",
+ " \"min_elo_score\": 796.0,\n",
+ " \"max_elo_score\": 1264.0,\n",
+ " \"upper_models_per_month\": 5,\n",
+ " },\n",
+ " \"Exclude Refusal\": {\n",
+ " \"min_elo_score\": 795.0,\n",
+ " \"max_elo_score\": 1264.0,\n",
+ " \"upper_models_per_month\": 5,\n",
+ " },\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "o = {\n",
+ " \"min_elo_score\": ,\n",
+ " \"max_elo_score\": ,\n",
+ " \"upper_models_per_month\": ,\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# License values that count as proprietary; everything else is treated as open.\n",
+ "PROPRIETARY_LICENSES = [\n",
+ "    \"Proprietary\",\n",
+ "    \"Non-commercial\",\n",
+ "]\n",
+ "\n",
+ "# Work on a copy so this cell never rewrites merged_dfs[\"Overall\"] in place.\n",
+ "df = merged_dfs[\"Overall\"].copy()\n",
+ "# Rows already labelled \"Proprietary LLM\" (by an earlier run or another cell) must\n",
+ "# stay proprietary; without this a re-run would relabel every row as \"Open LLM\".\n",
+ "df[\"License\"] = (\n",
+ "    df[\"License\"]\n",
+ "    .isin([*PROPRIETARY_LICENSES, \"Proprietary LLM\"])\n",
+ "    .map({True: \"Proprietary LLM\", False: \"Open LLM\"})\n",
+ ")\n",
+ "df[\"Release Date\"] = pd.to_datetime(df[\"Release Date\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df[\"Month-Year\"] = df[\"Release Date\"].dt.to_period(\"M\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Largest number of non-null ratings in any (Month-Year, License) group.\n",
+ "df.groupby([\"Month-Year\", \"License\"])[\"rating\"].count().max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " rating | \n",
+ " variance | \n",
+ " rating_q975 | \n",
+ " rating_q025 | \n",
+ " num_battles | \n",
+ " final_ranking | \n",
+ " key | \n",
+ " Model | \n",
+ " MT-bench (score) | \n",
+ " MMLU | \n",
+ " Knowledge cutoff date | \n",
+ " License | \n",
+ " Organization | \n",
+ " Link | \n",
+ " Release Date | \n",
+ " license_binary | \n",
+ " Month-Year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 4 | \n",
+ " 1246.777591 | \n",
+ " 1.942477 | \n",
+ " 1249.979712 | \n",
+ " 1244.305362 | \n",
+ " 67354 | \n",
+ " 2 | \n",
+ " gpt-4-0125-preview | \n",
+ " GPT-4-0125-preview | \n",
+ " - | \n",
+ " - | \n",
+ " 2023/12 | \n",
+ " Proprietary LLM | \n",
+ " OpenAI | \n",
+ " https://openai.com/blog/new-models-and-develop... | \n",
+ " 2024-01-25 | \n",
+ " Proprietary LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ " 32 | \n",
+ " 1111.132640 | \n",
+ " 7.801741 | \n",
+ " 1115.356993 | \n",
+ " 1105.658254 | \n",
+ " 13177 | \n",
+ " 29 | \n",
+ " yi-34b-chat | \n",
+ " Yi-34B-Chat | \n",
+ " - | \n",
+ " 0.735 | \n",
+ " 2023/6 | \n",
+ " Open LLM | \n",
+ " 01 AI | \n",
+ " https://huggingface.co/01-ai/Yi-34B-Chat | \n",
+ " 2024-01-23 | \n",
+ " Open LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " 1107.129810 | \n",
+ " 2.419182 | \n",
+ " 1110.056188 | \n",
+ " 1104.002581 | \n",
+ " 47220 | \n",
+ " 32 | \n",
+ " gpt-3.5-turbo-0125 | \n",
+ " GPT-3.5-Turbo-0125 | \n",
+ " - | \n",
+ " - | \n",
+ " 2021/9 | \n",
+ " Proprietary LLM | \n",
+ " OpenAI | \n",
+ " https://platform.openai.com/docs/models/gpt-3-... | \n",
+ " 2024-01-25 | \n",
+ " Proprietary LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " 1098.527455 | \n",
+ " 6.400166 | \n",
+ " 1103.343592 | \n",
+ " 1093.903695 | \n",
+ " 14159 | \n",
+ " 36 | \n",
+ " openchat-3.5-0106 | \n",
+ " OpenChat-3.5-0106 | \n",
+ " 7.8 | \n",
+ " 0.658 | \n",
+ " 2024/1 | \n",
+ " Open LLM | \n",
+ " OpenChat | \n",
+ " https://huggingface.co/openchat/openchat-3.5-0106 | \n",
+ " 2024-01-06 | \n",
+ " Open LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " 1087.307758 | \n",
+ " 18.314258 | \n",
+ " 1094.532598 | \n",
+ " 1078.413814 | \n",
+ " 3980 | \n",
+ " 40 | \n",
+ " nous-hermes-2-mixtral-8x7b-dpo | \n",
+ " Nous-Hermes-2-Mixtral-8x7B-DPO | \n",
+ " - | \n",
+ " - | \n",
+ " 2024/1 | \n",
+ " Open LLM | \n",
+ " NousResearch | \n",
+ " https://huggingface.co/NousResearch/Nous-Herme... | \n",
+ " 2024-01-13 | \n",
+ " Open LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ " 60 | \n",
+ " 1047.927688 | \n",
+ " 60.707225 | \n",
+ " 1061.952116 | \n",
+ " 1034.283514 | \n",
+ " 1321 | \n",
+ " 55 | \n",
+ " codellama-70b-instruct | \n",
+ " CodeLlama-70B-instruct | \n",
+ " - | \n",
+ " - | \n",
+ " 2024/1 | \n",
+ " Open LLM | \n",
+ " Meta | \n",
+ " https://huggingface.co/codellama/CodeLlama-70b-hf | \n",
+ " 2024-01-29 | \n",
+ " Open LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " rating variance rating_q975 rating_q025 num_battles \\\n",
+ "4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n",
+ "32 1111.132640 7.801741 1115.356993 1105.658254 13177 \n",
+ "36 1107.129810 2.419182 1110.056188 1104.002581 47220 \n",
+ "39 1098.527455 6.400166 1103.343592 1093.903695 14159 \n",
+ "43 1087.307758 18.314258 1094.532598 1078.413814 3980 \n",
+ "60 1047.927688 60.707225 1061.952116 1034.283514 1321 \n",
+ "\n",
+ " final_ranking key \\\n",
+ "4 2 gpt-4-0125-preview \n",
+ "32 29 yi-34b-chat \n",
+ "36 32 gpt-3.5-turbo-0125 \n",
+ "39 36 openchat-3.5-0106 \n",
+ "43 40 nous-hermes-2-mixtral-8x7b-dpo \n",
+ "60 55 codellama-70b-instruct \n",
+ "\n",
+ " Model MT-bench (score) MMLU \\\n",
+ "4 GPT-4-0125-preview - - \n",
+ "32 Yi-34B-Chat - 0.735 \n",
+ "36 GPT-3.5-Turbo-0125 - - \n",
+ "39 OpenChat-3.5-0106 7.8 0.658 \n",
+ "43 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n",
+ "60 CodeLlama-70B-instruct - - \n",
+ "\n",
+ " Knowledge cutoff date License Organization \\\n",
+ "4 2023/12 Proprietary LLM OpenAI \n",
+ "32 2023/6 Open LLM 01 AI \n",
+ "36 2021/9 Proprietary LLM OpenAI \n",
+ "39 2024/1 Open LLM OpenChat \n",
+ "43 2024/1 Open LLM NousResearch \n",
+ "60 2024/1 Open LLM Meta \n",
+ "\n",
+ " Link Release Date \\\n",
+ "4 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n",
+ "32 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n",
+ "36 https://platform.openai.com/docs/models/gpt-3-... 2024-01-25 \n",
+ "39 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n",
+ "43 https://huggingface.co/NousResearch/Nous-Herme... 2024-01-13 \n",
+ "60 https://huggingface.co/codellama/CodeLlama-70b-hf 2024-01-29 \n",
+ "\n",
+ " license_binary Month-Year \n",
+ "4 Proprietary LLM 2024-01 \n",
+ "32 Open LLM 2024-01 \n",
+ "36 Proprietary LLM 2024-01 \n",
+ "39 Open LLM 2024-01 \n",
+ "43 Open LLM 2024-01 \n",
+ "60 Open LLM 2024-01 "
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df[\"Month-Year\"] == \"2024-01\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/w0/6t9rxkj97rv47l9sc0q22yth0000gn/T/ipykernel_7726/1725500526.py:1: DeprecationWarning:\n",
+ "\n",
+ "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " rating | \n",
+ " variance | \n",
+ " rating_q975 | \n",
+ " rating_q025 | \n",
+ " num_battles | \n",
+ " final_ranking | \n",
+ " key | \n",
+ " Model | \n",
+ " MT-bench (score) | \n",
+ " MMLU | \n",
+ " Knowledge cutoff date | \n",
+ " License | \n",
+ " Organization | \n",
+ " Link | \n",
+ " Release Date | \n",
+ " license_binary | \n",
+ " Month-Year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1111.132640 | \n",
+ " 7.801741 | \n",
+ " 1115.356993 | \n",
+ " 1105.658254 | \n",
+ " 13177 | \n",
+ " 29 | \n",
+ " yi-34b-chat | \n",
+ " Yi-34B-Chat | \n",
+ " - | \n",
+ " 0.735 | \n",
+ " 2023/6 | \n",
+ " Open LLM | \n",
+ " 01 AI | \n",
+ " https://huggingface.co/01-ai/Yi-34B-Chat | \n",
+ " 2024-01-23 | \n",
+ " Open LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1098.527455 | \n",
+ " 6.400166 | \n",
+ " 1103.343592 | \n",
+ " 1093.903695 | \n",
+ " 14159 | \n",
+ " 36 | \n",
+ " openchat-3.5-0106 | \n",
+ " OpenChat-3.5-0106 | \n",
+ " 7.8 | \n",
+ " 0.658 | \n",
+ " 2024/1 | \n",
+ " Open LLM | \n",
+ " OpenChat | \n",
+ " https://huggingface.co/openchat/openchat-3.5-0106 | \n",
+ " 2024-01-06 | \n",
+ " Open LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1087.307758 | \n",
+ " 18.314258 | \n",
+ " 1094.532598 | \n",
+ " 1078.413814 | \n",
+ " 3980 | \n",
+ " 40 | \n",
+ " nous-hermes-2-mixtral-8x7b-dpo | \n",
+ " Nous-Hermes-2-Mixtral-8x7B-DPO | \n",
+ " - | \n",
+ " - | \n",
+ " 2024/1 | \n",
+ " Open LLM | \n",
+ " NousResearch | \n",
+ " https://huggingface.co/NousResearch/Nous-Herme... | \n",
+ " 2024-01-13 | \n",
+ " Open LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1246.777591 | \n",
+ " 1.942477 | \n",
+ " 1249.979712 | \n",
+ " 1244.305362 | \n",
+ " 67354 | \n",
+ " 2 | \n",
+ " gpt-4-0125-preview | \n",
+ " GPT-4-0125-preview | \n",
+ " - | \n",
+ " - | \n",
+ " 2023/12 | \n",
+ " Proprietary LLM | \n",
+ " OpenAI | \n",
+ " https://openai.com/blog/new-models-and-develop... | \n",
+ " 2024-01-25 | \n",
+ " Proprietary LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1107.129810 | \n",
+ " 2.419182 | \n",
+ " 1110.056188 | \n",
+ " 1104.002581 | \n",
+ " 47220 | \n",
+ " 32 | \n",
+ " gpt-3.5-turbo-0125 | \n",
+ " GPT-3.5-Turbo-0125 | \n",
+ " - | \n",
+ " - | \n",
+ " 2021/9 | \n",
+ " Proprietary LLM | \n",
+ " OpenAI | \n",
+ " https://platform.openai.com/docs/models/gpt-3-... | \n",
+ " 2024-01-25 | \n",
+ " Proprietary LLM | \n",
+ " 2024-01 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " rating variance rating_q975 rating_q025 num_battles \\\n",
+ "0 1111.132640 7.801741 1115.356993 1105.658254 13177 \n",
+ "1 1098.527455 6.400166 1103.343592 1093.903695 14159 \n",
+ "2 1087.307758 18.314258 1094.532598 1078.413814 3980 \n",
+ "3 1246.777591 1.942477 1249.979712 1244.305362 67354 \n",
+ "4 1107.129810 2.419182 1110.056188 1104.002581 47220 \n",
+ "\n",
+ " final_ranking key \\\n",
+ "0 29 yi-34b-chat \n",
+ "1 36 openchat-3.5-0106 \n",
+ "2 40 nous-hermes-2-mixtral-8x7b-dpo \n",
+ "3 2 gpt-4-0125-preview \n",
+ "4 32 gpt-3.5-turbo-0125 \n",
+ "\n",
+ " Model MT-bench (score) MMLU \\\n",
+ "0 Yi-34B-Chat - 0.735 \n",
+ "1 OpenChat-3.5-0106 7.8 0.658 \n",
+ "2 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n",
+ "3 GPT-4-0125-preview - - \n",
+ "4 GPT-3.5-Turbo-0125 - - \n",
+ "\n",
+ " Knowledge cutoff date License Organization \\\n",
+ "0 2023/6 Open LLM 01 AI \n",
+ "1 2024/1 Open LLM OpenChat \n",
+ "2 2024/1 Open LLM NousResearch \n",
+ "3 2023/12 Proprietary LLM OpenAI \n",
+ "4 2021/9 Proprietary LLM OpenAI \n",
+ "\n",
+ " Link Release Date \\\n",
+ "0 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n",
+ "1 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n",
+ "2 https://huggingface.co/NousResearch/Nous-Herme... 2024-01-13 \n",
+ "3 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n",
+ "4 https://platform.openai.com/docs/models/gpt-3-... 2024-01-25 \n",
+ "\n",
+ " license_binary Month-Year \n",
+ "0 Open LLM 2024-01 \n",
+ "1 Open LLM 2024-01 \n",
+ "2 Open LLM 2024-01 \n",
+ "3 Proprietary LLM 2024-01 \n",
+ "4 Proprietary LLM 2024-01 "
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Top 3 models by rating within each (Month-Year, License) group for January 2024.\n",
+ "# Sorting and then taking head(3) per group gives the same rows in the same order\n",
+ "# as groupby().apply(nlargest) without the DeprecationWarning that\n",
+ "# DataFrameGroupBy.apply raises for operating on the grouping columns.\n",
+ "(\n",
+ "    df[df[\"Month-Year\"] == \"2024-01\"]\n",
+ "    .sort_values(\n",
+ "        [\"Month-Year\", \"License\", \"rating\"], ascending=[True, True, False]\n",
+ "    )\n",
+ "    .groupby([\"Month-Year\", \"License\"])\n",
+ "    .head(3)\n",
+ "    .reset_index(drop=True)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',\n",
+ " 'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',\n",
+ " 'Knowledge cutoff date', 'License', 'Organization', 'Link',\n",
+ " 'Release Date', 'license_binary'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.plotly.v1+json": {
+ "config": {
+ "plotlyServerURL": "https://plot.ly"
+ },
+ "data": [
+ {
+ "customdata": [
+ [
+ "OpenAI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "OpenAI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Anthropic",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Google",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "OpenAI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Google",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Anthropic",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "OpenAI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Anthropic",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "OpenAI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Mistral",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Reka AI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Anthropic",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Mistral",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Reka AI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Google",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Anthropic",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Mistral",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "OpenAI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Anthropic",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Google",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Anthropic",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "OpenAI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "OpenAI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "LMSYS",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Perplexity AI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "OpenAI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Perplexity AI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "UW",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Google",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "UC Berkeley",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Nomic AI",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Stanford",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Tsinghua",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ],
+ [
+ "Meta",
+ "Proprietary LLM",
+ "Proprietary LLM"
+ ]
+ ],
+ "hovertemplate": "%{hovertext}
license_binary=%{customdata[2]}
Release Date=%{x}
Arena ELO=%{y}
Organization=%{customdata[0]}
License=%{customdata[1]}",
+ "hovertext": [
+ "GPT-4-Turbo-2024-04-09",
+ "GPT-4-1106-preview",
+ "Claude 3 Opus",
+ "Gemini 1.5 Pro API-0409-Preview",
+ "GPT-4-0125-preview",
+ "Bard (Gemini Pro)",
+ "Claude 3 Sonnet",
+ "GPT-4-0314",
+ "Claude 3 Haiku",
+ "GPT-4-0613",
+ "Mistral-Large-2402",
+ "Reka-Flash-21B-online",
+ "Claude-1",
+ "Mistral Medium",
+ "Reka-Flash-21B",
+ "Gemini Pro (Dev API)",
+ "Claude-2.0",
+ "Mistral-Next",
+ "GPT-3.5-Turbo-0613",
+ "Claude-2.1",
+ "Gemini Pro",
+ "Claude-Instant-1",
+ "GPT-3.5-Turbo-0314",
+ "GPT-3.5-Turbo-0125",
+ "Vicuna-33B",
+ "pplx-70b-online",
+ "GPT-3.5-Turbo-1106",
+ "pplx-7b-online",
+ "Guanaco-33B",
+ "PaLM-Chat-Bison-001",
+ "Koala-13B",
+ "GPT4All-13B-Snoozy",
+ "Alpaca-13B",
+ "ChatGLM-6B",
+ "LLaMA-13B"
+ ],
+ "legendgroup": "Proprietary LLM",
+ "marker": {
+ "color": "#636efa",
+ "size": 8,
+ "symbol": "circle"
+ },
+ "mode": "markers",
+ "name": "Proprietary LLM",
+ "orientation": "v",
+ "showlegend": true,
+ "type": "scatter",
+ "x": [
+ "2024-04-09T00:00:00",
+ "2023-11-06T00:00:00",
+ "2024-02-29T00:00:00",
+ "2024-04-09T00:00:00",
+ "2024-01-25T00:00:00",
+ "2024-02-01T00:00:00",
+ "2024-02-29T00:00:00",
+ "2024-03-14T00:00:00",
+ "2024-03-07T00:00:00",
+ "2023-06-13T00:00:00",
+ "2024-02-24T00:00:00",
+ "2024-02-26T00:00:00",
+ "2023-03-14T00:00:00",
+ "2023-12-11T00:00:00",
+ "2024-02-26T00:00:00",
+ "2023-12-13T00:00:00",
+ "2023-07-11T00:00:00",
+ "2024-02-17T00:00:00",
+ "2023-06-13T00:00:00",
+ "2023-11-21T00:00:00",
+ "2023-12-13T00:00:00",
+ "2023-03-14T00:00:00",
+ "2024-03-14T00:00:00",
+ "2024-01-25T00:00:00",
+ "2023-06-21T00:00:00",
+ "2023-11-29T00:00:00",
+ "2023-11-06T00:00:00",
+ "2023-11-29T00:00:00",
+ "2023-05-22T00:00:00",
+ "2023-07-10T00:00:00",
+ "2023-04-03T00:00:00",
+ "2023-04-24T00:00:00",
+ "2023-03-13T00:00:00",
+ "2023-03-13T00:00:00",
+ "2023-02-27T00:00:00"
+ ],
+ "xaxis": "x",
+ "y": [
+ 1258.8152791324715,
+ 1252.6848856241577,
+ 1250.9262064295565,
+ 1249.6183945401244,
+ 1246.7775913509702,
+ 1208.7128773784577,
+ 1201.2654981955752,
+ 1189.557977031121,
+ 1180.8870022256567,
+ 1165.279013874706,
+ 1157.2129636222178,
+ 1153.368015144387,
+ 1150.6246111849628,
+ 1148.003325470259,
+ 1147.136619289767,
+ 1135.7254379948201,
+ 1132.3083987521873,
+ 1126.6887059695398,
+ 1119.8996424050451,
+ 1119.0708879096221,
+ 1115.3213731540973,
+ 1110.3806845414053,
+ 1108.9125926100855,
+ 1107.1298100300314,
+ 1093.8870113925889,
+ 1075.4285458870645,
+ 1072.711340370162,
+ 1043.3909111518306,
+ 1034.3952377983876,
+ 1009.7116452193085,
+ 969.48148016344,
+ 938.8924300511185,
+ 908.0843590844727,
+ 886.8734292498528,
+ 804.3563285706291
+ ],
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "Meta",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Cohere",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Meta",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Alibaba",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Cohere",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Mistral",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Alibaba",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "HuggingFace",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Nexusflow",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Alibaba",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Mistral",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "01 AI",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Microsoft",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Databricks",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "AllenAI/UW",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "OpenChat",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "UC Berkeley",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Meta",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "NousResearch",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Google",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Nvidia",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "DeepSeek AI",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "OpenChat",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "NousResearch",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Alibaba",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Mistral",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Cognitive Computations",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Upstage AI",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Microsoft",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Meta",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "HuggingFace",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Microsoft",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "LMSYS",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Meta",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "MosaicML",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Meta",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Google",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "HuggingFace",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Meta",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Alibaba",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "TII",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Together AI",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Allen AI",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Google",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Mistral",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "LMSYS",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Alibaba",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Google",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Tsinghua",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "MosaicML",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Tsinghua",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "RWKV",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "OpenAssistant",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "LMSYS",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Stability AI",
+ "Open LLM",
+ "Open LLM"
+ ],
+ [
+ "Databricks",
+ "Open LLM",
+ "Open LLM"
+ ]
+ ],
+ "hovertemplate": "<b>%{hovertext}</b><br><br>license_binary=%{customdata[2]}<br>Release Date=%{x}<br>Arena ELO=%{y}<br>Organization=%{customdata[0]}<br>License=%{customdata[1]}<extra></extra>",
+ "hovertext": [
+ "Llama-3-70b-Instruct",
+ "Command R+",
+ "Llama-3-8b-Instruct",
+ "Qwen1.5-72B-Chat",
+ "Command R",
+ "Mixtral-8x22b-Instruct-v0.1",
+ "Qwen1.5-32B-Chat",
+ "Zephyr-ORPO-141b-A35b-v0.1",
+ "Starling-LM-7B-beta",
+ "Qwen1.5-14B-Chat",
+ "Mixtral-8x7b-Instruct-v0.1",
+ "Yi-34B-Chat",
+ "WizardLM-70B-v1.0",
+ "DBRX-Instruct-Preview",
+ "Tulu-2-DPO-70B",
+ "OpenChat-3.5-0106",
+ "Starling-LM-7B-alpha",
+ "Llama-2-70b-chat",
+ "Nous-Hermes-2-Mixtral-8x7B-DPO",
+ "Gemma-1.1-7B-it",
+ "NV-Llama2-70B-SteerLM-Chat",
+ "DeepSeek-LLM-67B-Chat",
+ "OpenChat-3.5",
+ "OpenHermes-2.5-Mistral-7b",
+ "Qwen1.5-7B-Chat",
+ "Mistral-7B-Instruct-v0.2",
+ "Dolphin-2.2.1-Mistral-7B",
+ "SOLAR-10.7B-Instruct-v1.0",
+ "WizardLM-13b-v1.2",
+ "Llama-2-13b-chat",
+ "Zephyr-7b-beta",
+ "Phi-3-Mini-128k-Instruct",
+ "Vicuna-13B",
+ "CodeLlama-70B-instruct",
+ "MPT-30B-chat",
+ "CodeLlama-34B-instruct",
+ "Gemma-7B-it",
+ "Zephyr-7b-alpha",
+ "Llama-2-7b-chat",
+ "Qwen-14B-Chat",
+ "falcon-180b-chat",
+ "StripedHyena-Nous-7B",
+ "OLMo-7B-instruct",
+ "Gemma-1.1-2B-it",
+ "Mistral-7B-Instruct-v0.1",
+ "Vicuna-7B",
+ "Qwen1.5-4B-Chat",
+ "Gemma-2B-it",
+ "ChatGLM3-6B",
+ "MPT-7B-Chat",
+ "ChatGLM2-6B",
+ "RWKV-4-Raven-14B",
+ "OpenAssistant-Pythia-12B",
+ "FastChat-T5-3B",
+ "StableLM-Tuned-Alpha-7B",
+ "Dolly-V2-12B"
+ ],
+ "legendgroup": "Open LLM",
+ "marker": {
+ "color": "#EF553B",
+ "size": 8,
+ "symbol": "circle"
+ },
+ "mode": "markers",
+ "name": "Open LLM",
+ "orientation": "v",
+ "showlegend": true,
+ "type": "scatter",
+ "x": [
+ "2024-04-18T00:00:00",
+ "2024-04-04T00:00:00",
+ "2024-04-18T00:00:00",
+ "2024-02-04T00:00:00",
+ "2024-03-11T00:00:00",
+ "2024-04-17T00:00:00",
+ "2024-02-04T00:00:00",
+ "2024-04-12T00:00:00",
+ "2024-03-20T00:00:00",
+ "2024-02-04T00:00:00",
+ "2023-12-11T00:00:00",
+ "2024-01-23T00:00:00",
+ "2023-08-09T00:00:00",
+ "2024-03-27T00:00:00",
+ "2023-11-12T00:00:00",
+ "2024-01-06T00:00:00",
+ "2023-11-25T00:00:00",
+ "2023-07-18T00:00:00",
+ "2024-01-13T00:00:00",
+ "2024-04-09T00:00:00",
+ "2023-11-24T00:00:00",
+ "2023-11-29T00:00:00",
+ "2023-11-16T00:00:00",
+ "2023-10-29T00:00:00",
+ "2024-02-04T00:00:00",
+ "2023-12-11T00:00:00",
+ "2023-10-30T00:00:00",
+ "2023-12-13T00:00:00",
+ "2023-07-25T00:00:00",
+ "2023-07-18T00:00:00",
+ "2023-10-26T00:00:00",
+ "2024-04-23T00:00:00",
+ "2023-07-23T00:00:00",
+ "2024-01-29T00:00:00",
+ "2023-06-09T00:00:00",
+ "2023-08-24T00:00:00",
+ "2024-02-21T00:00:00",
+ "2023-10-09T00:00:00",
+ "2023-07-18T00:00:00",
+ "2023-09-24T00:00:00",
+ "2023-09-05T00:00:00",
+ "2023-12-07T00:00:00",
+ "2024-02-23T00:00:00",
+ "2024-04-09T00:00:00",
+ "2023-09-27T00:00:00",
+ "2023-07-29T00:00:00",
+ "2024-02-04T00:00:00",
+ "2024-02-21T00:00:00",
+ "2023-10-25T00:00:00",
+ "2023-05-04T00:00:00",
+ "2023-06-25T00:00:00",
+ "2023-05-22T00:00:00",
+ "2023-04-03T00:00:00",
+ "2023-04-27T00:00:00",
+ "2023-04-19T00:00:00",
+ "2023-04-12T00:00:00"
+ ],
+ "xaxis": "x",
+ "y": [
+ 1209.6462958943152,
+ 1190.5291640364956,
+ 1152.500938092916,
+ 1152.485612667822,
+ 1147.8966494489798,
+ 1145.8123271934626,
+ 1133.8011394014864,
+ 1128.8163366984966,
+ 1118.5178781177128,
+ 1118.475700517794,
+ 1114,
+ 1111.1326399460543,
+ 1108.552744333791,
+ 1103.2167069462541,
+ 1102.79428840509,
+ 1098.527455141752,
+ 1091.5210240331344,
+ 1088.7078065720734,
+ 1087.307757938674,
+ 1082.9619916739105,
+ 1082.4713591517852,
+ 1079.7362777221456,
+ 1078.6663284631356,
+ 1078.6429577216027,
+ 1076.5321247427814,
+ 1074.0655548845186,
+ 1065.574858796917,
+ 1065.0611191304033,
+ 1061.9003873957429,
+ 1056.9265912995625,
+ 1054.4162995844372,
+ 1050.1481252382014,
+ 1047.9555279582555,
+ 1047.927687897156,
+ 1047.823066613369,
+ 1047.396876459045,
+ 1043.5443043467913,
+ 1043.0842673002462,
+ 1040.7537596503887,
+ 1038.586932982431,
+ 1037.076380506833,
+ 1023.112092466059,
+ 1020.7569311460566,
+ 1014.832737666584,
+ 1012.1048679697501,
+ 1009.3834445358582,
+ 1002.744713564041,
+ 999.6431193544297,
+ 960.7895509564338,
+ 933.340871331175,
+ 933.3372880828122,
+ 928.4512512366093,
+ 900.2948677134343,
+ 876.9291083582452,
+ 848.9325675003323,
+ 826.6473317994165
+ ],
+ "yaxis": "y"
+ }
+ ],
+ "layout": {
+ "legend": {
+ "title": {
+ "text": "license_binary"
+ },
+ "tracegroupgap": 0
+ },
+ "template": {
+ "data": {
+ "bar": [
+ {
+ "error_x": {
+ "color": "#2a3f5f"
+ },
+ "error_y": {
+ "color": "#2a3f5f"
+ },
+ "marker": {
+ "line": {
+ "color": "white",
+ "width": 0.5
+ },
+ "pattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ }
+ },
+ "type": "bar"
+ }
+ ],
+ "barpolar": [
+ {
+ "marker": {
+ "line": {
+ "color": "white",
+ "width": 0.5
+ },
+ "pattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ }
+ },
+ "type": "barpolar"
+ }
+ ],
+ "carpet": [
+ {
+ "aaxis": {
+ "endlinecolor": "#2a3f5f",
+ "gridcolor": "#C8D4E3",
+ "linecolor": "#C8D4E3",
+ "minorgridcolor": "#C8D4E3",
+ "startlinecolor": "#2a3f5f"
+ },
+ "baxis": {
+ "endlinecolor": "#2a3f5f",
+ "gridcolor": "#C8D4E3",
+ "linecolor": "#C8D4E3",
+ "minorgridcolor": "#C8D4E3",
+ "startlinecolor": "#2a3f5f"
+ },
+ "type": "carpet"
+ }
+ ],
+ "choropleth": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "choropleth"
+ }
+ ],
+ "contour": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "contour"
+ }
+ ],
+ "contourcarpet": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "contourcarpet"
+ }
+ ],
+ "heatmap": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "heatmap"
+ }
+ ],
+ "heatmapgl": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "heatmapgl"
+ }
+ ],
+ "histogram": [
+ {
+ "marker": {
+ "pattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ }
+ },
+ "type": "histogram"
+ }
+ ],
+ "histogram2d": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "histogram2d"
+ }
+ ],
+ "histogram2dcontour": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "histogram2dcontour"
+ }
+ ],
+ "mesh3d": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "mesh3d"
+ }
+ ],
+ "parcoords": [
+ {
+ "line": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "parcoords"
+ }
+ ],
+ "pie": [
+ {
+ "automargin": true,
+ "type": "pie"
+ }
+ ],
+ "scatter": [
+ {
+ "fillpattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ },
+ "type": "scatter"
+ }
+ ],
+ "scatter3d": [
+ {
+ "line": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatter3d"
+ }
+ ],
+ "scattercarpet": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattercarpet"
+ }
+ ],
+ "scattergeo": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattergeo"
+ }
+ ],
+ "scattergl": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattergl"
+ }
+ ],
+ "scattermapbox": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattermapbox"
+ }
+ ],
+ "scatterpolar": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterpolar"
+ }
+ ],
+ "scatterpolargl": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterpolargl"
+ }
+ ],
+ "scatterternary": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterternary"
+ }
+ ],
+ "surface": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "surface"
+ }
+ ],
+ "table": [
+ {
+ "cells": {
+ "fill": {
+ "color": "#EBF0F8"
+ },
+ "line": {
+ "color": "white"
+ }
+ },
+ "header": {
+ "fill": {
+ "color": "#C8D4E3"
+ },
+ "line": {
+ "color": "white"
+ }
+ },
+ "type": "table"
+ }
+ ]
+ },
+ "layout": {
+ "annotationdefaults": {
+ "arrowcolor": "#2a3f5f",
+ "arrowhead": 0,
+ "arrowwidth": 1
+ },
+ "autotypenumbers": "strict",
+ "coloraxis": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "colorscale": {
+ "diverging": [
+ [
+ 0,
+ "#8e0152"
+ ],
+ [
+ 0.1,
+ "#c51b7d"
+ ],
+ [
+ 0.2,
+ "#de77ae"
+ ],
+ [
+ 0.3,
+ "#f1b6da"
+ ],
+ [
+ 0.4,
+ "#fde0ef"
+ ],
+ [
+ 0.5,
+ "#f7f7f7"
+ ],
+ [
+ 0.6,
+ "#e6f5d0"
+ ],
+ [
+ 0.7,
+ "#b8e186"
+ ],
+ [
+ 0.8,
+ "#7fbc41"
+ ],
+ [
+ 0.9,
+ "#4d9221"
+ ],
+ [
+ 1,
+ "#276419"
+ ]
+ ],
+ "sequential": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "sequentialminus": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ]
+ },
+ "colorway": [
+ "#636efa",
+ "#EF553B",
+ "#00cc96",
+ "#ab63fa",
+ "#FFA15A",
+ "#19d3f3",
+ "#FF6692",
+ "#B6E880",
+ "#FF97FF",
+ "#FECB52"
+ ],
+ "font": {
+ "color": "#2a3f5f"
+ },
+ "geo": {
+ "bgcolor": "white",
+ "lakecolor": "white",
+ "landcolor": "white",
+ "showlakes": true,
+ "showland": true,
+ "subunitcolor": "#C8D4E3"
+ },
+ "hoverlabel": {
+ "align": "left"
+ },
+ "hovermode": "closest",
+ "mapbox": {
+ "style": "light"
+ },
+ "paper_bgcolor": "white",
+ "plot_bgcolor": "white",
+ "polar": {
+ "angularaxis": {
+ "gridcolor": "#EBF0F8",
+ "linecolor": "#EBF0F8",
+ "ticks": ""
+ },
+ "bgcolor": "white",
+ "radialaxis": {
+ "gridcolor": "#EBF0F8",
+ "linecolor": "#EBF0F8",
+ "ticks": ""
+ }
+ },
+ "scene": {
+ "xaxis": {
+ "backgroundcolor": "white",
+ "gridcolor": "#DFE8F3",
+ "gridwidth": 2,
+ "linecolor": "#EBF0F8",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "#EBF0F8"
+ },
+ "yaxis": {
+ "backgroundcolor": "white",
+ "gridcolor": "#DFE8F3",
+ "gridwidth": 2,
+ "linecolor": "#EBF0F8",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "#EBF0F8"
+ },
+ "zaxis": {
+ "backgroundcolor": "white",
+ "gridcolor": "#DFE8F3",
+ "gridwidth": 2,
+ "linecolor": "#EBF0F8",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "#EBF0F8"
+ }
+ },
+ "shapedefaults": {
+ "line": {
+ "color": "#2a3f5f"
+ }
+ },
+ "ternary": {
+ "aaxis": {
+ "gridcolor": "#DFE8F3",
+ "linecolor": "#A2B1C6",
+ "ticks": ""
+ },
+ "baxis": {
+ "gridcolor": "#DFE8F3",
+ "linecolor": "#A2B1C6",
+ "ticks": ""
+ },
+ "bgcolor": "white",
+ "caxis": {
+ "gridcolor": "#DFE8F3",
+ "linecolor": "#A2B1C6",
+ "ticks": ""
+ }
+ },
+ "title": {
+ "x": 0.05
+ },
+ "xaxis": {
+ "automargin": true,
+ "gridcolor": "#EBF0F8",
+ "linecolor": "#EBF0F8",
+ "ticks": "",
+ "title": {
+ "standoff": 15
+ },
+ "zerolinecolor": "#EBF0F8",
+ "zerolinewidth": 2
+ },
+ "yaxis": {
+ "automargin": true,
+ "gridcolor": "#EBF0F8",
+ "linecolor": "#EBF0F8",
+ "ticks": "",
+ "title": {
+ "standoff": 15
+ },
+ "zerolinecolor": "#EBF0F8",
+ "zerolinewidth": 2
+ }
+ }
+ },
+ "title": {
+ "text": "Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)"
+ },
+ "xaxis": {
+ "anchor": "y",
+ "domain": [
+ 0,
+ 1
+ ],
+ "title": {
+ "text": "Release Date"
+ }
+ },
+ "yaxis": {
+ "anchor": "x",
+ "domain": [
+ 0,
+ 1
+ ],
+ "title": {
+ "text": "Arena ELO"
+ }
+ }
+ }
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import plotly.express as px\n",
+ "import plotly.graph_objects as go\n",
+ "\n",
+ "# Scatter of Arena ELO against release date, one trace per license_binary value.\n",
+ "fig = px.scatter(\n",
+ " df,\n",
+ " x=\"Release Date\",\n",
+ " y=\"rating\",\n",
+ " color=\"license_binary\",\n",
+ " hover_name=\"Model\",\n",
+ " hover_data=[\n",
+ " \"Release Date\",\n",
+ " \"Organization\",\n",
+ " \"License\",\n",
+ " \"license_binary\",\n",
+ " ],\n",
+ " title=\"Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)\",  # NOTE(review): snapshot date is hard-coded - confirm it matches the loaded leaderboard data\n",
+ " labels={\"rating\": \"Arena ELO\", \"license_binary\": \"License Type\"},  # display names for the axis title, legend title and hover rows\n",
+ " template=\"plotly_white\",\n",
+ ")\n",
+ "fig.update_traces(marker=dict(size=8))  # same marker size for every trace\n",
+ "\n",
+ "# Render the interactive figure as the cell output.\n",
+ "fig.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "plotly.graph_objs._figure.Figure"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "type(fig)  # inspect the class of the figure object built by the plotting cell"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": []
}
],